From 60e1e5b99aa0aa919b7615662527847a564bf226 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Thu, 21 Nov 2024 17:41:35 +0330 Subject: [PATCH 001/346] build: Add ClangCL profiles Also fix some compilation errors catched by clang Signed-off-by: Ali Cheraghi --- 3rdparty/CMakeLists.txt | 2 +- 3rdparty/dxc/CMakeLists.txt | 2 +- CMakeLists.txt | 8 +-- CMakePresets.json | 24 +++++-- cmake/adjust/flags.cmake | 64 +++++++++++++++---- cmake/adjust/template/vendor/CXX_Clang.cmake | 45 +++++++++++++ cmake/adjust/template/vendor/CXX_MSVC.cmake | 46 +++++++++++++ cmake/adjust/template/vendor/C_Clang.cmake | 46 +++++++++++++ .../msvc.cmake => vendor/C_MSVC.cmake} | 31 --------- cmake/common.cmake | 6 +- include/nbl/asset/IFramebuffer.h | 2 +- include/nbl/asset/IRenderpass.h | 2 +- include/nbl/macros.h | 2 +- include/nbl/video/CVulkanDeviceMemoryBacked.h | 4 +- include/nbl/video/ISwapchain.h | 6 +- include/nbl/video/TimelineEventHandlers.h | 2 +- src/nbl/CMakeLists.txt | 2 +- src/nbl/builtin/utils.cmake | 4 +- 18 files changed, 231 insertions(+), 67 deletions(-) create mode 100644 cmake/adjust/template/vendor/CXX_Clang.cmake create mode 100644 cmake/adjust/template/vendor/CXX_MSVC.cmake create mode 100644 cmake/adjust/template/vendor/C_Clang.cmake rename cmake/adjust/template/{windows/msvc.cmake => vendor/C_MSVC.cmake} (58%) diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 7b9b6da784..d8ac2a0d25 100755 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -493,7 +493,7 @@ if(ENABLE_HLSL) endif() foreach(trgt IN LISTS NBL_3RDPARTY_TARGETS) - if(NBL_DYNAMIC_MSVC_RUNTIME) + if(NBL_COMPILER_DYNAMIC_RUNTIME) set_property(TARGET ${trgt} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") else() set_property(TARGET ${trgt} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") diff --git a/3rdparty/dxc/CMakeLists.txt b/3rdparty/dxc/CMakeLists.txt index 8b34c76f88..b6e3e21e16 100644 --- a/3rdparty/dxc/CMakeLists.txt +++ 
b/3rdparty/dxc/CMakeLists.txt @@ -62,7 +62,7 @@ if(WIN32) endif() endif() -if(NBL_DYNAMIC_MSVC_RUNTIME) +if(NBL_COMPILER_DYNAMIC_RUNTIME) list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_MSVC_RUNTIME_LIBRARY:STATIC=MultiThreaded$<$:Debug>DLL") else() list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_MSVC_RUNTIME_LIBRARY:STATIC=MultiThreaded$<$:Debug>") diff --git a/CMakeLists.txt b/CMakeLists.txt index 339a89d27d..a8c9013eaa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,10 +20,10 @@ if(MSVC) endif() option(NBL_STATIC_BUILD "" OFF) # ON for static builds, OFF for shared -option(NBL_DYNAMIC_MSVC_RUNTIME "" ON) +option(NBL_COMPILER_DYNAMIC_RUNTIME "" ON) option(NBL_SANITIZE_ADDRESS OFF) -if(MSVC) +if(MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL MSVC) if(NBL_SANITIZE_ADDRESS) set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$:ProgramDatabase>") else() @@ -35,10 +35,10 @@ if(NBL_STATIC_BUILD) message(STATUS "Static Nabla build enabled!") else() if(MSVC) - if(NBL_DYNAMIC_MSVC_RUNTIME) + if(NBL_COMPILER_DYNAMIC_RUNTIME) message(STATUS "Shared Nabla build enabled!") else() - message(FATAL_ERROR "Turn NBL_DYNAMIC_MSVC_RUNTIME on! For dynamic Nabla builds dynamic MSVC runtime is mandatory!") + message(FATAL_ERROR "Turn NBL_COMPILER_DYNAMIC_RUNTIME on! For dynamic Nabla builds dynamic MSVC runtime is mandatory!") endif() else() message(FATAL_ERROR "Nabla can't be built with shared libraries! 
Please make sure you are targetting Windows OS and MSVC compiler!") diff --git a/CMakePresets.json b/CMakePresets.json index 8d0b62367a..da28fc1aff 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -46,7 +46,7 @@ "hidden": true, "inherits": "ci-configure-static-base", "cacheVariables": { - "NBL_DYNAMIC_MSVC_RUNTIME": "OFF" + "NBL_COMPILER_DYNAMIC_RUNTIME": "OFF" }, "condition": { "type": "allOf", @@ -69,7 +69,7 @@ "hidden": true, "inherits": "ci-configure-dynamic-base", "cacheVariables": { - "NBL_DYNAMIC_MSVC_RUNTIME": "ON" + "NBL_COMPILER_DYNAMIC_RUNTIME": "ON" }, "condition": { "type": "allOf", @@ -156,7 +156,7 @@ "hidden": true, "inherits": "user-configure-static-base", "cacheVariables": { - "NBL_DYNAMIC_MSVC_RUNTIME": "OFF" + "NBL_COMPILER_DYNAMIC_RUNTIME": "OFF" }, "condition": { "type": "equals", @@ -169,7 +169,7 @@ "hidden": true, "inherits": "user-configure-dynamic-base", "cacheVariables": { - "NBL_DYNAMIC_MSVC_RUNTIME": "ON" + "NBL_COMPILER_DYNAMIC_RUNTIME": "ON" }, "condition": { "type": "equals", @@ -193,6 +193,22 @@ "generator": "Visual Studio 17 2022", "toolset": "v143" }, + { + "name": "user-configure-static-clangcl", + "inherits": "user-configure-static-windows-base", + "displayName": "[USER]: Static library target, Visual Studio 17 2022 generator, ClangCL toolset", + "description": "Configure as static library with Visual Studio 17 2022 generator and ClangCL toolset", + "generator": "Visual Studio 17 2022", + "toolset": "ClangCL" + }, + { + "name": "user-configure-dynamic-clangcl", + "inherits": "user-configure-dynamic-windows-base", + "displayName": "[USER]: Dynamic library target, Visual Studio 17 2022 generator, ClangCL toolset", + "description": "Configure as dynamic library with Visual Studio 17 2022 generator and ClangCL toolset", + "generator": "Visual Studio 17 2022", + "toolset": "ClangCL" + }, { "name": "user-configure-static-ninja-multi", "inherits": "user-configure-static-windows-base", diff --git a/cmake/adjust/flags.cmake 
b/cmake/adjust/flags.cmake index 59764cb02d..430d507c93 100644 --- a/cmake/adjust/flags.cmake +++ b/cmake/adjust/flags.cmake @@ -40,17 +40,57 @@ option(NBL_REQUEST_SSE_4_2 "Request compilation with SSE 4.2 instruction set ena option(NBL_REQUEST_SSE_AXV2 "Request compilation with SSE Intel Advanced Vector Extensions 2 for Nabla projects" ON) # profiles -if(MSVC) - include("${CMAKE_CURRENT_LIST_DIR}/template/windows/msvc.cmake") -elseif(ANDROID) - include("${CMAKE_CURRENT_LIST_DIR}/template/unix/android.cmake") -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - include("${CMAKE_CURRENT_LIST_DIR}/template/unix/gnu.cmake") -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - include("${CMAKE_CURRENT_LIST_DIR}/template/unix/clang.cmake") -else() - message(WARNING "UNTESTED COMPILER DETECTED, EXPECT WRONG OPTIMIZATION FLAGS! SUBMIT ISSUE ON GITHUB https://github.com/Devsh-Graphics-Programming/Nabla/issues") -endif() +foreach(NBL_COMPILER_LANGUAGE IN ITEMS C CXX) + # all list of all known by CMake vendors: + # https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html + set(NBL_COMPILER_VENDOR "${CMAKE_${NBL_COMPILER_LANGUAGE}_COMPILER_ID}") + set(NBL_PROFILE_NAME "${NBL_COMPILER_LANGUAGE}_${NBL_COMPILER_VENDOR}") # eg. "cxx_MSVC.cmake" + set(NBL_PROFILE_PATH "${CMAKE_CURRENT_LIST_DIR}/template/vendor/${NBL_PROFILE_NAME}.cmake") + + include("${NBL_PROFILE_PATH}" RESULT_VARIABLE _NBL_FOUND_) + + if(NOT _NBL_FOUND_) + message(WARNING "UNSUPPORTED \"${NBL_COMPILER_LANGUAGE}\" COMPILER LANGUAGE FOR \"${NBL_COMPILER_VENDOR}\" DETECTED, CMAKE CONFIGURATION OR BUILD MAY FAIL AND COMPILE OPTIONS FLAGS WILL NOT BE SET! 
SUBMIT ISSUE ON GITHUB https://github.com/Devsh-Graphics-Programming/Nabla/issues") + continue() + endif() + + # a profile MUST define + # - "NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_COMPILE_OPTIONS" (configuration dependent) + # - "NBL_${NBL_COMPILER_LANGUAGE}_COMPILE_OPTIONS" (global) + + # a profile MUST NOT define + # - NBL_COMPILE_OPTIONS + + set(NBL_COMPILE_OPTIONS_VAR_NAME NBL_${NBL_COMPILER_LANGUAGE}_COMPILE_OPTIONS) + set(NBL_COMPILE_OPTIONS_VAR_VALUE ${${NBL_COMPILE_OPTIONS_VAR_NAME}}) + + if(NOT DEFINED ${NBL_COMPILE_OPTIONS_VAR_NAME}) + message(FATAL_ERROR "\"${NBL_PROFILE_PATH}\" did not define \"${NBL_COMPILE_OPTIONS_VAR_NAME}\"!") + endif() + + # update map with configuration dependent compile options + foreach(CONFIGURATION IN ITEMS RELEASE RELWITHDEBINFO DEBUG) + set(NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_NAME NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_COMPILE_OPTIONS) + set(NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_VALUE ${${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_NAME}}) + + if(NOT DEFINED ${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_NAME}) + message(FATAL_ERROR "\"${NBL_PROFILE_PATH}\" did not define \"${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_NAME}\"!") + endif() + + list(APPEND NBL_${CONFIGURATION}_COMPILE_OPTIONS + # note that "${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_VALUE}" MUST NOT contain ANY + # $<$> generator expression in order to support our configuration mapping features + $<$:${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_VALUE}> + ) + + set(NBL_${CONFIGURATION}_COMPILE_OPTIONS ${NBL_${CONFIGURATION}_COMPILE_OPTIONS}) + endforeach() + + # update map with global compile options + list(APPEND NBL_COMPILE_OPTIONS $<$:${NBL_${NBL_COMPILER_LANGUAGE}_COMPILE_OPTIONS}>) + + set(NBL_COMPILE_OPTIONS ${NBL_COMPILE_OPTIONS}) +endforeach() function(NBL_EXT_P_APPEND_COMPILE_OPTIONS NBL_LIST_NAME MAP_RELEASE MAP_RELWITHDEBINFO MAP_DEBUG) macro(NBL_MAP_CONFIGURATION NBL_CONFIG_FROM NBL_CONFIG_TO) @@ -173,7 +213,7 @@ function(nbl_adjust_flags) set(MAPPED_CONFIG 
$>) - if(MSVC) + if(MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL MSVC) if(NBL_SANITIZE_ADDRESS) set(NBL_TARGET_MSVC_DEBUG_INFORMATION_FORMAT "$<$,$>:ProgramDatabase>") else() diff --git a/cmake/adjust/template/vendor/CXX_Clang.cmake b/cmake/adjust/template/vendor/CXX_Clang.cmake new file mode 100644 index 0000000000..4ab7d4ae83 --- /dev/null +++ b/cmake/adjust/template/vendor/CXX_Clang.cmake @@ -0,0 +1,45 @@ +include_guard(GLOBAL) + +# Debug +set(NBL_CXX_DEBUG_COMPILE_OPTIONS + -ggdb3 -Wall -fno-omit-frame-pointer -fstack-protector-strong +) + +# Release +set(NBL_CXX_RELEASE_COMPILE_OPTIONS + -fexpensive-optimizations +) + +# RelWithDebInfo +set(NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS "") + +# Global +list(APPEND NBL_CXX_COMPILE_OPTIONS + -Wextra + -fno-strict-aliasing + -msse4.2 + -mfpmath=sse + -Wextra + -Wno-sequence-point + -Wno-unused-parameter + -Wno-unused-but-set-parameter + -Wno-error=ignored-attributes + -Wno-error=unused-function + -Wno-error=unused-variable + -Wno-error=unused-parameter + -Wno-error=ignored-attributes + -Wno-error=non-pod-varargs + -fno-exceptions +) + +if(NBL_SANITIZE_ADDRESS) + list(APPEND NBL_CXX_COMPILE_OPTIONS -fsanitize=address) +endif() + +if(NBL_SANITIZE_THREAD) + list(APPEND NBL_CXX_COMPILE_OPTIONS -fsanitize=thread) +endif() + +# our pervious flags-set function called this, does not affect flags nor configs so I will keep it here temporary +# TODO: move it out from the profile +link_libraries(-fuse-ld=gold) \ No newline at end of file diff --git a/cmake/adjust/template/vendor/CXX_MSVC.cmake b/cmake/adjust/template/vendor/CXX_MSVC.cmake new file mode 100644 index 0000000000..8b07390ed6 --- /dev/null +++ b/cmake/adjust/template/vendor/CXX_MSVC.cmake @@ -0,0 +1,46 @@ +include_guard(GLOBAL) + +# https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 + +# The default instruction set is SSE2 if no /arch option is specified. 
+if(NBL_REQUEST_SSE_4_2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT("/arch:SSE4.2") +endif() + +# Enables Intel Advanced Vector Extensions 2. +if(NBL_REQUEST_SSE_AXV2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT("/arch:AVX2") +endif() + +# Debug +set(NBL_CXX_DEBUG_COMPILE_OPTIONS + /Zc:__cplusplus /Ob0 /Od /MP${_NBL_JOBS_AMOUNT_} /fp:fast /Zc:wchar_t /INCREMENTAL +) + +if(NBL_SANITIZE_ADDRESS) + list(APPEND NBL_CXX_DEBUG_COMPILE_OPTIONS /RTC1) +endif() + +set(NBL_DEBUG_CXX_COMPILE_OPTIONS + $<$:${NBL_CXX_DEBUG_COMPILE_OPTIONS}> +) + +# Release +set(NBL_CXX_RELEASE_COMPILE_OPTIONS + /Zc:__cplusplus /O2 /Ob2 /DNDEBUG /GL /MP${_NBL_JOBS_AMOUNT_} /Gy- /Zc:wchar_t /sdl- /GF /GS- /fp:fast +) + +# RelWithDebInfo +set(NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS + /Zc:__cplusplus /O2 /Ob1 /DNDEBUG /GL /Zc:wchar_t /MP${_NBL_JOBS_AMOUNT_} /Gy /sdl- /Oy- /fp:fast +) + +if(NBL_SANITIZE_ADDRESS) + list(APPEND NBL_CXX_COMPILE_OPTIONS /fsanitize=address) +endif() + +# this should also be not part of profile, pasting from old flags-set function temporary +# TODO: use profile + +#reason for INCREMENTAL:NO: https://docs.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=vs-2019 /LTCG is not valid for use with /INCREMENTAL. 
+set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO} /INCREMENTAL:NO /LTCG:incremental") diff --git a/cmake/adjust/template/vendor/C_Clang.cmake b/cmake/adjust/template/vendor/C_Clang.cmake new file mode 100644 index 0000000000..e4eb0d6ad9 --- /dev/null +++ b/cmake/adjust/template/vendor/C_Clang.cmake @@ -0,0 +1,46 @@ +include_guard(GLOBAL) + +# Debug +set(NBL_C_DEBUG_COMPILE_OPTIONS + -ggdb3 -Wall -fno-omit-frame-pointer -fstack-protector-strong +) + +# Release +set(NBL_C_RELEASE_COMPILE_OPTIONS + -fexpensive-optimizations +) + +# RelWithDebInfo +set(NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS "") + +# Global +list(APPEND NBL_C_COMPILE_OPTIONS + -Wextra + -fno-strict-aliasing + -msse4.2 + -mfpmath=sse + -maes + -Wextra + -Wno-sequence-point + -Wno-unused-parameter + -Wno-unused-but-set-parameter + -Wno-error=ignored-attributes + -Wno-error=unused-function + -Wno-error=unused-variable + -Wno-error=unused-parameter + -Wno-error=ignored-attributes + -Wno-error=non-pod-varargs + -fno-exceptions +) + +if(NBL_SANITIZE_ADDRESS) + list(APPEND NBL_C_COMPILE_OPTIONS -fsanitize=address) +endif() + +if(NBL_SANITIZE_THREAD) + list(APPEND NBL_C_COMPILE_OPTIONS -fsanitize=thread) +endif() + +# our pervious flags-set function called this, does not affect flags nor configs so I will keep it here temporary +# TODO: move it out from the profile +link_libraries(-fuse-ld=gold) \ No newline at end of file diff --git a/cmake/adjust/template/windows/msvc.cmake b/cmake/adjust/template/vendor/C_MSVC.cmake similarity index 58% rename from cmake/adjust/template/windows/msvc.cmake rename to cmake/adjust/template/vendor/C_MSVC.cmake index e0eaa82e80..76bace680f 100644 --- a/cmake/adjust/template/windows/msvc.cmake +++ b/cmake/adjust/template/vendor/C_MSVC.cmake @@ -21,51 +21,20 @@ if(NBL_SANITIZE_ADDRESS) list(APPEND NBL_C_DEBUG_COMPILE_OPTIONS /RTC1) endif() -set(NBL_CXX_DEBUG_COMPILE_OPTIONS - /Zc:__cplusplus ${NBL_C_DEBUG_COMPILE_OPTIONS} -) - 
-set(NBL_DEBUG_COMPILE_OPTIONS - $<$:${NBL_CXX_DEBUG_COMPILE_OPTIONS}> - $<$:${NBL_C_DEBUG_COMPILE_OPTIONS}> -) - # Release set(NBL_C_RELEASE_COMPILE_OPTIONS /O2 /Ob2 /DNDEBUG /GL /MP${_NBL_JOBS_AMOUNT_} /Gy- /Zc:wchar_t /sdl- /GF /GS- /fp:fast ) -set(NBL_CXX_RELEASE_COMPILE_OPTIONS - /Zc:__cplusplus ${NBL_C_RELEASE_COMPILE_OPTIONS} -) - -set(NBL_RELEASE_COMPILE_OPTIONS - $<$:${NBL_CXX_RELEASE_COMPILE_OPTIONS}> - $<$:${NBL_C_RELEASE_COMPILE_OPTIONS}> -) # RelWithDebInfo set(NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS /O2 /Ob1 /DNDEBUG /GL /Zc:wchar_t /MP${_NBL_JOBS_AMOUNT_} /Gy /sdl- /Oy- /fp:fast ) -set(NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS - /Zc:__cplusplus ${NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS} -) - -set(NBL_RELWITHDEBINFO_COMPILE_OPTIONS - $<$:${NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS}> - $<$:${NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS}> -) if(NBL_SANITIZE_ADDRESS) list(APPEND NBL_C_COMPILE_OPTIONS /fsanitize=address) - list(APPEND NBL_CXX_COMPILE_OPTIONS ${NBL_C_COMPILE_OPTIONS}) endif() -set(NBL_COMPILE_OPTIONS - $<$:${NBL_CXX_COMPILE_OPTIONS}> - $<$:${NBL_C_COMPILE_OPTIONS}> -) - # this should also be not part of profile, pasting from old flags-set function temporary # TODO: use profile diff --git a/cmake/common.cmake b/cmake/common.cmake index 86b1856ed3..d89c1ae071 100755 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -25,7 +25,7 @@ function(nbl_handle_dll_definitions _TARGET_ _SCOPE_) message(FATAL_ERROR "Internal error, requsted \"${_TARGET_}\" is not defined!") endif() - if(NBL_DYNAMIC_MSVC_RUNTIME) + if(NBL_COMPILER_DYNAMIC_RUNTIME) set(_NABLA_OUTPUT_DIR_ "${NBL_ROOT_PATH_BINARY}/src/nbl/$/devshgraphicsprogramming.nabla") target_compile_definitions(${_TARGET_} ${_SCOPE_} @@ -43,7 +43,7 @@ function(nbl_handle_runtime_lib_properties _TARGET_) message(FATAL_ERROR "Internal error, requsted \"${_TARGET_}\" is not defined!") endif() - if(NBL_DYNAMIC_MSVC_RUNTIME) + if(MSVC) set_target_properties(${_TARGET_} PROPERTIES MSVC_RUNTIME_LIBRARY 
"MultiThreaded$<$:Debug>DLL") else() set_target_properties(${_TARGET_} PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") @@ -75,7 +75,7 @@ macro(nbl_create_executable_project _EXTRA_SOURCES _EXTRA_OPTIONS _EXTRA_INCLUDE nbl_handle_runtime_lib_properties(${EXECUTABLE_NAME}) if(WIN32 AND MSVC) - if(NBL_DYNAMIC_MSVC_RUNTIME) + if(NBL_COMPILER_DYNAMIC_RUNTIME) target_link_options(${EXECUTABLE_NAME} PUBLIC "/DELAYLOAD:$") endif() diff --git a/include/nbl/asset/IFramebuffer.h b/include/nbl/asset/IFramebuffer.h index 9c78fe1e42..4f4abb89da 100644 --- a/include/nbl/asset/IFramebuffer.h +++ b/include/nbl/asset/IFramebuffer.h @@ -121,7 +121,7 @@ class IFramebuffer return true; // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkFramebufferCreateInfo.html#VUID-VkFramebufferCreateInfo-pAttachments-00884 - if (viewParams.components!=ImageViewType::SComponentMapping()) + if (viewParams.components!=typename ImageViewType::SComponentMapping()) return true; // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkFramebufferCreateInfo.html#VUID-VkFramebufferCreateInfo-flags-04533 diff --git a/include/nbl/asset/IRenderpass.h b/include/nbl/asset/IRenderpass.h index 7595911716..1e44d8b526 100644 --- a/include/nbl/asset/IRenderpass.h +++ b/include/nbl/asset/IRenderpass.h @@ -707,7 +707,7 @@ inline bool IRenderpass::SCreationParams::SSubpassDescription::SDepthStencilAtta template inline bool IRenderpass::SCreationParams::SSubpassDescription::SRenderAttachmentsRef::valid(const typename attachment_ref_t::description_t* descs, const uint32_t attachmentCount) const { - if (!render.valid(descs,attachmentCount) || !resolve.valid(descs,attachmentCount)) + if (!render.template valid(descs,attachmentCount) || !resolve.template valid(descs,attachmentCount)) return false; const bool renderUsed = render.used(); if (resolve.used()) diff --git a/include/nbl/macros.h b/include/nbl/macros.h index 4927f21899..fe93201a11 100644 --- a/include/nbl/macros.h +++ 
b/include/nbl/macros.h @@ -81,7 +81,7 @@ //! Workarounds for compiler specific bugs // MSVC 2019 is a special snowflake -#if defined(_MSC_VER) && _MSC_VER>=1920 +#if defined(_MSC_VER) && !defined(__clang__) && _MSC_VER>=1920 #define NBL_TYPENAME_4_STTC_MBR typename #else #define NBL_TYPENAME_4_STTC_MBR diff --git a/include/nbl/video/CVulkanDeviceMemoryBacked.h b/include/nbl/video/CVulkanDeviceMemoryBacked.h index c996000e04..e6d17ddf3e 100644 --- a/include/nbl/video/CVulkanDeviceMemoryBacked.h +++ b/include/nbl/video/CVulkanDeviceMemoryBacked.h @@ -47,8 +47,8 @@ class CVulkanDeviceMemoryBacked : public Interface }; #ifndef _NBL_VIDEO_C_VULKAN_DEVICE_MEMORY_BACKED_CPP_ -extern template CVulkanDeviceMemoryBacked; -extern template CVulkanDeviceMemoryBacked; +extern template class CVulkanDeviceMemoryBacked; +extern template class CVulkanDeviceMemoryBacked; #endif } // end namespace nbl::video diff --git a/include/nbl/video/ISwapchain.h b/include/nbl/video/ISwapchain.h index d052a819bd..99ba2e7975 100644 --- a/include/nbl/video/ISwapchain.h +++ b/include/nbl/video/ISwapchain.h @@ -21,6 +21,8 @@ class ISwapchain : public IBackendObject struct SSharedCreationParams { + SSharedCreationParams() {} + inline bool valid(const IPhysicalDevice* physDev, const ISurface* surface) const { ISurface::SCapabilities caps; @@ -465,10 +467,10 @@ class ISwapchain : public IBackendObject virtual const void* getNativeHandle() const = 0; // returns the maximum number of time acquires with infinite timeout which can be called before releasing the image index through present. - virtual uint8_t getMaxBlockingAcquiresBeforePresent() const = 0u; + virtual uint8_t getMaxBlockingAcquiresBeforePresent() const = 0; // returns the maximum number of acquires you can request without waiting for previous acquire semaphores to signal. 
- virtual uint8_t getMaxAcquiresInFlight() const = 0u; + virtual uint8_t getMaxAcquiresInFlight() const = 0; // only public because MultiTimelineEventHandlerST needs to know about it class DeferredFrameSemaphoreDrop final diff --git a/include/nbl/video/TimelineEventHandlers.h b/include/nbl/video/TimelineEventHandlers.h index 9405accf78..a3d6aa4c8b 100644 --- a/include/nbl/video/TimelineEventHandlers.h +++ b/include/nbl/video/TimelineEventHandlers.h @@ -410,7 +410,7 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable sum += handler->count(); else { - const auto local = handler->poll_impl(std::forward(args)...); + const auto local = handler->template poll_impl(std::forward(args)...); bailed = local.bailed; // if don't have any events left, remove the timeline if (local.eventsLeft) diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 83845b9c84..f96e031fca 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -350,7 +350,7 @@ if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) target_compile_definitions(Nabla PUBLIC NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) endif() -if(NBL_DYNAMIC_MSVC_RUNTIME) +if(NBL_COMPILER_DYNAMIC_RUNTIME) set_property(TARGET Nabla PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") else() set_property(TARGET Nabla PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") diff --git a/src/nbl/builtin/utils.cmake b/src/nbl/builtin/utils.cmake index 0a76d1c67e..04c15de86d 100644 --- a/src/nbl/builtin/utils.cmake +++ b/src/nbl/builtin/utils.cmake @@ -39,7 +39,7 @@ endmacro() # _NAMESPACE_ is a C++ namespace builtin resources will be wrapped into # _OUTPUT_INCLUDE_SEARCH_DIRECTORY_ is an absolute path to output directory for builtin resources header files which will be a search directory for generated headers outputed to ${_OUTPUT_HEADER_DIRECTORY_}/${_NAMESPACE_PREFIX_} where namespace prefix is the namespace turned into a path # _OUTPUT_SOURCE_DIRECTORY_ is an absolute path to output directory for 
builtin resources source files -# _STATIC_ optional last argument is a bool, if true then add_library will use STATIC, SHARED otherwise. Pay attention that MSVC runtime is controlled by NBL_DYNAMIC_MSVC_RUNTIME which is not an argument of this function +# _STATIC_ optional last argument is a bool, if true then add_library will use STATIC, SHARED otherwise. Pay attention that MSVC runtime is controlled by NBL_COMPILER_DYNAMIC_RUNTIME which is not an argument of this function # # As an example one could list a resource as following # LIST_BUILTIN_RESOURCE(SOME_RESOURCES_TO_EMBED "glsl/blit/default_compute_normalization.comp") @@ -207,7 +207,7 @@ function(ADD_CUSTOM_BUILTIN_RESOURCES _TARGET_NAME_ _BUNDLE_NAME_ _BUNDLE_SEARCH ) set_target_properties(${_TARGET_NAME_} PROPERTIES CXX_STANDARD 20) - if(NBL_DYNAMIC_MSVC_RUNTIME) + if(NBL_COMPILER_DYNAMIC_RUNTIME) set_property(TARGET ${_TARGET_NAME_} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") else() set_property(TARGET ${_TARGET_NAME_} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") From 114c549f13f9d8e1de7b7ea6eb53daeacc2d78a5 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Thu, 21 Nov 2024 20:21:22 +0330 Subject: [PATCH 002/346] build: one liner ifs and some fixes Signed-off-by: Ali Cheraghi --- 3rdparty/CMakeLists.txt | 6 +----- 3rdparty/dxc/CMakeLists.txt | 6 +----- CMakeLists.txt | 12 +++--------- cmake/adjust/template/vendor/CXX_MSVC.cmake | 4 ---- cmake/common.cmake | 6 +----- src/nbl/builtin/utils.cmake | 8 ++------ 6 files changed, 8 insertions(+), 34 deletions(-) diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index d8ac2a0d25..b27ea0437c 100755 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -493,11 +493,7 @@ if(ENABLE_HLSL) endif() foreach(trgt IN LISTS NBL_3RDPARTY_TARGETS) - if(NBL_COMPILER_DYNAMIC_RUNTIME) - set_property(TARGET ${trgt} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") - else() - set_property(TARGET ${trgt} PROPERTY 
MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") - endif() + set_property(TARGET ${trgt} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>$<$:DLL>") if(MSVC AND NBL_SANITIZE_ADDRESS) set_property(TARGET ${trgt} PROPERTY COMPILE_OPTIONS /fsanitize=address) diff --git a/3rdparty/dxc/CMakeLists.txt b/3rdparty/dxc/CMakeLists.txt index b6e3e21e16..2142a574ec 100644 --- a/3rdparty/dxc/CMakeLists.txt +++ b/3rdparty/dxc/CMakeLists.txt @@ -62,11 +62,7 @@ if(WIN32) endif() endif() -if(NBL_COMPILER_DYNAMIC_RUNTIME) - list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_MSVC_RUNTIME_LIBRARY:STATIC=MultiThreaded$<$:Debug>DLL") -else() - list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_MSVC_RUNTIME_LIBRARY:STATIC=MultiThreaded$<$:Debug>") -endif() +list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_MSVC_RUNTIME_LIBRARY:STATIC=MultiThreaded$<$:Debug>$<$:DLL>") # perform DXC compile standard requirement test set(CMAKE_CXX_STANDARD_REQUIRED ON) diff --git a/CMakeLists.txt b/CMakeLists.txt index a8c9013eaa..68e913770c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ option(NBL_STATIC_BUILD "" OFF) # ON for static builds, OFF for shared option(NBL_COMPILER_DYNAMIC_RUNTIME "" ON) option(NBL_SANITIZE_ADDRESS OFF) -if(MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL MSVC) +if(CMAKE_CXX_COMPILER_ID STREQUAL MSVC) if(NBL_SANITIZE_ADDRESS) set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$:ProgramDatabase>") else() @@ -34,14 +34,8 @@ endif() if(NBL_STATIC_BUILD) message(STATUS "Static Nabla build enabled!") else() - if(MSVC) - if(NBL_COMPILER_DYNAMIC_RUNTIME) - message(STATUS "Shared Nabla build enabled!") - else() - message(FATAL_ERROR "Turn NBL_COMPILER_DYNAMIC_RUNTIME on! For dynamic Nabla builds dynamic MSVC runtime is mandatory!") - endif() - else() - message(FATAL_ERROR "Nabla can't be built with shared libraries! Please make sure you are targetting Windows OS and MSVC compiler!") + if(NOT NBL_COMPILER_DYNAMIC_RUNTIME) + message(FATAL_ERROR "Turn NBL_COMPILER_DYNAMIC_RUNTIME on! 
For dynamic Nabla builds dynamic runtime is mandatory!") endif() endif() diff --git a/cmake/adjust/template/vendor/CXX_MSVC.cmake b/cmake/adjust/template/vendor/CXX_MSVC.cmake index 8b07390ed6..1abb66c9da 100644 --- a/cmake/adjust/template/vendor/CXX_MSVC.cmake +++ b/cmake/adjust/template/vendor/CXX_MSVC.cmake @@ -21,10 +21,6 @@ if(NBL_SANITIZE_ADDRESS) list(APPEND NBL_CXX_DEBUG_COMPILE_OPTIONS /RTC1) endif() -set(NBL_DEBUG_CXX_COMPILE_OPTIONS - $<$:${NBL_CXX_DEBUG_COMPILE_OPTIONS}> -) - # Release set(NBL_CXX_RELEASE_COMPILE_OPTIONS /Zc:__cplusplus /O2 /Ob2 /DNDEBUG /GL /MP${_NBL_JOBS_AMOUNT_} /Gy- /Zc:wchar_t /sdl- /GF /GS- /fp:fast diff --git a/cmake/common.cmake b/cmake/common.cmake index d89c1ae071..c663a98443 100755 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -43,11 +43,7 @@ function(nbl_handle_runtime_lib_properties _TARGET_) message(FATAL_ERROR "Internal error, requsted \"${_TARGET_}\" is not defined!") endif() - if(MSVC) - set_target_properties(${_TARGET_} PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") - else() - set_target_properties(${_TARGET_} PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") - endif() + set_target_properties(${_TARGET_} PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>$<$:DLL>") endfunction() # Macro creating project for an executable diff --git a/src/nbl/builtin/utils.cmake b/src/nbl/builtin/utils.cmake index 04c15de86d..e5b1741a95 100644 --- a/src/nbl/builtin/utils.cmake +++ b/src/nbl/builtin/utils.cmake @@ -206,12 +206,8 @@ function(ADD_CUSTOM_BUILTIN_RESOURCES _TARGET_NAME_ _BUNDLE_NAME_ _BUNDLE_SEARCH "${_OUTPUT_HEADER_DIRECTORY_}" ) set_target_properties(${_TARGET_NAME_} PROPERTIES CXX_STANDARD 20) - - if(NBL_COMPILER_DYNAMIC_RUNTIME) - set_property(TARGET ${_TARGET_NAME_} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") - else() - set_property(TARGET ${_TARGET_NAME_} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") - endif() + + set_property(TARGET ${_TARGET_NAME_} 
PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>$<$:DLL>") set(NBL_BUILTIN_RESOURCES ${NBL_BUILTIN_RESOURCES}) # turn builtin resources paths list into variable From ff5513b33e434f8ce21f06d3e71c85b59f905c99 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Thu, 21 Nov 2024 21:43:55 +0330 Subject: [PATCH 003/346] update dxc submodule Signed-off-by: Ali Cheraghi --- 3rdparty/dxc/dxc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index 5adc27f9e4..b8e1df19be 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit 5adc27f9e42de7681d65a98873048af661b9b367 +Subproject commit b8e1df19bebaf18ff1d6b9b90d7d020cf86f3205 From 44acfcfbbe0034946307f39cebaef809de386a47 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Fri, 22 Nov 2024 20:21:29 +0330 Subject: [PATCH 004/346] build: simplify if Signed-off-by: Ali Cheraghi --- cmake/adjust/flags.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/adjust/flags.cmake b/cmake/adjust/flags.cmake index 430d507c93..bec887ef1a 100644 --- a/cmake/adjust/flags.cmake +++ b/cmake/adjust/flags.cmake @@ -213,7 +213,7 @@ function(nbl_adjust_flags) set(MAPPED_CONFIG $>) - if(MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL MSVC) + if(CMAKE_CXX_COMPILER_ID STREQUAL MSVC) if(NBL_SANITIZE_ADDRESS) set(NBL_TARGET_MSVC_DEBUG_INFORMATION_FORMAT "$<$,$>:ProgramDatabase>") else() From 2b4a1214177aa5db5514664e47c7ba5f6f42fdaa Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Fri, 4 Apr 2025 12:53:51 +0200 Subject: [PATCH 005/346] save work --- tools/nsc/CMakeLists.txt | 69 ++++++++++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 9 deletions(-) diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index 1582e9ecd6..bb45442982 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -120,7 +120,9 @@ set(NBL_CE_GENERATE_CONFIG_COMMAND -P "${CMAKE_CURRENT_SOURCE_DIR}/ce-generate-config.cmake" ) 
-set(NBL_DOCKER_CE_COMPOSE_BASE "${NBL_ROOT_PATH}/docker/compiler-explorer/compose.yml") +set(NBL_DOCKER_CE_DOCKER_CTX "${NBL_ROOT_PATH}/docker/compiler-explorer") +set(NBL_DOCKER_CE_DOCKERFILE_BASE "${NBL_DOCKER_CE_DOCKER_CTX}/Dockerfile") +set(NBL_DOCKER_CE_COMPOSE_BASE "${NBL_DOCKER_CE_DOCKER_CTX}/compose.yml") cmake_path(NATIVE_PATH NBL_DOCKER_CE_COMPOSE_BASE NORMALIZE NBL_DOCKER_CE_COMPOSE_BASE) set(NBL_DOCKER_CE_COMPOSE_TARGET "${GODBOLT_BINARY_DIRECTORY}/.dev-compose.yml") @@ -273,20 +275,21 @@ ON set(BASE_IMAGE dr.devsh.eu/compiler-explorer/windows) # NOTE to self: could be all done with single docker file & compose file but buildkit works bad with windows driver, yet need to wait for stuff to be implemented -set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/docker/devel") +set(DEVEL_CTX "${CMAKE_CURRENT_BINARY_DIR}/docker/devel") set(CT_REDIST_DIR "${CT_TOOLSET_REDIST_TARGET}/${REDIST_CRT_TOOLSET_VERSION}") set(CT_NONREDIST_CTR_DIR "${CT_REDIST_DIR}/${DEBUG_CRT_RELATIVE}") cmake_path(NATIVE_PATH CT_REDIST_DIR NORMALIZE CT_REDIST_DIR) cmake_path(NATIVE_PATH CT_NONREDIST_CTR_DIR NORMALIZE CT_NONREDIST_CTR_DIR) -set(DEVEL_DOCKERFILE "${OUTPUT_DIRECTORY}/Dockerfile") +set(DEVEL_DOCKERFILE "${DEVEL_CTX}/Dockerfile") -GEN_DOCKER_CONTENT("" "${OUTPUT_DIRECTORY}" +GEN_DOCKER_CONTENT("" "${DEVEL_CTX}" [=[ -COPY --from=@DOCKER_VULKAN_TAG@ /@CT_VULKAN_TARGET@ /@CT_VULKAN_TARGET@ -COPY --from=@DOCKER_CRT_TAG@ /@CT_TOOLSET_REDIST_TARGET@ /@CT_TOOLSET_REDIST_TARGET@ +COPY --link --from=@DOCKER_VULKAN_TAG@ /@CT_VULKAN_TARGET@ /@CT_VULKAN_TARGET@ +COPY --link --from=@DOCKER_CRT_TAG@ /@CT_TOOLSET_REDIST_TARGET@ /@CT_TOOLSET_REDIST_TARGET@ -RUN .\@CT_REDIST_DIR@\vc_redist.x64.exe /quiet /install +# TODO +# RUN .\@CT_REDIST_DIR@\vc_redist.x64.exe /quiet /install RUN xcopy .\@CT_NONREDIST_CTR_DIR@\*.dll %SystemRoot%\System32 /Y RUN xcopy .\@CT_TOOLSET_REDIST_TARGET@\ucrtbased.dll %SystemRoot%\System32 /Y @@ -348,8 +351,56 @@ string(CONFIGURE "${COMPOSE_CONTENT}" 
COMPOSE_CONTENT @ONLY) file(WRITE "${NBL_DOCKER_CE_COMPOSE_TARGET}" "${COMPOSE_CONTENT}") make_directory("${GODBOLT_BINARY_DIRECTORY}/.ctx") -execute_process(COMMAND "${DOCKER_EXE}" compose -f "${NBL_DOCKER_CE_COMPOSE_BASE}" build) -execute_process(COMMAND "${DOCKER_EXE}" compose -f "${NBL_DOCKER_CE_COMPOSE_TARGET}" build) +function(_PROMOTE_PROCESS_ISOLATION_ KERNEL BASES VAR) + set(${VAR} True) + set(ix 0) + list(LENGTH BASES LEN) + + while(ix LESS ${LEN}) + list(GET BASES ${ix} BASE) + + execute_process(COMMAND "${DOCKER_EXE}" inspect --format={{.OsVersion}} ${BASE} RESULT_VARIABLE EXIT_LEVEL OUTPUT_VARIABLE TARGET_KERNEL OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(${EXIT_LEVEL} EQUAL 0) + if(${KERNEL} VERSION_LESS ${TARGET_KERNEL}) + set(${VAR} False PARENT_SCOPE) + message(STATUS "While inspecting ${BASE} - host Kernel ${KERNEL} too low to use container process isolation (target ${TARGET_KERNEL}), falling back to HyperV. Please update your host OS.") + return() + endif() + math(EXPR ix "${ix} + 1") + else() + message(STATUS "Docker image ${BASE} not found locally, pulling...") + execute_process(COMMAND "${DOCKER_EXE}" pull ${BASE}) + endif() + endwhile() + + set(${VAR} ${${VAR}} PARENT_SCOPE) +endfunction() + +execute_process(COMMAND cmd /C ver OUTPUT_VARIABLE PIPE OUTPUT_STRIP_TRAILING_WHITESPACE) +string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+" HOST_KERNEL "${PIPE}") + +set(BASES + mcr.microsoft.com/windows/nanoserver:ltsc2022 + mcr.microsoft.com/powershell:lts-nanoserver-ltsc2022 +) + +_PROMOTE_PROCESS_ISOLATION_("${HOST_KERNEL}" "${BASES}" PROMOTE_TO_PROCESS) + +function(_BUILD_IMAGE_ DOCKERFILE CTX TAG) + set(CMD "${DOCKER_EXE}" build) + if(PROMOTE_TO_PROCESS) + list(APPEND CMD --isolation "process") + endif() + list(APPEND CMD -t ${TAG} -f "${DOCKERFILE}" .) 
+ + execute_process(COMMAND ${CMD} WORKING_DIRECTORY "${CTX}") +endfunction() + +_BUILD_IMAGE_("${NBL_DOCKER_CE_DOCKERFILE_BASE}" "${NBL_DOCKER_CE_DOCKER_CTX}" godbolt/base/windows) +_BUILD_IMAGE_("${DEVEL_DOCKERFILE}" "${DEVEL_CTX}" godbolt/devel/windows) + +message(FATAL_ERROR "STOP TEST, PROMOTE_TO_PROCESS = ${PROMOTE_TO_PROCESS}") string(APPEND BAT_PRODUCTION_INSTALL [=[ From 616f7d7b210b95c079da20659f8762b7f1a743ae Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Tue, 8 Apr 2025 11:01:50 +0200 Subject: [PATCH 006/346] fixing CLang build, save work --- cmake/adjust/template/vendor/CXX_Clang.cmake | 7 +++++++ cmake/adjust/template/vendor/C_Clang.cmake | 10 ++++++++-- include/nbl/asset/IDescriptorSetLayout.h | 4 ++-- include/nbl/asset/IRenderpass.h | 1 + include/nbl/asset/utils/CSPIRVIntrospector.h | 8 ++++---- .../builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl | 2 +- 6 files changed, 23 insertions(+), 9 deletions(-) diff --git a/cmake/adjust/template/vendor/CXX_Clang.cmake b/cmake/adjust/template/vendor/CXX_Clang.cmake index 4ab7d4ae83..258fef3d8a 100644 --- a/cmake/adjust/template/vendor/CXX_Clang.cmake +++ b/cmake/adjust/template/vendor/CXX_Clang.cmake @@ -15,14 +15,21 @@ set(NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS "") # Global list(APPEND NBL_CXX_COMPILE_OPTIONS + -Wno-everything # TMP -Wextra -fno-strict-aliasing -msse4.2 + -maes -mfpmath=sse -Wextra -Wno-sequence-point -Wno-unused-parameter -Wno-unused-but-set-parameter + -Wno-c++98-compat + -Wno-c++98-compat-pedantic + -Wno-padded + -Wno-unsafe-buffer-usage + -Wno-switch-enum -Wno-error=ignored-attributes -Wno-error=unused-function -Wno-error=unused-variable diff --git a/cmake/adjust/template/vendor/C_Clang.cmake b/cmake/adjust/template/vendor/C_Clang.cmake index e4eb0d6ad9..3dc21dec15 100644 --- a/cmake/adjust/template/vendor/C_Clang.cmake +++ b/cmake/adjust/template/vendor/C_Clang.cmake @@ -15,15 +15,21 @@ set(NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS "") # Global list(APPEND NBL_C_COMPILE_OPTIONS + 
-Wno-everything # TMP -Wextra -fno-strict-aliasing -msse4.2 - -mfpmath=sse - -maes + -maes + -mfpmath=sse -Wextra -Wno-sequence-point -Wno-unused-parameter -Wno-unused-but-set-parameter + -Wno-c++98-compat + -Wno-c++98-compat-pedantic + -Wno-padded + -Wno-unsafe-buffer-usage + -Wno-switch-enum -Wno-error=ignored-attributes -Wno-error=unused-function -Wno-error=unused-variable diff --git a/include/nbl/asset/IDescriptorSetLayout.h b/include/nbl/asset/IDescriptorSetLayout.h index 44e8be71ea..ec3c182fdc 100644 --- a/include/nbl/asset/IDescriptorSetLayout.h +++ b/include/nbl/asset/IDescriptorSetLayout.h @@ -330,7 +330,7 @@ class IDescriptorSetLayout : public IDescriptorSetLayoutBase bindings[i].binding = i; bindings[i].type = type; bindings[i].createFlags = SBinding::E_CREATE_FLAGS::ECF_NONE; - bindings[i].stageFlags = stageAccessFlags ? stageAccessFlags[i]:asset::IShader::ESS_ALL_OR_LIBRARY; + bindings[i].stageFlags = stageAccessFlags ? stageAccessFlags[i]:asset::IShader::E_SHADER_STAGE::ESS_ALL_OR_LIBRARY; bindings[i].count = counts ? counts[i]:1u; bindings[i].samplers = nullptr; } @@ -354,7 +354,7 @@ class IDescriptorSetLayout : public IDescriptorSetLayoutBase for (uint32_t b = 0u; b < bindingCnt; ++b) { auto bindingNumber = m_descriptorRedirects[t].m_bindingNumbers[b]; - CBindingRedirect::template binding_number_t otherBindingNumber(CBindingRedirect::Invalid); + CBindingRedirect::binding_number_t otherBindingNumber(CBindingRedirect::Invalid); // TODO: std::find instead? 
for (uint32_t ob = 0u; ob < otherBindingCnt; ++ob) { diff --git a/include/nbl/asset/IRenderpass.h b/include/nbl/asset/IRenderpass.h index 657b0fcaff..b9554fc2a6 100644 --- a/include/nbl/asset/IRenderpass.h +++ b/include/nbl/asset/IRenderpass.h @@ -81,6 +81,7 @@ class NBL_API2 IRenderpass { bool valid() const; }; + // The arrays pointed to by this array must be terminated by `DepthStencilAttachmentsEnd` value, which implicitly satisfies a few VUIDs constexpr static inline SDepthStencilAttachmentDescription DepthStencilAttachmentsEnd = {}; const SDepthStencilAttachmentDescription* depthStencilAttachments = &DepthStencilAttachmentsEnd; diff --git a/include/nbl/asset/utils/CSPIRVIntrospector.h b/include/nbl/asset/utils/CSPIRVIntrospector.h index f756a58a42..7a2310a62e 100644 --- a/include/nbl/asset/utils/CSPIRVIntrospector.h +++ b/include/nbl/asset/utils/CSPIRVIntrospector.h @@ -326,8 +326,8 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable template inline std::enable_if_t isLastMemberRuntimeSized() const { - if (type->memberCount) - return type->memberTypes()[type->memberCount-1].count.front().isRuntimeSized(); + if (this->type->memberCount) + return this->type->memberTypes()[this->type->memberCount-1].count.front().isRuntimeSized(); return false; } template @@ -335,9 +335,9 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable { if (isLastMemberRuntimeSized()) { - const auto& lastMember = type->memberTypes()[type->memberCount-1]; + const auto& lastMember = this->type->memberTypes()[this->type->memberCount-1]; assert(!lastMember.count.front().isSpecConstantID); - return sizeWithoutLastMember+lastMemberElementCount*type->memberStrides()[type->memberCount-1]; + return sizeWithoutLastMember+lastMemberElementCount* this->type->memberStrides()[this->type->memberCount-1]; } return sizeWithoutLastMember; } diff --git a/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl b/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl index 
0309b78e0d..94da595ef2 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl +++ b/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl @@ -598,7 +598,7 @@ struct nClamp_helper using return_t = T; static inline return_t __call(const T x, const T _min, const T _max) { - return nMin_helper::_call(nMax_helper::_call(x, _min), _max); + return nMin_helper::_call(nMin_helper::_call(x, _min), _max); } }; From aad8bb1445ffece46681f11c73bf5372421ea5d0 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Tue, 8 Apr 2025 14:23:40 +0200 Subject: [PATCH 007/346] make Nabla Clang build work --- cmake/adjust/template/vendor/CXX_Clang.cmake | 1 - cmake/adjust/template/vendor/C_Clang.cmake | 3 +-- include/nbl/asset/IRenderpass.h | 16 +++++++++++----- include/nbl/asset/filters/CBlitImageFilter.h | 2 +- .../nbl/asset/filters/kernels/WeightFunctions.h | 4 ++-- include/nbl/asset/utils/CSPIRVIntrospector.h | 8 +++++++- include/nbl/video/ILogicalDevice.h | 2 +- include/nbl/video/utilities/CSubpassKiln.h | 2 +- src/nbl/video/CVulkanCommandBuffer.cpp | 2 +- src/nbl/video/CVulkanDeviceMemoryBacked.cpp | 4 ++-- src/nbl/video/IGPUAccelerationStructure.cpp | 8 ++++---- src/nbl/video/utilities/CAssetConverter.cpp | 14 +++++++------- 12 files changed, 38 insertions(+), 28 deletions(-) diff --git a/cmake/adjust/template/vendor/CXX_Clang.cmake b/cmake/adjust/template/vendor/CXX_Clang.cmake index 258fef3d8a..62c12075d1 100644 --- a/cmake/adjust/template/vendor/CXX_Clang.cmake +++ b/cmake/adjust/template/vendor/CXX_Clang.cmake @@ -15,7 +15,6 @@ set(NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS "") # Global list(APPEND NBL_CXX_COMPILE_OPTIONS - -Wno-everything # TMP -Wextra -fno-strict-aliasing -msse4.2 diff --git a/cmake/adjust/template/vendor/C_Clang.cmake b/cmake/adjust/template/vendor/C_Clang.cmake index 3dc21dec15..1c00f78e84 100644 --- a/cmake/adjust/template/vendor/C_Clang.cmake +++ b/cmake/adjust/template/vendor/C_Clang.cmake @@ -14,8 +14,7 @@ 
set(NBL_C_RELEASE_COMPILE_OPTIONS set(NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS "") # Global -list(APPEND NBL_C_COMPILE_OPTIONS - -Wno-everything # TMP +list(APPEND NBL_C_COMPILE_OPTIONS -Wextra -fno-strict-aliasing -msse4.2 diff --git a/include/nbl/asset/IRenderpass.h b/include/nbl/asset/IRenderpass.h index b9554fc2a6..ce41e35573 100644 --- a/include/nbl/asset/IRenderpass.h +++ b/include/nbl/asset/IRenderpass.h @@ -83,10 +83,10 @@ class NBL_API2 IRenderpass }; // The arrays pointed to by this array must be terminated by `DepthStencilAttachmentsEnd` value, which implicitly satisfies a few VUIDs - constexpr static inline SDepthStencilAttachmentDescription DepthStencilAttachmentsEnd = {}; + static const SDepthStencilAttachmentDescription DepthStencilAttachmentsEnd; // have to initialize out of line because of https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88165 const SDepthStencilAttachmentDescription* depthStencilAttachments = &DepthStencilAttachmentsEnd; // The arrays pointed to by this array must be terminated by `ColorAttachmentsEnd` value, which implicitly satisfies a few VUIDs - constexpr static inline SColorAttachmentDescription ColorAttachmentsEnd = {}; + static const SColorAttachmentDescription ColorAttachmentsEnd; // have to initialize out of line because of https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88165 const SColorAttachmentDescription* colorAttachments = &ColorAttachmentsEnd; struct SSubpassDescription final @@ -200,7 +200,7 @@ class NBL_API2 IRenderpass SColorAttachmentsRef colorAttachments[MaxColorAttachments] = {}; // The arrays pointed to by this array must be terminated by `InputAttachmentsEnd` value - constexpr static inline SInputAttachmentRef InputAttachmentsEnd = {}; + static const SInputAttachmentRef InputAttachmentsEnd; const SInputAttachmentRef* inputAttachments = &InputAttachmentsEnd; struct SPreserveAttachmentRef @@ -233,7 +233,7 @@ class NBL_API2 IRenderpass // 
https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSubpassDescription2.html#VUID-VkSubpassDescription2-pipelineBindPoint-04953 //E_PIPELINE_BIND_POINT pipelineBindPoint : 2 = EPBP_GRAPHICS; }; - constexpr static inline SSubpassDescription SubpassesEnd = {}; + static const SSubpassDescription SubpassesEnd; const SSubpassDescription* subpasses = &SubpassesEnd; struct SSubpassDependency final @@ -259,7 +259,7 @@ class NBL_API2 IRenderpass bool valid() const; }; // The arrays pointed to by this array must be terminated by `DependenciesEnd` value - constexpr static inline SSubpassDependency DependenciesEnd = {}; + static const SSubpassDependency DependenciesEnd; const SSubpassDependency* dependencies = &DependenciesEnd; @@ -380,6 +380,12 @@ class NBL_API2 IRenderpass uint32_t m_loadOpColorAttachmentEnd = ~0u; }; +constexpr inline IRenderpass::SCreationParams::SDepthStencilAttachmentDescription IRenderpass::SCreationParams::DepthStencilAttachmentsEnd = {}; +constexpr inline IRenderpass::SCreationParams::SColorAttachmentDescription IRenderpass::SCreationParams::ColorAttachmentsEnd = {}; +constexpr inline IRenderpass::SCreationParams::SSubpassDescription::SInputAttachmentRef IRenderpass::SCreationParams::SSubpassDescription::InputAttachmentsEnd = {}; +constexpr inline IRenderpass::SCreationParams::SSubpassDescription IRenderpass::SCreationParams::SubpassesEnd = {}; +constexpr inline IRenderpass::SCreationParams::SSubpassDependency IRenderpass::SCreationParams::DependenciesEnd = {}; + inline bool IRenderpass::compatible(const IRenderpass* other) const { // If you find yourself spending a lot of time here in your profile, go ahead and implement a precomputed hash and store it in the renderpass diff --git a/include/nbl/asset/filters/CBlitImageFilter.h b/include/nbl/asset/filters/CBlitImageFilter.h index 1dbc7809ba..f228fea325 100644 --- a/include/nbl/asset/filters/CBlitImageFilter.h +++ b/include/nbl/asset/filters/CBlitImageFilter.h @@ -464,7 +464,7 @@ class 
CBlitImageFilter : auto phaseCount = IBlitUtilities::getPhaseCount(inExtentLayerCount.xyz, outExtentLayerCount.xyz, inImageType); phaseCount = hlsl::max(phaseCount,hlsl::uint32_t3(1,1,1)); - const auto axisOffsets = blit_utils_t::template getScaledKernelPhasedLUTAxisOffsets(phaseCount,real_window_size); + const auto axisOffsets = blit_utils_t::getScaledKernelPhasedLUTAxisOffsets(phaseCount,real_window_size); constexpr auto MaxAxisCount = 3; lut_value_t* scaledKernelPhasedLUTPixel[MaxAxisCount]; for (auto i = 0; i < MaxAxisCount; ++i) diff --git a/include/nbl/asset/filters/kernels/WeightFunctions.h b/include/nbl/asset/filters/kernels/WeightFunctions.h index bb0b8fb9b4..af2782dfac 100644 --- a/include/nbl/asset/filters/kernels/WeightFunctions.h +++ b/include/nbl/asset/filters/kernels/WeightFunctions.h @@ -337,12 +337,12 @@ class CWeightFunction1D final : public impl::IWeightFunction1Dscale(base_t::value_t(1)/stretchFactor); + this->scale(typename base_t::value_t(1)/stretchFactor); } inline base_t::value_t weight(const float x) const { - return static_cast(this->getTotalScale()*function_t::weight(x*this->getInvStretch())); + return static_cast(this->getTotalScale()*function_t::template weight(x*this->getInvStretch())); } // Integral of `weight(x) dx` from -INF to +INF diff --git a/include/nbl/asset/utils/CSPIRVIntrospector.h b/include/nbl/asset/utils/CSPIRVIntrospector.h index 7a2310a62e..45fcb0e3a7 100644 --- a/include/nbl/asset/utils/CSPIRVIntrospector.h +++ b/include/nbl/asset/utils/CSPIRVIntrospector.h @@ -208,7 +208,13 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable // `memberStrides[i]` only relevant if `memberTypes[i]->isArray()` inline ptr_t memberStrides() const {return memberOffsets()+memberCount;} using member_matrix_info_t = MatrixInfo; - inline ptr_t memberMatrixInfos() const {return reinterpret_cast&>(memberStrides()+memberCount); } + inline ptr_t memberMatrixInfos() const + { + auto t = memberStrides() + memberCount; + + return 
reinterpret_cast&>(t); + + } constexpr static inline size_t StoragePerMember = sizeof(member_type_t)+sizeof(member_name_t)+sizeof(member_size_t)+sizeof(member_offset_t)+sizeof(member_stride_t)+sizeof(member_matrix_info_t); diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 0cc6608b16..46f7dc1ce7 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -1502,7 +1502,7 @@ inline bool ILogicalDevice::validateMemoryBarrier(const uint32_t queueFamilyInde return false; }; // CANNOT CHECK: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkImageMemoryBarrier2-oldLayout-01197 - if (mismatchedLayout.operator()(barrier.oldLayout) || mismatchedLayout.operator()(barrier.newLayout)) + if (mismatchedLayout.template operator()(barrier.oldLayout) || mismatchedLayout.template operator()(barrier.newLayout)) return false; } diff --git a/include/nbl/video/utilities/CSubpassKiln.h b/include/nbl/video/utilities/CSubpassKiln.h index 7df6cc0caa..c41ec3dd7e 100644 --- a/include/nbl/video/utilities/CSubpassKiln.h +++ b/include/nbl/video/utilities/CSubpassKiln.h @@ -198,7 +198,7 @@ class CSubpassKiln if (begin==end) return; - bake_impl(cmdbuf->getOriginDevice()->getPhysicalDevice()->getLimits().indirectDrawCount, drawIndirectBuffer, drawCountBuffer)(cmdbuf, begin, end); + bake_impl(cmdbuf->getOriginDevice()->getPhysicalDevice()->getLimits().drawIndirectCount, drawIndirectBuffer, drawCountBuffer)(cmdbuf, begin, end); } protected: diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index b569a5fde2..9f0b0a83e1 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -661,7 +661,7 @@ bool CVulkanCommandBuffer::beginRenderPass_impl(const SRenderpassBeginInfo& info .renderArea = info.renderArea, // Implicitly but could be optimizedif needed // 
https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkRenderPassBeginInfo.html#VUID-VkRenderPassBeginInfo-clearValueCount-00902 - .clearValueCount = vk_clearValues.size()/sizeof(VkClearValue), + .clearValueCount = static_cast(vk_clearValues.size()/sizeof(VkClearValue)), // Implicit // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkRenderPassBeginInfo.html#VUID-VkRenderPassBeginInfo-clearValueCount-04962 .pClearValues = vk_clearValues.data() diff --git a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp index 2bec9e9d06..90b2993cb3 100644 --- a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp +++ b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp @@ -40,7 +40,7 @@ CVulkanDeviceMemoryBacked::CVulkanDeviceMemoryBacked( assert(vkHandle!=VK_NULL_HANDLE); } -template CVulkanDeviceMemoryBacked; -template CVulkanDeviceMemoryBacked; +template class CVulkanDeviceMemoryBacked; +template class CVulkanDeviceMemoryBacked; } \ No newline at end of file diff --git a/src/nbl/video/IGPUAccelerationStructure.cpp b/src/nbl/video/IGPUAccelerationStructure.cpp index eafbe08d6f..ae78754b1e 100644 --- a/src/nbl/video/IGPUAccelerationStructure.cpp +++ b/src/nbl/video/IGPUAccelerationStructure.cpp @@ -140,11 +140,11 @@ uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(cons retval += geometryCount*MaxBuffersPerGeometry; return retval; } -template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::template valid(const uint32_t* const) const; -template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::template valid(const uint32_t* const) const; +template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const uint32_t* const) const; +template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const uint32_t* const) const; using BuildRangeInfo = hlsl::acceleration_structures::bottom_level::BuildRangeInfo; -template uint32_t 
IGPUBottomLevelAccelerationStructure::BuildInfo::template valid(const BuildRangeInfo* const) const; -template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::template valid(const BuildRangeInfo* const) const; +template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const BuildRangeInfo* const) const; +template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const BuildRangeInfo* const) const; bool IGPUBottomLevelAccelerationStructure::validVertexFormat(const asset::E_FORMAT format) const { diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 796f3dcaec..fdb5c61ca8 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2142,7 +2142,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { for (auto& entry : conversionRequests) for (auto i=0ull; i(entry.first,entry.second.firstCopyIx,i,device->createSampler(entry.second.canonicalAsset->getParams())); + assign.template operator()(entry.first,entry.second.firstCopyIx,i,device->createSampler(entry.second.canonicalAsset->getParams())); } if constexpr (std::is_same_v) { @@ -2461,7 +2461,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { // since we don't have dependants we don't care about our group ID // we create threadsafe pipeline caches, because we have no idea how they may be used - assign.operator()(entry.first,entry.second.firstCopyIx,i,device->createPipelineCache(asset,false)); + assign.template operator()(entry.first,entry.second.firstCopyIx,i,device->createPipelineCache(asset,false)); } } } @@ -2506,7 +2506,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { // since we don't have dependants we don't care about our group ID // we create threadsafe pipeline caches, because we have no idea how they may be used - 
assign.operator()(entry.first,entry.second.firstCopyIx,i,device->createRenderpass(asset->getCreationParameters())); + assign.template operator()(entry.first,entry.second.firstCopyIx,i,device->createRenderpass(asset->getCreationParameters())); } } } @@ -2653,7 +2653,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult gpuObj.get()->setObjectDebugName(debugName.str().c_str()); } // insert into staging cache - stagingCache.emplace(gpuObj.get(),CCache::key_t(contentHash,uniqueCopyGroupID)); + stagingCache.emplace(gpuObj.get(),typename CCache::key_t(contentHash,uniqueCopyGroupID)); // propagate back to dfsCache created.gpuObj = std::move(gpuObj); // record if a device memory allocation will be needed @@ -2668,11 +2668,11 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // this is super annoying, was hoping metaprogramming with `has_type` would actually work auto getConversionRequests = [&]()->auto&{return std::get>(retval.m_conversionRequests);}; if constexpr (std::is_same_v) - getConversionRequests.operator()().emplace_back(core::smart_refctd_ptr(instance.asset),created.gpuObj.get());; + getConversionRequests.template operator()().emplace_back(core::smart_refctd_ptr(instance.asset),created.gpuObj.get());; if constexpr (std::is_same_v) { const uint16_t recomputeMips = created.patch.recomputeMips; - getConversionRequests.operator()().emplace_back(core::smart_refctd_ptr(instance.asset),created.gpuObj.get(),recomputeMips); + getConversionRequests.template operator()().emplace_back(core::smart_refctd_ptr(instance.asset),created.gpuObj.get(),recomputeMips); } // TODO: BLAS and TLAS requests } @@ -2939,7 +2939,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // if something with this content hash is in the stagingCache, then it must match the `found->gpuObj` if (auto finalCacheIt=stagingCache.find(gpuObj.get()); finalCacheIt!=stagingCache.end()) { - const bool matches = 
finalCacheIt->second==CCache::key_t(found.contentHash,uniqueCopyGroupID); + const bool matches = finalCacheIt->second==typename CCache::key_t(found.contentHash,uniqueCopyGroupID); assert(matches); } } From 7b8cb61f0cbd56580a216e02c87a3627a28d7a5d Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Tue, 8 Apr 2025 15:05:51 +0200 Subject: [PATCH 008/346] bad typo --- include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl b/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl index 94da595ef2..0d95c032b0 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl +++ b/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl @@ -598,7 +598,7 @@ struct nClamp_helper using return_t = T; static inline return_t __call(const T x, const T _min, const T _max) { - return nMin_helper::_call(nMin_helper::_call(x, _min), _max); + return nMin_helper::_call(nMax_helper::_call(x, _min), _max); } }; From 062b5baa4ae3af2284f39b9b6d983b8a55c354a7 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Fri, 11 Apr 2025 12:00:42 +0200 Subject: [PATCH 009/346] update profiles & flags check handle, take care of Clang profile; enter new compile errors after upgrading VS and toolsets (coming from DXC source files) --- cmake/adjust/flags.cmake | 59 ++++++++++----- cmake/adjust/template/vendor/CXX_Clang.cmake | 52 +------------ cmake/adjust/template/vendor/CXX_MSVC.cmake | 43 +---------- cmake/adjust/template/vendor/C_Clang.cmake | 52 +------------ cmake/adjust/template/vendor/C_MSVC.cmake | 45 +---------- cmake/adjust/template/vendor/impl/Clang.cmake | 75 +++++++++++++++++++ cmake/adjust/template/vendor/impl/MSVC.cmake | 71 ++++++++++++++++++ cmake/adjust/template/vendor/impl/reset.cmake | 8 ++ 8 files changed, 208 insertions(+), 197 deletions(-) create mode 100644 cmake/adjust/template/vendor/impl/Clang.cmake create mode 100644 
cmake/adjust/template/vendor/impl/MSVC.cmake create mode 100644 cmake/adjust/template/vendor/impl/reset.cmake diff --git a/cmake/adjust/flags.cmake b/cmake/adjust/flags.cmake index eb16a95791..ead5a086e6 100644 --- a/cmake/adjust/flags.cmake +++ b/cmake/adjust/flags.cmake @@ -12,32 +12,57 @@ define_property(TARGET PROPERTY NBL_CONFIGURATION_MAP BRIEF_DOCS "Stores configuration map for a target, it will evaluate to the configuration it's mapped to" ) -function(NBL_REQUEST_COMPILE_OPTION_SUPPORT _NBL_COMPILE_OPTION_) - set(NBL_COMPILE_OPTION "${_NBL_COMPILE_OPTION_}") +# Usage: NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG CONFIG OPTIONS ) +# LANG, CONFIG - optional, OPTIONS - required +function(NBL_REQUEST_COMPILE_OPTION_SUPPORT) + cmake_parse_arguments(IMPL "" "" "LANG;CONFIG;OPTIONS" ${ARGN}) - foreach(COMPILER IN ITEMS c cxx) + set(DEFAULT_COMPILERS c cxx) + + if(NOT IMPL_LANG) + list(APPEND IMPL_LANG ${DEFAULT_COMPILERS}) + endif() + + if(NOT IMPL_OPTIONS) + message(FATAL_ERROR "NBL_REQUEST_COMPILE_OPTION_SUPPORT's OPTIONS empty!") + endif() + + foreach(COMPILER IN ITEMS ${IMPL_LANG}) string(TOUPPER "${COMPILER}" COMPILER_UPPER) - string(REGEX REPLACE "[-=:;/.]" "_" flag_signature "${NBL_COMPILE_OPTION}") - set(flag_var "__${COMPILER_UPPER}_Flag_${flag_signature}") + if(COMPILER_UPPER STREQUAL C) + macro(VALIDATE_FLAG) + check_c_compiler_flag(${ARGV}) + endmacro() + elseif(COMPILER_UPPER STREQUAL CXX) + macro(VALIDATE_FLAG) + check_cxx_compiler_flag(${ARGV}) + endmacro() + endif() + + foreach(COMPILE_OPTION ${IMPL_OPTIONS}) + string(REGEX REPLACE "[-=:;/.]" "_" FLAG_SIGNATURE "${COMPILE_OPTION}") + set(FLAG_VAR "NBL_${COMPILER_UPPER}_COMPILER_HAS_${FLAG_SIGNATURE}_FLAG") - if(COMPILER STREQUAL "c") - check_c_compiler_flag("${NBL_COMPILE_OPTION}" ${flag_var}) - elseif(COMPILER STREQUAL "cxx") - check_cxx_compiler_flag("${NBL_COMPILE_OPTION}" ${flag_var}) - endif() + VALIDATE_FLAG("${COMPILE_OPTION}" "${FLAG_VAR}") - if(${flag_var}) - message(STATUS "Enabled 
\"${NBL_COMPILE_OPTION}\" ${COMPILER_UPPER} compile option for Nabla projects!") - set(NBL_${COMPILER_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_COMPILE_OPTIONS};${NBL_COMPILE_OPTION}" PARENT_SCOPE) - else() - message(STATUS "Disabled \"${NBL_COMPILE_OPTION}\" ${COMPILER_UPPER} compile option for Nabla projects! (no support)") - endif() + if(${FLAG_VAR}) + if(IMPL_CONFIG) + foreach(CONFIG ${IMPL_CONFIG}) + # TODO: validate (${CONFIG} \in ${CMAKE_CONFIGURATION_TYPES}) + string(TOUPPER "${CONFIG}" CONFIG_UPPER) + set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS};${COMPILE_OPTION}" PARENT_SCOPE) + endforeach() + else() + set(NBL_${COMPILER_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_COMPILE_OPTIONS};${COMPILE_OPTION}" PARENT_SCOPE) + endif() + endif() + endforeach() endforeach() endfunction() option(NBL_REQUEST_SSE_4_2 "Request compilation with SSE 4.2 instruction set enabled for Nabla projects" ON) -option(NBL_REQUEST_SSE_AXV2 "Request compilation with SSE Intel Advanced Vector Extensions 2 for Nabla projects" ON) +option(NBL_REQUEST_SSE_AVX2 "Request compilation with SSE Intel Advanced Vector Extensions 2 for Nabla projects" ON) # profiles foreach(NBL_COMPILER_LANGUAGE IN ITEMS C CXX) diff --git a/cmake/adjust/template/vendor/CXX_Clang.cmake b/cmake/adjust/template/vendor/CXX_Clang.cmake index 62c12075d1..2cc877c028 100644 --- a/cmake/adjust/template/vendor/CXX_Clang.cmake +++ b/cmake/adjust/template/vendor/CXX_Clang.cmake @@ -1,51 +1,5 @@ include_guard(GLOBAL) -# Debug -set(NBL_CXX_DEBUG_COMPILE_OPTIONS - -ggdb3 -Wall -fno-omit-frame-pointer -fstack-protector-strong -) - -# Release -set(NBL_CXX_RELEASE_COMPILE_OPTIONS - -fexpensive-optimizations -) - -# RelWithDebInfo -set(NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS "") - -# Global -list(APPEND NBL_CXX_COMPILE_OPTIONS - -Wextra - -fno-strict-aliasing - -msse4.2 - -maes - -mfpmath=sse - -Wextra - -Wno-sequence-point - -Wno-unused-parameter - 
-Wno-unused-but-set-parameter - -Wno-c++98-compat - -Wno-c++98-compat-pedantic - -Wno-padded - -Wno-unsafe-buffer-usage - -Wno-switch-enum - -Wno-error=ignored-attributes - -Wno-error=unused-function - -Wno-error=unused-variable - -Wno-error=unused-parameter - -Wno-error=ignored-attributes - -Wno-error=non-pod-varargs - -fno-exceptions -) - -if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_CXX_COMPILE_OPTIONS -fsanitize=address) -endif() - -if(NBL_SANITIZE_THREAD) - list(APPEND NBL_CXX_COMPILE_OPTIONS -fsanitize=thread) -endif() - -# our pervious flags-set function called this, does not affect flags nor configs so I will keep it here temporary -# TODO: move it out from the profile -link_libraries(-fuse-ld=gold) \ No newline at end of file +set(LANG CXX) +include("${CMAKE_CURRENT_LIST_DIR}/impl/Clang.cmake") +# append unique CXX options here \ No newline at end of file diff --git a/cmake/adjust/template/vendor/CXX_MSVC.cmake b/cmake/adjust/template/vendor/CXX_MSVC.cmake index 1abb66c9da..59f4e59cdd 100644 --- a/cmake/adjust/template/vendor/CXX_MSVC.cmake +++ b/cmake/adjust/template/vendor/CXX_MSVC.cmake @@ -1,42 +1,5 @@ include_guard(GLOBAL) -# https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 - -# The default instruction set is SSE2 if no /arch option is specified. -if(NBL_REQUEST_SSE_4_2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT("/arch:SSE4.2") -endif() - -# Enables Intel Advanced Vector Extensions 2. 
-if(NBL_REQUEST_SSE_AXV2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT("/arch:AVX2") -endif() - -# Debug -set(NBL_CXX_DEBUG_COMPILE_OPTIONS - /Zc:__cplusplus /Ob0 /Od /MP${_NBL_JOBS_AMOUNT_} /fp:fast /Zc:wchar_t /INCREMENTAL -) - -if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_CXX_DEBUG_COMPILE_OPTIONS /RTC1) -endif() - -# Release -set(NBL_CXX_RELEASE_COMPILE_OPTIONS - /Zc:__cplusplus /O2 /Ob2 /DNDEBUG /GL /MP${_NBL_JOBS_AMOUNT_} /Gy- /Zc:wchar_t /sdl- /GF /GS- /fp:fast -) - -# RelWithDebInfo -set(NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS - /Zc:__cplusplus /O2 /Ob1 /DNDEBUG /GL /Zc:wchar_t /MP${_NBL_JOBS_AMOUNT_} /Gy /sdl- /Oy- /fp:fast -) - -if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_CXX_COMPILE_OPTIONS /fsanitize=address) -endif() - -# this should also be not part of profile, pasting from old flags-set function temporary -# TODO: use profile - -#reason for INCREMENTAL:NO: https://docs.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=vs-2019 /LTCG is not valid for use with /INCREMENTAL. 
-set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO} /INCREMENTAL:NO /LTCG:incremental") +set(LANG CXX) +include("${CMAKE_CURRENT_LIST_DIR}/impl/MSVC.cmake") +# append unique CXX options here \ No newline at end of file diff --git a/cmake/adjust/template/vendor/C_Clang.cmake b/cmake/adjust/template/vendor/C_Clang.cmake index 1c00f78e84..046ccaa902 100644 --- a/cmake/adjust/template/vendor/C_Clang.cmake +++ b/cmake/adjust/template/vendor/C_Clang.cmake @@ -1,51 +1,5 @@ include_guard(GLOBAL) -# Debug -set(NBL_C_DEBUG_COMPILE_OPTIONS - -ggdb3 -Wall -fno-omit-frame-pointer -fstack-protector-strong -) - -# Release -set(NBL_C_RELEASE_COMPILE_OPTIONS - -fexpensive-optimizations -) - -# RelWithDebInfo -set(NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS "") - -# Global -list(APPEND NBL_C_COMPILE_OPTIONS - -Wextra - -fno-strict-aliasing - -msse4.2 - -maes - -mfpmath=sse - -Wextra - -Wno-sequence-point - -Wno-unused-parameter - -Wno-unused-but-set-parameter - -Wno-c++98-compat - -Wno-c++98-compat-pedantic - -Wno-padded - -Wno-unsafe-buffer-usage - -Wno-switch-enum - -Wno-error=ignored-attributes - -Wno-error=unused-function - -Wno-error=unused-variable - -Wno-error=unused-parameter - -Wno-error=ignored-attributes - -Wno-error=non-pod-varargs - -fno-exceptions -) - -if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_C_COMPILE_OPTIONS -fsanitize=address) -endif() - -if(NBL_SANITIZE_THREAD) - list(APPEND NBL_C_COMPILE_OPTIONS -fsanitize=thread) -endif() - -# our pervious flags-set function called this, does not affect flags nor configs so I will keep it here temporary -# TODO: move it out from the profile -link_libraries(-fuse-ld=gold) \ No newline at end of file +set(LANG C) +include("${CMAKE_CURRENT_LIST_DIR}/impl/Clang.cmake") +# append unique C options here \ No newline at end of file diff --git a/cmake/adjust/template/vendor/C_MSVC.cmake b/cmake/adjust/template/vendor/C_MSVC.cmake index ddc0007bb5..f9aca4a5b7 100644 --- 
a/cmake/adjust/template/vendor/C_MSVC.cmake +++ b/cmake/adjust/template/vendor/C_MSVC.cmake @@ -1,44 +1,5 @@ include_guard(GLOBAL) -# https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 - -# The default instruction set is SSE2 if no /arch option is specified. -if(NBL_REQUEST_SSE_4_2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT("/arch:SSE4.2") -endif() - -# Enables Intel Advanced Vector Extensions 2. -if(NBL_REQUEST_SSE_AXV2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT("/arch:AVX2") -endif() - -NBL_REQUEST_COMPILE_OPTION_SUPPORT(/Zc:preprocessor) - -# Debug -set(NBL_C_DEBUG_COMPILE_OPTIONS - /Ob0 /Od /MP${_NBL_JOBS_AMOUNT_} /fp:fast /Zc:wchar_t /INCREMENTAL -) - -if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_C_DEBUG_COMPILE_OPTIONS /RTC1) -endif() - -# Release -set(NBL_C_RELEASE_COMPILE_OPTIONS - /O2 /Ob2 /DNDEBUG /GL /MP${_NBL_JOBS_AMOUNT_} /Gy- /Zc:wchar_t /sdl- /GF /GS- /fp:fast -) - -# RelWithDebInfo -set(NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS - /O2 /Ob1 /DNDEBUG /GL /Zc:wchar_t /MP${_NBL_JOBS_AMOUNT_} /Gy /sdl- /Oy- /fp:fast -) - -if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_C_COMPILE_OPTIONS /fsanitize=address) -endif() - -# this should also be not part of profile, pasting from old flags-set function temporary -# TODO: use profile - -#reason for INCREMENTAL:NO: https://docs.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=vs-2019 /LTCG is not valid for use with /INCREMENTAL. 
-set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO} /INCREMENTAL:NO /LTCG:incremental") \ No newline at end of file +set(LANG C) +include("${CMAKE_CURRENT_LIST_DIR}/impl/MSVC.cmake") +# append unique C options here \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/Clang.cmake b/cmake/adjust/template/vendor/impl/Clang.cmake new file mode 100644 index 0000000000..868309f828 --- /dev/null +++ b/cmake/adjust/template/vendor/impl/Clang.cmake @@ -0,0 +1,75 @@ +include("${CMAKE_CURRENT_LIST_DIR}/reset.cmake") + +# vendor template with options fitting for both C and CXX LANGs + +if(NOT DEFINED LANG) + message(FATAL_ERROR "LANG must be defined!") +endif() + +if(NBL_REQUEST_SSE_4_2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS + -msse4.2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang1-msse4.2 + ) +endif() + +if(NBL_REQUEST_SSE_AVX2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS + -mavx2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mavx2 + ) +endif() + +list(APPEND NBL_${LANG}_COMPILE_OPTIONS + -Wextra # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning + -maes # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-maes + -mfpmath=sse # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mfpmath + + # TODO: Yas, eliminate all below + -fno-strict-aliasing + -Wno-sequence-point + -Wno-c++98-compat + -Wno-c++98-compat-pedantic + -Wno-padded + -Wno-unsafe-buffer-usage + -Wno-switch-enum + -Wno-error=ignored-attributes + -Wno-unused-parameter + -Wno-unused-but-set-parameter + -Wno-error=unused-function + -Wno-error=unused-variable + -Wno-error=unused-parameter + -Wno-error=ignored-attributes + -Wno-error=non-pod-varargs +) + +if(NBL_SANITIZE_ADDRESS) + list(APPEND NBL_${LANG}_COMPILE_OPTIONS -fsanitize=address) +endif() + +if(NBL_SANITIZE_THREAD) + list(APPEND 
NBL_${LANG}_COMPILE_OPTIONS -fsanitize=thread) +endif() + +set(NBL_${LANG}_DEBUG_COMPILE_OPTIONS + -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g + -mincremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + -fincremental-extensions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fincremental-extensions + -Wall # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning + -fstack-protector-strong # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fstack-protector-strong + -gline-tables-only # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-gline-tables-only + -fno-omit-frame-pointer # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fomit-frame-pointer + -fno-inline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions +) + +set(NBL_${LANG}_RELEASE_COMPILE_OPTIONS + -O2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg + -finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible +) + +set(NBL_${LANG}_RELWITHDEBINFO_COMPILE_OPTIONS + -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g + -O1 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg + -finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + -fno-omit-frame-pointer # 
https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fomit-frame-pointer +) \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/MSVC.cmake b/cmake/adjust/template/vendor/impl/MSVC.cmake new file mode 100644 index 0000000000..5b73b9073e --- /dev/null +++ b/cmake/adjust/template/vendor/impl/MSVC.cmake @@ -0,0 +1,71 @@ +include("${CMAKE_CURRENT_LIST_DIR}/reset.cmake") + +# vendor template with options fitting for both C and CXX LANGs + +if(NOT DEFINED LANG) + message(FATAL_ERROR "LANG must be defined!") +endif() + +if(NBL_REQUEST_SSE_4_2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS + /arch:SSE4.2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 + ) +endif() + +if(NBL_REQUEST_SSE_AVX2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS + /arch:AVX2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 + ) +endif() + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS + /Zc:preprocessor # https://learn.microsoft.com/en-us/cpp/build/reference/zc-preprocessor?view=msvc-170 +) + +list(APPEND NBL_${LANG}_COMPILE_OPTIONS + /Zc:__cplusplus # https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus?view=msvc-170 + /Zc:wchar_t # https://learn.microsoft.com/en-us/cpp/build/reference/zc-wchar-t-wchar-t-is-native-type?view=msvc-170 + /fp:fast # https://learn.microsoft.com/en-us/cpp/build/reference/fp-specify-floating-point-behavior?view=msvc-170 + /MP${_NBL_JOBS_AMOUNT_} # https://learn.microsoft.com/en-us/cpp/build/reference/mp-build-with-multiple-processes?view=msvc-170 +) + +if(NBL_SANITIZE_ADDRESS) + list(APPEND NBL_${LANG}_COMPILE_OPTIONS + /fsanitize=address # https://learn.microsoft.com/en-us/cpp/build/reference/fsanitize?view=msvc-170 + ) + + list(APPEND NBL_${LANG}_DEBUG_COMPILE_OPTIONS + /RTC1 # https://learn.microsoft.com/en-us/cpp/build/reference/rtc-run-time-error-checks?view=msvc-170 + ) +endif() + +list(APPEND 
NBL_${LANG}_DEBUG_COMPILE_OPTIONS + /Ob0 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 + /Od # https://learn.microsoft.com/en-us/cpp/build/reference/od-disable-debug?view=msvc-170 + /INCREMENTAL # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 + /Oy- # https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 +) + +list(APPEND NBL_${LANG}_RELEASE_COMPILE_OPTIONS + /O2 # https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 + /Ob2 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 + /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 + /DNDEBUG # https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 + /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 + /Gy- # https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 + /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 + /GF # https://learn.microsoft.com/en-us/cpp/build/reference/gf-eliminate-duplicate-strings?view=msvc-170 + /GS- # https://learn.microsoft.com/en-us/cpp/build/reference/gs-buffer-security-check?view=msvc-170 +) + +list(APPEND NBL_${LANG}_RELWITHDEBINFO_COMPILE_OPTIONS + /O2 # https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 + /Ob1 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 + /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 + /LTCG:incremental # 
https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 + /Oy- # https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 + /DNDEBUG # https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 + /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 + /Gy # https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 + /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 +) \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/reset.cmake b/cmake/adjust/template/vendor/impl/reset.cmake new file mode 100644 index 0000000000..6eb95b6cfd --- /dev/null +++ b/cmake/adjust/template/vendor/impl/reset.cmake @@ -0,0 +1,8 @@ +# reset profile vars, for sanity + +foreach(LANG CXX C) + unset(NBL_${LANG}_COMPILE_OPTIONS) + unset(NBL_${LANG}_RELEASE_COMPILE_OPTIONS) + unset(NBL_${LANG}_RELWITHDEBINFO_COMPILE_OPTIONS) + unset(NBL_${LANG}_DEBUG_COMPILE_OPTIONS) +endforeach() \ No newline at end of file From 39bb3e1ba6d46710f8d6a4e98741737da6a2f02f Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Fri, 11 Apr 2025 15:40:30 +0200 Subject: [PATCH 010/346] update dxc submodule with fixed clang 19.1.1 build, upgrade & correct NBL_REQUEST_COMPILE_OPTION_SUPPORT, add required instruction set features for simdjson explicitly; now I hit GLI errors due to bad templates --- 3rdparty/dxc/dxc | 2 +- cmake/adjust/flags.cmake | 26 ++++++++++++++++--- cmake/adjust/template/vendor/impl/Clang.cmake | 14 ++++++++++ 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index b2e75826b7..49b89ae671 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit b2e75826b70d85d03686dd8a755ef477b4fa3807 +Subproject commit 
49b89ae6712f74fba2352e099f024724bcc32673 diff --git a/cmake/adjust/flags.cmake b/cmake/adjust/flags.cmake index ead5a086e6..6982e0593d 100644 --- a/cmake/adjust/flags.cmake +++ b/cmake/adjust/flags.cmake @@ -15,9 +15,10 @@ define_property(TARGET PROPERTY NBL_CONFIGURATION_MAP # Usage: NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG CONFIG OPTIONS ) # LANG, CONFIG - optional, OPTIONS - required function(NBL_REQUEST_COMPILE_OPTION_SUPPORT) - cmake_parse_arguments(IMPL "" "" "LANG;CONFIG;OPTIONS" ${ARGN}) + cmake_parse_arguments(IMPL "" "REQUEST_VAR;REQUIRED" "LANG;CONFIG;OPTIONS" ${ARGN}) set(DEFAULT_COMPILERS c cxx) + set(REQUEST_ALL_OPTIONS_PRESENT True) if(NOT IMPL_LANG) list(APPEND IMPL_LANG ${DEFAULT_COMPILERS}) @@ -51,14 +52,33 @@ function(NBL_REQUEST_COMPILE_OPTION_SUPPORT) foreach(CONFIG ${IMPL_CONFIG}) # TODO: validate (${CONFIG} \in ${CMAKE_CONFIGURATION_TYPES}) string(TOUPPER "${CONFIG}" CONFIG_UPPER) - set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS};${COMPILE_OPTION}" PARENT_SCOPE) + set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS};${COMPILE_OPTION}") endforeach() else() - set(NBL_${COMPILER_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_COMPILE_OPTIONS};${COMPILE_OPTION}" PARENT_SCOPE) + set(NBL_${COMPILER_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_COMPILE_OPTIONS};${COMPILE_OPTION}") endif() + else() + if(IMPL_REQUIRED) + message(FATAL_ERROR "Terminating, NBL_REQUEST_COMPILE_OPTION_SUPPORT was invoked with REQUIRED qualifier!") + endif() + + set(REQUEST_ALL_OPTIONS_PRESENT False) endif() endforeach() + + if(IMPL_CONFIG) + foreach(CONFIG ${IMPL_CONFIG}) + string(TOUPPER "${CONFIG}" CONFIG_UPPER) + set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS ${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS} PARENT_SCOPE) + endforeach() + else() + set(NBL_${COMPILER_UPPER}_COMPILE_OPTIONS 
${NBL_${COMPILER_UPPER}_COMPILE_OPTIONS} PARENT_SCOPE) + endif() endforeach() + + if(IMPL_REQUEST_VAR) + set(${IMPL_REQUEST_VAR} ${REQUEST_ALL_OPTIONS_PRESENT} PARENT_SCOPE) + endif() endfunction() option(NBL_REQUEST_SSE_4_2 "Request compilation with SSE 4.2 instruction set enabled for Nabla projects" ON) diff --git a/cmake/adjust/template/vendor/impl/Clang.cmake b/cmake/adjust/template/vendor/impl/Clang.cmake index 868309f828..63549974c6 100644 --- a/cmake/adjust/template/vendor/impl/Clang.cmake +++ b/cmake/adjust/template/vendor/impl/Clang.cmake @@ -18,6 +18,20 @@ if(NBL_REQUEST_SSE_AVX2) ) endif() +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS + # latest Clang(CL) 19.1.1 shipped with VS seems to require explicitly features to be listed (simdjson) + # TODO: Yas, use with REQUEST_VAR, if the request fail then do not promote simdjson to build with + # HASWELL implementation because those flags + avx2 compose subset it wants in this case + + # also instead of enabling single options maybe we could consider requesting an + # instruction implementation set instead, eg -march=haswel, though this approach + # could add a few more flags then we actually need while building - to rethink + + -mbmi # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mbmi + -mlzcnt # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mlzcnt + -mpclmul # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mpclmul +) + list(APPEND NBL_${LANG}_COMPILE_OPTIONS -Wextra # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning -maes # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-maes From cbb4db1c448e9e03972ad20bb23d880db4408361 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Fri, 11 Apr 2025 18:42:25 +0200 Subject: [PATCH 011/346] update GLI (use custom location for GLM + fix with templates) and GLM (to latest and our own fork not mine, this one was 6 years 
old) submodules --- .gitmodules | 6 +++--- 3rdparty/CMakeLists.txt | 9 ++------- 3rdparty/gli | 2 +- 3rdparty/glm | 2 +- src/nbl/CMakeLists.txt | 2 +- 5 files changed, 8 insertions(+), 13 deletions(-) diff --git a/.gitmodules b/.gitmodules index 8edc1cead9..caca5b69a1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -27,9 +27,6 @@ path = 3rdparty/libexpat url = git@github.com:Devsh-Graphics-Programming/libexpat.git branch = master -[submodule "3rdparty/glm"] - path = 3rdparty/glm - url = git@github.com:AnastaZIuk/glm.git [submodule "3rdparty/freetype2"] path = 3rdparty/freetype2 url = git@github.com:Devsh-Graphics-Programming/freetype.git @@ -117,3 +114,6 @@ [submodule "docker/compiler-explorer"] path = docker/compiler-explorer url = git@github.com:Devsh-Graphics-Programming/Compiler-Explorer-Docker.git +[submodule "3rdparty/glm"] + path = 3rdparty/glm + url = git@github.com:Devsh-Graphics-Programming/glm.git diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index d838f92127..0335baf7e5 100755 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -231,7 +231,7 @@ if(_NBL_COMPILE_WITH_OPEN_EXR_) endif() -#gli +# gli option(_NBL_COMPILE_WITH_GLI_ "Build with GLI library" ON) if(_NBL_COMPILE_WITH_GLI_) set(_OLD_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) @@ -240,6 +240,7 @@ if(_NBL_COMPILE_WITH_GLI_) set(BUILD_SHARED_LIBS OFF) set(BUILD_STATIC_LIBS OFF) set(BUILD_TESTING OFF) + set(GLI_GLM_LOCATION "${CMAKE_CURRENT_SOURCE_DIR}/glm") add_subdirectory(gli gli EXCLUDE_FROM_ALL) set(BUILD_SHARED_LIBS ${_OLD_BUILD_SHARED_LIBS}) set(BUILD_STATIC_LIBS ${_OLD_BUILD_STATIC_LIBS}) @@ -419,12 +420,6 @@ add_library(aesGladman OBJECT add_subdirectory(argparse argparse EXCLUDE_FROM_ALL) -option(GLM_TEST_ENABLE_SIMD_SSE4_2 "Enable SSE 4.2 optimizations" ON) -option(GLM_TEST_ENABLE "Build unit tests" OFF) -#add_subdirectory(glm EXCLUDE_FROM_ALL) -set(BUILD_SHARED_LIBS ${_OLD_BUILD_SHARED_LIBS}) -set(BUILD_STATIC_LIBS ${_OLD_BUILD_STATIC_LIBS}) - if 
(NBL_BUILD_MITSUBA_LOADER) option(BUILD_tools "EXPAT: build the xmlwf tool for expat library" OFF) option(BUILD_examples "EXPAT: build the examples for expat library" OFF) diff --git a/3rdparty/gli b/3rdparty/gli index 559cbe1ec3..c4e6446d3b 160000 --- a/3rdparty/gli +++ b/3rdparty/gli @@ -1 +1 @@ -Subproject commit 559cbe1ec38878e182507d331e0780fbae5baf15 +Subproject commit c4e6446d3b646538026fd5a95533daed952878d4 diff --git a/3rdparty/glm b/3rdparty/glm index d162eee1e6..2d4c4b4dd3 160000 --- a/3rdparty/glm +++ b/3rdparty/glm @@ -1 +1 @@ -Subproject commit d162eee1e6f7c317a09229fe6ceab8ec6ab9a4b4 +Subproject commit 2d4c4b4dd31fde06cfffad7915c2b3006402322f diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 26acb8de10..bde7182ebd 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -308,7 +308,7 @@ endif() set(COMMON_INCLUDE_DIRS ${THIRD_PARTY_SOURCE_DIR}/glm - ${THIRD_PARTY_SOURCE_DIR}/renderdoc # for renderdoc api header + ${THIRD_PARTY_SOURCE_DIR}/renderdoc # for renderdoc api header ${CMAKE_BINARY_DIR}/3rdparty/zlib #for dynamically generated zconf.h $ #for dynamically generated pnglibconf.h $ #for dynamically generated jconfig.h From c1cc48b0454b2f5f3d58e6be59fa4ce20fb86717 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Sun, 13 Apr 2025 12:39:20 +0200 Subject: [PATCH 012/346] explicitly set limits for Clang toolset, correct some of backend options which must be passed with proxy XClang arg (they were ignored before), use NBL_REQUEST_COMPILE_OPTION_SUPPORT for Clang profile hence enforce flags validation at configure time (TODO: do the same for MSVC). 
It still crashes at JIT loader's cpp with -1073741819 - windooze access violation, I need to attach diagnostic outputs for LLVM team --- cmake/adjust/flags.cmake | 8 ++- cmake/adjust/template/vendor/impl/Clang.cmake | 55 ++++++++++++------- 2 files changed, 40 insertions(+), 23 deletions(-) diff --git a/cmake/adjust/flags.cmake b/cmake/adjust/flags.cmake index 6982e0593d..1718ac0520 100644 --- a/cmake/adjust/flags.cmake +++ b/cmake/adjust/flags.cmake @@ -15,7 +15,7 @@ define_property(TARGET PROPERTY NBL_CONFIGURATION_MAP # Usage: NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG CONFIG OPTIONS ) # LANG, CONFIG - optional, OPTIONS - required function(NBL_REQUEST_COMPILE_OPTION_SUPPORT) - cmake_parse_arguments(IMPL "" "REQUEST_VAR;REQUIRED" "LANG;CONFIG;OPTIONS" ${ARGN}) + cmake_parse_arguments(IMPL "REQUIRED" "REQUEST_VAR" "LANG;CONFIG;OPTIONS" ${ARGN}) set(DEFAULT_COMPILERS c cxx) set(REQUEST_ALL_OPTIONS_PRESENT True) @@ -43,7 +43,9 @@ function(NBL_REQUEST_COMPILE_OPTION_SUPPORT) foreach(COMPILE_OPTION ${IMPL_OPTIONS}) string(REGEX REPLACE "[-=:;/.]" "_" FLAG_SIGNATURE "${COMPILE_OPTION}") - set(FLAG_VAR "NBL_${COMPILER_UPPER}_COMPILER_HAS_${FLAG_SIGNATURE}_FLAG") + + set(TEST_NAME "NBL_${COMPILER_UPPER}_COMPILER_HAS_${FLAG_SIGNATURE}_FLAG") + set(FLAG_VAR ${TEST_NAME}) VALIDATE_FLAG("${COMPILE_OPTION}" "${FLAG_VAR}") @@ -59,7 +61,7 @@ function(NBL_REQUEST_COMPILE_OPTION_SUPPORT) endif() else() if(IMPL_REQUIRED) - message(FATAL_ERROR "Terminating, NBL_REQUEST_COMPILE_OPTION_SUPPORT was invoked with REQUIRED qualifier!") + message(FATAL_ERROR "${TEST_NAME} (a.k.a \"${COMPILE_OPTION}\") failed because its marked as REQUIRED!") endif() set(REQUEST_ALL_OPTIONS_PRESENT False) diff --git a/cmake/adjust/template/vendor/impl/Clang.cmake b/cmake/adjust/template/vendor/impl/Clang.cmake index 63549974c6..62c1c2568b 100644 --- a/cmake/adjust/template/vendor/impl/Clang.cmake +++ b/cmake/adjust/template/vendor/impl/Clang.cmake @@ -9,15 +9,32 @@ endif() if(NBL_REQUEST_SSE_4_2) 
NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -msse4.2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang1-msse4.2 - ) + REQUIRED) # TODO: (****) optional but then adjust 3rdparty options on fail endif() if(NBL_REQUEST_SSE_AVX2) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -mavx2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mavx2 - ) + REQUIRED) # TODO: (****) endif() +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS + -Xclang=-fconstexpr-backtrace-limit=696969 + -Xclang=-fconstexpr-depth=696969 + -Xclang=-fconstexpr-steps=696969 + -Xclang=-ftemplate-backtrace-limit=0 # no limit + -Xclang=-ftemplate-depth=696969 + -Xclang=-fmacro-backtrace-limit=0 # no limit + -Xclang=-fspell-checking-limit=0 # no limit + -Xclang=-fcaret-diagnostics-max-lines=0 # no limit + + # whenever clang frontend or backend crashes we put diagnostics into top build direcotry + # use it to make a repro and attach to an issue - it outputs preprocessed cpp files with + # sh script for compilation + -fcrash-diagnostics=compiler + "-fcrash-diagnostics-dir=${NBL_ROOT_PATH_BINARY}/.crash-report" +REQUIRED) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS # latest Clang(CL) 19.1.1 shipped with VS seems to require explicitly features to be listed (simdjson) # TODO: Yas, use with REQUEST_VAR, if the request fail then do not promote simdjson to build with @@ -27,12 +44,13 @@ NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS # instruction implementation set instead, eg -march=haswel, though this approach # could add a few more flags then we actually need while building - to rethink + ################ + # TODO: (****) -> -mbmi # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mbmi -mlzcnt # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mlzcnt -mpclmul # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mpclmul -) + 
################ <- -list(APPEND NBL_${LANG}_COMPILE_OPTIONS -Wextra # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning -maes # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-maes -mfpmath=sse # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mfpmath @@ -53,37 +71,34 @@ list(APPEND NBL_${LANG}_COMPILE_OPTIONS -Wno-error=unused-parameter -Wno-error=ignored-attributes -Wno-error=non-pod-varargs -) +REQUIRED) if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_${LANG}_COMPILE_OPTIONS -fsanitize=address) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -fsanitize=address REQUIRED) endif() if(NBL_SANITIZE_THREAD) - list(APPEND NBL_${LANG}_COMPILE_OPTIONS -fsanitize=thread) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -fsanitize=thread) endif() -set(NBL_${LANG}_DEBUG_COMPILE_OPTIONS +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG OPTIONS -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g -mincremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible - -fincremental-extensions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fincremental-extensions + -Xclang=-fincremental-extensions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fincremental-extensions -Wall # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning - -fstack-protector-strong # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fstack-protector-strong -gline-tables-only # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-gline-tables-only - -fno-omit-frame-pointer # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fomit-frame-pointer - -fno-inline-functions # 
https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions -) + -Xclang=-fno-inline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions +REQUIRED) -set(NBL_${LANG}_RELEASE_COMPILE_OPTIONS +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE OPTIONS -O2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg - -finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible -) +REQUIRED) -set(NBL_${LANG}_RELWITHDEBINFO_COMPILE_OPTIONS +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO OPTIONS -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g -O1 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg - -finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible - -fno-omit-frame-pointer # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fomit-frame-pointer -) \ No newline at end of file +REQUIRED) \ No newline at end of file From 16088b980f69b9c13c973b98e28459a48a10abf2 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Mon, 14 Apr 2025 10:31:47 +0200 Subject: [PATCH 013/346] Reduce device_capabilities_traits_jit.h instructions & use std::ostringstream for generated line, make it build with Clang(CL) 19.1.1 --- 
src/nbl/device/DeviceGen.py | 4 ++-- src/nbl/device/gen.py | 2 +- src/nbl/video/CJITIncludeLoader.cpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nbl/device/DeviceGen.py b/src/nbl/device/DeviceGen.py index 288732de9b..9ad485fc84 100644 --- a/src/nbl/device/DeviceGen.py +++ b/src/nbl/device/DeviceGen.py @@ -562,7 +562,7 @@ def buildTraitsHeader(**params): res.append(emptyline) if 'enable_jit' in params and params['enable_jit']: - res.append("std::string jit_traits = R\"===(") + res.append("std::ostringstream oss;") buildTraitsHeaderHelper( res, @@ -582,7 +582,7 @@ def buildTraitsHeader(**params): ) if 'enable_jit' in params and params['enable_jit']: - res.append(")===\";") + res.append("std::string jit_traits = oss.str();") return res diff --git a/src/nbl/device/gen.py b/src/nbl/device/gen.py index b910d1aa8f..253d529b3d 100644 --- a/src/nbl/device/gen.py +++ b/src/nbl/device/gen.py @@ -120,7 +120,7 @@ args.jit_traits_output_path, buildTraitsHeader, type="JIT Members", - template="NBL_CONSTEXPR_STATIC_INLINE {} {} = )===\" + std::string(\"({})\") + CJITIncludeLoader::to_string({}.{}) + R\"===(;", + template="oss << \"NBL_CONSTEXPR_STATIC_INLINE {} {} = ({})\" + CJITIncludeLoader::to_string({}.{});", limits_json=limits, features_json=features, format_params=["type", "name", "type", "json_type", "cpp_name"], diff --git a/src/nbl/video/CJITIncludeLoader.cpp b/src/nbl/video/CJITIncludeLoader.cpp index edab1c046a..a9f27e5afd 100644 --- a/src/nbl/video/CJITIncludeLoader.cpp +++ b/src/nbl/video/CJITIncludeLoader.cpp @@ -49,4 +49,4 @@ std::string CJITIncludeLoader::collectDeviceCaps(const SPhysicalDeviceLimits& li return start + jit_traits + end; } -} \ No newline at end of file +} From 8f454a98a6b037b6e06f715248d03a2c84de5af5 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Mon, 14 Apr 2025 14:47:34 +0200 Subject: [PATCH 014/346] update bzip2 submodule to latest *official* revision, adjust build system + apply workaround for CLang(CL) 19.1.1 due 
to error : use of undeclared label "errhandler"; for some reason if in single translation unit we have identical label names (goto) in separate function bodies we hit this error --- 3rdparty/CMakeLists.txt | 23 +++++++++++------------ 3rdparty/bzip2 | 2 +- src/nbl/CMakeLists.txt | 8 +++++++- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 0335baf7e5..ffbf8e4cbd 100755 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -247,6 +247,16 @@ if(_NBL_COMPILE_WITH_GLI_) set(BUILD_TESTING ${_OLD_BUILD_TESTING}) endif() +set(ENABLE_STATIC_LIB ON) +set(ENABLE_SHARED_LIB OFF) +set(ENABLE_EXAMPLES OFF) +set(ENABLE_DOCS OFF) +set(ENABLE_APP OFF) +set(ENABLE_LIB_ONLY ON) +set(ENABLE_TESTS OFF) +set(ENABLE_SUMMARY OFF) +add_subdirectory(bzip2 bzip2 EXCLUDE_FROM_ALL) + add_library(lzma OBJECT lzma/C/Alloc.c lzma/C/LzFind.c @@ -263,17 +273,6 @@ add_library(lz4 OBJECT lz4/lib/xxhash.c ) - -add_library(bzip2 OBJECT - bzip2/blocksort.c - bzip2/bzlib.c - bzip2/compress.c - bzip2/crctable.c - bzip2/decompress.c - bzip2/huffman.c - bzip2/randtable.c -) - add_library(spirv_cross OBJECT nbl_spirv_cross/spirv_cfg.cpp nbl_spirv_cross/spirv_cross.cpp @@ -460,7 +459,7 @@ set(NBL_3RDPARTY_TARGETS shaderc_util shaderc jpeg-static - bzip2 + bz2_static simdjson nlohmann_json glslang diff --git a/3rdparty/bzip2 b/3rdparty/bzip2 index c4a14bb87e..f4301b0eac 160000 --- a/3rdparty/bzip2 +++ b/3rdparty/bzip2 @@ -1 +1 @@ -Subproject commit c4a14bb87ee395fb2c69ef5dbb50762fe862517e +Subproject commit f4301b0eac69eb109c5419813102be6f82d2b73a diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index bde7182ebd..0f0e4867b5 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -324,7 +324,6 @@ set(NBL_LIBRARY_CREATION_SOURCES ${NABLA_SRCS_COMMON} ${NABLA_HEADERS} $ - $ $ $ $ @@ -391,6 +390,13 @@ if(_NBL_BUILD_DPL_) target_link_libraries(Nabla INTERFACE tbb tbbmalloc tbbmalloc_proxy) endif() +# bzip2 
+if(NBL_STATIC_BUILD) + target_link_libraries(Nabla INTERFACE bz2_static) +else() + target_link_libraries(Nabla PRIVATE bz2_static) +endif() + # boost target_include_directories(Nabla PUBLIC "${BOOST_PREPROCESSOR_INCLUDE}") From b4e722a4709af985b1f18ec1d3a35b90663bba46 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Mon, 14 Apr 2025 16:14:14 +0200 Subject: [PATCH 015/346] remove `-Xclang=-fincremental-extensions` which causes funny compile errors with goto statements (https://github.com/Devsh-Graphics-Programming/Nabla/commit/8f454a98a6b037b6e06f715248d03a2c84de5af5) --- cmake/adjust/template/vendor/impl/Clang.cmake | 1 - 1 file changed, 1 deletion(-) diff --git a/cmake/adjust/template/vendor/impl/Clang.cmake b/cmake/adjust/template/vendor/impl/Clang.cmake index 62c1c2568b..1c3581d425 100644 --- a/cmake/adjust/template/vendor/impl/Clang.cmake +++ b/cmake/adjust/template/vendor/impl/Clang.cmake @@ -84,7 +84,6 @@ endif() NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG OPTIONS -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g -mincremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible - -Xclang=-fincremental-extensions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-fincremental-extensions -Wall # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning -gline-tables-only # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-gline-tables-only -Xclang=-fno-inline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions From eda05ee269c7be25c8cadfc1c82d459a86e2f692 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Tue, 15 Apr 2025 19:54:26 +0200 Subject: [PATCH 016/346] adjust MSVC profile + correct incremental link options, update NBL_REQUEST_COMPILE_OPTION_SUPPORT & build system to correctly handle compile & link options, 
validate build options at configure time --- cmake/adjust/flags.cmake | 207 +++++++++++------- cmake/adjust/template/vendor/impl/Clang.cmake | 16 +- cmake/adjust/template/vendor/impl/MSVC.cmake | 37 ++-- cmake/adjust/template/vendor/impl/reset.cmake | 12 +- src/nbl/CMakeLists.txt | 1 + 5 files changed, 162 insertions(+), 111 deletions(-) diff --git a/cmake/adjust/flags.cmake b/cmake/adjust/flags.cmake index 1718ac0520..d8519aea07 100644 --- a/cmake/adjust/flags.cmake +++ b/cmake/adjust/flags.cmake @@ -12,10 +12,13 @@ define_property(TARGET PROPERTY NBL_CONFIGURATION_MAP BRIEF_DOCS "Stores configuration map for a target, it will evaluate to the configuration it's mapped to" ) -# Usage: NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG CONFIG OPTIONS ) -# LANG, CONFIG - optional, OPTIONS - required +# https://github.com/Kitware/CMake/blob/05e77b8a27033e6fd086456bd6cef28338ff1474/Modules/Internal/CheckCompilerFlag.cmake#L26C7-L26C42 +# must be cached because parse utility clears locals in the CheckCompilerFlag module +set(CHECK_COMPILER_FLAG_OUTPUT_VARIABLE NBL_COMPILER_FLAG_OUTPUT CACHE INTERNAL "") + +# Usage: NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG CONFIG COMPILE_OPTIONS LINK_OPTIONS ) function(NBL_REQUEST_COMPILE_OPTION_SUPPORT) - cmake_parse_arguments(IMPL "REQUIRED" "REQUEST_VAR" "LANG;CONFIG;OPTIONS" ${ARGN}) + cmake_parse_arguments(IMPL "REQUIRED" "REQUEST_VAR" "LANG;CONFIG;COMPILE_OPTIONS;LINK_OPTIONS" ${ARGN}) set(DEFAULT_COMPILERS c cxx) set(REQUEST_ALL_OPTIONS_PRESENT True) @@ -24,63 +27,39 @@ function(NBL_REQUEST_COMPILE_OPTION_SUPPORT) list(APPEND IMPL_LANG ${DEFAULT_COMPILERS}) endif() - if(NOT IMPL_OPTIONS) - message(FATAL_ERROR "NBL_REQUEST_COMPILE_OPTION_SUPPORT's OPTIONS empty!") - endif() - foreach(COMPILER IN ITEMS ${IMPL_LANG}) string(TOUPPER "${COMPILER}" COMPILER_UPPER) - if(COMPILER_UPPER STREQUAL C) - macro(VALIDATE_FLAG) - check_c_compiler_flag(${ARGV}) - endmacro() - elseif(COMPILER_UPPER STREQUAL CXX) - macro(VALIDATE_FLAG) - 
check_cxx_compiler_flag(${ARGV}) - endmacro() - endif() - - foreach(COMPILE_OPTION ${IMPL_OPTIONS}) - string(REGEX REPLACE "[-=:;/.]" "_" FLAG_SIGNATURE "${COMPILE_OPTION}") - - set(TEST_NAME "NBL_${COMPILER_UPPER}_COMPILER_HAS_${FLAG_SIGNATURE}_FLAG") - set(FLAG_VAR ${TEST_NAME}) + foreach(WHAT_OPTIONS IN ITEMS IMPL_COMPILE_OPTIONS IMPL_LINK_OPTIONS) + if(NOT ${WHAT_OPTIONS}) + continue() + endif() - VALIDATE_FLAG("${COMPILE_OPTION}" "${FLAG_VAR}") + set(IMPL_OPTIONS ${${WHAT_OPTIONS}}) + string(REPLACE IMPL_ "" WHAT_OPTIONS "${WHAT_OPTIONS}") - if(${FLAG_VAR}) + foreach(COMPILE_OPTION ${IMPL_OPTIONS}) if(IMPL_CONFIG) foreach(CONFIG ${IMPL_CONFIG}) # TODO: validate (${CONFIG} \in ${CMAKE_CONFIGURATION_TYPES}) string(TOUPPER "${CONFIG}" CONFIG_UPPER) - set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS};${COMPILE_OPTION}") + set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_${WHAT_OPTIONS} "${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_${WHAT_OPTIONS}};${COMPILE_OPTION}") endforeach() else() - set(NBL_${COMPILER_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_COMPILE_OPTIONS};${COMPILE_OPTION}") - endif() - else() - if(IMPL_REQUIRED) - message(FATAL_ERROR "${TEST_NAME} (a.k.a \"${COMPILE_OPTION}\") failed because its marked as REQUIRED!") + set(NBL_${COMPILER_UPPER}_${WHAT_OPTIONS} "${NBL_${COMPILER_UPPER}_${WHAT_OPTIONS}};${COMPILE_OPTION}") endif() + endforeach() - set(REQUEST_ALL_OPTIONS_PRESENT False) + if(IMPL_CONFIG) + foreach(CONFIG ${IMPL_CONFIG}) + string(TOUPPER "${CONFIG}" CONFIG_UPPER) + set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_${WHAT_OPTIONS} ${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_${WHAT_OPTIONS}} PARENT_SCOPE) + endforeach() + else() + set(NBL_${COMPILER_UPPER}_${WHAT_OPTIONS} ${NBL_${COMPILER_UPPER}_${WHAT_OPTIONS}} PARENT_SCOPE) endif() endforeach() - - if(IMPL_CONFIG) - foreach(CONFIG ${IMPL_CONFIG}) - string(TOUPPER "${CONFIG}" CONFIG_UPPER) - 
set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS ${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_COMPILE_OPTIONS} PARENT_SCOPE) - endforeach() - else() - set(NBL_${COMPILER_UPPER}_COMPILE_OPTIONS ${NBL_${COMPILER_UPPER}_COMPILE_OPTIONS} PARENT_SCOPE) - endif() endforeach() - - if(IMPL_REQUEST_VAR) - set(${IMPL_REQUEST_VAR} ${REQUEST_ALL_OPTIONS_PRESENT} PARENT_SCOPE) - endif() endfunction() option(NBL_REQUEST_SSE_4_2 "Request compilation with SSE 4.2 instruction set enabled for Nabla projects" ON) @@ -101,42 +80,104 @@ foreach(NBL_COMPILER_LANGUAGE IN ITEMS C CXX) continue() endif() - # a profile MUST define - # - "NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_COMPILE_OPTIONS" (configuration dependent) - # - "NBL_${NBL_COMPILER_LANGUAGE}_COMPILE_OPTIONS" (global) + # a profile MUST define - # a profile MUST NOT define - # - NBL_COMPILE_OPTIONS + # - "NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_${WHAT}_OPTIONS" (configuration dependent) + # - "NBL_${NBL_COMPILER_LANGUAGE}_${WHAT}_OPTIONS" (global) - set(NBL_COMPILE_OPTIONS_VAR_NAME NBL_${NBL_COMPILER_LANGUAGE}_COMPILE_OPTIONS) - set(NBL_COMPILE_OPTIONS_VAR_VALUE ${${NBL_COMPILE_OPTIONS_VAR_NAME}}) + # a profile MUST NOT define + # - NBL_${WHAT}_OPTIONS + + # note: + # - use NBL_REQUEST_COMPILE_OPTION_SUPPORT in profile to creates those vars + # - include reset utility in profiles to init vars with empty lists - if(NOT DEFINED ${NBL_COMPILE_OPTIONS_VAR_NAME}) - message(FATAL_ERROR "\"${NBL_PROFILE_PATH}\" did not define \"${NBL_COMPILE_OPTIONS_VAR_NAME}\"!") - endif() + # TODO: DEFINITIONS for WHAT to unify the API - # update map with configuration dependent compile options - foreach(CONFIGURATION IN ITEMS RELEASE RELWITHDEBINFO DEBUG) - set(NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_NAME NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_COMPILE_OPTIONS) - set(NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_VALUE ${${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_NAME}}) + foreach(WHAT COMPILE LINK) + set(NBL_OPTIONS_VAR_NAME 
NBL_${NBL_COMPILER_LANGUAGE}_${WHAT}_OPTIONS) + set(NBL_OPTIONS_VAR_VALUE ${${NBL_OPTIONS_VAR_NAME}}) - if(NOT DEFINED ${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_NAME}) - message(FATAL_ERROR "\"${NBL_PROFILE_PATH}\" did not define \"${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_NAME}\"!") - endif() + if(NOT DEFINED ${NBL_OPTIONS_VAR_NAME}) + message(FATAL_ERROR "\"${NBL_PROFILE_PATH}\" did not define \"${NBL_OPTIONS_VAR_NAME}\"!") + endif() - list(APPEND NBL_${CONFIGURATION}_COMPILE_OPTIONS - # note that "${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_VALUE}" MUST NOT contain ANY - # $<$> generator expression in order to support our configuration mapping features - $<$:${NBL_CONFIGURATION_COMPILE_OPTIONS_VAR_VALUE}> - ) + # update map with configuration dependent compile options + foreach(CONFIGURATION IN ITEMS RELEASE RELWITHDEBINFO DEBUG) + set(NBL_CONFIGURATION_OPTIONS_VAR_NAME NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_${WHAT}_OPTIONS) + set(NBL_CONFIGURATION_OPTIONS_VAR_VALUE ${${NBL_CONFIGURATION_OPTIONS_VAR_NAME}}) - set(NBL_${CONFIGURATION}_COMPILE_OPTIONS ${NBL_${CONFIGURATION}_COMPILE_OPTIONS}) - endforeach() + if(NOT DEFINED ${NBL_CONFIGURATION_OPTIONS_VAR_NAME}) + message(FATAL_ERROR "\"${NBL_PROFILE_PATH}\" did not define \"${NBL_CONFIGURATION_OPTIONS_VAR_NAME}\"!") + endif() + + set(NBL_${CONFIGURATION}_${WHAT}_OPTIONS ${NBL_${CONFIGURATION}_${WHAT}_OPTIONS} + # note that "${NBL_CONFIGURATION_OPTIONS_VAR_VALUE}" MUST NOT contain ANY + # $<$> generator expression in order to support our configuration mapping features + $<$<${WHAT}_LANGUAGE:${NBL_COMPILER_LANGUAGE}>:${NBL_CONFIGURATION_OPTIONS_VAR_VALUE}> + ) + endforeach() + + # update map with global compile options + set(NBL_${WHAT}_OPTIONS ${NBL_${WHAT}_OPTIONS} + $<$<${WHAT}_LANGUAGE:${NBL_COMPILER_LANGUAGE}>:${NBL_${NBL_COMPILER_LANGUAGE}_${WHAT}_OPTIONS}> + ) + endforeach() + + block() + # validate build with a vendor profile, any warning diagnostic = error + # if you hit error it means the profile generates 
diagnostics due to: + # - an option (compile or link) which doesn't exist (typo? check vendor docs) + # - a set of options which invalidates an option (eg. MSVC's /INCREMENTAL with /LTCG:incremental is invalid, however linker will emit a warning by default + do a fall-back) + # https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_FLAGS.html#variable:CMAKE_%3CLANG%3E_FLAGS + # https://cmake.org/cmake/help/latest/module/CheckCompilerFlag.html#command:check_compiler_flag + + set(CMAKE_${NBL_COMPILER_LANGUAGE}_FLAGS) + + foreach(CONFIGURATION IN ITEMS Release RelWithDebInfo Debug) + set(CMAKE_TRY_COMPILE_CONFIGURATION ${CONFIGURATION}) + string(TOUPPER "${CONFIGURATION}" CONFIGURATION) + + set(TEST_NAME "NBL_${NBL_COMPILER_LANGUAGE}_LANG_${CONFIGURATION}_BUILD_OPTIONS_SUPPORT") + set(CMAKE_${NBL_COMPILER_LANGUAGE}_FLAGS_${CONFIGURATION}) + + set(COMPILE_OPTIONS ${NBL_${NBL_COMPILER_LANGUAGE}_COMPILE_OPTIONS} ${NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_COMPILE_OPTIONS}) + set(LINK_OPTIONS ${NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_LINK_OPTIONS}) + set(COMBINED ${COMPILE_OPTIONS} ${LINK_OPTIONS}) + + set(NBL_OUTPUT_FILE "${CMAKE_BINARY_DIR}/.nbl/try-compile/${TEST_NAME}.output") # no hash in output diagnostic file, desired + + string(SHA1 OPTIONS_HASH "${COMBINED}") + string(APPEND TEST_NAME "_HASH_${OPTIONS_HASH}") + + set(FLAG_VAR ${TEST_NAME}) + set(CMAKE_REQUIRED_LINK_OPTIONS ${LINK_OPTIONS}) + string(REPLACE ";" " " CLI_COMPILE_OPTIONS "${COMPILE_OPTIONS}") + + if(NBL_COMPILER_LANGUAGE STREQUAL C) + check_c_compiler_flag("${CLI_COMPILE_OPTIONS}" "${FLAG_VAR}") + elseif(NBL_COMPILER_LANGUAGE STREQUAL CXX) + check_cxx_compiler_flag("${CLI_COMPILE_OPTIONS}" "${FLAG_VAR}") + endif() + + if(NOT ${FLAG_VAR}) + if(NOT "${NBL_COMPILER_FLAG_OUTPUT}" STREQUAL "") + file(WRITE "${NBL_OUTPUT_FILE}" "${NBL_COMPILER_FLAG_OUTPUT}") # lock into file, do not cache, must read from the file because of NBL_COMPILER_FLAG_OUTPUT availability (CMake module writes an output 
only once before a signature flag status is created) + endif() - # update map with global compile options - list(APPEND NBL_COMPILE_OPTIONS $<$:${NBL_${NBL_COMPILER_LANGUAGE}_COMPILE_OPTIONS}>) + if(EXISTS "${NBL_OUTPUT_FILE}") + file(READ "${NBL_OUTPUT_FILE}" NBL_DIAGNOSTICS) + set(NBL_DIAGNOSTICS "Diagnostics:\n${NBL_DIAGNOSTICS}") + else() + set(NBL_DIAGNOSTICS) + endif() - set(NBL_COMPILE_OPTIONS ${NBL_COMPILE_OPTIONS}) + if(NOT DEFINED NBL_SKIP_BUILD_OPTIONS_VALIDATION) + message(FATAL_ERROR "${TEST_NAME} failed! To skip the validation define \"NBL_SKIP_BUILD_OPTIONS_VALIDATION\". ${NBL_DIAGNOSTICS}") + endif() + endif() + endforeach() + endblock() endforeach() function(NBL_EXT_P_APPEND_COMPILE_OPTIONS NBL_LIST_NAME MAP_RELEASE MAP_RELWITHDEBINFO MAP_DEBUG) @@ -240,23 +281,27 @@ function(nbl_adjust_flags) # global compile options list(APPEND _D_NBL_COMPILE_OPTIONS_ ${NBL_COMPILE_OPTIONS}) - - # per configuration compile options with mapping - list(APPEND _D_NBL_COMPILE_OPTIONS_ $<$:${NBL_${NBL_MAP_DEBUG_ITEM_U}_COMPILE_OPTIONS}>) - list(APPEND _D_NBL_COMPILE_OPTIONS_ $<$:${NBL_${NBL_MAP_RELEASE_ITEM_U}_COMPILE_OPTIONS}>) - list(APPEND _D_NBL_COMPILE_OPTIONS_ $<$:${NBL_${NBL_MAP_RELWITHDEBINFO_ITEM_U}_COMPILE_OPTIONS}>) - - # configuration mapping properties - string(APPEND _D_NBL_CONFIGURATION_MAP_ $<$:${NBL_MAP_DEBUG_ITEM_U}>) - string(APPEND _D_NBL_CONFIGURATION_MAP_ $<$:${NBL_MAP_RELEASE_ITEM_U}>) - string(APPEND _D_NBL_CONFIGURATION_MAP_ $<$:${NBL_MAP_RELWITHDEBINFO_ITEM_U}>) + + foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES}) + string(TOUPPER "${CONFIG}" CONFIG_U) + + # per configuration options with mapping + foreach(WHAT COMPILE LINK) + list(APPEND _D_NBL_${WHAT}_OPTIONS_ $<$:${NBL_${NBL_MAP_${CONFIG_U}_ITEM_U}_${WHAT}_OPTIONS}>) + endforeach() + + # configuration mapping properties + string(APPEND _D_NBL_CONFIGURATION_MAP_ $<$:${NBL_MAP_${CONFIG_U}_ITEM_U}>) + endforeach() set_target_properties(${NBL_TARGET_ITEM} PROPERTIES NBL_CONFIGURATION_MAP 
"${_D_NBL_CONFIGURATION_MAP_}" COMPILE_OPTIONS "${_D_NBL_COMPILE_OPTIONS_}" + LINK_OPTIONS "${_D_NBL_LINK_OPTIONS_}" ) unset(_D_NBL_CONFIGURATION_MAP_) unset(_D_NBL_COMPILE_OPTIONS_) + unset(_D_NBL_LINK_OPTIONS_) set(MAPPED_CONFIG $>) diff --git a/cmake/adjust/template/vendor/impl/Clang.cmake b/cmake/adjust/template/vendor/impl/Clang.cmake index 1c3581d425..9f9f432e98 100644 --- a/cmake/adjust/template/vendor/impl/Clang.cmake +++ b/cmake/adjust/template/vendor/impl/Clang.cmake @@ -9,13 +9,13 @@ endif() if(NBL_REQUEST_SSE_4_2) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -msse4.2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang1-msse4.2 - REQUIRED) # TODO: (****) optional but then adjust 3rdparty options on fail +) # TODO: (****) optional but then adjust 3rdparty options on fail endif() if(NBL_REQUEST_SSE_AVX2) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -mavx2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mavx2 - REQUIRED) # TODO: (****) +) # TODO: (****) endif() NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS @@ -33,7 +33,7 @@ NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS # sh script for compilation -fcrash-diagnostics=compiler "-fcrash-diagnostics-dir=${NBL_ROOT_PATH_BINARY}/.crash-report" -REQUIRED) +) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS # latest Clang(CL) 19.1.1 shipped with VS seems to require explicitly features to be listed (simdjson) @@ -71,10 +71,10 @@ NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -Wno-error=unused-parameter -Wno-error=ignored-attributes -Wno-error=non-pod-varargs -REQUIRED) +) if(NBL_SANITIZE_ADDRESS) - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -fsanitize=address REQUIRED) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -fsanitize=address) endif() if(NBL_SANITIZE_THREAD) @@ -87,17 +87,17 @@ NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG OPTIONS -Wall # 
https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning -gline-tables-only # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-gline-tables-only -Xclang=-fno-inline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions -REQUIRED) +) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE OPTIONS -O2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible -REQUIRED) +) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO OPTIONS -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g -O1 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible -REQUIRED) \ No newline at end of file +) \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/MSVC.cmake b/cmake/adjust/template/vendor/impl/MSVC.cmake index 5b73b9073e..62129690f9 100644 --- a/cmake/adjust/template/vendor/impl/MSVC.cmake +++ b/cmake/adjust/template/vendor/impl/MSVC.cmake @@ -7,22 +7,19 @@ if(NOT DEFINED LANG) endif() if(NBL_REQUEST_SSE_4_2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS /arch:SSE4.2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 - ) +) # TODO: (****) optional but then adjust 3rdparty options on fail endif() if(NBL_REQUEST_SSE_AVX2) - 
NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS /arch:AVX2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 - ) +) # TODO: (****) optional but then adjust 3rdparty options on fail endif() -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS /Zc:preprocessor # https://learn.microsoft.com/en-us/cpp/build/reference/zc-preprocessor?view=msvc-170 -) - -list(APPEND NBL_${LANG}_COMPILE_OPTIONS /Zc:__cplusplus # https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus?view=msvc-170 /Zc:wchar_t # https://learn.microsoft.com/en-us/cpp/build/reference/zc-wchar-t-wchar-t-is-native-type?view=msvc-170 /fp:fast # https://learn.microsoft.com/en-us/cpp/build/reference/fp-specify-floating-point-behavior?view=msvc-170 @@ -30,42 +27,48 @@ list(APPEND NBL_${LANG}_COMPILE_OPTIONS ) if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_${LANG}_COMPILE_OPTIONS + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS /fsanitize=address # https://learn.microsoft.com/en-us/cpp/build/reference/fsanitize?view=msvc-170 ) - list(APPEND NBL_${LANG}_DEBUG_COMPILE_OPTIONS + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS /RTC1 # https://learn.microsoft.com/en-us/cpp/build/reference/rtc-run-time-error-checks?view=msvc-170 ) endif() -list(APPEND NBL_${LANG}_DEBUG_COMPILE_OPTIONS +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS /Ob0 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 /Od # https://learn.microsoft.com/en-us/cpp/build/reference/od-disable-debug?view=msvc-170 - /INCREMENTAL # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 /Oy- # https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 + + LINK_OPTIONS + /INCREMENTAL # 
https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 ) -list(APPEND NBL_${LANG}_RELEASE_COMPILE_OPTIONS +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE COMPILE_OPTIONS /O2 # https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 /Ob2 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 - /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 /DNDEBUG # https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 /Gy- # https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 /GF # https://learn.microsoft.com/en-us/cpp/build/reference/gf-eliminate-duplicate-strings?view=msvc-170 /GS- # https://learn.microsoft.com/en-us/cpp/build/reference/gs-buffer-security-check?view=msvc-170 + + LINK_OPTIONS + /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 ) -list(APPEND NBL_${LANG}_RELWITHDEBINFO_COMPILE_OPTIONS +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO COMPILE_OPTIONS /O2 # https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 /Ob1 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 - /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 - /LTCG:incremental # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 /Oy- # 
https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 /DNDEBUG # https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 /Gy # https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 + + LINK_OPTIONS + /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 (note: cannot use with /LTCG:incremental) + /LTCG:incremental # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 ) \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/reset.cmake b/cmake/adjust/template/vendor/impl/reset.cmake index 6eb95b6cfd..fc1230f326 100644 --- a/cmake/adjust/template/vendor/impl/reset.cmake +++ b/cmake/adjust/template/vendor/impl/reset.cmake @@ -1,8 +1,10 @@ -# reset profile vars, for sanity +# init profiles vars by resetting required lists foreach(LANG CXX C) - unset(NBL_${LANG}_COMPILE_OPTIONS) - unset(NBL_${LANG}_RELEASE_COMPILE_OPTIONS) - unset(NBL_${LANG}_RELWITHDEBINFO_COMPILE_OPTIONS) - unset(NBL_${LANG}_DEBUG_COMPILE_OPTIONS) + foreach(WHAT COMPILE LINK DEFINITIONS) + set(NBL_${LANG}_${WHAT}_OPTIONS "") + foreach(CONFIG RELEASE RELWITHDEBINFO DEBUG) + set(NBL_${LANG}_${CONFIG}_${WHAT}_OPTIONS "") + endforeach() + endforeach() endforeach() \ No newline at end of file diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 0f0e4867b5..98c7620159 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -396,6 +396,7 @@ if(NBL_STATIC_BUILD) else() target_link_libraries(Nabla PRIVATE bz2_static) endif() +add_dependencies(Nabla bz2_static) # boost target_include_directories(Nabla PUBLIC 
"${BOOST_PREPROCESSOR_INCLUDE}") From 25e0120e49206a8585da7d38d174bb153e52203f Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Wed, 16 Apr 2025 11:03:32 +0200 Subject: [PATCH 017/346] get rid of MSVC Release linker fallback with /LTCG due to /GL - manually specify the flag --- cmake/adjust/template/vendor/impl/MSVC.cmake | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cmake/adjust/template/vendor/impl/MSVC.cmake b/cmake/adjust/template/vendor/impl/MSVC.cmake index 62129690f9..b1b6b01a99 100644 --- a/cmake/adjust/template/vendor/impl/MSVC.cmake +++ b/cmake/adjust/template/vendor/impl/MSVC.cmake @@ -9,13 +9,13 @@ endif() if(NBL_REQUEST_SSE_4_2) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS /arch:SSE4.2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 -) # TODO: (****) optional but then adjust 3rdparty options on fail +) # TODO: (****) should be (?) optional but then adjust 3rdparty options on fail endif() if(NBL_REQUEST_SSE_AVX2) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS /arch:AVX2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 -) # TODO: (****) optional but then adjust 3rdparty options on fail +) # TODO: (****) should be (?) 
optional but then adjust 3rdparty options on fail endif() NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS @@ -57,6 +57,7 @@ NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE COMPILE_OPTIONS LINK_OPTIONS /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 + /LTCG # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 (note: /GL implies fallback with LTCG) ) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO COMPILE_OPTIONS @@ -69,6 +70,6 @@ NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO COMPILE_OP /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 LINK_OPTIONS - /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 (note: cannot use with /LTCG:incremental) - /LTCG:incremental # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 + /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 (note: cannot use /INCREMENTAL with /LTCG:incremental, would cause fallback) + /LTCG:incremental # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 ) \ No newline at end of file From b5d6795e293eba4e6c4e3cf658aa1d0178a03248 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Thu, 17 Apr 2025 17:05:09 +0200 Subject: [PATCH 018/346] Create vendor/template/frontend/MSVC.cmake, update profiles, respect CMAKE_<LANG>_COMPILER_FRONTEND_VARIANT, fix issues with /DELAYLOAD & debug info format for ClangCL, use MSVC-frontend checking logic, inherit default MSVC frontend options in Clang profile if using Windows' ClangCL; upgrade minimum CMake version to 3.31 --- CMakeLists.txt | 2 +- cmake/adjust/template/vendor/impl/Clang.cmake | 111
++++++++++-------- cmake/adjust/template/vendor/impl/MSVC.cmake | 69 +---------- .../template/vendor/impl/frontend/MSVC.cmake | 68 +++++++++++ cmake/common.cmake | 8 -- src/nbl/CMakeLists.txt | 7 ++ 6 files changed, 138 insertions(+), 127 deletions(-) create mode 100644 cmake/adjust/template/vendor/impl/frontend/MSVC.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 571743f5b0..3c5fa8da4a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ # This file is part of the "Nabla Engine". # For conditions of distribution and use, see copyright notice in nabla.h.in or nabla.h -cmake_minimum_required(VERSION 3.29) +cmake_minimum_required(VERSION 3.31) cmake_policy(SET CMP0112 NEW) cmake_policy(SET CMP0141 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0141.html#policy:CMP0141 cmake_policy(SET CMP0118 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0118.html#policy:CMP0118 diff --git a/cmake/adjust/template/vendor/impl/Clang.cmake b/cmake/adjust/template/vendor/impl/Clang.cmake index 9f9f432e98..4002bc4f65 100644 --- a/cmake/adjust/template/vendor/impl/Clang.cmake +++ b/cmake/adjust/template/vendor/impl/Clang.cmake @@ -6,19 +6,16 @@ if(NOT DEFINED LANG) message(FATAL_ERROR "LANG must be defined!") endif() -if(NBL_REQUEST_SSE_4_2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS - -msse4.2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang1-msse4.2 -) # TODO: (****) optional but then adjust 3rdparty options on fail -endif() - -if(NBL_REQUEST_SSE_AVX2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS - -mavx2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mavx2 -) # TODO: (****) +if(NBL_WITH_COMPILER_CRASH_DIAGNOSTICS) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + # use it to make a repro and attach to an issue if you Clang crashes + # - it outputs preprocessed cpp files with sh script for compilation + -fcrash-diagnostics=compiler + 
-fcrash-diagnostics-dir=${NBL_ROOT_PATH_BINARY}/.crash-report + ) endif() -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS -Xclang=-fconstexpr-backtrace-limit=696969 -Xclang=-fconstexpr-depth=696969 -Xclang=-fconstexpr-steps=696969 @@ -28,21 +25,10 @@ NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -Xclang=-fspell-checking-limit=0 # no limit -Xclang=-fcaret-diagnostics-max-lines=0 # no limit - # whenever clang frontend or backend crashes we put diagnostics into top build direcotry - # use it to make a repro and attach to an issue - it outputs preprocessed cpp files with - # sh script for compilation - -fcrash-diagnostics=compiler - "-fcrash-diagnostics-dir=${NBL_ROOT_PATH_BINARY}/.crash-report" -) - -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS # latest Clang(CL) 19.1.1 shipped with VS seems to require explicitly features to be listed (simdjson) - # TODO: Yas, use with REQUEST_VAR, if the request fail then do not promote simdjson to build with - # HASWELL implementation because those flags + avx2 compose subset it wants in this case - - # also instead of enabling single options maybe we could consider requesting an - # instruction implementation set instead, eg -march=haswel, though this approach - # could add a few more flags then we actually need while building - to rethink + # TODO: Yas, we should first do independent check if host has the flags, if the request fail then + # do not promote simdjson to build with HASWELL implementation because those flags + avx2 compose + # subset it wants in this case ################ # TODO: (****) -> @@ -73,31 +59,54 @@ NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -Wno-error=non-pod-varargs ) -if(NBL_SANITIZE_ADDRESS) - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -fsanitize=address) -endif() - -if(NBL_SANITIZE_THREAD) - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} OPTIONS -fsanitize=thread) 
+if(NBL_REQUEST_SSE_4_2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + -msse4.2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang1-msse4.2 +) # TODO: (****) optional but then adjust 3rdparty options on fail endif() -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG OPTIONS - -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g - -mincremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible - -Wall # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning - -gline-tables-only # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-gline-tables-only - -Xclang=-fno-inline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions -) - -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE OPTIONS - -O2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg - -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions - -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible -) - -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO OPTIONS - -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g - -O1 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg - -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions - -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible -) \ No newline at end of file +if(CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES MSVC) + # ClangCL with MSVC frontend (most of the options are 
compatible but eg /arch:SSE4.2 seems to be not) + include("${CMAKE_CURRENT_LIST_DIR}/frontend/MSVC.cmake") + + # https://cmake.org/cmake/help/latest/variable/CMAKE_MSVC_DEBUG_INFORMATION_FORMAT.html + # should be set with CMAKE_MSVC_DEBUG_INFORMATION_FORMAT but for some reason it doesn't respect with ClangCL even though its MSVC frontend + # https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_FRONTEND_VARIANT.html#variable:CMAKE_%3CLANG%3E_COMPILER_FRONTEND_VARIANT + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG RELWITHDEBINFO COMPILE_OPTIONS /Zi) + return() +else() + if(NBL_REQUEST_SSE_AVX2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + -mavx2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mavx2 + ) # TODO: (****) + endif() + + if(NBL_SANITIZE_ADDRESS) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS -fsanitize=address) + endif() + + if(NBL_SANITIZE_THREAD) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS -fsanitize=thread) + endif() + + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS + -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g + -mincremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + -Wall # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning + -gline-tables-only # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-gline-tables-only + -Xclang=-fno-inline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + ) + + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE COMPILE_OPTIONS + -O2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg + -Xclang=-finline-functions # 
https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + ) + + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO COMPILE_OPTIONS + -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g + -O1 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg + -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + ) +endif() \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/MSVC.cmake b/cmake/adjust/template/vendor/impl/MSVC.cmake index b1b6b01a99..803adb1754 100644 --- a/cmake/adjust/template/vendor/impl/MSVC.cmake +++ b/cmake/adjust/template/vendor/impl/MSVC.cmake @@ -1,75 +1,10 @@ include("${CMAKE_CURRENT_LIST_DIR}/reset.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/frontend/MSVC.cmake") # vendor template with options fitting for both C and CXX LANGs -if(NOT DEFINED LANG) - message(FATAL_ERROR "LANG must be defined!") -endif() - if(NBL_REQUEST_SSE_4_2) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS /arch:SSE4.2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 ) # TODO: (****) should be (?) optional but then adjust 3rdparty options on fail -endif() - -if(NBL_REQUEST_SSE_AVX2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS - /arch:AVX2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 -) # TODO: (****) should be (?) 
optional but then adjust 3rdparty options on fail -endif() - -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS - /Zc:preprocessor # https://learn.microsoft.com/en-us/cpp/build/reference/zc-preprocessor?view=msvc-170 - /Zc:__cplusplus # https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus?view=msvc-170 - /Zc:wchar_t # https://learn.microsoft.com/en-us/cpp/build/reference/zc-wchar-t-wchar-t-is-native-type?view=msvc-170 - /fp:fast # https://learn.microsoft.com/en-us/cpp/build/reference/fp-specify-floating-point-behavior?view=msvc-170 - /MP${_NBL_JOBS_AMOUNT_} # https://learn.microsoft.com/en-us/cpp/build/reference/mp-build-with-multiple-processes?view=msvc-170 -) - -if(NBL_SANITIZE_ADDRESS) - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS - /fsanitize=address # https://learn.microsoft.com/en-us/cpp/build/reference/fsanitize?view=msvc-170 - ) - - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS - /RTC1 # https://learn.microsoft.com/en-us/cpp/build/reference/rtc-run-time-error-checks?view=msvc-170 - ) -endif() - -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS - /Ob0 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 - /Od # https://learn.microsoft.com/en-us/cpp/build/reference/od-disable-debug?view=msvc-170 - /Oy- # https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 - - LINK_OPTIONS - /INCREMENTAL # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 -) - -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE COMPILE_OPTIONS - /O2 # https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 - /Ob2 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 - /DNDEBUG # 
https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 - /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 - /Gy- # https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 - /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 - /GF # https://learn.microsoft.com/en-us/cpp/build/reference/gf-eliminate-duplicate-strings?view=msvc-170 - /GS- # https://learn.microsoft.com/en-us/cpp/build/reference/gs-buffer-security-check?view=msvc-170 - - LINK_OPTIONS - /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 - /LTCG # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 (note: /GL implies fallback with LTCG) -) - -NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO COMPILE_OPTIONS - /O2 # https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 - /Ob1 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 - /Oy- # https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 - /DNDEBUG # https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 - /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 - /Gy # https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 - /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 - - LINK_OPTIONS - /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 (note: cannot use /INCREMENTAL with /LTCG:incremental, would cause 
fallback) - /LTCG:incremental # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 -) \ No newline at end of file +endif() \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/frontend/MSVC.cmake b/cmake/adjust/template/vendor/impl/frontend/MSVC.cmake new file mode 100644 index 0000000000..06ab606104 --- /dev/null +++ b/cmake/adjust/template/vendor/impl/frontend/MSVC.cmake @@ -0,0 +1,68 @@ +# https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_FRONTEND_VARIANT.html#variable:CMAKE_%3CLANG%3E_COMPILER_FRONTEND_VARIANT +# vendor frontend template with options fitting for both C and CXX LANGs + +if(NOT DEFINED LANG) + message(FATAL_ERROR "LANG must be defined!") +endif() + +if(NBL_REQUEST_SSE_AVX2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + /arch:AVX2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 +) # TODO: (****) should be (?) optional but then adjust 3rdparty options on fail +endif() + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + /Zc:preprocessor # https://learn.microsoft.com/en-us/cpp/build/reference/zc-preprocessor?view=msvc-170 + /Zc:__cplusplus # https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus?view=msvc-170 + /Zc:wchar_t # https://learn.microsoft.com/en-us/cpp/build/reference/zc-wchar-t-wchar-t-is-native-type?view=msvc-170 + /fp:fast # https://learn.microsoft.com/en-us/cpp/build/reference/fp-specify-floating-point-behavior?view=msvc-170 + /MP${_NBL_JOBS_AMOUNT_} # https://learn.microsoft.com/en-us/cpp/build/reference/mp-build-with-multiple-processes?view=msvc-170 +) + +if(NBL_SANITIZE_ADDRESS) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + /fsanitize=address # https://learn.microsoft.com/en-us/cpp/build/reference/fsanitize?view=msvc-170 + ) + + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS + /RTC1 # 
https://learn.microsoft.com/en-us/cpp/build/reference/rtc-run-time-error-checks?view=msvc-170 + ) +endif() + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS + /Ob0 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 + /Od # https://learn.microsoft.com/en-us/cpp/build/reference/od-disable-debug?view=msvc-170 + /Oy- # https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 + + LINK_OPTIONS + /INCREMENTAL # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 +) + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE COMPILE_OPTIONS + /O2 # https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 + /Ob2 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 + /DNDEBUG # https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 + /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 + /Gy- # https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 + /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 + /GF # https://learn.microsoft.com/en-us/cpp/build/reference/gf-eliminate-duplicate-strings?view=msvc-170 + /GS- # https://learn.microsoft.com/en-us/cpp/build/reference/gs-buffer-security-check?view=msvc-170 + + LINK_OPTIONS + /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 + /LTCG # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 (note: /GL implies fallback with LTCG) +) + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO COMPILE_OPTIONS + /O2 # 
https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 + /Ob1 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 + /Oy- # https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 + /DNDEBUG # https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 + /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 + /Gy # https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 + /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 + + LINK_OPTIONS + /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 (note: cannot use /INCREMENTAL with /LTCG:incremental, would cause fallback) + /LTCG:incremental # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 +) \ No newline at end of file diff --git a/cmake/common.cmake b/cmake/common.cmake index 69b915bbc7..69a0a5b980 100755 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -69,14 +69,6 @@ macro(nbl_create_executable_project _EXTRA_SOURCES _EXTRA_OPTIONS _EXTRA_INCLUDE add_executable(${EXECUTABLE_NAME} ${NBL_EXECUTABLE_SOURCES}) nbl_handle_runtime_lib_properties(${EXECUTABLE_NAME}) - - if(WIN32 AND MSVC) - if(NBL_COMPILER_DYNAMIC_RUNTIME) - target_link_options(${EXECUTABLE_NAME} PUBLIC "/DELAYLOAD:$") - endif() - - target_link_options(${EXECUTABLE_NAME} PUBLIC "/DELAYLOAD:dxcompiler.dll") - endif() endif() nbl_handle_dll_definitions(${EXECUTABLE_NAME} PUBLIC) diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 98c7620159..ad5aa7c463 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -358,6 +358,13 @@ endif() target_compile_definitions(Nabla PRIVATE 
__NBL_BUILDING_NABLA__) +target_link_options(Nabla INTERFACE # proxy to downstream targets + $<$: + $<$:/DELAYLOAD:$> + /DELAYLOAD:dxcompiler.dll + > +) + if (ANDROID) add_library(android_native_app_glue STATIC ${ANDROID_NDK_ROOT_PATH}/sources/android/native_app_glue/android_native_app_glue.c From a1b9b99777496b62f439dd82d4e19a2c667a7d1d Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Fri, 18 Apr 2025 11:17:28 +0200 Subject: [PATCH 019/346] ah DXC needs to point to devshFixes_clang_19_1_1 --- 3rdparty/dxc/dxc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index 5ab4d368b6..4621c707ed 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit 5ab4d368b666d365217c751f5610b496b828ff96 +Subproject commit 4621c707ed774ab8382391f6434810ebecd37111 From 77ed416733f6e337445df4e27b1b62043da47eb7 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Fri, 18 Apr 2025 12:56:18 +0200 Subject: [PATCH 020/346] keep designated initializers for ISwapchain's SSharedCreationParams, use hlsl::ShaderStage in IDescriptorSetLayout.h --- include/nbl/asset/IDescriptorSetLayout.h | 3 ++- include/nbl/video/ISwapchain.h | 5 ++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/nbl/asset/IDescriptorSetLayout.h b/include/nbl/asset/IDescriptorSetLayout.h index ec3c182fdc..a50f267355 100644 --- a/include/nbl/asset/IDescriptorSetLayout.h +++ b/include/nbl/asset/IDescriptorSetLayout.h @@ -330,7 +330,8 @@ class IDescriptorSetLayout : public IDescriptorSetLayoutBase bindings[i].binding = i; bindings[i].type = type; bindings[i].createFlags = SBinding::E_CREATE_FLAGS::ECF_NONE; - bindings[i].stageFlags = stageAccessFlags ? stageAccessFlags[i]:asset::IShader::E_SHADER_STAGE::ESS_ALL_OR_LIBRARY; + + bindings[i].stageFlags = stageAccessFlags ? stageAccessFlags[i]:hlsl::ShaderStage::ESS_ALL_OR_LIBRARY; bindings[i].count = counts ? 
counts[i]:1u; bindings[i].samplers = nullptr; } diff --git a/include/nbl/video/ISwapchain.h b/include/nbl/video/ISwapchain.h index 99ba2e7975..882ac16648 100644 --- a/include/nbl/video/ISwapchain.h +++ b/include/nbl/video/ISwapchain.h @@ -21,8 +21,6 @@ class ISwapchain : public IBackendObject struct SSharedCreationParams { - SSharedCreationParams() {} - inline bool valid(const IPhysicalDevice* physDev, const ISurface* surface) const { ISurface::SCapabilities caps; @@ -456,12 +454,13 @@ class ISwapchain : public IBackendObject { return params.deduce(getOriginDevice()->getPhysicalDevice(),m_params.surface.get(),{&m_params.sharedParams.presentMode.value,1},{&m_params.sharedParams.compositeAlpha.value,1},{&m_params.sharedParams.preTransform.value,1}); } - inline core::smart_refctd_ptr recreate(SSharedCreationParams params={}) + inline core::smart_refctd_ptr recreate(SSharedCreationParams params) { if (!deduceRecreationParams(params)) return nullptr; return recreate_impl(std::move(params)); } + inline core::smart_refctd_ptr recreate() { return recreate({}); } // Vulkan: const VkSwapchainKHR* virtual const void* getNativeHandle() const = 0; From a2a7e72f42501fe71b6f4f141b92bac4d2b8cd93 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Fri, 18 Apr 2025 13:47:08 +0200 Subject: [PATCH 021/346] for my sanity - add -DNDEBUG to Clang profile (Unix) --- cmake/adjust/template/vendor/impl/Clang.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/adjust/template/vendor/impl/Clang.cmake b/cmake/adjust/template/vendor/impl/Clang.cmake index 4002bc4f65..a8ddfcb6bf 100644 --- a/cmake/adjust/template/vendor/impl/Clang.cmake +++ b/cmake/adjust/template/vendor/impl/Clang.cmake @@ -101,6 +101,7 @@ else() -O2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions -mno-incremental-linker-compatible # 
https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + -DNDEBUG ) NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO COMPILE_OPTIONS @@ -108,5 +109,6 @@ else() -O1 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + -DNDEBUG ) endif() \ No newline at end of file From cde9e7971a890baf9b82fde753307d38cadf17fe Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Fri, 18 Apr 2025 17:24:28 +0200 Subject: [PATCH 022/346] correct CMAKE_MSVC_DEBUG_INFORMATION_FORMAT logic hence make it work without need to specify debug format flags with ClangCL by hand, enforce ProgramDatabase regardless the case (https://gitlab.kitware.com/cmake/cmake/-/issues/26879#note_1649970) --- CMakeLists.txt | 8 +------- cmake/adjust/flags.cmake | 13 +++---------- cmake/adjust/template/vendor/impl/Clang.cmake | 5 ----- 3 files changed, 4 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c5fa8da4a..60c10acfab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,13 +24,7 @@ option(NBL_STATIC_BUILD "" OFF) # ON for static builds, OFF for shared option(NBL_COMPILER_DYNAMIC_RUNTIME "" ON) option(NBL_SANITIZE_ADDRESS OFF) -if(CMAKE_CXX_COMPILER_ID STREQUAL MSVC) - if(NBL_SANITIZE_ADDRESS) - set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$:ProgramDatabase>") - else() - set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$:EditAndContinue>$<$:ProgramDatabase>") - endif() -endif() +set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT $<$:ProgramDatabase>) # ignored on non xMSVC-ABI targets if(NBL_STATIC_BUILD) message(STATUS "Static Nabla build enabled!") diff --git a/cmake/adjust/flags.cmake b/cmake/adjust/flags.cmake index 
d8519aea07..1e67914ae0 100644 --- a/cmake/adjust/flags.cmake +++ b/cmake/adjust/flags.cmake @@ -305,17 +305,10 @@ function(nbl_adjust_flags) set(MAPPED_CONFIG $>) - if(CMAKE_CXX_COMPILER_ID STREQUAL MSVC) - if(NBL_SANITIZE_ADDRESS) - set(NBL_TARGET_MSVC_DEBUG_INFORMATION_FORMAT "$<$,$>:ProgramDatabase>") - else() - set(NBL_TARGET_MSVC_DEBUG_INFORMATION_FORMAT "$<$:EditAndContinue>$<$:ProgramDatabase>") - endif() - endif() - set_target_properties(${NBL_TARGET_ITEM} PROPERTIES - MSVC_DEBUG_INFORMATION_FORMAT "${NBL_TARGET_MSVC_DEBUG_INFORMATION_FORMAT}" - ) + MSVC_DEBUG_INFORMATION_FORMAT $<$,$>:ProgramDatabase> # ignored on non xMSVC-ABI targets + ) + math(EXPR _NBL_ARG_I_ "${_NBL_ARG_I_} + 1") endwhile() else() # DIRECTORY mode diff --git a/cmake/adjust/template/vendor/impl/Clang.cmake b/cmake/adjust/template/vendor/impl/Clang.cmake index a8ddfcb6bf..0b00294411 100644 --- a/cmake/adjust/template/vendor/impl/Clang.cmake +++ b/cmake/adjust/template/vendor/impl/Clang.cmake @@ -68,11 +68,6 @@ endif() if(CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES MSVC) # ClangCL with MSVC frontend (most of the options are compatible but eg /arch:SSE4.2 seems to be not) include("${CMAKE_CURRENT_LIST_DIR}/frontend/MSVC.cmake") - - # https://cmake.org/cmake/help/latest/variable/CMAKE_MSVC_DEBUG_INFORMATION_FORMAT.html - # should be set with CMAKE_MSVC_DEBUG_INFORMATION_FORMAT but for some reason it doesn't respect with ClangCL even though its MSVC frontend - # https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_FRONTEND_VARIANT.html#variable:CMAKE_%3CLANG%3E_COMPILER_FRONTEND_VARIANT - NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG RELWITHDEBINFO COMPILE_OPTIONS /Zi) return() else() if(NBL_REQUEST_SSE_AVX2) From 6e4392e7e6e0e87a990fc7d65677e94c41ae9ef3 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Tue, 22 Apr 2025 10:35:39 +0200 Subject: [PATCH 023/346] add more NEW policies, leave TODO comment for `cmake_policy` --- CMakeLists.txt | 7 ++++++- 1 file changed, 
6 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 60c10acfab..a63d30a89d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,12 @@ # For conditions of distribution and use, see copyright notice in nabla.h.in or nabla.h cmake_minimum_required(VERSION 3.31) -cmake_policy(SET CMP0112 NEW) +# TODO: Yas - once we deploy 4.x we will fire `cmake_policy` instead of manually picking policies +# https://cmake.org/cmake/help/latest/command/cmake_minimum_required.html#policy-version +# also we should update deps which throw warnings about < 3.10 compatibility +cmake_policy(SET CMP0003 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0077.html#cmp0077 +cmake_policy(SET CMP0077 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0077.html#cmp0077 +cmake_policy(SET CMP0112 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0112.html#cmp0112 cmake_policy(SET CMP0141 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0141.html#policy:CMP0141 cmake_policy(SET CMP0118 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0118.html#policy:CMP0118 From dc41722b4ab0da664239f1157a9606a6c6ada868 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 28 Apr 2025 19:59:15 +0700 Subject: [PATCH 024/346] Implement mutable shader spec info --- include/nbl/asset/ICPUComputePipeline.h | 48 +++++++--- include/nbl/asset/ICPUGraphicsPipeline.h | 106 ++++++++++++++++------- include/nbl/asset/ICPUPipeline.h | 102 +--------------------- include/nbl/asset/IGraphicsPipeline.h | 4 +- include/nbl/asset/IPipeline.h | 26 ++++-- 5 files changed, 135 insertions(+), 151 deletions(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index b9b707d9fc..704c4c05fc 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -12,20 +12,21 @@ namespace nbl::asset { //! 
CPU Version of Compute Pipeline -class ICPUComputePipeline : public ICPUPipeline,1> +class ICPUComputePipeline : public ICPUPipeline> { - using base_t = ICPUPipeline,1>; + using base_t = ICPUPipeline>; public: struct SCreationParams final : IPipeline::SCreationParams { - SShaderSpecInfo shader; + IPipelineBase::SShaderSpecInfo shader; }; static core::smart_refctd_ptr create(const SCreationParams& params) { if (!params.layout) return nullptr; auto retval = new ICPUComputePipeline(core::smart_refctd_ptr(params.layout)); + if (!retval->setSpecInfo(params.shader)) { retval->drop(); @@ -34,35 +35,54 @@ class ICPUComputePipeline : public ICPUPipeline,1> return core::smart_refctd_ptr(retval,core::dont_grab); } + inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override final + { + core::smart_refctd_ptr layout; + if (_depth>0u && m_layout) + layout = core::smart_refctd_ptr_static_cast(m_layout->clone(_depth-1u)); + + auto cp = new ICPUComputePipeline(std::move(layout)); + if (m_specInfo.shader) + { + SShaderSpecInfo specInfo = m_specInfo; + if (_depth > 0u) + { + specInfo.shader = core::smart_refctd_ptr_static_cast(m_specInfo.shader->clone(_depth - 1u)); + } + cp->setSpecInfo(specInfo); + } + return core::smart_refctd_ptr(cp,core::dont_grab); + } + constexpr static inline auto AssetType = ET_COMPUTE_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } //! 
inline size_t getDependantCount() const override {return 2;} - // provide default arg - inline IPipelineBase::SShaderSpecInfo getSpecInfo() const {return base_t::getSpecInfo(hlsl::ShaderStage::ESS_COMPUTE);} - protected: using base_t::base_t; virtual ~ICPUComputePipeline() = default; - base_t* clone_impl(core::smart_refctd_ptr&& layout) const override - { - return new ICPUComputePipeline(std::move(layout)); - } - inline IAsset* getDependant_impl(const size_t ix) override { if (ix!=0) - return m_stages[0].shader.get(); + return m_specInfo.shader.get(); return const_cast(m_layout.get()); } - inline int8_t stageToIndex(const hlsl::ShaderStage stage) const override + inline bool setSpecInfo(const IPipelineBase::SShaderSpecInfo& info) { - return stage!=hlsl::ShaderStage::ESS_COMPUTE ? (-1):0; + const auto specSize = info.valid(); + if (specSize < 0) return false; + if (info.stage != hlsl::ESS_COMPUTE) return false; + m_specInfo = info; + return true; } + + private: + SShaderSpecInfo m_specInfo; + }; } diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 2643db7550..b624d53aa9 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -13,10 +13,10 @@ namespace nbl::asset { -class ICPUGraphicsPipeline final : public ICPUPipeline,5u> +class ICPUGraphicsPipeline final : public ICPUPipeline, ICPUPipelineLayout,ICPURenderpass>> { - using pipeline_base_t = IGraphicsPipeline; - using base_t = ICPUPipeline; + using pipeline_base_t = IGraphicsPipeline,ICPUPipelineLayout, ICPURenderpass>; + using base_t = ICPUPipeline; public: struct SCreationParams final : pipeline_base_t::SCreationParams @@ -29,27 +29,65 @@ class ICPUGraphicsPipeline final : public ICPUPipeline create(const SCreationParams& params) { // we'll validate the specialization info later when attempting to set it - if (!params.impl_valid([](const IPipelineBase::SShaderSpecInfo& info)->bool{return true;})) - return nullptr; - 
auto retval = new ICPUGraphicsPipeline(params); - for (const auto spec : params.shaders) - if (spec.shader) - retval->setSpecInfo(spec); - return core::smart_refctd_ptr(retval,core::dont_grab); + if (!params.impl_valid([](const SShaderSpecInfo& info)->bool{return true;})) + return nullptr; + auto retval = new ICPUGraphicsPipeline(params); + for (const auto spec : params.shaders) + { + if (spec.shader) retval->setSpecInfo(spec); + } + return core::smart_refctd_ptr(retval,core::dont_grab); } + inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override final + { + core::smart_refctd_ptr layout; + if (_depth>0u && m_layout) + layout = core::smart_refctd_ptr_static_cast(m_layout->clone(_depth-1u)); + + auto* cp = [&] { + std::array, GRAPHICS_SHADER_STAGE_COUNT> _shaders; + for (auto i = 0; i < GRAPHICS_SHADER_STAGE_COUNT; i++) + _shaders[i] = m_specInfos[i]; + const SCreationParams params = { { + .shaders = _shaders, + .cached = m_params, + .renderpass = m_renderpass.get() + } }; + return new ICPUGraphicsPipeline(params); + }(); + for (auto specInfo : m_specInfos) + { + if (specInfo.shader) + { + auto newSpecInfo = specInfo; + if (_depth>0u) + { + newSpecInfo.shader = core::smart_refctd_ptr_static_cast(specInfo.shader->clone(_depth-1u)); + } + cp->setSpecInfo(newSpecInfo); + } + } + + return core::smart_refctd_ptr(cp,core::dont_grab); + } + + constexpr static inline auto AssetType = ET_GRAPHICS_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } inline size_t getDependantCount() const override { auto stageCount = 2; // the layout and renderpass - for (const auto& stage : m_stages) - if (stage.shader) - stageCount++; + for (const auto& info : m_specInfos) + { + if (info.shader) + stageCount++; + } return stageCount; } @@ -65,18 +103,6 @@ class ICPUGraphicsPipeline final : public ICPUPipeline&& layout) const override - { - std::array _shaders; - for (auto i=0; i=GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) + if (stageIx<0 
|| stageIx>= GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) return -1; return stageIx; } + + inline bool setSpecInfo(const SShaderSpecInfo& info) + { + assert(isMutable()); + const auto specSize = info.valid(); + if (specSize<0) return false; + const auto stage = info.stage; + const auto stageIx = stageToIndex(stage); + if (stageIx<0) return false; + m_specInfos[stageIx] = info; + return true; + } + + SShaderSpecInfo m_specInfos[GRAPHICS_SHADER_STAGE_COUNT]; }; } diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index d1693f18eb..eb634d3f12 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -14,37 +14,13 @@ namespace nbl::asset { // Common Base class for pipelines -template +template class ICPUPipeline : public IAsset, public PipelineNonAssetBase { - using this_t = ICPUPipeline; + using this_t = ICPUPipeline; + using shader_info_spec_t = IPipelineBase::SShaderSpecInfo; public: - inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override final - { - core::smart_refctd_ptr layout; - if (_depth>0u && PipelineNonAssetBase::m_layout) - layout = core::smart_refctd_ptr_static_cast(PipelineNonAssetBase::m_layout->clone(_depth-1u)); - - auto cp = clone_impl(std::move(layout)); - for (auto i=0; i newShader; - if (_depth>0u) - { - newShader = core::smart_refctd_ptr_static_cast(shader->clone(_depth-1u)); - stageInfo.shader = newShader.get(); - } - cp->setSpecInfo(stageInfo); - } - } - - return core::smart_refctd_ptr(cp,core::dont_grab); - } // extras for this class ICPUPipelineLayout* getLayout() @@ -60,82 +36,10 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase PipelineNonAssetBase::m_layout = std::move(_layout); } - // The getters are weird because the shader pointer, spec constant map and entry point needs patching - inline IShader* getShader(const hlsl::ShaderStage stage) - { - assert(isMutable()); - return const_cast(getSpecInfo(stage).shader); - } - inline 
std::string* getEntryPoint(const hlsl::ShaderStage stage) - { - const auto stageIx = stageToIndex(stage); - if (stageIx<0) - return {}; - return &m_stages[stageIx].entryPoint; - } - inline IPipelineBase::SShaderSpecInfo::spec_constant_map_t* getSpecConstantMap(const hlsl::ShaderStage stage) - { - assert(isMutable()); - return const_cast(getSpecInfo(stage).entries); - } - // - inline IPipelineBase::SShaderSpecInfo getSpecInfo(const hlsl::ShaderStage stage) const - { - const auto stageIx = stageToIndex(stage); - if (stageIx<0) - return {}; - return m_stages[stageIx].info; - } - inline bool setSpecInfo(const IPipelineBase::SShaderSpecInfo& info) - { - assert(isMutable()); - const int64_t specSize = info.valid(); - if (specSize<0) - return false; - const auto stageIx = stageToIndex(info.stage); - if (stageIx<0) - return false; - auto& outStage = m_stages[stageIx]; - outStage.info = info; - outStage.entryPoint = info.entryPoint; - outStage.shader = core::smart_refctd_ptr(const_cast(info.shader)); - outStage.info.shader = outStage.shader.get(); - auto& outEntries = outStage.entries; - if (specSize>0) - { - outEntries = std::make_unique(); - outEntries->reserve(info.entries->size()); - std::copy(info.entries->begin(),info.entries->end(),std::insert_iterator(*outEntries,outEntries->begin())); - } - else - outEntries = nullptr; - outStage.info.entries = outEntries.get(); - return true; - } - inline bool clearStage(const hlsl::ShaderStage stage) - { - assert(isMutable()); - const auto stageIx = stageToIndex(stage); - if (stageIx<0) - return false; - m_stages[stageIx] = {}; - return true; - } - protected: using PipelineNonAssetBase::PipelineNonAssetBase; virtual ~ICPUPipeline() = default; - virtual this_t* clone_impl(core::smart_refctd_ptr&& layout) const = 0; - virtual int8_t stageToIndex(const hlsl::ShaderStage stage) const = 0; - - struct ShaderStage - { - std::string entryPoint = {}; - core::smart_refctd_ptr shader = {}; - std::unique_ptr entries = {}; - 
IPipelineBase::SShaderSpecInfo info = {}; - } m_stages[MaxShaderStageCount] = {}; }; } diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index c59ad51ca9..1f3bec79a1 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -81,7 +81,7 @@ class IGraphicsPipelineBase : public virtual core::IReferenceCounted }; }; -template +template class IGraphicsPipeline : public IPipeline, public IGraphicsPipelineBase { protected: @@ -91,7 +91,7 @@ class IGraphicsPipeline : public IPipeline, public IGraphics struct SCreationParams : IPipeline::SCreationParams { protected: - using SpecInfo = IPipelineBase::SShaderSpecInfo; + using SpecInfo = SpecInfoType; template inline bool impl_valid(ExtraLambda&& extra) const { diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h index 036a684729..8ecb2f0fb3 100644 --- a/include/nbl/asset/IPipeline.h +++ b/include/nbl/asset/IPipeline.h @@ -132,8 +132,10 @@ class IPipelineBase Without Specialization Constants, you would have to commit to a final value before the SPIR-V compilation */ + template struct SShaderSpecInfo final { + //! Structure specifying a specialization map entry /* Note that if specialization constant ID is used @@ -146,7 +148,7 @@ class IPipelineBase */ //!< The ID of the specialization constant in SPIR-V. If it isn't used in the shader, the map entry does not affect the behavior of the pipeline. using spec_constant_id_t = uint32_t; - struct SSpecConstantValue + struct SSpecConstantValueImmutable { const void* data = nullptr; //!< The byte size of the specialization constant value within the supplied data buffer. 
@@ -154,8 +156,18 @@ class IPipelineBase inline operator bool() const {return data&&size;} - auto operator<=>(const SSpecConstantValue&) const = default; + auto operator<=>(const SSpecConstantValueImmutable&) const = default; + }; + + struct SSPecConstantValueMutable + { + core::vector data; + inline operator bool() const { return data.size(); } + auto operator<=>(const SSPecConstantValueMutable&) const = default; }; + + using SSpecConstantValue = std::conditional_t; + inline SSpecConstantValue getSpecializationByteValue(const spec_constant_id_t _specConstID) const { if (!entries) @@ -231,11 +243,14 @@ class IPipelineBase return static_cast(specData); } + using shader_ptr_t = std::conditional_t, const IShader*>; + using entry_point_t = std::conditional_t; using spec_constant_map_t = core::unordered_map; + using entries_t = std::conditional_t; - const IShader* shader = nullptr; + shader_ptr_t shader = nullptr; // A name of the function where the entry point of an shader executable begins. It's often "main" function. 
- std::string_view entryPoint = {}; + entry_point_t entryPoint = {}; // stage must be set hlsl::ShaderStage stage = hlsl::ShaderStage::ESS_UNKNOWN; // there's some padding here @@ -244,12 +259,13 @@ class IPipelineBase uint8_t requireFullSubgroups : 1 = false; // Container choice implicitly satisfies: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 - const spec_constant_map_t* entries = nullptr; + entries_t entries = nullptr; // By requiring Nabla Core Profile features we implicitly satisfy: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02784 // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02785 // Also because our API is sane, it satisfies the following by construction: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 + }; }; template From 7fe3431366e436dc63fa1795afd324ba99cf473d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 28 Apr 2025 20:14:37 +0700 Subject: [PATCH 025/346] Rework IGPUGraphicsPipeline --- include/nbl/asset/IGraphicsPipeline.h | 6 +++--- include/nbl/video/IGPUGraphicsPipeline.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index 1f3bec79a1..77f220aa78 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -86,12 +86,12 @@ class IGraphicsPipeline : public IPipeline, public IGraphics { protected: using renderpass_t = RenderpassType; + using spec_info_t = SpecInfoType; public: struct SCreationParams : IPipeline::SCreationParams { protected: - using SpecInfo = SpecInfoType; template inline bool 
impl_valid(ExtraLambda&& extra) const { @@ -136,7 +136,7 @@ class IGraphicsPipeline : public IPipeline, public IGraphics public: inline bool valid() const { - return impl_valid([](const SpecInfo& info)->bool + return impl_valid([](const spec_info_t& info)->bool { if (!info.valid()) return false; @@ -144,7 +144,7 @@ class IGraphicsPipeline : public IPipeline, public IGraphics }); } - std::span shaders = {}; + std::span shaders = {}; SCachedCreationParams cached = {}; renderpass_t* renderpass = nullptr; }; diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index 8240bcea94..4838d7f4d3 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -11,9 +11,9 @@ namespace nbl::video { -class IGPUGraphicsPipeline : public IBackendObject, public asset::IGraphicsPipeline +class IGPUGraphicsPipeline : public IBackendObject, public asset::IGraphicsPipeline, const IGPUPipelineLayout,const IGPURenderpass> { - using pipeline_t = asset::IGraphicsPipeline; + using pipeline_t = asset::IGraphicsPipeline, const IGPUPipelineLayout,const IGPURenderpass>; public: struct SCreationParams final : pipeline_t::SCreationParams, SPipelineCreationParams @@ -36,7 +36,7 @@ class IGPUGraphicsPipeline : public IBackendObject, public asset::IGraphicsPipel if (!layout) return {}; SSpecializationValidationResult retval = {.count=0,.dataSize=0}; - const bool valid = pipeline_t::SCreationParams::impl_valid([&retval](const IPipelineBase::SShaderSpecInfo& info)->bool + const bool valid = pipeline_t::SCreationParams::impl_valid([&retval](const spec_info_t& info)->bool { const auto dataSize = info.valid(); if (dataSize<0) From 8dbe9c782ef1f0ac7f154d9c04598f740c061d8f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 28 Apr 2025 20:14:51 +0700 Subject: [PATCH 026/346] Rework IGPUComputePipeline.h --- include/nbl/video/IGPUComputePipeline.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 49e44dfcc1..ba29cc58e2 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -17,6 +17,7 @@ namespace nbl::video class IGPUComputePipeline : public IBackendObject, public asset::IPipeline { using pipeline_t = asset::IPipeline; + using spec_info_t = SShaderSpecInfo; public: struct SCreationParams final : pipeline_t::SCreationParams, SPipelineCreationParams @@ -63,11 +64,11 @@ class IGPUComputePipeline : public IBackendObject, public asset::IPipeline(dataSize)}; } - inline std::span getShaders() const {return {&shader,1}; } + inline std::span getShaders() const {return {&shader,1}; } // TODO: Could guess the required flags from SPIR-V introspection of declared caps core::bitflag flags = FLAGS::NONE; - IPipelineBase::SShaderSpecInfo shader = {}; + spec_info_t shader = {}; }; inline core::bitflag getCreationFlags() const {return m_flags;} From 436e6e16e51f1abf19c2c599834ed9a7fa1f0d38 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 29 Apr 2025 15:09:19 +0700 Subject: [PATCH 027/346] Remove default value for mutable template parameter --- include/nbl/asset/IPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h index 8ecb2f0fb3..97d7ab9c94 100644 --- a/include/nbl/asset/IPipeline.h +++ b/include/nbl/asset/IPipeline.h @@ -132,7 +132,7 @@ class IPipelineBase Without Specialization Constants, you would have to commit to a final value before the SPIR-V compilation */ - template + template struct SShaderSpecInfo final { From 2e367d12519c473135785b689700b7ce62457104 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 5 May 2025 17:00:39 +0700 Subject: [PATCH 028/346] Implement IGPUPipeline and refactor SCreationParams --- include/nbl/asset/ICPUComputePipeline.h | 67 ++-- include/nbl/asset/ICPUGraphicsPipeline.h | 183 +++++----- include/nbl/asset/ICPUPipeline.h 
| 111 ++++++- include/nbl/asset/IGraphicsPipeline.h | 74 +---- include/nbl/asset/IPipeline.h | 349 ++++++-------------- include/nbl/video/IGPUComputePipeline.h | 20 +- include/nbl/video/IGPUGraphicsPipeline.h | 68 +++- include/nbl/video/IGPUPipeline.h | 110 ++++++ include/nbl/video/IGPURayTracingPipeline.h | 32 +- include/nbl/video/SPipelineCreationParams.h | 2 +- 10 files changed, 518 insertions(+), 498 deletions(-) create mode 100644 include/nbl/video/IGPUPipeline.h diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 704c4c05fc..d9bc8dd646 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -17,68 +17,53 @@ class ICPUComputePipeline : public ICPUPipeline> using base_t = ICPUPipeline>; public: - struct SCreationParams final : IPipeline::SCreationParams - { - IPipelineBase::SShaderSpecInfo shader; - }; - static core::smart_refctd_ptr create(const SCreationParams& params) + explicit ICPUComputePipeline(const ICPUPipelineLayout* layout): + base_t(core::smart_refctd_ptr(layout)) + {} + + static core::smart_refctd_ptr create(const ICPUPipelineLayout* layout) { - if (!params.layout) - return nullptr; - auto retval = new ICPUComputePipeline(core::smart_refctd_ptr(params.layout)); - - if (!retval->setSpecInfo(params.shader)) - { - retval->drop(); - return nullptr; - } + auto retval = new ICPUComputePipeline(layout); return core::smart_refctd_ptr(retval,core::dont_grab); } - inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override final + inline base_t* clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { - core::smart_refctd_ptr layout; - if (_depth>0u && m_layout) - layout = core::smart_refctd_ptr_static_cast(m_layout->clone(_depth-1u)); - - auto cp = new ICPUComputePipeline(std::move(layout)); - if (m_specInfo.shader) - { - SShaderSpecInfo specInfo = m_specInfo; - if (_depth > 0u) - { - specInfo.shader = 
core::smart_refctd_ptr_static_cast(m_specInfo.shader->clone(_depth - 1u)); - } - cp->setSpecInfo(specInfo); - } - return core::smart_refctd_ptr(cp,core::dont_grab); + auto newPipeline = new ICPUComputePipeline(std::move(layout)); + newPipeline->m_specInfo = newPipeline->cloneSpecInfo(m_specInfo, depth); + return newPipeline; } constexpr static inline auto AssetType = ET_COMPUTE_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } - //! - inline size_t getDependantCount() const override {return 2;} + //! + inline size_t getDependantCount() const override { return 2; } + + inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) override final + { + if (stage==hlsl::ShaderStage::ESS_COMPUTE && isMutable()) + return {m_specInfo,1}; + return {}; + } + + inline virtual bool valid() const override final + { + // TODO(kevinyu): Fix this temporary dummy code + return true; + } protected: using base_t::base_t; virtual ~ICPUComputePipeline() = default; - inline IAsset* getDependant_impl(const size_t ix) override + inline IAsset* getDependant_impl(const size_t ix) override { if (ix!=0) return m_specInfo.shader.get(); return const_cast(m_layout.get()); } - inline bool setSpecInfo(const IPipelineBase::SShaderSpecInfo& info) - { - const auto specSize = info.valid(); - if (specSize < 0) return false; - if (info.stage != hlsl::ESS_COMPUTE) return false; - m_specInfo = info; - return true; - } private: SShaderSpecInfo m_specInfo; diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index b624d53aa9..b93b8165aa 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -13,135 +13,102 @@ namespace nbl::asset { -class ICPUGraphicsPipeline final : public ICPUPipeline, ICPUPipelineLayout,ICPURenderpass>> +class ICPUGraphicsPipeline final : public ICPUPipeline> { - using pipeline_base_t = IGraphicsPipeline,ICPUPipelineLayout, ICPURenderpass>; + using pipeline_base_t = 
IGraphicsPipeline; using base_t = ICPUPipeline; public: - struct SCreationParams final : pipeline_base_t::SCreationParams - { - private: - friend class ICPUGraphicsPipeline; - template - inline bool impl_valid(ExtraLambda&& extra) const - { - return pipeline_base_t::SCreationParams::impl_valid(std::move(extra)); - } - }; - - static core::smart_refctd_ptr create(const SCreationParams& params) - { - // we'll validate the specialization info later when attempting to set it - if (!params.impl_valid([](const SShaderSpecInfo& info)->bool{return true;})) - return nullptr; - auto retval = new ICPUGraphicsPipeline(params); - for (const auto spec : params.shaders) + explicit ICPUGraphicsPipeline(const ICPUPipelineLayout* layout) + : base_t(layout, {}, {}) + {} + + static core::smart_refctd_ptr create(const ICPUPipelineLayout* layout) { - if (spec.shader) retval->setSpecInfo(spec); + auto retval = new ICPUGraphicsPipeline(layout); + return core::smart_refctd_ptr(retval,core::dont_grab); } - return core::smart_refctd_ptr(retval,core::dont_grab); - } - - inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override final - { - core::smart_refctd_ptr layout; - if (_depth>0u && m_layout) - layout = core::smart_refctd_ptr_static_cast(m_layout->clone(_depth-1u)); - auto* cp = [&] { - std::array, GRAPHICS_SHADER_STAGE_COUNT> _shaders; + inline base_t* clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + { + auto* newPipeline = new ICPUGraphicsPipeline(layout.get()); for (auto i = 0; i < GRAPHICS_SHADER_STAGE_COUNT; i++) - _shaders[i] = m_specInfos[i]; - const SCreationParams params = { { - .shaders = _shaders, - .cached = m_params, - .renderpass = m_renderpass.get() - } }; - return new ICPUGraphicsPipeline(params); - }(); - for (auto specInfo : m_specInfos) - { - if (specInfo.shader) + newPipeline->m_specInfos[i] = m_specInfos[i]; + newPipeline->m_params = m_params; + newPipeline->m_renderpass = m_renderpass; + + for (auto specInfo_i = 0u; 
specInfo_i < m_specInfos.size(); specInfo_i++) { - auto newSpecInfo = specInfo; - if (_depth>0u) - { - newSpecInfo.shader = core::smart_refctd_ptr_static_cast(specInfo.shader->clone(_depth-1u)); - } - cp->setSpecInfo(newSpecInfo); + newPipeline->m_specInfos[specInfo_i] = newPipeline->cloneSpecInfo(m_specInfos[specInfo_i], depth); } - } - - return core::smart_refctd_ptr(cp,core::dont_grab); - } - - - constexpr static inline auto AssetType = ET_GRAPHICS_PIPELINE; - inline E_TYPE getAssetType() const override { return AssetType; } - - inline size_t getDependantCount() const override - { - auto stageCount = 2; // the layout and renderpass - for (const auto& info : m_specInfos) - { - if (info.shader) - stageCount++; - } - return stageCount; - } - - // extras for this class - inline const SCachedCreationParams& getCachedCreationParams() const {return base_t::getCachedCreationParams();} + + return newPipeline; + } + + constexpr static inline auto AssetType = ET_GRAPHICS_PIPELINE; + inline E_TYPE getAssetType() const override { return AssetType; } + + inline size_t getDependantCount() const override + { + auto stageCount = 2; // the layout and renderpass + for (const auto& info : m_specInfos) + { + if (info.shader) + stageCount++; + } + return stageCount; + } + inline SCachedCreationParams& getCachedCreationParams() { assert(isMutable()); return m_params; } + inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) override final + { + const auto stageIndex = stageToIndex(stage); + if (isMutable() && stageIndex != -1) + { + return { &m_specInfos[stageIndex], 1 }; + } + return {}; + } + + inline virtual bool valid() const override final + { + // TODO(kevinyu): Fix this temporary stub code + return true; + } + protected: - using base_t::base_t; + using base_t::base_t; ~ICPUGraphicsPipeline() = default; - inline IAsset* getDependant_impl(const size_t ix) override - { - if (ix==0) - return const_cast(m_layout.get()); - if (ix==1) - return m_renderpass.get(); - size_t 
stageCount = 0; - for (auto& specInfo : m_specInfos) - { - if (specInfo.shader) + inline IAsset* getDependant_impl(const size_t ix) override { - if ((stageCount++)==ix-2) - return specInfo.shader.get(); + if (ix==0) + return const_cast(m_layout.get()); + if (ix==1) + return m_renderpass.get(); + size_t stageCount = 0; + for (auto& specInfo : m_specInfos) + { + if (specInfo.shader) + if ((stageCount++)==ix-2) return specInfo.shader.get(); + } + return nullptr; + } + + inline int8_t stageToIndex(const hlsl::ShaderStage stage) const + { + const auto stageIx = hlsl::findLSB(stage); + if (stageIx<0 || stageIx>= GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) + return -1; + return stageIx; } - } - return nullptr; - } - - inline int8_t stageToIndex(const hlsl::ShaderStage stage) const - { - const auto stageIx = hlsl::findLSB(stage); - if (stageIx<0 || stageIx>= GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) - return -1; - return stageIx; - } - - inline bool setSpecInfo(const SShaderSpecInfo& info) - { - assert(isMutable()); - const auto specSize = info.valid(); - if (specSize<0) return false; - const auto stage = info.stage; - const auto stageIx = stageToIndex(stage); - if (stageIx<0) return false; - m_specInfos[stageIx] = info; - return true; - } - - SShaderSpecInfo m_specInfos[GRAPHICS_SHADER_STAGE_COUNT]; + + std::array m_specInfos; }; } diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index eb634d3f12..623d5ae2df 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -13,12 +13,95 @@ namespace nbl::asset { +class ICPUPipelineBase +{ + public: + struct SShaderSpecInfo + { + //! Structure specifying a specialization map entry + /* + Note that if specialization constant ID is used + in a shader, \bsize\b and \boffset'b must match + to \isuch an ID\i accordingly. 
+ + By design the API satisfies: + https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-offset-00773 + https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-pMapEntries-00774 + */ + //!< The ID of the specialization constant in SPIR-V. If it isn't used in the shader, the map entry does not affect the behavior of the pipeline. + using spec_constant_id_t = uint32_t; + + struct SSpecConstantValue + { + core::vector data; + inline operator bool() const { return data.size(); } + inline size_t size() const { return data.size(); } + }; + + inline SSpecConstantValue* getSpecializationByteValue(const spec_constant_id_t _specConstID) + { + const auto found = entries.find(_specConstID); + if (found != entries.end() && bool(found->second)) return &found->second; + else return nullptr; + } + + static constexpr int32_t INVALID_SPEC_INFO = -1; + inline int32_t valid() const + { + if (!shader) return INVALID_SPEC_INFO; + + // Impossible to check: https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pName-00707 + if (entryPoint.empty()) return INVALID_SPEC_INFO; + + // Impossible to efficiently check anything from: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-maxClipDistances-00708 + // to: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-06686 + // and from: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02756 + // to: + // 
https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-module-08987 + + int64_t specData = 0; + for (const auto& entry : entries) + { + if (!entry.second) return INVALID_SPEC_INFO; + specData += entry.second.size(); + } + if (specData > 0x7fffffff) return INVALID_SPEC_INFO; + return static_cast(specData); + } + + core::smart_refctd_ptr shader = nullptr; + std::string entryPoint = ""; + IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize : 3 = IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement + uint8_t requireFullSubgroups : 1 = false; + + // Container choice implicitly satisfies: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 + core::unordered_map entries; + // By requiring Nabla Core Profile features we implicitly satisfy: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02784 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02785 + // Also because our API is sane, it satisfies the following by construction: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 + + }; + + virtual std::span getSpecInfo(const hlsl::ShaderStage stage) = 0; + inline std::span getSpecInfo(const hlsl::ShaderStage stage) const + { + return getSpecInfo(stage); + } + + virtual bool valid() const = 0; +}; + // Common Base class for pipelines template -class ICPUPipeline : public IAsset, public PipelineNonAssetBase +class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipelineBase { using this_t = ICPUPipeline; - using shader_info_spec_t = 
IPipelineBase::SShaderSpecInfo; public: @@ -36,9 +119,33 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase PipelineNonAssetBase::m_layout = std::move(_layout); } + inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override final + { + core::smart_refctd_ptr layout; + if (_depth>0u && getLayout()) + layout = core::smart_refctd_ptr_static_cast(getLayout->clone(_depth-1u)); + + auto* newPipeline = clone_impl(std::move(layout), _depth); + + return core::smart_refctd_ptr(newPipeline,core::dont_grab); + } + + SShaderSpecInfo cloneSpecInfo(const SShaderSpecInfo& specInfo, uint32_t depth) + { + auto newSpecInfo = specInfo; + if (depth>0u) + { + newSpecInfo.shader = core::smart_refctd_ptr_static_cast(specInfo.shader->clone(depth - 1u)); + } + return newSpecInfo; + } + protected: + using PipelineNonAssetBase::PipelineNonAssetBase; virtual ~ICPUPipeline() = default; + + virtual this_t* clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const = 0; }; diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index 77f220aa78..3e029e76b2 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -81,85 +81,23 @@ class IGraphicsPipelineBase : public virtual core::IReferenceCounted }; }; -template +template class IGraphicsPipeline : public IPipeline, public IGraphicsPipelineBase { protected: using renderpass_t = RenderpassType; - using spec_info_t = SpecInfoType; public: - struct SCreationParams : IPipeline::SCreationParams - { - protected: - template - inline bool impl_valid(ExtraLambda&& extra) const - { - if (!IPipeline::SCreationParams::layout) - return false; - - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-dynamicRendering-06576 - if (!renderpass || cached.subpassIx>=renderpass->getSubpassCount()) - return false; - - // TODO: check rasterization samples, etc. 
- //rp->getCreationParameters().subpasses[i] - - core::bitflag stagePresence = {}; - for (const auto info : shaders) - if (info.shader) - { - if (!extra(info)) - return false; - const auto stage = info.stage; - if (stage>hlsl::ShaderStage::ESS_FRAGMENT) - return false; - if (stagePresence.hasFlags(stage)) - return false; - stagePresence |= stage; - } - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-stage-02096 - if (!stagePresence.hasFlags(hlsl::ShaderStage::ESS_VERTEX)) - return false; - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00729 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00730 - if (stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_CONTROL)!=stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)) - return false; - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-08888 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-topology-08889 - if (stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)!=(cached.primitiveAssembly.primitiveType==EPT_PATCH_LIST)) - return false; - - return true; - } - - public: - inline bool valid() const - { - return impl_valid([](const spec_info_t& info)->bool - { - if (!info.valid()) - return false; - return false; - }); - } - - std::span shaders = {}; - SCachedCreationParams cached = {}; - renderpass_t* renderpass = nullptr; - }; - inline const SCachedCreationParams& getCachedCreationParams() const {return m_params;} - inline const renderpass_t* getRenderpass() const {return m_renderpass.get();} protected: - explicit 
IGraphicsPipeline(const SCreationParams& _params) : - IPipeline(core::smart_refctd_ptr(_params.layout)), - m_params(_params.cached), m_renderpass(core::smart_refctd_ptr(_params.renderpass)) {} + explicit IGraphicsPipeline(const PipelineLayoutType* layout, const SCachedCreationParams& cachedParams, const renderpass_t* renderpass) : + IPipeline(core::smart_refctd_ptr(layout)), m_renderpass(core::smart_refctd_ptr(renderpass)) + {} - SCachedCreationParams m_params; - core::smart_refctd_ptr m_renderpass; + SCachedCreationParams m_params = {}; + core::smart_refctd_ptr m_renderpass = nullptr; }; } diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h index 97d7ab9c94..98f1671cca 100644 --- a/include/nbl/asset/IPipeline.h +++ b/include/nbl/asset/IPipeline.h @@ -27,265 +27,112 @@ namespace nbl::asset */ class IPipelineBase { - public: - struct SCreationParams - { - protected: - // This is not public to make sure that different pipelines only get the enums they support - enum class FLAGS : uint64_t - { - NONE = 0, // disallowed in maintanance5 - DISABLE_OPTIMIZATIONS = 1<<0, - ALLOW_DERIVATIVES = 1<<1, - - // I can just derive this - //DERIVATIVE = 1<<2, + public: + enum class CreationFlags : uint64_t + { + NONE = 0, // disallowed in maintanance5 + DISABLE_OPTIMIZATIONS = 1 << 0, + ALLOW_DERIVATIVES = 1 << 1, + + // I can just derive this + //DERIVATIVE = 1<<2, + + // Graphics Pipelines only + //VIEW_INDEX_FROM_DEVICE_INDEX = 1<<3, + + // Compute Pipelines only + //DISPATCH_BASE = 1<<4, + + // This is for NV-raytracing extension. Now this is done via IDeferredOperation + //DEFER_COMPILE_NV = 1<<5, + + // We use Renderdoc to take care of this for us, + // we won't be parsing the statistics and internal representation ourselves. 
+ //CAPTURE_STATISTICS = 1<<6, + //CAPTURE_INTERNAL_REPRESENTATIONS = 1<<7, + + // Will soon be deprecated due to + // https://github.com/Devsh-Graphics-Programming/Nabla/issues/854 + FAIL_ON_PIPELINE_COMPILE_REQUIRED = 1 << 8, + EARLY_RETURN_ON_FAILURE = 1 << 9, + + // Will be exposed later with the IPipelineLibrary asset implementation + // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 + //LINK_TIME_OPTIMIZATION = 1<<10, + + // Won't be exposed because we'll introduce Libraries as a separate object/asset-type + // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 + //CREATE_LIBRARY = 1<<11, + + // Ray Tracing Pipelines only + //SKIP_BUILT_IN_PRIMITIVES = 1<<12, + //SKIP_AABBS = 1<<13, + //NO_NULL_ANY_HIT_SHADERS = 1<<14, + //NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, + //NO_NULL_MISS_SHADERS = 1<<16, + //NO_NULL_INTERSECTION_SHADERS = 1<<17, + + // There is a new Device Generated Commands extension with its own flag that will deprecate this + //INDIRECT_BINDABLE_NV = 1<<18, + + // Ray Tracing Pipelines only + // For debug tools + //RAY_TRACING_SHADER_GROUP_HANDLE_CAPTURE_REPLAY_BIT_KHR = 1<<19, + + // Ray Tracing Pipelines only + //ALLOW_MOTION = 1<<20, + + // Graphics Pipelineonly (we don't support subpass shading) + //RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR = 1<<21, + //RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT = 1<<22, + + // Will be exposed later with the IPipelineLibrary asset implementation + // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 + //RETAIN_LINK_TIME_OPTIMIZATION_INFO = 1<<23, + + // Ray Tracing Pipelines only + //RAY_TRACING_OPACITY_MICROMAP_BIT_EXT = 1<<24, + + // Not supported yet, and we will move to dynamic rendering, so this might never be supported + //COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT = 1<<25, + //DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT = 1<<26, + + // Not Supported Yet + //NO_PROTECTED_ACCESS=1<<27, + //RAY_TRACING_DISPLACEMENT_MICROMAP_BIT_NV = 1<<28, + 
//DESCRIPTOR_VUFFER_BIT=1<<29, + //PROTECTED_ACCESS_ONLY=1<<30, + }; + + // Nabla requires device's reported subgroup size to be between 4 and 128 + enum class SUBGROUP_SIZE : uint8_t + { + // No constraint but probably means `gl_SubgroupSize` is Dynamically Uniform + UNKNOWN = 0, + // Allows the Subgroup Uniform `gl_SubgroupSize` to be non-Dynamically Uniform and vary between Device's min and max + VARYING = 1, + // The rest we encode as log2(x) of the required value + REQUIRE_4 = 2, + REQUIRE_8 = 3, + REQUIRE_16 = 4, + REQUIRE_32 = 5, + REQUIRE_64 = 6, + REQUIRE_128 = 7 + }; - // Graphics Pipelines only - //VIEW_INDEX_FROM_DEVICE_INDEX = 1<<3, - - // Compute Pipelines only - //DISPATCH_BASE = 1<<4, - - // This is for NV-raytracing extension. Now this is done via IDeferredOperation - //DEFER_COMPILE_NV = 1<<5, - - // We use Renderdoc to take care of this for us, - // we won't be parsing the statistics and internal representation ourselves. - //CAPTURE_STATISTICS = 1<<6, - //CAPTURE_INTERNAL_REPRESENTATIONS = 1<<7, - - // Will soon be deprecated due to - // https://github.com/Devsh-Graphics-Programming/Nabla/issues/854 - FAIL_ON_PIPELINE_COMPILE_REQUIRED = 1<<8, - EARLY_RETURN_ON_FAILURE = 1<<9, - - // Will be exposed later with the IPipelineLibrary asset implementation - // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 - //LINK_TIME_OPTIMIZATION = 1<<10, - - // Won't be exposed because we'll introduce Libraries as a separate object/asset-type - // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 - //CREATE_LIBRARY = 1<<11, - - // Ray Tracing Pipelines only - //SKIP_BUILT_IN_PRIMITIVES = 1<<12, - //SKIP_AABBS = 1<<13, - //NO_NULL_ANY_HIT_SHADERS = 1<<14, - //NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, - //NO_NULL_MISS_SHADERS = 1<<16, - //NO_NULL_INTERSECTION_SHADERS = 1<<17, - - // There is a new Device Generated Commands extension with its own flag that will deprecate this - //INDIRECT_BINDABLE_NV = 1<<18, - - // Ray Tracing Pipelines only 
- // For debug tools - //RAY_TRACING_SHADER_GROUP_HANDLE_CAPTURE_REPLAY_BIT_KHR = 1<<19, - - // Ray Tracing Pipelines only - //ALLOW_MOTION = 1<<20, - - // Graphics Pipelineonly (we don't support subpass shading) - //RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR = 1<<21, - //RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT = 1<<22, - - // Will be exposed later with the IPipelineLibrary asset implementation - // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 - //RETAIN_LINK_TIME_OPTIMIZATION_INFO = 1<<23, - - // Ray Tracing Pipelines only - //RAY_TRACING_OPACITY_MICROMAP_BIT_EXT = 1<<24, - - // Not supported yet, and we will move to dynamic rendering, so this might never be supported - //COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT = 1<<25, - //DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT = 1<<26, - - // Not Supported Yet - //NO_PROTECTED_ACCESS=1<<27, - //RAY_TRACING_DISPLACEMENT_MICROMAP_BIT_NV = 1<<28, - //DESCRIPTOR_VUFFER_BIT=1<<29, - //PROTECTED_ACCESS_ONLY=1<<30, - }; - }; - - /* - Specialization info contains things such as entry point to a shader, - specialization map entry, required subgroup size, etc. for a blob of SPIR-V - - It also handles Specialization Constants. - - In Vulkan, all shaders get halfway-compiled into SPIR-V and - then then lowered (compiled) into the HW ISA by the Vulkan driver. - Normally, the half-way compile folds all constant values - and optimizes the code that uses them. - - But, it would be nice every so often to have your Vulkan - program sneak into the halfway-compiled SPIR-V binary and - manipulate some constants at runtime. This is what - Specialization Constants are for. - - So A Specialization Constant is a way of injecting an integer - constant into a halfway-compiled version of a shader right - before the lowering and linking when creating a pipeline. 
- - Without Specialization Constants, you would have to commit - to a final value before the SPIR-V compilation - */ - template - struct SShaderSpecInfo final - { - - //! Structure specifying a specialization map entry - /* - Note that if specialization constant ID is used - in a shader, \bsize\b and \boffset'b must match - to \isuch an ID\i accordingly. - - By design the API satisfies: - https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-offset-00773 - https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-pMapEntries-00774 - */ - //!< The ID of the specialization constant in SPIR-V. If it isn't used in the shader, the map entry does not affect the behavior of the pipeline. - using spec_constant_id_t = uint32_t; - struct SSpecConstantValueImmutable - { - const void* data = nullptr; - //!< The byte size of the specialization constant value within the supplied data buffer. 
- uint32_t size = 0; - - inline operator bool() const {return data&&size;} - - auto operator<=>(const SSpecConstantValueImmutable&) const = default; - }; - - struct SSPecConstantValueMutable - { - core::vector data; - inline operator bool() const { return data.size(); } - auto operator<=>(const SSPecConstantValueMutable&) const = default; - }; - - using SSpecConstantValue = std::conditional_t; - - inline SSpecConstantValue getSpecializationByteValue(const spec_constant_id_t _specConstID) const - { - if (!entries) - return { nullptr,0u }; - - const auto found = entries->find(_specConstID); - if (found != entries->end() && bool(found->second)) - return found->second; - else - return { nullptr,0u }; - } - - // Nabla requires device's reported subgroup size to be between 4 and 128 - enum class SUBGROUP_SIZE : uint8_t - { - // No constraint but probably means `gl_SubgroupSize` is Dynamically Uniform - UNKNOWN = 0, - // Allows the Subgroup Uniform `gl_SubgroupSize` to be non-Dynamically Uniform and vary between Device's min and max - VARYING = 1, - // The rest we encode as log2(x) of the required value - REQUIRE_4 = 2, - REQUIRE_8 = 3, - REQUIRE_16 = 4, - REQUIRE_32 = 5, - REQUIRE_64 = 6, - REQUIRE_128 = 7 - }; - - // - static constexpr int32_t INVALID_SPEC_INFO = -1; - // Returns negative on failure, otherwise the size of the buffer required to reserve for the spec constant data - inline int32_t valid() const - { - if (!shader || hlsl::bitCount(stage)!=1) - return INVALID_SPEC_INFO; - - // Impossible to check: https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pName-00707 - if (entryPoint.empty()) - return INVALID_SPEC_INFO; - - // Shader stages already checked for validity w.r.t. 
features enabled, during unspec shader creation, only check: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-08988 - if (requireFullSubgroups) - switch (stage) - { - case hlsl::ShaderStage::ESS_COMPUTE: [[fallthrough]]; - case hlsl::ShaderStage::ESS_TASK: [[fallthrough]]; - case hlsl::ShaderStage::ESS_MESH: - break; - default: - return INVALID_SPEC_INFO; - break; - } - // Impossible to efficiently check anything from: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-maxClipDistances-00708 - // to: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-06686 - // and from: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02756 - // to: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-module-08987 - - int64_t specData = 0; - if (entries) - for (const auto& entry : *entries) - { - if (!entry.second) - return INVALID_SPEC_INFO; - specData += entry.second.size; - } - if (specData>0x7fffffff) - return INVALID_SPEC_INFO; - return static_cast(specData); - } - - using shader_ptr_t = std::conditional_t, const IShader*>; - using entry_point_t = std::conditional_t; - using spec_constant_map_t = core::unordered_map; - using entries_t = std::conditional_t; - - shader_ptr_t shader = nullptr; - // A name of the function where the entry point of an shader executable begins. It's often "main" function. 
- entry_point_t entryPoint = {}; - // stage must be set - hlsl::ShaderStage stage = hlsl::ShaderStage::ESS_UNKNOWN; - // there's some padding here - SUBGROUP_SIZE requiredSubgroupSize : 3 = SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement - // Valid only for Compute, Mesh and Task shaders - uint8_t requireFullSubgroups : 1 = false; - // Container choice implicitly satisfies: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 - entries_t entries = nullptr; - // By requiring Nabla Core Profile features we implicitly satisfy: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02784 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02785 - // Also because our API is sane, it satisfies the following by construction: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 - - }; }; template class IPipeline : public IPipelineBase { - public: - // For now, due to API design we implicitly satisfy a bunch of VUIDs - struct SCreationParams : protected IPipelineBase::SCreationParams - { - public: - const PipelineLayout* layout = nullptr; - }; + public: + inline const PipelineLayout* getLayout() const {return m_layout.get();} - inline const PipelineLayout* getLayout() const {return m_layout.get();} + protected: - protected: - inline IPipeline(core::smart_refctd_ptr&& _layout) - : m_layout(std::move(_layout)) {} + inline IPipeline(core::smart_refctd_ptr&& _layout) + : m_layout(std::move(_layout)) {} - core::smart_refctd_ptr m_layout; + core::smart_refctd_ptr m_layout; }; } diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 
ba29cc58e2..8c5fc017d9 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -7,20 +7,19 @@ #include "nbl/asset/IPipeline.h" -#include "nbl/video/SPipelineCreationParams.h" +#include "nbl/video/IGPUPipeline.h" #include "nbl/video/SPipelineCreationParams.h" namespace nbl::video { -class IGPUComputePipeline : public IBackendObject, public asset::IPipeline +class IGPUComputePipeline : public IGPUPipeline> { using pipeline_t = asset::IPipeline; - using spec_info_t = SShaderSpecInfo; public: - struct SCreationParams final : pipeline_t::SCreationParams, SPipelineCreationParams + struct SCreationParams final : SPipelineCreationParams { // By construction we satisfy from: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkComputePipelineCreateInfo.html#VUID-VkComputePipelineCreateInfo-flags-03365 @@ -29,7 +28,7 @@ class IGPUComputePipeline : public IBackendObject, public asset::IPipeline(pipeline_t::SCreationParams::FLAGS::F) + #define base_flag(F) static_cast(pipeline_t::CreationFlags::F) enum class FLAGS : uint64_t { NONE = base_flag(NONE), @@ -51,7 +50,7 @@ class IGPUComputePipeline : public IBackendObject, public asset::IPipeline(dataSize)}; } - inline std::span getShaders() const {return {&shader,1}; } + inline std::span getShaders() const {return {&shader,1}; } // TODO: Could guess the required flags from SPIR-V introspection of declared caps core::bitflag flags = FLAGS::NONE; - spec_info_t shader = {}; + SShaderSpecInfo shader = {}; }; inline core::bitflag getCreationFlags() const {return m_flags;} @@ -78,9 +77,8 @@ class IGPUComputePipeline : public IBackendObject, public asset::IPipeline&& _layout, const core::bitflag _flags) : - IBackendObject(core::smart_refctd_ptr(_layout->getOriginDevice())), - pipeline_t(std::move(_layout)), - m_flags(_flags) {} + IGPUPipeline(core::smart_refctd_ptr(_layout->getOriginDevice()), std::move(_layout)), m_flags(_flags) + {} virtual ~IGPUComputePipeline() = 
default; const core::bitflag m_flags; diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index 4838d7f4d3..9fba0c4a4a 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -6,20 +6,21 @@ #include "nbl/video/IGPUPipelineLayout.h" #include "nbl/video/IGPURenderpass.h" +#include "nbl/video/IGPUPipeline.h" namespace nbl::video { -class IGPUGraphicsPipeline : public IBackendObject, public asset::IGraphicsPipeline, const IGPUPipelineLayout,const IGPURenderpass> +class IGPUGraphicsPipeline : public IGPUPipeline> { - using pipeline_t = asset::IGraphicsPipeline, const IGPUPipelineLayout,const IGPURenderpass>; + using pipeline_t = asset::IGraphicsPipeline; public: - struct SCreationParams final : pipeline_t::SCreationParams, SPipelineCreationParams - { + struct SCreationParams final : public SPipelineCreationParams + { public: - #define base_flag(F) static_cast(pipeline_t::SCreationParams::FLAGS::F) + #define base_flag(F) static_cast(pipeline_t::CreationFlags::F) enum class FLAGS : uint64_t { NONE = base_flag(NONE), @@ -31,12 +32,53 @@ class IGPUGraphicsPipeline : public IBackendObject, public asset::IGraphicsPipel }; #undef base_flag + template + inline bool impl_valid(ExtraLambda&& extra) const + { + if (!layout) + return false; + + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-dynamicRendering-06576 + if (!renderpass || cached.subpassIx>=renderpass->getSubpassCount()) + return false; + + // TODO: check rasterization samples, etc. 
+ //rp->getCreationParameters().subpasses[i] + + core::bitflag stagePresence = {}; + for (const auto info : shaders) + if (info.shader) + { + if (!extra(info)) + return false; + const auto stage = info.stage; + if (stage>hlsl::ShaderStage::ESS_FRAGMENT) + return false; + if (stagePresence.hasFlags(stage)) + return false; + stagePresence |= stage; + } + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-stage-02096 + if (!stagePresence.hasFlags(hlsl::ShaderStage::ESS_VERTEX)) + return false; + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00729 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00730 + if (stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_CONTROL)!=stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)) + return false; + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-08888 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-topology-08889 + if (stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)!=(cached.primitiveAssembly.primitiveType==asset::EPT_PATCH_LIST)) + return false; + + return true; + } + inline SSpecializationValidationResult valid() const { if (!layout) return {}; SSpecializationValidationResult retval = {.count=0,.dataSize=0}; - const bool valid = pipeline_t::SCreationParams::impl_valid([&retval](const spec_info_t& info)->bool + const bool valid = impl_valid([&retval](const SShaderSpecInfo& info)->bool { const auto dataSize = info.valid(); if (dataSize<0) @@ -55,11 +97,16 @@ class IGPUGraphicsPipeline : public IBackendObject, public 
asset::IGraphicsPipel return retval; } - inline std::span getShaders() const {return shaders;} + inline std::span getShaders() const {return shaders;} + + IGPUPipelineLayout* layout = nullptr; + std::span shaders = {}; + SCachedCreationParams cached = {}; + renderpass_t* renderpass = nullptr; // TODO: Could guess the required flags from SPIR-V introspection of declared caps core::bitflag flags = FLAGS::NONE; - }; + }; inline core::bitflag getCreationFlags() const {return m_flags;} @@ -67,8 +114,9 @@ class IGPUGraphicsPipeline : public IBackendObject, public asset::IGraphicsPipel virtual const void* getNativeHandle() const = 0; protected: - IGPUGraphicsPipeline(const SCreationParams& params) : IBackendObject(core::smart_refctd_ptr(params.layout->getOriginDevice())), - pipeline_t(params), m_flags(params.flags) {} + IGPUGraphicsPipeline(const SCreationParams& params) : + IGPUPipeline(core::smart_refctd_ptr(params.layout->getOriginDevice()), params.layout, params.cached, params.renderpass), m_flags(params.flags) + {} virtual ~IGPUGraphicsPipeline() = default; const core::bitflag m_flags; diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h new file mode 100644 index 0000000000..0761d5d020 --- /dev/null +++ b/include/nbl/video/IGPUPipeline.h @@ -0,0 +1,110 @@ + + +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_I_GPU_PIPELINE_H_INCLUDED_ +#define _NBL_VIDEO_I_GPU_PIPELINE_H_INCLUDED_ + +#include "nbl/asset/IPipeline.h" + +namespace nbl::video +{ + +class IGPUPipelineBase { + public: + struct SShaderSpecInfo + { + //! Structure specifying a specialization map entry + /* + Note that if specialization constant ID is used + in a shader, \bsize\b and \boffset'b must match + to \isuch an ID\i accordingly. 
+ + By design the API satisfies: + https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-offset-00773 + https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-pMapEntries-00774 + */ + //!< The ID of the specialization constant in SPIR-V. If it isn't used in the shader, the map entry does not affect the behavior of the pipeline. + using spec_constant_id_t = uint32_t; + + struct SSpecConstantValue + { + std::span data; + inline operator bool() const { return data.size(); } + inline size_t size() const { return data.size(); } + }; + + inline SSpecConstantValue getSpecializationByteValue(const spec_constant_id_t _specConstID) const + { + if (!entries) return {}; + + const auto found = entries->find(_specConstID); + if (found != entries->end() && bool(found->second)) return found->second; + else return {}; + } + + static constexpr int32_t INVALID_SPEC_INFO = -1; + inline int32_t valid() const + { + if (!shader) return INVALID_SPEC_INFO; + + // Impossible to check: https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pName-00707 + if (entryPoint.empty()) return INVALID_SPEC_INFO; + + // Impossible to efficiently check anything from: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-maxClipDistances-00708 + // to: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-06686 + // and from: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02756 + // to: + // 
https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-module-08987 + + int64_t specData = 0; + for (const auto& entry : *entries) + { + if (!entry.second) + return INVALID_SPEC_INFO; + specData += entry.second.size(); + } + if (specData>0x7fffffff) + return INVALID_SPEC_INFO; + return static_cast(specData); + } + + const asset::IShader* shader = nullptr; + std::string_view entryPoint = ""; + asset::IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize : 3 = asset::IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement + uint8_t requireFullSubgroups : 1 = false; + + // Container choice implicitly satisfies: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 + const core::unordered_map* entries; + // By requiring Nabla Core Profile features we implicitly satisfy: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02784 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02785 + // Also because our API is sane, it satisfies the following by construction: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 + + }; + +}; + +// Common Base class for pipelines +template +class IGPUPipeline : public IBackendObject, public PipelineNonAssetBase, public IGPUPipelineBase +{ + protected: + + template + explicit IGPUPipeline(core::smart_refctd_ptr&& device, Args&&... 
args) : + PipelineNonAssetBase(std::forward(args...)), IBackendObject(std::move(device)) + {} + virtual ~IGPUPipeline() = default; + +}; + +} + +#endif diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index fb8c371193..c41ed333a1 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -10,7 +10,7 @@ namespace nbl::video { -class IGPURayTracingPipeline : public IBackendObject, public asset::IRayTracingPipeline +class IGPURayTracingPipeline : public IGPUPipeline> { using pipeline_t = asset::IRayTracingPipeline; @@ -30,8 +30,28 @@ class IGPURayTracingPipeline : public IBackendObject, public asset::IRayTracingP uint16_t intersection; }; - struct SCreationParams final : pipeline_t::SCreationParams, SPipelineCreationParams + using SGeneralShaderGroupContainer = core::smart_refctd_dynamic_array; + using SHitShaderGroupContainer = core::smart_refctd_dynamic_array; + + struct SCreationParams final : SPipelineCreationParams { + #define base_flag(F) static_cast(IPipelineBase::CreationFlags::F) + enum class FLAGS : uint64_t + { + NONE = base_flag(NONE), + DISABLE_OPTIMIZATIONS = base_flag(DISABLE_OPTIMIZATIONS), + ALLOW_DERIVATIVES = base_flag(ALLOW_DERIVATIVES), + FAIL_ON_PIPELINE_COMPILE_REQUIRED = base_flag(FAIL_ON_PIPELINE_COMPILE_REQUIRED), + EARLY_RETURN_ON_FAILURE = base_flag(EARLY_RETURN_ON_FAILURE), + SKIP_BUILT_IN_PRIMITIVES = 1<<12, + SKIP_AABBS = 1<<13, + NO_NULL_ANY_HIT_SHADERS = 1<<14, + NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, + NO_NULL_MISS_SHADERS = 1<<16, + NO_NULL_INTERSECTION_SHADERS = 1<<17, + ALLOW_MOTION = 1<<20, + }; + #undef base_flag inline SSpecializationValidationResult valid() const { @@ -42,7 +62,7 @@ class IGPURayTracingPipeline : public IBackendObject, public asset::IRayTracingP .count=0, .dataSize=0, }; - const bool valid = pipeline_t::SCreationParams::impl_valid([&retval](const asset::IPipelineBase::SShaderSpecInfo& info)->bool + const bool 
valid = pipeline_t::SCreationParams::impl_valid([&retval](const spec_info_t& info)->bool { const auto dataSize = info.valid(); if (dataSize<0) @@ -61,8 +81,9 @@ class IGPURayTracingPipeline : public IBackendObject, public asset::IRayTracingP return retval; } - inline std::span getShaders() const { return shaders; } + inline std::span getShaders() const { return shaders; } + IGPUPipelineLayout* layout = nullptr; }; inline core::bitflag getCreationFlags() const { return m_flags; } @@ -82,8 +103,7 @@ class IGPURayTracingPipeline : public IBackendObject, public asset::IRayTracingP virtual uint16_t getDefaultStackSize() const = 0; protected: - IGPURayTracingPipeline(const SCreationParams& params) : IBackendObject(core::smart_refctd_ptr(params.layout->getOriginDevice())), - pipeline_t(params), + IGPURayTracingPipeline(const SCreationParams& params) : IGPUPipeline(core::smart_refctd_ptr(params.layout->getOriginDevice()), params), m_flags(params.flags) {} diff --git a/include/nbl/video/SPipelineCreationParams.h b/include/nbl/video/SPipelineCreationParams.h index 489bff4343..969559d941 100644 --- a/include/nbl/video/SPipelineCreationParams.h +++ b/include/nbl/video/SPipelineCreationParams.h @@ -49,7 +49,7 @@ struct SPipelineCreationParams return basePipelineIndex!=NotDerivingFromPreviousPipeline || basePipeline; } - // If you set this, then we don't take `basePipelineIndex` into account, the pointer takes precedence + const PipelineType* basePipeline = nullptr; int32_t basePipelineIndex = NotDerivingFromPreviousPipeline; }; From a9d5aafcf6188116acb92d3177cb27f1236e9951 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 5 May 2025 17:15:56 +0700 Subject: [PATCH 029/346] Fix gpu graphics pipeline stage validation --- include/nbl/asset/ICPUGraphicsPipeline.h | 8 -------- include/nbl/asset/IGraphicsPipeline.h | 15 +++++++++++++++ include/nbl/video/IGPUGraphicsPipeline.h | 13 +++++-------- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git 
a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index b93b8165aa..0f90f1213d 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -100,14 +100,6 @@ class ICPUGraphicsPipeline final : public ICPUPipeline= GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) - return -1; - return stageIx; - } - std::array m_specInfos; }; diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index 3e029e76b2..d7ccf598ed 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -91,6 +91,21 @@ class IGraphicsPipeline : public IPipeline, public IGraphics inline const SCachedCreationParams& getCachedCreationParams() const {return m_params;} inline const renderpass_t* getRenderpass() const {return m_renderpass.get();} + static inline int8_t stageToIndex(const hlsl::ShaderStage stage) + { + const auto stageIx = hlsl::findLSB(stage); + if (stageIx < 0 || stageIx >= GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) + return -1; + return stageIx; + } + + static inline hlsl::ShaderStage indexToStage(const int8_t index) + { + if (index < 0 || index > GRAPHICS_SHADER_STAGE_COUNT) + return hlsl::ShaderStage::ESS_UNKNOWN; + return static_cast(hlsl::ShaderStage::ESS_VERTEX + index); + } + protected: explicit IGraphicsPipeline(const PipelineLayoutType* layout, const SCachedCreationParams& cachedParams, const renderpass_t* renderpass) : IPipeline(core::smart_refctd_ptr(layout)), m_renderpass(core::smart_refctd_ptr(renderpass)) diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index 9fba0c4a4a..50c09123cb 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -46,18 +46,15 @@ class IGPUGraphicsPipeline : public IGPUPipelinegetCreationParameters().subpasses[i] core::bitflag stagePresence = {}; - for (const auto info : shaders) - if 
(info.shader) + for (auto shader_i = 0u; shader_i < shaders.size(); shader_i++) { + const auto& info = shaders[shader_i]; if (!extra(info)) return false; - const auto stage = info.stage; - if (stage>hlsl::ShaderStage::ESS_FRAGMENT) - return false; - if (stagePresence.hasFlags(stage)) - return false; - stagePresence |= stage; + if (info.shader) + stagePresence |= indexToStage(shader_i); } + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-stage-02096 if (!stagePresence.hasFlags(hlsl::ShaderStage::ESS_VERTEX)) return false; From 51b69c1574e89b6b8ff1ac67b748cb7b6f200e77 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 5 May 2025 17:35:42 +0700 Subject: [PATCH 030/346] Fix compute pipeline --- include/nbl/video/IGPUComputePipeline.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 8c5fc017d9..66eb1dba96 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -50,7 +50,7 @@ class IGPUComputePipeline : public IGPUPipeline getShaders() const {return {&shader,1}; } + IGPUPipelineLayout* layout = nullptr; // TODO: Could guess the required flags from SPIR-V introspection of declared caps core::bitflag flags = FLAGS::NONE; SShaderSpecInfo shader = {}; @@ -76,8 +77,8 @@ class IGPUComputePipeline : public IGPUPipeline&& _layout, const core::bitflag _flags) : - IGPUPipeline(core::smart_refctd_ptr(_layout->getOriginDevice()), std::move(_layout)), m_flags(_flags) + inline IGPUComputePipeline(const SCreationParams& params) : + IGPUPipeline(core::smart_refctd_ptr(params.layout->getOriginDevice()), core::smart_refctd_ptr(params.layout)), m_flags(params.flags) {} virtual ~IGPUComputePipeline() = default; From fa759beec86b44dbcf317502870b5d4d713f8e5d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 5 May 2025 17:55:00 +0700 Subject: [PATCH 
031/346] Implement cpu graphics pipeline validation --- include/nbl/asset/ICPUGraphicsPipeline.h | 15 +++++++++++++-- include/nbl/asset/IGraphicsPipeline.h | 16 ++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 0f90f1213d..c477d42834 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -77,8 +77,19 @@ class ICPUGraphicsPipeline final : public ICPUPipeline= m_renderpass->getSubpassCount()) return false; + + core::bitflag stagePresence = {}; + for (auto shader_i = 0u; shader_i < m_specInfos.size(); shader_i++) + { + const auto& info = m_specInfos[shader_i]; + if (info.shader) + stagePresence |= indexToStage(shader_i); + } + return isValidStagePresence(stagePresence, m_params.primitiveAssembly.primitiveType); } protected: diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index d7ccf598ed..f47cee0fa2 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -106,6 +106,22 @@ class IGraphicsPipeline : public IPipeline, public IGraphics return static_cast(hlsl::ShaderStage::ESS_VERTEX + index); } + static inline bool isValidStagePresence(const core::bitflag& stagePresence, E_PRIMITIVE_TOPOLOGY primitiveType) + { + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-stage-02096 + if (!stagePresence.hasFlags(hlsl::ShaderStage::ESS_VERTEX)) + return false; + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00729 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00730 + if 
(stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_CONTROL)!=stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)) + return false; + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-08888 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-topology-08889 + if (stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)!=(primitiveType==asset::EPT_PATCH_LIST)) + return false; + return true; + } + protected: explicit IGraphicsPipeline(const PipelineLayoutType* layout, const SCachedCreationParams& cachedParams, const renderpass_t* renderpass) : IPipeline(core::smart_refctd_ptr(layout)), m_renderpass(core::smart_refctd_ptr(renderpass)) From de8813feca93e7afdb4a78b37df24987e7c59f48 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 5 May 2025 18:10:14 +0700 Subject: [PATCH 032/346] Implement compute pipeline validation --- include/nbl/asset/ICPUComputePipeline.h | 7 +++---- include/nbl/asset/ICPUGraphicsPipeline.h | 2 +- include/nbl/video/IGPUGraphicsPipeline.h | 15 ++------------- 3 files changed, 6 insertions(+), 18 deletions(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index d9bc8dd646..480f601fc0 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -43,14 +43,13 @@ class ICPUComputePipeline : public ICPUPipeline> inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) override final { if (stage==hlsl::ShaderStage::ESS_COMPUTE && isMutable()) - return {m_specInfo,1}; + return {&m_specInfo,1}; return {}; } inline virtual bool valid() const override final { - // TODO(kevinyu): Fix this temporary dummy code - return true; + return m_specInfo.valid(); } protected: @@ -66,7 +65,7 @@ class ICPUComputePipeline : public ICPUPipeline> private: - 
SShaderSpecInfo m_specInfo; + SShaderSpecInfo m_specInfo; }; diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index c477d42834..7d139d6fe9 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -94,7 +94,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline(params.layout->getOriginDevice()), params.layout, params.cached, params.renderpass), m_flags(params.flags) {} - virtual ~IGPUGraphicsPipeline() = default; + virtual ~IGPUGraphicsPipeline() override = default; const core::bitflag m_flags; }; From 37ab1ce1b34a82b495581e6f01c0f4c5f6329301 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 17:53:26 +0700 Subject: [PATCH 033/346] Add FLAGS alias --- include/nbl/asset/IPipeline.h | 1 + include/nbl/video/IGPUComputePipeline.h | 2 +- include/nbl/video/IGPUGraphicsPipeline.h | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h index 98f1671cca..c458c34afe 100644 --- a/include/nbl/asset/IPipeline.h +++ b/include/nbl/asset/IPipeline.h @@ -103,6 +103,7 @@ class IPipelineBase //DESCRIPTOR_VUFFER_BIT=1<<29, //PROTECTED_ACCESS_ONLY=1<<30, }; + using FLAGS = CreationFlags; // Nabla requires device's reported subgroup size to be between 4 and 128 enum class SUBGROUP_SIZE : uint8_t diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 66eb1dba96..6e825d749b 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -28,7 +28,7 @@ class IGPUComputePipeline : public IGPUPipeline(pipeline_t::CreationFlags::F) + #define base_flag(F) static_cast(pipeline_t::FLAGS::F) enum class FLAGS : uint64_t { NONE = base_flag(NONE), diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index 53ec20244f..fc596a54e1 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ 
b/include/nbl/video/IGPUGraphicsPipeline.h @@ -20,7 +20,7 @@ class IGPUGraphicsPipeline : public IGPUPipeline { public: - #define base_flag(F) static_cast(pipeline_t::CreationFlags::F) + #define base_flag(F) static_cast(pipeline_t::FLAGS::F) enum class FLAGS : uint64_t { NONE = base_flag(NONE), From a0ecd505814f71309de538a994f141397d9e0bcd Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 17:55:00 +0700 Subject: [PATCH 034/346] Fix clone_impl to return smart pointer --- include/nbl/asset/ICPUComputePipeline.h | 6 +++--- include/nbl/asset/ICPUGraphicsPipeline.h | 6 +++--- include/nbl/asset/ICPUPipeline.h | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 480f601fc0..656e8faf6f 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -27,11 +27,11 @@ class ICPUComputePipeline : public ICPUPipeline> return core::smart_refctd_ptr(retval,core::dont_grab); } - inline base_t* clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { auto newPipeline = new ICPUComputePipeline(std::move(layout)); - newPipeline->m_specInfo = newPipeline->cloneSpecInfo(m_specInfo, depth); - return newPipeline; + newPipeline->m_specInfo = m_specInfo.clone(depth); + return core::smart_refctd_ptr(newPipeline, core::dont_grab); } constexpr static inline auto AssetType = ET_COMPUTE_PIPELINE; diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 7d139d6fe9..915a4a43c2 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -29,7 +29,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline(retval,core::dont_grab); } - inline base_t* clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const 
override final + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { auto* newPipeline = new ICPUGraphicsPipeline(layout.get()); for (auto i = 0; i < GRAPHICS_SHADER_STAGE_COUNT; i++) @@ -39,10 +39,10 @@ class ICPUGraphicsPipeline final : public ICPUPipelinem_specInfos[specInfo_i] = newPipeline->cloneSpecInfo(m_specInfos[specInfo_i], depth); + newPipeline->m_specInfos[specInfo_i] = m_specInfos[specInfo_i].clone(depth); } - return newPipeline; + return core::smart_refctd_ptr(newPipeline, core::dont_grab); } constexpr static inline auto AssetType = ET_GRAPHICS_PIPELINE; diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 623d5ae2df..3b48ea43f7 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -121,13 +121,13 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override final { + if (!getLayout()) return nullptr; + core::smart_refctd_ptr layout; - if (_depth>0u && getLayout()) + if (_depth > 0u) layout = core::smart_refctd_ptr_static_cast(getLayout->clone(_depth-1u)); - auto* newPipeline = clone_impl(std::move(layout), _depth); - - return core::smart_refctd_ptr(newPipeline,core::dont_grab); + return clone_impl(std::move(layout), _depth); } SShaderSpecInfo cloneSpecInfo(const SShaderSpecInfo& specInfo, uint32_t depth) @@ -145,7 +145,7 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe using PipelineNonAssetBase::PipelineNonAssetBase; virtual ~ICPUPipeline() = default; - virtual this_t* clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const = 0; + virtual core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const = 0; }; From 7890981b72e366e62d6a0e0f9d364e3cf82bb5d4 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 17:57:40 +0700 Subject: [PATCH 035/346] Add 
final decoration to ICPUComputePipeline --- include/nbl/asset/ICPUComputePipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 656e8faf6f..0869277911 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -12,7 +12,7 @@ namespace nbl::asset { //! CPU Version of Compute Pipeline -class ICPUComputePipeline : public ICPUPipeline> +class ICPUComputePipeline final : public ICPUPipeline> { using base_t = ICPUPipeline>; From 9a14aa175333af1170e80aa2811ba8df6e684111 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 17:58:15 +0700 Subject: [PATCH 036/346] Make cpu pipeline constructor private --- include/nbl/asset/ICPUComputePipeline.h | 7 ++++--- include/nbl/asset/ICPUGraphicsPipeline.h | 8 +++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 0869277911..78dc324b50 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -17,9 +17,6 @@ class ICPUComputePipeline final : public ICPUPipeline>; public: - explicit ICPUComputePipeline(const ICPUPipelineLayout* layout): - base_t(core::smart_refctd_ptr(layout)) - {} static core::smart_refctd_ptr create(const ICPUPipelineLayout* layout) { @@ -67,6 +64,10 @@ class ICPUComputePipeline final : public ICPUPipeline(layout)) + {} + }; } diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 915a4a43c2..2492329a63 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -19,9 +19,6 @@ class ICPUGraphicsPipeline final : public ICPUPipeline; public: - explicit ICPUGraphicsPipeline(const ICPUPipelineLayout* layout) - : base_t(layout, {}, {}) - {} static core::smart_refctd_ptr create(const ICPUPipelineLayout* layout) { @@ -112,6 +109,11 
@@ class ICPUGraphicsPipeline final : public ICPUPipeline m_specInfos; + + private: + explicit ICPUGraphicsPipeline(const ICPUPipelineLayout* layout) + : base_t(layout, {}, {}) + {} }; } From 9bb9d1411780cfa708609a031f3ecbc13cb276d8 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 18:00:14 +0700 Subject: [PATCH 037/346] Add layout validation to compute pipeline validation --- include/nbl/asset/ICPUComputePipeline.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 78dc324b50..9db06dbde1 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -44,8 +44,11 @@ class ICPUComputePipeline final : public ICPUPipelinevalid()) return false; return m_specInfo.valid(); } From bcb096f97dfad9c69b21c50988ab5da1f4c50456 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 18:03:28 +0700 Subject: [PATCH 038/346] Refactor getSpecInfo --- include/nbl/asset/ICPUComputePipeline.h | 2 +- include/nbl/asset/ICPUGraphicsPipeline.h | 7 +++---- include/nbl/asset/ICPUPipeline.h | 16 +++++----------- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 9db06dbde1..5f933878b4 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -37,7 +37,7 @@ class ICPUComputePipeline final : public ICPUPipeline getSpecInfo(hlsl::ShaderStage stage) override final + inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) const override final { if (stage==hlsl::ShaderStage::ESS_COMPUTE && isMutable()) return {&m_specInfo,1}; diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 2492329a63..fb82bd5608 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -62,16 +62,15 @@ class ICPUGraphicsPipeline final : public 
ICPUPipeline getSpecInfo(hlsl::ShaderStage stage) override final + inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) const override final { const auto stageIndex = stageToIndex(stage); - if (isMutable() && stageIndex != -1) - { + if (stageIndex != -1) return { &m_specInfos[stageIndex], 1 }; - } return {}; } + inline virtual bool valid() const override final { if (!m_layout) return false; diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 3b48ea43f7..fa77c40b7e 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -88,11 +88,7 @@ class ICPUPipelineBase }; - virtual std::span getSpecInfo(const hlsl::ShaderStage stage) = 0; - inline std::span getSpecInfo(const hlsl::ShaderStage stage) const - { - return getSpecInfo(stage); - } + virtual std::span getSpecInfo(const hlsl::ShaderStage stage) const = 0; virtual bool valid() const = 0; }; @@ -131,13 +127,11 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe } SShaderSpecInfo cloneSpecInfo(const SShaderSpecInfo& specInfo, uint32_t depth) + inline std::span getSpecInfo(hlsl::ShaderStage stage) { - auto newSpecInfo = specInfo; - if (depth>0u) - { - newSpecInfo.shader = core::smart_refctd_ptr_static_cast(specInfo.shader->clone(depth - 1u)); - } - return newSpecInfo; + if (!isMutable()) return {}; + const auto specInfo = static_cast(this)->getSpecInfo(stage); + return { const_cast(specInfo.data()), specInfo.size() }; } protected: From 278eb715bcd2e25168f36f30626213a8561ef4f7 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 18:04:31 +0700 Subject: [PATCH 039/346] Move stageToIndex and indexToStage --- include/nbl/asset/ICPUGraphicsPipeline.h | 15 +++++++++++++++ include/nbl/asset/IGraphicsPipeline.h | 14 -------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index fb82bd5608..926ee0ca6c 100644 --- 
a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -113,6 +113,21 @@ class ICPUGraphicsPipeline final : public ICPUPipeline= GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) + return -1; + return stageIx; + } + + static inline hlsl::ShaderStage indexToStage(const int8_t index) + { + if (index < 0 || index > GRAPHICS_SHADER_STAGE_COUNT) + return hlsl::ShaderStage::ESS_UNKNOWN; + return static_cast(hlsl::ShaderStage::ESS_VERTEX + index); + } }; } diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index f47cee0fa2..859c80b0b7 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -91,20 +91,6 @@ class IGraphicsPipeline : public IPipeline, public IGraphics inline const SCachedCreationParams& getCachedCreationParams() const {return m_params;} inline const renderpass_t* getRenderpass() const {return m_renderpass.get();} - static inline int8_t stageToIndex(const hlsl::ShaderStage stage) - { - const auto stageIx = hlsl::findLSB(stage); - if (stageIx < 0 || stageIx >= GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) - return -1; - return stageIx; - } - - static inline hlsl::ShaderStage indexToStage(const int8_t index) - { - if (index < 0 || index > GRAPHICS_SHADER_STAGE_COUNT) - return hlsl::ShaderStage::ESS_UNKNOWN; - return static_cast(hlsl::ShaderStage::ESS_VERTEX + index); - } static inline bool isValidStagePresence(const core::bitflag& stagePresence, E_PRIMITIVE_TOPOLOGY primitiveType) { From 68bbcff2d77b6635ed8e1e2cbfc154cda8e12029 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 18:08:34 +0700 Subject: [PATCH 040/346] Add constraint to template parameter of ICPUPipeline and IGPUPipeline --- include/nbl/asset/ICPUPipeline.h | 1 + include/nbl/video/IGPUPipeline.h | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 
fa77c40b7e..8f41de59ec 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -95,6 +95,7 @@ class ICPUPipelineBase // Common Base class for pipelines template + requires (std::is_base_of_v, PipelineNonAssetBase> && !std::is_base_of_v) class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipelineBase { using this_t = ICPUPipeline; diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index 0761d5d020..4a96c9e01f 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -92,14 +92,15 @@ class IGPUPipelineBase { }; // Common Base class for pipelines -template -class IGPUPipeline : public IBackendObject, public PipelineNonAssetBase, public IGPUPipelineBase +template + requires (std::is_base_of_v, PipelineNonBackendObjectBase> && !std::is_base_of_v) +class IGPUPipeline : public IBackendObject, public PipelineNonBackendObjectBase, public IGPUPipelineBase { protected: template explicit IGPUPipeline(core::smart_refctd_ptr&& device, Args&&... args) : - PipelineNonAssetBase(std::forward(args...)), IBackendObject(std::move(device)) + PipelineNonBackendObjectBase(std::forward(args...)), IBackendObject(std::move(device)) {} virtual ~IGPUPipeline() = default; From 8ec04157beeed8ff19aee04e69ccf9ffeeaa6f17 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 18:50:06 +0700 Subject: [PATCH 041/346] Rework IGPUPipeline SSpecConstantValue --- include/nbl/video/IGPUPipeline.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index 4a96c9e01f..2a93895b9d 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -28,19 +28,14 @@ class IGPUPipelineBase { //!< The ID of the specialization constant in SPIR-V. If it isn't used in the shader, the map entry does not affect the behavior of the pipeline. 
using spec_constant_id_t = uint32_t; - struct SSpecConstantValue - { - std::span data; - inline operator bool() const { return data.size(); } - inline size_t size() const { return data.size(); } - }; + using SSpecConstantValue = std::span; inline SSpecConstantValue getSpecializationByteValue(const spec_constant_id_t _specConstID) const { if (!entries) return {}; const auto found = entries->find(_specConstID); - if (found != entries->end() && bool(found->second)) return found->second; + if (found != entries->end() && found->second.size()) return found->second; else return {}; } @@ -64,7 +59,7 @@ class IGPUPipelineBase { int64_t specData = 0; for (const auto& entry : *entries) { - if (!entry.second) + if (!entry.second.size()) return INVALID_SPEC_INFO; specData += entry.second.size(); } From fdb4d40a0e846740c3fab9cb2721e3c3aa19742d Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 7 May 2025 14:02:03 +0200 Subject: [PATCH 042/346] create the TLASes and BLASes TODOs: - abstract away the staging cache insert/overwrite - check dependants of TLAS after creation of BLAS - insert the ASes into staging cache - collect the BLASes to use during TLAS builds --- include/nbl/video/utilities/CAssetConverter.h | 26 ++-- src/nbl/video/utilities/CAssetConverter.cpp | 146 +++++++++++------- 2 files changed, 104 insertions(+), 68 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 02d43cff69..7492e5ed59 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -1000,7 +1000,12 @@ class CAssetConverter : public core::IReferenceCounted assert(m_minASBuildScratchSize[forHostOps]<=m_maxASBuildScratchSize[forHostOps]); return m_maxASBuildScratchSize[forHostOps]; } -// TODO: `getMinCompactedASAllocatorSpace` + // We do all compactions on the Device for simplicity + inline uint64_t getMinCompactedASAllocatorSpace() const + { + assert(m_compactedASMaxMemory == 0 || 
willDeviceASBuild() || willHostASBuild()); + return m_compactedASMaxMemory; + } // tells you if you need to provide a valid `SConvertParams::scratchForDeviceASBuild` inline bool willDeviceASBuild() const {return getMinASBuildScratchSize(false)>0;} // tells you if you need to provide a valid `SConvertParams::scratchForHostASBuild` @@ -1013,8 +1018,7 @@ class CAssetConverter : public core::IReferenceCounted // tells you if you need to provide a valid `SConvertParams::compactedASAllocator` inline bool willCompactAS() const { - assert(!m_willCompactSomeAS || willDeviceASBuild() || willHostASBuild()); - return m_willCompactSomeAS; + return getMinCompactedASAllocatorSpace()!=0; } // @@ -1106,29 +1110,23 @@ class CAssetConverter : public core::IReferenceCounted template struct SConvReqAccelerationStructure : SConversionRequestBase { - constexpr static inline uint64_t WontCompact = (0x1ull<<48)-1; - inline bool compact() const {return compactedASWriteOffset!=WontCompact;} - using build_f = typename asset_traits::video_t::BUILD_FLAGS; inline void setBuildFlags(const build_f _flags) {buildFlags = static_cast(_flags);} inline build_f getBuildFlags() const {return static_cast(buildFlags);} - - uint64_t scratchSize; - uint64_t compactedASWriteOffset : 48 = WontCompact; - uint64_t buildFlags : 16 = static_cast(build_f::NONE); + uint64_t scratchSize : 45; + uint64_t compact : 1; + uint64_t buildFlags : 16 = 0; }; using SConvReqBLAS = SConvReqAccelerationStructure; core::vector m_blasConversions[2]; using SConvReqTLAS = SConvReqAccelerationStructure; core::vector m_tlasConversions[2]; - // 0 for device builds, 1 for host builds + // array index 0 for device builds, 1 for host builds uint64_t m_minASBuildScratchSize[2] = {0,0}; uint64_t m_maxASBuildScratchSize[2] = {0,0}; -// TODO: make the compaction count the size - // We do all compactions on the Device for simplicity - uint8_t m_willCompactSomeAS : 1 = false; + uint64_t m_compactedASMaxMemory = 0; // This tracks non-root BLASes 
which are needed for a subsequent TLAS build. Note that even things which are NOT in the staging cache are tracked here to make sure they don't finish their lifetimes early. struct BLASUsedInTLASBuild { diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 1f9ca46462..0d98609c2c 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2530,13 +2530,15 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // BLAS and TLAS creation is somewhat delayed by buffer creation and allocation struct DeferredASCreationParams { + const IAccelerationStructure* canonical; asset_cached_t storage; - size_t scratchSize : 62 = 0; + size_t scratchSize : 45 = 0; size_t motionBlur : 1 = false; + size_t buildFlags : 16 = 0; + size_t hostBuild : 1 = false; size_t compactAfterBuild : 1 = false; #ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION size_t inputSize = 0; - uint32_t maxInstanceCount = 0; #endif }; core::vector accelerationStructureParams[2]; @@ -2721,25 +2723,25 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { const auto* as = entry.second.canonicalAsset; const auto& patch = dfsCache.nodes[entry.second.patchIndex.value].patch; - const bool motionBlur = as->usesMotion(); + const bool motionBlur = patch.isMotion; + const auto buildFlags = patch.getBuildFlags(as); + const auto outIx = i+entry.second.firstCopyIx; + const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; ILogicalDevice::AccelerationStructureBuildSizes sizes = {}; -#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION - // we will need to temporarily store the build input buffers somewhere - size_t inputSize = 0; +// size_t inputSize = 0; { - const auto buildFlags = patch.getBuildFlags(as); if constexpr (IsTLAS) { AssetVisitor> visitor = { {visitBase}, - {asset,uniqueCopyGroupID}, + {as,uniqueCopyGroupID}, patch }; if (!visitor()) continue; const auto instanceCount = 
as->getInstances().size(); sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,instanceCount); - inputSize = (motionBlur ? sizeof(IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance):sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance))*instanceCount; +// inputSize = (motionBlur ? sizeof(IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance):sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance))*instanceCount; } else { @@ -2762,14 +2764,14 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }; sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); // TODO: check if the strides need to be aligned to 4 bytes for AABBs - for (const auto& geom : geoms) - if (const auto aabbCount=*(pMaxPrimitiveCounts++); aabbCount) - inputSize = core::roundUp(inputSize,sizeof(float))+aabbCount*geom.stride; +// for (const auto& geom : geoms) +// if (const auto aabbCount=*(pMaxPrimitiveCounts++); aabbCount) +// inputSize = core::roundUp(inputSize,sizeof(float))+aabbCount*geom.stride; } } else { - core::map allocationsPerStride; +// core::map allocationsPerStride; const auto geoms = as->getTriangleGeometries(); if (patch.hostBuild) { @@ -2784,6 +2786,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult reinterpret_cast*>(geoms.data()),geoms.size() }; sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); +#if 0 // TODO: check if the strides need to be aligned to 4 bytes for AABBs for (const auto& geom : geoms) if (const auto triCount=*(pMaxPrimitiveCounts++); triCount) @@ -2804,17 +2807,19 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult bytesPerVertex += bytesPerVertex; allocationsPerStride[geom.vertexStride] += geom.maxVertex; } +#endif } - for (const auto& entry : allocationsPerStride) - inputSize = 
core::roundUp(inputSize,entry.first)+entry.first*entry.second; +// for (const auto& entry : allocationsPerStride) +// inputSize = core::roundUp(inputSize,entry.first)+entry.first*entry.second; } } } if (!sizes) continue; -#endif + // we need to save the buffer in a side-channel for later auto& out = accelerationStructureParams[IsTLAS][entry.second.firstCopyIx+i]; + out.canonical = as; // this is where it gets a bit weird, we need to create a buffer to back the acceleration structure { IGPUBuffer::SCreationParams params = {}; @@ -2822,8 +2827,6 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult params.size = core::roundUp(sizes.accelerationStructureSize,MinASBufferAlignment); params.usage = IGPUBuffer::E_USAGE_FLAGS::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT|IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; // concurrent ownership if any - const auto outIx = i + entry.second.firstCopyIx; - const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,as,patch); params.queueFamilyIndexCount = queueFamilies.size(); params.queueFamilyIndices = queueFamilies.data(); @@ -2831,6 +2834,8 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } out.scratchSize = sizes.buildScratchSize; out.motionBlur = motionBlur; + out.buildFlags = static_cast(buildFlags.value); + out.hostBuild = patch.hostBuild; out.compactAfterBuild = patch.compactAfterBuild; if (out.storage && !deferredAllocator.request(&out.storage,patch.hostBuild ? 
hostBuildMemoryTypes:deviceBuildMemoryTypes)) out.storage.value = nullptr; @@ -3276,6 +3281,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } // Propagate the results back, since the dfsCache has the original asset pointers as keys, we map in reverse + // TODO: this probably could go at the end of the object creation routines // This gets deferred till AFTER the Buffer Memory Allocations and Binding for Acceleration Structures if constexpr (!std::is_same_v && !std::is_same_v) dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void @@ -3304,6 +3310,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // unhashables were not supposed to be added to conversion requests assert(contentHash!=CHashCache::NoContentHash); +// abstract away start const auto copyIx = found->second.firstCopyIx++; // the counting sort was stable assert(uniqueCopyGroupID==gpuObjUniqueCopyGroupIDs[copyIx]); @@ -3333,6 +3340,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult stagingCache.emplace(gpuObj.get(),typename CCache::key_t(contentHash,uniqueCopyGroupID)); // propagate back to dfsCache created.gpuObj = std::move(gpuObj); +// abstract away end // record if a device memory allocation will be needed if constexpr (std::is_base_of_v::video_t>) { @@ -3351,7 +3359,6 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult const uint16_t recomputeMips = created.patch.recomputeMips; retval.m_imageConversions.emplace_back(SReserveResult::SConversionRequestBase{core::smart_refctd_ptr(instance.asset),created.gpuObj.get()},recomputeMips); } -// TODO: BLAS and TLAS requests } ); }; @@ -3392,51 +3399,72 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } // Deal with Deferred Creation of Acceleration structures { - for (auto asLevel=0; asLevel<2; asLevel++) + const auto minScratchAlignment = 
device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; + auto createAccelerationStructures = [&]()->void { - // each of these stages must have a barrier inbetween - size_t scratchSizeFullParallelBuild = 0; - size_t scratchSizeFullParallelCompact = 0; + constexpr bool IsTLAS = std::is_same_v; + // TLAS and BLAS can't build concurrently, index 0 is device build, 1 is host build + size_t scratchSizeFullParallelBuild[2] = {0,0}; + // + core::vector>* pConversions; + if constexpr (IsTLAS) + pConversions = retval.m_tlasConversions; + else + pConversions = retval.m_blasConversions; // we collect that stats AFTER making sure that the BLAS / TLAS can actually be created - for (const auto& deferredParams : accelerationStructureParams[asLevel]) + for (const auto& deferredParams : accelerationStructureParams[IsTLAS]) { // buffer failed to create/allocate if (!deferredParams.storage) continue; -#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION + const auto bufSz = deferredParams.storage.get()->getSize(); IGPUAccelerationStructure::SCreationParams baseParams; { - auto* buf = deferredParams.storage.get(); - const auto bufSz = buf->getSize(); using create_f = IGPUAccelerationStructure::SCreationParams::FLAGS; baseParams = { - .bufferRange = {.offset=0,.size=bufSz,.buffer=smart_refctd_ptr(buf)}, + .bufferRange = {.offset=0,.size=bufSz,.buffer=deferredParams.storage.value}, .flags = deferredParams.motionBlur ? create_f::MOTION_BIT:create_f::NONE }; } - smart_refctd_ptr as; - if (asLevel) + // + auto& request = pConversions[deferredParams.hostBuild].emplace_back(); + request.canonical = smart_refctd_ptr(static_cast(deferredParams.canonical)); + smart_refctd_ptr::video_t> as; + if constexpr (IsTLAS) { - as = device->createBottomLevelAccelerationStructure({baseParams,deferredParams.maxInstanceCount}); + // is there any reason for it to be more? 
+ const uint32_t maxInstances = request.canonical->getInstances().size(); + as = device->createTopLevelAccelerationStructure({std::move(baseParams),maxInstances}); } else + as = device->createBottomLevelAccelerationStructure(std::move(baseParams)); + request.gpuObj = as.get(); + if (!request.gpuObj) { - as = device->createTopLevelAccelerationStructure({baseParams,deferredParams.maxInstanceCount}); + inputs.logger.log("Failed to Create Acceleration Structure.",system::ILogger::ELL_ERROR); + continue; } + request.scratchSize = deferredParams.scratchSize; + request.compact = deferredParams.compactAfterBuild; + request.buildFlags = deferredParams.buildFlags; + // best case + size_t buildSize = 0; +// TODO: compute inputs with alignment + buildSize = core::alignUp(buildSize,minScratchAlignment)+deferredParams.scratchSize; + // sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently + retval.m_minASBuildScratchSize[deferredParams.hostBuild] = core::max(retval.m_minASBuildScratchSize[deferredParams.hostBuild],buildSize); + scratchSizeFullParallelBuild[deferredParams.hostBuild] += buildSize; // note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build -// TODO: compute with alignment - const auto buildSize = deferredParams.inputSize+deferredParams.scratchSize; - // sizes for building 1-by-1 vs parallel, note that - retval.m_minASBuildScratchSize = core::max(buildSize,retval.m_minASBuildScratchSize); - scratchSizeFullParallelBuild += buildSize; - // triangles, AABBs or Instance Transforms will need to be supplied from VRAM -#endif + if (deferredParams.compactAfterBuild) + retval.m_compactedASMaxMemory += bufSz; } - // -// retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild,retval.m_maxASBuildScratchSize); - } + retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild[0],retval.m_maxASBuildScratchSize[0]); + retval.m_maxASBuildScratchSize[1] = 
core::max(scratchSizeFullParallelBuild[1],retval.m_maxASBuildScratchSize[1]); + }; + createAccelerationStructures.template operator()(); + createAccelerationStructures.template operator()(); // - if (retval.willDeviceASBuild()) + if (retval.willDeviceASBuild() || retval.willCompactAS()) retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; } @@ -3555,13 +3583,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return retval; } using buffer_usage_f = IGPUBuffer::E_USAGE_FLAGS; - constexpr buffer_usage_f asBuildInputFlags = buffer_usage_f::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT|buffer_usage_f::EUF_TRANSFER_DST_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT; - // we may use the staging buffer directly to skip an extra copy on small enough geometries - if (!params.utilities->getDefaultUpStreamingBuffer()->getBuffer()->getCreationParams().usage.hasFlags(asBuildInputFlags)) - { - logger.log("An Acceleration Structure will be built on Device but Default UpStreaming Buffer from IUtilities doesn't have required usage flags!",system::ILogger::ELL_ERROR); - return retval; - } + constexpr buffer_usage_f asBuildInputFlags = buffer_usage_f::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT; constexpr buffer_usage_f asBuildScratchFlags = buffer_usage_f::EUF_STORAGE_BUFFER_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT; auto* scratchBuffer = params.scratchForDeviceASBuild->getBuffer(); const auto& scratchParams = scratchBuffer->getCachedCreationParams(); @@ -3583,6 +3605,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul logger.log("Acceleration Structure Scratch Device Memory Allocator not large enough!",system::ILogger::ELL_ERROR); return retval; } + // this alignment is probably bigger than required by any Build Input const auto minScratchAlignment = device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; if (addrAlloc.max_alignment() 
CAssetConverter::convert_impl(SReserveResul logger.log("Acceleration Structure Scratch Device Memory Allocator not mapped and not concurrently share-able by Transfer Family %d!",system::ILogger::ELL_ERROR,transferFamily); return retval; } + if (!scratchBuffer->getCreationParams().usage.hasFlags(buffer_usage_f::EUF_TRANSFER_DST_BIT)) + { + logger.log("Acceleration Structure Scratch Device Memory Allocator not mapped and doesn't the transfer destination usage flag!",system::ILogger::ELL_ERROR); + return retval; + } + // Right now we copy from staging to scratch, but in the future we may use the staging buffer directly to skip an extra copy on small enough geometries + if (!params.utilities->getDefaultUpStreamingBuffer()->getBuffer()->getCreationParams().usage.hasFlags(asBuildInputFlags|buffer_usage_f::EUF_TRANSFER_SRC_BIT)) + { + logger.log("An Acceleration Structure will be built on Device but Default UpStreaming Buffer from IUtilities doesn't have required usage flags!", system::ILogger::ELL_ERROR); + return retval; + } reqQueueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; } } @@ -3617,10 +3651,14 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return retval; } // and compacting - if (reservations.willCompactAS() && !params.compactedASAllocator) + if (reservations.willCompactAS()) { - logger.log("An Acceleration Structure will be compacted but no Device Memory Allocator provided!", system::ILogger::ELL_ERROR); - return retval; + if (!params.compactedASAllocator) + { + logger.log("An Acceleration Structure will be compacted but no Device Memory Allocator provided!", system::ILogger::ELL_ERROR); + return retval; + } + // note that can't check the compacted AS allocator being large enough against `reservations.m_compactedASMaxMemory` } // @@ -4741,7 +4779,7 @@ if (worstSize>minScratchSize) // no special extra byte offset into the instance buffer rangeInfos.emplace_back(instanceCount,0u); // - const bool willCompact = tlasToBuild.compact(); + const 
bool willCompact = tlasToBuild.compact; if (willCompact) compactions.push_back(as); // enqueue ownership release if necessary From 57136e8cb8bd148a18286859ac8d79def85b9039 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 19:16:58 +0700 Subject: [PATCH 043/346] Rework SShaderSpecInfo for ICPUPIpeline --- include/nbl/asset/ICPUGraphicsPipeline.h | 2 +- include/nbl/asset/ICPUPipeline.h | 11 +++-------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 926ee0ca6c..62b25443cc 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -85,7 +85,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline data; - inline operator bool() const { return data.size(); } - inline size_t size() const { return data.size(); } - }; + using SSpecConstantValue = core::vector; inline SSpecConstantValue* getSpecializationByteValue(const spec_constant_id_t _specConstID) { const auto found = entries.find(_specConstID); - if (found != entries.end() && bool(found->second)) return &found->second; + if (found != entries.end() && found->second.size()) return &found->second; else return nullptr; } @@ -65,7 +60,7 @@ class ICPUPipelineBase int64_t specData = 0; for (const auto& entry : entries) { - if (!entry.second) return INVALID_SPEC_INFO; + if (!entry.second.size()) return INVALID_SPEC_INFO; specData += entry.second.size(); } if (specData > 0x7fffffff) return INVALID_SPEC_INFO; From 7983e62a27f29c906d64a2152d4033dd9a28a185 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 19:17:15 +0700 Subject: [PATCH 044/346] Move cloneSpecInfo into SShaderSpecInfo --- include/nbl/asset/ICPUPipeline.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index ddfb4628c8..69d709d1d0 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ 
b/include/nbl/asset/ICPUPipeline.h @@ -81,6 +81,15 @@ class ICPUPipelineBase // Also because our API is sane, it satisfies the following by construction: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 + SShaderSpecInfo clone(uint32_t depth) const + { + auto newSpecInfo = *this; + if (depth > 0u) + { + newSpecInfo.shader = core::smart_refctd_ptr_static_cast(this->shader->clone(depth - 1u)); + } + return newSpecInfo; + } }; virtual std::span getSpecInfo(const hlsl::ShaderStage stage) const = 0; @@ -122,7 +131,6 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe return clone_impl(std::move(layout), _depth); } - SShaderSpecInfo cloneSpecInfo(const SShaderSpecInfo& specInfo, uint32_t depth) inline std::span getSpecInfo(hlsl::ShaderStage stage) { if (!isMutable()) return {}; From 071f1ebbb0da7090d097233248b15e25092580b6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 19:17:45 +0700 Subject: [PATCH 045/346] Remove valid virtual function from ICPUPipeline to IAsset --- include/nbl/asset/IAsset.h | 2 ++ include/nbl/asset/ICPUPipeline.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index fdb41ed298..3b8b123ce3 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -169,6 +169,8 @@ class IAsset : virtual public core::IReferenceCounted return retval; } + virtual bool valid() const = 0; + protected: inline IAsset() = default; //! 
Pure virtual destructor to ensure no instantiation diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 69d709d1d0..8b90458f21 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -94,7 +94,6 @@ class ICPUPipelineBase virtual std::span getSpecInfo(const hlsl::ShaderStage stage) const = 0; - virtual bool valid() const = 0; }; // Common Base class for pipelines From b8f8ba04db3e44aa30e1c265bd108367a2707b19 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 19:18:33 +0700 Subject: [PATCH 046/346] Remove getShaders from SShaderSpecInfo --- include/nbl/video/IGPUComputePipeline.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 6e825d749b..42503e1f12 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -63,8 +63,6 @@ class IGPUComputePipeline : public IGPUPipeline(dataSize)}; } - inline std::span getShaders() const {return {&shader,1}; } - IGPUPipelineLayout* layout = nullptr; // TODO: Could guess the required flags from SPIR-V introspection of declared caps core::bitflag flags = FLAGS::NONE; From f661366d6a35cc83fe578436ce0b168f99d381de Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 19:21:20 +0700 Subject: [PATCH 047/346] Rename isValidStagePresence to hasRequiredStages --- include/nbl/asset/IGraphicsPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index 859c80b0b7..ef49e4c03a 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -92,7 +92,7 @@ class IGraphicsPipeline : public IPipeline, public IGraphics inline const renderpass_t* getRenderpass() const {return m_renderpass.get();} - static inline bool isValidStagePresence(const core::bitflag& stagePresence, E_PRIMITIVE_TOPOLOGY primitiveType) + 
static inline bool hasRequiredStages(const core::bitflag& stagePresence, E_PRIMITIVE_TOPOLOGY primitiveType) { // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-stage-02096 if (!stagePresence.hasFlags(hlsl::ShaderStage::ESS_VERTEX)) From 8c10cbdaba40e8fa0eb569ef8e00f39f181d065d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 19:25:01 +0700 Subject: [PATCH 048/346] Rework IGPUGraphicsPipeline to have individual shaderSpecInfo per stages --- include/nbl/video/IGPUGraphicsPipeline.h | 31 +++++++++++++++--------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index fc596a54e1..f5d6e40275 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -46,16 +46,21 @@ class IGPUGraphicsPipeline : public IGPUPipelinegetCreationParameters().subpasses[i] core::bitflag stagePresence = {}; - for (auto shader_i = 0u; shader_i < shaders.size(); shader_i++) - { - const auto& info = shaders[shader_i]; - if (!extra(info)) - return false; - if (info.shader) - stagePresence |= indexToStage(shader_i); - } - return isValidStagePresence(stagePresence, cached.primitiveAssembly.primitiveType); + auto processSpecInfo = [&](const SShaderSpecInfo& specInfo, hlsl::ShaderStage stage) + { + if (!extra(specInfo)) return false; + if (!specInfo.shader) return false; + stagePresence != stage; + return true; + }; + if (!processSpecInfo(vertexShader)) return false; + if (!processSpecInfo(tesselationControlShader)) return false; + if (!processSpecInfo(tesselationEvaluationShader)) return false; + if (!processSpecInfo(geometryShader)) return false; + if (!processSpecInfo(fragmentShader)) return false; + + return hasRequiredStages(stagePresence, cached.primitiveAssembly.primitiveType); } @@ -83,10 +88,12 @@ class IGPUGraphicsPipeline : public IGPUPipeline 
getShaders() const {return shaders;} - IGPUPipelineLayout* layout = nullptr; - std::span shaders = {}; + SShaderSpecInfo vertexShader; + SShaderSpecInfo tesselationControlShader; + SShaderSpecInfo tesselationEvaluationShader; + SShaderSpecInfo geometryShader; + SShaderSpecInfo fragmentShader; SCachedCreationParams cached = {}; renderpass_t* renderpass = nullptr; From 71056f2274b118a0ed6c22aff5b1f5ff5d96e133 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 7 May 2025 19:25:24 +0700 Subject: [PATCH 049/346] Add IGPUPipelineLayout to IGPUPipeline --- include/nbl/video/IGPUPipeline.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index 2a93895b9d..826026d9aa 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -6,6 +6,7 @@ #ifndef _NBL_VIDEO_I_GPU_PIPELINE_H_INCLUDED_ #define _NBL_VIDEO_I_GPU_PIPELINE_H_INCLUDED_ +#include "nbl/video/IGPUPipelineLayout.h" #include "nbl/asset/IPipeline.h" namespace nbl::video From 802ff9aefc3dfca7028034232789f1b31e89aa0e Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 7 May 2025 14:38:48 +0200 Subject: [PATCH 050/346] get the thing to compile and estimate build input size --- include/nbl/video/IGPUAccelerationStructure.h | 3 +++ src/nbl/video/utilities/CAssetConverter.cpp | 21 ++++++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h index 5d8f0ca29b..c3a24080d0 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -638,6 +638,9 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // I don't do an actual union because the preceeding members don't play nicely with alignment of `core::matrix3x4SIMD` and Vulkan requires this struct to be packed SRTMotionInstance largestUnionMember = {}; 
static_assert(alignof(SRTMotionInstance)==8ull); + + public: + constexpr static inline size_t LargestUnionMemberSize = sizeof(largestUnionMember); }; using DevicePolymorphicInstance = PolymorphicInstance; using HostPolymorphicInstance = PolymorphicInstance; diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 0d98609c2c..0134991976 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -3447,10 +3447,21 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult request.scratchSize = deferredParams.scratchSize; request.compact = deferredParams.compactAfterBuild; request.buildFlags = deferredParams.buildFlags; - // best case - size_t buildSize = 0; -// TODO: compute inputs with alignment - buildSize = core::alignUp(buildSize,minScratchAlignment)+deferredParams.scratchSize; + // prevent CPU hangs by making sure allocator big enough to service us in worst case but with best case allocator (no other allocations, clean alloc) + // TODO: take into account the minimal allocation size from the allocator (ask for it) + size_t buildSize = deferredParams.scratchSize; + if constexpr (IsTLAS) + { + const uint32_t instanceCount = request.canonical->getInstances().size(); + // Worst case approximation, not all instances will be that size (note that host and device instance data are same size) + const size_t approxInstanceSize = deferredParams.motionBlur ? 
IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance::LargestUnionMemberSize:sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance); + buildSize = core::alignUp(buildSize,approxInstanceSize)+instanceCount*approxInstanceSize; + buildSize = core::alignUp(buildSize,alignof(uint64_t))+instanceCount*sizeof(uint64_t); + } + else + { +// TODO: compute BLAS input size with alignments + } // sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently retval.m_minASBuildScratchSize[deferredParams.hostBuild] = core::max(retval.m_minASBuildScratchSize[deferredParams.hostBuild],buildSize); scratchSizeFullParallelBuild[deferredParams.hostBuild] += buildSize; @@ -4652,7 +4663,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul addr_t offsets[MaxAllocCount] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value}; const addr_t sizes[MaxAllocCount] = {tlasToBuild.scratchSize,instanceDataSize,sizeof(void*)*instanceCount}; { - const addr_t alignments[MaxAllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,8}; + const addr_t alignments[MaxAllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,alignof(uint64_t)}; /* TODO: move to reserve phase - prevent CPU hangs by making sure allocator big enough to service us { addr_t worstSize = sizes[0]; From 8f1911221f474f159c844070c2b0b61ef9333bb6 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 7 May 2025 16:45:59 +0200 Subject: [PATCH 051/346] check the GPU BLASes needed for TLAS build exist before creating TLAS Finish getting build sizes for BLAS and TLAS Also ask for minimum allocation size constraint of the allocator --- include/nbl/video/utilities/CAssetConverter.h | 3 + src/nbl/video/utilities/CAssetConverter.cpp | 147 +++++++++--------- 2 files changed, 80 insertions(+), 70 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h 
b/include/nbl/video/utilities/CAssetConverter.h index 7492e5ed59..3e134b913d 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -900,6 +900,9 @@ class CAssetConverter : public core::IReferenceCounted IGPUPipelineCache* pipelineCache = nullptr; // optional, defaults to the device IDeviceMemoryAllocator* allocator = nullptr; + // optional, defaults to worst case (Apple Silicon page size) + uint32_t scratchForDeviceASBuildMinAllocSize = 1<<14; + uint32_t scratchForHostASBuildMinAllocSize = 1<<14; }; // Split off from inputs because only assets that build on IPreHashed need uploading struct SConvertParams diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 0134991976..6ab4b319d2 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1655,6 +1655,8 @@ template<> class GetDependantVisit : public GetDependantVisitBase { public: + // all instances need to be aligned to 16 bytes so alignment irrelevant (everything can be tightly packed) and implicit + uint64_t buildInputSize = 0; // because of zero access to the lifetime tracking between TLASes and BLASes, do nothing //core::smart_refctd_ptr* const outBLASes; @@ -1668,6 +1670,9 @@ class GetDependantVisit : public GetDependant auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; + const auto instances = user.asset->getInstances(); + assert(instanceIndex SReserveResult struct DeferredASCreationParams { const IAccelerationStructure* canonical; - asset_cached_t storage; - size_t scratchSize : 45 = 0; - size_t motionBlur : 1 = false; - size_t buildFlags : 16 = 0; - size_t hostBuild : 1 = false; - size_t compactAfterBuild : 1 = false; -#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION - size_t inputSize = 0; -#endif + asset_cached_t storage = {}; + uint64_t scratchSize : 45 = 0; + uint64_t motionBlur : 1 = false; + uint64_t buildFlags : 16 = 0; 
+ uint64_t hostBuild : 1 = false; + uint64_t compactAfterBuild : 1 = false; + uint64_t buildSize = 0; }; core::vector accelerationStructureParams[2]; // Deduplication, Creation and Propagation @@ -2547,6 +2550,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { auto& dfsCache = std::get>(dfsCaches); // This map contains the assets by-hash, identical asset+patch hash the same. + // It only has entries for GPU objects that need to be created conversions_t conversionRequests; // We now go through the dfsCache and work out each entry's content hashes, so that we can carry out unique conversions. @@ -2727,8 +2731,16 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult const auto buildFlags = patch.getBuildFlags(as); const auto outIx = i+entry.second.firstCopyIx; const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; + // prevent CPU hangs by making sure allocator big enough to service us in worst case but with best case allocator (no other allocations, clean alloc) + const auto minScratchAllocSize = patch.hostBuild ? 
inputs.scratchForHostASBuildMinAllocSize:inputs.scratchForDeviceASBuildMinAllocSize; + uint64_t buildSize = 0; uint32_t buildAlignment = 4; + auto incrementBuildSize = [minScratchAllocSize,&buildSize,&buildAlignment](const uint64_t size, const uint32_t alignment)->void + { + buildSize = core::alignUp(buildSize,alignment)+hlsl::max(size,minScratchAllocSize); + buildAlignment = hlsl::max(buildAlignment,alignment); + }; ILogicalDevice::AccelerationStructureBuildSizes sizes = {}; -// size_t inputSize = 0; + const auto hashAsU64 = reinterpret_cast(entry.first.data); { if constexpr (IsTLAS) { @@ -2738,10 +2750,17 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult patch }; if (!visitor()) + { + inputs.logger.log( + "Failed to find all GPU Bottom Level Acceleration Structures needed to build TLAS %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); continue; + } const auto instanceCount = as->getInstances().size(); sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,instanceCount); -// inputSize = (motionBlur ? 
sizeof(IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance):sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance))*instanceCount; + incrementBuildSize(visitor.buildInputSize,16); + incrementBuildSize(sizeof(uint64_t)*instanceCount,alignof(uint64_t)); } else { @@ -2763,15 +2782,15 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult reinterpret_cast*>(geoms.data()),geoms.size() }; sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); - // TODO: check if the strides need to be aligned to 4 bytes for AABBs -// for (const auto& geom : geoms) -// if (const auto aabbCount=*(pMaxPrimitiveCounts++); aabbCount) -// inputSize = core::roundUp(inputSize,sizeof(float))+aabbCount*geom.stride; } + // TODO: check if the strides need to be aligned to 4 bytes for AABBs + for (const auto& geom : geoms) + if (const auto aabbCount=*(pMaxPrimitiveCounts++); aabbCount) + incrementBuildSize(aabbCount*geom.stride,alignof(float)); } else { -// core::map allocationsPerStride; + core::map allocationsPerStride; const auto geoms = as->getTriangleGeometries(); if (patch.hostBuild) { @@ -2786,36 +2805,38 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult reinterpret_cast*>(geoms.data()),geoms.size() }; sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); -#if 0 - // TODO: check if the strides need to be aligned to 4 bytes for AABBs - for (const auto& geom : geoms) - if (const auto triCount=*(pMaxPrimitiveCounts++); triCount) + } + for (const auto& geom : geoms) + if (const auto triCount=*(pMaxPrimitiveCounts++); triCount) + { + switch (geom.indexType) { - switch (geom.indexType) - { - case E_INDEX_TYPE::EIT_16BIT: - allocationsPerStride[sizeof(uint16_t)] += triCount*3; - break; - case E_INDEX_TYPE::EIT_32BIT: - allocationsPerStride[sizeof(uint32_t)] += triCount*3; - break; - default: - break; - } - size_t bytesPerVertex = 
geom.vertexStride; - if (geom.vertexData[1]) - bytesPerVertex += bytesPerVertex; - allocationsPerStride[geom.vertexStride] += geom.maxVertex; + case E_INDEX_TYPE::EIT_16BIT: + allocationsPerStride[sizeof(uint16_t)] += triCount*3; + break; + case E_INDEX_TYPE::EIT_32BIT: + allocationsPerStride[sizeof(uint32_t)] += triCount*3; + break; + default: + break; } -#endif + allocationsPerStride[geom.vertexStride] += (geom.vertexData[1] ? 2:1)*geom.maxVertex; } -// for (const auto& entry : allocationsPerStride) -// inputSize = core::roundUp(inputSize,entry.first)+entry.first*entry.second; + for (const auto& entry : allocationsPerStride) + incrementBuildSize(entry.first*entry.second,entry.first); } } } - if (!sizes) + if (!buildSize) + { + inputs.logger.log( + "Build Size Input is 0 for Acceleration Structure %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); continue; + } + // scratch gets allocated first + buildSize = core::alignUp(hlsl::max(sizes.buildScratchSize,minScratchAllocSize),buildAlignment)+buildSize; // we need to save the buffer in a side-channel for later auto& out = accelerationStructureParams[IsTLAS][entry.second.firstCopyIx+i]; @@ -2831,14 +2852,19 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult params.queueFamilyIndexCount = queueFamilies.size(); params.queueFamilyIndices = queueFamilies.data(); out.storage.value = device->createBuffer(std::move(params)); + if (out.storage) + if (!deferredAllocator.request(&out.storage,patch.hostBuild ? hostBuildMemoryTypes:deviceBuildMemoryTypes)) + { + out.storage.value = nullptr; + continue; + } } out.scratchSize = sizes.buildScratchSize; out.motionBlur = motionBlur; out.buildFlags = static_cast(buildFlags.value); out.hostBuild = patch.hostBuild; out.compactAfterBuild = patch.compactAfterBuild; - if (out.storage && !deferredAllocator.request(&out.storage,patch.hostBuild ? 
hostBuildMemoryTypes:deviceBuildMemoryTypes)) - out.storage.value = nullptr; + out.buildSize = buildSize; } } if constexpr (std::is_same_v) @@ -3447,24 +3473,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult request.scratchSize = deferredParams.scratchSize; request.compact = deferredParams.compactAfterBuild; request.buildFlags = deferredParams.buildFlags; - // prevent CPU hangs by making sure allocator big enough to service us in worst case but with best case allocator (no other allocations, clean alloc) - // TODO: take into account the minimal allocation size from the allocator (ask for it) - size_t buildSize = deferredParams.scratchSize; - if constexpr (IsTLAS) - { - const uint32_t instanceCount = request.canonical->getInstances().size(); - // Worst case approximation, not all instances will be that size (note that host and device instance data are same size) - const size_t approxInstanceSize = deferredParams.motionBlur ? IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance::LargestUnionMemberSize:sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance); - buildSize = core::alignUp(buildSize,approxInstanceSize)+instanceCount*approxInstanceSize; - buildSize = core::alignUp(buildSize,alignof(uint64_t))+instanceCount*sizeof(uint64_t); - } - else - { -// TODO: compute BLAS input size with alignments - } // sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently - retval.m_minASBuildScratchSize[deferredParams.hostBuild] = core::max(retval.m_minASBuildScratchSize[deferredParams.hostBuild],buildSize); - scratchSizeFullParallelBuild[deferredParams.hostBuild] += buildSize; + retval.m_minASBuildScratchSize[deferredParams.hostBuild] = core::max(retval.m_minASBuildScratchSize[deferredParams.hostBuild],deferredParams.buildSize); + scratchSizeFullParallelBuild[deferredParams.hostBuild] += deferredParams.buildSize; // note that in order to compact an AS you need to allocate a buffer range whose size is 
known only after the build if (deferredParams.compactAfterBuild) retval.m_compactedASMaxMemory += bufSz; @@ -3623,6 +3634,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul logger.log("Accceleration Structure Scratch Device Memory Allocator cannot allocate with Physical Device's minimum required AS-build scratch alignment %u",system::ILogger::ELL_ERROR,minScratchAlignment); return retval; } + // TODO: check scratchForDeviceASBuildMinAllocSize // returns non-null pointer if the buffer is writeable directly byt the host deviceASBuildScratchPtr = reinterpret_cast(scratchBuffer->getBoundMemory().memory->getMappedPointer()); // Need to use Transfer Queue and copy via staging buffer @@ -3656,10 +3668,14 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } } // the elusive and exotic host builds - if (reservations.willHostASBuild() && !params.scratchForHostASBuild) + if (reservations.willHostASBuild()) { - logger.log("An Acceleration Structure will be built on the Host but no Scratch Memory Allocator provided!", system::ILogger::ELL_ERROR); - return retval; + if (!params.scratchForHostASBuild) + { + logger.log("An Acceleration Structure will be built on the Host but no Scratch Memory Allocator provided!", system::ILogger::ELL_ERROR); + return retval; + } + // TODO: check everything else when we actually support host builds } // and compacting if (reservations.willCompactAS()) @@ -4664,14 +4680,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul const addr_t sizes[MaxAllocCount] = {tlasToBuild.scratchSize,instanceDataSize,sizeof(void*)*instanceCount}; { const addr_t alignments[MaxAllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,alignof(uint64_t)}; -/* TODO: move to reserve phase - prevent CPU hangs by making sure allocator big enough to service us -{ -addr_t worstSize = sizes[0]; -for (auto i=1u; iminScratchSize) - minScratchSize = worstSize; -}*/ const auto AllocCount = as->usesMotion() ? 
2:3; // if fail then flush and keep trying till space is made for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(AllocCount,&offsets[0],&sizes[0],&alignments[0])!=0u; t++) @@ -4692,7 +4700,6 @@ if (worstSize>minScratchSize) // stream the instance/geometry input in { bool success = true; -// TODO: make sure the overflow submit work callback is doing some CPU work { struct FillInstances : IUtilities::IUpstreamingDataProducer { From 7b643b68bc27022ddc974ab2b83e95aa0879cc10 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 7 May 2025 18:06:09 +0200 Subject: [PATCH 052/346] Make ReBAR buffer copies happen during `convert` and not `reserve` because that's supposed to be the "expensive" call Also prevent attempting to map the same memory multiple times (relevant in APIs that only allow a single mapping and we suballocate from same `IDeviceMemoryAllocation`) So now for a ReBAR upload to succeed, the memory allocation given out needs to be on the correct heap AND start off mapped (Asset Converter won't attempt to map by itself). 
--- src/nbl/video/utilities/CAssetConverter.cpp | 106 ++++++++++++-------- 1 file changed, 66 insertions(+), 40 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 6ab4b319d2..19b8f18a66 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2293,6 +2293,15 @@ class MetaDeviceMemoryAllocator final core::map> allocationRequests; }; +// for dem ReBAR goodies +bool canHostWriteToMemoryRange(const IDeviceMemoryBacked::SMemoryBinding& binding, const size_t length) +{ + assert(binding.isValid()); + const auto* memory = binding.memory; + const auto& mappedRange = memory->getMappedRange(); + return memory->isCurrentlyMapped() && memory->getCurrentMappingAccess().hasFlags(IDeviceMemoryAllocation::EMCAF_WRITE) && mappedRange.offset<=binding.offset && binding.offset+length<=mappedRange.offset+mappedRange.length; +} + // auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { @@ -2660,12 +2669,11 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }(); core::vector> gpuObjects(gpuObjUniqueCopyGroupIDs.size()); - // Only warn once to reduce log spam auto assign = [&](const core::blake3_hash_t& contentHash, const size_t baseIx, const size_t copyIx, asset_cached_t::type&& gpuObj)->bool { const auto hashAsU64 = reinterpret_cast(contentHash.data); if constexpr (GPUObjectWhollyImmutable) // including any deps! 
- if (copyIx==1) + if (copyIx==1) // Only warn once to reduce log spam inputs.logger.log( "Why are you creating multiple Objects for asset content %8llx%8llx%8llx%8llx, when they are a readonly GPU Object Type with no dependants!?", system::ILogger::ELL_PERFORMANCE,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] @@ -3398,31 +3406,19 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // now allocate the memory for buffers and images deferredAllocator.finalize(); - // can remove buffers from conversion requests which can be written to directly - { - core::vector flushRanges; - flushRanges.reserve(retval.m_bufferConversions.size()); - std::erase_if(retval.m_bufferConversions,[&flushRanges](const SReserveResult::SConvReqBuffer& conv)->bool - { - const auto boundMemory = conv.gpuObj->getBoundMemory(); - auto* const memory = boundMemory.memory; - if (!boundMemory.memory->isMappable()) - return false; - const size_t size = conv.gpuObj->getSize(); - const IDeviceMemoryAllocation::MemoryRange range = {boundMemory.offset,size}; - // slightly inefficient but oh well - void* dst = memory->map(range,IDeviceMemoryAllocation::EMCAF_WRITE); - memcpy(dst,conv.canonical->getPointer(),size); - if (boundMemory.memory->haveToMakeVisible()) - flushRanges.emplace_back(memory,range.offset,range.length,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); + // find out which buffers need to be uploaded via a staging buffer + std::erase_if(retval.m_bufferConversions,[&](const SReserveResult::SConvReqBuffer& conv)->bool + { + if (!conv.gpuObj) return true; - } - ); - if (!flushRanges.empty()) - device->flushMappedMemoryRanges(flushRanges); - if (!retval.m_bufferConversions.empty()) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; - } + const auto boundMemory = conv.gpuObj->getBoundMemory(); + if (!boundMemory.isValid()) + return true; + if (!canHostWriteToMemoryRange(boundMemory,conv.gpuObj->getSize())) + retval.m_queueFlags |= 
IQueue::FAMILY_FLAGS::TRANSFER_BIT; + return false; + } + ); // Deal with Deferred Creation of Acceleration structures { const auto minScratchAlignment = device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; @@ -3489,6 +3485,8 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if (retval.willDeviceASBuild() || retval.willCompactAS()) retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; } + std::erase_if(retval.m_imageConversions,[&](const SReserveResult::SConvReqImage& conv)->bool {return !conv.gpuObj || !conv.gpuObj->getBoundMemory().isValid();}); + dedupCreateProp.template operator()(); dedupCreateProp.template operator()(); @@ -3559,6 +3557,32 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul assert(reservations.m_converter.get()==this); auto device = m_params.device; + auto hostBufferXferIt = reservations.m_bufferConversions.begin(); + core::vector memoryHostFlushRanges; + memoryHostFlushRanges.reserve(reservations.m_bufferConversions.size()); + auto hostUploadBuffers = [&](auto&& pred)->void + { + for (; hostBufferXferIt!=reservations.m_bufferConversions.end() && pred(); hostBufferXferIt++) + { + const size_t size = hostBufferXferIt->gpuObj->getSize(); + const auto boundMemory = hostBufferXferIt->gpuObj->getBoundMemory(); + if (!canHostWriteToMemoryRange(boundMemory,size)) + continue; + auto* const memory = boundMemory.memory; + const IDeviceMemoryAllocation::MemoryRange range = {boundMemory.offset,size}; + memcpy(reinterpret_cast(memory->getMappedPointer())+range.offset,hostBufferXferIt->canonical->getPointer(),size); + // let go of canonical asset (may free RAM) + hostBufferXferIt->canonical = nullptr; + if (memory->haveToMakeVisible()) + memoryHostFlushRanges.emplace_back(memory,range.offset,range.length,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); + } + if (!memoryHostFlushRanges.empty()) + { + device->flushMappedMemoryRanges(memoryHostFlushRanges); + 
memoryHostFlushRanges.clear(); + } + }; + // compacted TLASes need to be substituted in cache and Descriptor Sets core::unordered_map> compactedTLASMap; // Anything to do? @@ -3825,11 +3849,10 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul for (auto& item : buffersToUpload) { auto* buffer = item.gpuObj; - const SBufferRange range = { - .offset = 0, - .size = item.gpuObj->getCreationParams().size, - .buffer = core::smart_refctd_ptr(buffer) - }; + const size_t size = item.gpuObj->getCreationParams().size; + // host will upload + if (canHostWriteToMemoryRange(buffer->getBoundMemory(),size)) + continue; auto pFoundHash = findInStaging.template operator()(buffer); // const auto ownerQueueFamily = checkOwnership(buffer,params.getFinalOwnerQueueFamily(buffer,*pFoundHash),transferFamily); @@ -3839,6 +3862,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul continue; } // do the upload + const SBufferRange range = {.offset=0,.size=size,.buffer=core::smart_refctd_ptr(buffer)}; const bool success = params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,item.canonical->getPointer()); // current recording buffer may have changed xferCmdBuf = params.transfer->getCommandBufferForRecording(); @@ -3870,7 +3894,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul xferCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Upload Buffers END"); xferCmdBuf->cmdbuf->endDebugMarker(); } - buffersToUpload.clear(); // release ownership if (!finalReleases.empty()) pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers=finalReleases},"Ownership Releases of Buffers Failed"); @@ -3908,15 +3931,16 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return IQueue::RESULT::OTHER_ERROR; return res; }; - // compose our overflow callback on top of what's already there, only if we need to ofc + + // We want to be doing Host operations while stalled for GPU, compose our overflow callback on top of what's already 
there, only if we need to of course auto origXferStallCallback = params.transfer->overflowCallback; - if (shouldDoSomeCompute) - params.transfer->overflowCallback = [&origXferStallCallback,&drainCompute](const ISemaphore::SWaitInfo& tillScratchResettable)->void - { - drainCompute(); - if (origXferStallCallback) - origXferStallCallback(tillScratchResettable); - }; + params.transfer->overflowCallback = [device,&hostUploadBuffers,&origXferStallCallback,&drainCompute](const ISemaphore::SWaitInfo& tillScratchResettable)->void + { + drainCompute(); + if (origXferStallCallback) + origXferStallCallback(tillScratchResettable); + hostUploadBuffers([device,&tillScratchResettable]()->bool{return device->waitForSemaphores({&tillScratchResettable,1},false,0)==ISemaphore::WAIT_RESULT::TIMEOUT;}); + }; // when overflowing compute resources, we need to submit the Xfer before submitting Compute auto drainBoth = [&params,&xferCmdBuf,&drainCompute](const std::span extraSignal={})->auto { @@ -4987,6 +5011,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } } + // finish host tasks if not done yet + hostUploadBuffers([]()->bool{return true;}); // Descriptor Sets need their TLAS descriptors substituted if they've been compacted // want to check if deps successfully exist From a90ef105242c9bc47074ad189a531d1bbc03fb2d Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 8 May 2025 11:41:36 +0200 Subject: [PATCH 053/346] Make the memory requests happen immediately during `assign` also set Vulkan debug names on the created GPU objects right away. Push conversion requests to right after successful `assign` because that makes more sense. Prep for host_image_copy support. Also make should-be-private structs private.
--- include/nbl/video/utilities/CAssetConverter.h | 28 +-- src/nbl/video/utilities/CAssetConverter.cpp | 195 +++++++++--------- 2 files changed, 114 insertions(+), 109 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 3e134b913d..02cc9ab447 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -1064,21 +1064,10 @@ class CAssetConverter : public core::IReferenceCounted return enqueueSuccess; } - // public only because `GetDependantVisit` needs it - struct SDeferredTLASWrite - { - inline bool operator==(const SDeferredTLASWrite& other) const - { - return dstSet == other.dstSet && binding == other.binding && arrayElement == other.arrayElement; - } - - IGPUDescriptorSet* dstSet; - uint32_t binding; - uint32_t arrayElement; - core::smart_refctd_ptr tlas; - }; private: friend class CAssetConverter; + // internal classes + template friend class GetDependantVisit; inline SReserveResult() = default; @@ -1141,6 +1130,19 @@ class CAssetConverter : public core::IReferenceCounted }; using cpu_to_gpu_blas_map_t = core::unordered_map; cpu_to_gpu_blas_map_t m_blasBuildMap; + // + struct SDeferredTLASWrite + { + inline bool operator==(const SDeferredTLASWrite& other) const + { + return dstSet == other.dstSet && binding == other.binding && arrayElement == other.arrayElement; + } + + IGPUDescriptorSet* dstSet; + uint32_t binding; + uint32_t arrayElement; + core::smart_refctd_ptr tlas; + }; struct SDeferredTLASWriteHasher { inline size_t operator()(const SDeferredTLASWrite& write) const diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 19b8f18a66..d25dcae4f1 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1657,8 +1657,8 @@ class GetDependantVisit : public GetDependant public: // all instances need to be aligned to 16 bytes so 
alignment irrelevant (everything can be tightly packed) and implicit uint64_t buildInputSize = 0; - // because of zero access to the lifetime tracking between TLASes and BLASes, do nothing - //core::smart_refctd_ptr* const outBLASes; + // + CAssetConverter::SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap; protected: bool descend_impl( @@ -1673,7 +1673,12 @@ class GetDependantVisit : public GetDependant const auto instances = user.asset->getInstances(); assert(instanceIndexfind(dep.asset); + if (foundBLAS!=blasBuildMap->end()) + foundBLAS->second.remainingUsages++; + else + blasBuildMap->insert(foundBLAS,{dep.asset,{depObj}}); return true; } }; @@ -2669,7 +2674,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }(); core::vector> gpuObjects(gpuObjUniqueCopyGroupIDs.size()); - auto assign = [&](const core::blake3_hash_t& contentHash, const size_t baseIx, const size_t copyIx, asset_cached_t::type&& gpuObj)->bool + auto assign = [&]( + const core::blake3_hash_t& contentHash, const size_t baseIx, const size_t copyIx, asset_cached_t::type&& gpuObj, const AssetType* asset=nullptr + )->asset_traits::video_t* { const auto hashAsU64 = reinterpret_cast(contentHash.data); if constexpr (GPUObjectWhollyImmutable) // including any deps! 
@@ -2685,16 +2692,37 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult "Failed to create GPU Object for asset content %8llx%8llx%8llx%8llx", system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] ); - return false; + return nullptr; } - gpuObjects[copyIx+baseIx].value = std::move(gpuObj); - return true; + auto output = gpuObjects.data()+copyIx+baseIx; + const uint64_t uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[copyIx+baseIx]; + if constexpr (std::is_same_v || std::is_same_v) + { + const auto constrainMask = inputs.constrainMemoryTypeBits(uniqueCopyGroupID,asset,contentHash,gpuObj.get()); + if (!deferredAllocator.request(output,constrainMask)) + return nullptr; + } + // set debug names on everything! + { + std::ostringstream debugName; + debugName << "Created by Converter "; + debugName << std::hex; + debugName << this; + debugName << " from Asset with hash "; + for (const auto& byte : contentHash.data) + debugName << uint32_t(byte) << " "; + debugName << "for Group " << uniqueCopyGroupID; + gpuObj.get()->setObjectDebugName(debugName.str().c_str()); + } + output->value = std::move(gpuObj); + return output->value.get(); }; GetDependantVisitBase visitBase = { .inputs = inputs, .dfsCaches = dfsCaches }; + // Dispatch to correct creation of GPU objects if constexpr (std::is_same_v) { @@ -2707,19 +2735,21 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto& entry : conversionRequests) for (auto i=0ull; igetSize(); + params.size = asset->getSize(); params.usage = patch.usage; // concurrent ownership if any const auto outIx = i+entry.second.firstCopyIx; const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; - const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,entry.second.canonicalAsset,patch); + const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,asset,patch); params.queueFamilyIndexCount = queueFamilies.size(); 
params.queueFamilyIndices = queueFamilies.data(); - // if creation successful, we will upload - assign(entry.first,entry.second.firstCopyIx,i,device->createBuffer(std::move(params))); + // if creation successful, we will request some memory allocation to bind to, and if thats okay we preliminarily request a conversion + if (IGPUBuffer* const gpuObj=assign(entry.first,entry.second.firstCopyIx,i,device->createBuffer(std::move(params)),asset); gpuObj) + retval.m_bufferConversions.push_back({core::smart_refctd_ptr(asset),gpuObj}); } } if constexpr (std::is_same_v || std::is_same_v) @@ -2950,19 +2980,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // gpu image specifics params.tiling = static_cast(patch.linearTiling); params.preinitialized = false; - // if creation successful, we check what queues we need if uploading - if (assign(entry.first,entry.second.firstCopyIx,i,device->createImage(std::move(params))) && !asset->getRegions().empty()) - { - // for now until host_image_copy - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; - // Best effort guess, without actually looking at all regions - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 - if (isDepthOrStencilFormat(patch.format) && (patch.usageFlags|patch.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT)) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT; - // only if we upload some data can we recompute the mips - if (patch.recomputeMips) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; - } + // if creation successful, we will request some memory allocation to bind to, and if thats okay we preliminarily request a conversion (if we have content to upload) + if (IGPUImage* const gpuObj=assign(entry.first,entry.second.firstCopyIx,i,device->createImage(std::move(params)),asset); gpuObj && !asset->getRegions().empty()) + 
retval.m_imageConversions.push_back({{core::smart_refctd_ptr(asset),gpuObj},bool(patch.recomputeMips)}); } } if constexpr (std::is_same_v) @@ -3314,87 +3334,54 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } } - // Propagate the results back, since the dfsCache has the original asset pointers as keys, we map in reverse - // TODO: this probably could go at the end of the object creation routines + // Propagate the results back, since the dfsCache has the original asset pointers as keys, we map in reverse (multiple `instance_t` can map to the same content hash and GPU object) // This gets deferred till AFTER the Buffer Memory Allocations and Binding for Acceleration Structures if constexpr (!std::is_same_v && !std::is_same_v) dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void - { - auto& stagingCache = std::get>(retval.m_stagingCaches); - // already found in read cache and not converted - if (created.gpuObj) - return; + { + auto& stagingCache = std::get>(retval.m_stagingCaches); + // already found in read cache and not converted + if (created.gpuObj) + return; - const auto& contentHash = created.contentHash; - auto found = conversionRequests.find(contentHash); + const auto& contentHash = created.contentHash; + auto found = conversionRequests.find(contentHash); - const auto uniqueCopyGroupID = instance.uniqueCopyGroupID; + const auto uniqueCopyGroupID = instance.uniqueCopyGroupID; - const auto hashAsU64 = reinterpret_cast(contentHash.data); - // can happen if deps were unconverted dummies - if (found==conversionRequests.end()) - { - if (contentHash!=CHashCache::NoContentHash) - inputs.logger.log( - "Could not find GPU Object for Asset %p in group %ull with Content Hash %8llx%8llx%8llx%8llx", - system::ILogger::ELL_ERROR,instance.asset,uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - return; - } - // unhashables were not supposed to be added to conversion requests - 
assert(contentHash!=CHashCache::NoContentHash); + const auto hashAsU64 = reinterpret_cast(contentHash.data); + // can happen if deps were unconverted dummies + if (found==conversionRequests.end()) + { + if (contentHash!=CHashCache::NoContentHash) + inputs.logger.log( + "Could not find GPU Object for Asset %p in group %ull with Content Hash %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR,instance.asset,uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + return; + } + // unhashables were not supposed to be added to conversion requests + assert(contentHash!=CHashCache::NoContentHash); -// abstract away start - const auto copyIx = found->second.firstCopyIx++; - // the counting sort was stable - assert(uniqueCopyGroupID==gpuObjUniqueCopyGroupIDs[copyIx]); + const auto copyIx = found->second.firstCopyIx++; + // the counting sort was stable + assert(uniqueCopyGroupID==gpuObjUniqueCopyGroupIDs[copyIx]); - auto& gpuObj = gpuObjects[copyIx]; - if (!gpuObj) - { - inputs.logger.log( - "Conversion for Content Hash %8llx%8llx%8llx%8llx Copy Index %d from Canonical Asset %p Failed.", - system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3],copyIx,found->second.canonicalAsset - ); - return; - } - // set debug names on everything! 
- { - std::ostringstream debugName; - debugName << "Created by Converter "; - debugName << std::hex; - debugName << this; - debugName << " from Asset with hash "; - for (const auto& byte : contentHash.data) - debugName << uint32_t(byte) << " "; - debugName << "for Group " << uniqueCopyGroupID; - gpuObj.get()->setObjectDebugName(debugName.str().c_str()); - } - // insert into staging cache - stagingCache.emplace(gpuObj.get(),typename CCache::key_t(contentHash,uniqueCopyGroupID)); - // propagate back to dfsCache - created.gpuObj = std::move(gpuObj); -// abstract away end - // record if a device memory allocation will be needed - if constexpr (std::is_base_of_v::video_t>) - { - const auto constrainMask = inputs.constrainMemoryTypeBits(uniqueCopyGroupID,instance.asset,contentHash,created.gpuObj.get()); - if (!deferredAllocator.request(&created.gpuObj,constrainMask)) + auto& gpuObj = gpuObjects[copyIx]; + if (!gpuObj) { - created.gpuObj.value = nullptr; + inputs.logger.log( + "Creation of GPU Object (or its dependents) for Content Hash %8llx%8llx%8llx%8llx Copy Index %d from Canonical Asset %p Failed.", + system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3],copyIx,found->second.canonicalAsset + ); return; } + // insert into staging cache + stagingCache.emplace(gpuObj.get(),typename CCache::key_t(contentHash,uniqueCopyGroupID)); + // propagate back to dfsCache + created.gpuObj = std::move(gpuObj); } - // - if constexpr (std::is_same_v) - retval.m_bufferConversions.emplace_back(SReserveResult::SConvReqBuffer{core::smart_refctd_ptr(instance.asset),created.gpuObj.get()}); - if constexpr (std::is_same_v) - { - const uint16_t recomputeMips = created.patch.recomputeMips; - retval.m_imageConversions.emplace_back(SReserveResult::SConversionRequestBase{core::smart_refctd_ptr(instance.asset),created.gpuObj.get()},recomputeMips); - } - } - ); + ); }; // The order of these calls is super important to go BOTTOM UP in terms of hashing and conversion 
dependants. // Both so we can hash in O(Depth) and not O(Depth^2) but also so we have all the possible dependants ready. @@ -3409,8 +3396,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // find out which buffers need to be uploaded via a staging buffer std::erase_if(retval.m_bufferConversions,[&](const SReserveResult::SConvReqBuffer& conv)->bool { - if (!conv.gpuObj) - return true; + assert(conv.gpuObj); const auto boundMemory = conv.gpuObj->getBoundMemory(); if (!boundMemory.isValid()) return true; @@ -3485,7 +3471,24 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if (retval.willDeviceASBuild() || retval.willCompactAS()) retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; } - std::erase_if(retval.m_imageConversions,[&](const SReserveResult::SConvReqImage& conv)->bool {return !conv.gpuObj || !conv.gpuObj->getBoundMemory().isValid();}); + // find out which images need what caps for the transfer and mipmapping + std::erase_if(retval.m_imageConversions,[&](const SReserveResult::SConvReqImage& conv)->bool + { + assert(conv.gpuObj); + const auto boundMemory = conv.gpuObj->getBoundMemory(); + if (!boundMemory.isValid()) + return true; + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; + if (conv.recomputeMips) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + // Best effort guess, without actually looking at all regions + const auto& params = conv.gpuObj->getCreationParameters(); + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 + if (isDepthOrStencilFormat(params.format) && (params.depthUsage|params.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT)) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT; + return false; + } + ); dedupCreateProp.template operator()(); From 11255d4f7d99279851f41ae5f025912f437b73f6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 8 May 
2025 20:24:27 +0700 Subject: [PATCH 054/346] Implement ICPURayTracingPipeline --- include/nbl/asset/ICPURayTracingPipeline.h | 122 +++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 include/nbl/asset/ICPURayTracingPipeline.h diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h new file mode 100644 index 0000000000..23a1d82225 --- /dev/null +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -0,0 +1,122 @@ + +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_ASSET_I_CPU_RAY_TRACING_PIPELINE_H_INCLUDED_ +#define _NBL_ASSET_I_CPU_RAY_TRACING_PIPELINE_H_INCLUDED_ + +#include "nbl/asset/IRayTracingPipeline.h" +#include "nbl/asset/ICPUPipeline.h" + + +namespace nbl::asset +{ + +//! CPU Version of RayTracing Pipeline +class ICPURayTracingPipeline final : public ICPUPipeline> +{ + using pipeline_base_t = IRayTracingPipeline; + using base_t = ICPUPipeline; + + public: + struct SHitGroupSpecInfo { + SShaderSpecInfo closestHit; + SShaderSpecInfo anyHit; + SShaderSpecInfo intersection; + + SHitGroupSpecInfo clone(uint32_t depth) const + { + auto newSpecInfo = *this; + if (depth > 0u) + { + newSpecInfo.closestHit.shader = core::smart_refctd_ptr_static_cast(this->closestHit.shader->clone(depth - 1u)); + newSpecInfo.anyHit.shader = core::smart_refctd_ptr_static_cast(this->anyHit.shader->clone(depth - 1u)); + newSpecInfo.intersection.shader = core::smart_refctd_ptr_static_cast(this->intersection.shader->clone(depth - 1u)); + } + return newSpecInfo; + } + }; + + static core::smart_refctd_ptr create(const ICPUPipelineLayout* layout) + { + auto retval = new ICPURayTracingPipeline(layout); + return core::smart_refctd_ptr(retval,core::dont_grab); + } + + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + 
{ + auto newPipeline = new ICPURayTracingPipeline(layout.get()); + newPipeline->m_raygen = m_raygen.clone(depth); + + newPipeline->m_misses.resize(m_misses.size()); + for (auto specInfo_i = 0u; specInfo_i < m_misses.size(); specInfo_i++) + { + newPipeline->m_misses[specInfo_i] = m_misses[specInfo_i].clone(depth); + } + + newPipeline->m_hitGroups.resize(m_hitGroups.size()); + for (auto specInfo_i = 0u; specInfo_i < m_misses.size(); specInfo_i++) + { + newPipeline->m_hitGroups[specInfo_i] = m_hitGroups[specInfo_i].clone(depth); + } + + newPipeline->m_callables.resize(m_callables.size()); + for (auto specInfo_i = 0u; specInfo_i < m_callables.size(); specInfo_i++) + { + newPipeline->m_callables[specInfo_i] = m_callables[specInfo_i].clone(depth); + } + + newPipeline->m_params = m_params; + return core::smart_refctd_ptr(newPipeline); + } + + constexpr static inline auto AssetType = ET_RAYTRACING_PIPELINE; + inline E_TYPE getAssetType() const override { return AssetType; } + + //! + inline size_t getDependantCount() const override { + //TODO(kevinyu): Implement or refactor the api design to something else + return 0; + } + + inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) const override final + { + switch (stage) + { + case hlsl::ShaderStage::ESS_RAYGEN: + return { &m_raygen, 1 }; + } + return {}; + } + + inline virtual bool valid() const override final + { + // TODO(kevinyu): Fix this temporary dummy code + return true; + } + + protected: + virtual ~ICPURayTracingPipeline() = default; + + inline IAsset* getDependant_impl(const size_t ix) override + { + //TODO(kevinyu): remove this function, since this is expensive + return nullptr; + } + + + private: + + SShaderSpecInfo m_raygen; + core::vector m_misses; + core::vector m_hitGroups; + core::vector m_callables; + + explicit ICPURayTracingPipeline(const ICPUPipelineLayout* layout) + : base_t(layout, {}) + {} + +}; + +} +#endif From 343f3954db9ec73607d5fbe5dd1e4b3641a88f20 Mon Sep 17 00:00:00 2001 From: devsh 
Date: Thu, 8 May 2025 17:39:54 +0200 Subject: [PATCH 055/346] prepare the refactor to be able to propagate the deferredly created Acceleration Structures to staging cache --- src/nbl/video/utilities/CAssetConverter.cpp | 179 +++++++++++--------- 1 file changed, 96 insertions(+), 83 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index d25dcae4f1..b87cbbfdde 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1993,20 +1993,6 @@ class GetDependantVisit : public GetDependantVisitBase -struct unique_conversion_t -{ - const AssetType* canonicalAsset = nullptr; - patch_index_t patchIndex = {}; - size_t firstCopyIx : 40 = 0u; - size_t copyCount : 24 = 1u; -}; - -// Map from ContentHash to canonical asset & patch and the list of uniqueCopyGroupIDs -template -using conversions_t = core::unordered_map>; - // Needed both for reservation and conversion class MetaDeviceMemoryAllocator final { @@ -2307,6 +2293,24 @@ bool canHostWriteToMemoryRange(const IDeviceMemoryBacked::SMemoryBinding& bindin return memory->isCurrentlyMapped() && memory->getCurrentMappingAccess().hasFlags(IDeviceMemoryAllocation::EMCAF_WRITE) && mappedRange.offset<=binding.offset && binding.offset+length<=mappedRange.offset+mappedRange.length; } +// +template +struct unique_conversion_t +{ + const AssetType* canonicalAsset = nullptr; + patch_index_t patchIndex = {}; + size_t firstCopyIx : 40 = 0u; + size_t copyCount : 24 = 1u; +}; + +// Map from ContentHash to canonical asset & patch and the list of uniqueCopyGroupIDs +template +struct conversions_t +{ + core::unordered_map> contentHashToCanonical; + core::vector> gpuObjects; +}; + // auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { @@ -2544,6 +2548,53 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // can now spawn our own hash cache retval.m_hashCache =
core::make_smart_refctd_ptr(); + // Since the dfsCache has the original asset pointers as keys, we map in reverse (multiple `instance_t` can map to the same unique content hash and GPU object) + auto propagateToStagingCache = [&inputs,&dfsCaches,&retval](conversions_t& conversionRequests)->void + { + std::get>(dfsCaches).for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void + { + auto& stagingCache = std::get>(retval.m_stagingCaches); + // already found in read cache and not converted + if (created.gpuObj) + return; + + const auto uniqueCopyGroupID = instance.uniqueCopyGroupID; + const auto& contentHash = created.contentHash; + const auto hashAsU64 = reinterpret_cast(contentHash.data); + + auto& map = conversionRequests.contentHashToCanonical; + auto found = map.find(contentHash); + // can happen if deps were unconverted dummies + if (found==map.end()) + { + if (contentHash!=CHashCache::NoContentHash) + inputs.logger.log( + "Could not find GPU Object for Asset %p in group %ull with Content Hash %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR, instance.asset, uniqueCopyGroupID, hashAsU64[0], hashAsU64[1], hashAsU64[2], hashAsU64[3] + ); + return; + } + // unhashables were not supposed to be added to conversion requests + assert(contentHash!=CHashCache::NoContentHash); + + const auto copyIx = found->second.firstCopyIx++; + auto& gpuObj = conversionRequests.gpuObjects[copyIx]; + if (!gpuObj) + { + inputs.logger.log( + "Creation of GPU Object (or its dependents) for Content Hash %8llx%8llx%8llx%8llx Copy Index %d from Canonical Asset %p Failed.", + system::ILogger::ELL_ERROR, hashAsU64[0], hashAsU64[1], hashAsU64[2], hashAsU64[3], copyIx, found->second.canonicalAsset + ); + return; + } + // insert into staging cache + stagingCache.emplace(gpuObj.get(),typename CCache::key_t(contentHash,uniqueCopyGroupID)); + // propagate back to dfsCache + created.gpuObj = std::move(gpuObj); + } + ); + }; + MetaDeviceMemoryAllocator 
deferredAllocator(inputs.allocator ? inputs.allocator:device,inputs.logger); // BLAS and TLAS creation is somewhat delayed by buffer creation and allocation @@ -2560,7 +2611,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }; core::vector accelerationStructureParams[2]; // Deduplication, Creation and Propagation - auto dedupCreateProp = [&]()->void + auto dedupCreateProp = [&]()->conversions_t { auto& dfsCache = std::get>(dfsCaches); // This map contains the assets by-hash, identical asset+patch hash the same. @@ -2623,7 +2674,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } // then de-duplicate the conversions needed const patch_index_t patchIx = {static_cast(std::distance(dfsCache.nodes.data(),&created))}; - auto [inSetIt,inserted] = conversionRequests.emplace(contentHash,unique_conversion_t{.canonicalAsset=instance.asset,.patchIndex=patchIx}); + auto [inSetIt,inserted] = conversionRequests.contentHashToCanonical.emplace(contentHash,unique_conversion_t{.canonicalAsset=instance.asset,.patchIndex=patchIx}); if (!inserted) { // If an element prevented insertion, the patch must be identical! 
@@ -2642,7 +2693,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult auto exclScanConvReqs = [&]()->size_t { size_t sum = 0; - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { entry.second.firstCopyIx = sum; sum += entry.second.copyCount; @@ -2655,9 +2706,10 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { if (created.gpuObj) return; - auto found = conversionRequests.find(created.contentHash); + auto& map = conversionRequests.contentHashToCanonical; + auto found = map.find(created.contentHash); // may not find things because of unconverted dummy deps - if (found!=conversionRequests.end()) + if (found!=map.end()) retval[found->second.firstCopyIx++] = instance.uniqueCopyGroupID; else { @@ -2673,7 +2725,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult return retval; }(); - core::vector> gpuObjects(gpuObjUniqueCopyGroupIDs.size()); + // + conversionRequests.gpuObjects.resize(gpuObjUniqueCopyGroupIDs.size()); + // auto assign = [&]( const core::blake3_hash_t& contentHash, const size_t baseIx, const size_t copyIx, asset_cached_t::type&& gpuObj, const AssetType* asset=nullptr )->asset_traits::video_t* @@ -2694,7 +2748,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult ); return nullptr; } - auto output = gpuObjects.data()+copyIx+baseIx; + auto output = conversionRequests.gpuObjects.data()+copyIx+baseIx; const uint64_t uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[copyIx+baseIx]; if constexpr (std::is_same_v || std::is_same_v) { @@ -2726,13 +2780,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // Dispatch to correct creation of GPU objects if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) for (auto i=0ull; 
i(entry.first,entry.second.firstCopyIx,i,device->createSampler(entry.second.canonicalAsset->getParams())); } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) for (auto i=0ull; i SReserveResult const auto hostBuildMemoryTypes = device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(mem_prop_f::EMPF_DEVICE_LOCAL_BIT|mem_prop_f::EMPF_HOST_WRITABLE_BIT|mem_prop_f::EMPF_HOST_CACHED_BIT); constexpr bool IsTLAS = std::is_same_v; - accelerationStructureParams[IsTLAS].resize(gpuObjects.size()); - for (auto& entry : conversionRequests) + accelerationStructureParams[IsTLAS].resize(conversionRequests.gpuObjects.size()); + for (auto& entry : conversionRequests.contentHashToCanonical) for (auto i=0ull; i SReserveResult } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) for (auto i=0ull; i SReserveResult } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUBufferView* asset = entry.second.canonicalAsset; const auto& patch = dfsCache.nodes[entry.second.patchIndex.value].patch; @@ -3009,7 +3063,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUImageView* asset = entry.second.canonicalAsset; const auto& cpuParams = asset->getCreationParameters(); @@ -3057,7 +3111,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult .readCache = inputs.readShaderCache, .writeCache = inputs.writeShaderCache }; - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) for (auto i=0ull; i SReserveResult } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + 
for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUDescriptorSetLayout* asset = entry.second.canonicalAsset; // there is no patching possible for this asset @@ -3135,7 +3189,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { core::vector pcRanges; pcRanges.reserve(CSPIRVIntrospector::MaxPushConstantsSize); - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUPipelineLayout* asset = entry.second.canonicalAsset; const auto& patch = dfsCache.nodes[entry.second.patchIndex.value].patch; @@ -3185,7 +3239,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUPipelineCache* asset = entry.second.canonicalAsset; // there is no patching possible for this asset @@ -3199,7 +3253,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUComputePipeline* asset = entry.second.canonicalAsset; // there is no patching possible for this asset @@ -3230,7 +3284,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPURenderpass* asset = entry.second.canonicalAsset; // there is no patching possible for this asset @@ -3246,7 +3300,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { core::vector tmpSpecInfo; tmpSpecInfo.reserve(5); - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUGraphicsPipeline* asset = entry.second.canonicalAsset; // there is no patching possible for this asset @@ 
-3294,7 +3348,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // Descriptor Pools have large up-front slots reserved for all descriptor types, if we were to merge // multiple descriptor sets to be allocated from one pool, dropping any set wouldn't result in the // reclamation of the memory used, it would at most (with the FREE pool create flag) return to pool. - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUDescriptorSet* asset = entry.second.canonicalAsset; for (auto i=0ull; i SReserveResult } } - // Propagate the results back, since the dfsCache has the original asset pointers as keys, we map in reverse (multiple `instance_t` can map to the same content hash and GPU object) // This gets deferred till AFTER the Buffer Memory Allocations and Binding for Acceleration Structures - if constexpr (!std::is_same_v && !std::is_same_v) - dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void - { - auto& stagingCache = std::get>(retval.m_stagingCaches); - // already found in read cache and not converted - if (created.gpuObj) - return; - - const auto& contentHash = created.contentHash; - auto found = conversionRequests.find(contentHash); - - const auto uniqueCopyGroupID = instance.uniqueCopyGroupID; - - const auto hashAsU64 = reinterpret_cast(contentHash.data); - // can happen if deps were unconverted dummies - if (found==conversionRequests.end()) - { - if (contentHash!=CHashCache::NoContentHash) - inputs.logger.log( - "Could not find GPU Object for Asset %p in group %ull with Content Hash %8llx%8llx%8llx%8llx", - system::ILogger::ELL_ERROR,instance.asset,uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - return; - } - // unhashables were not supposed to be added to conversion requests - assert(contentHash!=CHashCache::NoContentHash); - - const auto copyIx = found->second.firstCopyIx++; - // the counting sort was stable - 
assert(uniqueCopyGroupID==gpuObjUniqueCopyGroupIDs[copyIx]); - - auto& gpuObj = gpuObjects[copyIx]; - if (!gpuObj) - { - inputs.logger.log( - "Creation of GPU Object (or its dependents) for Content Hash %8llx%8llx%8llx%8llx Copy Index %d from Canonical Asset %p Failed.", - system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3],copyIx,found->second.canonicalAsset - ); - return; - } - // insert into staging cache - stagingCache.emplace(gpuObj.get(),typename CCache::key_t(contentHash,uniqueCopyGroupID)); - // propagate back to dfsCache - created.gpuObj = std::move(gpuObj); - } - ); + if constexpr (!std::is_base_of_v) + { + propagateToStagingCache.template operator()(conversionRequests); + return {}; + } + return conversionRequests; }; // The order of these calls is super important to go BOTTOM UP in terms of hashing and conversion dependants. // Both so we can hash in O(Depth) and not O(Depth^2) but also so we have all the possible dependants ready. From bbce9f51a4a8fc5401a4ce0317eb8eba7b854460 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 9 May 2025 15:13:08 +0200 Subject: [PATCH 056/346] refactor the conversion request system --- src/nbl/video/utilities/CAssetConverter.cpp | 659 ++++++++++---------- 1 file changed, 335 insertions(+), 324 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index b87cbbfdde..0dc431f8ae 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2025,6 +2025,7 @@ class MetaDeviceMemoryAllocator final if ((memReqs.memoryTypeBits&memoryTypeConstraint)==0) { m_logger.log("Overconstrained the Memory Type Index bitmask %d with %d for %s",system::ILogger::ELL_ERROR,memReqs.memoryTypeBits,memoryTypeConstraint,gpuObj->getObjectDebugName()); + pGpuObj->value = nullptr; return false; } // @@ -2044,6 +2045,7 @@ class MetaDeviceMemoryAllocator final if (!allocation.isValid()) { m_logger.log("Failed to 
allocate and bind dedicated memory for %s",system::ILogger::ELL_ERROR,gpuObj->getObjectDebugName()); + pGpuObj->value = nullptr; return false; } } @@ -2307,8 +2309,210 @@ struct unique_conversion_t template struct conversions_t { - core::unordered_map> contentHashToCanonical; - core::vector> gpuObjects; + public: + // Go through the dfsCache and work out each entry's content hashes, so that we can carry out unique conversions. + void gather(core::tuple_transform_t& dfsCaches, CAssetConverter::CHashCache* hashCache, const CAssetConverter::CCache* readCache) + { + auto& dfsCache = std::get>(dfsCaches); + dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void + { + // compute the hash or look it up if it exists + // We mistrust every dependency such that the eject/update if needed. + // Its really important that the Deduplication gets performed Bottom-Up + auto& contentHash = created.contentHash; + PatchOverride patchOverride(*inputs,dfsCaches,instance.uniqueCopyGroupID); + contentHash = hashCache->hash( + {instance.asset,&created.patch}, + &patchOverride, + /*.mistrustLevel =*/ 1 + ); + // failed to hash all together (only possible reason is failure of `PatchGetter` to provide a valid patch) + if (contentHash==CAssetConverter::CHashCache::NoContentHash) + { + inputs->logger.log("Could not compute hash for asset %p in group %d, maybe an IPreHashed dependant's content hash is missing?",system::ILogger::ELL_ERROR,instance.asset,instance.uniqueCopyGroupID); + return; + } + const auto hashAsU64 = reinterpret_cast(contentHash.data); + { + inputs->logger.log("Asset (%p,%d) has hash %8llx%8llx%8llx%8llx",system::ILogger::ELL_DEBUG,instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3]); + } + // if we have a read cache, lets retry looking the item up! 
+ if (readCache) + { + // We can't look up "near misses" (supersets of patches) because they'd have different hashes + // and we can't afford to split hairs like finding overlapping buffer ranges, etc. + // Stuff like that would require a completely different hashing/lookup strategy (or multiple fake entries). + const auto found = readCache->find({contentHash,instance.uniqueCopyGroupID}); + if (found!=readCache->forwardMapEnd()) + { + created.gpuObj = found->second; + inputs->logger.log( + "Asset (%p,%d) with hash %8llx%8llx%8llx%8llx found its GPU Object in Read Cache",system::ILogger::ELL_DEBUG, + instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + return; + } + } + // The conversion request we insert needs an instance asset whose unconverted dependencies don't have missing content + // SUPER SIMPLIFICATION: because we hash and search for readCache items bottom up (BFS), we don't need a stack (DFS) here! + // Any dependant that's not getting a GPU object due to missing content or GPU cache object for its cache, will show up later during `getDependant` + // An additional optimization would be to improve the `PatchGetter` to check dependants (only deps) during hashing for missing dfs cache gpu Object (no read cache) and no conversion request. 
+ auto* isPrehashed = dynamic_cast(instance.asset); + if (isPrehashed && isPrehashed->missingContent()) + { + inputs->logger.log( + "PreHashed Asset (%p,%d) with hash %8llx%8llx%8llx%8llx has missing content and no GPU Object in Read Cache!",system::ILogger::ELL_ERROR, + instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + return; + } + // then de-duplicate the conversions needed + const patch_index_t patchIx = {static_cast(std::distance(dfsCache.nodes.data(),&created))}; + auto [inSetIt,inserted] = contentHashToCanonical.emplace(contentHash,unique_conversion_t{.canonicalAsset=instance.asset,.patchIndex=patchIx}); + if (!inserted) + { + // If an element prevented insertion, the patch must be identical! + // Because the conversions don't care about groupIDs, the patches may be identical but not the same object in memory. + assert(inSetIt->second.patchIndex==patchIx || dfsCache.nodes[inSetIt->second.patchIndex.value].patch==dfsCache.nodes[patchIx.value].patch); + inSetIt->second.copyCount++; + } + } + ); + + // work out mapping of `conversionRequests` to multiple GPU objects and their copy groups via counting sort + { + // assign storage offsets via exclusive scan and put the `uniqueGroupID` mappings in sorted order + auto exclScanConvReqs = [&]()->size_t + { + size_t sum = 0; + for (auto& entry : contentHashToCanonical) + { + entry.second.firstCopyIx = sum; + sum += entry.second.copyCount; + } + return sum; + }; + gpuObjUniqueCopyGroupIDs.resize(exclScanConvReqs()); + // + dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void + { + if (created.gpuObj) + return; + auto found = contentHashToCanonical.find(created.contentHash); + // may not find things because of unconverted dummy deps + if (found!=contentHashToCanonical.end()) + gpuObjUniqueCopyGroupIDs[found->second.firstCopyIx++] = instance.uniqueCopyGroupID; + else + { + inputs->logger.log( + "No conversion request made for Asset %p in 
group %d, its impossible to convert.", + system::ILogger::ELL_ERROR,instance.asset,instance.uniqueCopyGroupID + ); + } + } + ); + // `{conversionRequests}.firstCopyIx` needs to be brought back down to exclusive scan form + exclScanConvReqs(); + } + + // we now know the size of out output array + gpuObjects.resize(gpuObjUniqueCopyGroupIDs.size()); + } + + // + template + void assign(const core::blake3_hash_t& contentHash, const size_t baseIx, const size_t copyIx, asset_cached_t::type&& gpuObj, const AssetType* asset=nullptr) + { + const auto hashAsU64 = reinterpret_cast(contentHash.data); + if constexpr (GPUObjectWhollyImmutable) // including any deps! + if (copyIx==1) // Only warn once to reduce log spam + inputs->logger.log( + "Why are you creating multiple Objects for asset content %8llx%8llx%8llx%8llx, when they are a readonly GPU Object Type with no dependants!?", + system::ILogger::ELL_PERFORMANCE,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + // + if (!gpuObj) + { + inputs->logger.log( + "Failed to create GPU Object for asset content %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + return; + } + auto output = gpuObjects.data()+copyIx+baseIx; + output->value = std::move(gpuObj); + const uint64_t uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[copyIx+baseIx]; + if constexpr (std::is_same_v || std::is_same_v) + { + const auto constrainMask = inputs->constrainMemoryTypeBits(uniqueCopyGroupID,asset,contentHash,gpuObj.get()); + if (!deferredAllocator->request(output,constrainMask)) + return; + } + // set debug names on everything! 
+ { + std::ostringstream debugName; + debugName << "Created by Converter "; + debugName << std::hex; + debugName << this; + debugName << " from Asset with hash "; + for (const auto& byte : contentHash.data) + debugName << uint32_t(byte) << " "; + debugName << "for Group " << uniqueCopyGroupID; + output->get()->setObjectDebugName(debugName.str().c_str()); + } + } + + // Since the dfsCache has the original asset pointers as keys, we map in reverse (multiple `instance_t` can map to the same unique content hash and GPU object) + void propagateToCaches(dfs_cache& dfsCache, CAssetConverter::SReserveResult::staging_cache_t& stagingCache) + { + assert(gpuObjUniqueCopyGroupIDs.empty()); + dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void + { + // already found in read cache and not converted + if (created.gpuObj) + return; + + const auto uniqueCopyGroupID = instance.uniqueCopyGroupID; + const auto& contentHash = created.contentHash; + const auto hashAsU64 = reinterpret_cast(contentHash.data); + + auto found = contentHashToCanonical.find(contentHash); + // can happen if deps were unconverted dummies + if (found==contentHashToCanonical.end()) + { + if (contentHash!=CAssetConverter::CHashCache::NoContentHash) + inputs->logger.log( + "Could not find GPU Object for Asset %p in group %ull with Content Hash %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR, instance.asset, uniqueCopyGroupID, hashAsU64[0], hashAsU64[1], hashAsU64[2], hashAsU64[3] + ); + return; + } + // unhashables were not supposed to be added to conversion requests + assert(contentHash!=CAssetConverter::CHashCache::NoContentHash); + + const auto copyIx = found->second.firstCopyIx++; + auto& gpuObj = gpuObjects[copyIx]; + if (!gpuObj) + { + inputs->logger.log( + "Creation of GPU Object (or its dependents) for Content Hash %8llx%8llx%8llx%8llx Copy Index %d from Canonical Asset %p Failed.", + system::ILogger::ELL_ERROR, hashAsU64[0], hashAsU64[1], hashAsU64[2], hashAsU64[3], 
copyIx, found->second.canonicalAsset + ); + return; + } + // insert into staging cache + stagingCache.emplace(gpuObj.get(),typename CAssetConverter::CCache::key_t(contentHash,uniqueCopyGroupID)); + // propagate back to dfsCache + created.gpuObj = std::move(gpuObj); + } + ); + } + + const CAssetConverter::SInputs* inputs; + MetaDeviceMemoryAllocator* deferredAllocator; + core::unordered_map> contentHashToCanonical; + core::vector gpuObjUniqueCopyGroupIDs; + core::vector> gpuObjects; }; // @@ -2548,53 +2752,6 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // can now spawn our own hash cache retval.m_hashCache = core::make_smart_refctd_ptr(); - // Since the dfsCache has the original asset pointers as keys, we map in reverse (multiple `instance_t` can map to the same unique content hash and GPU object) - auto propagateToStagingCache = [&inputs,&dfsCaches,&retval](conversions_t& conversionRequests)->void - { - std::get>(dfsCaches).for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void - { - auto& stagingCache = std::get>(retval.m_stagingCaches); - // already found in read cache and not converted - if (created.gpuObj) - return; - - const auto uniqueCopyGroupID = instance.uniqueCopyGroupID; - const auto& contentHash = created.contentHash; - const auto hashAsU64 = reinterpret_cast(contentHash.data); - - auto& map = conversionRequests.contentHashToCanonical; - auto found = map.find(contentHash); - // can happen if deps were unconverted dummies - if (found==map.end()) - { - if (contentHash!=CHashCache::NoContentHash) - inputs.logger.log( - "Could not find GPU Object for Asset %p in group %ull with Content Hash %8llx%8llx%8llx%8llx", - system::ILogger::ELL_ERROR, instance.asset, uniqueCopyGroupID, hashAsU64[0], hashAsU64[1], hashAsU64[2], hashAsU64[3] - ); - return; - } - // unhashables were not supposed to be added to conversion requests - assert(contentHash!=CHashCache::NoContentHash); - - const auto copyIx = 
found->second.firstCopyIx++; - auto& gpuObj = conversionRequests.gpuObjects[copyIx]; - if (!gpuObj) - { - inputs.logger.log( - "Creation of GPU Object (or its dependents) for Content Hash %8llx%8llx%8llx%8llx Copy Index %d from Canonical Asset %p Failed.", - system::ILogger::ELL_ERROR, hashAsU64[0], hashAsU64[1], hashAsU64[2], hashAsU64[3], copyIx, found->second.canonicalAsset - ); - return; - } - // insert into staging cache - stagingCache.emplace(gpuObj.get(),typename CCache::key_t(contentHash,uniqueCopyGroupID)); - // propagate back to dfsCache - created.gpuObj = std::move(gpuObj); - } - ); - }; - MetaDeviceMemoryAllocator deferredAllocator(inputs.allocator ? inputs.allocator:device,inputs.logger); // BLAS and TLAS creation is somewhat delayed by buffer creation and allocation @@ -2613,176 +2770,27 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // Deduplication, Creation and Propagation auto dedupCreateProp = [&]()->conversions_t { - auto& dfsCache = std::get>(dfsCaches); // This map contains the assets by-hash, identical asset+patch hash the same. // It only has entries for GPU objects that need to be created - conversions_t conversionRequests; + conversions_t conversionRequests = {&inputs,&deferredAllocator}; - // We now go through the dfsCache and work out each entry's content hashes, so that we can carry out unique conversions. + // const CCache* readCache = inputs.readCache ? (&std::get>(inputs.readCache->m_caches)):nullptr; - dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void - { - // compute the hash or look it up if it exists - // We mistrust every dependency such that the eject/update if needed. 
- // Its really important that the Deduplication gets performed Bottom-Up - auto& contentHash = created.contentHash; - PatchOverride patchOverride(inputs,dfsCaches,instance.uniqueCopyGroupID); - contentHash = retval.getHashCache()->hash( - {instance.asset,&created.patch}, - &patchOverride, - /*.mistrustLevel =*/ 1 - ); - // failed to hash all together (only possible reason is failure of `PatchGetter` to provide a valid patch) - if (contentHash==CHashCache::NoContentHash) - { - inputs.logger.log("Could not compute hash for asset %p in group %d, maybe an IPreHashed dependant's content hash is missing?",system::ILogger::ELL_ERROR,instance.asset,instance.uniqueCopyGroupID); - return; - } - const auto hashAsU64 = reinterpret_cast(contentHash.data); - { - inputs.logger.log("Asset (%p,%d) has hash %8llx%8llx%8llx%8llx",system::ILogger::ELL_DEBUG,instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3]); - } - // if we have a read cache, lets retry looking the item up! - if (readCache) - { - // We can't look up "near misses" (supersets of patches) because they'd have different hashes - // and we can't afford to split hairs like finding overlapping buffer ranges, etc. - // Stuff like that would require a completely different hashing/lookup strategy (or multiple fake entries). - const auto found = readCache->find({contentHash,instance.uniqueCopyGroupID}); - if (found!=readCache->forwardMapEnd()) - { - created.gpuObj = found->second; - inputs.logger.log( - "Asset (%p,%d) with hash %8llx%8llx%8llx%8llx found its GPU Object in Read Cache",system::ILogger::ELL_DEBUG, - instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - return; - } - } - // The conversion request we insert needs an instance asset whose unconverted dependencies don't have missing content - // SUPER SIMPLIFICATION: because we hash and search for readCache items bottom up (BFS), we don't need a stack (DFS) here! 
- // Any dependant that's not getting a GPU object due to missing content or GPU cache object for its cache, will show up later during `getDependant` - // An additional optimization would be to improve the `PatchGetter` to check dependants (only deps) during hashing for missing dfs cache gpu Object (no read cache) and no conversion request. - auto* isPrehashed = dynamic_cast(instance.asset); - if (isPrehashed && isPrehashed->missingContent()) - { - inputs.logger.log( - "PreHashed Asset (%p,%d) with hash %8llx%8llx%8llx%8llx has missing content and no GPU Object in Read Cache!",system::ILogger::ELL_ERROR, - instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - return; - } - // then de-duplicate the conversions needed - const patch_index_t patchIx = {static_cast(std::distance(dfsCache.nodes.data(),&created))}; - auto [inSetIt,inserted] = conversionRequests.contentHashToCanonical.emplace(contentHash,unique_conversion_t{.canonicalAsset=instance.asset,.patchIndex=patchIx}); - if (!inserted) - { - // If an element prevented insertion, the patch must be identical! - // Because the conversions don't care about groupIDs, the patches may be identical but not the same object in memory. 
- assert(inSetIt->second.patchIndex==patchIx || dfsCache.nodes[inSetIt->second.patchIndex.value].patch==dfsCache.nodes[patchIx.value].patch); - inSetIt->second.copyCount++; - } - } - ); + conversionRequests.gather(dfsCaches,retval.m_hashCache.get(),readCache); - // work out mapping of `conversionRequests` to multiple GPU objects and their copy groups via counting sort - const auto gpuObjUniqueCopyGroupIDs = [&]()->core::vector - { - core::vector retval; - // now assign storage offsets via exclusive scan and put the `uniqueGroupID` mappings in sorted order - auto exclScanConvReqs = [&]()->size_t - { - size_t sum = 0; - for (auto& entry : conversionRequests.contentHashToCanonical) - { - entry.second.firstCopyIx = sum; - sum += entry.second.copyCount; - } - return sum; - }; - retval.resize(exclScanConvReqs()); - // - dfsCache.for_each([&inputs,&retval,&conversionRequests](const instance_t& instance, dfs_cache::created_t& created)->void - { - if (created.gpuObj) - return; - auto& map = conversionRequests.contentHashToCanonical; - auto found = map.find(created.contentHash); - // may not find things because of unconverted dummy deps - if (found!=map.end()) - retval[found->second.firstCopyIx++] = instance.uniqueCopyGroupID; - else - { - inputs.logger.log( - "No conversion request made for Asset %p in group %d, its impossible to convert.", - system::ILogger::ELL_ERROR,instance.asset,instance.uniqueCopyGroupID - ); - } - } - ); - // `{conversionRequests}.firstCopyIx` needs to be brought back down to exclusive scan form - exclScanConvReqs(); - return retval; - }(); - // - conversionRequests.gpuObjects.resize(gpuObjUniqueCopyGroupIDs.size()); - // - auto assign = [&]( - const core::blake3_hash_t& contentHash, const size_t baseIx, const size_t copyIx, asset_cached_t::type&& gpuObj, const AssetType* asset=nullptr - )->asset_traits::video_t* - { - const auto hashAsU64 = reinterpret_cast(contentHash.data); - if constexpr (GPUObjectWhollyImmutable) // including any deps! 
- if (copyIx==1) // Only warn once to reduce log spam - inputs.logger.log( - "Why are you creating multiple Objects for asset content %8llx%8llx%8llx%8llx, when they are a readonly GPU Object Type with no dependants!?", - system::ILogger::ELL_PERFORMANCE,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - // - if (!gpuObj) - { - inputs.logger.log( - "Failed to create GPU Object for asset content %8llx%8llx%8llx%8llx", - system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - return nullptr; - } - auto output = conversionRequests.gpuObjects.data()+copyIx+baseIx; - const uint64_t uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[copyIx+baseIx]; - if constexpr (std::is_same_v || std::is_same_v) - { - const auto constrainMask = inputs.constrainMemoryTypeBits(uniqueCopyGroupID,asset,contentHash,gpuObj.get()); - if (!deferredAllocator.request(output,constrainMask)) - return nullptr; - } - // set debug names on everything! - { - std::ostringstream debugName; - debugName << "Created by Converter "; - debugName << std::hex; - debugName << this; - debugName << " from Asset with hash "; - for (const auto& byte : contentHash.data) - debugName << uint32_t(byte) << " "; - debugName << "for Group " << uniqueCopyGroupID; - gpuObj.get()->setObjectDebugName(debugName.str().c_str()); - } - output->value = std::move(gpuObj); - return output->value.get(); - }; - GetDependantVisitBase visitBase = { .inputs = inputs, .dfsCaches = dfsCaches }; // Dispatch to correct creation of GPU objects + auto& dfsCache = std::get>(dfsCaches); if constexpr (std::is_same_v) { for (auto& entry : conversionRequests.contentHashToCanonical) for (auto i=0ull; i(entry.first,entry.second.firstCopyIx,i,device->createSampler(entry.second.canonicalAsset->getParams())); + conversionRequests.template assign(entry.first,entry.second.firstCopyIx,i,device->createSampler(entry.second.canonicalAsset->getParams())); } if constexpr (std::is_same_v) { @@ -2797,13 +2805,12 @@ auto 
CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult params.usage = patch.usage; // concurrent ownership if any const auto outIx = i+entry.second.firstCopyIx; - const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; + const auto uniqueCopyGroupID = conversionRequests.gpuObjUniqueCopyGroupIDs[outIx]; const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,asset,patch); params.queueFamilyIndexCount = queueFamilies.size(); params.queueFamilyIndices = queueFamilies.data(); // if creation successful, we will request some memory allocation to bind to, and if thats okay we preliminarily request a conversion - if (IGPUBuffer* const gpuObj=assign(entry.first,entry.second.firstCopyIx,i,device->createBuffer(std::move(params)),asset); gpuObj) - retval.m_bufferConversions.push_back({core::smart_refctd_ptr(asset),gpuObj}); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createBuffer(std::move(params)),asset); } } if constexpr (std::is_same_v || std::is_same_v) @@ -2822,7 +2829,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult const bool motionBlur = patch.isMotion; const auto buildFlags = patch.getBuildFlags(as); const auto outIx = i+entry.second.firstCopyIx; - const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; + const auto uniqueCopyGroupID = conversionRequests.gpuObjUniqueCopyGroupIDs[outIx]; // prevent CPU hangs by making sure allocator big enough to service us in worst case but with best case allocator (no other allocations, clean alloc) const auto minScratchAllocSize = patch.hostBuild ? inputs.scratchForHostASBuildMinAllocSize:inputs.scratchForDeviceASBuildMinAllocSize; uint64_t buildSize = 0; uint32_t buildAlignment = 4; @@ -2946,10 +2953,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult out.storage.value = device->createBuffer(std::move(params)); if (out.storage) if (!deferredAllocator.request(&out.storage,patch.hostBuild ? 
hostBuildMemoryTypes:deviceBuildMemoryTypes)) - { - out.storage.value = nullptr; continue; - } } out.scratchSize = sizes.buildScratchSize; out.motionBlur = motionBlur; @@ -3027,16 +3031,15 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } // concurrent ownership if any const auto outIx = i+entry.second.firstCopyIx; - const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; + const auto uniqueCopyGroupID = conversionRequests.gpuObjUniqueCopyGroupIDs[outIx]; const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,asset,patch); params.queueFamilyIndexCount = queueFamilies.size(); params.queueFamilyIndices = queueFamilies.data(); // gpu image specifics params.tiling = static_cast(patch.linearTiling); params.preinitialized = false; - // if creation successful, we will request some memory allocation to bind to, and if thats okay we preliminarily request a conversion (if we have content to upload) - if (IGPUImage* const gpuObj=assign(entry.first,entry.second.firstCopyIx,i,device->createImage(std::move(params)),asset); gpuObj && !asset->getRegions().empty()) - retval.m_imageConversions.push_back({{core::smart_refctd_ptr(asset),gpuObj},bool(patch.recomputeMips)}); + // if creation successful, we will request some memory allocation to bind to + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createImage(std::move(params)),asset); } } if constexpr (std::is_same_v) @@ -3048,7 +3051,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3057,7 +3060,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if (!visitor()) continue; // no format promotion for buffer views - assign(entry.first,entry.second.firstCopyIx,i,device->createBufferView(visitor.underlying,asset->getFormat())); + 
conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createBufferView(visitor.underlying,asset->getFormat())); } } } @@ -3071,7 +3074,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3100,7 +3103,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // if underlying image had mip-chain extended then we extend our own if (imageParams.mipLevels!=visitor.oldMipCount) params.subresourceRange.levelCount = imageParams.mipLevels-params.subresourceRange.baseMipLevel; - assign(entry.first,entry.second.firstCopyIx,i,device->createImageView(std::move(params))); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createImageView(std::move(params))); } } } @@ -3115,7 +3118,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; icreateShader(createParams)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createShader(createParams)); } } if constexpr (std::is_same_v) @@ -3169,7 +3172,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i> visitor = { { @@ -3181,7 +3184,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }; if (!visitor()) continue; - assign(entry.first,entry.second.firstCopyIx,i,device->createDescriptorSetLayout(bindings)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createDescriptorSetLayout(bindings)); } } } @@ -3224,7 +3227,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3233,7 +3236,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if (!visitor()) continue; auto layout = 
device->createPipelineLayout(pcRanges,std::move(visitor.dsLayouts[0]),std::move(visitor.dsLayouts[1]),std::move(visitor.dsLayouts[2]),std::move(visitor.dsLayouts[3])); - assign(entry.first,entry.second.firstCopyIx,i,std::move(layout)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(layout)); } } } @@ -3247,7 +3250,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { // since we don't have dependants we don't care about our group ID // we create threadsafe pipeline caches, because we have no idea how they may be used - assign.template operator()(entry.first,entry.second.firstCopyIx,i,device->createPipelineCache(asset,false)); + conversionRequests.template assign(entry.first,entry.second.firstCopyIx,i,device->createPipelineCache(asset,false)); } } } @@ -3260,7 +3263,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3278,7 +3281,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult params.shader = visitor.getSpecInfo(IShader::E_SHADER_STAGE::ESS_COMPUTE); device->createComputePipelines(inputs.pipelineCache,{¶ms,1},&ppln); } - assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); } } } @@ -3292,7 +3295,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { // since we don't have dependants we don't care about our group ID // we create threadsafe pipeline caches, because we have no idea how they may be used - assign.template operator()(entry.first,entry.second.firstCopyIx,i,device->createRenderpass(asset->getCreationParameters())); + conversionRequests.template assign(entry.first,entry.second.firstCopyIx,i,device->createRenderpass(asset->getCreationParameters())); } } } @@ -3307,7 +3310,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto 
i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3337,7 +3340,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } params.cached = asset->getCachedCreationParams(); device->createGraphicsPipelines(inputs.pipelineCache,{¶ms,1},&ppln); - assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); } } } @@ -3354,7 +3357,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3383,127 +3386,135 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } else inputs.logger.log("Failed to create Descriptor Pool suited for Layout %s",system::ILogger::ELL_ERROR,layout->getObjectDebugName()); - assign(entry.first,entry.second.firstCopyIx,i,std::move(ds)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(ds)); } } } - // This gets deferred till AFTER the Buffer Memory Allocations and Binding for Acceleration Structures - if constexpr (!std::is_base_of_v) + // clear what we don't need + conversionRequests.gpuObjUniqueCopyGroupIDs.clear(); + // This gets deferred till AFTER the Buffer Memory Allocations and Binding + if constexpr (!std::is_base_of_v && !std::is_base_of_v::video_t>) { - propagateToStagingCache.template operator()(conversionRequests); + conversionRequests.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); return {}; } return conversionRequests; }; - // The order of these calls is super important to go BOTTOM UP in terms of hashing and conversion dependants. - // Both so we can hash in O(Depth) and not O(Depth^2) but also so we have all the possible dependants ready. - // If two Asset chains are independent then we order them from most catastrophic failure to least. 
- dedupCreateProp.template operator()(); - dedupCreateProp.template operator()(); - dedupCreateProp.template operator()(); - dedupCreateProp.template operator()(); - // now allocate the memory for buffers and images - deferredAllocator.finalize(); - - // find out which buffers need to be uploaded via a staging buffer - std::erase_if(retval.m_bufferConversions,[&](const SReserveResult::SConvReqBuffer& conv)->bool + // scope so the conversion requests go our of scope early + { + // The order of these calls is super important to go BOTTOM UP in terms of hashing and conversion dependants. + // Both so we can hash in O(Depth) and not O(Depth^2) but also so we have all the possible dependants ready. + // If two Asset chains are independent then we order them from most catastrophic failure to least. + auto bufferConversions = dedupCreateProp.template operator()(); + auto blasConversions = dedupCreateProp.template operator()(); + auto tlasConversions = dedupCreateProp.template operator()(); + auto imageConversions = dedupCreateProp.template operator()(); + // now allocate the memory for buffers and images + deferredAllocator.finalize(); + + // find out which buffers need to be uploaded via a staging buffer + for (auto& entry : bufferConversions.contentHashToCanonical) + for (auto i=0ull; igetBoundMemory(); - if (!boundMemory.isValid()) - return true; - if (!canHostWriteToMemoryRange(boundMemory,conv.gpuObj->getSize())) + const auto boundMemory = gpuBuff->getBoundMemory(); + assert(boundMemory.isValid()); + if (!canHostWriteToMemoryRange(boundMemory,gpuBuff->getSize())) retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; - return false; + retval.m_bufferConversions.push_back({core::smart_refctd_ptr(entry.second.canonicalAsset),gpuBuff.get()}); } - ); - // Deal with Deferred Creation of Acceleration structures - { - const auto minScratchAlignment = device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; - auto 
createAccelerationStructures = [&]()->void + bufferConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); + // Deal with Deferred Creation of Acceleration structures { - constexpr bool IsTLAS = std::is_same_v; - // TLAS and BLAS can't build concurrently, index 0 is device build, 1 is host build - size_t scratchSizeFullParallelBuild[2] = {0,0}; - // - core::vector>* pConversions; - if constexpr (IsTLAS) - pConversions = retval.m_tlasConversions; - else - pConversions = retval.m_blasConversions; - // we collect that stats AFTER making sure that the BLAS / TLAS can actually be created - for (const auto& deferredParams : accelerationStructureParams[IsTLAS]) + const auto minScratchAlignment = device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; + auto createAccelerationStructures = [&]()->void { - // buffer failed to create/allocate - if (!deferredParams.storage) - continue; - const auto bufSz = deferredParams.storage.get()->getSize(); - IGPUAccelerationStructure::SCreationParams baseParams; - { - using create_f = IGPUAccelerationStructure::SCreationParams::FLAGS; - baseParams = { - .bufferRange = {.offset=0,.size=bufSz,.buffer=deferredParams.storage.value}, - .flags = deferredParams.motionBlur ? create_f::MOTION_BIT:create_f::NONE - }; - } + constexpr bool IsTLAS = std::is_same_v; + // TLAS and BLAS can't build concurrently, index 0 is device build, 1 is host build + size_t scratchSizeFullParallelBuild[2] = {0,0}; // - auto& request = pConversions[deferredParams.hostBuild].emplace_back(); - request.canonical = smart_refctd_ptr(static_cast(deferredParams.canonical)); - smart_refctd_ptr::video_t> as; + core::vector>* pConversions; if constexpr (IsTLAS) - { - // is there any reason for it to be more? 
- const uint32_t maxInstances = request.canonical->getInstances().size(); - as = device->createTopLevelAccelerationStructure({std::move(baseParams),maxInstances}); - } + pConversions = retval.m_tlasConversions; else - as = device->createBottomLevelAccelerationStructure(std::move(baseParams)); - request.gpuObj = as.get(); - if (!request.gpuObj) + pConversions = retval.m_blasConversions; + // we collect that stats AFTER making sure that the BLAS / TLAS can actually be created + for (const auto& deferredParams : accelerationStructureParams[IsTLAS]) { - inputs.logger.log("Failed to Create Acceleration Structure.",system::ILogger::ELL_ERROR); - continue; + // buffer failed to create/allocate + if (!deferredParams.storage) + continue; + const auto bufSz = deferredParams.storage.get()->getSize(); + IGPUAccelerationStructure::SCreationParams baseParams; + { + using create_f = IGPUAccelerationStructure::SCreationParams::FLAGS; + baseParams = { + .bufferRange = {.offset=0,.size=bufSz,.buffer=deferredParams.storage.value}, + .flags = deferredParams.motionBlur ? create_f::MOTION_BIT:create_f::NONE + }; + } + // + auto& request = pConversions[deferredParams.hostBuild].emplace_back(); + request.canonical = smart_refctd_ptr(static_cast(deferredParams.canonical)); + smart_refctd_ptr::video_t> as; + if constexpr (IsTLAS) + { + // is there any reason for it to be more? 
+ const uint32_t maxInstances = request.canonical->getInstances().size(); + as = device->createTopLevelAccelerationStructure({std::move(baseParams),maxInstances}); + } + else + as = device->createBottomLevelAccelerationStructure(std::move(baseParams)); + request.gpuObj = as.get(); + if (!request.gpuObj) + { + inputs.logger.log("Failed to Create Acceleration Structure.",system::ILogger::ELL_ERROR); + continue; + } + request.scratchSize = deferredParams.scratchSize; + request.compact = deferredParams.compactAfterBuild; + request.buildFlags = deferredParams.buildFlags; + // sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently + retval.m_minASBuildScratchSize[deferredParams.hostBuild] = core::max(retval.m_minASBuildScratchSize[deferredParams.hostBuild],deferredParams.buildSize); + scratchSizeFullParallelBuild[deferredParams.hostBuild] += deferredParams.buildSize; + // note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build + if (deferredParams.compactAfterBuild) + retval.m_compactedASMaxMemory += bufSz; } - request.scratchSize = deferredParams.scratchSize; - request.compact = deferredParams.compactAfterBuild; - request.buildFlags = deferredParams.buildFlags; - // sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently - retval.m_minASBuildScratchSize[deferredParams.hostBuild] = core::max(retval.m_minASBuildScratchSize[deferredParams.hostBuild],deferredParams.buildSize); - scratchSizeFullParallelBuild[deferredParams.hostBuild] += deferredParams.buildSize; - // note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build - if (deferredParams.compactAfterBuild) - retval.m_compactedASMaxMemory += bufSz; - } - retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild[0],retval.m_maxASBuildScratchSize[0]); - retval.m_maxASBuildScratchSize[1] = 
core::max(scratchSizeFullParallelBuild[1],retval.m_maxASBuildScratchSize[1]); - }; - createAccelerationStructures.template operator()(); - createAccelerationStructures.template operator()(); - // - if (retval.willDeviceASBuild() || retval.willCompactAS()) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; - } - // find out which images need what caps for the transfer and mipmapping - std::erase_if(retval.m_imageConversions,[&](const SReserveResult::SConvReqImage& conv)->bool + retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild[0],retval.m_maxASBuildScratchSize[0]); + retval.m_maxASBuildScratchSize[1] = core::max(scratchSizeFullParallelBuild[1],retval.m_maxASBuildScratchSize[1]); + }; + createAccelerationStructures.template operator()(); + blasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); +// TODO: don't build BLASes which aren't roots or use by any TLAS + createAccelerationStructures.template operator()(); + tlasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); + // + if (retval.willDeviceASBuild() || retval.willCompactAS()) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + } + // find out which images need what caps for the transfer and mipmapping + auto& dfsCacheImages = std::get>(dfsCaches); + for (auto& entry : imageConversions.contentHashToCanonical) + for (auto i=0ull; igetRegions().empty()) { - assert(conv.gpuObj); - const auto boundMemory = conv.gpuObj->getBoundMemory(); - if (!boundMemory.isValid()) - return true; + const auto boundMemory = gpuImg->getBoundMemory(); + assert(boundMemory.isValid()); retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; - if (conv.recomputeMips) + const bool recomputeMips = dfsCacheImages.nodes[entry.second.patchIndex.value].patch.recomputeMips; + if (recomputeMips) retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; // Best effort guess, without actually looking at all regions - const auto& 
params = conv.gpuObj->getCreationParameters(); + const auto& params = gpuImg->getCreationParameters(); // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 if (isDepthOrStencilFormat(params.format) && (params.depthUsage|params.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT)) retval.m_queueFlags |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT; - return false; + retval.m_imageConversions.push_back({{core::smart_refctd_ptr(entry.second.canonicalAsset),gpuImg.get()},recomputeMips}); } - ); - - + imageConversions.propagateToCaches(dfsCacheImages,std::get>(retval.m_stagingCaches)); + } dedupCreateProp.template operator()(); dedupCreateProp.template operator()(); dedupCreateProp.template operator()(); From e0fe7ed43fea13f2db838f94d276f5bb45794b9c Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 9 May 2025 17:23:02 +0200 Subject: [PATCH 057/346] Start work on fixing orphan GPU objects due to parent failures in `CAssetConverter::reserve` TODO: make the staging cache refcounted with heterogenous (non-refcounted) lookup --- include/nbl/video/utilities/CAssetConverter.h | 3 +- src/nbl/video/utilities/CAssetConverter.cpp | 277 +++++++++++------- 2 files changed, 170 insertions(+), 110 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 02cc9ab447..9175f20a86 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -1083,6 +1083,7 @@ class CAssetConverter : public core::IReferenceCounted // we don't insert into the writeCache until conversions are successful core::tuple_transform_t m_stagingCaches; + // need a more explicit list of GPU objects that need device-assisted conversion template struct SConversionRequestBase @@ -1141,7 +1142,7 @@ class CAssetConverter : public core::IReferenceCounted IGPUDescriptorSet* dstSet; uint32_t binding; uint32_t 
arrayElement; - core::smart_refctd_ptr tlas; + const IGPUTopLevelAccelerationStructure* tlas; }; struct SDeferredTLASWriteHasher { diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 0dc431f8ae..32f9408365 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1655,10 +1655,8 @@ template<> class GetDependantVisit : public GetDependantVisitBase { public: - // all instances need to be aligned to 16 bytes so alignment irrelevant (everything can be tightly packed) and implicit - uint64_t buildInputSize = 0; - // - CAssetConverter::SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap; + // TODO: deal with usages not going through because of cancelled TLAS builds, by gathering in a top-down pass at the end of `reserve` + CAssetConverter::SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap = nullptr; protected: bool descend_impl( @@ -1670,15 +1668,16 @@ class GetDependantVisit : public GetDependant auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; - const auto instances = user.asset->getInstances(); - assert(instanceIndexfind(dep.asset); - if (foundBLAS!=blasBuildMap->end()) - foundBLAS->second.remainingUsages++; - else - blasBuildMap->insert(foundBLAS,{dep.asset,{depObj}}); + if (blasBuildMap) + { + const auto instances = user.asset->getInstances(); + assert(instanceIndexfind(dep.asset); + if (foundBLAS!=blasBuildMap->end()) + foundBLAS->second.remainingUsages++; + else + blasBuildMap->insert(foundBLAS,{dep.asset,{depObj}}); + } return true; } }; @@ -1961,7 +1960,7 @@ class GetDependantVisit : public GetDependantVisitBase) { - deferredTLASWrites.push_back({nullptr,binding.data,element,depObj}); + deferredTLASWrites.push_back({nullptr,binding.data,element,depObj.get()}); return true; } // @@ -2305,6 +2304,20 @@ struct unique_conversion_t size_t copyCount : 24 = 1u; }; +// +inline void setDebugName(const CAssetConverter* conv, IBackendObject* 
gpuObj, const core::blake3_hash_t& contentHash, const uint64_t uniqueCopyGroupID) +{ + std::ostringstream debugName; + debugName << "Created by Converter "; + debugName << std::hex; + debugName << conv; + debugName << " from Asset with hash "; + for (const auto& byte : contentHash.data) + debugName << uint32_t(byte) << " "; + debugName << "for Group " << uniqueCopyGroupID; + gpuObj->setObjectDebugName(debugName.str().c_str()); +} + // Map from ContentHash to canonical asset & patch and the list of uniqueCopyGroupIDs template struct conversions_t @@ -2449,17 +2462,7 @@ struct conversions_t return; } // set debug names on everything! - { - std::ostringstream debugName; - debugName << "Created by Converter "; - debugName << std::hex; - debugName << this; - debugName << " from Asset with hash "; - for (const auto& byte : contentHash.data) - debugName << uint32_t(byte) << " "; - debugName << "for Group " << uniqueCopyGroupID; - output->get()->setObjectDebugName(debugName.str().c_str()); - } + setDebugName(conv,output->get(),contentHash,uniqueCopyGroupID); } // Since the dfsCache has the original asset pointers as keys, we map in reverse (multiple `instance_t` can map to the same unique content hash and GPU object) @@ -2508,6 +2511,7 @@ struct conversions_t ); } + const CAssetConverter* conv; const CAssetConverter::SInputs* inputs; MetaDeviceMemoryAllocator* deferredAllocator; core::unordered_map> contentHashToCanonical; @@ -2759,11 +2763,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { const IAccelerationStructure* canonical; asset_cached_t storage = {}; - uint64_t scratchSize : 45 = 0; - uint64_t motionBlur : 1 = false; - uint64_t buildFlags : 16 = 0; - uint64_t hostBuild : 1 = false; - uint64_t compactAfterBuild : 1 = false; + uint64_t patchIx = 0; + uint64_t uniqueCopyGroupID = 0; + uint64_t scratchSize = 0; uint64_t buildSize = 0; }; core::vector accelerationStructureParams[2]; @@ -2772,7 +2774,7 @@ auto CAssetConverter::reserve(const 
SInputs& inputs) -> SReserveResult { // This map contains the assets by-hash, identical asset+patch hash the same. // It only has entries for GPU objects that need to be created - conversions_t conversionRequests = {&inputs,&deferredAllocator}; + conversions_t conversionRequests = {this,&inputs,&deferredAllocator}; // const CCache* readCache = inputs.readCache ? (&std::get>(inputs.readCache->m_caches)):nullptr; @@ -2825,7 +2827,8 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i SReserveResult { if constexpr (IsTLAS) { - AssetVisitor> visitor = { - {visitBase}, - {as,uniqueCopyGroupID}, - patch - }; - if (!visitor()) - { - inputs.logger.log( - "Failed to find all GPU Bottom Level Acceleration Structures needed to build TLAS %8llx%8llx%8llx%8llx", - system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - continue; - } + // TLAS can't check for the BLASes existing yet, because they haven't had their backing buffers allocated yet const auto instanceCount = as->getInstances().size(); sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,instanceCount); - incrementBuildSize(visitor.buildInputSize,16); + // all instances need to be aligned to 16 bytes so alignment irrelevant (everything can be tightly packed) and implicit + const uint64_t worstCaseInstanceSize = motionBlur ? 
IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance::LargestUnionMemberSize:sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance); + // worst case approximation is fine here + incrementBuildSize(worstCaseInstanceSize*instanceCount,16); incrementBuildSize(sizeof(uint64_t)*instanceCount,alignof(uint64_t)); } else @@ -2952,14 +2946,15 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult params.queueFamilyIndices = queueFamilies.data(); out.storage.value = device->createBuffer(std::move(params)); if (out.storage) - if (!deferredAllocator.request(&out.storage,patch.hostBuild ? hostBuildMemoryTypes:deviceBuildMemoryTypes)) - continue; + { + nbl::video::setDebugName(this,out.storage.value.get(),entry.first,uniqueCopyGroupID); + if (!deferredAllocator.request(&out.storage,patch.hostBuild ? hostBuildMemoryTypes:deviceBuildMemoryTypes)) + continue; + } } + out.patchIx = patchIx; + out.uniqueCopyGroupID = uniqueCopyGroupID; out.scratchSize = sizes.buildScratchSize; - out.motionBlur = motionBlur; - out.buildFlags = static_cast(buildFlags.value); - out.hostBuild = patch.hostBuild; - out.compactAfterBuild = patch.compactAfterBuild; out.buildSize = buildSize; } } @@ -3413,6 +3408,8 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // now allocate the memory for buffers and images deferredAllocator.finalize(); + // TODO: everything below is slightly wrong due to not having a final top-down dependency checking pass throwing away useless non-root GPU subtrees + // find out which buffers need to be uploaded via a staging buffer for (auto& entry : bufferConversions.contentHashToCanonical) for (auto i=0ull; i SReserveResult else pConversions = retval.m_blasConversions; // we collect that stats AFTER making sure that the BLAS / TLAS can actually be created - for (const auto& deferredParams : accelerationStructureParams[IsTLAS]) + for (size_t i=0; i(deferredParams.canonical); + const auto& dfsNode = 
std::get>(dfsCaches).nodes[deferredParams.patchIx]; + const auto& patch = dfsNode.patch; + // create the AS const auto bufSz = deferredParams.storage.get()->getSize(); IGPUAccelerationStructure::SCreationParams baseParams; { using create_f = IGPUAccelerationStructure::SCreationParams::FLAGS; baseParams = { .bufferRange = {.offset=0,.size=bufSz,.buffer=deferredParams.storage.value}, - .flags = deferredParams.motionBlur ? create_f::MOTION_BIT:create_f::NONE + .flags = patch.isMotion ? create_f::MOTION_BIT:create_f::NONE }; } - // - auto& request = pConversions[deferredParams.hostBuild].emplace_back(); - request.canonical = smart_refctd_ptr(static_cast(deferredParams.canonical)); smart_refctd_ptr::video_t> as; if constexpr (IsTLAS) { + // check if the BLASes we want to use for the instances were successfully allocated and created + AssetVisitor> visitor = { + {inputs,dfsCaches,&retval.m_blasBuildMap}, + {canonical,deferredParams.uniqueCopyGroupID}, + patch + }; + if (!visitor()) + { + inputs.logger.log( + "Failed to find all GPU Bottom Level Acceleration Structures needed to build TLAS %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR//,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + continue; + } // is there any reason for it to be more? 
- const uint32_t maxInstances = request.canonical->getInstances().size(); + const uint32_t maxInstances = canonical->getInstances().size(); as = device->createTopLevelAccelerationStructure({std::move(baseParams),maxInstances}); } else as = device->createBottomLevelAccelerationStructure(std::move(baseParams)); - request.gpuObj = as.get(); - if (!request.gpuObj) + if (!as) { inputs.logger.log("Failed to Create Acceleration Structure.",system::ILogger::ELL_ERROR); continue; } + // file the request for conversion + auto& request = pConversions[patch.hostBuild].emplace_back(); + request.canonical = smart_refctd_ptr(canonical); + request.gpuObj = as.get(); request.scratchSize = deferredParams.scratchSize; - request.compact = deferredParams.compactAfterBuild; - request.buildFlags = deferredParams.buildFlags; + request.compact = patch.compactAfterBuild; + request.buildFlags = static_cast(patch.getBuildFlags(canonical).value); // sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently - retval.m_minASBuildScratchSize[deferredParams.hostBuild] = core::max(retval.m_minASBuildScratchSize[deferredParams.hostBuild],deferredParams.buildSize); - scratchSizeFullParallelBuild[deferredParams.hostBuild] += deferredParams.buildSize; + retval.m_minASBuildScratchSize[patch.hostBuild] = core::max(retval.m_minASBuildScratchSize[patch.hostBuild],deferredParams.buildSize); + scratchSizeFullParallelBuild[patch.hostBuild] += deferredParams.buildSize; // note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build - if (deferredParams.compactAfterBuild) + if (patch.compactAfterBuild) retval.m_compactedASMaxMemory += bufSz; } retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild[0],retval.m_maxASBuildScratchSize[0]); @@ -3487,7 +3500,6 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }; createAccelerationStructures.template operator()(); 
blasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); -// TODO: don't build BLASes which aren't roots or use by any TLAS createAccelerationStructures.template operator()(); tlasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); // @@ -3566,6 +3578,52 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }; core::for_each_in_tuple(inputs.assets,finalize); + // A failed conversion can cause dangling GPU object pointers, and needless work for objects which will die soon after, so prune with a Top-Down pass anything thats not reachable from a root + { + // we use a genious trick, if someone else is using the GPU object, the refcount must obviously be greater than 1 + auto pruneStaging = [&]()->void + { + auto& stagingCache = std::get>(retval.m_stagingCaches); + phmap::erase_if(stagingCache,[](const auto& entry)->bool + { + if constexpr (std::is_same_v) + { + // TODO: gather into m_deferredTLASDescriptorWrites + } + return entry.first->getReferenceCount()==1; + } + ); + }; + // The order these are called is paramount, the Higher Level User needs to die to let go of dependants and make our Garbage Collection work +// pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + // need to nerf any writes to descriptor sets which don't exist anymore before checking the refcounts on them + phmap::erase_if(retval.m_deferredTLASDescriptorWrites,[&](const auto& entry)->bool + { + auto& dsStaging = std::get>(retval.m_stagingCaches); + return 
dsStaging.find(entry.dstSet)!=dsStaging.end(); + } + ); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + } + + // TODO: defer the conversion requests until final objects are known (or knock them out) -> maybe change the conversion requests to unordered_map ? + + // TODO: only now get the queue flags + retval.m_converter = core::smart_refctd_ptr(this); retval.m_logger = system::logger_opt_smart_ptr(core::smart_refctd_ptr(inputs.logger.get())); return retval; @@ -3610,6 +3668,40 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } }; + // + auto findInStaging = [&reservations](const typename asset_traits::video_t* gpuObj)->core::blake3_hash_t* + { + auto& stagingCache = std::get>(reservations.m_stagingCaches); + const auto found = stagingCache.find(const_cast::video_t*>(gpuObj)); + assert(found!=stagingCache.end()); + return const_cast(&found->second.value); + }; + // wipe gpu item in staging cache (this may drop it as well if it was made for only a root asset == no users) + core::unordered_map outputReverseMap; + core::for_each_in_tuple(reservations.m_gpuObjects,[&outputReverseMap](const auto& gpuObjects)->void + { + uint32_t i = 0; + for (const auto& gpuObj : gpuObjects) + outputReverseMap[gpuObj.value.get()] = i++; + } + ); + auto markFailureInStaging = [&reservations,&outputReverseMap,logger](const char* message, smart_refctd_ptr& canonical, const typename asset_traits::video_t* gpuObj, core::blake3_hash_t* hash)->void + { + // wipe the smart pointer to the canonical, make sure we release that memory ASAP if no other user is around + canonical = nullptr; + logger.log("%s failed for \"%s\"",system::ILogger::ELL_ERROR,message,gpuObj->getObjectDebugName()); + // change the content hash on the reverse map to a NoContentHash + *hash = CHashCache::NoContentHash; + // also drop the smart pointer from the output array so failures release memory quickly + const auto foundIx = 
outputReverseMap.find(gpuObj); + if (foundIx!=outputReverseMap.end()) + { + auto& resultOutput = std::get>(reservations.m_gpuObjects); + resultOutput[foundIx->second].value = nullptr; + outputReverseMap.erase(foundIx); + } + }; + // compacted TLASes need to be substituted in cache and Descriptor Sets core::unordered_map> compactedTLASMap; // Anything to do? @@ -3776,40 +3868,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return retval; } - // - auto findInStaging = [&reservations](const typename asset_traits::video_t* gpuObj)->core::blake3_hash_t* - { - auto& stagingCache = std::get>(reservations.m_stagingCaches); - const auto found = stagingCache.find(const_cast::video_t*>(gpuObj)); - assert(found!=stagingCache.end()); - return const_cast(&found->second.value); - }; - // wipe gpu item in staging cache (this may drop it as well if it was made for only a root asset == no users) - core::unordered_map outputReverseMap; - core::for_each_in_tuple(reservations.m_gpuObjects,[&outputReverseMap](const auto& gpuObjects)->void - { - uint32_t i = 0; - for (const auto& gpuObj : gpuObjects) - outputReverseMap[gpuObj.value.get()] = i++; - } - ); - auto markFailureInStaging = [&reservations,&outputReverseMap,logger](const char* message, smart_refctd_ptr& canonical, const typename asset_traits::video_t* gpuObj, core::blake3_hash_t* hash)->void - { - // wipe the smart pointer to the canonical, make sure we release that memory ASAP if no other user is around - canonical = nullptr; - logger.log("%s failed for \"%s\"",system::ILogger::ELL_ERROR,message,gpuObj->getObjectDebugName()); - // change the content hash on the reverse map to a NoContentHash - *hash = CHashCache::NoContentHash; - // also drop the smart pointer from the output array so failures release memory quickly - const auto foundIx = outputReverseMap.find(gpuObj); - if (foundIx!=outputReverseMap.end()) - { - auto& resultOutput = std::get>(reservations.m_gpuObjects); - 
resultOutput[foundIx->second].value = nullptr; - outputReverseMap.erase(foundIx); - } - }; - // core::bitflag submitsNeeded = IQueue::FAMILY_FLAGS::NONE; @@ -5191,16 +5249,13 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul for (auto& inWrite : tlasWriteMap) { // I know what I'm doing, this member has no influence on the set key hash - auto& tlas = const_cast&>(inWrite.tlas); + auto tlas = core::smart_refctd_ptr(const_cast(inWrite.tlas)); assert(tlas); if (missingDependent.template operator()(tlas.get())) - { - tlas = nullptr; continue; - } if (const auto foundCompacted=compactedTLASMap.find(tlas.get()); foundCompacted!=compactedTLASMap.end()) tlas = foundCompacted->second; - pInfo->desc = tlas; + pInfo->desc = std::move(tlas); writes.push_back({ .dstSet = inWrite.dstSet, .binding = inWrite.binding, @@ -5214,7 +5269,11 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // if the descriptor write fails, we make the Descriptor Sets behave as-if the TLAS build failed (dep is missing) if (!writes.empty() && !device->updateDescriptorSets(writes,{})) for (auto& inWrite : tlasWriteMap) - const_cast&>(inWrite.tlas) = nullptr; + { + auto* pHash = findInStaging.template operator()(inWrite.dstSet); + smart_refctd_ptr dummy; + markFailureInStaging("writing TLAS to Descriptor Set binding",dummy,inWrite.dstSet,pHash); + } } mergeCache.template operator()(); // needed for the IGPUDescriptorSets to check if TLAS exists/was written, can be released now From b044144e5c16c8e5e87ce4e5c5d3392d62f76f10 Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 10 May 2025 07:13:19 +0200 Subject: [PATCH 058/346] the deferred TLAS descriptor writes need to refcount the TLASes --- include/nbl/video/utilities/CAssetConverter.h | 4 ++-- src/nbl/video/utilities/CAssetConverter.cpp | 19 +++++++++---------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 
9175f20a86..829735327c 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -1136,13 +1136,13 @@ class CAssetConverter : public core::IReferenceCounted { inline bool operator==(const SDeferredTLASWrite& other) const { - return dstSet == other.dstSet && binding == other.binding && arrayElement == other.arrayElement; + return dstSet==other.dstSet && binding==other.binding && arrayElement==other.arrayElement; } IGPUDescriptorSet* dstSet; uint32_t binding; uint32_t arrayElement; - const IGPUTopLevelAccelerationStructure* tlas; + core::smart_refctd_ptr tlas; }; struct SDeferredTLASWriteHasher { diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 32f9408365..e1816dbe1e 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1960,7 +1960,7 @@ class GetDependantVisit : public GetDependantVisitBase) { - deferredTLASWrites.push_back({nullptr,binding.data,element,depObj.get()}); + deferredTLASWrites.push_back({nullptr,binding.data,element,depObj}); return true; } // @@ -3586,11 +3586,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult auto& stagingCache = std::get>(retval.m_stagingCaches); phmap::erase_if(stagingCache,[](const auto& entry)->bool { - if constexpr (std::is_same_v) - { - // TODO: gather into m_deferredTLASDescriptorWrites - } - return entry.first->getReferenceCount()==1; + if (entry.first->getReferenceCount()==1) + return true; + return false; } ); }; @@ -3608,19 +3606,20 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult pruneStaging.template operator()(); pruneStaging.template operator()(); pruneStaging.template operator()(); - // need to nerf any writes to descriptor sets which don't exist anymore before checking the refcounts on them + // because Descriptor Sets don't hold onto TLASes yet, we need to drop the TLASes in deferred descriptor 
writes phmap::erase_if(retval.m_deferredTLASDescriptorWrites,[&](const auto& entry)->bool { auto& dsStaging = std::get>(retval.m_stagingCaches); - return dsStaging.find(entry.dstSet)!=dsStaging.end(); + return dsStaging.find(entry.dstSet)==dsStaging.end(); } ); pruneStaging.template operator()(); +// go over pruneStaging.template operator()(); pruneStaging.template operator()(); } - // TODO: defer the conversion requests until final objects are known (or knock them out) -> maybe change the conversion requests to unordered_map ? + // TODO: prune the conversion requests -> maybe change the conversion requests to unordered_map ? // TODO: only now get the queue flags @@ -5249,7 +5248,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul for (auto& inWrite : tlasWriteMap) { // I know what I'm doing, this member has no influence on the set key hash - auto tlas = core::smart_refctd_ptr(const_cast(inWrite.tlas)); + auto tlas = core::smart_refctd_ptr(const_cast(inWrite.tlas.get())); assert(tlas); if (missingDependent.template operator()(tlas.get())) continue; From 8555fad476c7ee91e8bfb37ab23d05b3ce2de83e Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 10 May 2025 14:44:42 +0200 Subject: [PATCH 059/346] turned conversion requests into `unordered_map`s so they're easier to knock out --- include/nbl/video/utilities/CAssetConverter.h | 36 +-- src/nbl/video/utilities/CAssetConverter.cpp | 244 ++++++++++-------- 2 files changed, 156 insertions(+), 124 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 829735327c..12326acc6c 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -1085,36 +1085,29 @@ class CAssetConverter : public core::IReferenceCounted core::tuple_transform_t m_stagingCaches; // need a more explicit list of GPU objects that need device-assisted conversion - template - struct SConversionRequestBase - { - // canonical 
asset (the one that provides content) - core::smart_refctd_ptr canonical; - // gpu object to transfer canonical's data to or build it from - asset_traits::video_t* gpuObj; - }; - using SConvReqBuffer = SConversionRequestBase; - core::vector m_bufferConversions; - struct SConvReqImage : SConversionRequestBase + core::unordered_map> m_bufferConversions; + struct SConvReqImage { + core::smart_refctd_ptr canonical = nullptr; uint16_t recomputeMips = 0; }; - core::vector m_imageConversions; + core::unordered_map m_imageConversions; template - struct SConvReqAccelerationStructure : SConversionRequestBase + struct SConvReqAccelerationStructure { using build_f = typename asset_traits::video_t::BUILD_FLAGS; inline void setBuildFlags(const build_f _flags) {buildFlags = static_cast(_flags);} inline build_f getBuildFlags() const {return static_cast(buildFlags);} + core::smart_refctd_ptr canonical = nullptr; uint64_t scratchSize : 45; uint64_t compact : 1; uint64_t buildFlags : 16 = 0; }; using SConvReqBLAS = SConvReqAccelerationStructure; - core::vector m_blasConversions[2]; + core::unordered_map m_blasConversions[2]; using SConvReqTLAS = SConvReqAccelerationStructure; - core::vector m_tlasConversions[2]; + core::unordered_map m_tlasConversions[2]; // array index 0 for device builds, 1 for host builds uint64_t m_minASBuildScratchSize[2] = {0,0}; @@ -1136,25 +1129,22 @@ class CAssetConverter : public core::IReferenceCounted { inline bool operator==(const SDeferredTLASWrite& other) const { - return dstSet==other.dstSet && binding==other.binding && arrayElement==other.arrayElement; + return binding==other.binding && arrayElement==other.arrayElement; } - IGPUDescriptorSet* dstSet; uint32_t binding; uint32_t arrayElement; - core::smart_refctd_ptr tlas; + core::smart_refctd_ptr tlas; }; struct SDeferredTLASWriteHasher { inline size_t operator()(const SDeferredTLASWrite& write) const { - size_t retval = std::bit_cast(write.dstSet); - core::hash_combine(retval,write.binding); - 
core::hash_combine(retval,write.arrayElement); - return retval; + return std::hash()((uint64_t(write.binding)<<32)|write.arrayElement); } }; - core::unordered_set m_deferredTLASDescriptorWrites; + using deferred_tlas_write_set_t = core::unordered_set; + core::unordered_map m_deferredTLASDescriptorWrites; // core::bitflag m_queueFlags = IQueue::FAMILY_FLAGS::NONE; diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index e1816dbe1e..b90be0b323 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1881,8 +1881,6 @@ class GetDependantVisit : public GetDependantVisitBase : public GetDependantVisitBase writes = {}; core::vector infos = {}; - core::vector deferredTLASWrites; + CAssetConverter::SReserveResult::deferred_tlas_write_set_t deferredTLASWrites; // has to be public because of aggregate init, but its only for internal usage! uint32_t lastBinding; uint32_t lastElement; @@ -1960,7 +1958,8 @@ class GetDependantVisit : public GetDependantVisitBase) { - deferredTLASWrites.push_back({nullptr,binding.data,element,depObj}); + const auto [where,inserted] =deferredTLASWrites.insert({binding.data,element,depObj}); + assert(inserted); return true; } // @@ -3377,7 +3376,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult ds = nullptr; } else - retval.m_deferredTLASDescriptorWrites.insert(visitor.deferredTLASWrites.begin(),visitor.deferredTLASWrites.end()); + retval.m_deferredTLASDescriptorWrites[ds.get()] = std::move(visitor.deferredTLASWrites); } else inputs.logger.log("Failed to create Descriptor Pool suited for Layout %s",system::ILogger::ELL_ERROR,layout->getObjectDebugName()); @@ -3415,11 +3414,8 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; igetBoundMemory(); - assert(boundMemory.isValid()); - if (!canHostWriteToMemoryRange(boundMemory,gpuBuff->getSize())) - retval.m_queueFlags |= 
IQueue::FAMILY_FLAGS::TRANSFER_BIT; - retval.m_bufferConversions.push_back({core::smart_refctd_ptr(entry.second.canonicalAsset),gpuBuff.get()}); + auto [where,inserted] = retval.m_bufferConversions.insert({gpuBuff.get(),core::smart_refctd_ptr(entry.second.canonicalAsset)}); + assert(inserted); } bufferConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); // Deal with Deferred Creation of Acceleration structures @@ -3431,7 +3427,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // TLAS and BLAS can't build concurrently, index 0 is device build, 1 is host build size_t scratchSizeFullParallelBuild[2] = {0,0}; // - core::vector>* pConversions; + core::unordered_map::video_t*,SReserveResult::SConvReqAccelerationStructure>* pConversions; if constexpr (IsTLAS) pConversions = retval.m_tlasConversions; else @@ -3482,9 +3478,8 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult continue; } // file the request for conversion - auto& request = pConversions[patch.hostBuild].emplace_back(); + auto& request = pConversions[patch.hostBuild][as.get()]; request.canonical = smart_refctd_ptr(canonical); - request.gpuObj = as.get(); request.scratchSize = deferredParams.scratchSize; request.compact = patch.compactAfterBuild; request.buildFlags = static_cast(patch.getBuildFlags(canonical).value); @@ -3510,20 +3505,14 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult auto& dfsCacheImages = std::get>(dfsCaches); for (auto& entry : imageConversions.contentHashToCanonical) for (auto i=0ull; igetRegions().empty()) { - const auto boundMemory = gpuImg->getBoundMemory(); - assert(boundMemory.isValid()); - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; - const bool recomputeMips = dfsCacheImages.nodes[entry.second.patchIndex.value].patch.recomputeMips; - if (recomputeMips) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; - // Best effort guess, without actually looking at 
all regions - const auto& params = gpuImg->getCreationParameters(); - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 - if (isDepthOrStencilFormat(params.format) && (params.depthUsage|params.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT)) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT; - retval.m_imageConversions.push_back({{core::smart_refctd_ptr(entry.second.canonicalAsset),gpuImg.get()},recomputeMips}); + const auto* cpuImg = entry.second.canonicalAsset; + if (auto& gpuImg=imageConversions.gpuObjects[i+entry.second.firstCopyIx].value; gpuImg && !cpuImg->getRegions().empty()) + { + const bool recomputeMips = dfsCacheImages.nodes[entry.second.patchIndex.value].patch.recomputeMips; + auto [where,inserted] = retval.m_imageConversions.insert({gpuImg.get(),SReserveResult::SConvReqImage{core::smart_refctd_ptr(cpuImg),recomputeMips}}); + assert(inserted); + } } imageConversions.propagateToCaches(dfsCacheImages,std::get>(retval.m_stagingCaches)); } @@ -3584,10 +3573,25 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult auto pruneStaging = [&]()->void { auto& stagingCache = std::get>(retval.m_stagingCaches); - phmap::erase_if(stagingCache,[](const auto& entry)->bool + phmap::erase_if(stagingCache,[&retval](const auto& entry)->bool { if (entry.first->getReferenceCount()==1) + { + if constexpr (std::is_same_v) + retval.m_bufferConversions.erase(entry.first); + if constexpr (std::is_same_v) + { + } + if constexpr (std::is_same_v) + { + } + if constexpr (std::is_same_v) + retval.m_imageConversions.erase(entry.first); + // because Descriptor Sets don't hold onto TLASes yet, we need to drop the TLASes in deferred descriptor writes + if constexpr (std::is_same_v) + retval.m_deferredTLASDescriptorWrites.erase(entry.first); return true; + } return false; } ); @@ -3606,13 +3610,6 @@ auto CAssetConverter::reserve(const SInputs& 
inputs) -> SReserveResult pruneStaging.template operator()(); pruneStaging.template operator()(); pruneStaging.template operator()(); - // because Descriptor Sets don't hold onto TLASes yet, we need to drop the TLASes in deferred descriptor writes - phmap::erase_if(retval.m_deferredTLASDescriptorWrites,[&](const auto& entry)->bool - { - auto& dsStaging = std::get>(retval.m_stagingCaches); - return dsStaging.find(entry.dstSet)==dsStaging.end(); - } - ); pruneStaging.template operator()(); // go over pruneStaging.template operator()(); @@ -3621,7 +3618,35 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // TODO: prune the conversion requests -> maybe change the conversion requests to unordered_map ? - // TODO: only now get the queue flags + // only now get the queue flags + { + using q_fam_f = IQueue::FAMILY_FLAGS; + // images are trickier, we can't finish iterating until all possible flags are there + for (auto it=retval.m_imageConversions.begin(); !retval.m_queueFlags.hasFlags(q_fam_f::TRANSFER_BIT|q_fam_f::COMPUTE_BIT|q_fam_f::GRAPHICS_BIT) && it!=retval.m_imageConversions.end(); it++) + { + const auto boundMemory = it->first->getBoundMemory(); + assert(boundMemory.isValid()); + // Note: with `host_image_copy` this will get conditional + { + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; + // Best effort guess, without actually looking at all regions + const auto& params = it->first->getCreationParameters(); + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 + if (isDepthOrStencilFormat(params.format) && (params.depthUsage | params.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT)) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT; + if (it->second.recomputeMips) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + } + } + // buffer conversions + for (auto it=retval.m_bufferConversions.begin(); 
!retval.m_queueFlags.hasFlags(q_fam_f::TRANSFER_BIT) && it!=retval.m_bufferConversions.end(); it++) + { + const auto boundMemory = it->first->getBoundMemory(); + assert(boundMemory.isValid()); + if (!canHostWriteToMemoryRange(boundMemory,it->first->getSize())) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; + } + } retval.m_converter = core::smart_refctd_ptr(this); retval.m_logger = system::logger_opt_smart_ptr(core::smart_refctd_ptr(inputs.logger.get())); @@ -3648,15 +3673,16 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul { for (; hostBufferXferIt!=reservations.m_bufferConversions.end() && pred(); hostBufferXferIt++) { - const size_t size = hostBufferXferIt->gpuObj->getSize(); - const auto boundMemory = hostBufferXferIt->gpuObj->getBoundMemory(); + IGPUBuffer* buff = hostBufferXferIt->first; + const size_t size = buff->getSize(); + const auto boundMemory = buff->getBoundMemory(); if (!canHostWriteToMemoryRange(boundMemory,size)) continue; auto* const memory = boundMemory.memory; const IDeviceMemoryAllocation::MemoryRange range = {boundMemory.offset,size}; - memcpy(reinterpret_cast(memory->getMappedPointer())+range.offset,hostBufferXferIt->canonical->getPointer(),size); + memcpy(reinterpret_cast(memory->getMappedPointer())+range.offset,hostBufferXferIt->second->getPointer(),size); // let go of canonical asset (may free RAM) - hostBufferXferIt->canonical = nullptr; + hostBufferXferIt->second = nullptr; if (memory->haveToMakeVisible()) memoryHostFlushRanges.emplace_back(memory,range.offset,range.length,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); } @@ -3932,8 +3958,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } for (auto& item : buffersToUpload) { - auto* buffer = item.gpuObj; - const size_t size = item.gpuObj->getCreationParams().size; + auto* buffer = item.first; + const size_t size = buffer->getCreationParams().size; // host will upload if 
(canHostWriteToMemoryRange(buffer->getBoundMemory(),size)) continue; @@ -3942,21 +3968,21 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul const auto ownerQueueFamily = checkOwnership(buffer,params.getFinalOwnerQueueFamily(buffer,*pFoundHash),transferFamily); if (ownerQueueFamily==QueueFamilyInvalid) { - markFailureInStaging("invalid Final Queue Family given by user callback",item.canonical,buffer,pFoundHash); + markFailureInStaging("invalid Final Queue Family given by user callback",item.second,buffer,pFoundHash); continue; } // do the upload const SBufferRange range = {.offset=0,.size=size,.buffer=core::smart_refctd_ptr(buffer)}; - const bool success = params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,item.canonical->getPointer()); + const bool success = params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,item.second->getPointer()); // current recording buffer may have changed xferCmdBuf = params.transfer->getCommandBufferForRecording(); if (!success) { - markFailureInStaging("Data Upload",item.canonical,buffer,pFoundHash); + markFailureInStaging("Data Upload",item.second,buffer,pFoundHash); continue; } // let go of canonical asset (may free RAM) - item.canonical = nullptr; + item.second = nullptr; submitsNeeded |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; // enqueue ownership release if necessary if (ownerQueueFamily!=IQueue::FamilyIgnored) @@ -4116,8 +4142,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul for (auto& item : imagesToUpload) { // basiscs - const auto* cpuImg = item.canonical.get(); - auto* image = item.gpuObj; + auto& cpuImg = item.second.canonical; + auto* image = item.first; auto pFoundHash = findInStaging.template operator()(image); // get params const auto& creationParams = image->getCreationParameters(); @@ -4136,7 +4162,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul }); IGPUImageView::E_TYPE viewType = IGPUImageView::E_TYPE::ET_2D_ARRAY; 
// create Mipmapping source Image View, allocate its place in the descriptor set and write it - if (item.recomputeMips) + if (item.second.recomputeMips) { switch (creationParams.type) { @@ -4168,7 +4194,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } if (!quickWriteDescriptor(SrcMipBinding,srcIx,std::move(srcView))) { - markFailureInStaging("Source Mip Level Descriptor Write",item.canonical,image,pFoundHash); + markFailureInStaging("Source Mip Level Descriptor Write",cpuImg,image,pFoundHash); continue; } } @@ -4177,7 +4203,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul { // Transfer and Compute barriers get recorded for image individually (see the TODO why its horrible) // so we only need to worry about QFOTs for current image if they even exist - if (item.recomputeMips && !transferBarriers.empty()) + if (item.second.recomputeMips && !transferBarriers.empty()) { // so now we need a immeidate QFOT Release cause we already recorded some compute mipmapping for current image if (pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=transferBarriers},"Recording QFOT Release from Transfer Queue Family after overflow failed")) @@ -4189,7 +4215,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } else { - markFailureInStaging("Image QFOT Pipeline Barrier",item.canonical,image,pFoundHash); + markFailureInStaging("Image QFOT Pipeline Barrier",cpuImg,image,pFoundHash); return false; } return true; @@ -4205,6 +4231,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul computeBarriers.clear(); const bool concurrentSharing = image->getCachedCreationParams().isConcurrentSharing(); uint8_t lvl = 0; + const auto recomputeMipMask = item.second.recomputeMips; bool _prevRecompute = false; for (; lvl CAssetConverter::convert_impl(SReserveResul // if any op, it will always be a release (Except acquisition of first source mip in compute) barrier.ownershipOp = ownership_op_t::RELEASE; // 
if we're recomputing this mip level - const bool recomputeMip = lvl && (item.recomputeMips&(0x1u<<(lvl-1))); + const bool recomputeMip = lvl && (recomputeMipMask&(0x1u<<(lvl-1))); // query final layout from callback const auto finalLayout = params.getFinalLayout(image,*pFoundHash,lvl); // get region data for upload @@ -4434,7 +4461,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; // whether next mip will need to read from this one to recompute itself - const bool sourceForNextMipCompute = item.recomputeMips&(0x1u<general transition tmp.newLayout = sourceForNextMipCompute ? layout_t::GENERAL : layout_t::TRANSFER_DST_OPTIMAL; // fire off the pipeline barrier so we can start uploading right away @@ -4503,18 +4530,18 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // failed in the for-loop if (lvl != creationParams.mipLevels) { - markFailureInStaging("Compute Mip Mapping",item.canonical,image,pFoundHash); + markFailureInStaging("Compute Mip Mapping",cpuImg,image,pFoundHash); continue; } // let go of canonical asset (may free RAM) - item.canonical = nullptr; + cpuImg = nullptr; } // here we only record barriers that do final layout transitions and release ownership to final queue family if (!transferBarriers.empty()) { if (!pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=transferBarriers},"Final Pipeline Barrier recording to Transfer Command Buffer failed")) { - markFailureInStaging("Image Data Upload Pipeline Barrier",item.canonical,image,pFoundHash); + markFailureInStaging("Image Data Upload Pipeline Barrier",cpuImg,image,pFoundHash); continue; } // even if no uploads performed, we do layout transitions on empty images from Xfer Queue @@ -4526,7 +4553,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul 
dsAlloc->multi_deallocate(SrcMipBinding,1,&srcIx,params.compute->getFutureScratchSemaphore()); if (!pipelineBarrier(computeCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=computeBarriers},"Final Pipeline Barrier recording to Compute Command Buffer failed")) { - markFailureInStaging("Compute Mip Mapping Pipeline Barrier",item.canonical,image,pFoundHash); + markFailureInStaging("Compute Mip Mapping Pipeline Barrier",cpuImg,image,pFoundHash); continue; } } @@ -4751,17 +4778,18 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul for (auto& tlasToBuild : tlasesToBuild) { dedupBLASesUsed.clear(); - const auto as = tlasToBuild.gpuObj; + auto& canonical = tlasToBuild.second.canonical; + const auto as = tlasToBuild.first; const auto pFoundHash = findInStaging.template operator()(as); const auto& backingRange = as->getCreationParams().bufferRange; // checking ownership for the future on old buffer, but compacted will be made with same sharing creation parameters const auto finalOwnerQueueFamily = checkOwnership(backingRange.buffer.get(),params.getFinalOwnerQueueFamily(as,*pFoundHash),computeFamily); if (finalOwnerQueueFamily==QueueFamilyInvalid) { - markFailureInStaging("invalid Final Queue Family given by user callback",tlasToBuild.canonical,as,pFoundHash); + markFailureInStaging("invalid Final Queue Family given by user callback",canonical,as,pFoundHash); continue; } - const auto instances = tlasToBuild.canonical->getInstances(); + const auto instances = canonical->getInstances(); const auto instanceCount = static_cast(instances.size()); size_t instanceDataSize = 0; // gather total input size and check dependants exist @@ -4779,13 +4807,13 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // problem with finding the dependents (BLASes) if (instanceDataSize==0) { - markFailureInStaging("finding valid Dependant GPU BLASes for TLAS build",tlasToBuild.canonical,as,pFoundHash); + markFailureInStaging("finding valid Dependant GPU BLASes for 
TLAS build",canonical,as,pFoundHash); continue; } // allocate scratch and build inputs constexpr uint32_t MaxAllocCount = 3; addr_t offsets[MaxAllocCount] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value}; - const addr_t sizes[MaxAllocCount] = {tlasToBuild.scratchSize,instanceDataSize,sizeof(void*)*instanceCount}; + const addr_t sizes[MaxAllocCount] = {tlasToBuild.second.scratchSize,instanceDataSize,sizeof(void*)*instanceCount}; { const addr_t alignments[MaxAllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,alignof(uint64_t)}; const auto AllocCount = as->usesMotion() ? 2:3; @@ -4879,16 +4907,16 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul xferCmdBuf = params.transfer->getCommandBufferForRecording(); if (!success) { - markFailureInStaging("Uploading Instance Data for TLAS build failed",tlasToBuild.canonical,as,pFoundHash); + markFailureInStaging("Uploading Instance Data for TLAS build failed",canonical,as,pFoundHash); continue; } // let go of canonical asset (may free RAM) - tlasToBuild.canonical = nullptr; + canonical = nullptr; } // prepare build infos auto& buildInfo = buildInfos.emplace_back(); buildInfo.scratch = {.offset=offsets[0],.buffer=smart_refctd_ptr(scratchBuffer)}; - buildInfo.buildFlags = tlasToBuild.getBuildFlags(); + buildInfo.buildFlags = tlasToBuild.second.getBuildFlags(); buildInfo.instanceDataTypeEncodedInPointersLSB = as->usesMotion(); buildInfo.dstAS = as; // note we don't build directly from staging, because only very small inputs could come from there and they'd impede the transfer efficiency of the larger ones @@ -4905,7 +4933,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // no special extra byte offset into the instance buffer rangeInfos.emplace_back(instanceCount,0u); // - const bool willCompact = tlasToBuild.compact; + const bool willCompact = tlasToBuild.second.compact; if (willCompact) 
compactions.push_back(as); // enqueue ownership release if necessary @@ -5180,13 +5208,18 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul const redirect_t& redirect = layout->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE); const auto bindingRange = redirect.findBindingStorageIndex(redirect_t::storage_offset_t(i)); const auto firstElementOffset = redirect.getStorageOffset(bindingRange).data; - const auto foundWrite = reservations.m_deferredTLASDescriptorWrites.find({ - .dstSet = item.first, - .binding = redirect.getBinding(bindingRange).data, - .arrayElement = i-firstElementOffset - }); - // was scheduled to write some TLAS to this binding, but TLAS is now null - depsMissing = foundWrite!=reservations.m_deferredTLASDescriptorWrites.end() && !foundWrite->tlas; + auto foundSet = reservations.m_deferredTLASDescriptorWrites.find(item.first); + if (foundSet!=reservations.m_deferredTLASDescriptorWrites.end()) + { + const auto foundWrite = foundSet->second.find({ + .binding = redirect.getBinding(bindingRange).data, + .arrayElement = i-firstElementOffset + }); + // was scheduled to write some TLAS to this binding, but TLAS is now null + depsMissing = foundWrite!=foundSet->second.end() && !foundWrite->tlas; + } + else + depsMissing = true; break; } default: @@ -5239,40 +5272,49 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul mergeCache.template operator()(); mergeCache.template operator()(); // write the TLASes into Descriptor Set finally - if (auto& tlasWriteMap=reservations.m_deferredTLASDescriptorWrites; !tlasWriteMap.empty()) + if (auto& tlasWriteDSMap=reservations.m_deferredTLASDescriptorWrites; !tlasWriteDSMap.empty()) { core::vector writes; - writes.reserve(tlasWriteMap.size()); - core::vector infos(writes.size()); - auto* pInfo = infos.data(); - for (auto& inWrite : tlasWriteMap) + core::vector infos; + for (auto& tlasWriteMap : tlasWriteDSMap) { - // I know what I'm doing, this member has no influence on 
the set key hash - auto tlas = core::smart_refctd_ptr(const_cast(inWrite.tlas.get())); - assert(tlas); - if (missingDependent.template operator()(tlas.get())) - continue; - if (const auto foundCompacted=compactedTLASMap.find(tlas.get()); foundCompacted!=compactedTLASMap.end()) - tlas = foundCompacted->second; - pInfo->desc = std::move(tlas); - writes.push_back({ - .dstSet = inWrite.dstSet, - .binding = inWrite.binding, - .arrayElement = inWrite.arrayElement, - .count = 1, - .info = pInfo++ - }); + writes.clear(); + infos.clear(); + auto* dstSet = tlasWriteMap.first; + for (auto& inWrite : tlasWriteMap.second) + { + // I know what I'm doing, this member has no influence on the set key hash or equal comparison operator + auto& tlas = const_cast&>(inWrite.tlas); + assert(tlas); + if (missingDependent.template operator()(tlas.get())) + { + tlas = {}; + continue; + } + if (const auto foundCompacted=compactedTLASMap.find(tlas.get()); foundCompacted!=compactedTLASMap.end()) + tlas = foundCompacted->second; + infos.emplace_back().desc = std::move(tlas); + writes.push_back({ + .dstSet = dstSet, + .binding = inWrite.binding, + .arrayElement = inWrite.arrayElement, + .count = 1 + }); + } + // + auto* pInfo = infos.data(); + for (auto& outWrite : writes) + outWrite.info = pInfo++; + // if the descriptor write fails, we make the Descriptor Sets behave as-if the TLAS build failed (dep is missing) + if (!writes.empty() && !device->updateDescriptorSets(writes,{})) + { + auto* pHash = findInStaging.template operator()(dstSet); + smart_refctd_ptr dummy; + markFailureInStaging("writing TLAS to Descriptor Set binding",dummy,dstSet,pHash); + } } // not strictly necessary, just provoking refcounting bugs right away if they exist compactedTLASMap.clear(); - // if the descriptor write fails, we make the Descriptor Sets behave as-if the TLAS build failed (dep is missing) - if (!writes.empty() && !device->updateDescriptorSets(writes,{})) - for (auto& inWrite : tlasWriteMap) - { - auto* 
pHash = findInStaging.template operator()(inWrite.dstSet); - smart_refctd_ptr dummy; - markFailureInStaging("writing TLAS to Descriptor Set binding",dummy,inWrite.dstSet,pHash); - } } mergeCache.template operator()(); // needed for the IGPUDescriptorSets to check if TLAS exists/was written, can be released now From 69df18a8d115d9a201eff57afc90ce3e5e75f5f2 Mon Sep 17 00:00:00 2001 From: devsh Date: Sun, 11 May 2025 00:11:21 +0200 Subject: [PATCH 060/346] save progress before attempting to remove `m_deferredTLASDescriptorWrites` --- include/nbl/video/utilities/CAssetConverter.h | 10 +- src/nbl/video/utilities/CAssetConverter.cpp | 117 +++++++++++------- 2 files changed, 81 insertions(+), 46 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 12326acc6c..e309a24fc3 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -1103,11 +1103,13 @@ class CAssetConverter : public core::IReferenceCounted uint64_t scratchSize : 45; uint64_t compact : 1; uint64_t buildFlags : 16 = 0; + // scratch + input size also accounting for worst case padding due to alignment + uint64_t buildSize; }; - using SConvReqBLAS = SConvReqAccelerationStructure; - core::unordered_map m_blasConversions[2]; - using SConvReqTLAS = SConvReqAccelerationStructure; - core::unordered_map m_tlasConversions[2]; + template + using SConvReqAccelerationStructureMap = core::unordered_map::video_t*,SConvReqAccelerationStructure>; + SConvReqAccelerationStructureMap m_blasConversions[2]; + SConvReqAccelerationStructureMap m_tlasConversions[2]; // array index 0 for device builds, 1 for host builds uint64_t m_minASBuildScratchSize[2] = {0,0}; diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index b90be0b323..4fadb1ee7f 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1654,10 
+1654,6 @@ class GetDependantVisit; template<> class GetDependantVisit : public GetDependantVisitBase { - public: - // TODO: deal with usages not going through because of cancelled TLAS builds, by gathering in a top-down pass at the end of `reserve` - CAssetConverter::SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap = nullptr; - protected: bool descend_impl( const instance_t& user, const CAssetConverter::patch_t& userPatch, @@ -1668,16 +1664,6 @@ class GetDependantVisit : public GetDependant auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; - if (blasBuildMap) - { - const auto instances = user.asset->getInstances(); - assert(instanceIndexfind(dep.asset); - if (foundBLAS!=blasBuildMap->end()) - foundBLAS->second.remainingUsages++; - else - blasBuildMap->insert(foundBLAS,{dep.asset,{depObj}}); - } return true; } }; @@ -1958,9 +1944,13 @@ class GetDependantVisit : public GetDependantVisitBase) { - const auto [where,inserted] =deferredTLASWrites.insert({binding.data,element,depObj}); - assert(inserted); - return true; + // not built yet? 
+ if (depObj->) + { + const auto [where,inserted] = deferredTLASWrites.insert({binding.data,element,depObj}); + assert(inserted); + return true; + } } // auto& outInfo = infos.emplace_back(); @@ -3420,19 +3410,16 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult bufferConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); // Deal with Deferred Creation of Acceleration structures { - const auto minScratchAlignment = device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; auto createAccelerationStructures = [&]()->void { constexpr bool IsTLAS = std::is_same_v; - // TLAS and BLAS can't build concurrently, index 0 is device build, 1 is host build - size_t scratchSizeFullParallelBuild[2] = {0,0}; // - core::unordered_map::video_t*,SReserveResult::SConvReqAccelerationStructure>* pConversions; + SReserveResult::SConvReqAccelerationStructureMap* pConversions; if constexpr (IsTLAS) pConversions = retval.m_tlasConversions; else pConversions = retval.m_blasConversions; - // we collect that stats AFTER making sure that the BLAS / TLAS can actually be created + // we enqueue the conversions AFTER making sure that the BLAS / TLAS can actually be created for (size_t i=0; i SReserveResult { // check if the BLASes we want to use for the instances were successfully allocated and created AssetVisitor> visitor = { - {inputs,dfsCaches,&retval.m_blasBuildMap}, + {inputs,dfsCaches}, {canonical,deferredParams.uniqueCopyGroupID}, patch }; @@ -3483,23 +3470,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult request.scratchSize = deferredParams.scratchSize; request.compact = patch.compactAfterBuild; request.buildFlags = static_cast(patch.getBuildFlags(canonical).value); - // sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently - retval.m_minASBuildScratchSize[patch.hostBuild] = 
core::max(retval.m_minASBuildScratchSize[patch.hostBuild],deferredParams.buildSize); - scratchSizeFullParallelBuild[patch.hostBuild] += deferredParams.buildSize; - // note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build - if (patch.compactAfterBuild) - retval.m_compactedASMaxMemory += bufSz; + request.buildSize = deferredParams.buildSize; } - retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild[0],retval.m_maxASBuildScratchSize[0]); - retval.m_maxASBuildScratchSize[1] = core::max(scratchSizeFullParallelBuild[1],retval.m_maxASBuildScratchSize[1]); }; createAccelerationStructures.template operator()(); blasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); createAccelerationStructures.template operator()(); tlasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); - // - if (retval.willDeviceASBuild() || retval.willCompactAS()) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; } // find out which images need what caps for the transfer and mipmapping auto& dfsCacheImages = std::get>(dfsCaches); @@ -3580,11 +3557,11 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if constexpr (std::is_same_v) retval.m_bufferConversions.erase(entry.first); if constexpr (std::is_same_v) - { - } + for (auto i=0; i<2; i++) + retval.m_blasConversions[i].erase(entry.first); if constexpr (std::is_same_v) - { - } + for (auto i=0; i<2; i++) + retval.m_tlasConversions[i].erase(entry.first); if constexpr (std::is_same_v) retval.m_imageConversions.erase(entry.first); // because Descriptor Sets don't hold onto TLASes yet, we need to drop the TLASes in deferred descriptor writes @@ -3592,6 +3569,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult retval.m_deferredTLASDescriptorWrites.erase(entry.first); return true; } + // still referenced, keep it around return false; } ); @@ -3611,16 
+3589,71 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult pruneStaging.template operator()(); pruneStaging.template operator()(); pruneStaging.template operator()(); -// go over + // go over future TLAS builds to gather used BLASes + for (auto i=0; i<2; i++) + for (const auto& req : retval.m_tlasConversions[i]) + { + auto* const cpuTLAS = req.second.canonical.get(); + assert(cpuTLAS); + for (const auto& instance : cpuTLAS->getInstances()) + { + auto* const cpuBLAS = instance.getBase().blas.get(); + auto foundBLAS = retval.m_blasBuildMap.find(cpuBLAS); + if (foundBLAS!=retval.m_blasBuildMap.end()) + foundBLAS->second.remainingUsages++; + else + { + smart_refctd_ptr gpuBLAS; +// TODO + retval.m_blasBuildMap.insert(foundBLAS,{cpuBLAS,{std::move(gpuBLAS),1,1}}); + } + } + } pruneStaging.template operator()(); pruneStaging.template operator()(); } - // TODO: prune the conversion requests -> maybe change the conversion requests to unordered_map ? - // only now get the queue flags { using q_fam_f = IQueue::FAMILY_FLAGS; + // acceleration structures, get scratch size + auto computeAccelerationStructureScratchSizes = [device,&retval]()->void + { + constexpr bool IsTLAS = std::is_same_v; + const auto& limits = device->getPhysicalDevice()->getLimits(); + const auto minScratchAlignment = limits.minAccelerationStructureScratchOffsetAlignment; + // index 0 is device build, 1 is host build + size_t scratchSizeFullParallelBuild[2] = {0,0}; + // + const SReserveResult::SConvReqAccelerationStructureMap* pConversions; + if constexpr (IsTLAS) + pConversions = retval.m_tlasConversions; + else + pConversions = retval.m_blasConversions; + // we collect the stats AFTER making sure only needed TLAS and BLAS will be built + for (auto i=0; i<2; i++) + for (auto req : pConversions[i]) + { + const auto buildSize = req.second.buildSize; + // sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently + retval.m_minASBuildScratchSize[i] = 
core::max(retval.m_minASBuildScratchSize[i],buildSize); + scratchSizeFullParallelBuild[i] = core::alignUp(scratchSizeFullParallelBuild[i],minScratchAlignment)+buildSize; + // note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build + if (req.second.compact) + { + const auto asSize = req.first->getCreationParams().bufferRange.size; + assert(core::is_aligned_to(asSize,256)); + retval.m_compactedASMaxMemory += asSize; + } + } + // TLAS and BLAS can't build concurrently + retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild[0],retval.m_maxASBuildScratchSize[0]); + retval.m_maxASBuildScratchSize[1] = core::max(scratchSizeFullParallelBuild[1],retval.m_maxASBuildScratchSize[1]); + }; + computeAccelerationStructureScratchSizes.template operator()(); + computeAccelerationStructureScratchSizes.template operator()(); + if (retval.willDeviceASBuild() || retval.willCompactAS()) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; // images are trickier, we can't finish iterating until all possible flags are there for (auto it=retval.m_imageConversions.begin(); !retval.m_queueFlags.hasFlags(q_fam_f::TRANSFER_BIT|q_fam_f::COMPUTE_BIT|q_fam_f::GRAPHICS_BIT) && it!=retval.m_imageConversions.end(); it++) { @@ -3632,7 +3665,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // Best effort guess, without actually looking at all regions const auto& params = it->first->getCreationParameters(); // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 - if (isDepthOrStencilFormat(params.format) && (params.depthUsage | params.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT)) + if (isDepthOrStencilFormat(params.format) && (params.depthUsage|params.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT)) retval.m_queueFlags |= 
IQueue::FAMILY_FLAGS::GRAPHICS_BIT; if (it->second.recomputeMips) retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; From 99e473d984e1be5f9bac070aac31ce4879c08fe2 Mon Sep 17 00:00:00 2001 From: devsh Date: Sun, 11 May 2025 01:57:40 +0200 Subject: [PATCH 061/346] Ok, so descriptor sets can actually track TLASes which are yet-unbuilt, makes life a lot easier. add `IGPUTopLevelAccelerationStructure::getPendingBuildVer()` to detect if TLAS built yet also make sure the maxInstanceCount gets hashed properly --- include/nbl/video/IGPUAccelerationStructure.h | 2 + include/nbl/video/utilities/CAssetConverter.h | 16 ++- src/nbl/video/utilities/CAssetConverter.cpp | 131 +++++++----------- 3 files changed, 59 insertions(+), 90 deletions(-) diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h index c3a24080d0..60c6add5fb 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -667,6 +667,8 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // using build_ver_t = uint32_t; + // + inline build_ver_t getPendingBuildVer() const {return m_pendingBuildVer;} // this gets called when execution is sure to happen 100%, e.g. 
not during command recording but during submission inline build_ver_t registerNextBuildVer() { diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index e309a24fc3..f7faa9598b 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -1131,22 +1131,24 @@ class CAssetConverter : public core::IReferenceCounted { inline bool operator==(const SDeferredTLASWrite& other) const { - return binding==other.binding && arrayElement==other.arrayElement; + return dstSet==other.dstSet && storageOffset.data==other.storageOffset.data; } - uint32_t binding; - uint32_t arrayElement; - core::smart_refctd_ptr tlas; + IGPUDescriptorSet* dstSet; + // binding and array element rolled up into one + IGPUDescriptorSetLayout::CBindingRedirect::storage_offset_t storageOffset; }; struct SDeferredTLASWriteHasher { inline size_t operator()(const SDeferredTLASWrite& write) const { - return std::hash()((uint64_t(write.binding)<<32)|write.arrayElement); + size_t retval = write.storageOffset.data; + core::hash_combine(retval,write.dstSet); + return retval; } }; - using deferred_tlas_write_set_t = core::unordered_set; - core::unordered_map m_deferredTLASDescriptorWrites; + using compacted_tlas_rewrite_set_t = core::unordered_set; + compacted_tlas_rewrite_set_t m_potentialTLASRewrites; // core::bitflag m_queueFlags = IQueue::FAMILY_FLAGS::NONE; diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 4fadb1ee7f..733be3f058 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -612,8 +612,9 @@ class AssetVisitor : public CRTP const IDescriptorSetLayoutBase::CBindingRedirect::storage_range_index_t storageRangeIx(j); const auto binding = redirect.getBinding(storageRangeIx); const uint32_t count = redirect.getCount(storageRangeIx); - // this is where the descriptors have their flattened 
place in a unified array - const auto* infos = allInfos.data()+redirect.getStorageOffset(storageRangeIx).data; + // this is where the descriptors have their flattened place in a unified array + const auto storageBaseOffset = redirect.getStorageOffset(storageRangeIx); + const auto* infos = allInfos.data()+storageBaseOffset.data; for (uint32_t el=0u; el(untypedDesc); - if (!descend(tlas,{tlas},type,binding,el)) + if (!descend(tlas,{tlas},type,binding,el,storageBaseOffset)) return false; break; } @@ -1164,6 +1165,7 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_thostBuild; hasher << lookup.patch->compactAfterBuild; + hasher << (lookup.patch->isMotion ? lookup.patch->maxInstances:0u); const auto instances = asset->getInstances(); hasher << instances.size(); AssetVisitor> visitor = { @@ -1883,7 +1885,7 @@ class GetDependantVisit : public GetDependantVisitBase writes = {}; core::vector infos = {}; - CAssetConverter::SReserveResult::deferred_tlas_write_set_t deferredTLASWrites; + core::vector potentialTLASRewrites = {}; // has to be public because of aggregate init, but its only for internal usage! uint32_t lastBinding; uint32_t lastElement; @@ -1941,17 +1943,6 @@ class GetDependantVisit : public GetDependantVisitBase) - { - // not built yet? 
- if (depObj->) - { - const auto [where,inserted] = deferredTLASWrites.insert({binding.data,element,depObj}); - assert(inserted); - return true; - } - } // auto& outInfo = infos.emplace_back(); outInfo.desc = std::move(depObj); @@ -1962,10 +1953,18 @@ class GetDependantVisit : public GetDependantVisitBase(argTuple); - outInfo.info.buffer.offset= std::get<0>(argTuple).offset; + outInfo.info.buffer.offset = std::get<0>(argTuple).offset; outInfo.info.buffer.size = std::get<0>(argTuple).size; } } + // mark potential TLAS rewrites (with compaction) so we don't have to scan entire descriptor set for potentially compacted TLASes + if constexpr (std::is_same_v) + if (depObj->getPendingBuildVer()==0) // means not built yet, so compactable by next `convert` run + { + auto storageOffset = std::get<0>(argTuple); + storageOffset.data += element; + potentialTLASRewrites.push_back(storageOffset); + } if constexpr (std::is_same_v) { outInfo.info.image.imageLayout = std::get<0>(argTuple); @@ -3366,7 +3365,8 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult ds = nullptr; } else - retval.m_deferredTLASDescriptorWrites[ds.get()] = std::move(visitor.deferredTLASWrites); + for (const auto storageIx : visitor.potentialTLASRewrites) + retval.m_potentialTLASRewrites.insert({ds.get(),storageIx}); } else inputs.logger.log("Failed to create Descriptor Pool suited for Layout %s",system::ILogger::ELL_ERROR,layout->getObjectDebugName()); @@ -3453,9 +3453,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult ); continue; } - // is there any reason for it to be more? 
- const uint32_t maxInstances = canonical->getInstances().size(); - as = device->createTopLevelAccelerationStructure({std::move(baseParams),maxInstances}); + as = device->createTopLevelAccelerationStructure({std::move(baseParams),patch.maxInstances}); } else as = device->createBottomLevelAccelerationStructure(std::move(baseParams)); @@ -3564,9 +3562,6 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult retval.m_tlasConversions[i].erase(entry.first); if constexpr (std::is_same_v) retval.m_imageConversions.erase(entry.first); - // because Descriptor Sets don't hold onto TLASes yet, we need to drop the TLASes in deferred descriptor writes - if constexpr (std::is_same_v) - retval.m_deferredTLASDescriptorWrites.erase(entry.first); return true; } // still referenced, keep it around @@ -3604,7 +3599,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult else { smart_refctd_ptr gpuBLAS; -// TODO +// TODO: figure out the BLAS that will be used, (this requires UUID) retval.m_blasBuildMap.insert(foundBLAS,{cpuBLAS,{std::move(gpuBLAS),1,1}}); } } @@ -5231,30 +5226,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); break; case asset::IDescriptor::EC_ACCELERATION_STRUCTURE: - { - const auto* tlas = static_cast(untypedDesc); - // successfully written a TLAS into the binding, nothing to check - if (tlas) - break; - // we have a null TLAS in the binding, and we have to check if we were supposed to have one in it - using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect; - const redirect_t& redirect = layout->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE); - const auto bindingRange = redirect.findBindingStorageIndex(redirect_t::storage_offset_t(i)); - const auto firstElementOffset = redirect.getStorageOffset(bindingRange).data; - auto foundSet = reservations.m_deferredTLASDescriptorWrites.find(item.first); - if 
(foundSet!=reservations.m_deferredTLASDescriptorWrites.end()) - { - const auto foundWrite = foundSet->second.find({ - .binding = redirect.getBinding(bindingRange).data, - .arrayElement = i-firstElementOffset - }); - // was scheduled to write some TLAS to this binding, but TLAS is now null - depsMissing = foundWrite!=foundSet->second.end() && !foundWrite->tlas; - } - else - depsMissing = true; + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); break; - } default: assert(false); depsMissing = true; @@ -5305,53 +5278,45 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul mergeCache.template operator()(); mergeCache.template operator()(); // write the TLASes into Descriptor Set finally - if (auto& tlasWriteDSMap=reservations.m_deferredTLASDescriptorWrites; !tlasWriteDSMap.empty()) + if (auto& tlasRewriteSet=reservations.m_potentialTLASRewrites; !tlasRewriteSet.empty()) { core::vector writes; - core::vector infos; - for (auto& tlasWriteMap : tlasWriteDSMap) + writes.reserve(tlasRewriteSet.size()); + core::vector infos(tlasRewriteSet.size()); + auto* pInfo = infos.data(); + for (auto& entry : tlasRewriteSet) { - writes.clear(); - infos.clear(); - auto* dstSet = tlasWriteMap.first; - for (auto& inWrite : tlasWriteMap.second) + auto* const dstSet = entry.dstSet; + // we need to check if the descriptor set itself didn't get deleted in the meantime + auto& stagingCache = std::get>(reservations.m_stagingCaches); + const auto found = stagingCache.find(dstSet); + if (found==stagingCache.end()) + continue; + // rewtrieve the binding from the TLAS + const auto* const tlas = static_cast(dstSet->getAllDescriptors(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE)[entry.storageOffset.data].get()); + assert(tlas); + // only rewrite if successfully compacted + if (const auto foundCompacted=compactedTLASMap.find(tlas); foundCompacted!=compactedTLASMap.end()) { - // I know what I'm doing, this member has no influence on the set key hash or equal 
comparison operator - auto& tlas = const_cast&>(inWrite.tlas); - assert(tlas); - if (missingDependent.template operator()(tlas.get())) - { - tlas = {}; - continue; - } - if (const auto foundCompacted=compactedTLASMap.find(tlas.get()); foundCompacted!=compactedTLASMap.end()) - tlas = foundCompacted->second; - infos.emplace_back().desc = std::move(tlas); + pInfo->desc = foundCompacted->second; + using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect; + const redirect_t& redirect = dstSet->getLayout()->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE); + const auto bindingRange = redirect.findBindingStorageIndex(entry.storageOffset); + const auto firstElementOffset = redirect.getStorageOffset(bindingRange); writes.push_back({ .dstSet = dstSet, - .binding = inWrite.binding, - .arrayElement = inWrite.arrayElement, - .count = 1 + .binding = redirect.getBinding(bindingRange).data, + .arrayElement = entry.storageOffset.data-firstElementOffset.data, + .count = 1, + .info = pInfo++ }); } - // - auto* pInfo = infos.data(); - for (auto& outWrite : writes) - outWrite.info = pInfo++; - // if the descriptor write fails, we make the Descriptor Sets behave as-if the TLAS build failed (dep is missing) - if (!writes.empty() && !device->updateDescriptorSets(writes,{})) - { - auto* pHash = findInStaging.template operator()(dstSet); - smart_refctd_ptr dummy; - markFailureInStaging("writing TLAS to Descriptor Set binding",dummy,dstSet,pHash); - } } - // not strictly necessary, just provoking refcounting bugs right away if they exist - compactedTLASMap.clear(); + // if the descriptor write fails, we make the Descriptor Sets behave as-if the TLAS build failed (dep is missing) + if (!writes.empty() && !device->updateDescriptorSets(writes,{})) + logger.log("Failed to write one of the compacted TLASes into a Descriptor Set, all Descriptor Sets will still use non-compacted TLASes",system::ILogger::ELL_ERROR); } mergeCache.template operator()(); - // needed for the 
IGPUDescriptorSets to check if TLAS exists/was written, can be released now - reservations.m_deferredTLASDescriptorWrites.clear(); // mergeCache.template operator()(); // no submit was necessary, so should signal the extra semaphores from the host From 8c549fb7105637649b6b71ae2bb481b8729daa4a Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 12 May 2025 17:40:48 +0700 Subject: [PATCH 062/346] Add computeDependants virtual function to IAsset --- include/nbl/asset/IAsset.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index 3b8b123ce3..3802536029 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -169,6 +169,8 @@ class IAsset : virtual public core::IReferenceCounted return retval; } + virtual core::unordered_set computeDependants() const = 0; + virtual bool valid() const = 0; protected: From 01c4ac66ad760c843853eb1dfb9bc18fbf6a4bd0 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 12 May 2025 17:41:22 +0700 Subject: [PATCH 063/346] Implement computeDependants for ICPUComputePipeline --- include/nbl/asset/ICPUComputePipeline.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 5f933878b4..aa7656af86 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -37,6 +37,11 @@ class ICPUComputePipeline final : public ICPUPipeline computeDependants() const override + { + return {m_layout.get(), m_specInfo.shader.get()}; + } + inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) const override final { if (stage==hlsl::ShaderStage::ESS_COMPUTE && isMutable()) From d9efa1a60e17995271a966ffdc20d93f4490fa53 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 12 May 2025 17:41:56 +0700 Subject: [PATCH 064/346] Implement compute pipeline base --- include/nbl/asset/ICPUComputePipeline.h | 9 ++-- include/nbl/asset/IComputePipeline.h | 56 
+++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 4 deletions(-) create mode 100644 include/nbl/asset/IComputePipeline.h diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index aa7656af86..01859e0c3f 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -6,15 +6,16 @@ #include "nbl/asset/ICPUPipeline.h" +#include "nbl/asset/IComputePipeline.h" namespace nbl::asset { //! CPU Version of Compute Pipeline -class ICPUComputePipeline final : public ICPUPipeline> +class ICPUComputePipeline final : public ICPUPipeline> { - using base_t = ICPUPipeline>; + using base_t = ICPUPipeline>; public: @@ -26,7 +27,7 @@ class ICPUComputePipeline final : public ICPUPipeline clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { - auto newPipeline = new ICPUComputePipeline(std::move(layout)); + auto newPipeline = new ICPUComputePipeline(layout.get()); newPipeline->m_specInfo = m_specInfo.clone(depth); return core::smart_refctd_ptr(newPipeline, core::dont_grab); } @@ -73,7 +74,7 @@ class ICPUComputePipeline final : public ICPUPipeline(layout)) + base_t(layout, {}) {} }; diff --git a/include/nbl/asset/IComputePipeline.h b/include/nbl/asset/IComputePipeline.h new file mode 100644 index 0000000000..4f439d7100 --- /dev/null +++ b/include/nbl/asset/IComputePipeline.h @@ -0,0 +1,56 @@ +#ifndef _NBL_ASSET_I_COMPUTE_PIPELINE_H_INCLUDED_ +#define _NBL_ASSET_I_COMPUTE_PIPELINE_H_INCLUDED_ + +#include "nbl/asset/IPipeline.h" + +namespace nbl::asset +{ + +class IComputePipelineBase : public virtual core::IReferenceCounted +{ + public: + // Nabla requires device's reported subgroup size to be between 4 and 128 + enum class SUBGROUP_SIZE : uint8_t + { + // No constraint but probably means `gl_SubgroupSize` is Dynamically Uniform + UNKNOWN = 0, + // Allows the Subgroup Uniform `gl_SubgroupSize` to be non-Dynamically Uniform and vary between Device's min and max + VARYING 
= 1, + // The rest we encode as log2(x) of the required value + REQUIRE_4 = 2, + REQUIRE_8 = 3, + REQUIRE_16 = 4, + REQUIRE_32 = 5, + REQUIRE_64 = 6, + REQUIRE_128 = 7 + }; + + struct SCachedCreationParams final + { + SUBGROUP_SIZE requiredSubgroupSize : 3 = SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement + uint8_t requireFullSubgroups : 1 = false; + }; +}; + +template +class IComputePipeline : public IPipeline, public IComputePipelineBase +{ + using base_creation_params_t = IPipeline; + + public: + + inline const SCachedCreationParams& getCachedCreationParams() const { return m_params; } + + protected: + explicit IComputePipeline(const PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : + IPipeline(core::smart_refctd_ptr(layout)), + m_params(cachedParams) + {} + + SCachedCreationParams m_params; + +}; + +} + +#endif From 4d5097b81eb79dd71ccf630696921a715babeaa8 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 12 May 2025 16:08:04 +0200 Subject: [PATCH 065/346] finish the Acceleration Structure `CAssetConverter::reserve` --- include/nbl/video/utilities/CAssetConverter.h | 32 ++-- src/nbl/video/utilities/CAssetConverter.cpp | 138 +++++++++--------- 2 files changed, 84 insertions(+), 86 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index f7faa9598b..d9ace6226e 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -1100,32 +1100,30 @@ class CAssetConverter : public core::IReferenceCounted inline build_f getBuildFlags() const {return static_cast(buildFlags);} core::smart_refctd_ptr canonical = nullptr; - uint64_t scratchSize : 45; - uint64_t compact : 1; + uint64_t scratchSize : 47 = 0; uint64_t buildFlags : 16 = 0; + uint64_t compact : 1; // scratch + input size also accounting for worst case padding due to alignment uint64_t buildSize; }; - template - using SConvReqAccelerationStructureMap = 
core::unordered_map::video_t*,SConvReqAccelerationStructure>; - SConvReqAccelerationStructureMap m_blasConversions[2]; - SConvReqAccelerationStructureMap m_tlasConversions[2]; + using SConvReqBLASMap = core::unordered_map>; + SConvReqBLASMap m_blasConversions[2]; + struct SConvReqTLAS : SConvReqAccelerationStructure + { + // This tracks non-root BLASes which are needed for a subsequent TLAS build. + // Because the copy group ID of the BLAS can only depend on the copy group and pointer of the TLAS and BLAS, + // we can be sure that all instances of the same BLAS within a TLAS will have the same copy group ID and use a map instead of a vector for storage + // Note that even things which are NOT in the staging cache are tracked here to make sure they don't finish their lifetimes prematurely. + using cpu_to_gpu_blas_map_t = core::unordered_map>; + cpu_to_gpu_blas_map_t instanceMap; + }; + using SConvReqTLASMap = core::unordered_map; + SConvReqTLASMap m_tlasConversions[2]; // array index 0 for device builds, 1 for host builds uint64_t m_minASBuildScratchSize[2] = {0,0}; uint64_t m_maxASBuildScratchSize[2] = {0,0}; uint64_t m_compactedASMaxMemory = 0; - // This tracks non-root BLASes which are needed for a subsequent TLAS build. Note that even things which are NOT in the staging cache are tracked here to make sure they don't finish their lifetimes early. 
- struct BLASUsedInTLASBuild - { - // This is the BLAS meant to be used for the instance, note that compaction of a BLAS overwrites the initial values at the end of `reserve` - core::smart_refctd_ptr gpuBLAS; - uint64_t buildDuringConvertCall : 1 = false; - // internal micro-refcount which lets us know when we should remove the entry from the map below - uint64_t remainingUsages : 63 = 0; - }; - using cpu_to_gpu_blas_map_t = core::unordered_map; - cpu_to_gpu_blas_map_t m_blasBuildMap; // struct SDeferredTLASWrite { diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 733be3f058..7bfd361e94 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -445,7 +445,7 @@ class AssetVisitor : public CRTP } private: - // there is no `impl()` overload taking `ICPUTopLevelAccelerationStructure` same as there is no `ICPUmage` + // there is no `impl()` overload taking `ICPUBottomLevelAccelerationStructure` same as there is no `ICPUmage` inline bool impl(const instance_t& instance, const CAssetConverter::patch_t& userPatch) { const auto blasInstances = instance.asset->getInstances(); @@ -1656,6 +1656,9 @@ class GetDependantVisit; template<> class GetDependantVisit : public GetDependantVisitBase { + public: + CAssetConverter::SReserveResult::SConvReqTLAS::cpu_to_gpu_blas_map_t* instanceMap; + protected: bool descend_impl( const instance_t& user, const CAssetConverter::patch_t& userPatch, @@ -1666,6 +1669,7 @@ class GetDependantVisit : public GetDependant auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; + instanceMap->operator[](dep.asset) = std::move(depObj); return true; } }; @@ -3397,9 +3401,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // now allocate the memory for buffers and images deferredAllocator.finalize(); - // TODO: everything below is slightly wrong due to not having a final top-down dependency checking pass 
throwing away useless non-root GPU subtrees - - // find out which buffers need to be uploaded via a staging buffer + // enqueue successfully created buffers for conversion for (auto& entry : bufferConversions.contentHashToCanonical) for (auto i=0ull; i SReserveResult { constexpr bool IsTLAS = std::is_same_v; // - SReserveResult::SConvReqAccelerationStructureMap* pConversions; + std::conditional_t* pConversions; if constexpr (IsTLAS) pConversions = retval.m_tlasConversions; else @@ -3437,11 +3439,12 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }; } smart_refctd_ptr::video_t> as; + CAssetConverter::SReserveResult::SConvReqTLAS::cpu_to_gpu_blas_map_t blasInstanceMap; if constexpr (IsTLAS) { // check if the BLASes we want to use for the instances were successfully allocated and created AssetVisitor> visitor = { - {inputs,dfsCaches}, + {inputs,dfsCaches,&blasInstanceMap}, {canonical,deferredParams.uniqueCopyGroupID}, patch }; @@ -3469,6 +3472,8 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult request.compact = patch.compactAfterBuild; request.buildFlags = static_cast(patch.getBuildFlags(canonical).value); request.buildSize = deferredParams.buildSize; + if constexpr (IsTLAS) + request.instanceMap = std::move(blasInstanceMap); } }; createAccelerationStructures.template operator()(); @@ -3476,7 +3481,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult createAccelerationStructures.template operator()(); tlasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); } - // find out which images need what caps for the transfer and mipmapping + // enqueue successfully created images with data to upload for conversion auto& dfsCacheImages = std::get>(dfsCaches); for (auto& entry : imageConversions.contentHashToCanonical) for (auto i=0ull; i SReserveResult pruneStaging.template operator()(); pruneStaging.template operator()(); pruneStaging.template operator()(); - // go over 
future TLAS builds to gather used BLASes - for (auto i=0; i<2; i++) - for (const auto& req : retval.m_tlasConversions[i]) - { - auto* const cpuTLAS = req.second.canonical.get(); - assert(cpuTLAS); - for (const auto& instance : cpuTLAS->getInstances()) - { - auto* const cpuBLAS = instance.getBase().blas.get(); - auto foundBLAS = retval.m_blasBuildMap.find(cpuBLAS); - if (foundBLAS!=retval.m_blasBuildMap.end()) - foundBLAS->second.remainingUsages++; - else - { - smart_refctd_ptr gpuBLAS; -// TODO: figure out the BLAS that will be used, (this requires UUID) - retval.m_blasBuildMap.insert(foundBLAS,{cpuBLAS,{std::move(gpuBLAS),1,1}}); - } - } - } pruneStaging.template operator()(); pruneStaging.template operator()(); } @@ -3620,7 +3605,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // index 0 is device build, 1 is host build size_t scratchSizeFullParallelBuild[2] = {0,0}; // - const SReserveResult::SConvReqAccelerationStructureMap* pConversions; + const std::conditional_t* pConversions; if constexpr (IsTLAS) pConversions = retval.m_tlasConversions; else @@ -3755,7 +3740,25 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } }; - // compacted TLASes need to be substituted in cache and Descriptor Sets + // want to check if deps successfully exist + struct SMissingDependent + { + // This only checks if whether we had to convert and failed, but the dependent might be in readCache of one or more converters, so if in doubt assume its okay + inline operator bool() const {return wasInStaging && gotWiped;} + + bool wasInStaging; + bool gotWiped; + }; + auto missingDependent = [&reservations](const typename asset_traits::video_t* dep)->SMissingDependent + { + auto& stagingCache = std::get>(reservations.m_stagingCaches); + auto found = stagingCache.find(const_cast::video_t*>(dep)); + SMissingDependent retval = {.wasInStaging=found!=stagingCache.end()}; + retval.gotWiped = retval.wasInStaging && 
found->second.value==CHashCache::NoContentHash; + return retval; + }; + + // Descriptor Sets need their TLAS descriptors substituted if they've been compacted core::unordered_map> compactedTLASMap; // Anything to do? auto reqQueueFlags = reservations.m_queueFlags; @@ -4672,6 +4675,9 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT }; + // compacted BLASes need to be substituted in cache and TLAS Build Inputs + using compacted_blas_map_t = core::unordered_map>; + compacted_blas_map_t compactedBLASMap; // Device BLAS builds if (blasCount) { @@ -4749,7 +4755,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul computeCmdBuf->cmdbuf->endDebugMarker(); { // the already compacted BLASes need to be written into the TLASes using them, want to swap them out ASAP -//reservations.m_blasBuildMap[canonical].gpuBLAS = compacted; +//compactedBLASMap[as] = compacted; } computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact BLASes END"); computeCmdBuf->cmdbuf->endDebugMarker(); @@ -4801,11 +4807,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul using scratch_allocator_t = std::remove_reference_t; using addr_t = typename scratch_allocator_t::size_type; const auto& limits = physDev->getLimits(); - core::unordered_set> dedupBLASesUsed; - dedupBLASesUsed.reserve(reservations.m_blasBuildMap.size()); for (auto& tlasToBuild : tlasesToBuild) { - dedupBLASesUsed.clear(); auto& canonical = tlasToBuild.second.canonical; const auto as = tlasToBuild.first; const auto pFoundHash = findInStaging.template operator()(as); @@ -4819,19 +4822,30 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } const auto instances = canonical->getInstances(); const auto instanceCount = static_cast(instances.size()); + const auto& instanceMap = tlasToBuild.second.instanceMap; size_t instanceDataSize = 0; // gather total input size and check dependants exist + bool 
dependsOnBLASBuilds = false; for (const auto& instance : instances) { - // failed BLAS builds erase themselves from this map, so this checks if some BLAS used but which had to be built failed the build - const auto found = reservations.m_blasBuildMap.find(instance.getBase().blas.get()); - if (found==reservations.m_blasBuildMap.end() || failedBLASBarrier && found->second.buildDuringConvertCall) + auto found = instanceMap.find(instance.getBase().blas.get()); + assert(instanceMap.end()!=found); + const auto depInfo = missingDependent.template operator()(found->second.get()); + if (depInfo) { instanceDataSize = 0; break; } + if (depInfo.wasInStaging) + dependsOnBLASBuilds; instanceDataSize += ITopLevelAccelerationStructure::getInstanceSize(instance.getType()); } + // problem with building some Dependent BLASes + if (failedBLASBarrier && dependsOnBLASBuilds) + { + markFailureInStaging("building BLASes which current TLAS build wants to instance",canonical,as,pFoundHash); + continue; + } // problem with finding the dependents (BLASes) if (instanceDataSize==0) { @@ -4862,6 +4876,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul params.scratchForDeviceASBuild->multi_deallocate(AllocCount,&offsets[0],&sizes[0],params.compute->getFutureScratchSemaphore()); } // stream the instance/geometry input in + const size_t trackedBLASesOffset = trackedBLASes.size(); { bool success = true; { @@ -4881,27 +4896,30 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul const auto newWritten = bytesWritten+size; if (newWritten>=blockSize) return bytesWritten; - auto found = blasBuildMap->find(instance.getBase().blas.get()); - assert(found!=blasBuildMap->end()); - const auto& blas = found->second.gpuBLAS; - dst = IGPUTopLevelAccelerationStructure::writeInstance(dst,instance,blas.get()->getReferenceForDeviceOperations()); - dedupBLASesUsed->emplace(blas); - if (--found->second.remainingUsages == 0) - blasBuildMap->erase(found); + auto found = 
instanceMap->find(instance.getBase().blas.get()); + auto blas = found->second.get(); + if (auto found=compactedBLASMap->find(blas); found!=compactedBLASMap->end()) + blas = found->second.get(); + trackedBLASes->emplace_back(blas); + dst = IGPUTopLevelAccelerationStructure::writeInstance(dst,instance,blas->getReferenceForDeviceOperations()); bytesWritten = newWritten; } } - SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap; - core::unordered_set>* dedupBLASesUsed; + const compacted_blas_map_t* compactedBLASMap; + core::vector>* trackedBLASes; + SReserveResult::SConvReqTLAS::cpu_to_gpu_blas_map_t* instanceMap; std::span instances; uint32_t instanceIndex = 0; }; FillInstances fillInstances; - fillInstances.blasBuildMap = &reservations.m_blasBuildMap; - fillInstances.dedupBLASesUsed = &dedupBLASesUsed; + fillInstances.compactedBLASMap = &compactedBLASMap; + fillInstances.trackedBLASes = &trackedBLASes; + fillInstances.instanceMap = &tlasToBuild.second.instanceMap; fillInstances.instances = instances; success = streamDataToScratch(offsets[1],sizes[1],fillInstances); + // provoke refcounting bugs right away + tlasToBuild.second.instanceMap.clear(); } if (success && as->usesMotion()) { @@ -4935,6 +4953,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul xferCmdBuf = params.transfer->getCommandBufferForRecording(); if (!success) { + trackedBLASes.resize(trackedBLASesOffset); markFailureInStaging("Uploading Instance Data for TLAS build failed",canonical,as,pFoundHash); continue; } @@ -4950,14 +4969,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // note we don't build directly from staging, because only very small inputs could come from there and they'd impede the transfer efficiency of the larger ones buildInfo.instanceData = {.offset=offsets[as->usesMotion() ? 
2:1],.buffer=smart_refctd_ptr(scratchBuffer)}; // be based cause vectors can grow - { - const auto offset = trackedBLASes.size(); - using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**; - buildInfo.trackedBLASes = {reinterpret_cast(offset),dedupBLASesUsed.size()}; - for (auto& blas : dedupBLASesUsed) - trackedBLASes.emplace_back(std::move(blas)); - - } + using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**; + buildInfo.trackedBLASes = {reinterpret_cast(trackedBLASesOffset),trackedBLASes.size()-trackedBLASesOffset}; // no special extra byte offset into the instance buffer rangeInfos.emplace_back(instanceCount,0u); // @@ -4984,7 +4997,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul else compactedOwnershipReleaseIndices.push_back(~0u); } - reservations.m_blasBuildMap.clear(); // finish the last batch recordBuildCommands(); if (!flushRanges.empty()) @@ -5154,18 +5166,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // finish host tasks if not done yet hostUploadBuffers([]()->bool{return true;}); - // Descriptor Sets need their TLAS descriptors substituted if they've been compacted - // want to check if deps successfully exist - auto missingDependent = [&reservations](const typename asset_traits::video_t* dep)->bool - { - auto& stagingCache = std::get>(reservations.m_stagingCaches); - auto found = stagingCache.find(const_cast::video_t*>(dep)); - // this only checks if whether we had to convert and failed - if (found!=stagingCache.end() && found->second.value==CHashCache::NoContentHash) - return true; - // but the dependent might be in readCache of one or more converters, so if in doubt assume its okay - return false; - }; // insert items into cache if overflows handled fine and commandbuffers ready to be recorded auto mergeCache = [&]()->void { @@ -5277,7 +5277,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul mergeCache.template operator()(); mergeCache.template operator()(); 
mergeCache.template operator()(); - // write the TLASes into Descriptor Set finally + // overwrite the compacted TLASes in Descriptor Sets if (auto& tlasRewriteSet=reservations.m_potentialTLASRewrites; !tlasRewriteSet.empty()) { core::vector writes; From 09f16c2b36335cb7044d7935054fdb24e71f9263 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 28 Apr 2025 10:54:49 +0700 Subject: [PATCH 066/346] minor fixes, example --- examples_tests | 2 +- .../builtin/hlsl/workgroup2/arithmetic.hlsl | 36 +++++ .../builtin/hlsl/workgroup2/shared_scan.hlsl | 125 ++++++++++++++++++ 3 files changed, 162 insertions(+), 1 deletion(-) create mode 100644 include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl create mode 100644 include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl diff --git a/examples_tests b/examples_tests index 8c76367c1c..20011f5fdd 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 8c76367c1c226cce3d66f1c60f540e29a501a1cb +Subproject commit 20011f5fdd3e8454bb830ded6f4221ec75036809 diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl new file mode 100644 index 0000000000..dcd2a5df5d --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -0,0 +1,36 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_ +#define _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_ + + +#include "nbl/builtin/hlsl/functional.hlsl" +#include "nbl/builtin/hlsl/workgroup/ballot.hlsl" +#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" +#include "nbl/builtin/hlsl/workgroup2/shared_scan.hlsl" + + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +template +struct reduction +{ + template + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + impl::reduce fn; + fn.__call(dataAccessor, scratchAccessor); + } +} + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl new file mode 100644 index 0000000000..9c2eb164cf --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -0,0 +1,125 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_SHARED_SCAN_INCLUDED_ +#define _NBL_BUILTIN_HLSL_WORKGROUP2_SHARED_SCAN_INCLUDED_ + +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" +#include "nbl/builtin/hlsl/subgroup/ballot.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +template +struct Configuration +{ + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(_WorkgroupSize); + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = uint16_t(_SubgroupSizeLog2); + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation = uint16_t(_ItemsPerInvocation); + + // must have at least enough level 0 outputs to feed a single subgroup + NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = hlsl::max(WorkgroupSize >> SubgroupSizeLog2, SubgroupSize); + NBL_CONSTEXPR_STATIC_INLINE uint32_t VirtualWorkgroupSize = SubgroupsPerVirtualWorkgroup << SubgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation[2] = { Config::ItemsPerInvocation, SubgroupsPerVirtualWorkgroup >> SubgroupSizeLog2 }; + static_assert(ItemsPerInvocation[1]<=4, "3 level scan would have been needed with this config!"); +}; + +namespace impl +{ + +template +struct reduce +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; // scratch smem accessor needs to be this type + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) // groupshared vector_lv1_t scratch[Config::SubgroupsPerVirtualWorkgroup] + { + using config_t = subgroup2::Configuration; + using params_lv0_t = 
subgroup2::ArithmeticParams; + using params_lv1_t = subgroup2::ArithmeticParams; + BinOp binop; + + vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; + const uint32_t invocationIndex = SubgroupContiguousIndex(); + subgroup2::inclusive_scan inclusiveScan0; + // level 0 scan + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + scan_local[idx] = inclusiveScan0(dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex)); + if (subgroup::ElectLast()) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + scratchAccessor.set(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation[0]-1]); // set last element of subgroup scan (reduction) to level 1 scan + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + subgroup2::inclusive_scan inclusiveScan1; + // level 1 scan + if (glsl::gl_SubgroupID() == 0) + { + scratchAccessor.set(invocationIndex, inclusiveScan1(scratchAccessor.get(invocationIndex))); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // set as last element in scan (reduction) + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, scratchAccessor.get(Config::SubgroupsPerVirtualWorkgroup-1)); + } + } +}; + +template +struct scan +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; // scratch smem accessor needs to be this type + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) // groupshared 
vector_lv1_t scratch[Config::SubgroupsPerVirtualWorkgroup] + { + // TODO get this working + // same thing for level 0 + + subgroup2::inclusive_scan inclusiveScan1; + // level 1 scan + if (glsl::gl_SubgroupID() == 0) + { + const vector_lv1_t shiftedInput = hlsl::mix(BinOp::identity, scratchAccessor.get(invocationIndex-1), bool(invocationIndex)); + scratchAccessor.set(invocationIndex, inclusiveScan1(shiftedInput)); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // combine with level 0 + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, binop(scratchAccessor.get(virtualSubgroupID), scan_local[idx])); + } + } +}; + +} + +} +} +} + +#endif From 6f5f8b05bc33cc8ea848d3f003bc7218a2d6bbac Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 28 Apr 2025 17:03:39 +0700 Subject: [PATCH 067/346] bug fixes and example --- .../builtin/hlsl/workgroup2/arithmetic.hlsl | 4 +- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 69 ++++++++++--------- 2 files changed, 40 insertions(+), 33 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl index dcd2a5df5d..2753344e43 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -25,9 +25,9 @@ struct reduction static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::reduce fn; - fn.__call(dataAccessor, scratchAccessor); + fn.template __call(dataAccessor, scratchAccessor); } -} +}; } } diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 9c2eb164cf..7be002e8d3 100644 --- 
a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -9,6 +9,7 @@ #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" #include "nbl/builtin/hlsl/subgroup/ballot.hlsl" #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" +#include "nbl/builtin/hlsl/mpl.hlsl" namespace nbl { @@ -23,13 +24,15 @@ struct Configuration NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(_WorkgroupSize); NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = uint16_t(_SubgroupSizeLog2); NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation = uint16_t(_ItemsPerInvocation); + // NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation = uint16_t(_ItemsPerInvocation); // must have at least enough level 0 outputs to feed a single subgroup - NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = hlsl::max(WorkgroupSize >> SubgroupSizeLog2, SubgroupSize); + NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = mpl::max> SubgroupSizeLog2), SubgroupSize>::value; //TODO expression not constant apparently NBL_CONSTEXPR_STATIC_INLINE uint32_t VirtualWorkgroupSize = SubgroupsPerVirtualWorkgroup << SubgroupSizeLog2; - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation[2] = { Config::ItemsPerInvocation, SubgroupsPerVirtualWorkgroup >> SubgroupSizeLog2 }; - static_assert(ItemsPerInvocation[1]<=4, "3 level scan would have been needed with this config!"); + // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? 
doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression + NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = _ItemsPerInvocation; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = SubgroupsPerVirtualWorkgroup >> SubgroupSizeLog2; + static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); }; namespace impl @@ -39,19 +42,19 @@ template struct reduce { using scalar_t = typename BinOp::type_t; - using vector_lv0_t = vector; // data accessor needs to be this type - using vector_lv1_t = vector; // scratch smem accessor needs to be this type + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; // scratch smem accessor needs to be this type template void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) // groupshared vector_lv1_t scratch[Config::SubgroupsPerVirtualWorkgroup] { using config_t = subgroup2::Configuration; - using params_lv0_t = subgroup2::ArithmeticParams; - using params_lv1_t = subgroup2::ArithmeticParams; + using params_lv0_t = subgroup2::ArithmeticParams; + using params_lv1_t = subgroup2::ArithmeticParams; BinOp binop; vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; - const uint32_t invocationIndex = SubgroupContiguousIndex(); + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); subgroup2::inclusive_scan inclusiveScan0; // level 0 scan [unroll] @@ -61,7 +64,7 @@ struct reduce if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - scratchAccessor.set(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation[0]-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.set(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set 
last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -88,31 +91,35 @@ template; // data accessor needs to be this type - using vector_lv1_t = vector; // scratch smem accessor needs to be this type + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; // scratch smem accessor needs to be this type template void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) // groupshared vector_lv1_t scratch[Config::SubgroupsPerVirtualWorkgroup] { - // TODO get this working - // same thing for level 0 - - subgroup2::inclusive_scan inclusiveScan1; - // level 1 scan - if (glsl::gl_SubgroupID() == 0) - { - const vector_lv1_t shiftedInput = hlsl::mix(BinOp::identity, scratchAccessor.get(invocationIndex-1), bool(invocationIndex)); - scratchAccessor.set(invocationIndex, inclusiveScan1(shiftedInput)); - } - scratchAccessor.workgroupExecutionAndMemoryBarrier(); - - // combine with level 0 - [unroll] - for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) - { - const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, binop(scratchAccessor.get(virtualSubgroupID), scan_local[idx])); - } + // // TODO get this working + // // same thing for level 0 + // using config_t = subgroup2::Configuration; + // using params_lv0_t = subgroup2::ArithmeticParams; + // using params_lv1_t = subgroup2::ArithmeticParams; + // BinOp binop; + + // subgroup2::inclusive_scan inclusiveScan1; + // // level 1 scan + // if (glsl::gl_SubgroupID() == 0) + // { + // const vector_lv1_t shiftedInput = hlsl::mix(BinOp::identity, scratchAccessor.get(invocationIndex-1), bool(invocationIndex)); + // scratchAccessor.set(invocationIndex, 
inclusiveScan1(shiftedInput)); + // } + // scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // // combine with level 0 + // [unroll] + // for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + // { + // const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + // dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, binop(scratchAccessor.get(virtualSubgroupID), scan_local[idx])); + // } } }; From 1bac2478f5f09c05b45fa625c70da6ca44023970 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 29 Apr 2025 12:05:04 +0700 Subject: [PATCH 068/346] fix to data accessor indexing --- include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 7be002e8d3..3cba3a2d57 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -60,7 +60,7 @@ struct reduce [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - scan_local[idx] = inclusiveScan0(dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex)); + scan_local[idx] = inclusiveScan0(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex)); if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); @@ -70,6 +70,7 @@ struct reduce scratchAccessor.workgroupExecutionAndMemoryBarrier(); subgroup2::inclusive_scan inclusiveScan1; + // subgroup2::reduction reduce1; // level 1 scan if (glsl::gl_SubgroupID() == 0) { @@ -81,8 +82,8 @@ struct reduce [unroll] for (uint32_t 
idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, scratchAccessor.get(Config::SubgroupsPerVirtualWorkgroup-1)); + // const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scratchAccessor.get(Config::SubgroupSize-1)); } } }; From 305ac7bd3997f7b491ff9adb30a8f9c8e54ab5ca Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 29 Apr 2025 16:58:04 +0700 Subject: [PATCH 069/346] added template spec for vector dim 1 --- include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl | 1 + 1 file changed, 1 insertion(+) diff --git a/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl b/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl index 9aefc3b3d8..652cabd7c7 100644 --- a/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl +++ b/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl @@ -28,6 +28,7 @@ struct vector_traits >\ NBL_CONSTEXPR_STATIC_INLINE bool IsVector = true;\ };\ +DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(1) DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(2) DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(3) DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(4) From c08063da62a3bed85cb4ff9d59668ed7474604f7 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 29 Apr 2025 17:03:13 +0700 Subject: [PATCH 070/346] added inclusive scan --- .../builtin/hlsl/workgroup2/arithmetic.hlsl | 11 +++ .../builtin/hlsl/workgroup2/shared_scan.hlsl | 77 +++++++++++-------- 2 files changed, 57 insertions(+), 31 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl 
b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl index 2753344e43..acfa5feba8 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -29,6 +29,17 @@ struct reduction } }; +template +struct inclusive_scan +{ + template + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + impl::scan fn; + fn.template __call(dataAccessor, scratchAccessor); + } +}; + } } } diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 3cba3a2d57..6358bf24ad 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -24,7 +24,6 @@ struct Configuration NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(_WorkgroupSize); NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = uint16_t(_SubgroupSizeLog2); NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; - // NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation = uint16_t(_ItemsPerInvocation); // must have at least enough level 0 outputs to feed a single subgroup NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = mpl::max> SubgroupSizeLog2), SubgroupSize>::value; //TODO expression not constant apparently @@ -46,7 +45,7 @@ struct reduce using vector_lv1_t = vector; // scratch smem accessor needs to be this type template - void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) // groupshared vector_lv1_t scratch[Config::SubgroupsPerVirtualWorkgroup] + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { using config_t = subgroup2::Configuration; using params_lv0_t = subgroup2::ArithmeticParams; @@ -55,8 +54,8 @@ struct reduce vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; const uint32_t 
invocationIndex = workgroup::SubgroupContiguousIndex(); - subgroup2::inclusive_scan inclusiveScan0; // level 0 scan + subgroup2::inclusive_scan inclusiveScan0; [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { @@ -69,9 +68,8 @@ struct reduce } scratchAccessor.workgroupExecutionAndMemoryBarrier(); - subgroup2::inclusive_scan inclusiveScan1; - // subgroup2::reduction reduce1; // level 1 scan + subgroup2::inclusive_scan inclusiveScan1; if (glsl::gl_SubgroupID() == 0) { scratchAccessor.set(invocationIndex, inclusiveScan1(scratchAccessor.get(invocationIndex))); @@ -82,13 +80,12 @@ struct reduce [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - // const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scratchAccessor.get(Config::SubgroupSize-1)); } } }; -template +template struct scan { using scalar_t = typename BinOp::type_t; @@ -96,31 +93,49 @@ struct scan using vector_lv1_t = vector; // scratch smem accessor needs to be this type template - void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) // groupshared vector_lv1_t scratch[Config::SubgroupsPerVirtualWorkgroup] + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { - // // TODO get this working - // // same thing for level 0 - // using config_t = subgroup2::Configuration; - // using params_lv0_t = subgroup2::ArithmeticParams; - // using params_lv1_t = subgroup2::ArithmeticParams; - // BinOp binop; - - // subgroup2::inclusive_scan inclusiveScan1; - // // level 1 scan - // if (glsl::gl_SubgroupID() == 0) - // { - // const vector_lv1_t shiftedInput = 
hlsl::mix(BinOp::identity, scratchAccessor.get(invocationIndex-1), bool(invocationIndex)); - // scratchAccessor.set(invocationIndex, inclusiveScan1(shiftedInput)); - // } - // scratchAccessor.workgroupExecutionAndMemoryBarrier(); - - // // combine with level 0 - // [unroll] - // for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) - // { - // const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - // dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, binop(scratchAccessor.get(virtualSubgroupID), scan_local[idx])); - // } + using config_t = subgroup2::Configuration; + using params_lv0_t = subgroup2::ArithmeticParams; + using params_lv1_t = subgroup2::ArithmeticParams; + BinOp binop; + + vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + subgroup2::inclusive_scan inclusiveScan0; + // level 0 scan + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + scan_local[idx] = inclusiveScan0(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex)); + if (subgroup::ElectLast()) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + scratchAccessor.set(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // level 1 scan + subgroup2::inclusive_scan inclusiveScan1; + if (glsl::gl_SubgroupID() == 0) + { + const vector_lv1_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), scratchAccessor.get(invocationIndex-1), 
bool(invocationIndex)); + scratchAccessor.set(invocationIndex, inclusiveScan1(shiftedInput)); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // combine with level 0 + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + const vector_lv1_t lhs = scratchAccessor.get(virtualSubgroupID); + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + scan_local[idx][i] = binop(lhs, scan_local[idx][i]); + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + } } }; From b1d804f520eed03d72a1d625bb904e777a34b23a Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 30 Apr 2025 14:08:38 +0700 Subject: [PATCH 071/346] exclusive scan working --- .../builtin/hlsl/workgroup2/arithmetic.hlsl | 11 +++++++++++ .../builtin/hlsl/workgroup2/shared_scan.hlsl | 18 ++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl index acfa5feba8..6824e92afa 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -40,6 +40,17 @@ struct inclusive_scan } }; +template +struct exclusive_scan +{ + template + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + impl::scan fn; + fn.template __call(dataAccessor, scratchAccessor); + } +}; + } } } diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 6358bf24ad..331951d3f3 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl 
@@ -130,10 +130,20 @@ struct scan for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const vector_lv1_t lhs = scratchAccessor.get(virtualSubgroupID); - [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) - scan_local[idx][i] = binop(lhs, scan_local[idx][i]); + const vector_lv1_t left = scratchAccessor.get(virtualSubgroupID); + if (Exclusive) + { + scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(scan_local[idx][Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + scan_local[idx][Config::ItemsPerInvocation_0-i-1] = binop(left, hlsl::mix(scan_local[idx][Config::ItemsPerInvocation_0-i-2], left_last_elem, (Config::ItemsPerInvocation_0-i-1==0))); + } + else + { + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + scan_local[idx][i] = binop(left, scan_local[idx][i]); + } dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); } } From 3cf98ab4abe77fecd7a779d58c7f85c42d85251e Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 30 Apr 2025 14:12:55 +0700 Subject: [PATCH 072/346] removed outdated comment --- include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 331951d3f3..cd49cb1c1b 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -26,7 +26,7 @@ struct Configuration NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; // must have 
at least enough level 0 outputs to feed a single subgroup - NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = mpl::max> SubgroupSizeLog2), SubgroupSize>::value; //TODO expression not constant apparently + NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = mpl::max> SubgroupSizeLog2), SubgroupSize>::value; NBL_CONSTEXPR_STATIC_INLINE uint32_t VirtualWorkgroupSize = SubgroupsPerVirtualWorkgroup << SubgroupSizeLog2; // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = _ItemsPerInvocation; From 7b310e01f9c4c557dec87555121c3ee7cebed456 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 1 May 2025 12:18:35 +0700 Subject: [PATCH 073/346] minor changes to config usage --- include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index cd49cb1c1b..c789c8a482 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -18,19 +18,20 @@ namespace hlsl namespace workgroup2 { -template +template struct Configuration { - NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(_WorkgroupSize); + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = uint16_t(_SubgroupSizeLog2); NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; + static_assert(WorkgroupSizeLog2>=_SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); // must have at least enough level 0 outputs to feed a single subgroup - NBL_CONSTEXPR_STATIC_INLINE uint32_t 
SubgroupsPerVirtualWorkgroup = mpl::max> SubgroupSizeLog2), SubgroupSize>::value; - NBL_CONSTEXPR_STATIC_INLINE uint32_t VirtualWorkgroupSize = SubgroupsPerVirtualWorkgroup << SubgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max::value - SubgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint32_t VirtualWorkgroupSize = uint32_t(0x1u) << (SubgroupsPerVirtualWorkgroupLog2 + SubgroupSizeLog2); // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = _ItemsPerInvocation; - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = SubgroupsPerVirtualWorkgroup >> SubgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = uint32_t(0x1u) << (SubgroupsPerVirtualWorkgroupLog2 - SubgroupSizeLog2); static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); }; From 4b4e7e8f3685f4a825997ba7a3ea5fc2594883f4 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 1 May 2025 17:19:13 +0700 Subject: [PATCH 074/346] add 1 level scans --- .../builtin/hlsl/workgroup2/arithmetic.hlsl | 6 +- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 69 ++++++++++++++++++- 2 files changed, 69 insertions(+), 6 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl index 6824e92afa..3b4a028d2c 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -24,7 +24,7 @@ struct reduction template static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { - impl::reduce fn; + impl::reduce fn; fn.template __call(dataAccessor, scratchAccessor); } }; @@ -35,7 +35,7 @@ struct inclusive_scan template static void 
__call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { - impl::scan fn; + impl::scan fn; fn.template __call(dataAccessor, scratchAccessor); } }; @@ -46,7 +46,7 @@ struct exclusive_scan template static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { - impl::scan fn; + impl::scan fn; fn.template __call(dataAccessor, scratchAccessor); } }; diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index c789c8a482..c18c00f83e 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -26,11 +26,13 @@ struct Configuration NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; static_assert(WorkgroupSizeLog2>=_SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); + NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = conditional_value::value; + // must have at least enough level 0 outputs to feed a single subgroup NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max::value - SubgroupSizeLog2; NBL_CONSTEXPR_STATIC_INLINE uint32_t VirtualWorkgroupSize = uint32_t(0x1u) << (SubgroupsPerVirtualWorkgroupLog2 + SubgroupSizeLog2); // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? 
doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = _ItemsPerInvocation; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = conditional_value::value; NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = uint32_t(0x1u) << (SubgroupsPerVirtualWorkgroupLog2 - SubgroupSizeLog2); static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); }; @@ -38,8 +40,69 @@ struct Configuration namespace impl { +template +struct reduce; + +template +struct scan; + +// 1-level scans +template +struct reduce +{ + using scalar_t = typename BinOp::type_t; + using vector_t = vector; // data accessor needs to be this type + // doesn't use scratch smem, need as param? + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; + + subgroup2::reduction reduction; + if (glsl::gl_SubgroupID() == 0) + { + vector_t value = reduction(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex())); + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with top line? + } + } +}; + +template +struct scan +{ + using scalar_t = typename BinOp::type_t; + using vector_t = vector; // data accessor needs to be this type + // doesn't use scratch smem, need as param? 
+ + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; + + if (glsl::gl_SubgroupID() == 0) + { + vector_t value; + if (Exclusive) + { + subgroup2::exclusive_scan excl_scan; + value = excl_scan(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex())); + } + else + { + subgroup2::inclusive_scan incl_scan; + value = incl_scan(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex())); + } + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? + } + } +}; + +// 2-level scans template -struct reduce +struct reduce { using scalar_t = typename BinOp::type_t; using vector_lv0_t = vector; // data accessor needs to be this type @@ -87,7 +150,7 @@ struct reduce }; template -struct scan +struct scan { using scalar_t = typename BinOp::type_t; using vector_lv0_t = vector; // data accessor needs to be this type From 2e5f29f10e53f1f8632e8f45099cece1e4b72601 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 2 May 2025 09:41:52 +0700 Subject: [PATCH 075/346] fixes to 1 level scans --- include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index c18c00f83e..0128c3320d 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -63,8 +63,8 @@ struct reduce subgroup2::reduction reduction; if (glsl::gl_SubgroupID() == 0) { - vector_t value = reduction(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex())); - 
dataAccessor.set(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with top line? + vector_t value = reduction(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex())); + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with top line? } } }; @@ -88,14 +88,14 @@ struct scan if (Exclusive) { subgroup2::exclusive_scan excl_scan; - value = excl_scan(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex())); + value = excl_scan(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex())); } else { subgroup2::inclusive_scan incl_scan; - value = incl_scan(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex())); + value = incl_scan(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex())); } - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? 
} } }; From 054b26916204d3ece92e474cb87ec74ebdead9bb Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 2 May 2025 10:54:33 +0700 Subject: [PATCH 076/346] added handling >1 vectors on level 1 scan (untested) --- include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 0128c3320d..b32bc3efde 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -127,7 +127,7 @@ struct reduce if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - scratchAccessor.set(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.setByComponent(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -144,7 +144,7 @@ struct reduce [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scratchAccessor.get(Config::SubgroupSize-1)); + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scratchAccessor.getByComponent((1u << Config::SubgroupsPerVirtualWorkgroupLog2)-1)); } } }; @@ -175,7 +175,7 @@ struct scan if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - scratchAccessor.set(virtualSubgroupID, 
scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.setByComponent(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -194,7 +194,7 @@ struct scan for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const vector_lv1_t left = scratchAccessor.get(virtualSubgroupID); + const scalar_t left = scratchAccessor.getByComponent(virtualSubgroupID); if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(scan_local[idx][Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); From 1b5282c8b5c37a3d387ec89ce2c2ea12384c41b7 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 5 May 2025 17:16:12 +0700 Subject: [PATCH 077/346] move load/store smem into scan funcs, setup config for 3 levels --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 200 +++++++++++++++++- 1 file changed, 191 insertions(+), 9 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index b32bc3efde..c88694d1ac 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -18,6 +18,25 @@ namespace hlsl namespace workgroup2 { +namespace impl +{ +template +struct virtual_wg_size_log2 +{ + NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2+2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v+SubgroupSizeLog2; +}; + +template +struct 
items_per_invocation +{ + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocationProductLog2 = mpl::max_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = conditional_value::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value, ItemsPerInvocationProductLog2>::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v; +}; +} + template struct Configuration { @@ -26,17 +45,43 @@ struct Configuration NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; static_assert(WorkgroupSizeLog2>=_SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); - NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = conditional_value::value; - // must have at least enough level 0 outputs to feed a single subgroup - NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max::value - SubgroupSizeLog2; - NBL_CONSTEXPR_STATIC_INLINE uint32_t VirtualWorkgroupSize = uint32_t(0x1u) << (SubgroupsPerVirtualWorkgroupLog2 + SubgroupSizeLog2); + NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; + + using virtual_wg_t = impl::virtual_wg_size_log2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; + NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << virtual_wg_t::value; + using items_per_invoc_t = impl::items_per_invocation; // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? 
doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = conditional_value::value; - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = uint32_t(0x1u) << (SubgroupsPerVirtualWorkgroupLog2 - SubgroupSizeLog2); + NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = items_per_invoc_t::value0; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = items_per_invoc_t::value1; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_2 = items_per_invoc_t::value2; static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); }; +// special case when workgroup size 2048 and subgroup size 16 needs 3 levels and virtual workgroup size 4096 to get a full subgroup scan each on level 1 and 2 16x16x16=4096 +// specializing with macros because of DXC bug: https://github.com/microsoft/DirectXShaderCompiler/issues/7007 +#define SPECIALIZE_CONFIG_CASE_2048_16(ITEMS_PER_INVOC) template<>\
struct Configuration<11, 4, ITEMS_PER_INVOC>\
{\
 NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << 11u;\
 NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = uint16_t(4u);\
 NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2;\
 NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = 128u;\
 NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = 3;\
 NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << 4096;\
 NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = ITEMS_PER_INVOC;\
 NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = 1u;\
 NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_2 = 1u;\
};\
 SPECIALIZE_CONFIG_CASE_2048_16(1) SPECIALIZE_CONFIG_CASE_2048_16(2) SPECIALIZE_CONFIG_CASE_2048_16(4) #undef SPECIALIZE_CONFIG_CASE_2048_16 namespace impl { @@ 
-127,7 +172,62 @@ struct reduce if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - scratchAccessor.setByComponent(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.set(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // level 1 scan + subgroup2::inclusive_scan inclusiveScan1; + if (glsl::gl_SubgroupID() == 0) + { + vector_lv1_t lv1_val; + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.get(invocationIndex*Config::ItemsPerInvocation_1+i,lv1_val[i]); + lv1_val = inclusiveScan1(lv1_val); + scratchAccessor.set(invocationIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // set as last element in scan (reduction) + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + scalar_t reduce_val; + scratchAccessor.get(Config::SubgroupSize-1,reduce_val); + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); + } + } +}; + +template +struct scan +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; // scratch smem accessor needs to be this type + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv0_t = subgroup2::ArithmeticParams; + using params_lv1_t = subgroup2::ArithmeticParams; + BinOp binop; + + vector_lv0_t 
scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + subgroup2::inclusive_scan inclusiveScan0; + // level 0 scan + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + scan_local[idx] = inclusiveScan0(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex)); + if (subgroup::ElectLast()) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + scratchAccessor.set(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -135,11 +235,93 @@ struct reduce // level 1 scan subgroup2::inclusive_scan inclusiveScan1; if (glsl::gl_SubgroupID() == 0) + { + vector_lv1_t lv1_val; + const uint32_t prevIndex = invocationIndex-1; + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.get(prevIndex*Config::ItemsPerInvocation_1+i,lv1_val[i]); + vector_lv1_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv1_val, bool(invocationIndex)); + shiftedInput = inclusiveScan1(shiftedInput); + scratchAccessor.set(invocationIndex, shiftedInput[Config::ItemsPerInvocation_1-1]); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // combine with level 0 + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + scalar_t left; + scratchAccessor.get(virtualSubgroupID,left); + if (Exclusive) + { + scalar_t left_last_elem = hlsl::mix(BinOp::identity, 
glsl::subgroupShuffleUp(scan_local[idx][Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + scan_local[idx][Config::ItemsPerInvocation_0-i-1] = binop(left, hlsl::mix(scan_local[idx][Config::ItemsPerInvocation_0-i-2], left_last_elem, (Config::ItemsPerInvocation_0-i-1==0))); + } + else + { + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + scan_local[idx][i] = binop(left, scan_local[idx][i]); + } + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + } + } +}; + +// 2-level scans +/* +template +struct reduce +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; // scratch smem accessor needs to be this type + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv0_t = subgroup2::ArithmeticParams; + using params_lv1_t = subgroup2::ArithmeticParams; + BinOp binop; + + vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + // level 0 scan + subgroup2::inclusive_scan inclusiveScan0; + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + scan_local[idx] = inclusiveScan0(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex)); + if (subgroup::ElectLast()) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + scratchAccessor.setByComponent(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // 
set last element of subgroup scan (reduction) to level 1 scan + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // level 1 scan + subgroup2::inclusive_scan inclusiveScan1; + if (glsl::gl_SubgroupID() < Config::SubgroupSizeLog2*Config::ItemsPerInvocation_1) { scratchAccessor.set(invocationIndex, inclusiveScan1(scratchAccessor.get(invocationIndex))); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); + // level 2 scan + // TODO + subgroup2::inclusive_scan inclusiveScan2; + if (glsl::gl_SubgroupID() == 0) + { + scratchAccessor.set(invocationIndex, inclusiveScan2(scratchAccessor.get(invocationIndex))); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + // set as last element in scan (reduction) [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) @@ -150,7 +332,7 @@ struct reduce }; template -struct scan +struct scan { using scalar_t = typename BinOp::type_t; using vector_lv0_t = vector; // data accessor needs to be this type @@ -212,7 +394,7 @@ struct scan } } }; - +*/ } } From c6dc5bc9579877d03f2e1e5531ef527cdd1b4eda Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 6 May 2025 10:52:05 +0700 Subject: [PATCH 078/346] change to use coalesced indexing for 2-level scans --- .../nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index c88694d1ac..26fb969ace 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -172,7 +172,8 @@ struct reduce if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - scratchAccessor.set(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of 
subgroup scan (reduction) to level 1 scan + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (virtualSubgroupID/Config::ItemsPerInvocation_1); + scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -184,7 +185,7 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.get(invocationIndex*Config::ItemsPerInvocation_1+i,lv1_val[i]); + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv1_val[i]); lv1_val = inclusiveScan1(lv1_val); scratchAccessor.set(invocationIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } @@ -227,7 +228,8 @@ struct scan if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - scratchAccessor.set(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (virtualSubgroupID/Config::ItemsPerInvocation_1); + scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -240,7 +242,7 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.get(prevIndex*Config::ItemsPerInvocation_1+i,lv1_val[i]); + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+prevIndex,lv1_val[i]); vector_lv1_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv1_val, bool(invocationIndex)); shiftedInput 
= inclusiveScan1(shiftedInput); scratchAccessor.set(invocationIndex, shiftedInput[Config::ItemsPerInvocation_1-1]); @@ -272,8 +274,7 @@ struct scan } }; -// 2-level scans -/* +// 3-level scans template struct reduce { @@ -394,7 +395,7 @@ struct scan } } }; -*/ + } } From aa0c36c8b48f480325c74334fa2fb8400b1fc76e Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 6 May 2025 14:35:02 +0700 Subject: [PATCH 079/346] added 3-level scans --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 69 +++++++++++++++---- 1 file changed, 56 insertions(+), 13 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 26fb969ace..91596bace0 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -151,7 +151,7 @@ struct reduce { using scalar_t = typename BinOp::type_t; using vector_lv0_t = vector; // data accessor needs to be this type - using vector_lv1_t = vector; // scratch smem accessor needs to be this type + using vector_lv1_t = vector; template void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) @@ -207,7 +207,7 @@ struct scan { using scalar_t = typename BinOp::type_t; using vector_lv0_t = vector; // data accessor needs to be this type - using vector_lv1_t = vector; // scratch smem accessor needs to be this type + using vector_lv1_t = vector; template void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) @@ -280,7 +280,8 @@ struct reduce { using scalar_t = typename BinOp::type_t; using vector_lv0_t = vector; // data accessor needs to be this type - using vector_lv1_t = vector; // scratch smem accessor needs to be this type + using vector_lv1_t = vector; + using vector_lv2_t = vector; template void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) @@ -288,6 +289,7 @@ struct reduce using config_t = 
subgroup2::Configuration; using params_lv0_t = subgroup2::ArithmeticParams; using params_lv1_t = subgroup2::ArithmeticParams; + using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; @@ -301,7 +303,8 @@ struct reduce if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - scratchAccessor.setByComponent(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (virtualSubgroupID/Config::ItemsPerInvocation_1); + scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -310,16 +313,29 @@ struct reduce subgroup2::inclusive_scan inclusiveScan1; if (glsl::gl_SubgroupID() < Config::SubgroupSizeLog2*Config::ItemsPerInvocation_1) { - scratchAccessor.set(invocationIndex, inclusiveScan1(scratchAccessor.get(invocationIndex))); + vector_lv1_t lv1_val; + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv1_val[i]); + lv1_val = inclusiveScan1(lv1_val); + if (subgroup::ElectLast()) + { + const uint32_t bankedIndex = (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (invocationIndex/Config::ItemsPerInvocation_2); + scratchAccessor.set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 2 scan - // TODO - subgroup2::inclusive_scan inclusiveScan2; + subgroup2::inclusive_scan inclusiveScan2; if (glsl::gl_SubgroupID() == 0) { - 
scratchAccessor.set(invocationIndex, inclusiveScan2(scratchAccessor.get(invocationIndex))); + vector_lv2_t lv2_val; + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv2_val[i]); + lv2_val = inclusiveScan2(lv2_val); + scratchAccessor.set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -327,7 +343,9 @@ struct reduce [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scratchAccessor.getByComponent((1u << Config::SubgroupsPerVirtualWorkgroupLog2)-1)); + scalar_t reduce_val; + scratchAccessor.get(Config::SubgroupSize-1,reduce_val); + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); } } }; @@ -358,17 +376,41 @@ struct scan if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - scratchAccessor.setByComponent(virtualSubgroupID, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (virtualSubgroupID/Config::ItemsPerInvocation_1); + scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 1 scan subgroup2::inclusive_scan inclusiveScan1; + if (glsl::gl_SubgroupID() < Config::SubgroupSizeLog2*Config::ItemsPerInvocation_1) + { + vector_lv1_t lv1_val; + 
[unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv1_val[i]); + lv1_val = inclusiveScan1(lv1_val); + if (subgroup::ElectLast()) + { + const uint32_t bankedIndex = (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + scratchAccessor.set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // level 2 scan + subgroup2::inclusive_scan inclusiveScan2; if (glsl::gl_SubgroupID() == 0) { - const vector_lv1_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), scratchAccessor.get(invocationIndex-1), bool(invocationIndex)); - scratchAccessor.set(invocationIndex, inclusiveScan1(shiftedInput)); + vector_lv2_t lv2_val; + const uint32_t prevIndex = invocationIndex-1; + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+prevIndex,lv2_val[i]); + vector_lv2_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(invocationIndex)); + shiftedInput = inclusiveScan2(shiftedInput); + scratchAccessor.set(invocationIndex, shiftedInput[Config::ItemsPerInvocation_2-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -377,7 +419,8 @@ struct scan for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const scalar_t left = scratchAccessor.getByComponent(virtualSubgroupID); + const scalar_t left; + scratchAccessor.get(virtualSubgroupID, left); if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(scan_local[idx][Config::ItemsPerInvocation_0-1],1), 
bool(glsl::gl_SubgroupInvocationID())); From 74c359bed10f1a2d3d55b126863f3d962b87826d Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 6 May 2025 14:41:01 +0700 Subject: [PATCH 080/346] minor bug fixes --- include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 91596bace0..141deccb7b 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -355,7 +355,8 @@ struct scan { using scalar_t = typename BinOp::type_t; using vector_lv0_t = vector; // data accessor needs to be this type - using vector_lv1_t = vector; // scratch smem accessor needs to be this type + using vector_lv1_t = vector; + using vector_lv2_t = vector; template void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) @@ -363,6 +364,7 @@ struct scan using config_t = subgroup2::Configuration; using params_lv0_t = subgroup2::ArithmeticParams; using params_lv1_t = subgroup2::ArithmeticParams; + using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; From ce244e2d24d2da9e79197226799098aaa7675be9 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 7 May 2025 16:55:34 +0700 Subject: [PATCH 081/346] changes to data accessor usage --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 141deccb7b..057e9ebd24 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -108,7 +108,9 @@ struct reduce subgroup2::reduction reduction; if (glsl::gl_SubgroupID() == 0) { - vector_t value 
= reduction(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex())); + vector_t value; + dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); + value = reduction(value); dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with top line? } } @@ -130,15 +132,16 @@ struct scan if (glsl::gl_SubgroupID() == 0) { vector_t value; + dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); if (Exclusive) { subgroup2::exclusive_scan excl_scan; - value = excl_scan(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex())); + value = excl_scan(value); } else { subgroup2::inclusive_scan incl_scan; - value = incl_scan(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex())); + value = incl_scan(value); } dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? 
} @@ -168,7 +171,8 @@ struct reduce [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - scan_local[idx] = inclusiveScan0(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex)); + dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + scan_local[idx] = inclusiveScan0(scan_local[idx]); if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); @@ -224,7 +228,8 @@ struct scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - scan_local[idx] = inclusiveScan0(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex)); + dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + scan_local[idx] = inclusiveScan0(scan_local[idx]); if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); @@ -299,7 +304,8 @@ struct reduce [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - scan_local[idx] = inclusiveScan0(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex)); + dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + scan_local[idx] = inclusiveScan0(scan_local[idx]); if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * 
(Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); @@ -374,7 +380,8 @@ struct scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - scan_local[idx] = inclusiveScan0(dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex)); + dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + scan_local[idx] = inclusiveScan0(scan_local[idx]); if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); From 90b19d817b7d5e9651ed755ff503873881e33311 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 8 May 2025 17:03:47 +0700 Subject: [PATCH 082/346] wg reduction uses reduce instead of scan --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 057e9ebd24..7ed16faf09 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -167,12 +167,12 @@ struct reduce vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 0 scan - subgroup2::inclusive_scan inclusiveScan0; + subgroup2::reduction reduction0; [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); - scan_local[idx] = inclusiveScan0(scan_local[idx]); + scan_local[idx] 
= reduction0(scan_local[idx]); if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); @@ -183,14 +183,14 @@ struct reduce scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 1 scan - subgroup2::inclusive_scan inclusiveScan1; + subgroup2::reduction reduction1; if (glsl::gl_SubgroupID() == 0) { vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv1_val[i]); - lv1_val = inclusiveScan1(lv1_val); + lv1_val = reduction1(lv1_val); scratchAccessor.set(invocationIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -200,7 +200,7 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { scalar_t reduce_val; - scratchAccessor.get(Config::SubgroupSize-1,reduce_val); + scratchAccessor.get(glsl::gl_SubgroupInvocationID(),reduce_val); dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); } } @@ -300,12 +300,12 @@ struct reduce vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 0 scan - subgroup2::inclusive_scan inclusiveScan0; + subgroup2::reduction reduction0; [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); - scan_local[idx] = inclusiveScan0(scan_local[idx]); + scan_local[idx] = reduction0(scan_local[idx]); if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * 
(Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); @@ -316,14 +316,14 @@ struct reduce scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 1 scan - subgroup2::inclusive_scan inclusiveScan1; + subgroup2::reduction reduction1; if (glsl::gl_SubgroupID() < Config::SubgroupSizeLog2*Config::ItemsPerInvocation_1) { vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv1_val[i]); - lv1_val = inclusiveScan1(lv1_val); + lv1_val = reduction1(lv1_val); if (subgroup::ElectLast()) { const uint32_t bankedIndex = (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (invocationIndex/Config::ItemsPerInvocation_2); @@ -333,14 +333,14 @@ struct reduce scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 2 scan - subgroup2::inclusive_scan inclusiveScan2; + subgroup2::reduction reduction2; if (glsl::gl_SubgroupID() == 0) { vector_lv2_t lv2_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv2_val[i]); - lv2_val = inclusiveScan2(lv2_val); + lv2_val = reduction2(lv2_val); scratchAccessor.set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -350,7 +350,7 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { scalar_t reduce_val; - scratchAccessor.get(Config::SubgroupSize-1,reduce_val); + scratchAccessor.get(glsl::gl_SubgroupInvocationID(),reduce_val); dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); } } From d2a16634dc52ecd1271d9a39cb6bcbe3ada2056c Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 9 May 2025 14:03:47 +0700 
Subject: [PATCH 083/346] fixes to calculating levels in config --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 70 +++++++++---------- 1 file changed, 33 insertions(+), 37 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 7ed16faf09..7ea8d6594b 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -23,7 +23,7 @@ namespace impl template struct virtual_wg_size_log2 { - NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2+2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v+SubgroupSizeLog2; }; @@ -31,7 +31,7 @@ template; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = conditional_value::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = BaseItemsPerInvocation; NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value, ItemsPerInvocationProductLog2>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v; }; @@ -47,6 +47,7 @@ struct Configuration // must have at least enough level 0 outputs to feed a single subgroup NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; + NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = 0x1u << SubgroupsPerVirtualWorkgroupLog2; using virtual_wg_t = impl::virtual_wg_size_log2; NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; @@ -67,8 +68,9 @@ struct Configuration<11, 4, ITEMS_PER_INVOC>\ NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << 11u;\ NBL_CONSTEXPR_STATIC_INLINE uint16_t 
SubgroupSizeLog2 = uint16_t(4u);\ NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2;\ - NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = 128u;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = 3;\ + NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = 7u;\ + NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = 128u;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = 3u;\ NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << 4096;\ NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = ITEMS_PER_INVOC;\ NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = 1u;\ @@ -106,13 +108,10 @@ struct reduce using params_t = subgroup2::ArithmeticParams; subgroup2::reduction reduction; - if (glsl::gl_SubgroupID() == 0) - { - vector_t value; - dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); - value = reduction(value); - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with top line? - } + vector_t value; + dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); + value = reduction(value); + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with top line? 
} }; @@ -129,22 +128,19 @@ struct scan using config_t = subgroup2::Configuration; using params_t = subgroup2::ArithmeticParams; - if (glsl::gl_SubgroupID() == 0) + vector_t value; + dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); + if (Exclusive) { - vector_t value; - dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); - if (Exclusive) - { - subgroup2::exclusive_scan excl_scan; - value = excl_scan(value); - } - else - { - subgroup2::inclusive_scan incl_scan; - value = incl_scan(value); - } - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? + subgroup2::exclusive_scan excl_scan; + value = excl_scan(value); + } + else + { + subgroup2::inclusive_scan incl_scan; + value = incl_scan(value); } + dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? 
} }; @@ -176,7 +172,7 @@ struct reduce if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -189,7 +185,7 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv1_val[i]); + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = reduction1(lv1_val); scratchAccessor.set(invocationIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } @@ -233,7 +229,7 @@ struct scan if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -247,7 +243,7 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - 
scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+prevIndex,lv1_val[i]); + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+prevIndex,lv1_val[i]); vector_lv1_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv1_val, bool(invocationIndex)); shiftedInput = inclusiveScan1(shiftedInput); scratchAccessor.set(invocationIndex, shiftedInput[Config::ItemsPerInvocation_1-1]); @@ -309,7 +305,7 @@ struct reduce if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -322,11 +318,11 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv1_val[i]); + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = reduction1(lv1_val); if (subgroup::ElectLast()) { - const uint32_t bankedIndex = (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (invocationIndex/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); scratchAccessor.set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } @@ -339,7 +335,7 @@ struct reduce vector_lv2_t lv2_val; [unroll] for (uint32_t i = 0; i < 
Config::ItemsPerInvocation_2; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv2_val[i]); + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv2_val[i]); lv2_val = reduction2(lv2_val); scratchAccessor.set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); } @@ -385,7 +381,7 @@ struct scan if (subgroup::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -398,11 +394,11 @@ struct scan vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+invocationIndex,lv1_val[i]); + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = inclusiveScan1(lv1_val); if (subgroup::ElectLast()) { - const uint32_t bankedIndex = (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroupLog2 + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); scratchAccessor.set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } @@ -416,7 +412,7 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - 
scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroupLog2+prevIndex,lv2_val[i]); + scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+prevIndex,lv2_val[i]); vector_lv2_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(invocationIndex)); shiftedInput = inclusiveScan2(shiftedInput); scratchAccessor.set(invocationIndex, shiftedInput[Config::ItemsPerInvocation_2-1]); From ea39d9e698867a97b0d1f75ff356119d11b12302 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 12 May 2025 16:17:49 +0700 Subject: [PATCH 084/346] fixes to 3-level scan --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 7ea8d6594b..1abd9cccd2 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -58,6 +58,8 @@ struct Configuration NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = items_per_invoc_t::value1; NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_2 = items_per_invoc_t::value2; static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); + + NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedMemSize = conditional_value::value + SubgroupsPerVirtualWorkgroup*ItemsPerInvocation_1; }; // special case when workgroup size 2048 and subgroup size 16 needs 3 levels and virtual workgroup size 4096 to get a full subgroup scan each on level 1 and 2 16x16x16=4096 @@ -388,8 +390,9 @@ struct scan scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 1 scan + const uint32_t lv1_smem_size = Config::SubgroupsPerVirtualWorkgroup*Config::ItemsPerInvocation_1; subgroup2::inclusive_scan inclusiveScan1; - if (glsl::gl_SubgroupID() < Config::SubgroupSizeLog2*Config::ItemsPerInvocation_1) + if (glsl::gl_SubgroupID() < lv1_smem_size) { vector_lv1_t lv1_val; 
[unroll] @@ -398,8 +401,8 @@ struct scan lv1_val = inclusiveScan1(lv1_val); if (subgroup::ElectLast()) { - const uint32_t bankedIndex = (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); - scratchAccessor.set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + const uint32_t bankedIndex = (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + scratchAccessor.set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -412,10 +415,20 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+prevIndex,lv2_val[i]); + scratchAccessor.get(lv1_smem_size+i*Config::SubgroupSize+prevIndex,lv2_val[i]); vector_lv2_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(invocationIndex)); shiftedInput = inclusiveScan2(shiftedInput); - scratchAccessor.set(invocationIndex, shiftedInput[Config::ItemsPerInvocation_2-1]); + + // combine with level 1, only last element of each + [unroll] + for (uint32_t i = 0; i < Config::SubgroupsPerVirtualWorkgroup; i++) + { + scalar_t last_val; + scratchAccessor.get((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i),last_val); + scalar_t val = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(i)); + val = binop(last_val, shiftedInput[Config::ItemsPerInvocation_2-1]); + scratchAccessor.set((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i), last_val); + } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); From f356185c87a5ce8ddf8deeeba4376f92d90aa3dd Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 13 May 
2025 09:54:28 +0200 Subject: [PATCH 085/346] Make the staging cache reference counted and make failures propagate properly --- include/nbl/video/utilities/CAssetConverter.h | 9 +- src/nbl/video/utilities/CAssetConverter.cpp | 357 +++++++++--------- 2 files changed, 182 insertions(+), 184 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index d9ace6226e..01da012a0d 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -973,7 +973,14 @@ class CAssetConverter : public core::IReferenceCounted public: template - using staging_cache_t = core::unordered_map::video_t*,typename CCache::key_t>; + struct staging_cache_key + { + core::smart_refctd_ptr::video_t> gpuRef; + typename CCache::key_t cacheKey; + }; + // it may seem weird storing both a smart pointer and a raw pointer, but the reason is to be able to drop a refcount while not loosing the key for lookup + template + using staging_cache_t = core::unordered_map::video_t*,staging_cache_key>; inline SReserveResult(SReserveResult&&) = default; inline SReserveResult(const SReserveResult&) = delete; diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 7bfd361e94..de72e2f360 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2496,7 +2496,7 @@ struct conversions_t return; } // insert into staging cache - stagingCache.emplace(gpuObj.get(),typename CAssetConverter::CCache::key_t(contentHash,uniqueCopyGroupID)); + stagingCache.emplace(gpuObj.get(),CAssetConverter::SReserveResult::staging_cache_key{gpuObj.value,typename CAssetConverter::CCache::key_t(contentHash,uniqueCopyGroupID)}); // propagate back to dfsCache created.gpuObj = std::move(gpuObj); } @@ -3534,12 +3534,14 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if (const auto& gpuObj=found.gpuObj; gpuObj) { 
results[i] = gpuObj; +#ifdef _NBL_DEBUG // if something with this content hash is in the stagingCache, then it must match the `found->gpuObj` if (auto finalCacheIt=stagingCache.find(gpuObj.get()); finalCacheIt!=stagingCache.end()) { - const bool matches = finalCacheIt->second==typename CCache::key_t(found.contentHash,uniqueCopyGroupID); + const bool matches = finalCacheIt->second.cacheKey==typename CCache::key_t(found.contentHash,uniqueCopyGroupID); assert(matches); } +#endif } else inputs.logger.log("No GPU Object could be found or created for Root Asset %p in group %d",system::ILogger::ELL_ERROR,asset,uniqueCopyGroupID); @@ -3557,16 +3559,18 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { if (entry.first->getReferenceCount()==1) { + // I know what I'm doing, the hashmap is being annoying not letting you look up with const pointer key a non const pointer hashmap + auto* gpuObj = const_cast::video_t*>(entry.first); if constexpr (std::is_same_v) - retval.m_bufferConversions.erase(entry.first); + retval.m_bufferConversions.erase(gpuObj); if constexpr (std::is_same_v) for (auto i=0; i<2; i++) - retval.m_blasConversions[i].erase(entry.first); + retval.m_blasConversions[i].erase(gpuObj); if constexpr (std::is_same_v) for (auto i=0; i<2; i++) - retval.m_tlasConversions[i].erase(entry.first); + retval.m_tlasConversions[i].erase(gpuObj); if constexpr (std::is_same_v) - retval.m_imageConversions.erase(entry.first); + retval.m_imageConversions.erase(gpuObj); return true; } // still referenced, keep it around @@ -3706,16 +3710,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } }; - // - auto findInStaging = [&reservations](const typename asset_traits::video_t* gpuObj)->core::blake3_hash_t* - { - auto& stagingCache = std::get>(reservations.m_stagingCaches); - const auto found = stagingCache.find(const_cast::video_t*>(gpuObj)); - assert(found!=stagingCache.end()); - return const_cast(&found->second.value); - }; // wipe gpu item 
in staging cache (this may drop it as well if it was made for only a root asset == no users) - core::unordered_map outputReverseMap; + core::unordered_map outputReverseMap; core::for_each_in_tuple(reservations.m_gpuObjects,[&outputReverseMap](const auto& gpuObjects)->void { uint32_t i = 0; @@ -3723,21 +3719,21 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul outputReverseMap[gpuObj.value.get()] = i++; } ); - auto markFailureInStaging = [&reservations,&outputReverseMap,logger](const char* message, smart_refctd_ptr& canonical, const typename asset_traits::video_t* gpuObj, core::blake3_hash_t* hash)->void + auto markFailure = [&reservations,&outputReverseMap,logger](const char* message, smart_refctd_ptr* canonical, typename SReserveResult::staging_cache_t::mapped_type* cacheNode)->void { // wipe the smart pointer to the canonical, make sure we release that memory ASAP if no other user is around - canonical = nullptr; - logger.log("%s failed for \"%s\"",system::ILogger::ELL_ERROR,message,gpuObj->getObjectDebugName()); - // change the content hash on the reverse map to a NoContentHash - *hash = CHashCache::NoContentHash; + *canonical = nullptr; // also drop the smart pointer from the output array so failures release memory quickly - const auto foundIx = outputReverseMap.find(gpuObj); + const auto foundIx = outputReverseMap.find(cacheNode->gpuRef.get()); if (foundIx!=outputReverseMap.end()) { auto& resultOutput = std::get>(reservations.m_gpuObjects); resultOutput[foundIx->second].value = nullptr; outputReverseMap.erase(foundIx); } + logger.log("%s failed for \"%s\"",system::ILogger::ELL_ERROR,message,cacheNode->gpuRef->getObjectDebugName()); + // drop smart pointer + cacheNode->gpuRef = nullptr; }; // want to check if deps successfully exist @@ -3751,10 +3747,10 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul }; auto missingDependent = [&reservations](const typename asset_traits::video_t* dep)->SMissingDependent { - auto& 
stagingCache = std::get>(reservations.m_stagingCaches); - auto found = stagingCache.find(const_cast::video_t*>(dep)); + const auto& stagingCache = std::get>(reservations.m_stagingCaches); + const auto found = stagingCache.find(dep); SMissingDependent retval = {.wasInStaging=found!=stagingCache.end()}; - retval.gotWiped = retval.wasInStaging && found->second.value==CHashCache::NoContentHash; + retval.gotWiped = retval.wasInStaging && !found->second.gpuRef; return retval; }; @@ -3975,6 +3971,15 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // some state so we don't need to look later auto xferCmdBuf = shouldDoSomeTransfer ? params.transfer->getCommandBufferForRecording():nullptr; + // + auto findInStaging = [&reservations](const typename asset_traits::video_t* gpuObj)->auto + { + auto& stagingCache = std::get>(reservations.m_stagingCaches); + const auto found = stagingCache.find(gpuObj); + assert(found!=stagingCache.end()); + return found; + }; + using buffer_mem_barrier_t = IGPUCommandBuffer::SBufferMemoryBarrier; // upload Buffers auto& buffersToUpload = reservations.m_bufferConversions; @@ -3994,12 +3999,12 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // host will upload if (canHostWriteToMemoryRange(buffer->getBoundMemory(),size)) continue; - auto pFoundHash = findInStaging.template operator()(buffer); + auto pFound = &findInStaging.template operator()(buffer)->second; // - const auto ownerQueueFamily = checkOwnership(buffer,params.getFinalOwnerQueueFamily(buffer,*pFoundHash),transferFamily); + const auto ownerQueueFamily = checkOwnership(buffer,params.getFinalOwnerQueueFamily(buffer,pFound->cacheKey.value),transferFamily); if (ownerQueueFamily==QueueFamilyInvalid) { - markFailureInStaging("invalid Final Queue Family given by user callback",item.second,buffer,pFoundHash); + markFailure("invalid Final Queue Family given by user callback",&item.second,pFound); continue; } // do the upload @@ -4009,7 +4014,7 @@ 
ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul xferCmdBuf = params.transfer->getCommandBufferForRecording(); if (!success) { - markFailureInStaging("Data Upload",item.second,buffer,pFoundHash); + markFailure("Data Upload",&item.second,pFound); continue; } // let go of canonical asset (may free RAM) @@ -4175,7 +4180,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // basiscs auto& cpuImg = item.second.canonical; auto* image = item.first; - auto pFoundHash = findInStaging.template operator()(image); + auto pFound = &findInStaging.template operator()(image)->second; // get params const auto& creationParams = image->getCreationParameters(); const auto format = creationParams.format; @@ -4225,7 +4230,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } if (!quickWriteDescriptor(SrcMipBinding,srcIx,std::move(srcView))) { - markFailureInStaging("Source Mip Level Descriptor Write",cpuImg,image,pFoundHash); + markFailure("Source Mip Level Descriptor Write",&cpuImg,pFound); continue; } } @@ -4246,7 +4251,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } else { - markFailureInStaging("Image QFOT Pipeline Barrier",cpuImg,image,pFoundHash); + markFailure("Image QFOT Pipeline Barrier",&cpuImg,pFound); return false; } return true; @@ -4295,7 +4300,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // if we're recomputing this mip level const bool recomputeMip = lvl && (recomputeMipMask&(0x1u<<(lvl-1))); // query final layout from callback - const auto finalLayout = params.getFinalLayout(image,*pFoundHash,lvl); + const auto finalLayout = params.getFinalLayout(image,pFound->cacheKey.value,lvl); // get region data for upload auto regions = cpuImg->getRegions(lvl); // basic error checks @@ -4306,7 +4311,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul logger.log("What are you doing requesting layout UNDEFINED for mip level % of image %s after Upload or Mip 
Recomputation!?",system::ILogger::ELL_ERROR,lvl,image->getObjectDebugName()); break; } - const auto suggestedFinalOwner = params.getFinalOwnerQueueFamily(image,*pFoundHash,lvl); + const auto suggestedFinalOwner = params.getFinalOwnerQueueFamily(image,pFound->cacheKey.value,lvl); // if we'll recompute the mipmap, then do the layout transition on the compute queue (there's one less potential QFOT) if (recomputeMip) { @@ -4561,7 +4566,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // failed in the for-loop if (lvl != creationParams.mipLevels) { - markFailureInStaging("Compute Mip Mapping",cpuImg,image,pFoundHash); + markFailure("Compute Mip Mapping",&cpuImg,pFound); continue; } // let go of canonical asset (may free RAM) @@ -4572,7 +4577,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul { if (!pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=transferBarriers},"Final Pipeline Barrier recording to Transfer Command Buffer failed")) { - markFailureInStaging("Image Data Upload Pipeline Barrier",cpuImg,image,pFoundHash); + markFailure("Image Data Upload Pipeline Barrier",&cpuImg,pFound); continue; } // even if no uploads performed, we do layout transitions on empty images from Xfer Queue @@ -4584,7 +4589,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul dsAlloc->multi_deallocate(SrcMipBinding,1,&srcIx,params.compute->getFutureScratchSemaphore()); if (!pipelineBarrier(computeCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=computeBarriers},"Final Pipeline Barrier recording to Compute Command Buffer failed")) { - markFailureInStaging("Compute Mip Mapping Pipeline Barrier",cpuImg,image,pFoundHash); + markFailure("Compute Mip Mapping Pipeline Barrier",&cpuImg,pFound); continue; } } @@ -4659,9 +4664,9 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul if (!success) for (const auto& info : buildInfos) { - const auto pFoundHash = findInStaging.template 
operator()(info.dstAS); + const auto stagingFound = findInStaging.template operator()(info.dstAS); smart_refctd_ptr dummy; // already null at this point - markFailureInStaging("AS Build Command Recording",dummy,info.dstAS,pFoundHash); + markFailure("AS Build Command Recording",&dummy,&stagingFound->second); } buildInfos.clear(); rangeInfos.clear(); @@ -4710,14 +4715,14 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul for (auto& item : blasToBuild) { auto* as = item.gpuObj; - auto pFoundHash = findInStaging.template operator()(as); + auto pFound = &findInStaging.template operator()(as)->second; if (item.asBuildParams.host) { auto dOp = device->createDeferredOperation(); // if (!device->buildAccelerationStructure(dOp.get(),info,range)) { - markFailureInStaging("BLAS Build Command Recording",item.canonical,gpuObj,pFoundHash); + markFailure("BLAS Build Command Recording",&item.canonical,pFound); continue; } } @@ -4811,13 +4816,13 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul { auto& canonical = tlasToBuild.second.canonical; const auto as = tlasToBuild.first; - const auto pFoundHash = findInStaging.template operator()(as); + const auto pFound = &findInStaging.template operator()(as)->second; const auto& backingRange = as->getCreationParams().bufferRange; // checking ownership for the future on old buffer, but compacted will be made with same sharing creation parameters - const auto finalOwnerQueueFamily = checkOwnership(backingRange.buffer.get(),params.getFinalOwnerQueueFamily(as,*pFoundHash),computeFamily); + const auto finalOwnerQueueFamily = checkOwnership(backingRange.buffer.get(),params.getFinalOwnerQueueFamily(as,pFound->cacheKey.value),computeFamily); if (finalOwnerQueueFamily==QueueFamilyInvalid) { - markFailureInStaging("invalid Final Queue Family given by user callback",canonical,as,pFoundHash); + markFailure("invalid Final Queue Family given by user callback",&canonical,pFound); continue; } const auto instances = 
canonical->getInstances(); @@ -4843,13 +4848,13 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // problem with building some Dependent BLASes if (failedBLASBarrier && dependsOnBLASBuilds) { - markFailureInStaging("building BLASes which current TLAS build wants to instance",canonical,as,pFoundHash); + markFailure("building BLASes which current TLAS build wants to instance",&canonical,pFound); continue; } // problem with finding the dependents (BLASes) if (instanceDataSize==0) { - markFailureInStaging("finding valid Dependant GPU BLASes for TLAS build",canonical,as,pFoundHash); + markFailure("finding valid Dependant GPU BLASes for TLAS build",&canonical,pFound); continue; } // allocate scratch and build inputs @@ -4954,7 +4959,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul if (!success) { trackedBLASes.resize(trackedBLASesOffset); - markFailureInStaging("Uploading Instance Data for TLAS build failed",canonical,as,pFoundHash); + markFailure("Uploading Instance Data for TLAS build failed",&canonical,pFound); continue; } // let go of canonical asset (may free RAM) @@ -5165,159 +5170,145 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // finish host tasks if not done yet hostUploadBuffers([]()->bool{return true;}); + // in the future we'll also finish host image copies - // insert items into cache if overflows handled fine and commandbuffers ready to be recorded - auto mergeCache = [&]()->void + // check dependents before inserting into cache + if (reqQueueFlags.value!=IQueue::FAMILY_FLAGS::NONE) { - auto& stagingCache = std::get>(reservations.m_stagingCaches); - auto& cache = std::get>(m_caches); - cache.m_forwardMap.reserve(cache.m_forwardMap.size()+stagingCache.size()); - cache.m_reverseMap.reserve(cache.m_reverseMap.size()+stagingCache.size()); - constexpr bool IsTLAS = std::is_same_v; - for (auto& item : stagingCache) - if (item.second.value!=CHashCache::NoContentHash) // didn't get wiped + auto 
checkDependents = [&]()->void { - // rescan all the GPU objects and find out if they depend on anything that failed, if so add to failure set - bool depsMissing = false; - // only go over types we could actually break via missing upload/build (i.e. pipelines are unbreakable) - if constexpr (IsTLAS) - { - // A built TLAS cannot be queried about the BLASes it contains, so just trust the pre-TLAS-build input validation did its job - } - - if constexpr (std::is_same_v) - depsMissing = missingDependent.template operator()(item.first->getUnderlyingBuffer()); - if constexpr (std::is_same_v) - depsMissing = missingDependent.template operator()(item.first->getCreationParameters().image.get()); - if constexpr (std::is_same_v) - { - const IGPUDescriptorSetLayout* layout = item.first->getLayout(); - // check samplers - { - const auto count = layout->getTotalMutableCombinedSamplerCount(); - const auto* samplers = item.first->getAllMutableCombinedSamplers(); - for (auto i=0u; !depsMissing && i(samplers[i].get()); - } - for (auto i=0u; !depsMissing && i(asset::IDescriptor::E_TYPE::ET_COUNT); i++) + auto& stagingCache = std::get>(reservations.m_stagingCaches); + phmap::erase_if(stagingCache,[&](auto& item)->bool { - const auto type = static_cast(i); - const auto count = layout->getTotalDescriptorCount(type); - auto* psDescriptors = item.first->getAllDescriptors(type); - if (!psDescriptors) - continue; - for (auto i=0u; !depsMissing && i) + depsMissing = missingDependent.template operator()(pGpuObj->getUnderlyingBuffer()); + if constexpr (std::is_same_v) + depsMissing = missingDependent.template operator()(pGpuObj->getCreationParameters().image.get()); + if constexpr (std::is_same_v) { - auto* untypedDesc = psDescriptors[i].get(); - if (untypedDesc) - switch (asset::IDescriptor::GetTypeCategory(type)) + const IGPUDescriptorSetLayout* layout = pGpuObj->getLayout(); + // check samplers { - case asset::IDescriptor::EC_BUFFER: - depsMissing = missingDependent.template 
operator()(static_cast(untypedDesc)); - break; - case asset::IDescriptor::EC_SAMPLER: - depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); - break; - case asset::IDescriptor::EC_IMAGE: - depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); - break; - case asset::IDescriptor::EC_BUFFER_VIEW: - depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); - break; - case asset::IDescriptor::EC_ACCELERATION_STRUCTURE: - depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); - break; - default: - assert(false); - depsMissing = true; - break; + const auto count = layout->getTotalMutableCombinedSamplerCount(); + const auto* samplers = pGpuObj->getAllMutableCombinedSamplers(); + for (auto i=0u; !depsMissing && i(samplers[i].get()); + } + for (auto i=0u; !depsMissing && i(asset::IDescriptor::E_TYPE::ET_COUNT); i++) + { + const auto type = static_cast(i); + const auto count = layout->getTotalDescriptorCount(type); + auto* psDescriptors = pGpuObj->getAllDescriptors(type); + if (!psDescriptors) + continue; + for (auto i=0u; !depsMissing && i(static_cast(untypedDesc)); + break; + case asset::IDescriptor::EC_SAMPLER: + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); + break; + case asset::IDescriptor::EC_IMAGE: + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); + break; + case asset::IDescriptor::EC_BUFFER_VIEW: + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); + break; + case asset::IDescriptor::EC_ACCELERATION_STRUCTURE: + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); + break; + default: + assert(false); + depsMissing = true; + break; + } + } } } + if (depsMissing) + { + smart_refctd_ptr dummy; + // I know what I'm doing (breaking the promise of the `erase_if` to not mutate the inputs) + markFailure("because conversion of a dependant failed!",&dummy,&item.second); + } 
+ return depsMissing; } - } - auto* pGpuObj = item.first; - if (depsMissing) - { - logger.log("GPU Obj %s not writing to final cache because conversion of a dependant failed!",system::ILogger::ELL_ERROR,pGpuObj->getObjectDebugName()); - // wipe self, to let users know - item.second.value = {}; - continue; - } - // The BLASes don't need to do this, because no-one checks for them as dependents and we can substitute the `item.first` in the staging cache right away - // For TLASes we need to write the compacted TLAS and not the intermediate build to the Cache - if constexpr (IsTLAS) + ); + }; + // Bottom up, only go over types we could actually break via missing upload/build (i.e. pipelines are unbreakable) + // A built TLAS cannot be queried about the BLASes it contains, so just trust the pre-TLAS-build input validation did its job + checkDependents.template operator()(); + checkDependents.template operator()(); + checkDependents.template operator()(); +// mergeCache.template operator()(); + // overwrite the compacted TLASes in Descriptor Sets + if (auto& tlasRewriteSet=reservations.m_potentialTLASRewrites; !tlasRewriteSet.empty()) + { + core::vector writes; + writes.reserve(tlasRewriteSet.size()); + core::vector infos(tlasRewriteSet.size()); + auto* pInfo = infos.data(); + for (auto& entry : tlasRewriteSet) { - auto found = compactedTLASMap.find(pGpuObj); - if (found!=compactedTLASMap.end()) - pGpuObj = found->second.get(); - + auto* const dstSet = entry.dstSet; + // we need to check if the descriptor set itself didn't get deleted in the meantime + if (missingDependent.template operator()(dstSet)) + continue; + // rewtrieve the binding from the TLAS + const auto* const tlas = static_cast(dstSet->getAllDescriptors(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE)[entry.storageOffset.data].get()); + assert(tlas); + // only rewrite if successfully compacted + if (const auto foundCompacted=compactedTLASMap.find(tlas); foundCompacted!=compactedTLASMap.end()) + { + 
pInfo->desc = foundCompacted->second; + using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect; + const redirect_t& redirect = dstSet->getLayout()->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE); + const auto bindingRange = redirect.findBindingStorageIndex(entry.storageOffset); + const auto firstElementOffset = redirect.getStorageOffset(bindingRange); + writes.push_back({ + .dstSet = dstSet, + .binding = redirect.getBinding(bindingRange).data, + .arrayElement = entry.storageOffset.data-firstElementOffset.data, + .count = 1, + .info = pInfo++ + }); + } } - // We have success now, but ask callback if we write to the new cache. - if (!params.writeCache(item.second)) // TODO: let the user know the pointer to the GPU Object too? - continue; - asset_cached_t cached; - cached.value = core::smart_refctd_ptr::video_t>(pGpuObj); - cache.m_reverseMap.emplace(pGpuObj,item.second); - cache.m_forwardMap.emplace(item.second,std::move(cached)); + // if the descriptor write fails, we make the Descriptor Sets behave as-if the TLAS build failed (dep is missing) + if (!writes.empty() && !device->updateDescriptorSets(writes,{})) + logger.log("Failed to write one of the compacted TLASes into a Descriptor Set, all Descriptor Sets will still use non-compacted TLASes",system::ILogger::ELL_ERROR); } - }; - // again, need to go bottom up so we can check dependencies being successes - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - // overwrite the compacted TLASes in Descriptor Sets - if (auto& 
tlasRewriteSet=reservations.m_potentialTLASRewrites; !tlasRewriteSet.empty()) + } + + // insert items into cache if overflows handled fine and commandbuffers ready to be recorded + core::for_each_in_tuple(reservations.m_stagingCaches,[&](SReserveResult::staging_cache_t& stagingCache)->void { - core::vector writes; - writes.reserve(tlasRewriteSet.size()); - core::vector infos(tlasRewriteSet.size()); - auto* pInfo = infos.data(); - for (auto& entry : tlasRewriteSet) + auto& cache = std::get>(m_caches); + cache.m_forwardMap.reserve(cache.m_forwardMap.size()+stagingCache.size()); + cache.m_reverseMap.reserve(cache.m_reverseMap.size()+stagingCache.size()); + for (auto& item : stagingCache) + if (item.second.gpuRef) // not wiped { - auto* const dstSet = entry.dstSet; - // we need to check if the descriptor set itself didn't get deleted in the meantime - auto& stagingCache = std::get>(reservations.m_stagingCaches); - const auto found = stagingCache.find(dstSet); - if (found==stagingCache.end()) + // We have success now, but ask callback if we write to the new cache. + if (!params.writeCache(item.second.cacheKey)) // TODO: let the user know the pointer to the GPU Object too? 
continue; - // rewtrieve the binding from the TLAS - const auto* const tlas = static_cast(dstSet->getAllDescriptors(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE)[entry.storageOffset.data].get()); - assert(tlas); - // only rewrite if successfully compacted - if (const auto foundCompacted=compactedTLASMap.find(tlas); foundCompacted!=compactedTLASMap.end()) - { - pInfo->desc = foundCompacted->second; - using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect; - const redirect_t& redirect = dstSet->getLayout()->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE); - const auto bindingRange = redirect.findBindingStorageIndex(entry.storageOffset); - const auto firstElementOffset = redirect.getStorageOffset(bindingRange); - writes.push_back({ - .dstSet = dstSet, - .binding = redirect.getBinding(bindingRange).data, - .arrayElement = entry.storageOffset.data-firstElementOffset.data, - .count = 1, - .info = pInfo++ - }); - } + asset_cached_t cached; + cached.value = std::move(item.second.gpuRef); + cache.m_reverseMap.emplace(item.first,item.second.cacheKey); + cache.m_forwardMap.emplace(item.second.cacheKey,std::move(cached)); } - // if the descriptor write fails, we make the Descriptor Sets behave as-if the TLAS build failed (dep is missing) - if (!writes.empty() && !device->updateDescriptorSets(writes,{})) - logger.log("Failed to write one of the compacted TLASes into a Descriptor Set, all Descriptor Sets will still use non-compacted TLASes",system::ILogger::ELL_ERROR); - } - mergeCache.template operator()(); -// mergeCache.template operator()(); + // provoke refcounting bugs ASAP + stagingCache.clear(); + }); // no submit was necessary, so should signal the extra semaphores from the host if (!retval.blocking()) From 0b791b545b40734c17c240dca92837ebcf8cb5c5 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:26:59 +0700 Subject: [PATCH 086/346] Fix discardDependantsContents and anyDependantDiscardedContents to use computeDependants --- 
include/nbl/asset/IPreHashed.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/include/nbl/asset/IPreHashed.h b/include/nbl/asset/IPreHashed.h index 4bc5ca5dcd..4ffda209df 100644 --- a/include/nbl/asset/IPreHashed.h +++ b/include/nbl/asset/IPreHashed.h @@ -43,28 +43,28 @@ class IPreHashed : public IAsset { struct stack_entry_t { - IAsset* asset; - size_t childCount = 0; - size_t childrenVisited = 0; + const IAsset* asset; + core::unordered_set unvisitedChilds; }; core::stack stack; core::unordered_set alreadyVisited; - auto push = [&stack,&alreadyVisited](IAsset* node) -> void + auto push = [&stack,&alreadyVisited](const IAsset* node) -> void { if (!node) return; const auto [dummy,inserted] = alreadyVisited.insert(node); if (inserted) - stack.push({.asset=node,.childCount=node->getDependantCount()}); + stack.push({ .asset = node, .unvisitedChilds = node->computeDependants()}); }; for (const auto& root : roots) push(root); while (!stack.empty()) { auto& entry = stack.top(); - if (entry.childrenVisited 0) { - const auto dep = entry.asset->getDependant(entry.childrenVisited++); + auto dep = *entry.unvisitedChilds.begin(); + entry.unvisitedChilds.erase(entry.unvisitedChilds.begin()); push(dep); } else @@ -82,8 +82,7 @@ class IPreHashed : public IAsset struct stack_entry_t { const IAsset* asset; - size_t childCount = 0; - size_t childrenVisited = 0; + core::unordered_set unvisitedChilds; }; core::stack stack; core::unordered_set alreadyVisited; @@ -97,7 +96,7 @@ class IPreHashed : public IAsset auto* isPrehashed = dynamic_cast(node); if (isPrehashed && isPrehashed->missingContent()) return true; - stack.push({.asset=node,.childCount=node->getDependantCount()}); + stack.push({ .asset = node, .unvisitedChilds = node->computeDependants() }); } return false; }; @@ -106,9 +105,11 @@ class IPreHashed : public IAsset while (!stack.empty()) { auto& entry = stack.top(); - if (entry.childrenVisited 0) { - const auto dep = 
entry.asset->getDependant(entry.childrenVisited++); + auto dep = *unvisitedChilds.begin(); + unvisitedChilds.erase(unvisitedChilds.begin()); if (push(dep)) return true; } From e8e43b1fe68f981f8b583941e0b90c359f51fbde Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:30:56 +0700 Subject: [PATCH 087/346] Add Ray Tracing Pipeline Asset to IAsset --- include/nbl/asset/IAsset.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index 3802536029..a1689daa63 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -94,6 +94,7 @@ class IAsset : virtual public core::IReferenceCounted ET_COMPUTE_PIPELINE = 1ull<<20, //!< asset::ICPUComputePipeline ET_PIPELINE_CACHE = 1ull<<21, //!< asset::ICPUPipelineCache ET_SCENE = 1ull<<22, //!< reserved, to implement later + ET_RAYTRACING_PIPELINE = 1ull << 23, //!< asset::ICPURayTracingPipeline ET_IMPLEMENTATION_SPECIFIC_METADATA = 1ull<<31u, //!< lights, etc. //! Reserved special value used for things like terminating lists of this enum From b9db6aa2e1b8a2297c621daab047b757e3b47c36 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:32:52 +0700 Subject: [PATCH 088/346] Remove unnecessary specInfo assignment in clone method --- include/nbl/asset/ICPUGraphicsPipeline.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 62b25443cc..e376300121 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -29,8 +29,6 @@ class ICPUGraphicsPipeline final : public ICPUPipeline clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { auto* newPipeline = new ICPUGraphicsPipeline(layout.get()); - for (auto i = 0; i < GRAPHICS_SHADER_STAGE_COUNT; i++) - newPipeline->m_specInfos[i] = m_specInfos[i]; newPipeline->m_params = m_params; newPipeline->m_renderpass = m_renderpass; From 
2ae6f7818428562f73ead04408a3ffa55e32066a Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:40:42 +0700 Subject: [PATCH 089/346] Move subgroup argument to computePipelineBase --- include/nbl/asset/ICPUPipeline.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 8b90458f21..ae2c64372d 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -69,8 +69,6 @@ class ICPUPipelineBase core::smart_refctd_ptr shader = nullptr; std::string entryPoint = ""; - IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize : 3 = IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement - uint8_t requireFullSubgroups : 1 = false; // Container choice implicitly satisfies: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 From 8de6d9a5992b8ff227a9e24cd9e0026ba1e49b80 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:41:08 +0700 Subject: [PATCH 090/346] Remove getDependantCount and getDependant and getDependant_impl from IAsset --- include/nbl/asset/IAsset.h | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index a1689daa63..c3950c4912 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -156,20 +156,6 @@ class IAsset : virtual public core::IReferenceCounted //! inline bool isMutable() const {return m_mutable;} - //! - virtual size_t getDependantCount() const = 0; - inline IAsset* getDependant(const size_t ix) - { - if (ix(this)->getDependant(ix); - return retval; - } - virtual core::unordered_set computeDependants() const = 0; virtual bool valid() const = 0; @@ -179,8 +165,6 @@ class IAsset : virtual public core::IReferenceCounted //! 
Pure virtual destructor to ensure no instantiation NBL_API2 virtual ~IAsset() = 0; - virtual IAsset* getDependant_impl(const size_t ix) = 0; - private: friend IAssetManager; bool m_mutable = true; From 3f6599267befa369bc171f21dac3af67d06f7a0d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:41:41 +0700 Subject: [PATCH 091/346] Implement computeDependants for ICPUGraphicsPIpeline --- include/nbl/asset/ICPUGraphicsPipeline.h | 26 ++++-------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index e376300121..0629f82f1c 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -43,15 +43,12 @@ class ICPUGraphicsPipeline final : public ICPUPipeline computeDependants() const override { - auto stageCount = 2; // the layout and renderpass + core::unordered_set dependants = { m_layout.get(), m_renderpass.get()}; for (const auto& info : m_specInfos) - { - if (info.shader) - stageCount++; - } - return stageCount; + if (info.shader) dependants.insert(info.shader.get()); + return dependants; } inline SCachedCreationParams& getCachedCreationParams() @@ -90,21 +87,6 @@ class ICPUGraphicsPipeline final : public ICPUPipeline(m_layout.get()); - if (ix==1) - return m_renderpass.get(); - size_t stageCount = 0; - for (auto& specInfo : m_specInfos) - { - if (specInfo.shader) - if ((stageCount++)==ix-2) return specInfo.shader.get(); - } - return nullptr; - } - std::array m_specInfos; private: From 89b8daaaf6618b4da8472629b3408ff55f85539e Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:42:03 +0700 Subject: [PATCH 092/346] Implement computeDependants for ICPURayTracingPIpeline --- include/nbl/asset/ICPURayTracingPipeline.h | 88 ++++++++++++---------- 1 file changed, 47 insertions(+), 41 deletions(-) diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 
23a1d82225..5be344d1f2 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -19,22 +19,10 @@ class ICPURayTracingPipeline final : public ICPUPipeline; public: - struct SHitGroupSpecInfo { - SShaderSpecInfo closestHit; - SShaderSpecInfo anyHit; - SShaderSpecInfo intersection; - - SHitGroupSpecInfo clone(uint32_t depth) const - { - auto newSpecInfo = *this; - if (depth > 0u) - { - newSpecInfo.closestHit.shader = core::smart_refctd_ptr_static_cast(this->closestHit.shader->clone(depth - 1u)); - newSpecInfo.anyHit.shader = core::smart_refctd_ptr_static_cast(this->anyHit.shader->clone(depth - 1u)); - newSpecInfo.intersection.shader = core::smart_refctd_ptr_static_cast(this->intersection.shader->clone(depth - 1u)); - } - return newSpecInfo; - } + struct SHitGroupSpecInfos { + core::vector closestHits; + core::vector anyHits; + core::vector intersections; }; static core::smart_refctd_ptr create(const ICPUPipelineLayout* layout) @@ -48,23 +36,18 @@ class ICPURayTracingPipeline final : public ICPUPipelinem_raygen = m_raygen.clone(depth); - newPipeline->m_misses.resize(m_misses.size()); - for (auto specInfo_i = 0u; specInfo_i < m_misses.size(); specInfo_i++) - { - newPipeline->m_misses[specInfo_i] = m_misses[specInfo_i].clone(depth); - } - - newPipeline->m_hitGroups.resize(m_hitGroups.size()); - for (auto specInfo_i = 0u; specInfo_i < m_misses.size(); specInfo_i++) - { - newPipeline->m_hitGroups[specInfo_i] = m_hitGroups[specInfo_i].clone(depth); - } - - newPipeline->m_callables.resize(m_callables.size()); - for (auto specInfo_i = 0u; specInfo_i < m_callables.size(); specInfo_i++) - { - newPipeline->m_callables[specInfo_i] = m_callables[specInfo_i].clone(depth); - } + auto cloneSpecInfos = [depth](const core::vector& specInfos) -> core::vector { + core::vector results; + results.resize(specInfos.size()); + for (auto specInfo_i = 0u; specInfo_i < specInfos.size(); specInfo_i++) + results[specInfo_i] = 
specInfos[specInfo_i].clone(depth); + return results; + }; + newPipeline->m_misses = cloneSpecInfos(m_misses); + newPipeline->m_hitGroups.anyHits = cloneSpecInfos(m_hitGroups.anyHits); + newPipeline->m_hitGroups.closestHits = cloneSpecInfos(m_hitGroups.closestHits); + newPipeline->m_hitGroups.intersections = cloneSpecInfos(m_hitGroups.intersections); + newPipeline->m_callables = cloneSpecInfos(m_callables); newPipeline->m_params = m_params; return core::smart_refctd_ptr(newPipeline); @@ -75,17 +58,39 @@ class ICPURayTracingPipeline final : public ICPUPipeline computeDependants() const override final { + core::unordered_set dependants; + dependants.insert(m_raygen.shader.get()); + for (const auto& missInfo : m_misses) dependants.insert(missInfo.shader.get()); + for (const auto& anyHitInfo : m_hitGroups.anyHits) dependants.insert(anyHitInfo.shader.get()); + for (const auto& closestHitInfo : m_hitGroups.closestHits) dependants.insert(closestHitInfo.shader.get()); + for (const auto& intersectionInfo : m_hitGroups.intersections) dependants.insert(intersectionInfo.shader.get()); + for (const auto& callableInfo : m_callables) dependants.insert(callableInfo.shader.get()); + return dependants; + } + inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) const override final { - switch (stage) - { - case hlsl::ShaderStage::ESS_RAYGEN: - return { &m_raygen, 1 }; - } + switch (stage) + { + case hlsl::ShaderStage::ESS_RAYGEN: + return { &m_raygen, 1 }; + case hlsl::ShaderStage::ESS_MISS: + return m_misses; + case hlsl::ShaderStage::ESS_ANY_HIT: + return m_hitGroups.anyHits; + case hlsl::ShaderStage::ESS_CLOSEST_HIT: + return m_hitGroups.closestHits; + case hlsl::ShaderStage::ESS_INTERSECTION: + return m_hitGroups.intersections; + case hlsl::ShaderStage::ESS_CALLABLE: + return m_callables; + + } return {}; } @@ -100,7 +105,8 @@ class ICPURayTracingPipeline final : public ICPUPipeline m_misses; - core::vector m_hitGroups; + SHitGroupSpecInfos m_hitGroups; core::vector 
m_callables; explicit ICPURayTracingPipeline(const ICPUPipelineLayout* layout) From 434d73e3063ef5a343ebf9a6909fbbb688a9553a Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:42:26 +0700 Subject: [PATCH 093/346] Fix IGraphicsPIpeline constructor --- include/nbl/asset/IGraphicsPipeline.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index ef49e4c03a..090a368c2f 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -110,7 +110,8 @@ class IGraphicsPipeline : public IPipeline, public IGraphics protected: explicit IGraphicsPipeline(const PipelineLayoutType* layout, const SCachedCreationParams& cachedParams, const renderpass_t* renderpass) : - IPipeline(core::smart_refctd_ptr(layout)), m_renderpass(core::smart_refctd_ptr(renderpass)) + IPipeline(core::smart_refctd_ptr(layout)), + m_params(cachedParams), m_renderpass(core::smart_refctd_ptr(renderpass)) {} SCachedCreationParams m_params = {}; From 1cd1771429d4bbb0c563273ad3f522dfa05e5c34 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:42:46 +0700 Subject: [PATCH 094/346] Remove SUBGROUP_SIZE from IPIpeline --- include/nbl/asset/IPipeline.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h index c458c34afe..eb64de0b0d 100644 --- a/include/nbl/asset/IPipeline.h +++ b/include/nbl/asset/IPipeline.h @@ -105,21 +105,6 @@ class IPipelineBase }; using FLAGS = CreationFlags; - // Nabla requires device's reported subgroup size to be between 4 and 128 - enum class SUBGROUP_SIZE : uint8_t - { - // No constraint but probably means `gl_SubgroupSize` is Dynamically Uniform - UNKNOWN = 0, - // Allows the Subgroup Uniform `gl_SubgroupSize` to be non-Dynamically Uniform and vary between Device's min and max - VARYING = 1, - // The rest we encode as log2(x) of the required value - REQUIRE_4 = 
2, - REQUIRE_8 = 3, - REQUIRE_16 = 4, - REQUIRE_32 = 5, - REQUIRE_64 = 6, - REQUIRE_128 = 7 - }; }; template From 5823a841f965293c6a53ca24dbdc3a91405d9913 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:43:21 +0700 Subject: [PATCH 095/346] Refactor IRayTracingPipeline to use new SShaderSpecInfo scheme --- include/nbl/asset/IRayTracingPipeline.h | 172 +----------------------- 1 file changed, 5 insertions(+), 167 deletions(-) diff --git a/include/nbl/asset/IRayTracingPipeline.h b/include/nbl/asset/IRayTracingPipeline.h index 0bc2d68653..50ab7ba3f3 100644 --- a/include/nbl/asset/IRayTracingPipeline.h +++ b/include/nbl/asset/IRayTracingPipeline.h @@ -14,35 +14,6 @@ namespace nbl::asset class IRayTracingPipelineBase : public virtual core::IReferenceCounted { public: - struct SShaderGroupsParams - { - struct SIndex - { - constexpr static inline uint32_t Unused = 0xffFFffFFu; - uint32_t index = Unused; - }; - - struct SHitGroup - { - uint32_t closestHit = SIndex::Unused; - uint32_t anyHit = SIndex::Unused; - uint32_t intersection = SIndex::Unused; - }; - - SIndex raygen; - std::span misses; - std::span hits; - std::span callables; - - inline uint32_t getShaderGroupCount() const - { - return 1 + hits.size() + misses.size() + callables.size(); - } - - }; - using SGeneralShaderGroup = SShaderGroupsParams::SIndex; - using SHitShaderGroup = SShaderGroupsParams::SHitGroup; - struct SCachedCreationParams final { uint32_t maxRecursionDepth : 6 = 0; @@ -53,152 +24,19 @@ class IRayTracingPipelineBase : public virtual core::IReferenceCounted template class IRayTracingPipeline : public IPipeline, public IRayTracingPipelineBase { - using base_creation_params_t = IPipeline::SCreationParams; - public: - - using SGeneralShaderGroupContainer = core::smart_refctd_dynamic_array; - using SHitShaderGroupContainer = core::smart_refctd_dynamic_array; - - struct SCreationParams : base_creation_params_t - { - public: - #define base_flag(F) 
static_cast(base_creation_params_t::FLAGS::F) - enum class FLAGS : uint64_t - { - NONE = base_flag(NONE), - DISABLE_OPTIMIZATIONS = base_flag(DISABLE_OPTIMIZATIONS), - ALLOW_DERIVATIVES = base_flag(ALLOW_DERIVATIVES), - FAIL_ON_PIPELINE_COMPILE_REQUIRED = base_flag(FAIL_ON_PIPELINE_COMPILE_REQUIRED), - EARLY_RETURN_ON_FAILURE = base_flag(EARLY_RETURN_ON_FAILURE), - SKIP_BUILT_IN_PRIMITIVES = 1<<12, - SKIP_AABBS = 1<<13, - NO_NULL_ANY_HIT_SHADERS = 1<<14, - NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, - NO_NULL_MISS_SHADERS = 1<<16, - NO_NULL_INTERSECTION_SHADERS = 1<<17, - ALLOW_MOTION = 1<<20, - }; - #undef base_flag - - protected: - using SpecInfo = IPipelineBase::SShaderSpecInfo; - template - inline bool impl_valid(ExtraLambda&& extra) const - { - if (!IPipeline::SCreationParams::layout) - return false; + using base_creation_params_t = IPipeline; - for (const auto info : shaders) - { - if (info.shader) - { - if (!extra(info)) - return false; - const auto stage = info.stage; - if ((stage & ~IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING) != 0) - return false; - if (!std::has_single_bit>(stage)) - return false; - } - else - { - // every shader must not be null. use SIndex::Unused to represent unused shader. 
- return false; - } - } - - auto getShaderStage = [this](size_t index) -> IShader::E_SHADER_STAGE - { - return shaders[index].stage; - }; - - auto isValidShaderIndex = [this, getShaderStage](size_t index, IShader::E_SHADER_STAGE expectedStage, bool is_unused_shader_forbidden) -> bool - { - if (index == SShaderGroupsParams::SIndex::Unused) - return !is_unused_shader_forbidden; - if (index >= shaders.size()) - return false; - if (getShaderStage(index) != expectedStage) - return false; - return true; - }; - - if (!isValidShaderIndex(shaderGroups.raygen.index, IShader::E_SHADER_STAGE::ESS_RAYGEN, true)) - { - return false; - } - - for (const auto& shaderGroup : shaderGroups.hits) - { - // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03470 - if (!isValidShaderIndex(shaderGroup.anyHit, - IShader::E_SHADER_STAGE::ESS_ANY_HIT, - bool(flags & FLAGS::NO_NULL_ANY_HIT_SHADERS))) - return false; - - // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03471 - if (!isValidShaderIndex(shaderGroup.closestHit, - IShader::E_SHADER_STAGE::ESS_CLOSEST_HIT, - bool(flags & FLAGS::NO_NULL_CLOSEST_HIT_SHADERS))) - return false; - - if (!isValidShaderIndex(shaderGroup.intersection, - IShader::E_SHADER_STAGE::ESS_INTERSECTION, - false)) - return false; - } - - for (const auto& shaderGroup : shaderGroups.misses) - { - if (!isValidShaderIndex(shaderGroup.index, - IShader::E_SHADER_STAGE::ESS_MISS, - false)) - return false; - } - - for (const auto& shaderGroup : shaderGroups.callables) - { - if (!isValidShaderIndex(shaderGroup.index, IShader::E_SHADER_STAGE::ESS_CALLABLE, false)) - return false; - } - return true; - } - - public: - inline bool valid() const - { - return impl_valid([](const SpecInfo& info)->bool - { - if (!info.valid()) - return false; - return false; - }); - } - - std::span shaders = {}; - SShaderGroupsParams shaderGroups; - SCachedCreationParams cached = {}; - // 
TODO: Could guess the required flags from SPIR-V introspection of declared caps - core::bitflag flags = FLAGS::NONE; - }; + public: inline const SCachedCreationParams& getCachedCreationParams() const { return m_params; } protected: - explicit IRayTracingPipeline(const SCreationParams& _params) : - IPipeline(core::smart_refctd_ptr(_params.layout)), - m_params(_params.cached), - m_raygenShaderGroup(_params.shaderGroups.raygen), - m_missShaderGroups(core::make_refctd_dynamic_array(_params.shaderGroups.misses)), - m_hitShaderGroups(core::make_refctd_dynamic_array(_params.shaderGroups.hits)), - m_callableShaderGroups(core::make_refctd_dynamic_array(_params.shaderGroups.callables)) + explicit IRayTracingPipeline(const PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : + IPipeline(core::smart_refctd_ptr(layout)), + m_params(cachedParams) {} SCachedCreationParams m_params; - SGeneralShaderGroup m_raygenShaderGroup; - SGeneralShaderGroupContainer m_missShaderGroups; - SHitShaderGroupContainer m_hitShaderGroups; - SGeneralShaderGroupContainer m_callableShaderGroups; }; From 10ec458eb572b567f82774b18bee541da566d275 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:44:10 +0700 Subject: [PATCH 096/346] Remove Subgroup related argument from IGPUPipeline --- include/nbl/video/IGPUPipeline.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index 826026d9aa..fc4bc8d219 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -71,8 +71,6 @@ class IGPUPipelineBase { const asset::IShader* shader = nullptr; std::string_view entryPoint = ""; - asset::IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize : 3 = asset::IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement - uint8_t requireFullSubgroups : 1 = false; // Container choice implicitly satisfies: // 
https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 From 39904f7d86c251491969619cb1a338618399dda2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:44:46 +0700 Subject: [PATCH 097/346] Refactor IGPUComputePipeline to use IComputePipeline --- include/nbl/video/IGPUComputePipeline.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 42503e1f12..065c567ee2 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -6,6 +6,7 @@ #include "nbl/asset/IPipeline.h" +#include "nbl/asset/IComputePipeline.h" #include "nbl/video/IGPUPipeline.h" #include "nbl/video/SPipelineCreationParams.h" @@ -14,9 +15,9 @@ namespace nbl::video { -class IGPUComputePipeline : public IGPUPipeline> +class IGPUComputePipeline : public IGPUPipeline> { - using pipeline_t = asset::IPipeline; + using pipeline_t = asset::IComputePipeline; public: struct SCreationParams final : SPipelineCreationParams From 2ce032f87550e3d1a57a638696d8cae62bee53d6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:45:28 +0700 Subject: [PATCH 098/346] Refactor IGPURayTracingPipeline to use new SShaderSpecInfo scheme --- include/nbl/video/IGPURayTracingPipeline.h | 145 ++++++++++++++++++++- 1 file changed, 143 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index c41ed333a1..2a6701c9e6 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -15,6 +15,147 @@ class IGPURayTracingPipeline : public IGPUPipeline; public: + struct SCreationParams + { + #define base_flag(F) static_cast(IPipelineBase::FLAGS::F) + enum class FLAGS : uint64_t + { + NONE = base_flag(NONE), + DISABLE_OPTIMIZATIONS = base_flag(DISABLE_OPTIMIZATIONS), + 
ALLOW_DERIVATIVES = base_flag(ALLOW_DERIVATIVES), + FAIL_ON_PIPELINE_COMPILE_REQUIRED = base_flag(FAIL_ON_PIPELINE_COMPILE_REQUIRED), + EARLY_RETURN_ON_FAILURE = base_flag(EARLY_RETURN_ON_FAILURE), + SKIP_BUILT_IN_PRIMITIVES = 1<<12, + SKIP_AABBS = 1<<13, + NO_NULL_ANY_HIT_SHADERS = 1<<14, + NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, + NO_NULL_MISS_SHADERS = 1<<16, + NO_NULL_INTERSECTION_SHADERS = 1<<17, + ALLOW_MOTION = 1<<20, + }; + #undef base_flag + + protected: + template + inline bool impl_valid(ExtraLambda&& extra) const + { + if (!m_layout) return false; + + for (const auto info : shaders) + { + if (info.shader) + { + if (!extra(info)) + return false; + const auto stage = info.stage; + if ((stage & ~hlsl::ShaderStage::ESS_ALL_RAY_TRACING) != 0) + return false; + if (!std::has_single_bit>(stage)) + return false; + } + else + { + // every shader must not be null. use SIndex::Unused to represent unused shader. + return false; + } + } + + auto getShaderStage = [this](size_t index) -> hlsl::ShaderStage + { + return shaders[index].stage; + }; + + auto isValidShaderIndex = [this, getShaderStage](size_t index, hlsl::ShaderStage expectedStage, bool is_unused_shader_forbidden) -> bool + { + if (index == SShaderGroupsParams::SIndex::Unused) + return !is_unused_shader_forbidden; + if (index >= shaders.size()) + return false; + if (getShaderStage(index) != expectedStage) + return false; + return true; + }; + + if (!isValidShaderIndex(shaderGroups.raygen.index, hlsl::ShaderStage::ESS_RAYGEN, true)) + { + return false; + } + + for (const auto& shaderGroup : shaderGroups.hits) + { + // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03470 + if (!isValidShaderIndex(shaderGroup.anyHit, + hlsl::ShaderStage::ESS_ANY_HIT, + bool(flags & FLAGS::NO_NULL_ANY_HIT_SHADERS))) + return false; + + // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03471 + if 
(!isValidShaderIndex(shaderGroup.closestHit, + hlsl::ShaderStage::ESS_CLOSEST_HIT, + bool(flags & FLAGS::NO_NULL_CLOSEST_HIT_SHADERS))) + return false; + + if (!isValidShaderIndex(shaderGroup.intersection, + hlsl::ShaderStage::ESS_INTERSECTION, + false)) + return false; + } + + for (const auto& shaderGroup : shaderGroups.misses) + { + if (!isValidShaderIndex(shaderGroup.index, + hlsl::ShaderStage::ESS_MISS, + false)) + return false; + } + + for (const auto& shaderGroup : shaderGroups.callables) + { + if (!isValidShaderIndex(shaderGroup.index, hlsl::ShaderStage::ESS_CALLABLE, false)) + return false; + } + return true; + } + + public: + inline bool valid() const + { + return impl_valid([](const SShaderSpecInfo& info)->bool + { + if (!info.valid()) + return false; + return false; + }); + } + + struct SShaderGroupsParams + { + struct SHitGroup + { + SShaderSpecInfo closestHit; + SShaderSpecInfo anyHit; + SShaderSpecInfo intersection; + }; + + SShaderSpecInfo raygen; + std::span misses; + std::span hits; + std::span callables; + + inline uint32_t getShaderGroupCount() const + { + return 1 + hits.size() + misses.size() + callables.size(); + } + + }; + + SShaderGroupsParams shaderGroups; + + SCachedCreationParams cached = {}; + // TODO: Could guess the required flags from SPIR-V introspection of declared caps + core::bitflag flags = FLAGS::NONE; + }; + struct SShaderGroupHandle { @@ -62,7 +203,7 @@ class IGPURayTracingPipeline : public IGPUPipelinebool + const bool valid = pipeline_t::SCreationParams::impl_valid([&retval](const SShaderSpecInfo& info)->bool { const auto dataSize = info.valid(); if (dataSize<0) @@ -81,7 +222,7 @@ class IGPURayTracingPipeline : public IGPUPipeline getShaders() const { return shaders; } + inline std::span getShaders() const { return shaders; } IGPUPipelineLayout* layout = nullptr; }; From 058657b8defebb5eff9ea56431bd1f9e20ffc4b2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 13 May 2025 15:45:42 +0700 Subject: [PATCH 099/346] Restore 
deleted comments --- include/nbl/video/SPipelineCreationParams.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/SPipelineCreationParams.h b/include/nbl/video/SPipelineCreationParams.h index 969559d941..489bff4343 100644 --- a/include/nbl/video/SPipelineCreationParams.h +++ b/include/nbl/video/SPipelineCreationParams.h @@ -49,7 +49,7 @@ struct SPipelineCreationParams return basePipelineIndex!=NotDerivingFromPreviousPipeline || basePipeline; } - + // If you set this, then we don't take `basePipelineIndex` into account, the pointer takes precedence const PipelineType* basePipeline = nullptr; int32_t basePipelineIndex = NotDerivingFromPreviousPipeline; }; From 55703e5ee459bde1858a93a03bd046c4ad7a3cb6 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 13 May 2025 10:57:40 +0200 Subject: [PATCH 100/346] mark off what's been implemented --- .../utilities/IGPUObjectFromAssetConverter.h | 163 ------------------ 1 file changed, 163 deletions(-) diff --git a/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h b/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h index 600197611b..b7ffc5d0c1 100644 --- a/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h +++ b/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h @@ -11,128 +11,6 @@ #include "nbl/video/ILogicalDevice.h" #if 0 -auto IGPUObjectFromAssetConverter::create(const asset::ICPUAccelerationStructure** _begin, const asset::ICPUAccelerationStructure** _end, SParams& _params) -> created_gpu_object_array -{ - const size_t assetCount = std::distance(_begin, _end); - auto res = core::make_refctd_dynamic_array >(assetCount); - auto toCreateAndBuild = std::vector(); - auto buildRangeInfos = std::vector(); - toCreateAndBuild.reserve(assetCount); - buildRangeInfos.reserve(assetCount); - // Lambda function: creates the acceleration structure and It's buffer - auto allocateBufferAndCreateAccelerationStructure = [&](size_t asSize, const asset::ICPUAccelerationStructure* 
cpuas) - { - // Create buffer with cpuas->getAccelerationStructureSize - IGPUBuffer::SCreationParams gpuBufParams = {}; - gpuBufParams.size = asSize; - gpuBufParams.usage = core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - auto gpubuf = _params.device->createBuffer(std::move(gpuBufParams)); - auto mreqs = gpubuf->getMemoryReqs(); - mreqs.memoryTypeBits &= _params.device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto gpubufMem = _params.device->allocate(mreqs, gpubuf.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - assert(gpubufMem.isValid()); - - // Create GPUAccelerationStructure with that buffer - IGPUAccelerationStructure::SCreationParams creatationParams = {}; - creatationParams.bufferRange.buffer = gpubuf; - creatationParams.bufferRange.offset = 0; - creatationParams.bufferRange.size = asSize; - creatationParams.flags = cpuas->getCreationParameters().flags; - creatationParams.type = cpuas->getCreationParameters().type; - return _params.device->createAccelerationStructure(std::move(creatationParams)); - }; - - for (ptrdiff_t i = 0u; i < assetCount; ++i) - { - const asset::ICPUAccelerationStructure* cpuas = _begin[i]; - - if(cpuas->hasBuildInfo()) - { - // Add to toBuild vector of ICPUAccelerationStructure - toCreateAndBuild.push_back(cpuas); - buildRangeInfos.push_back(const_cast(cpuas->getBuildRanges().begin())); - } - else if(cpuas->getAccelerationStructureSize() > 0) - { - res->operator[](i) = allocateBufferAndCreateAccelerationStructure(cpuas->getAccelerationStructureSize(), cpuas); - } - } - - if(toCreateAndBuild.empty() == false) - { - bool hostBuildCommands = false; // get from SFeatures - if(hostBuildCommands) - { - _NBL_TODO(); - } - else - { - core::vector cpuBufferDeps; - constexpr uint32_t MaxGeometryPerBuildInfo = 16; - constexpr uint32_t MaxBuffersPerGeometry = 3; // TrianglesData -> vertex+index+transformation - cpuBufferDeps.reserve(assetCount * 
MaxGeometryPerBuildInfo * MaxBuffersPerGeometry); - - // Get CPUBuffer Dependencies - for (ptrdiff_t i = 0u; i < toCreateAndBuild.size(); ++i) - { - const asset::ICPUAccelerationStructure* cpuas = toCreateAndBuild[i]; - - auto buildInfo = cpuas->getBuildInfo(); - assert(buildInfo != nullptr); - - auto geoms = buildInfo->getGeometries().begin(); - auto geomsCount = buildInfo->getGeometries().size(); - if(geomsCount == 0) - { - assert(false); - continue; - } - - for(uint32_t g = 0; g < geomsCount; ++g) - { - const auto& geom = geoms[g]; - if(geom.type == asset::IAccelerationStructure::EGT_TRIANGLES) - { - if(geom.data.triangles.indexData.isValid()) - { - auto cpuBuf = geom.data.triangles.indexData.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - if(geom.data.triangles.vertexData.isValid()) - { - auto cpuBuf = geom.data.triangles.vertexData.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - if(geom.data.triangles.transformData.isValid()) - { - auto cpuBuf = geom.data.triangles.transformData.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - } - else if(geom.type == asset::IAccelerationStructure::EGT_AABBS) - { - if(geom.data.aabbs.data.isValid()) - { - auto cpuBuf = geom.data.aabbs.data.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - } - else if(geom.type == asset::IAccelerationStructure::EGT_INSTANCES) - { - 
if(geom.data.instances.data.isValid()) - { - auto cpuBuf = geom.data.instances.data.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - } - } - } - // Convert CPUBuffer Deps to GPUBuffers core::vector redirs = eliminateDuplicatesAndGenRedirs(cpuBufferDeps); auto gpuBufs = getGPUObjectsFromAssets(cpuBufferDeps.data(), cpuBufferDeps.data()+cpuBufferDeps.size(), _params); @@ -285,47 +163,6 @@ auto IGPUObjectFromAssetConverter::create(const asset::ICPUAccelerationStructure auto & gpuBuildInfo = buildGeomInfos[i]; gpuBuildInfo.scratchAddr.buffer = gpuScratchBuf; } - - // Record CommandBuffer for Building (We have Completed buildInfos + buildRanges for each CPUAS) - auto & fence = _params.fences[EQU_COMPUTE]; - fence = _params.device->createFence(static_cast(0)); - core::smart_refctd_ptr cmdbuf = _params.perQueue[EQU_COMPUTE].cmdbuf; - - IQueue::SSubmitInfo submit; - { - submit.commandBufferCount = 1u; - submit.commandBuffers = &cmdbuf.get(); - submit.waitSemaphoreCount = 0u; - submit.pWaitDstStageMask = nullptr; - submit.pWaitSemaphores = nullptr; - uint32_t waitSemaphoreCount = 0u; - } - - assert(cmdbuf->getState() == IGPUCommandBuffer::STATE::RECORDING); - cmdbuf->buildAccelerationStructures({buildGeomInfos.data(),buildGeomInfos.data()+buildGeomInfos.size()},buildRangeInfos.data()); - cmdbuf->end(); - - // TODO for future to make this function more sophisticated: Compaction, MemoryLimit for Build - - core::smart_refctd_ptr sem; - - if (_params.perQueue[EQU_COMPUTE].semaphore) - sem = _params.device->createSemaphore(); - - auto* sem_ptr = sem.get(); - auto* fence_ptr = fence.get(); - - submit.signalSemaphoreCount = sem_ptr?1u:0u; - submit.pSignalSemaphores = sem_ptr?&sem_ptr:nullptr; - - _params.perQueue[EQU_COMPUTE].queue->submit(1u, &submit, fence_ptr); - if (_params.perQueue[EQU_COMPUTE].semaphore) - 
_params.perQueue[EQU_COMPUTE].semaphore[0] = std::move(sem); - } - } - - return res; -} #endif #endif From c4aefda23a1106dc8f18e14a6896dffcd9a4bc4c Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 13 May 2025 11:19:05 +0200 Subject: [PATCH 101/346] protect against `IPreHashed` assets which don't have a valid precomputed hash --- src/nbl/video/utilities/CAssetConverter.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index de72e2f360..d678159511 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1153,6 +1153,8 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_thostBuild; hasher << lookup.patch->compactAfterBuild; // finally the contents + if (lookup.asset->getContentHash()==NoContentHash) + return false; hasher << lookup.asset->getContentHash(); return true; } @@ -1232,6 +1234,8 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t look creationFlags |= create_flags_t::ECF_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT; hasher << creationFlags; // finally the contents + if (lookup.asset->getContentHash()==NoContentHash) + return false; hasher << lookup.asset->getContentHash(); return true; } @@ -1335,6 +1339,8 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_tdata(),entry.first.meta->size()); } + if (lookup.asset->getContentHash()==NoContentHash) + return false; hasher << lookup.asset->getContentHash(); return true; } From 1c0e72efdf18c17c474e6494a3850f3f132afbcb Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 14 May 2025 15:28:55 +0700 Subject: [PATCH 102/346] split config into new file --- examples_tests | 2 +- .../nbl/builtin/hlsl/subgroup2/ballot.hlsl | 13 +++ .../nbl/builtin/hlsl/workgroup2/config.hlsl | 88 +++++++++++++++++++ .../builtin/hlsl/workgroup2/shared_scan.hlsl | 86 ++---------------- 4 files changed, 111 insertions(+), 78 deletions(-) create mode 100644 
include/nbl/builtin/hlsl/workgroup2/config.hlsl diff --git a/examples_tests b/examples_tests index 20011f5fdd..4a951b307b 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 20011f5fdd3e8454bb830ded6f4221ec75036809 +Subproject commit 4a951b307b09ecf4a054f7ac27d4dac01f5e8fb9 diff --git a/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl b/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl index 724887b995..6c7ec4f593 100644 --- a/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl +++ b/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl @@ -11,6 +11,19 @@ namespace hlsl namespace subgroup2 { +uint32_t LastSubgroupInvocation() +{ + // why this code was wrong before: + // - only compute can use SubgroupID + // - but there's no mapping of InvocationID to SubgroupID and Index + return glsl::subgroupBallotFindMSB(glsl::subgroupBallot(true)); +} + +bool ElectLast() +{ + return glsl::gl_SubgroupInvocationID()==LastSubgroupInvocation(); +} + template struct Configuration { diff --git a/include/nbl/builtin/hlsl/workgroup2/config.hlsl b/include/nbl/builtin/hlsl/workgroup2/config.hlsl new file mode 100644 index 0000000000..7855cc1701 --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/config.hlsl @@ -0,0 +1,88 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_CONFIG_INCLUDED_ +#define _NBL_BUILTIN_HLSL_WORKGROUP2_CONFIG_INCLUDED_ + +#include "nbl/builtin/hlsl/cpp_compat.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +namespace impl +{ +template +struct virtual_wg_size_log2 +{ + NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v+SubgroupSizeLog2; +}; + +template +struct items_per_invocation +{ + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocationProductLog2 = mpl::max_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = BaseItemsPerInvocation; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value, ItemsPerInvocationProductLog2>::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v; +}; +} + +template +struct Configuration +{ + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; + static_assert(WorkgroupSizeLog2>=_SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); + + // must have at least enough level 0 outputs to feed a single subgroup + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroup = uint16_t(0x1u) << SubgroupsPerVirtualWorkgroupLog2; + + using virtual_wg_t = impl::virtual_wg_size_log2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; + NBL_CONSTEXPR_STATIC_INLINE uint16_t 
VirtualWorkgroupSize = uint16_t(0x1u) << virtual_wg_t::value; + using items_per_invoc_t = impl::items_per_invocation; + // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = items_per_invoc_t::value0; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = items_per_invoc_t::value1; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2; + static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); + + NBL_CONSTEXPR_STATIC_INLINE uint16_t SharedMemSize = conditional_value::value + SubgroupsPerVirtualWorkgroup*ItemsPerInvocation_1; +}; + +// special case when workgroup size 2048 and subgroup size 16 needs 3 levels and virtual workgroup size 4096 to get a full subgroup scan each on level 1 and 2 16x16x16=4096 +// specializing with macros because of DXC bug: https://github.com/microsoft/DirectXShaderCom0piler/issues/7007 +#define SPECIALIZE_CONFIG_CASE_2048_16(ITEMS_PER_INVOC) template<>\ +struct Configuration<11, 4, ITEMS_PER_INVOC>\ +{\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << 11u;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = uint16_t(4u);\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroupLog2 = 7u;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroup = 128u;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = 3u;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << 4096;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = ITEMS_PER_INVOC;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = 1u;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = 
1u;\ +};\ + +SPECIALIZE_CONFIG_CASE_2048_16(1) +SPECIALIZE_CONFIG_CASE_2048_16(2) +SPECIALIZE_CONFIG_CASE_2048_16(4) + +} +} +} + +#undef SPECIALIZE_CONFIG_CASE_2048_16 + +#endif diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 1abd9cccd2..b03120b5f6 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -4,88 +4,20 @@ #ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_SHARED_SCAN_INCLUDED_ #define _NBL_BUILTIN_HLSL_WORKGROUP2_SHARED_SCAN_INCLUDED_ -#include "nbl/builtin/hlsl/cpp_compat.hlsl" #include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" -#include "nbl/builtin/hlsl/subgroup/ballot.hlsl" +#include "nbl/builtin/hlsl/subgroup2/ballot.hlsl" #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" #include "nbl/builtin/hlsl/mpl.hlsl" +#include "nbl/builtin/hlsl/workgroup2/config.hlsl" -namespace nbl +namespace nbl { namespace hlsl { namespace workgroup2 { -namespace impl -{ -template -struct virtual_wg_size_log2 -{ - NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v+SubgroupSizeLog2; -}; - -template -struct items_per_invocation -{ - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocationProductLog2 = mpl::max_v; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = BaseItemsPerInvocation; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value, ItemsPerInvocationProductLog2>::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v; -}; -} - -template -struct Configuration -{ - NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t 
SubgroupSizeLog2 = uint16_t(_SubgroupSizeLog2); - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; - static_assert(WorkgroupSizeLog2>=_SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); - - // must have at least enough level 0 outputs to feed a single subgroup - NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; - NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = 0x1u << SubgroupsPerVirtualWorkgroupLog2; - - using virtual_wg_t = impl::virtual_wg_size_log2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; - NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << virtual_wg_t::value; - using items_per_invoc_t = impl::items_per_invocation; - // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = items_per_invoc_t::value0; - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = items_per_invoc_t::value1; - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_2 = items_per_invoc_t::value2; - static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); - - NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedMemSize = conditional_value::value + SubgroupsPerVirtualWorkgroup*ItemsPerInvocation_1; -}; - -// special case when workgroup size 2048 and subgroup size 16 needs 3 levels and virtual workgroup size 4096 to get a full subgroup scan each on level 1 and 2 16x16x16=4096 -// specializing with macros because of DXC bug: https://github.com/microsoft/DirectXShaderCom0piler/issues/7007 -#define SPECIALIZE_CONFIG_CASE_2048_16(ITEMS_PER_INVOC) template<>\ -struct Configuration<11, 4, ITEMS_PER_INVOC>\ -{\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) 
<< 11u;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = uint16_t(4u);\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2;\ - NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroupLog2 = 7u;\ - NBL_CONSTEXPR_STATIC_INLINE uint32_t SubgroupsPerVirtualWorkgroup = 128u;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = 3u;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << 4096;\ - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_0 = ITEMS_PER_INVOC;\ - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_1 = 1u;\ - NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation_2 = 1u;\ -};\ - -SPECIALIZE_CONFIG_CASE_2048_16(1) -SPECIALIZE_CONFIG_CASE_2048_16(2) -SPECIALIZE_CONFIG_CASE_2048_16(4) - -#undef SPECIALIZE_CONFIG_CASE_2048_16 - - namespace impl { @@ -171,7 +103,7 @@ struct reduce { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = reduction0(scan_local[idx]); - if (subgroup::ElectLast()) + if (subgroup2::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); @@ -228,7 +160,7 @@ struct scan { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = inclusiveScan0(scan_local[idx]); - if (subgroup::ElectLast()) + if (subgroup2::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + 
(virtualSubgroupID/Config::ItemsPerInvocation_1); @@ -304,7 +236,7 @@ struct reduce { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = reduction0(scan_local[idx]); - if (subgroup::ElectLast()) + if (subgroup2::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); @@ -322,7 +254,7 @@ struct reduce for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = reduction1(lv1_val); - if (subgroup::ElectLast()) + if (subgroup2::ElectLast()) { const uint32_t bankedIndex = (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); scratchAccessor.set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); @@ -380,7 +312,7 @@ struct scan { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = inclusiveScan0(scan_local[idx]); - if (subgroup::ElectLast()) + if (subgroup2::ElectLast()) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); @@ -399,7 +331,7 @@ struct scan for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = inclusiveScan1(lv1_val); - if (subgroup::ElectLast()) + if 
(subgroup2::ElectLast()) { const uint32_t bankedIndex = (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); scratchAccessor.set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); From a8794023e368990182b498ccbe5328187fe2662e Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 14 May 2025 11:30:46 +0200 Subject: [PATCH 103/346] add more debug for @kept_secret --- src/nbl/video/utilities/CAssetConverter.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index d678159511..ea4dbf8b0f 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2918,10 +2918,10 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } } } - if (!buildSize) + if (buildSize==0 || sizes.buildScratchSize==0) { inputs.logger.log( - "Build Size Input is 0 for Acceleration Structure %8llx%8llx%8llx%8llx", + "Build Size Input is 0 or failed the call to `ILogicalDevice::getAccelerationStructureBuildSizes` for Acceleration Structure %8llx%8llx%8llx%8llx", system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] ); continue; From 61e44254917c3432d698c64545e88c29f8e4fa00 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 14 May 2025 11:33:53 +0200 Subject: [PATCH 104/346] got the BLAS build size query CPU vs GPU input buffer parameters wrong way around --- src/nbl/video/utilities/CAssetConverter.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index ea4dbf8b0f..1f28c3ac0f 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2862,15 +2862,15 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult const 
auto geoms = as->getAABBGeometries(); if (patch.hostBuild) { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() }; sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); } else { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() }; sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); } @@ -2885,15 +2885,15 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult const auto geoms = as->getTriangleGeometries(); if (patch.hostBuild) { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() }; sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); } else { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() + const std::span> cpuGeoms = { + reinterpret_cast*>(geoms.data()),geoms.size() }; sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); } From d3ff417cc616d4560eb0979ad242274f9cd5a2b6 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 14 May 2025 16:42:44 +0700 Subject: [PATCH 105/346] as fixes to asset converter --- examples_tests | 2 +- include/nbl/asset/ICPUAccelerationStructure.h | 2 +- include/nbl/video/asset_traits.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples_tests b/examples_tests index 8c76367c1c..16b7349f55 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 8c76367c1c226cce3d66f1c60f540e29a501a1cb +Subproject commit 16b7349f55344cafc8ec9ab28ce72e129fe938bd diff --git a/include/nbl/asset/ICPUAccelerationStructure.h 
b/include/nbl/asset/ICPUAccelerationStructure.h index 9c9af32f7b..a2229309b5 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -140,7 +140,7 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo inline core::blake3_hash_t computeContentHash() const override { - if (!missingContent()) + if (missingContent()) return INVALID_HASH; const bool isAABB = m_buildFlags.hasFlags(BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT); core::blake3_hasher hasher; diff --git a/include/nbl/video/asset_traits.h b/include/nbl/video/asset_traits.h index 77bab76f64..442060d879 100644 --- a/include/nbl/video/asset_traits.h +++ b/include/nbl/video/asset_traits.h @@ -194,7 +194,7 @@ struct asset_traits // the asset type using asset_t = asset::ICPUBottomLevelAccelerationStructure; // we don't need to descend during DFS into other assets - constexpr static inline bool HasChildren = true; + constexpr static inline bool HasChildren = false; // the video type using video_t = IGPUBottomLevelAccelerationStructure; // lookup type From 14320663adabe36dfe8e9d3aaef69f609250dc8c Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 14 May 2025 13:00:50 +0200 Subject: [PATCH 106/346] fix passing QueryOnly for the triangle version of `getVkASGeometryFrom` start some light validation code in `ILogicalDevice::getAccelerationStructureBuildSizes` for BLASes --- include/nbl/video/ILogicalDevice.h | 8 ++++++++ src/nbl/video/CVulkanAccelerationStructure.h | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index f2998d8e8c..93aa965416 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -455,6 +455,14 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe uint32_t primsFree = limits.maxAccelerationStructurePrimitiveCount; for (auto i=0u; i,Geometry>) + { + // TODO: do we check 
`maxVertex`, `vertexStride` and `indexType` for validity? + } + if constexpr (std::is_same_v,Geometry>) + { + // TODO: check stride and geometry flags for validity? + } if (pMaxPrimitiveCounts[i] > primsFree) { NBL_LOG_ERROR("Primitive count exceeds device limit"); diff --git a/src/nbl/video/CVulkanAccelerationStructure.h b/src/nbl/video/CVulkanAccelerationStructure.h index 6b94f9cad7..eb1e0534fe 100644 --- a/src/nbl/video/CVulkanAccelerationStructure.h +++ b/src/nbl/video/CVulkanAccelerationStructure.h @@ -134,7 +134,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles(triangles.transform); else { if constexpr (triangles.Host) @@ -147,7 +147,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase, VkAccelerationStructureGeometryMotionTrianglesDataNV* &p_vertexMotion) { - getVkASGeometryFrom(triangles,outBase); + getVkASGeometryFrom(triangles,outBase); if (triangles.vertexData[1].buffer) { p_vertexMotion->sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_MOTION_TRIANGLES_DATA_NV; From 5290d656649419e2334a5b8569ccf850157ef80b Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 14 May 2025 13:01:45 +0200 Subject: [PATCH 107/346] incorrect refactor revert --- src/nbl/video/CVulkanAccelerationStructure.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/CVulkanAccelerationStructure.h b/src/nbl/video/CVulkanAccelerationStructure.h index eb1e0534fe..b6c06f158d 100644 --- a/src/nbl/video/CVulkanAccelerationStructure.h +++ b/src/nbl/video/CVulkanAccelerationStructure.h @@ -134,7 +134,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles(triangles.transform); + outBase.geometry.triangles.transformData = DummyNonNullAddress; else { if constexpr (triangles.Host) From 6d8b728d048281c95550e5d0e11be6dae32f53ba Mon Sep 17 
00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 14 May 2025 13:29:26 +0200 Subject: [PATCH 108/346] update cmake/submodules/update.cmake, respect private submodules with git config on fly; update examples_tests submodule, private submodule is excluded from recurse update by default from now --- cmake/submodules/update.cmake | 23 +++++++++++++++++++---- examples_tests | 2 +- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/cmake/submodules/update.cmake b/cmake/submodules/update.cmake index 76e3603980..d0365c72ca 100644 --- a/cmake/submodules/update.cmake +++ b/cmake/submodules/update.cmake @@ -8,6 +8,9 @@ option(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE "Turn this ON to attempt to upda option(NBL_UPDATE_GIT_SUBMODULE_NO_SEPARATE_SHELL "Turn this ON to prevent CMake from executing git submodules update or sync in a separate shell - be aware that the interaction with shell will be impossible in case of paraphrase prompt request of your key!" ON) option(NBL_CI_GIT_SUBMODULES_SHALLOW "" OFF) +# TODO: replace all of this command recording & proxy logic with executing single recurse one-liner including -c options for private submodules +# once we have relative URLs + all .gitmodules configs are polished (so basically we don't have to set some config options on fly) + if(NOT DEFINED NBL_ROOT_PATH) get_filename_component(NBL_ROOT_PATH "${CMAKE_CURRENT_LIST_DIR}/../../" ABSOLUTE) endif() @@ -26,6 +29,18 @@ endif() function(NBL_UPDATE_SUBMODULES) ProcessorCount(_GIT_SUBMODULES_JOBS_AMOUNT_) + + set(PRIVATE_SUBMODULES + Ditt-Reference-Scenes + ) + + foreach(NBL_P_SUBMODULE_NAME ${PRIVATE_SUBMODULES}) + if(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE) + list(APPEND NBL_CONFIG_PRIVATE_SETUP_CMD "-c submodule.\"${NBL_P_SUBMODULE_NAME}\".update=checkout") + else() + list(APPEND NBL_CONFIG_PRIVATE_SETUP_CMD "-c submodule.\"${NBL_P_SUBMODULE_NAME}\".update=none") + endif() + endforeach() if(NBL_CI_GIT_SUBMODULES_SHALLOW) set(NBL_SHALLOW "--depth=1") @@ -54,9 +69,9 @@ 
function(NBL_UPDATE_SUBMODULES) endif() if(SHOULD_RECURSIVE) - set(_NBL_EXECUTE_COMMAND_ "\"${GIT_EXECUTABLE}\" -C \"${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}\" ${NBL_EXCLUDE} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} ${NBL_FORCE} --recursive ${NBL_SHALLOW} ${GIT_SUBMODULE_PATH}") + set(_NBL_EXECUTE_COMMAND_ "\"${GIT_EXECUTABLE}\" -C \"${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}\" ${NBL_EXCLUDE} ${NBL_CONFIG_PRIVATE_SETUP_CMD} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} ${NBL_FORCE} --recursive ${NBL_SHALLOW} ${GIT_SUBMODULE_PATH}") else() - set(_NBL_EXECUTE_COMMAND_ "\"${GIT_EXECUTABLE}\" -C \"${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}\" ${NBL_EXCLUDE} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} ${NBL_FORCE} ${NBL_SHALLOW} ${GIT_SUBMODULE_PATH}") + set(_NBL_EXECUTE_COMMAND_ "\"${GIT_EXECUTABLE}\" -C \"${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}\" ${NBL_EXCLUDE} ${NBL_CONFIG_PRIVATE_SETUP_CMD} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} ${NBL_FORCE} ${NBL_SHALLOW} ${GIT_SUBMODULE_PATH}") endif() string(APPEND _NBL_UPDATE_SUBMODULES_COMMANDS_ "${_NBL_EXECUTE_COMMAND_}\n") @@ -131,6 +146,7 @@ execute_process(COMMAND "${GIT_EXECUTABLE}" ${NBL_CONFIG_SETUP_CMD} submodule up NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./3rdparty TRUE "${NBL_3RDPARTY_MODULES_TO_SKIP}") # boost's 3rdparties, special case + # TODO: fork boost and update .gitmodules to cover only libs we want to use set(NBL_BOOST_LIBS_TO_INIT ${NBL_BOOST_LIBS} wave numeric_conversion) # wave and all of its deps, numeric_conversion is nested in conversion submodule (for some reason boostdep tool doesn't output it properly) foreach(NBL_TARGET ${NBL_BOOST_LIBS_TO_INIT}) list(APPEND NBL_BOOST_SUBMODULES_TO_INIT ${NBL_TARGET}) @@ -153,8 +169,7 @@ execute_process(COMMAND "${GIT_EXECUTABLE}" ${NBL_CONFIG_SETUP_CMD} submodule up # examples and their media if(NBL_BUILD_EXAMPLES) - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./examples_tests FALSE "") - 
NBL_WRAPPER_COMMAND_EXCLUSIVE(examples_tests ./media FALSE "") + NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./examples_tests TRUE "") endif() endif() diff --git a/examples_tests b/examples_tests index 8c76367c1c..825c73d5d8 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 8c76367c1c226cce3d66f1c60f540e29a501a1cb +Subproject commit 825c73d5d8307efef2488f0b6ce82b69c32855ea From dff6f4ee1981b9a8de5bcf11e0a781c26a144fcd Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 14 May 2025 13:41:01 +0200 Subject: [PATCH 109/346] exclude 3rdparty/glTFSampleModels from default update --- .gitmodules | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 8edc1cead9..7ed5921c66 100644 --- a/.gitmodules +++ b/.gitmodules @@ -54,6 +54,7 @@ [submodule "3rdparty/glTFSampleModels"] path = 3rdparty/glTFSampleModels url = git@github.com:Devsh-Graphics-Programming/glTF-Sample-Models.git + update = none [submodule "3rdparty/nbl_spirv_cross"] path = 3rdparty/nbl_spirv_cross url = git@github.com:devshgraphicsprogramming/SPIRV-Cross.git @@ -116,4 +117,4 @@ url = git@github.com:Devsh-Graphics-Programming/libdeflate.git [submodule "docker/compiler-explorer"] path = docker/compiler-explorer - url = git@github.com:Devsh-Graphics-Programming/Compiler-Explorer-Docker.git + url = git@github.com:Devsh-Graphics-Programming/Compiler-Explorer-Docker.git \ No newline at end of file From cc9f6943ea34afa6dc375dad312c2af2bcaafbcd Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 14 May 2025 13:54:41 +0200 Subject: [PATCH 110/346] update .gitmodules, allow git to allocate jobs to update submodules --- .gitmodules | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 7ed5921c66..0aacb58ffd 100644 --- a/.gitmodules +++ b/.gitmodules @@ -117,4 +117,8 @@ url = git@github.com:Devsh-Graphics-Programming/libdeflate.git [submodule "docker/compiler-explorer"] path = 
docker/compiler-explorer - url = git@github.com:Devsh-Graphics-Programming/Compiler-Explorer-Docker.git \ No newline at end of file + url = git@github.com:Devsh-Graphics-Programming/Compiler-Explorer-Docker.git + +[submodule] + # https://git-scm.com/docs/git-config#Documentation/git-config.txt-submodulefetchJobs + fetchJobs = 0 \ No newline at end of file From 41ef540b1a661411e121825f345e4c5a854aefb4 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 14 May 2025 13:59:24 +0200 Subject: [PATCH 111/346] I forgot to make the deferredly created BLASes and TLASes write to the post GPU object creation output array. Also found that I stored a lot of stuff redundantly in the `DeferredASCreationParams` --- src/nbl/video/utilities/CAssetConverter.cpp | 121 ++++++++++---------- 1 file changed, 62 insertions(+), 59 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 1f28c3ac0f..0d76f2868b 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2759,10 +2759,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // BLAS and TLAS creation is somewhat delayed by buffer creation and allocation struct DeferredASCreationParams { - const IAccelerationStructure* canonical; asset_cached_t storage = {}; - uint64_t patchIx = 0; - uint64_t uniqueCopyGroupID = 0; uint64_t scratchSize = 0; uint64_t buildSize = 0; }; @@ -2931,7 +2928,6 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // we need to save the buffer in a side-channel for later auto& out = accelerationStructureParams[IsTLAS][entry.second.firstCopyIx+i]; - out.canonical = as; // this is where it gets a bit weird, we need to create a buffer to back the acceleration structure { IGPUBuffer::SCreationParams params = {}; @@ -2950,8 +2946,6 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult continue; } } - out.patchIx = patchIx; - 
out.uniqueCopyGroupID = uniqueCopyGroupID; out.scratchSize = sizes.buildScratchSize; out.buildSize = buildSize; } @@ -3386,7 +3380,8 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } // clear what we don't need - conversionRequests.gpuObjUniqueCopyGroupIDs.clear(); + if constexpr (!std::is_base_of_v) + conversionRequests.gpuObjUniqueCopyGroupIDs.clear(); // This gets deferred till AFTER the Buffer Memory Allocations and Binding if constexpr (!std::is_base_of_v && !std::is_base_of_v::video_t>) { @@ -3418,7 +3413,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult bufferConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); // Deal with Deferred Creation of Acceleration structures { - auto createAccelerationStructures = [&]()->void + auto createAccelerationStructures = [&](conversions_t& requests)->void { constexpr bool IsTLAS = std::is_same_v; // @@ -3428,63 +3423,70 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult else pConversions = retval.m_blasConversions; // we enqueue the conversions AFTER making sure that the BLAS / TLAS can actually be created - for (size_t i=0; i(deferredParams.canonical); - const auto& dfsNode = std::get>(dfsCaches).nodes[deferredParams.patchIx]; - const auto& patch = dfsNode.patch; - // create the AS - const auto bufSz = deferredParams.storage.get()->getSize(); - IGPUAccelerationStructure::SCreationParams baseParams; - { - using create_f = IGPUAccelerationStructure::SCreationParams::FLAGS; - baseParams = { - .bufferRange = {.offset=0,.size=bufSz,.buffer=deferredParams.storage.value}, - .flags = patch.isMotion ? 
create_f::MOTION_BIT:create_f::NONE - }; - } - smart_refctd_ptr::video_t> as; - CAssetConverter::SReserveResult::SConvReqTLAS::cpu_to_gpu_blas_map_t blasInstanceMap; - if constexpr (IsTLAS) - { - // check if the BLASes we want to use for the instances were successfully allocated and created - AssetVisitor> visitor = { - {inputs,dfsCaches,&blasInstanceMap}, - {canonical,deferredParams.uniqueCopyGroupID}, - patch - }; - if (!visitor()) - { - inputs.logger.log( - "Failed to find all GPU Bottom Level Acceleration Structures needed to build TLAS %8llx%8llx%8llx%8llx", - system::ILogger::ELL_ERROR//,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - continue; - } - as = device->createTopLevelAccelerationStructure({std::move(baseParams),patch.maxInstances}); - } - else - as = device->createBottomLevelAccelerationStructure(std::move(baseParams)); - if (!as) + const auto reqIx = entry.second.firstCopyIx+i; + if (const auto& deferredParams=accelerationStructureParams[IsTLAS][reqIx]; deferredParams.storage) { - inputs.logger.log("Failed to Create Acceleration Structure.",system::ILogger::ELL_ERROR); - continue; + const auto* canonical = entry.second.canonicalAsset; + const auto& dfsNode = std::get>(dfsCaches).nodes[entry.second.patchIndex.value]; + const auto& patch = dfsNode.patch; + // create the AS + const auto bufSz = deferredParams.storage.get()->getSize(); + IGPUAccelerationStructure::SCreationParams baseParams; + { + using create_f = IGPUAccelerationStructure::SCreationParams::FLAGS; + baseParams = { + .bufferRange = {.offset=0,.size=bufSz,.buffer=deferredParams.storage.value}, + .flags = patch.isMotion ? 
create_f::MOTION_BIT:create_f::NONE + }; + } + smart_refctd_ptr::video_t> as; + CAssetConverter::SReserveResult::SConvReqTLAS::cpu_to_gpu_blas_map_t blasInstanceMap; + if constexpr (IsTLAS) + { + // check if the BLASes we want to use for the instances were successfully allocated and created + AssetVisitor> visitor = { + {inputs,dfsCaches,&blasInstanceMap}, + {canonical,requests.gpuObjUniqueCopyGroupIDs[reqIx]}, + patch + }; + if (!visitor()) + { + const auto hashAsU64 = reinterpret_cast(entry.first.data); + inputs.logger.log( + "Failed to find all GPU Bottom Level Acceleration Structures needed to build TLAS %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + continue; + } + as = device->createTopLevelAccelerationStructure({std::move(baseParams),patch.maxInstances}); + } + else + as = device->createBottomLevelAccelerationStructure(std::move(baseParams)); + if (!as) + { + inputs.logger.log("Failed to Create Acceleration Structure.",system::ILogger::ELL_ERROR); + continue; + } + // file the request for conversion + auto& request = pConversions[patch.hostBuild][as.get()]; + request.canonical = smart_refctd_ptr(canonical); + request.scratchSize = deferredParams.scratchSize; + request.compact = patch.compactAfterBuild; + request.buildFlags = static_cast(patch.getBuildFlags(canonical).value); + request.buildSize = deferredParams.buildSize; + if constexpr (IsTLAS) + request.instanceMap = std::move(blasInstanceMap); + requests.assign(entry.first,entry.second.firstCopyIx,i,std::move(as)); } - // file the request for conversion - auto& request = pConversions[patch.hostBuild][as.get()]; - request.canonical = smart_refctd_ptr(canonical); - request.scratchSize = deferredParams.scratchSize; - request.compact = patch.compactAfterBuild; - request.buildFlags = static_cast(patch.getBuildFlags(canonical).value); - request.buildSize = deferredParams.buildSize; - if constexpr (IsTLAS) - request.instanceMap = 
std::move(blasInstanceMap); } + requests.gpuObjUniqueCopyGroupIDs.clear(); }; - createAccelerationStructures.template operator()(); + createAccelerationStructures.template operator()(blasConversions); blasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); - createAccelerationStructures.template operator()(); + createAccelerationStructures.template operator()(tlasConversions); tlasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); } // enqueue successfully created images with data to upload for conversion @@ -3577,6 +3579,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult retval.m_tlasConversions[i].erase(gpuObj); if constexpr (std::is_same_v) retval.m_imageConversions.erase(gpuObj); + // TODO: erase from `retval.m_gpuObjects` as well return true; } // still referenced, keep it around From 310eafd491cfacf8089248b0266d25c5ad0a0f2e Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 14 May 2025 15:45:18 +0200 Subject: [PATCH 112/346] fix various typos and bugs in Asset Converter --- include/nbl/video/utilities/CAssetConverter.h | 3 +- src/nbl/video/utilities/CAssetConverter.cpp | 35 +++++++++---------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 01da012a0d..182b025ada 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -959,7 +959,8 @@ class CAssetConverter : public core::IReferenceCounted uint32_t sampledImageBindingCount = 1<<10; uint32_t storageImageBindingCount = 11<<10; // specific to Acceleration Structure Build, they need to be at least as large as the largest amount of scratch required for an AS build - CAsyncSingleBufferSubAllocatorST>* scratchForDeviceASBuild = nullptr; + using scratch_for_device_AS_build_t = CAsyncSingleBufferSubAllocatorST>; + scratch_for_device_AS_build_t* 
scratchForDeviceASBuild = nullptr; std::pmr::memory_resource* scratchForHostASBuild = nullptr; // needs to service allocations without limit, unlike the above where failure will just force a flush and performance of already queued up builds IDeviceMemoryAllocator* compactedASAllocator = nullptr; diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 0d76f2868b..b6f0541a3f 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2455,7 +2455,7 @@ struct conversions_t const uint64_t uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[copyIx+baseIx]; if constexpr (std::is_same_v || std::is_same_v) { - const auto constrainMask = inputs->constrainMemoryTypeBits(uniqueCopyGroupID,asset,contentHash,gpuObj.get()); + const auto constrainMask = inputs->constrainMemoryTypeBits(uniqueCopyGroupID,asset,contentHash,output->value.get()); if (!deferredAllocator->request(output,constrainMask)) return; } @@ -3766,11 +3766,10 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // Descriptor Sets need their TLAS descriptors substituted if they've been compacted core::unordered_map> compactedTLASMap; // Anything to do? 
- auto reqQueueFlags = reservations.m_queueFlags; - if (reqQueueFlags.value!=IQueue::FAMILY_FLAGS::NONE) + if (reservations.m_queueFlags.value!=IQueue::FAMILY_FLAGS::NONE) { // whether we actually get around to doing that depends on validity and success of transfers - const bool shouldDoSomeCompute = reqQueueFlags.hasFlags(IQueue::FAMILY_FLAGS::COMPUTE_BIT); + const bool shouldDoSomeCompute = reservations.m_queueFlags.hasFlags(IQueue::FAMILY_FLAGS::COMPUTE_BIT); auto invalidIntended = [device,logger](const IQueue::FAMILY_FLAGS flag, const SIntendedSubmitInfo* intended)->bool { if (!intended || !intended->valid()) @@ -3852,7 +3851,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul const auto transferFamily = params.transfer->queue->getFamilyIndex(); // But don't want to have to do QFOTs between Transfer and Queue Families then if (transferFamily!=computeFamily) - if (!scratchParams.canBeUsedByQueueFamily(transferFamily)) + if (!scratchParams.isConcurrentSharing() || !scratchParams.canBeUsedByQueueFamily(transferFamily)) { logger.log("Acceleration Structure Scratch Device Memory Allocator not mapped and not concurrently share-able by Transfer Family %d!",system::ILogger::ELL_ERROR,transferFamily); return retval; @@ -3868,7 +3867,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul logger.log("An Acceleration Structure will be built on Device but Default UpStreaming Buffer from IUtilities doesn't have required usage flags!", system::ILogger::ELL_ERROR); return retval; } - reqQueueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; } } // the elusive and exotic host builds @@ -3885,10 +3883,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul if (reservations.willCompactAS()) { if (!params.compactedASAllocator) - { - logger.log("An Acceleration Structure will be compacted but no Device Memory Allocator provided!", system::ILogger::ELL_ERROR); - return retval; - } + logger.log("Acceleration Structures will be compacted using 
the ILogicalDevice as the memory allocator!", system::ILogger::ELL_WARNING); // note that can't check the compacted AS allocator being large enough against `reservations.m_compactedASMaxMemory` } @@ -4851,7 +4846,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul break; } if (depInfo.wasInStaging) - dependsOnBLASBuilds; + dependsOnBLASBuilds = true; instanceDataSize += ITopLevelAccelerationStructure::getInstanceSize(instance.getType()); } // problem with building some Dependent BLASes @@ -4872,7 +4867,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul const addr_t sizes[MaxAllocCount] = {tlasToBuild.second.scratchSize,instanceDataSize,sizeof(void*)*instanceCount}; { const addr_t alignments[MaxAllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,alignof(uint64_t)}; - const auto AllocCount = as->usesMotion() ? 2:3; + const auto AllocCount = as->usesMotion() ? 3:2; // if fail then flush and keep trying till space is made for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(AllocCount,&offsets[0],&sizes[0],&alignments[0])!=0u; t++) if (t==1) // don't flush right away cause allocator not defragmented yet @@ -4902,14 +4897,14 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul assert(offsetInRange%16==0); uint32_t bytesWritten = 0; - while (true) + while (instanceIndex=blockSize) - return bytesWritten; + if (newWritten>blockSize) + break; auto found = instanceMap->find(instance.getBase().blas.get()); auto blas = found->second.get(); if (auto found=compactedBLASMap->find(blas); found!=compactedBLASMap->end()) @@ -4918,6 +4913,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul dst = IGPUTopLevelAccelerationStructure::writeInstance(dst,instance,blas->getReferenceForDeviceOperations()); bytesWritten = newWritten; } + return bytesWritten; } const compacted_blas_map_t* compactedBLASMap; @@ -4994,7 +4990,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul 
// enqueue ownership release if necessary if (finalOwnerQueueFamily!=IQueue::FamilyIgnored) { - compactedOwnershipReleaseIndices.push_back(ownershipTransfers.size()); + if (willCompact) + compactedOwnershipReleaseIndices.push_back(ownershipTransfers.size()); ownershipTransfers.push_back({ .barrier = { .dep = { @@ -5008,7 +5005,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul .range = backingRange }); } - else + else if (willCompact) compactedOwnershipReleaseIndices.push_back(~0u); } // finish the last batch @@ -5049,7 +5046,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // create and allocate backing buffers for compacted TLASes core::vector> backingBuffers(compactions.size()); { - MetaDeviceMemoryAllocator deferredAllocator(params.compactedASAllocator,logger); + MetaDeviceMemoryAllocator deferredAllocator(params.compactedASAllocator ? params.compactedASAllocator:device,logger); // create for (size_t i=0; i CAssetConverter::convert_impl(SReserveResul // in the future we'll also finish host image copies // check dependents before inserting into cache - if (reqQueueFlags.value!=IQueue::FAMILY_FLAGS::NONE) + if (reservations.m_queueFlags.value!=IQueue::FAMILY_FLAGS::NONE) { auto checkDependents = [&]()->void { From 5c519d095903f4cc42ad4628185b82f37ae77563 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 14 May 2025 16:45:32 +0200 Subject: [PATCH 113/346] `core::makeRAIIExiter` is literally the best thing since sliced bread Fix bugs: - ReBAR only buffer transfers dereferencing a nullptr transfer cmbduf - BLAS and TLAS memory allocations latching on semaphores which will never signal if the command recording fails for some reason --- src/nbl/video/utilities/CAssetConverter.cpp | 55 ++++++++++++++++----- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index b6f0541a3f..bc9fac01c0 100644 --- 
a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -3991,7 +3991,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul core::vector finalReleases; finalReleases.reserve(buffersToUpload.size()); // do the uploads - if (!buffersToUpload.empty()) + if (!buffersToUpload.empty() && xferCmdBuf) { xferCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Upload Buffers START"); xferCmdBuf->cmdbuf->endDebugMarker(); @@ -4039,7 +4039,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul .range = range }); } - if (!buffersToUpload.empty()) + if (!buffersToUpload.empty() && xferCmdBuf) { xferCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Upload Buffers END"); xferCmdBuf->cmdbuf->endDebugMarker(); @@ -4653,6 +4653,12 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return false; }; // + using scratch_allocator_t = std::remove_reference_t; + using addr_t = typename scratch_allocator_t::size_type; + core::vector scratchOffsets; + scratchOffsets.reserve(maxASCount); + core::vector scratchSizes; + scratchSizes.reserve(maxASCount); auto recordBuildCommandsBase = [&](auto& buildInfos, auto& rangeInfos)->void { if (buildInfos.empty()) @@ -4665,13 +4671,25 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul bool success = !uniQueue || !deviceASBuildScratchPtr || pipelineBarrier(computeCmdBuf,{.memBarriers={&readGeometryOrInstanceInASBuildBarrier,1}},"Pipeline Barriers of Acceleration Structure backing Buffers failed!"); // success = success && computeCmdBuf->cmdbuf->buildAccelerationStructures({buildInfos},rangeInfos.data()); - if (!success) - for (const auto& info : buildInfos) + if (success) { - const auto stagingFound = findInStaging.template operator()(info.dstAS); - smart_refctd_ptr dummy; // already null at this point - markFailure("AS Build Command Recording",&dummy,&stagingFound->second); + submitsNeeded |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + // queue up a 
deferred allocation + params.scratchForDeviceASBuild->multi_deallocate(scratchOffsets.size(),scratchOffsets.data(),scratchSizes.data(),params.compute->getFutureScratchSemaphore()); + } + else + { + // release right away + params.scratchForDeviceASBuild->multi_deallocate(scratchOffsets.size(),scratchOffsets.data(),scratchSizes.data()); + for (const auto& info : buildInfos) + { + const auto stagingFound = findInStaging.template operator()(info.dstAS); + smart_refctd_ptr dummy; // already null at this point + markFailure("AS Build Command Recording",&dummy,&stagingFound->second); + } } + scratchOffsets.clear(); + scratchSizes.clear(); buildInfos.clear(); rangeInfos.clear(); }; @@ -4813,8 +4831,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul trackedBLASes.clear(); }; // - using scratch_allocator_t = std::remove_reference_t; - using addr_t = typename scratch_allocator_t::size_type; const auto& limits = physDev->getLimits(); for (auto& tlasToBuild : tlasesToBuild) { @@ -4865,9 +4881,25 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul constexpr uint32_t MaxAllocCount = 3; addr_t offsets[MaxAllocCount] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value}; const addr_t sizes[MaxAllocCount] = {tlasToBuild.second.scratchSize,instanceDataSize,sizeof(void*)*instanceCount}; + const auto AllocCount = as->usesMotion() ? 3:2; + // clean up the allocation if we fail to make it to the end of loop for whatever reason + bool abortAllocation = true; + auto deallocSrc = core::makeRAIIExiter([¶ms,&scratchOffsets,&scratchSizes,AllocCount,&offsets,&sizes,&abortAllocation]()->void + { + // if got to end of loop queue up the release of memory, otherwise release right away + if (abortAllocation) + params.scratchForDeviceASBuild->multi_deallocate(AllocCount,&offsets[0],&sizes[0]); + else + for (auto i=0; iusesMotion() ? 
3:2; // if fail then flush and keep trying till space is made for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(AllocCount,&offsets[0],&sizes[0],&alignments[0])!=0u; t++) if (t==1) // don't flush right away cause allocator not defragmented yet @@ -4881,8 +4913,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } drainCompute(); } - // queue up a deferred allocation - params.scratchForDeviceASBuild->multi_deallocate(AllocCount,&offsets[0],&sizes[0],params.compute->getFutureScratchSemaphore()); } // stream the instance/geometry input in const size_t trackedBLASesOffset = trackedBLASes.size(); @@ -4983,6 +5013,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul buildInfo.trackedBLASes = {reinterpret_cast(trackedBLASesOffset),trackedBLASes.size()-trackedBLASesOffset}; // no special extra byte offset into the instance buffer rangeInfos.emplace_back(instanceCount,0u); + abortAllocation = false; // const bool willCompact = tlasToBuild.second.compact; if (willCompact) From 44f241a977148b2e2b02a04bb7b57b5c6530ac7a Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 14 May 2025 16:54:18 +0200 Subject: [PATCH 114/346] update & make boost submodule URL relative, update NBL_BOOST_GENERATE_DEP_LIST mini tool to autogen boost's .gitmodules --- .gitmodules | 2 +- 3rdparty/boost/CMakeLists.txt | 26 +++++++++++++++++++++++++- 3rdparty/boost/superproject | 2 +- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/.gitmodules b/.gitmodules index 0aacb58ffd..ba078222e2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -90,7 +90,7 @@ url = git@github.com:Devsh-Graphics-Programming/Nabla-Continous-Integration-Python-Framework.git [submodule "3rdparty/boost/superproject"] path = 3rdparty/boost/superproject - url = git@github.com:boostorg/boost.git + url = ../boost.git [submodule "3rdparty/argparse"] path = 3rdparty/argparse url = git@github.com:p-ranav/argparse.git diff --git a/3rdparty/boost/CMakeLists.txt 
b/3rdparty/boost/CMakeLists.txt index f3460fe8d6..36e596cbf6 100644 --- a/3rdparty/boost/CMakeLists.txt +++ b/3rdparty/boost/CMakeLists.txt @@ -44,7 +44,7 @@ set(NBL_BOOST_TARGETS PARENT_SCOPE) # Boost uses it's own tool for generating dependency list for targets, therefore we -# can make sure manually added dependnecy subdirectories for a library are valid +# can make sure manually added dependency subdirectories for a library are valid # https://www.boost.org/doc/libs/1_83_0/tools/boostdep/doc/html/index.html#boostdep.introduction.building_boostdep if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs @@ -83,5 +83,29 @@ if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs list(FILTER NBL_BOOST_LIBS EXCLUDE REGEX "(unknown)") string(REPLACE "~" "/" NBL_BOOST_LIBS "${NBL_BOOST_LIBS}") + # we override boost's .gitmodules to pick only those modules we really use (reported by boost's dep executable) + # boost hosts now like 200 repositories, some of them are really big however atm we reference around 60 + set(BOOST_SUBMODULE_TEMPLATE +[=[ + +[submodule "@NAME@"] + path = libs/@NAME@ + url = ../@FLATTEN_NAME@.git + fetchRecurseSubmodules = on-demand + branch = . +]=] + ) + + unset(BOOST_GITMODULES) + foreach(NAME ${NBL_BOOST_LIBS}) + string(REPLACE "/" "_" FLATTEN_NAME "${NAME}") + string(CONFIGURE "${BOOST_SUBMODULE_TEMPLATE}" TEMPLATE) + string(APPEND BOOST_GITMODULES "${TEMPLATE}") + endforeach() + + # NOTE: this you commit to version control file(WRITE "${NBL_BOOST_WAVE_DEP_FILE}" "set(NBL_BOOST_LIBS ${NBL_BOOST_LIBS})") + + # and this one too + you update boost submodule pointer with the update! 
+ file(WRITE "${CMAKE_CURRENT_SOURCE_DIR}/superproject/.gitmodules" "${BOOST_GITMODULES}") endif() diff --git a/3rdparty/boost/superproject b/3rdparty/boost/superproject index 1c4d3531e4..e1a703f795 160000 --- a/3rdparty/boost/superproject +++ b/3rdparty/boost/superproject @@ -1 +1 @@ -Subproject commit 1c4d3531e416a1f72b0e6a5e0f7173f93cf97e92 +Subproject commit e1a703f7956264e463329d49ab05100bdc34e219 From 340cb7511ae24f32ab53e57f79be8350ecede68f Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 14 May 2025 17:07:25 +0200 Subject: [PATCH 115/346] the implementation of `IGPUCommandBuffer::empty()` was completely and utterly broken --- include/nbl/video/IGPUCommandBuffer.h | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/IGPUCommandBuffer.h b/include/nbl/video/IGPUCommandBuffer.h index cfe0439cde..2584707ab6 100644 --- a/include/nbl/video/IGPUCommandBuffer.h +++ b/include/nbl/video/IGPUCommandBuffer.h @@ -93,7 +93,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject case STATE::EXECUTABLE: [[fallthrough]]; case STATE::PENDING: - if (m_noCommands) + if (!m_noCommands) return false; [[fallthrough]]; default: @@ -261,13 +261,21 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject inline bool buildAccelerationStructures(const std::span infos, const IGPUBottomLevelAccelerationStructure::DirectBuildRangeRangeInfos buildRangeInfos) { if (const auto totalGeometryCount=buildAccelerationStructures_common(infos,buildRangeInfos); totalGeometryCount) - return buildAccelerationStructures_impl(infos,buildRangeInfos,totalGeometryCount); + if (buildAccelerationStructures_impl(infos,buildRangeInfos,totalGeometryCount)) + { + m_noCommands = false; + return true; + } return false; } inline bool buildAccelerationStructures(const std::span infos, const IGPUTopLevelAccelerationStructure::DirectBuildRangeRangeInfos buildRangeInfos) { if (buildAccelerationStructures_common(infos,buildRangeInfos)) - return 
buildAccelerationStructures_impl(infos,buildRangeInfos); + if (buildAccelerationStructures_impl(infos,buildRangeInfos)) + { + m_noCommands = false; + return true; + } return false; } // We don't allow different indirect command addresses due to https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdBuildAccelerationStructuresIndirectKHR-pIndirectDeviceAddresses-03646 @@ -300,10 +308,14 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject if (const auto totalGeometryCount=buildAccelerationStructures_common(infos,maxPrimitiveOrInstanceCounts,indirectRangeBuffer); totalGeometryCount) { + bool success; if constexpr(std::is_same_v) - return buildAccelerationStructuresIndirect_impl(indirectRangeBuffer,infos,pIndirectOffsets,pIndirectStrides,maxPrimitiveOrInstanceCounts,totalGeometryCount); + success = buildAccelerationStructuresIndirect_impl(indirectRangeBuffer,infos,pIndirectOffsets,pIndirectStrides,maxPrimitiveOrInstanceCounts,totalGeometryCount); else - return buildAccelerationStructuresIndirect_impl(indirectRangeBuffer,infos,pIndirectOffsets,pIndirectStrides,maxPrimitiveOrInstanceCounts); + success = buildAccelerationStructuresIndirect_impl(indirectRangeBuffer,infos,pIndirectOffsets,pIndirectStrides,maxPrimitiveOrInstanceCounts); + if (success) + m_noCommands = false; + return success; } return false; } From 6d793d74be629064984a6f063c2c04a73bf8158a Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 14 May 2025 17:45:47 +0200 Subject: [PATCH 116/346] remind self to never reinterpret_cast between virtual base and derived --- include/nbl/video/IGPUCommandBuffer.h | 2 +- include/nbl/video/ILogicalDevice.h | 4 ++-- src/nbl/video/IGPUCommandBuffer.cpp | 2 +- src/nbl/video/IQueue.cpp | 6 +++++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/IGPUCommandBuffer.h b/include/nbl/video/IGPUCommandBuffer.h index 2584707ab6..d5a3fac0af 100644 --- a/include/nbl/video/IGPUCommandBuffer.h +++ 
b/include/nbl/video/IGPUCommandBuffer.h @@ -884,7 +884,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject // If the user wants the builds to be tracking, and make the TLAS remember the BLASes that have been built into it. // NOTE: We know that a TLAS may be rebuilt multiple times per frame on purpose and not only the final BLASes need to be kept alive till submission finishes. // However, the Command Pool already tracks resources referenced in the Build Infos, so we only need pointers into those records. - core::unordered_map> m_TLASToBLASReferenceSets; + core::unordered_map>> m_TLASToBLASReferenceSets; const IGPUGraphicsPipeline* m_boundGraphicsPipeline; const IGPUComputePipeline* m_boundComputePipeline; diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 93aa965416..8ad3b839ab 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -580,7 +580,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } // the rawpointers are already smartpointers in whatever else the `fillTracking` declared above writes - core::unordered_map> m_TLASToBLASReferenceSets; + core::unordered_map>> m_TLASToBLASReferenceSets; } callback = {}; auto& tracking = deferredOperation->m_resourceTracking; @@ -593,7 +593,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe { const auto blasCount = info.trackedBLASes.size(); if (blasCount) - callback.m_TLASToBLASReferenceSets[info.dstAS] = {reinterpret_cast(oit-blasCount),blasCount}; + callback.m_TLASToBLASReferenceSets[info.dstAS] = {oit-blasCount,blasCount}; else callback.m_TLASToBLASReferenceSets[info.dstAS] = {}; } diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index fcf55b74c1..6bde593097 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -843,7 +843,7 @@ uint32_t 
IGPUCommandBuffer::buildAccelerationStructures_common(const std::span(oit-blasCount),blasCount}; + m_TLASToBLASReferenceSets[info.dstAS] = {oit-blasCount,blasCount}; else m_TLASToBLASReferenceSets[info.dstAS] = {}; } diff --git a/src/nbl/video/IQueue.cpp b/src/nbl/video/IQueue.cpp index e761b7a733..e7612cc8d1 100644 --- a/src/nbl/video/IQueue.cpp +++ b/src/nbl/video/IQueue.cpp @@ -157,7 +157,11 @@ IQueue::DeferredSubmitCallback::DeferredSubmitCallback(const SSubmitInfo& info) { const auto tlas = refSet.first; // in theory could assert no duplicate entries, but thats obvious - m_TLASToBLASReferenceSets[tlas] = { .m_BLASes = {refSet.second.begin(),refSet.second.end()}, .m_buildVer = tlas->registerNextBuildVer()}; + auto& out = m_TLASToBLASReferenceSets[tlas]; + out.m_BLASes.reserve(refSet.second.size()); + for (const auto& refCtd : refSet.second) + out.m_BLASes.emplace(dynamic_cast(refCtd.get())); + out.m_buildVer = tlas->registerNextBuildVer(); } } // We don't hold the last signal semaphore, because the timeline does as an Event trigger. 
From 507904f462c9fe50928b198ca2aabd7fa5c8b460 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 15 May 2025 10:38:03 +0700 Subject: [PATCH 117/346] minor fixes --- examples_tests | 2 +- include/nbl/builtin/hlsl/subgroup2/ballot.hlsl | 9 +++++---- .../{config.hlsl => arithmetic_config.hlsl} | 8 ++++---- include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 12 ++++++------ 4 files changed, 16 insertions(+), 15 deletions(-) rename include/nbl/builtin/hlsl/workgroup2/{config.hlsl => arithmetic_config.hlsl} (95%) diff --git a/examples_tests b/examples_tests index a42a742f36..908abd110c 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit a42a742f363bda827991794053fb93fd803023f1 +Subproject commit 908abd110c387d48110ce8aeb67f0e0f2dd68943 diff --git a/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl b/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl index 6c7ec4f593..52ae6de2d9 100644 --- a/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl +++ b/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl @@ -11,12 +11,13 @@ namespace hlsl namespace subgroup2 { +template uint32_t LastSubgroupInvocation() { - // why this code was wrong before: - // - only compute can use SubgroupID - // - but there's no mapping of InvocationID to SubgroupID and Index - return glsl::subgroupBallotFindMSB(glsl::subgroupBallot(true)); + if (AssumeAllActive) + return glsl::gl_SubgroupSize()-1; + else + return glsl::subgroupBallotFindMSB(glsl::subgroupBallot(true)); } bool ElectLast() diff --git a/include/nbl/builtin/hlsl/workgroup2/config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl similarity index 95% rename from include/nbl/builtin/hlsl/workgroup2/config.hlsl rename to include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 7855cc1701..2f24c863da 100644 --- a/include/nbl/builtin/hlsl/workgroup2/config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -1,8 +1,8 @@ // Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. 
// This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_CONFIG_INCLUDED_ -#define _NBL_BUILTIN_HLSL_WORKGROUP2_CONFIG_INCLUDED_ +#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_CONFIG_INCLUDED_ +#define _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_CONFIG_INCLUDED_ #include "nbl/builtin/hlsl/cpp_compat.hlsl" @@ -33,7 +33,7 @@ struct items_per_invocation } template -struct Configuration +struct ArithmeticConfiguration { NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2; @@ -61,7 +61,7 @@ struct Configuration // special case when workgroup size 2048 and subgroup size 16 needs 3 levels and virtual workgroup size 4096 to get a full subgroup scan each on level 1 and 2 16x16x16=4096 // specializing with macros because of DXC bug: https://github.com/microsoft/DirectXShaderCom0piler/issues/7007 #define SPECIALIZE_CONFIG_CASE_2048_16(ITEMS_PER_INVOC) template<>\ -struct Configuration<11, 4, ITEMS_PER_INVOC>\ +struct ArithmeticConfiguration<11, 4, ITEMS_PER_INVOC>\ {\ NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << 11u;\ NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = uint16_t(4u);\ diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index b03120b5f6..681ba39911 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -103,7 +103,7 @@ struct reduce { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = reduction0(scan_local[idx]); - if (subgroup2::ElectLast()) + if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> 
Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); @@ -160,7 +160,7 @@ struct scan { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = inclusiveScan0(scan_local[idx]); - if (subgroup2::ElectLast()) + if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); @@ -236,7 +236,7 @@ struct reduce { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = reduction0(scan_local[idx]); - if (subgroup2::ElectLast()) + if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); @@ -254,7 +254,7 @@ struct reduce for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = reduction1(lv1_val); - if (subgroup2::ElectLast()) + if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t bankedIndex = (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); scratchAccessor.set(bankedIndex, 
lv1_val[Config::ItemsPerInvocation_1-1]); @@ -312,7 +312,7 @@ struct scan { dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = inclusiveScan0(scan_local[idx]); - if (subgroup2::ElectLast()) + if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); @@ -331,7 +331,7 @@ struct scan for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = inclusiveScan1(lv1_val); - if (subgroup2::ElectLast()) + if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t bankedIndex = (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); scratchAccessor.set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); From 59fcc93d2f0c0ac1b2196426e34d9ed8d9586a13 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 15 May 2025 12:45:57 +0700 Subject: [PATCH 118/346] Implement all computeDependants for IAssets --- include/nbl/asset/ICPUAccelerationStructure.h | 21 +++---- include/nbl/asset/ICPUAnimationLibrary.h | 16 +----- include/nbl/asset/ICPUBuffer.h | 10 ++-- include/nbl/asset/ICPUBufferView.h | 9 ++- include/nbl/asset/ICPUComputePipeline.h | 9 --- include/nbl/asset/ICPUDescriptorSet.h | 5 +- include/nbl/asset/ICPUDescriptorSetLayout.h | 15 +++-- include/nbl/asset/ICPUImage.h | 7 ++- include/nbl/asset/ICPUImageView.h | 10 ++-- include/nbl/asset/ICPUMesh.h | 6 +- include/nbl/asset/ICPUMeshBuffer.h | 9 +-- include/nbl/asset/ICPUPipelineCache.h | 7 ++- 
include/nbl/asset/ICPUPipelineLayout.h | 22 +++----- include/nbl/asset/ICPURayTracingPipeline.h | 14 ----- include/nbl/asset/ICPURenderpass.h | 6 +- .../asset/ICPURenderpassIndependentPipeline.h | 7 ++- include/nbl/asset/ICPUSampler.h | 7 ++- include/nbl/asset/ICPUSkeleton.h | 9 +-- include/nbl/asset/IShader.h | 8 +-- src/nbl/asset/ICPUDescriptorSet.cpp | 55 +++++++++---------- 20 files changed, 107 insertions(+), 145 deletions(-) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 9c9af32f7b..affd165667 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -136,7 +136,10 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo } // Do not report anything as a dependant, we'll simply drop the data instead of discarding its contents - inline size_t getDependantCount() const override {return 0;} + inline core::unordered_set computeDependants() const override + { + return {}; + } inline core::blake3_hash_t computeContentHash() const override { @@ -236,8 +239,6 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo protected: virtual ~ICPUBottomLevelAccelerationStructure() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} - inline void discardContent_impl() override { m_triangleGeoms = nullptr; @@ -263,8 +264,13 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA // ICPUTopLevelAccelerationStructure() = default; - // - inline size_t getDependantCount() const override {return m_instances->size();} + inline core::unordered_set computeDependants() const override + { + core::unordered_set dependants; + for (const auto& instance : m_instances) + dependants.insert(instance.getBase().blas.get()); + return dependants; + } // inline auto& getBuildRangeInfo() @@ -360,11 +366,6 @@ class ICPUTopLevelAccelerationStructure final : public 
IAsset, public ITopLevelA protected: virtual ~ICPUTopLevelAccelerationStructure() = default; - inline IAsset* getDependant_impl(const size_t ix) override - { - return m_instances->operator[](ix).getBase().blas.get(); - } - private: core::smart_refctd_dynamic_array m_instances = nullptr; hlsl::acceleration_structures::top_level::BuildRangeInfo m_buildRangeInfo; diff --git a/include/nbl/asset/ICPUAnimationLibrary.h b/include/nbl/asset/ICPUAnimationLibrary.h index 1b02787597..5fea370b63 100644 --- a/include/nbl/asset/ICPUAnimationLibrary.h +++ b/include/nbl/asset/ICPUAnimationLibrary.h @@ -96,21 +96,9 @@ class ICPUAnimationLibrary final : public IAnimationLibrary, public constexpr static inline auto AssetType = ET_ANIMATION_LIBRARY; inline E_TYPE getAssetType() const override { return AssetType; } - inline size_t getDependantCount() const override {return 3;} - - protected: - inline IAsset* getDependant_impl(const size_t ix) override + inline core::unordered_set computeDependants() const override { - switch (ix) - { - case 0: - return m_keyframeStorageBinding.buffer.get(); - case 1: - return m_timestampStorageBinding.buffer.get(); - default: - break; - } - return m_animationStorageRange.buffer.get(); + return { m_keyframeStorageBinding.buffer.get(), m_timestampStorageBinding.buffer.get(), m_animationStorageRange.buffer.get() }; } }; diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h index 5bb16bd0ac..2d495ef02e 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -75,7 +75,10 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed constexpr static inline auto AssetType = ET_BUFFER; inline IAsset::E_TYPE getAssetType() const override final { return AssetType; } - inline size_t getDependantCount() const override { return 0; } + inline core::unordered_set computeDependants() const override + { + return {}; + } inline core::blake3_hash_t computeContentHash() const override { @@ -113,11 +116,6 @@ class 
ICPUBuffer final : public asset::IBuffer, public IPreHashed } protected: - inline IAsset* getDependant_impl(const size_t ix) override - { - return nullptr; - } - inline void discardContent_impl() override { if (m_data) diff --git a/include/nbl/asset/ICPUBufferView.h b/include/nbl/asset/ICPUBufferView.h index 3819136c98..7f3f676695 100644 --- a/include/nbl/asset/ICPUBufferView.h +++ b/include/nbl/asset/ICPUBufferView.h @@ -28,7 +28,10 @@ class ICPUBufferView : public IBufferView, public IAsset constexpr static inline auto AssetType = ET_BUFFER_VIEW; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - inline size_t getDependantCount() const override {return 1;} + inline core::unordered_set computeDependants() const override + { + return { m_buffer.get() }; + } ICPUBuffer* getUnderlyingBuffer() { @@ -51,10 +54,6 @@ class ICPUBufferView : public IBufferView, public IAsset protected: virtual ~ICPUBufferView() = default; - inline IAsset* getDependant_impl(const size_t ix) override - { - return m_buffer.get(); - } }; } diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 01859e0c3f..8d8b343a3d 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -36,8 +36,6 @@ class ICPUComputePipeline final : public ICPUPipeline computeDependants() const override { return {m_layout.get(), m_specInfo.shader.get()}; @@ -62,13 +60,6 @@ class ICPUComputePipeline final : public ICPUPipeline(m_layout.get()); - } - private: SShaderSpecInfo m_specInfo; diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index 826c54cc39..77640b8f9f 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -47,8 +47,6 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSetgetTotalBindingCount()+1;} - // inline ICPUDescriptorSetLayout* getLayout() { @@ -79,10 +77,11 @@ class NBL_API2 ICPUDescriptorSet final 
: public IDescriptorSet clone(uint32_t _depth = ~0u) const override; + core::unordered_set computeDependants() const override; + protected: virtual ~ICPUDescriptorSet() = default; - IAsset* getDependant_impl(size_t ix) override; private: diff --git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index 8f45a789ea..2ddf1e26be 100644 --- a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -57,15 +57,20 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public constexpr static inline auto AssetType = ET_DESCRIPTOR_SET_LAYOUT; inline E_TYPE getAssetType() const override { return AssetType; } - inline size_t getDependantCount() const override {return m_immutableSamplers ? m_immutableSamplers->size():0;} + core::unordered_set computeDependants() const override + { + if (!m_immutableSamplers) return {}; + core::unordered_set dependants; + for (const auto& sampler: m_immutableSamplers) + { + dependants.insert(sampler.get()); + } + return dependants; + } protected: virtual ~ICPUDescriptorSetLayout() = default; - inline IAsset* getDependant_impl(const size_t ix) override - { - return m_immutableSamplers->operator[](ix).get(); - } }; } diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index c27cd21b86..2527fd1ecb 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -46,7 +46,10 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed inline IAsset::E_TYPE getAssetType() const override { return AssetType; } // Do not report buffer as dependant, as we will simply drop it instead of discarding its contents! 
- inline size_t getDependantCount() const override {return 0;} + inline core::unordered_set computeDependants() const override + { + return {}; + } core::blake3_hash_t computeContentHash() const override; @@ -202,8 +205,6 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed inline ICPUImage(const SCreationParams& _params) : IImage(_params) {} virtual ~ICPUImage() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} - inline void discardContent_impl() override { buffer = nullptr; diff --git a/include/nbl/asset/ICPUImageView.h b/include/nbl/asset/ICPUImageView.h index 87df463021..6b3d562a60 100644 --- a/include/nbl/asset/ICPUImageView.h +++ b/include/nbl/asset/ICPUImageView.h @@ -49,8 +49,10 @@ class ICPUImageView final : public IImageView, public IAsset constexpr static inline auto AssetType = ET_IMAGE_VIEW; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - //! - inline size_t getDependantCount() const override {return 1;} + inline core::unordered_set computeDependants() const override + { + return { params.image.get() }; + } //! const SComponentMapping& getComponents() const { return params.components; } @@ -68,10 +70,6 @@ class ICPUImageView final : public IImageView, public IAsset protected: virtual ~ICPUImageView() = default; - inline IAsset* getDependant_impl(const size_t ix) override - { - return params.image.get(); - } }; } diff --git a/include/nbl/asset/ICPUMesh.h b/include/nbl/asset/ICPUMesh.h index a21f5f3f02..2648900ccc 100644 --- a/include/nbl/asset/ICPUMesh.h +++ b/include/nbl/asset/ICPUMesh.h @@ -82,10 +82,12 @@ class ICPUMesh final : public IMesh, public IAsset } //! 
CLASS IS DEPRECATED ANYWAY - inline size_t getDependantCount() const override {return 0;} + inline core::unordered_set computeDependants() const override + { + return {}; + } protected: - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} private: core::vector> m_meshBuffers; diff --git a/include/nbl/asset/ICPUMeshBuffer.h b/include/nbl/asset/ICPUMeshBuffer.h index 532b622090..61e9168a98 100644 --- a/include/nbl/asset/ICPUMeshBuffer.h +++ b/include/nbl/asset/ICPUMeshBuffer.h @@ -611,11 +611,12 @@ class ICPUMeshBuffer final : public IMeshBuffer(const_cast(this)->getJointAABBs()); } - //! CLASS IS DEPRECATED ANYWAY - inline size_t getDependantCount() const override {return 0;} + //! Class is deprecated anyway. + inline core::unordered_set computeDependants() const override + { + return {}; + } - protected: - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} }; } diff --git a/include/nbl/asset/ICPUPipelineCache.h b/include/nbl/asset/ICPUPipelineCache.h index 0c1d8c17cf..6fc019ce7f 100644 --- a/include/nbl/asset/ICPUPipelineCache.h +++ b/include/nbl/asset/ICPUPipelineCache.h @@ -60,7 +60,10 @@ class ICPUPipelineCache final : public IPreHashed return core::make_smart_refctd_ptr(std::move(cache_cp)); } - inline size_t getDependantCount() const override {return 0;} + inline core::unordered_set computeDependants() const override + { + return {}; + } // inline core::blake3_hash_t computeContentHash() const override @@ -86,8 +89,6 @@ class ICPUPipelineCache final : public IPreHashed const auto& getEntries() const {return m_cache;} protected: - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} - inline void discardContent_impl() override { for (auto& entry : m_cache) diff --git a/include/nbl/asset/ICPUPipelineLayout.h b/include/nbl/asset/ICPUPipelineLayout.h index c4a76fdea9..994d480b17 100644 --- a/include/nbl/asset/ICPUPipelineLayout.h +++ b/include/nbl/asset/ICPUPipelineLayout.h @@ -30,14 
+30,14 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout&& _layout2, core::smart_refctd_ptr&& _layout3 ) : IPipelineLayout(_pcRanges,std::move(_layout0),std::move(_layout1),std::move(_layout2),std::move(_layout3)) {} - // - inline size_t getDependantCount() const override + inline core::unordered_set computeDependants() const override { - size_t count = 0; - for (auto i=0; i dependants; + for (auto i = 0; i < m_descSetLayouts.size(); i++) + { + if (m_descSetLayouts[i]) continue; + dependants.insert(m_descSetLayouts[i].get()); + } } // @@ -79,14 +79,6 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout computeDependants() const override final { core::unordered_set dependants; dependants.insert(m_raygen.shader.get()); @@ -103,14 +97,6 @@ class ICPURayTracingPipeline final : public ICPUPipeline computeDependants() const override + { + return {}; + } protected: inline ICPURenderpass(const SCreationParams& _params, const SCreationParamValidationResult& _validation) : IRenderpass(_params, _validation) {} inline ~ICPURenderpass() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} }; } diff --git a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index ed0171d11f..8638a4965b 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -66,7 +66,10 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, _NBL_STATIC_INLINE_CONSTEXPR auto AssetType = ET_RENDERPASS_INDEPENDENT_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } - inline size_t getDependantCount() const override {return 0;} + inline core::unordered_set computeDependants() const override + { + return {}; + } // inline const SCachedCreationParams& getCachedCreationParams() const {return IRenderpassIndependentPipeline::getCachedCreationParams();} @@ -137,8 +140,6 
@@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, : IRenderpassIndependentPipeline(params), m_layout(std::move(_layout)) {} virtual ~ICPURenderpassIndependentPipeline() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} - core::smart_refctd_ptr m_layout; #if 0 std::array,GRAPHICS_SHADER_STAGE_COUNT> m_shaders = {}; diff --git a/include/nbl/asset/ICPUSampler.h b/include/nbl/asset/ICPUSampler.h index 27a918afaa..46cac56ee0 100644 --- a/include/nbl/asset/ICPUSampler.h +++ b/include/nbl/asset/ICPUSampler.h @@ -17,8 +17,6 @@ class ICPUSampler : public ISampler, public IAsset protected: virtual ~ICPUSampler() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} - public: ICPUSampler(const SParams& _params) : ISampler(_params), IAsset() {} @@ -71,7 +69,10 @@ class ICPUSampler : public ISampler, public IAsset constexpr static inline auto AssetType = ET_SAMPLER; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - inline size_t getDependantCount() const override {return 0;} + inline core::unordered_set computeDependants() const override + { + return {}; + } }; } diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h index 6f1c576ed8..ce03a9be54 100644 --- a/include/nbl/asset/ICPUSkeleton.h +++ b/include/nbl/asset/ICPUSkeleton.h @@ -79,14 +79,11 @@ class ICPUSkeleton final : public ISkeleton, public IAsset constexpr static inline auto AssetType = ET_SKELETON; inline E_TYPE getAssetType() const override { return AssetType; } - //! - inline size_t getDependantCount() const override {return 2;} - - protected: - inline IAsset* getDependant_impl(const size_t ix) override + inline core::unordered_set computeDependants() const override { - return (ix!=0 ? 
m_defaultTransforms:m_parentJointIDs).buffer.get(); + return { m_defaultTransforms.buffer.get(), m_parentJointIDs.buffer.get() }; } + }; } diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index a6dab09b54..5abd7d1980 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -50,8 +50,10 @@ class IShader : public IAsset constexpr static inline auto AssetType = ET_SHADER; inline E_TYPE getAssetType() const override { return AssetType; } - // - inline size_t getDependantCount() const override { return 1; } + inline core::unordered_set computeDependants() const override + { + return { m_code.get() }; + } // inline core::smart_refctd_ptr clone(uint32_t _depth=~0u) const override @@ -96,8 +98,6 @@ class IShader : public IAsset protected: virtual ~IShader() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return m_code.get();} - std::string m_filepathHint; core::smart_refctd_ptr m_code; E_CONTENT_TYPE m_contentType; diff --git a/src/nbl/asset/ICPUDescriptorSet.cpp b/src/nbl/asset/ICPUDescriptorSet.cpp index 03724be1a2..a298fea491 100644 --- a/src/nbl/asset/ICPUDescriptorSet.cpp +++ b/src/nbl/asset/ICPUDescriptorSet.cpp @@ -108,36 +108,35 @@ core::smart_refctd_ptr ICPUDescriptorSet::clone(uint32_t _depth) const return cp; } -IAsset* ICPUDescriptorSet::getDependant_impl(size_t ix) +core::unordered_set ICPUDescriptorSet::computeDependants() const { - for (auto i=0u; i(IDescriptor::E_TYPE::ET_COUNT); i++) - if (m_descriptorInfos[i]) + core::unordered_set dependants = { m_layout.get() }; + for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) { - const auto size = m_descriptorInfos[i]->size(); - if (ixoperator[](ix).desc.get(); - if (desc) - switch (IDescriptor::GetTypeCategory(static_cast(i))) - { - case IDescriptor::EC_BUFFER: - return static_cast(desc); - case IDescriptor::EC_SAMPLER: - return static_cast(desc); - case IDescriptor::EC_IMAGE: - return static_cast(desc); - case 
IDescriptor::EC_BUFFER_VIEW: - return static_cast(desc); - case IDescriptor::EC_ACCELERATION_STRUCTURE: - return static_cast(desc); - default: - break; - } - return nullptr; - } - else - ix -= size; + if (!m_descriptorInfos[i]) continue; + const auto size = m_descriptorInfos[i]->size(); + for (auto desc_i = 0u; desc_i < size; desc_i++) + { + auto* desc = m_descriptorInfos[i]->operator[](desc_i).desc.get(); + if (!desc) continue; + switch (IDescriptor::GetTypeCategory(static_cast(i))) + { + case IDescriptor::EC_BUFFER: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_SAMPLER: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_IMAGE: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_BUFFER_VIEW: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_ACCELERATION_STRUCTURE: + dependants.insert(static_cast(desc)); + default: + break; + } + } } - return nullptr; + return dependants; } + } \ No newline at end of file From 542592f7c5926f601351bb1872d65e171b742440 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 15 May 2025 14:44:10 +0700 Subject: [PATCH 119/346] soome changes to arithmetic config --- examples_tests | 2 +- .../hlsl/workgroup2/arithmetic_config.hlsl | 46 +++++++++---------- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 2 +- 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/examples_tests b/examples_tests index 908abd110c..81238adaec 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 908abd110c387d48110ce8aeb67f0e0f2dd68943 +Subproject commit 81238adaecbd8d717bdab0dd73e08e2938a794c6 diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 2f24c863da..d0800d6996 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -18,6 +18,8 @@ namespace impl template struct virtual_wg_size_log2 { + 
static_assert(WorkgroupSizeLog2>=SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); + static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2+4, "WorkgroupSize cannot be larger than SubgroupSize*16"); NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v+SubgroupSizeLog2; }; @@ -30,6 +32,24 @@ struct items_per_invocation NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value, ItemsPerInvocationProductLog2>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v; }; + +// explicit specializations for cases that don't fit +#define SPECIALIZE_VIRTUAL_WG_SIZE_CASE(WGLOG2, SGLOG2, LEVELS, VALUE) template<>\ +struct virtual_wg_size_log2\ +{\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = LEVELS;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t value = VALUE;\ +};\ + +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(11,4,3,12); +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(7,7,1,7); +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(6,6,1,6); +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(5,5,1,5); +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(4,4,1,4); +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(3,3,1,3); +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(2,2,1,2); + +#undef SPECIALIZE_VIRTUAL_WG_SIZE_CASE } template @@ -39,7 +59,6 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; - static_assert(WorkgroupSizeLog2>=_SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); // must have at least enough level 0 outputs to feed a single subgroup NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; @@ -55,34 +74,11 @@ struct 
ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2; static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); - NBL_CONSTEXPR_STATIC_INLINE uint16_t SharedMemSize = conditional_value::value + SubgroupsPerVirtualWorkgroup*ItemsPerInvocation_1; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementCount = conditional_value::value + SubgroupSize*ItemsPerInvocation_1>::value; }; -// special case when workgroup size 2048 and subgroup size 16 needs 3 levels and virtual workgroup size 4096 to get a full subgroup scan each on level 1 and 2 16x16x16=4096 -// specializing with macros because of DXC bug: https://github.com/microsoft/DirectXShaderCom0piler/issues/7007 -#define SPECIALIZE_CONFIG_CASE_2048_16(ITEMS_PER_INVOC) template<>\ -struct ArithmeticConfiguration<11, 4, ITEMS_PER_INVOC>\ -{\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << 11u;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = uint16_t(4u);\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroupLog2 = 7u;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroup = 128u;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = 3u;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << 4096;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = ITEMS_PER_INVOC;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = 1u;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = 1u;\ -};\ - -SPECIALIZE_CONFIG_CASE_2048_16(1) -SPECIALIZE_CONFIG_CASE_2048_16(2) -SPECIALIZE_CONFIG_CASE_2048_16(4) - } } } -#undef SPECIALIZE_CONFIG_CASE_2048_16 - #endif diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 681ba39911..461b685c99 100644 --- 
a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -9,7 +9,7 @@ #include "nbl/builtin/hlsl/subgroup2/ballot.hlsl" #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" #include "nbl/builtin/hlsl/mpl.hlsl" -#include "nbl/builtin/hlsl/workgroup2/config.hlsl" +#include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl" namespace nbl { From a9930a025b4b252c1a08c4abc59cd1652cb666ac Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 15 May 2025 16:00:34 +0700 Subject: [PATCH 120/346] removed referencing workgroupID in scans --- examples_tests | 2 +- .../hlsl/workgroup2/arithmetic_config.hlsl | 10 ++++++++ .../builtin/hlsl/workgroup2/shared_scan.hlsl | 24 +++++++++---------- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/examples_tests b/examples_tests index 81238adaec..1de31ddfd7 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 81238adaecbd8d717bdab0dd73e08e2938a794c6 +Subproject commit 1de31ddfd725009bd650f1fe80f1c4a8c2e6a14a diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index d0800d6996..88ff328e05 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -77,6 +77,16 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementCount = conditional_value::value + SubgroupSize*ItemsPerInvocation_1>::value; }; +template +struct is_configuration : bool_constant {}; + +template +struct is_configuration > : bool_constant {}; + +template +NBL_CONSTEXPR bool is_configuration_v = is_configuration::value; + + } } } diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 461b685c99..1043decd73 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ 
b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -43,9 +43,9 @@ struct reduce subgroup2::reduction reduction; vector_t value; - dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); + dataAccessor.get(workgroup::SubgroupContiguousIndex(), value); value = reduction(value); - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with top line? + dataAccessor.set(workgroup::SubgroupContiguousIndex(), value); } }; @@ -63,7 +63,7 @@ struct scan using params_t = subgroup2::ArithmeticParams; vector_t value; - dataAccessor.get(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); + dataAccessor.get(workgroup::SubgroupContiguousIndex(), value); if (Exclusive) { subgroup2::exclusive_scan excl_scan; @@ -74,7 +74,7 @@ struct scan subgroup2::inclusive_scan incl_scan; value = incl_scan(value); } - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::SubgroupSize + workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? + dataAccessor.set(workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? 
} }; @@ -101,7 +101,7 @@ struct reduce [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = reduction0(scan_local[idx]); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { @@ -131,7 +131,7 @@ struct reduce { scalar_t reduce_val; scratchAccessor.get(glsl::gl_SubgroupInvocationID(),reduce_val); - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); + dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); } } }; @@ -158,7 +158,7 @@ struct scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = inclusiveScan0(scan_local[idx]); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { @@ -204,7 +204,7 @@ struct scan for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) scan_local[idx][i] = binop(left, scan_local[idx][i]); } - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); } } }; @@ -234,7 +234,7 @@ struct reduce [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - 
dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = reduction0(scan_local[idx]); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { @@ -281,7 +281,7 @@ struct reduce { scalar_t reduce_val; scratchAccessor.get(glsl::gl_SubgroupInvocationID(),reduce_val); - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); + dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); } } }; @@ -310,7 +310,7 @@ struct scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.get(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = inclusiveScan0(scan_local[idx]); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { @@ -384,7 +384,7 @@ struct scan for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) scan_local[idx][i] = binop(left, scan_local[idx][i]); } - dataAccessor.set(glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize + idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); } } }; From 8a2ebe36f3e1ede2dec4658f4e7130fac7886c24 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 15 May 2025 13:51:23 +0200 Subject: [PATCH 121/346] correct strategy for boost's .gitmodules mini tool which will *not* lead to issues with non-existing references, update 3rdparty/boost/superproject submodule --- 3rdparty/boost/CMakeLists.txt | 51 
+++++++++++++++-------------------- 3rdparty/boost/superproject | 2 +- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/3rdparty/boost/CMakeLists.txt b/3rdparty/boost/CMakeLists.txt index 36e596cbf6..194ad3c35c 100644 --- a/3rdparty/boost/CMakeLists.txt +++ b/3rdparty/boost/CMakeLists.txt @@ -56,13 +56,11 @@ if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs set(NBL_BOOSTDEP_EXE_FILEPATH "${CMAKE_CURRENT_BINARY_DIR}/superproject/tools/boostdep/bin/${NBL_BOOSTDEP_EXE}") - if(NOT EXISTS "${NBL_BOOSTDEP_EXE_FILEPATH}") - macro(NBL_BOOST_EXECUTE) - execute_process(COMMAND ${ARGV} - WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/superproject" - ) - endmacro() + macro(NBL_BOOST_EXECUTE) + execute_process(COMMAND ${ARGV} WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/superproject") + endmacro() + if(NOT EXISTS "${NBL_BOOSTDEP_EXE_FILEPATH}") NBL_BOOST_EXECUTE(cmd /C bootstrap.bat) NBL_BOOST_EXECUTE(cmd /C b2.exe tools/boostdep/build) NBL_BOOST_EXECUTE("${CMAKE_COMMAND}" -E copy "./dist/bin/${NBL_BOOSTDEP_EXE}" "${NBL_BOOSTDEP_EXE_FILEPATH}") @@ -70,7 +68,7 @@ if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs NBL_BOOST_EXECUTE(git reset --hard) endif() - execute_process(COMMAND "${NBL_BOOSTDEP_EXE_FILEPATH}" --boost-root "${CMAKE_CURRENT_SOURCE_DIR}/superproject" --brief wave + NBL_BOOST_EXECUTE("${NBL_BOOSTDEP_EXE_FILEPATH}" --boost-root "${CMAKE_CURRENT_SOURCE_DIR}/superproject" --brief wave OUTPUT_VARIABLE NBL_OUTPUT_VAR ) @@ -83,29 +81,22 @@ if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs list(FILTER NBL_BOOST_LIBS EXCLUDE REGEX "(unknown)") string(REPLACE "~" "/" NBL_BOOST_LIBS "${NBL_BOOST_LIBS}") - # we override boost's .gitmodules to pick only those modules we really use (reported by boost's dep executable) - # boost hosts now like 200 repositories, some of them are really big however atm we reference around 60 - set(BOOST_SUBMODULE_TEMPLATE -[=[ - -[submodule "@NAME@"] - path = libs/@NAME@ - url = ../@FLATTEN_NAME@.git - 
fetchRecurseSubmodules = on-demand - branch = . -]=] - ) + # NOTE: you commit this file to version control AND boost's .gitmodules *if got changed*, use when updating boost to more recent version + file(WRITE "${NBL_BOOST_WAVE_DEP_FILE}" "set(NBL_BOOST_LIBS ${NBL_BOOST_LIBS})") - unset(BOOST_GITMODULES) - foreach(NAME ${NBL_BOOST_LIBS}) - string(REPLACE "/" "_" FLATTEN_NAME "${NAME}") - string(CONFIGURE "${BOOST_SUBMODULE_TEMPLATE}" TEMPLATE) - string(APPEND BOOST_GITMODULES "${TEMPLATE}") - endforeach() + NBL_BOOST_EXECUTE(git config --file .gitmodules --get-regexp path OUTPUT_VARIABLE NBL_OUTPUT_VARIABLE) - # NOTE: this you commit to version control - file(WRITE "${NBL_BOOST_WAVE_DEP_FILE}" "set(NBL_BOOST_LIBS ${NBL_BOOST_LIBS})") + string(REGEX REPLACE "\n" ";" NBL_SUBMODULE_CONFIG_LIST "${NBL_OUTPUT_VARIABLE}") - # and this one too + you update boost submodule pointer with the update! - file(WRITE "${CMAKE_CURRENT_SOURCE_DIR}/superproject/.gitmodules" "${BOOST_GITMODULES}") -endif() + message(STATUS "Updating boost .gitmodules") + foreach(NBL_SUBMODULE_NAME ${NBL_SUBMODULE_CONFIG_LIST}) + string(REGEX MATCH "submodule\\.(.*)\\.path" NBL_SUBMODULE_NAME "${NBL_SUBMODULE_NAME}") + NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.${CMAKE_MATCH_1}.update none) # fallback, ignore all + endforeach() + + foreach(NAME ${NBL_BOOST_LIBS}) + string(REPLACE "/" "_" SUBMODULE "${NAME}") + message(STATUS "BOOST SUBMODULE = ${SUBMODULE}") + NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.${SUBMODULE}.update checkout) # pick only those reported by the module we use + endforeach() +endif() \ No newline at end of file diff --git a/3rdparty/boost/superproject b/3rdparty/boost/superproject index e1a703f795..dcc3e1ade0 160000 --- a/3rdparty/boost/superproject +++ b/3rdparty/boost/superproject @@ -1 +1 @@ -Subproject commit e1a703f7956264e463329d49ab05100bdc34e219 +Subproject commit dcc3e1ade0ae8e7ea0eadc2d951efb1e53450bff From 
892595c0263ed70e71aaa948cef6fe2370c44ab5 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 15 May 2025 14:36:53 +0200 Subject: [PATCH 122/346] BLAS and TLAS build code reuse and unification --- include/nbl/video/IGPUAccelerationStructure.h | 2 +- src/nbl/video/utilities/CAssetConverter.cpp | 844 ++++++++++-------- 2 files changed, 486 insertions(+), 360 deletions(-) diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h index 60c6add5fb..af541bdccb 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -177,7 +177,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat inline bool usesMotion() const override {return m_params.flags.hasFlags(SCreationParams::FLAGS::MOTION_BIT);} // read the comments in the .hlsl file, AABB builds ignore certain fields - using BuildRangeInfo = hlsl::acceleration_structures::bottom_level::BuildRangeInfo; + using BuildRangeInfo = hlsl::acceleration_structures::bottom_level::BuildRangeInfo; // TODO: rename to GeometryRangeInfo, and make `BuildRangeInfo = const GeometryRangeInfo*` using DirectBuildRangeRangeInfos = const BuildRangeInfo* const*; using MaxInputCounts = const uint32_t* const; diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index bc9fac01c0..0167a96a43 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2852,7 +2852,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } else { - const uint32_t* pMaxPrimitiveCounts = as->getGeometryPrimitiveCounts().data(); + const uint32_t* pPrimitiveCounts = as->getGeometryPrimitiveCounts().data(); // the code here is not pretty, but DRY-ing is of this is for later if (buildFlags.hasFlags(ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) { @@ -2862,56 +2862,59 @@ auto 
CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult const std::span> cpuGeoms = { reinterpret_cast*>(geoms.data()),geoms.size() }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pPrimitiveCounts); } else { const std::span> cpuGeoms = { reinterpret_cast*>(geoms.data()),geoms.size() }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pPrimitiveCounts); } // TODO: check if the strides need to be aligned to 4 bytes for AABBs for (const auto& geom : geoms) - if (const auto aabbCount=*(pMaxPrimitiveCounts++); aabbCount) + if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount) incrementBuildSize(aabbCount*geom.stride,alignof(float)); } else { - core::map allocationsPerStride; const auto geoms = as->getTriangleGeometries(); if (patch.hostBuild) { const std::span> cpuGeoms = { reinterpret_cast*>(geoms.data()),geoms.size() }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pPrimitiveCounts); } else { const std::span> cpuGeoms = { reinterpret_cast*>(geoms.data()),geoms.size() }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); + sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pPrimitiveCounts); } for (const auto& geom : geoms) - if (const auto triCount=*(pMaxPrimitiveCounts++); triCount) + if (const auto triCount=*(pPrimitiveCounts++); triCount) { + auto size = geom.vertexStride*(geom.vertexData[1] ? 
2:1)*geom.maxVertex; + if (geom.hasTransform()) + size = core::alignUp(size,alignof(float))+sizeof(hlsl::float32_t3x4); + auto alignment = 0u; switch (geom.indexType) { case E_INDEX_TYPE::EIT_16BIT: - allocationsPerStride[sizeof(uint16_t)] += triCount*3; + alignment = alignof(uint16_t); break; case E_INDEX_TYPE::EIT_32BIT: - allocationsPerStride[sizeof(uint32_t)] += triCount*3; + alignment = alignof(uint32_t); break; default: break; } - allocationsPerStride[geom.vertexStride] += (geom.vertexData[1] ? 2:1)*geom.maxVertex; + if (alignment) + size = core::alignUp(size,alignment)+triCount*3*alignment; + incrementBuildSize(size,hlsl::max(alignment,geom.vertexStride)); } - for (const auto& entry : allocationsPerStride) - incrementBuildSize(entry.first*entry.second,entry.first); } } } @@ -4617,226 +4620,169 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul auto& tlasesToBuild = reservations.m_tlasConversions[0]; const auto blasCount = blasesToBuild.size(); const auto tlasCount = tlasesToBuild.size(); - const auto maxASCount = hlsl::max(tlasCount,blasCount); ownershipTransfers.reserve(blasCount+tlasCount); - auto* scratchBuffer = params.scratchForDeviceASBuild->getBuffer(); - core::vector flushRanges; - const bool manualFlush = scratchBuffer->getBoundMemory().memory->haveToMakeVisible(); - if (manualFlush) // BLAS builds do max 3 writes each TLAS builds do max 2 writes each - flushRanges.reserve(hlsl::max(blasCount*3,tlasCount*2)); // Right now we build all BLAS first, then all TLAS // (didn't fancy horrible concurrency managment taking compactions into account) auto queryPool = device->createQueryPool({.queryCount=hlsl::max(blasCount,tlasCount),.queryType=IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE}); - const asset::SMemoryBarrier readGeometryOrInstanceInASBuildBarrier = { - // the last use of the source BLAS could have been a build or a compaction - .srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, - .srcAccessMask = 
ACCESS_FLAGS::TRANSFER_WRITE_BIT, - .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, - .dstAccessMask = ACCESS_FLAGS::STORAGE_READ_BIT - }; - // lambdas! - auto streamDataToScratch = [&](const size_t offset, const size_t size,IUtilities::IUpstreamingDataProducer& callback) -> bool - { - if (deviceASBuildScratchPtr) - { - callback(deviceASBuildScratchPtr+offset,0ull,size); - if (manualFlush) - flushRanges.emplace_back(scratchBuffer->getBoundMemory().memory,offset,size,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); - return true; - } - else if (const SBufferRange range={.offset=offset,.size=size,.buffer=smart_refctd_ptr(scratchBuffer)}; params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,callback)) - return true; - else - return false; - }; - // - using scratch_allocator_t = std::remove_reference_t; - using addr_t = typename scratch_allocator_t::size_type; - core::vector scratchOffsets; - scratchOffsets.reserve(maxASCount); - core::vector scratchSizes; - scratchSizes.reserve(maxASCount); - auto recordBuildCommandsBase = [&](auto& buildInfos, auto& rangeInfos)->void - { - if (buildInfos.empty()) - return; - // Lets analyze sync cases: - // - Mapped Host write = no barrier, flush & optional submit sufficient - // - Single Queue = Global Memory Barrier - // - Two distinct Queues = no barrier, semaphore signal-wait is sufficient - // - Two distinct Queue Families Exclusive Sharing mode = QFOT necessary but we require concurrent sharing on the scratch buffer ! 
- bool success = !uniQueue || !deviceASBuildScratchPtr || pipelineBarrier(computeCmdBuf,{.memBarriers={&readGeometryOrInstanceInASBuildBarrier,1}},"Pipeline Barriers of Acceleration Structure backing Buffers failed!"); - // - success = success && computeCmdBuf->cmdbuf->buildAccelerationStructures({buildInfos},rangeInfos.data()); - if (success) - { - submitsNeeded |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; - // queue up a deferred allocation - params.scratchForDeviceASBuild->multi_deallocate(scratchOffsets.size(),scratchOffsets.data(),scratchSizes.data(),params.compute->getFutureScratchSemaphore()); - } - else - { - // release right away - params.scratchForDeviceASBuild->multi_deallocate(scratchOffsets.size(),scratchOffsets.data(),scratchSizes.data()); - for (const auto& info : buildInfos) - { - const auto stagingFound = findInStaging.template operator()(info.dstAS); - smart_refctd_ptr dummy; // already null at this point - markFailure("AS Build Command Recording",&dummy,&stagingFound->second); - } - } - scratchOffsets.clear(); - scratchSizes.clear(); - buildInfos.clear(); - rangeInfos.clear(); - }; - - // Not messing around with listing AS backing buffers individually, ergonomics of that are null - const asset::SMemoryBarrier readASInASCompactBarrier = { - .srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, - .srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT, - .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT, - .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT - }; - - // compacted BLASes need to be substituted in cache and TLAS Build Inputs - using compacted_blas_map_t = core::unordered_map>; + // leftover for TLAS builds + using compacted_blas_map_t = unordered_map>; compacted_blas_map_t compactedBLASMap; - // Device BLAS builds - if (blasCount) + bool failedBLASBarrier = false; + // returns a map of compacted Acceleration Structures + auto buildAndCompactASes = [&](auto& asesToBuild)->unordered_map> 
{ - core::vector compactions; - // build - { - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build BLASes START"); - computeCmdBuf->cmdbuf->endDebugMarker(); -#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION - constexpr auto GeometryIsAABBFlag = ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; - - core::vector buildInfos; buildInfos.reserve(blasCount); - core::vector rangeInfo; rangeInfo.reserve(blasCount); - core::vector> triangles; - core::vector> aabbs; - { - size_t totalTriGeoCount = 0; - size_t totalAABBGeoCount = 0; - for (auto& item : blasToBuild) - { - const size_t geoCount = item.canonical->getGeometryCount(); - if (item.canonical->getBuildFlags().hasFlags(GeometryIsAABBFlag)) - totalAABBGeoCount += geoCount; - else - totalTriGeoCount += geoCount; - } - triangles.reserve(totalTriGeoCount); - triangles.reserve(totalAABBGeoCount); - } - for (auto& item : blasToBuild) - { - auto* as = item.gpuObj; - auto pFound = &findInStaging.template operator()(as)->second; - if (item.asBuildParams.host) - { - auto dOp = device->createDeferredOperation(); - // - if (!device->buildAccelerationStructure(dOp.get(),info,range)) - { - markFailure("BLAS Build Command Recording",&item.canonical,pFound); - continue; - } - } - else - { - auto& buildInfo = buildInfo.emplace_back({ - .buildFlags = item.buildFlags, - .geometryCount = item.canonical->getGeometryCount(), - // this is not an update - .srcAS = nullptr, - .dstAS = as.get() - }); - if (item.canonical->getBuildFlags().hasFlags(GeometryIsAABBFlag)) - buildInfo.aabbs = nullptr; - else - buildInfo.triangles = nullptr; - computeCmdBuf->cmdbuf->buildAccelerationStructures(buildInfo,rangeInfo); - } - } -#endif - if (!compactions.empty()) - { - // submit cause host needs to read the queries - drainCompute(); - } - // want to launch the BLAS builds in a separate submit, so the scratch semaphore can signal and free the scratch so more is available for TLAS builds - else if (tlasCount) - 
drainCompute(); - blasesToBuild.clear(); - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build BLASes END"); - computeCmdBuf->cmdbuf->endDebugMarker(); - } - // compact - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact BLASes START"); - computeCmdBuf->cmdbuf->endDebugMarker(); - { - // the already compacted BLASes need to be written into the TLASes using them, want to swap them out ASAP -//compactedBLASMap[as] = compacted; - } - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact BLASes END"); - computeCmdBuf->cmdbuf->endDebugMarker(); - } + const auto asCount = asesToBuild.size(); + if (asCount==0) + return {}; + + constexpr bool IsTLAS = std::is_same_v; + using CPUAccelerationStructure = std::conditional_t; - // Device TLAS builds - if (tlasCount) - { - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build TLASes START"); - computeCmdBuf->cmdbuf->endDebugMarker(); - // A single pipeline barrier to ensure BLASes build before TLASes is needed - const asset::SMemoryBarrier readBLASInTLASBuildBarrier = { - // the last use of the source BLAS could have been a build or a compaction - .srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT|PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT, - .srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT, - .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, - .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT - }; - // either we built no BLASes (remember we could retrieve already built ones from cache) or we barrier for the previous compactions or builds - const bool failedBLASBarrier = blasCount && !pipelineBarrier(computeCmdBuf,{.memBarriers={&readBLASInTLASBuildBarrier,1}},"Failed to sync BLAS with TLAS build!"); - // TLAS compactions to do later core::vector compactions; // 0xffFFffFFu when not releasing ownership, otherwise index into `ownershipTransfers` where the ownership release for the old buffer was 
core::vector compactedOwnershipReleaseIndices; - compactions.reserve(tlasCount); - compactedOwnershipReleaseIndices.reserve(tlasCount); + compactions.reserve(asCount); + compactedOwnershipReleaseIndices.reserve(asCount); // build { + auto* scratchBuffer = params.scratchForDeviceASBuild->getBuffer(); + core::vector flushRanges; + const bool manualFlush = scratchBuffer->getBoundMemory().memory->haveToMakeVisible(); + if (manualFlush) // TLAS builds do max 2 writes each and BLAS do much more anyway + flushRanges.reserve(asCount*2); + // lambdas! + auto streamDataToScratch = [&](const size_t offset, const size_t size,IUtilities::IUpstreamingDataProducer& callback) -> bool + { + if (deviceASBuildScratchPtr) + { + callback(deviceASBuildScratchPtr+offset,0ull,size); + if (manualFlush) + flushRanges.emplace_back(scratchBuffer->getBoundMemory().memory,offset,size,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); + return true; + } + else if (const SBufferRange range={.offset=offset,.size=size,.buffer=smart_refctd_ptr(scratchBuffer)}; params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,callback)) + return true; + else + return false; + }; // - core::vector buildInfos; - buildInfos.reserve(tlasCount); - core::vector rangeInfos; - rangeInfos.reserve(tlasCount); + core::vector buildInfos; + buildInfos.reserve(asCount); + using build_range_info_t = std::conditional_t; + core::vector rangeInfos; + rangeInfos.reserve(asCount); + using scratch_allocator_t = std::remove_reference_t; + using addr_t = typename scratch_allocator_t::size_type; + core::vector allocOffsets; + allocOffsets.reserve(asCount); + core::vector allocSizes; + allocSizes.reserve(asCount); + // BLAS and TLAS specific things + core::vector geometryRangeInfo; + core::vector> triangles; + core::vector> aabbs; core::vector> trackedBLASes; - trackedBLASes.reserve(maxASCount); + if constexpr (IsTLAS) + trackedBLASes.reserve(asCount); + else // would have to count total geometries in 
BLASes to initialize properly, and we probably don't want to over-reserve + { + geometryRangeInfo.reserve(asCount); + triangles.reserve(asCount); + aabbs.reserve(asCount); + } + // + core::vector alignments; + alignments.reserve(asCount*2); + constexpr auto GeometryIsAABBFlag = IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; auto recordBuildCommands = [&]()->void { - // rewrite the trackedBLASes pointers - for (auto& info : buildInfos) + bool success = !buildInfos.empty(); + // Lets analyze sync cases: + // - Mapped Host write = no barrier, flush & optional submit sufficient + // - Single Queue = Global Memory Barrier + // - Two distinct Queues = no barrier, semaphore signal-wait is sufficient + // - Two distinct Queue Families Exclusive Sharing mode = QFOT necessary but we require concurrent sharing on the scratch buffer ! + if (success) + { + const asset::SMemoryBarrier readGeometryOrInstanceInASBuildBarrier = { + // the last use of the source BLAS could have been a build or a compaction + .srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + .srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT, + .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, + .dstAccessMask = ACCESS_FLAGS::STORAGE_READ_BIT + }; + success = !uniQueue || deviceASBuildScratchPtr || pipelineBarrier(computeCmdBuf,{.memBarriers={&readGeometryOrInstanceInASBuildBarrier,1}},"Pipeline Barriers of Acceleration Structure backing Buffers failed!"); + } + // + constexpr bool IsTLAS = std::is_same_v; + if (success) + { + // rewrite the based pointers + if constexpr (IsTLAS) + for (auto& info : buildInfos) + { + const auto offset = info.trackedBLASes.data(); + const auto correctPtr = trackedBLASes.data()+reinterpret_cast(offset); + info.trackedBLASes = {reinterpret_cast(correctPtr),info.trackedBLASes.size()}; + } + else + { + for (auto& info : buildInfos) + { + if (info.buildFlags.hasFlags(GeometryIsAABBFlag)) + info.aabbs = 
aabbs.data()+reinterpret_cast(info.aabbs); + else + info.triangles = triangles.data()+reinterpret_cast(info.triangles); + } + for (auto& rangeInfo : rangeInfos) + rangeInfo = geometryRangeInfo.data()+reinterpret_cast(rangeInfo); + } + success = computeCmdBuf->cmdbuf->buildAccelerationStructures({buildInfos},rangeInfos.data()); + } + // account for the in-progress allocation (we may be called from an overflow submit) + const auto oldAllocCount = allocOffsets.size()-alignments.size(); + if (success) + { + submitsNeeded |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + // queue up a deferred allocation + params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data(),params.compute->getFutureScratchSemaphore()); + } + else + { + // release right away + params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data()); + for (const auto& info : buildInfos) + { + const auto stagingFound = findInStaging.template operator()(info.dstAS); + smart_refctd_ptr dummy; // already null at this point + markFailure("AS Build Command Recording",&dummy,&stagingFound->second); + } + } + allocOffsets.erase(allocOffsets.begin(),allocOffsets.begin()+oldAllocCount); + allocSizes.erase(allocSizes.begin(),allocSizes.begin()+oldAllocCount); + buildInfos.clear(); + rangeInfos.clear(); + if constexpr (IsTLAS) + trackedBLASes.clear(); + else { - const auto offset = info.trackedBLASes.data(); - const auto correctPtr = trackedBLASes.data()+reinterpret_cast(offset); - info.trackedBLASes = {reinterpret_cast(correctPtr),info.trackedBLASes.size()}; + geometryRangeInfo.clear(); + triangles.clear(); + aabbs.clear(); } - recordBuildCommandsBase(buildInfos,rangeInfos); - trackedBLASes.clear(); }; - // + + computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build Acceleration Structures START"); + computeCmdBuf->cmdbuf->endDebugMarker(); const auto& limits = physDev->getLimits(); - for (auto& tlasToBuild : tlasesToBuild) + for (auto& 
asToBuild : asesToBuild) { - auto& canonical = tlasToBuild.second.canonical; - const auto as = tlasToBuild.first; - const auto pFound = &findInStaging.template operator()(as)->second; + auto& canonical = asToBuild.second.canonical; + const auto as = asToBuild.first; + const auto pFound = &findInStaging.template operator()(as)->second; const auto& backingRange = as->getCreationParams().bufferRange; // checking ownership for the future on old buffer, but compacted will be made with same sharing creation parameters const auto finalOwnerQueueFamily = checkOwnership(backingRange.buffer.get(),params.getFinalOwnerQueueFamily(as,pFound->cacheKey.value),computeFamily); @@ -4845,79 +4791,137 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul markFailure("invalid Final Queue Family given by user callback",&canonical,pFound); continue; } - const auto instances = canonical->getInstances(); - const auto instanceCount = static_cast(instances.size()); - const auto& instanceMap = tlasToBuild.second.instanceMap; - size_t instanceDataSize = 0; - // gather total input size and check dependants exist - bool dependsOnBLASBuilds = false; - for (const auto& instance : instances) - { - auto found = instanceMap.find(instance.getBase().blas.get()); - assert(instanceMap.end()!=found); - const auto depInfo = missingDependent.template operator()(found->second.get()); - if (depInfo) + // clean up the allocation if we fail to make it to the end of loop for whatever reason + alignments.clear(); + auto allocCount = 0; + auto deallocSrc = core::makeRAIIExiter([¶ms,&allocOffsets,&allocSizes,&alignments,&allocCount]()->void { - instanceDataSize = 0; - break; + const auto beginIx = allocSizes.size()-alignments.size(); + // if got to end of loop queue up the release of memory, otherwise release right away + if (allocCount) + params.scratchForDeviceASBuild->multi_deallocate(allocCount,allocOffsets.data()+beginIx,allocSizes.data()+beginIx); + allocOffsets.resize(beginIx); + 
allocSizes.resize(beginIx); + alignments.clear(); } - if (depInfo.wasInStaging) - dependsOnBLASBuilds = true; - instanceDataSize += ITopLevelAccelerationStructure::getInstanceSize(instance.getType()); - } - // problem with building some Dependent BLASes - if (failedBLASBarrier && dependsOnBLASBuilds) + ); + allocSizes.push_back(asToBuild.second.scratchSize); + alignments.push_back(limits.minAccelerationStructureScratchOffsetAlignment); + const bitflag buildFlags = asToBuild.second.getBuildFlags(); + if constexpr (IsTLAS) { - markFailure("building BLASes which current TLAS build wants to instance",&canonical,pFound); - continue; + const auto instances = canonical->getInstances(); + // gather total input size and check dependants exist + size_t instanceDataSize = 0; + bool dependsOnBLASBuilds = false; + const auto& instanceMap = asToBuild.second.instanceMap; + for (const auto& instance : instances) + { + auto found = instanceMap.find(instance.getBase().blas.get()); + assert(instanceMap.end()!=found); + const auto depInfo = missingDependent.template operator()(found->second.get()); + if (depInfo) + { + instanceDataSize = 0; + break; + } + if (depInfo.wasInStaging) + dependsOnBLASBuilds = true; + instanceDataSize += ITopLevelAccelerationStructure::getInstanceSize(instance.getType()); + } + // problem with building some Dependent BLASes + if (failedBLASBarrier && dependsOnBLASBuilds) + { + markFailure("building BLASes which current TLAS build wants to instance",&canonical,pFound); + continue; + } + // problem with finding the dependents (BLASes) + if (instanceDataSize==0) + { + markFailure("finding valid Dependant GPU BLASes for TLAS build",&canonical,pFound); + continue; + } + allocSizes.push_back(instanceDataSize); + alignments.push_back(16); + if (as->usesMotion()) + { + allocSizes.push_back(sizeof(void*)*instances.size()); + alignments.push_back(alignof(uint64_t)); + } } - // problem with finding the dependents (BLASes) - if (instanceDataSize==0) + else { - 
markFailure("finding valid Dependant GPU BLASes for TLAS build",&canonical,pFound); - continue; - } - // allocate scratch and build inputs - constexpr uint32_t MaxAllocCount = 3; - addr_t offsets[MaxAllocCount] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value}; - const addr_t sizes[MaxAllocCount] = {tlasToBuild.second.scratchSize,instanceDataSize,sizeof(void*)*instanceCount}; - const auto AllocCount = as->usesMotion() ? 3:2; - // clean up the allocation if we fail to make it to the end of loop for whatever reason - bool abortAllocation = true; - auto deallocSrc = core::makeRAIIExiter([¶ms,&scratchOffsets,&scratchSizes,AllocCount,&offsets,&sizes,&abortAllocation]()->void + const uint32_t* pPrimitiveCounts = canonical->getGeometryPrimitiveCounts().data(); + if (buildFlags.hasFlags(GeometryIsAABBFlag)) { - // if got to end of loop queue up the release of memory, otherwise release right away - if (abortAllocation) - params.scratchForDeviceASBuild->multi_deallocate(AllocCount,&offsets[0],&sizes[0]); - else - for (auto i=0; igetAABBGeometries()) + if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount) { - scratchOffsets.push_back(offsets[i]); - scratchSizes.push_back(sizes[i]); + allocSizes.push_back(aabbCount*geom.stride); + alignments.push_back(alignof(float)); } } - ); - // allocate out scratch or submit overflow - { - const addr_t alignments[MaxAllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,alignof(uint64_t)}; - // if fail then flush and keep trying till space is made - for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(AllocCount,&offsets[0],&sizes[0],&alignments[0])!=0u; t++) - if (t==1) // don't flush right away cause allocator not defragmented yet + else { - recordBuildCommands(); - // if writing to scratch directly, flush the writes - if (!flushRanges.empty()) + for (const auto& geom : canonical->getTriangleGeometries()) + if (const auto 
triCount=*(pPrimitiveCounts++); triCount) { - device->flushMappedMemoryRanges(flushRanges); - flushRanges.clear(); + auto size = geom.vertexStride*(geom.vertexData[1] ? 2:1)*geom.maxVertex; + if (geom.hasTransform()) + size = core::alignUp(size,alignof(float))+sizeof(hlsl::float32_t3x4); + auto alignment = 0u; + switch (geom.indexType) + { + case E_INDEX_TYPE::EIT_16BIT: + alignment = alignof(uint16_t); + break; + case E_INDEX_TYPE::EIT_32BIT: + alignment = alignof(uint32_t); + break; + default: + break; + } + if (alignment) + size = core::alignUp(size,alignment)+triCount*3*alignment; + allocSizes.push_back(size); + alignments.push_back(hlsl::max(alignment,geom.vertexStride)); } - drainCompute(); } } - // stream the instance/geometry input in + allocOffsets.resize(allocSizes.size(),scratch_allocator_t::invalid_value); + // allocate out scratch or submit overflow, if fail then flush and keep trying till space is made + auto* const offsets = allocOffsets.data()+allocOffsets.size()-allocCount; + const auto* const sizes = allocSizes.data()+allocSizes.size()-allocCount; + for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(allocCount,offsets,sizes,alignments.data())!=0; t++) + if (t==1) // don't flush right away cause allocator not defragmented yet + { + recordBuildCommands(); + // if writing to scratch directly, flush the writes + if (!flushRanges.empty()) + { + device->flushMappedMemoryRanges(flushRanges); + flushRanges.clear(); + } + drainCompute(); + } + // now upon a failure, our allocations will need to be deallocated + allocCount = alignments.size(); + // prepare build infos + typename AccelerationStructure::DeviceBuildInfo buildInfo; + buildInfo.scratch = {.offset=offsets[0],.buffer=smart_refctd_ptr(scratchBuffer)}; + buildInfo.buildFlags = buildFlags; + buildInfo.dstAS = as; + // abortion backup + bool success = true; + const auto geometryRangeInfoOffset = geometryRangeInfo.size(); + const auto trianglesOffset = triangles.size(); + const auto 
aabbsOffset = aabbs.size(); const size_t trackedBLASesOffset = trackedBLASes.size(); + if constexpr (IsTLAS) { - bool success = true; + const auto instances = canonical->getInstances(); + const auto instanceCount = static_cast(instances.size()); + // stream the instance/geometry input in { struct FillInstances : IUtilities::IUpstreamingDataProducer { @@ -4955,11 +4959,11 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul FillInstances fillInstances; fillInstances.compactedBLASMap = &compactedBLASMap; fillInstances.trackedBLASes = &trackedBLASes; - fillInstances.instanceMap = &tlasToBuild.second.instanceMap; + fillInstances.instanceMap = &asToBuild.second.instanceMap; fillInstances.instances = instances; success = streamDataToScratch(offsets[1],sizes[1],fillInstances); // provoke refcounting bugs right away - tlasToBuild.second.instanceMap.clear(); + asToBuild.second.instanceMap.clear(); } if (success && as->usesMotion()) { @@ -4989,33 +4993,107 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul fillInstancePointers.instanceAddress = scratchBuffer->getDeviceAddress()+offsets[1]; success = streamDataToScratch(offsets[2],sizes[2],fillInstancePointers); } - // current recording buffer may have changed - xferCmdBuf = params.transfer->getCommandBufferForRecording(); - if (!success) + // + buildInfo.instanceDataTypeEncodedInPointersLSB = as->usesMotion(); + // note we don't build directly from staging, because only very small inputs could come from there and they'd impede the transfer efficiency of the larger ones + buildInfo.instanceData = {.offset=offsets[as->usesMotion() ? 
2:1],.buffer=smart_refctd_ptr(scratchBuffer)}; + // be based cause vectors can grow + using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**; + buildInfo.trackedBLASes = {reinterpret_cast(trackedBLASesOffset),trackedBLASes.size()-trackedBLASesOffset}; + // no special extra byte offset into the instance buffer + rangeInfos.emplace_back(instanceCount,0u); + } + else + { + buildInfo.geometryCount = canonical->getGeometryCount(); + const auto* offsetIt = offsets+1; + const auto primitiveCounts = canonical->getGeometryPrimitiveCounts(); + for (const auto count : primitiveCounts) + geometryRangeInfo.push_back({ + .primitiveCount = count, + .primitiveByteOffset = 0, + .firstVertex = 0, + .transformByteOffset = 0 + }); + const uint32_t* pPrimitiveCounts = canonical->getGeometryPrimitiveCounts().data(); + if (buildFlags.hasFlags(GeometryIsAABBFlag)) { - trackedBLASes.resize(trackedBLASesOffset); - markFailure("Uploading Instance Data for TLAS build failed",&canonical,pFound); - continue; + for (const auto& geom : canonical->getAABBGeometries()) + if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount) + { + auto offset = *(offsetIt++); +// TODO: stream in the data + aabbs.push_back({ + .data = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}, + .stride = geom.stride, + .geometryFlags = geom.geometryFlags + }); + } + buildInfo.aabbs = reinterpret_cast* const&>(aabbsOffset); } - // let go of canonical asset (may free RAM) - canonical = nullptr; + else + { + for (const auto& geom : canonical->getTriangleGeometries()) + if (const auto triCount=*(pPrimitiveCounts++); triCount) + { + auto& outGeom = triangles.emplace_back(); + auto offset = *(offsetIt++); +// TODO: stream in the data + outGeom.vertexData[0] = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; + offset += geom.vertexStride*geom.maxVertex; + if (geom.vertexData[1]) + { + outGeom.vertexData[1] = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; + offset += 
geom.vertexStride*geom.maxVertex; + } + if (geom.hasTransform()) + { + offset = core::alignUp(offset,alignof(float)); + outGeom.transform = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; + offset += sizeof(hlsl::float32_t3x4); + } + switch (geom.indexType) + { + case E_INDEX_TYPE::EIT_16BIT: [[fallthrough]]; + case E_INDEX_TYPE::EIT_32BIT: + { + const auto alignment = geom.indexType==E_INDEX_TYPE::EIT_16BIT ? alignof(uint16_t):alignof(uint32_t); + offset = core::alignUp(offset,alignment); + outGeom.indexData = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; + break; + } + default: + break; + } + outGeom.maxVertex = geom.maxVertex; + outGeom.vertexStride = geom.vertexStride; + outGeom.vertexFormat = geom.vertexFormat; + outGeom.indexType = geom.indexType; + outGeom.geometryFlags = geom.geometryFlags; + } + buildInfo.triangles = reinterpret_cast* const&>(trianglesOffset); + } + rangeInfos.push_back(reinterpret_cast(geometryRangeInfoOffset)); +success = false; } - // prepare build infos - auto& buildInfo = buildInfos.emplace_back(); - buildInfo.scratch = {.offset=offsets[0],.buffer=smart_refctd_ptr(scratchBuffer)}; - buildInfo.buildFlags = tlasToBuild.second.getBuildFlags(); - buildInfo.instanceDataTypeEncodedInPointersLSB = as->usesMotion(); - buildInfo.dstAS = as; - // note we don't build directly from staging, because only very small inputs could come from there and they'd impede the transfer efficiency of the larger ones - buildInfo.instanceData = {.offset=offsets[as->usesMotion() ? 
2:1],.buffer=smart_refctd_ptr(scratchBuffer)}; - // be based cause vectors can grow - using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**; - buildInfo.trackedBLASes = {reinterpret_cast(trackedBLASesOffset),trackedBLASes.size()-trackedBLASesOffset}; - // no special extra byte offset into the instance buffer - rangeInfos.emplace_back(instanceCount,0u); - abortAllocation = false; + // current recording buffer may have changed + xferCmdBuf = params.transfer->getCommandBufferForRecording(); + if (!success) + { + rangeInfos.resize(buildInfos.size()); + geometryRangeInfo.resize(geometryRangeInfoOffset); + triangles.resize(trianglesOffset); + aabbs.resize(aabbsOffset); + trackedBLASes.resize(trackedBLASesOffset); + markFailure("Uploading Input Data for Accleration Structure build failed",&canonical,pFound); + continue; + } + buildInfos.emplace_back(std::move(buildInfo)); + allocCount = 0; + // let go of canonical asset (may free RAM) + canonical = nullptr; // - const bool willCompact = tlasToBuild.second.compact; + const bool willCompact = asToBuild.second.compact; if (willCompact) compactions.push_back(as); // enqueue ownership release if necessary @@ -5041,130 +5119,178 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } // finish the last batch recordBuildCommands(); + computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build Acceleration Structures END"); + computeCmdBuf->cmdbuf->endDebugMarker(); + // provoke refcounting bugs + asesToBuild.clear(); + // flush all ranged before potential submit if (!flushRanges.empty()) { device->flushMappedMemoryRanges(flushRanges); flushRanges.clear(); } - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact TLASes END"); - computeCmdBuf->cmdbuf->endDebugMarker(); } - tlasesToBuild.clear(); - // compact - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact TLASes START"); - computeCmdBuf->cmdbuf->endDebugMarker(); - // compact needs to wait for Build then record queries + + 
// Not messing around with listing AS backing buffers individually, ergonomics of that are null + const asset::SMemoryBarrier readASInASCompactBarrier = { + .srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, + .srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT, + // TODO: do queries or query retrieval have a stage? + .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT, + .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT + }; if (!compactions.empty() && pipelineBarrier(computeCmdBuf,{.memBarriers={&readASInASCompactBarrier,1}},"Failed to sync Acceleration Structure builds with compactions!") && computeCmdBuf->cmdbuf->resetQueryPool(queryPool.get(),0,compactions.size()) && computeCmdBuf->cmdbuf->writeAccelerationStructureProperties(compactions,IQueryPool::TYPE::ACCELERATION_STRUCTURE_COMPACTED_SIZE,queryPool.get(),0) ) { - // submit cause host needs to read the queries + // clean AS builds, pipeline barrier, query reset and writes need to get executed before we start waiting on the results drainCompute(); // get queries core::vector sizes(compactions.size()); - if (device->getQueryPoolResults( - queryPool.get(),0,compactions.size(),sizes.data(),sizeof(size_t), - bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT)|IQueryPool::RESULTS_FLAGS::_64_BIT - )) + if (!device->getQueryPoolResults(queryPool.get(),0,compactions.size(),sizes.data(),sizeof(size_t),bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT)|IQueryPool::RESULTS_FLAGS::_64_BIT)) { - auto logFail = [logger](const char* msg, const IGPUAccelerationStructure* as)->void - { - logger.log("Failed to %s for \"%s\"", system::ILogger::ELL_ERROR,as->getObjectDebugName()); - }; - // TODO: normally we'd iteratively record as many compactions as we can, but we don't have a mechanism to release already compacted TLASes, we'd need to defer the writing of the TLAS to the Descriptor Set till later - // create and allocate backing buffers for compacted TLASes - core::vector> 
backingBuffers(compactions.size()); + logger.log("Failed to Query %sLevelAccelerationStructure compacted sizes, skipping compaction!",system::ILogger::ELL_ERROR,IsTLAS ? "Top":"Bottom"); + return {}; + } + // + auto logFail = [logger](const char* msg, const IGPUAccelerationStructure* as)->void + { + logger.log("Failed to %s for \"%s\"",system::ILogger::ELL_ERROR,msg,as->getObjectDebugName()); + }; + // try to allocate memory for + core::vector> backingBuffers(compactions.size()); + { + MetaDeviceMemoryAllocator deferredAllocator(params.compactedASAllocator ? params.compactedASAllocator:device,logger); + // create + for (size_t i=0; i(compactions[i]); + assert(as); + // silently skip if not worth it + if (!params.confirmCompact(sizes[i],as)) { - const auto* as = static_cast(compactions[i]); - assert(as); - // silently skip if not worth it - if (!params.confirmCompact(sizes[i],as)) + logger.log("Compaction not confirmed for \"%s\" would be compacted size is %d, original %d.",system::ILogger::ELL_DEBUG,as->getObjectDebugName(),sizes[i],as->getCreationParams().bufferRange.size); + continue; + } + // create backing buffer and request an allocation for it + { + const auto* oldBuffer = as->getCreationParams().bufferRange.buffer.get(); + assert(oldBuffer); + // This is a Spec limit/rpomise we don't even expose it + constexpr size_t MinASBufferAlignment = 256u; + using usage_f = IGPUBuffer::E_USAGE_FLAGS; + IGPUBuffer::SCreationParams creationParams = { {.size=core::roundUp(sizes[i],MinASBufferAlignment),.usage=usage_f::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT|usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT},{}}; + // same sharing setup as the previous AS buffer + creationParams.queueFamilyIndexCount = oldBuffer->getCachedCreationParams().queueFamilyIndexCount; + creationParams.queueFamilyIndices = oldBuffer->getCachedCreationParams().queueFamilyIndices; + auto buf = device->createBuffer(std::move(creationParams)); + if (!buf) + { + logFail("create Buffer backing the Compacted 
Acceleration Structure",as); continue; - smart_refctd_ptr buff; + } + auto bufReqs = buf->getMemoryReqs(); + backingBuffers[i].value = std::move(buf); + // allocate new memory - definitely don't want to be raytracing from across the PCIE slot + if (!deferredAllocator.request(backingBuffers.data()+i,physDev->getDeviceLocalMemoryTypeBits())) { - const auto* oldBuffer = as->getCreationParams().bufferRange.buffer.get(); - assert(oldBuffer); - // - constexpr size_t MinASBufferAlignment = 256u; - using usage_f = IGPUBuffer::E_USAGE_FLAGS; - IGPUBuffer::SCreationParams creationParams = { {.size=core::roundUp(sizes[i],MinASBufferAlignment),.usage = usage_f::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT|usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT},{}}; - creationParams.queueFamilyIndexCount = oldBuffer->getCachedCreationParams().queueFamilyIndexCount; - creationParams.queueFamilyIndices = oldBuffer->getCachedCreationParams().queueFamilyIndices; - auto buf = device->createBuffer(std::move(creationParams)); - if (!buf) - { - logFail("create Buffer backing the Compacted Acceleration Structure",as); - continue; - } - // allocate new memory - auto bufReqs = buff->getMemoryReqs(); - // definitely don't want to be raytracing from across the PCIE slot - if (!deferredAllocator.request(backingBuffers.data()+i,physDev->getDeviceLocalMemoryTypeBits())) - { - logFail("request of a Memory Allocation for the Buffer backing the Compacted Acceleration Structure",as); - continue; - } - backingBuffers[i].value = std::move(buf); + logFail("request of a Memory Allocation for the Buffer backing the Compacted Acceleration Structure",as); + continue; } } - // allocate memory for the buffers - deferredAllocator.finalize(); } + // allocate memory for the buffers + deferredAllocator.finalize(); + unordered_map> retval; + retval.reserve(compactions.size()); // recreate Acceleration Structures for (size_t i=0; i(compactions[i]); + const auto* srcAS = static_cast(compactions[i]); auto& backingBuffer = 
backingBuffers[i].value; if (!backingBuffer->getBoundMemory().isValid()) { - logFail("allocate Memory for the Buffer backing the Compacted Acceleration Structure",as); - continue; // reason to end a batch, see the TODO above + logFail("allocate Memory for the Buffer backing the Compacted Acceleration Structure",srcAS); + continue; + } + smart_refctd_ptr compactedAS; + { + typename AccelerationStructure::SCreationParams creationParams = {srcAS->getCreationParams()}; + creationParams.bufferRange = {.offset=0,.size=sizes[i],.buffer=std::move(backingBuffer)}; + if constexpr (IsTLAS) + { + creationParams.maxInstanceCount = srcAS->getMaxInstanceCount(); + compactedAS = device->createTopLevelAccelerationStructure(std::move(creationParams)); + } + else + compactedAS = device->createBottomLevelAccelerationStructure(std::move(creationParams)); } - IGPUTopLevelAccelerationStructure::SCreationParams creationParams = {as->getCreationParams()}; - creationParams.bufferRange = {.offset=0,.size=sizes[i],.buffer=std::move(backingBuffer)}; - creationParams.maxInstanceCount = as->getMaxInstanceCount(); - auto compactedAS = device->createTopLevelAccelerationStructure(std::move(creationParams)); if (!compactedAS) { - logFail("create the Compacted Acceleration Structure",as); + logFail("create the Compacted Acceleration Structure",srcAS); continue; } // set the debug name { - std::string debugName = as->getObjectDebugName(); + std::string debugName = srcAS->getObjectDebugName(); debugName += " compacted"; compactedAS->setObjectDebugName(debugName.c_str()); } // record compaction - if (!computeCmdBuf->cmdbuf->copyAccelerationStructure({.src=as,.dst=compactedAS.get(),.mode=IGPUAccelerationStructure::COPY_MODE::COMPACT})) + if (!computeCmdBuf->cmdbuf->copyAccelerationStructure({.src=srcAS,.dst=compactedAS.get(),.mode=IGPUAccelerationStructure::COPY_MODE::COMPACT})) { logFail("record Acceleration Structure compaction",compactedAS.get()); continue; } - // modify the ownership release + // 
modify the ownership release to be for the final compacted AS if (const auto ix=compactedOwnershipReleaseIndices[i]; ixgetCreationParams().bufferRange; // swap out the conversion result - const auto foundIx = outputReverseMap.find(as); + const auto foundIx = outputReverseMap.find(srcAS); if (foundIx!=outputReverseMap.end()) { - auto& resultOutput = std::get>(reservations.m_gpuObjects); + auto& resultOutput = std::get>(reservations.m_gpuObjects); resultOutput[foundIx->second].value = compactedAS; } // insert into compaction map - compactedTLASMap[as] = std::move(compactedAS); + retval[srcAS] = std::move(compactedAS); } + return retval; } + computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact Acceleration Structures START"); + computeCmdBuf->cmdbuf->endDebugMarker(); + computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact Acceleration Structures END"); + computeCmdBuf->cmdbuf->endDebugMarker(); + } + return {}; + }; + + // compacted BLASes need to be substituted in cache and TLAS Build Inputs + compactedBLASMap = buildAndCompactASes.template operator()(blasesToBuild); + // Device TLAS builds + if (tlasCount) + { + // either we built no BLASes (remember we could retrieve already built ones from cache) + if (blasCount) + { + // Or we barrier for the previous compactions or builds (a single pipeline barrier to ensure BLASes build before TLASes is needed) + const asset::SMemoryBarrier readBLASInTLASBuildBarrier = { + // the last use of the source BLAS could have been a build or a compaction + .srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT|PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT, + .srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT, + .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, + .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT + }; + // submit because we want to launch BLAS builds in a separate submit, so the scratch semaphore can signal and free the 
scratch and more is available for TLAS builds + if (pipelineBarrier(computeCmdBuf,{.memBarriers={&readBLASInTLASBuildBarrier,1}},"Failed to sync BLAS with TLAS build!")) + drainCompute(); + else + failedBLASBarrier = true; } - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact TLASes END"); - computeCmdBuf->cmdbuf->endDebugMarker(); + compactedTLASMap = buildAndCompactASes.template operator()(tlasesToBuild); } // release ownership From 4b03383578805920cf98f14c4c9bf168f9c99b08 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 15 May 2025 14:39:35 +0200 Subject: [PATCH 123/346] nuke the old asset converter, nothing useful or not reimplemented there anymore --- include/nbl/video/declarations.h | 1 - .../utilities/IGPUObjectFromAssetConverter.h | 168 ------------------ 2 files changed, 169 deletions(-) delete mode 100644 include/nbl/video/utilities/IGPUObjectFromAssetConverter.h diff --git a/include/nbl/video/declarations.h b/include/nbl/video/declarations.h index ecec442366..2fdfe28e3c 100644 --- a/include/nbl/video/declarations.h +++ b/include/nbl/video/declarations.h @@ -34,7 +34,6 @@ #include "nbl/video/utilities/CDrawIndirectAllocator.h" #include "nbl/video/utilities/CSubpassKiln.h" #include "nbl/video/utilities/IUtilities.h" -#include "nbl/video/utilities/IGPUObjectFromAssetConverter.h" #include "nbl/video/utilities/SPhysicalDeviceFilter.h" #include "nbl/video/utilities/CSimpleResizeSurface.h" #include "nbl/video/utilities/CSmoothResizeSurface.h" diff --git a/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h b/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h deleted file mode 100644 index b7ffc5d0c1..0000000000 --- a/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_VIDEO_I_GPU_OBJECT_FROM_ASSET_CONVERTER_H_INCLUDED_ -#define _NBL_VIDEO_I_GPU_OBJECT_FROM_ASSET_CONVERTER_H_INCLUDED_ - -#include "nbl/core/declarations.h" -#include "nbl/core/alloc/LinearAddressAllocator.h" - -#include "nbl/video/ISemaphore.h" -#include "nbl/video/ILogicalDevice.h" - -#if 0 - // Convert CPUBuffer Deps to GPUBuffers - core::vector redirs = eliminateDuplicatesAndGenRedirs(cpuBufferDeps); - auto gpuBufs = getGPUObjectsFromAssets(cpuBufferDeps.data(), cpuBufferDeps.data()+cpuBufferDeps.size(), _params); - _params.waitForCreationToComplete(); - _params.beginCommandBuffers(); - size_t bufIter = 0ull; - - // Fill buildGeomInfos partially (to later ge Get AS Size before build command) - std::vector buildGeomInfos(toCreateAndBuild.size()); - - using GPUGeometry = IGPUAccelerationStructure::Geometry; - std::vector gpuGeoms; - gpuGeoms.reserve(assetCount * MaxGeometryPerBuildInfo); - - for (ptrdiff_t i = 0u; i < toCreateAndBuild.size(); ++i) - { - const asset::ICPUAccelerationStructure* cpuas = toCreateAndBuild[i]; - - auto cpuBuildInfo = cpuas->getBuildInfo(); - auto & gpuBuildInfo = buildGeomInfos[i]; - - gpuBuildInfo.type = cpuBuildInfo->type; - gpuBuildInfo.buildFlags = cpuBuildInfo->buildFlags; - gpuBuildInfo.buildMode = cpuBuildInfo->buildMode; - assert(cpuBuildInfo->buildMode == asset::IAccelerationStructure::EBM_BUILD); - - // Fill Later: - gpuBuildInfo.srcAS = nullptr; - gpuBuildInfo.dstAS = nullptr; - gpuBuildInfo.scratchAddr = {}; - - auto cpu_geoms = cpuBuildInfo->getGeometries().begin(); - auto geomsCount = cpuBuildInfo->getGeometries().size(); - if(geomsCount == 0) - { - assert(false); - continue; - } - - size_t startGeom = gpuGeoms.size(); - size_t endGeom = gpuGeoms.size() + geomsCount; - - for(uint32_t g = 0; g < geomsCount; ++g) - { - const auto& cpu_geom = cpu_geoms[g]; - - GPUGeometry gpu_geom = {}; - gpu_geom.type = cpu_geom.type; - gpu_geom.flags = 
cpu_geom.flags; - - if(cpu_geom.type == asset::IAccelerationStructure::EGT_TRIANGLES) - { - gpu_geom.data.triangles.vertexFormat = cpu_geom.data.triangles.vertexFormat; - gpu_geom.data.triangles.vertexStride = cpu_geom.data.triangles.vertexStride; - gpu_geom.data.triangles.maxVertex = cpu_geom.data.triangles.maxVertex; - gpu_geom.data.triangles.indexType = cpu_geom.data.triangles.indexType; - - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.triangles.indexData.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.triangles.indexData.offset = gpubuf->getOffset() + cpu_geom.data.triangles.indexData.offset; - } - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.triangles.vertexData.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.triangles.vertexData.offset = gpubuf->getOffset() + cpu_geom.data.triangles.vertexData.offset; - } - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.triangles.transformData.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.triangles.transformData.offset = gpubuf->getOffset() + cpu_geom.data.triangles.transformData.offset; - } - } - else if(cpu_geom.type == asset::IAccelerationStructure::EGT_AABBS) - { - gpu_geom.data.aabbs.stride = cpu_geom.data.aabbs.stride; - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.aabbs.data.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.aabbs.data.offset = gpubuf->getOffset() + cpu_geom.data.aabbs.data.offset; - } - } - else if(cpu_geom.type == asset::IAccelerationStructure::EGT_INSTANCES) - { - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.instances.data.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.instances.data.offset = gpubuf->getOffset() + cpu_geom.data.instances.data.offset; - } - } - - 
gpuGeoms.push_back(gpu_geom); - } - - gpuBuildInfo.geometries = core::SRange(gpuGeoms.data() + startGeom, gpuGeoms.data() + endGeom); - } - - // Get SizeInfo for each CPUAS -> Create the AS -> Get Total Scratch Buffer Size - std::vector buildSizes(toCreateAndBuild.size()); - uint64_t totalScratchBufferSize = 0ull; - uint64_t maxScratchBufferSize = 0ull; - for (ptrdiff_t i = 0u, toBuildIndex = 0u; i < assetCount; ++i) - { - const asset::ICPUAccelerationStructure* cpuas = _begin[i]; - if(cpuas->hasBuildInfo() == false) - { - // Only those with buildInfo (index in toCreateAndBuild vector) will get passed - continue; - } - - assert(cpuas == toCreateAndBuild[toBuildIndex]); - assert(toBuildIndex < toCreateAndBuild.size()); - - auto buildRanges = cpuas->getBuildRanges().begin(); - auto buildRangesCount = cpuas->getBuildRanges().size(); - - auto & gpuBuildInfo = buildGeomInfos[toBuildIndex]; - - std::vector maxPrimCount(buildRangesCount); - for(auto b = 0; b < buildRangesCount; b++) - maxPrimCount[b] = buildRanges[b].primitiveCount; - - auto buildSize = _params.device->getAccelerationStructureBuildSizes(gpuBuildInfo, maxPrimCount.data()); - buildSizes[i] = buildSize; - - auto gpuAS = allocateBufferAndCreateAccelerationStructure(buildSize.accelerationStructureSize, cpuas); - res->operator[](i) = gpuAS; - - // complete the buildGeomInfos (now only thing left is to allocate and set scratchAddr.buffer) - buildGeomInfos[toBuildIndex].dstAS = gpuAS.get(); - buildGeomInfos[toBuildIndex].scratchAddr.offset = totalScratchBufferSize; - - totalScratchBufferSize += buildSize.buildScratchSize; - core::max(maxScratchBufferSize, buildSize.buildScratchSize); // maxScratchBufferSize has no use now (unless we changed this function to build 1 by 1 instead of batch builds or have some kind of memory limit?) 
- ++toBuildIndex; - } - - // Allocate Scratch Buffer - IGPUBuffer::SCreationParams gpuScratchBufParams = {}; - gpuScratchBufParams.size = totalScratchBufferSize; - gpuScratchBufParams.usage = core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_STORAGE_BUFFER_BIT; - auto gpuScratchBuf = _params.device->createBuffer(std::move(gpuScratchBufParams)); - auto mreqs = gpuScratchBuf->getMemoryReqs(); - mreqs.memoryTypeBits &= _params.device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto gpuScratchBufMem = _params.device->allocate(mreqs, gpuScratchBuf.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - - - for (ptrdiff_t i = 0u; i < toCreateAndBuild.size(); ++i) - { - auto & gpuBuildInfo = buildGeomInfos[i]; - gpuBuildInfo.scratchAddr.buffer = gpuScratchBuf; - } -#endif - -#endif From 0ebdda6eafc89525716f2959c8a5a72cf21f8cd6 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 15 May 2025 14:59:46 +0200 Subject: [PATCH 124/346] proper default initializer for triangle BLAS geometry transforms --- include/nbl/asset/IAccelerationStructure.h | 59 +++++++++++++--------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/include/nbl/asset/IAccelerationStructure.h b/include/nbl/asset/IAccelerationStructure.h index d251dd3077..0efe6781ae 100644 --- a/include/nbl/asset/IAccelerationStructure.h +++ b/include/nbl/asset/IAccelerationStructure.h @@ -92,31 +92,40 @@ class IBottomLevelAccelerationStructure : public IAccelerationStructure template requires std::is_base_of_v struct Triangles { - using buffer_t = std::remove_const_t; - constexpr static inline bool Host = std::is_same_v; - // we make our life easier by not taking pointers to single matrix values - using transform_t = std::conditional_t>; - - inline bool hasTransform() const - { - if constexpr (Host) - return !core::isnan(transform[0][0]); - else - return bool(transform.buffer); - } - - // optional, only useful for baking model transforms of multiple meshes into one 
BLAS - transform_t transform = {}; - // vertexData[1] are the vertex positions at time 1.0, and only used for AccelerationStructures created with `MOTION_BIT` - asset::SBufferBinding vertexData[2] = {{},{}}; - asset::SBufferBinding indexData = {}; - uint32_t maxVertex = 0u; - // type implicitly satisfies: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureGeometryTrianglesDataKHR-vertexStride-03819 - uint32_t vertexStride = sizeof(float); - E_FORMAT vertexFormat = EF_R32G32B32_SFLOAT; - E_INDEX_TYPE indexType = EIT_UNKNOWN; - core::bitflag geometryFlags = GEOMETRY_FLAGS::NONE; - // TODO: opacity and displacement micromap buffers and shizz + public: + using buffer_t = std::remove_const_t; + constexpr static inline bool Host = std::is_same_v; + // we make our life easier by not taking pointers to single matrix values + using transform_t = std::conditional_t>; + + inline bool hasTransform() const + { + if constexpr (Host) + return !core::isnan(transform[0][0]); + else + return bool(transform.buffer); + } + + // optional, only useful for baking model transforms of multiple meshes into one BLAS + transform_t transform = __transform_initializer(); + // vertexData[1] are the vertex positions at time 1.0, and only used for AccelerationStructures created with `MOTION_BIT` + asset::SBufferBinding vertexData[2] = {{},{}}; + asset::SBufferBinding indexData = {}; + uint32_t maxVertex = 0u; + // type implicitly satisfies: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureGeometryTrianglesDataKHR-vertexStride-03819 + uint32_t vertexStride = sizeof(float); + E_FORMAT vertexFormat = EF_R32G32B32_SFLOAT; + E_INDEX_TYPE indexType = EIT_UNKNOWN; + core::bitflag geometryFlags = GEOMETRY_FLAGS::NONE; + // TODO: opacity and displacement micromap buffers and shizz + + private: + constexpr static transform_t __transform_initializer() + { + if constexpr (Host) + return 
hlsl::float32_t3x4(std::numeric_limits::quiet_NaN()); + return {}; + } }; // From c32846fbf8377d221a74c7b010c81090f7f34f65 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 15 May 2025 15:58:35 +0200 Subject: [PATCH 125/346] Stream the BLAS build inputs, fix a bug and note another one that has to get fixed --- src/nbl/video/utilities/CAssetConverter.cpp | 41 ++++++++++++++------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 0167a96a43..285a1dce1d 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2854,6 +2854,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { const uint32_t* pPrimitiveCounts = as->getGeometryPrimitiveCounts().data(); // the code here is not pretty, but DRY-ing is of this is for later +// TODO: ILogicalDevice needs code to query build sizes of ICPUBottomLevelAccelerationStructure geometries! 
if (buildFlags.hasFlags(ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) { const auto geoms = as->getAABBGeometries(); @@ -4890,9 +4891,9 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } allocOffsets.resize(allocSizes.size(),scratch_allocator_t::invalid_value); // allocate out scratch or submit overflow, if fail then flush and keep trying till space is made - auto* const offsets = allocOffsets.data()+allocOffsets.size()-allocCount; - const auto* const sizes = allocSizes.data()+allocSizes.size()-allocCount; - for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(allocCount,offsets,sizes,alignments.data())!=0; t++) + auto* const offsets = allocOffsets.data()+allocOffsets.size()-alignments.size(); + const auto* const sizes = allocSizes.data()+allocSizes.size()-alignments.size(); + for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(alignments.size(),offsets,sizes,alignments.data())!=0; t++) if (t==1) // don't flush right away cause allocator not defragmented yet { recordBuildCommands(); @@ -5007,6 +5008,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul { buildInfo.geometryCount = canonical->getGeometryCount(); const auto* offsetIt = offsets+1; + const auto* sizeIt = sizes+1; const auto primitiveCounts = canonical->getGeometryPrimitiveCounts(); for (const auto count : primitiveCounts) geometryRangeInfo.push_back({ @@ -5015,14 +5017,17 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul .firstVertex = 0, .transformByteOffset = 0 }); - const uint32_t* pPrimitiveCounts = canonical->getGeometryPrimitiveCounts().data(); + const uint32_t* pPrimitiveCounts = primitiveCounts.data(); + IUtilities::CMemcpyUpstreamingDataProducer memcpyCallback; if (buildFlags.hasFlags(GeometryIsAABBFlag)) { for (const auto& geom : canonical->getAABBGeometries()) if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount) { auto offset = *(offsetIt++); -// TODO: stream in the data + 
memcpyCallback.data = reinterpret_cast(geom.data.buffer->getPointer())+geom.data.offset; + if (!streamDataToScratch(offset,*(sizeIt++),memcpyCallback)) + break; aabbs.push_back({ .data = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}, .stride = geom.stride, @@ -5038,19 +5043,24 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul { auto& outGeom = triangles.emplace_back(); auto offset = *(offsetIt++); -// TODO: stream in the data - outGeom.vertexData[0] = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; - offset += geom.vertexStride*geom.maxVertex; - if (geom.vertexData[1]) + auto size = geom.vertexStride*geom.maxVertex; + for (auto i=0; i<2; i++) + if (geom.vertexData[i]) // could assert that it must be true for i==0 { - outGeom.vertexData[1] = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; - offset += geom.vertexStride*geom.maxVertex; + outGeom.vertexData[i] = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; + memcpyCallback.data = reinterpret_cast(geom.vertexData[i].buffer->getPointer())+geom.vertexData[i].offset; + if (!streamDataToScratch(offset,size,memcpyCallback)) + break; + offset += size; } if (geom.hasTransform()) { offset = core::alignUp(offset,alignof(float)); outGeom.transform = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; - offset += sizeof(hlsl::float32_t3x4); + memcpyCallback.data = &geom.transform; + if (!streamDataToScratch(offset,sizeof(geom.transform),memcpyCallback)) + break; + offset += sizeof(geom.transform); } switch (geom.indexType) { @@ -5060,11 +5070,16 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul const auto alignment = geom.indexType==E_INDEX_TYPE::EIT_16BIT ? 
alignof(uint16_t):alignof(uint32_t); offset = core::alignUp(offset,alignment); outGeom.indexData = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; + size = triCount*3*alignment; + memcpyCallback.data = reinterpret_cast(geom.indexData.buffer->getPointer())+geom.indexData.offset; + success = streamDataToScratch(offset,size,memcpyCallback); break; } default: break; } + if (!success) + break; outGeom.maxVertex = geom.maxVertex; outGeom.vertexStride = geom.vertexStride; outGeom.vertexFormat = geom.vertexFormat; @@ -5073,8 +5088,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } buildInfo.triangles = reinterpret_cast* const&>(trianglesOffset); } + success = pPrimitiveCounts==primitiveCounts.data()+primitiveCounts.size(); rangeInfos.push_back(reinterpret_cast(geometryRangeInfoOffset)); -success = false; } // current recording buffer may have changed xferCmdBuf = params.transfer->getCommandBufferForRecording(); From bc9b5f154a30ed081c6f1e43b2f1c0ac6874d380 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 15 May 2025 15:59:03 +0200 Subject: [PATCH 126/346] make the default memcpy IUTilities buffer streaming callback public (its useful externally too) --- include/nbl/video/utilities/IUtilities.h | 34 +++++++++++------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 09877b0d8f..00776ba01d 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -436,6 +436,18 @@ class NBL_API2 IUtilities : public core::IReferenceCounted return updateBufferRangeViaStagingBuffer(nextSubmit,bufferRange,callback); } + // + class CMemcpyUpstreamingDataProducer final : public IUpstreamingDataProducer + { + public: + inline uint32_t operator()(void* dst, const size_t offsetInRange, const uint32_t blockSize) override + { + memcpy(dst,reinterpret_cast(data)+offsetInRange,blockSize); + return blockSize; + } + + const 
void* data; + }; //! Copies `data` to stagingBuffer and Records the commands needed to copy the data from stagingBuffer to `bufferRange.buffer`. //! Returns same as `updateBufferRangeViaStagingBuffer` with a callback instead of a pointer, make sure to submit with `nextSubmit.popSubmit()` after this function returns. //! Parameters: @@ -448,25 +460,9 @@ class NBL_API2 IUtilities : public core::IReferenceCounted template requires std::is_same_v,SIntendedSubmitInfo> inline bool updateBufferRangeViaStagingBuffer(IntendedSubmitInfo&& nextSubmit, const asset::SBufferRange& bufferRange, const void* data) { - // We check the guarantees of our documentation with the asserts while we're at it -#ifdef _NBL_DEBUG - size_t prevRangeEnd = 0; -#endif - - auto retval = updateBufferRangeViaStagingBuffer(nextSubmit,bufferRange,wrapUpstreamingDataProducerLambda( - [&](void* dst, const size_t offsetInRange, const uint32_t blockSize) -> uint32_t - { -#ifdef _NBL_DEBUG - assert(offsetInRange==prevRangeEnd); - prevRangeEnd = offsetInRange+blockSize; -#endif - memcpy(dst,reinterpret_cast(data)+offsetInRange,blockSize); - return blockSize; - } - )); -#ifdef _NBL_DEBUG - assert(prevRangeEnd==bufferRange.size); -#endif + CMemcpyUpstreamingDataProducer memcpyCb; + memcpyCb.data = data; + bool retval = updateBufferRangeViaStagingBuffer(nextSubmit,bufferRange,memcpyCb); return retval; } From ce884ca3b6c0818490670c3d3c5df5124a8217f2 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 15 May 2025 16:13:32 +0200 Subject: [PATCH 127/346] update boost submodule --- 3rdparty/boost/superproject | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/boost/superproject b/3rdparty/boost/superproject index dcc3e1ade0..3b9e116eee 160000 --- a/3rdparty/boost/superproject +++ b/3rdparty/boost/superproject @@ -1 +1 @@ -Subproject commit dcc3e1ade0ae8e7ea0eadc2d951efb1e53450bff +Subproject commit 3b9e116eeee85ab8fd0d8e5a97364fff5f02eb86 From 
e5f610acb6a9c857c79215f8ee0a22420cde147e Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 15 May 2025 18:24:26 +0200 Subject: [PATCH 128/346] Resolve issues with private submodule updates, update 3rdparty/boost/CMakeLists.txt and refactor cmake/submodules/update.cmake, never touch private key during CMake configuration if updating public repositories --- 3rdparty/boost/CMakeLists.txt | 140 +++++++++-------- cmake/submodules/update.cmake | 272 +++++++++------------------------- 2 files changed, 149 insertions(+), 263 deletions(-) diff --git a/3rdparty/boost/CMakeLists.txt b/3rdparty/boost/CMakeLists.txt index 194ad3c35c..3c95234b8e 100644 --- a/3rdparty/boost/CMakeLists.txt +++ b/3rdparty/boost/CMakeLists.txt @@ -1,65 +1,38 @@ -set(BOOST_PREPROCESSOR_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/superproject/libs/preprocessor/include" CACHE PATH "" FORCE) - -get_filename_component(_BOOST_PREPROCESSOR_BR_BUNDLE_SEARCH_DIRECTORY_ "${BOOST_PREPROCESSOR_INCLUDE}" ABSOLUTE) -get_filename_component(_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) -get_filename_component(_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) -set(BOOST_BUILTIN_RESOURCES_DIRECTORY_PATH "${_BOOST_PREPROCESSOR_BR_BUNDLE_SEARCH_DIRECTORY_}/boost" CACHE INTERNAL "" FORCE) - -if(NBL_EMBED_BUILTIN_RESOURCES) - include("${NBL_ROOT_PATH}/src/nbl/builtin/utils.cmake") - - file(GLOB_RECURSE BOOST_HEADERS_REC_REL RELATIVE "${BOOST_BUILTIN_RESOURCES_DIRECTORY_PATH}" "${BOOST_PREPROCESSOR_INCLUDE}/*") - - foreach(BOOST_HEADER_REL IN LISTS BOOST_HEADERS_REC_REL) - LIST_BUILTIN_RESOURCE(BOOST_RESOURCES_TO_EMBED "${BOOST_HEADER_REL}") - endforeach() - - ADD_CUSTOM_BUILTIN_RESOURCES(boostBuiltinResourceData BOOST_RESOURCES_TO_EMBED "${_BOOST_PREPROCESSOR_BR_BUNDLE_SEARCH_DIRECTORY_}" "boost" "boost::builtin" "${_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_HEADER_}" "${_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_HEADER_}" "STATIC" 
"INTERNAL") -endif() - get_filename_component(NBL_BOOST_WAVE_DEP_FILE "${CMAKE_CURRENT_SOURCE_DIR}/dep/wave.cmake" ABSOLUTE) -if(NOT EXISTS "${NBL_BOOST_WAVE_DEP_FILE}") - message(FATAL_ERROR "Internal error, generate NBL_BOOST_WAVE_DEP_FILE by enabling NBL_BOOST_GENERATE_DEP_LIST!") -endif() - -set(BOOST_STAGEDIR "${CMAKE_CURRENT_BINARY_DIR}/boost/superproject/stage") -include("${NBL_BOOST_WAVE_DEP_FILE}") - -foreach(BOOST_LIB IN LISTS NBL_BOOST_LIBS) - add_subdirectory(superproject/libs/${BOOST_LIB} EXCLUDE_FROM_ALL) -endforeach() - -add_subdirectory(superproject/libs/wave EXCLUDE_FROM_ALL) - -list(APPEND NBL_BOOST_TARGETS boost_wave) # wave -foreach(BOOST_LIB IN LISTS NBL_BOOST_LIBS) - if(TARGET boost_${BOOST_LIB}) # wave's deps - list(APPEND NBL_BOOST_TARGETS boost_${BOOST_LIB}) - endif() -endforeach() - -set(NBL_BOOST_TARGETS - ${NBL_BOOST_TARGETS} -PARENT_SCOPE) - # Boost uses it's own tool for generating dependency list for targets, therefore we # can make sure manually added dependency subdirectories for a library are valid # https://www.boost.org/doc/libs/1_83_0/tools/boostdep/doc/html/index.html#boostdep.introduction.building_boostdep if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs - if(WIN32) - set(NBL_BOOSTDEP_EXE "boostdep.exe") - else() - set(NBL_BOOSTDEP_EXE "boostdep") + if(NOT WIN32) + message(FATAL_ERROR "NBL_BOOST_GENERATE_DEP_LIST only for Windows host!") endif() - - set(NBL_BOOSTDEP_EXE_FILEPATH "${CMAKE_CURRENT_BINARY_DIR}/superproject/tools/boostdep/bin/${NBL_BOOSTDEP_EXE}") - + macro(NBL_BOOST_EXECUTE) execute_process(COMMAND ${ARGV} WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/superproject") endmacro() + NBL_BOOST_EXECUTE(git config --file .gitmodules --get-regexp path OUTPUT_VARIABLE NBL_OUTPUT_VARIABLE) + string(REGEX REPLACE "\n" ";" NBL_SUBMODULE_CONFIG_LIST "${NBL_OUTPUT_VARIABLE}") + + foreach(NBL_SUBMODULE_NAME ${NBL_SUBMODULE_CONFIG_LIST}) + string(REGEX MATCH "submodule\\.(.*)\\.path" NBL_SUBMODULE_NAME 
"${NBL_SUBMODULE_NAME}") + list(APPEND BOOST_SUBMODULES "${CMAKE_MATCH_1}") + endforeach() + + # sync & force update of all boost modules first for the tool purpose (sry guys who use the tool, you need to clone all, I want to keep it simple) + NBL_BOOST_EXECUTE(git submodule sync) + list(APPEND BOOST_FORCE_ALL_CONFIG -c url.https://github.com/.insteadOf=git@github.com:) + foreach(SUBMODULE ${BOOST_SUBMODULES}) + list(APPEND BOOST_FORCE_ALL_CONFIG -c submodule.${SUBMODULE}.update=checkout) + endforeach() + + NBL_BOOST_EXECUTE(git ${BOOST_FORCE_ALL_CONFIG} submodule update --init --recursive -f) + + # build boost dep executable + set(NBL_BOOSTDEP_EXE "boostdep.exe") + set(NBL_BOOSTDEP_EXE_FILEPATH "${CMAKE_CURRENT_BINARY_DIR}/superproject/tools/boostdep/bin/${NBL_BOOSTDEP_EXE}") if(NOT EXISTS "${NBL_BOOSTDEP_EXE_FILEPATH}") NBL_BOOST_EXECUTE(cmd /C bootstrap.bat) NBL_BOOST_EXECUTE(cmd /C b2.exe tools/boostdep/build) @@ -68,6 +41,7 @@ if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs NBL_BOOST_EXECUTE(git reset --hard) endif() + # get wave dependency info NBL_BOOST_EXECUTE("${NBL_BOOSTDEP_EXE_FILEPATH}" --boost-root "${CMAKE_CURRENT_SOURCE_DIR}/superproject" --brief wave OUTPUT_VARIABLE NBL_OUTPUT_VAR ) @@ -81,22 +55,66 @@ if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs list(FILTER NBL_BOOST_LIBS EXCLUDE REGEX "(unknown)") string(REPLACE "~" "/" NBL_BOOST_LIBS "${NBL_BOOST_LIBS}") - # NOTE: you commit this file to version control AND boost's .gitmodules *if got changed*, use when updating boost to more recent version + # update boost .gitmodules configuration, discard all but modules reported by wave + # NOTE: you commit this file to version control AND boost's .gitmodules *if got changed*, + # use when updating boost to more recent version file(WRITE "${NBL_BOOST_WAVE_DEP_FILE}" "set(NBL_BOOST_LIBS ${NBL_BOOST_LIBS})") - NBL_BOOST_EXECUTE(git config --file .gitmodules --get-regexp path OUTPUT_VARIABLE NBL_OUTPUT_VARIABLE) - - string(REGEX REPLACE 
"\n" ";" NBL_SUBMODULE_CONFIG_LIST "${NBL_OUTPUT_VARIABLE}") - message(STATUS "Updating boost .gitmodules") - foreach(NBL_SUBMODULE_NAME ${NBL_SUBMODULE_CONFIG_LIST}) - string(REGEX MATCH "submodule\\.(.*)\\.path" NBL_SUBMODULE_NAME "${NBL_SUBMODULE_NAME}") - NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.${CMAKE_MATCH_1}.update none) # fallback, ignore all + foreach(SUBMODULE ${BOOST_SUBMODULES}) + # 1) fallback, ignore all + NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.${SUBMODULE}.update none) endforeach() foreach(NAME ${NBL_BOOST_LIBS}) string(REPLACE "/" "_" SUBMODULE "${NAME}") - message(STATUS "BOOST SUBMODULE = ${SUBMODULE}") - NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.${SUBMODULE}.update checkout) # pick only those reported by the module we use + message(STATUS "WAVE BOOST DEP SUBMODULE = ${SUBMODULE}") + # 2) pick only submodules reported by wave + NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.${SUBMODULE}.update checkout) + endforeach() + # 3) and the top module itself + NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.wave.update checkout) +endif() + +set(BOOST_PREPROCESSOR_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/superproject/libs/preprocessor/include" CACHE PATH "" FORCE) + +get_filename_component(_BOOST_PREPROCESSOR_BR_BUNDLE_SEARCH_DIRECTORY_ "${BOOST_PREPROCESSOR_INCLUDE}" ABSOLUTE) +get_filename_component(_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) +get_filename_component(_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) +set(BOOST_BUILTIN_RESOURCES_DIRECTORY_PATH "${_BOOST_PREPROCESSOR_BR_BUNDLE_SEARCH_DIRECTORY_}/boost" CACHE INTERNAL "" FORCE) + +if(NBL_EMBED_BUILTIN_RESOURCES) + include("${NBL_ROOT_PATH}/src/nbl/builtin/utils.cmake") + + file(GLOB_RECURSE BOOST_HEADERS_REC_REL RELATIVE "${BOOST_BUILTIN_RESOURCES_DIRECTORY_PATH}" "${BOOST_PREPROCESSOR_INCLUDE}/*") + + foreach(BOOST_HEADER_REL IN 
LISTS BOOST_HEADERS_REC_REL) + LIST_BUILTIN_RESOURCE(BOOST_RESOURCES_TO_EMBED "${BOOST_HEADER_REL}") endforeach() -endif() \ No newline at end of file + + ADD_CUSTOM_BUILTIN_RESOURCES(boostBuiltinResourceData BOOST_RESOURCES_TO_EMBED "${_BOOST_PREPROCESSOR_BR_BUNDLE_SEARCH_DIRECTORY_}" "boost" "boost::builtin" "${_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_HEADER_}" "${_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_HEADER_}" "STATIC" "INTERNAL") +endif() + +if(NOT EXISTS "${NBL_BOOST_WAVE_DEP_FILE}") + message(FATAL_ERROR "Internal error, generate NBL_BOOST_WAVE_DEP_FILE by enabling NBL_BOOST_GENERATE_DEP_LIST!") +endif() + +set(BOOST_STAGEDIR "${CMAKE_CURRENT_BINARY_DIR}/boost/superproject/stage") +include("${NBL_BOOST_WAVE_DEP_FILE}") + +foreach(BOOST_LIB IN LISTS NBL_BOOST_LIBS) + add_subdirectory(superproject/libs/${BOOST_LIB} EXCLUDE_FROM_ALL) +endforeach() + +add_subdirectory(superproject/libs/wave EXCLUDE_FROM_ALL) + +list(APPEND NBL_BOOST_TARGETS boost_wave) # wave +foreach(BOOST_LIB IN LISTS NBL_BOOST_LIBS) + if(TARGET boost_${BOOST_LIB}) # wave's deps + list(APPEND NBL_BOOST_TARGETS boost_${BOOST_LIB}) + endif() +endforeach() + +set(NBL_BOOST_TARGETS + ${NBL_BOOST_TARGETS} +PARENT_SCOPE) \ No newline at end of file diff --git a/cmake/submodules/update.cmake b/cmake/submodules/update.cmake index d0365c72ca..5d2474330e 100644 --- a/cmake/submodules/update.cmake +++ b/cmake/submodules/update.cmake @@ -1,223 +1,91 @@ -include(ProcessorCount) find_package(Git REQUIRED) -option(NBL_UPDATE_GIT_SUBMODULE "Turn this ON to let CMake update all public submodules for you" ON) -option(NBL_FORCE_ON_UPDATE_GIT_SUBMODULE "Submodules will be updated with --force flag if NBL_FORCE_UPDATE_GIT_SUBMODULE is turned ON, use with caution - if there are any uncommited files in submodules' working tree they will be removed!" 
OFF) -option(NBL_SYNC_ON_UPDATE_GIT_SUBMODULE "Sync initialized submodule paths if NBL_FORCE_UPDATE_GIT_SUBMODULE is turned ON, this is useful when any submodule remote path got modified and you want to apply this modification to your local repository. Turning NBL_FORCE_ON_UPDATE_GIT_SUBMODULE implies this option" OFF) -option(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE "Turn this ON to attempt to update private Nabla submodules" OFF) -option(NBL_UPDATE_GIT_SUBMODULE_NO_SEPARATE_SHELL "Turn this ON to prevent CMake from executing git submodules update or sync in a separate shell - be aware that the interaction with shell will be impossible in case of paraphrase prompt request of your key!" ON) -option(NBL_CI_GIT_SUBMODULES_SHALLOW "" OFF) +option(NBL_UPDATE_GIT_SUBMODULE "Turn ON to update submodules, only public by default" ON) +option(NBL_FORCE_ON_UPDATE_GIT_SUBMODULE "NBL_UPDATE_GIT_SUBMODULE logic with --force flag" OFF) +option(NBL_SYNC_ON_UPDATE_GIT_SUBMODULE "Sync submodule URLs" OFF) +option(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE "NBL_UPDATE_GIT_SUBMODULE logic but includes private submodules, for Nabla devs" OFF) +option(NBL_SUBMODULES_SHALLOW "NBL_UPDATE_GIT_SUBMODULE logic with --depth=1" OFF) -# TODO: replace all of this command recording & proxy logic with executing single recurse one-liner including -c options for private submodules -# once we have relative URLs + all .gitmodules configs are polished (so basically we don't have to set some config options on fly) - -if(NOT DEFINED NBL_ROOT_PATH) +if(NBL_UPDATE_GIT_SUBMODULE) +block() get_filename_component(NBL_ROOT_PATH "${CMAKE_CURRENT_LIST_DIR}/../../" ABSOLUTE) -endif() - -if(NOT DEFINED THIRD_PARTY_SOURCE_DIR) set(THIRD_PARTY_SOURCE_DIR "${NBL_ROOT_PATH}/3rdparty") -endif() - -if(NOT DEFINED NBL_ROOT_PATH_BINARY) - set(NBL_ROOT_PATH_BINARY "${NBL_ROOT_PATH}/build/.submodules") -endif() - -if(NOT DEFINED NBL_BUILD_EXAMPLES) - set(NBL_BUILD_EXAMPLES ON) -endif() - -function(NBL_UPDATE_SUBMODULES) - 
ProcessorCount(_GIT_SUBMODULES_JOBS_AMOUNT_) - - set(PRIVATE_SUBMODULES - Ditt-Reference-Scenes - ) - - foreach(NBL_P_SUBMODULE_NAME ${PRIVATE_SUBMODULES}) - if(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE) - list(APPEND NBL_CONFIG_PRIVATE_SETUP_CMD "-c submodule.\"${NBL_P_SUBMODULE_NAME}\".update=checkout") - else() - list(APPEND NBL_CONFIG_PRIVATE_SETUP_CMD "-c submodule.\"${NBL_P_SUBMODULE_NAME}\".update=none") - endif() - endforeach() - - if(NBL_CI_GIT_SUBMODULES_SHALLOW) - set(NBL_SHALLOW "--depth=1") - else() - set(NBL_SHALLOW "") + + if(NOT DEFINED NBL_ROOT_PATH_BINARY) + set(NBL_ROOT_PATH_BINARY "${NBL_ROOT_PATH}/build/.submodules") endif() - - if(NBL_FORCE_ON_UPDATE_GIT_SUBMODULE) - set(NBL_FORCE "--force") - else() - set(NBL_FORCE "") + + if(NOT DEFINED NBL_BUILD_EXAMPLES) + set(NBL_BUILD_EXAMPLES ON) endif() - macro(NBL_WRAPPER_COMMAND_EXCLUSIVE GIT_RELATIVE_ENTRY GIT_SUBMODULE_PATH SHOULD_RECURSIVE EXCLUDE_SUBMODULE_PATHS) - set(EXCLUDE_SUBMODULE_PATHS ${EXCLUDE_SUBMODULE_PATHS}) - set(SHOULD_RECURSIVE ${SHOULD_RECURSIVE}) - - if("${EXCLUDE_SUBMODULE_PATHS}" STREQUAL "") - set(NBL_EXCLUDE "") - else() - foreach(EXCLUDE_SUBMODULE_PATH ${EXCLUDE_SUBMODULE_PATHS}) - string(APPEND NBL_EXCLUDE "-c submodule.\"${EXCLUDE_SUBMODULE_PATH}\".update=none ") - endforeach() - - string(STRIP "${NBL_EXCLUDE}" NBL_EXCLUDE) - endif() - - if(SHOULD_RECURSIVE) - set(_NBL_EXECUTE_COMMAND_ "\"${GIT_EXECUTABLE}\" -C \"${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}\" ${NBL_EXCLUDE} ${NBL_CONFIG_PRIVATE_SETUP_CMD} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} ${NBL_FORCE} --recursive ${NBL_SHALLOW} ${GIT_SUBMODULE_PATH}") - else() - set(_NBL_EXECUTE_COMMAND_ "\"${GIT_EXECUTABLE}\" -C \"${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}\" ${NBL_EXCLUDE} ${NBL_CONFIG_PRIVATE_SETUP_CMD} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} ${NBL_FORCE} ${NBL_SHALLOW} ${GIT_SUBMODULE_PATH}") - endif() - - string(APPEND _NBL_UPDATE_SUBMODULES_COMMANDS_ "${_NBL_EXECUTE_COMMAND_}\n") - - 
unset(NBL_EXCLUDE) - endmacro() - - set(_NBL_UPDATE_SUBMODULES_CMD_NAME_ "nbl-update-submodules") - set(_NBL_UPDATE_SUBMODULES_CMD_FILE_ "${NBL_ROOT_PATH_BINARY}/${_NBL_UPDATE_SUBMODULES_CMD_NAME_}.cmd") - get_filename_component(_NBL_UPDATE_IMPL_CMAKE_FILE_ "${NBL_ROOT_PATH_BINARY}/${_NBL_UPDATE_SUBMODULES_CMD_NAME_}.cmake" ABSOLUTE) - - # Proxy script for inclusive submodule updating - string(APPEND NBL_IMPL_SCRIPT "set(NBL_ROOT_PATH \"${NBL_ROOT_PATH}\")\nset(_GIT_SUBMODULES_JOBS_AMOUNT_ ${_GIT_SUBMODULES_JOBS_AMOUNT_})\nset(GIT_EXECUTABLE \"${GIT_EXECUTABLE}\")\nset(NBL_SHALLOW \"${NBL_SHALLOW}\")\nset(NBL_FORCE \"${NBL_FORCE}\")\n\n") - string(APPEND NBL_IMPL_SCRIPT -[=[ -if(NOT DEFINED GIT_RELATIVE_ENTRY) - message(FATAL_ERROR "GIT_RELATIVE_ENTRY must be defined to use this script!") -endif() + # we force HTTPS traffic for all *public* submodules we update from CMake + # NOTE: it *doesn't* rewrite destination URLs after checkout, if you eg. + # clone with SSH you end up with it anyway, this way your private key + # is never involved during CMake configuration, unless you + # use NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE -if(NOT DEFINED INCLUDE_SUBMODULE_PATHS) - message(FATAL_ERROR "INCLUDE_SUBMODULE_PATHS must be defined to use this script!") -endif() + # Private refs (*), exclude from public update + list(APPEND NBL_CONFIG_SUBMODULE -c submodule.\"Ditt-Reference-Scenes\".update=none) -# update an inclusive submodule first -execute_process(COMMAND "${GIT_EXECUTABLE}" -C "${NBL_ROOT_PATH}" submodule update --init "${GIT_RELATIVE_ENTRY}") + unset(NBL_UPDATE_OPTIONS) -if("${INCLUDE_SUBMODULE_PATHS}" STREQUAL "") - set(NBL_SUBMODULE_UPDATE_CONFIG_ENTRY "") -else() - execute_process(COMMAND "${GIT_EXECUTABLE}" -C "${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}" config --file .gitmodules --get-regexp path - OUTPUT_VARIABLE NBL_OUTPUT_VARIABLE - ) + if(NBL_SUBMODULES_SHALLOW) + list(APPEND NBL_UPDATE_OPTIONS --depth=1) + endif() - string(REGEX REPLACE "\n" ";" 
NBL_SUBMODULE_CONFIG_LIST "${NBL_OUTPUT_VARIABLE}") - - foreach(NBL_SUBMODULE_NAME ${NBL_SUBMODULE_CONFIG_LIST}) - string(REGEX MATCH "submodule\\.(.*)\\.path" NBL_SUBMODULE_NAME "${NBL_SUBMODULE_NAME}") - list(APPEND NBL_ALL_SUBMODULES "${CMAKE_MATCH_1}") - endforeach() - - foreach(NBL_SUBMODULE_NAME ${NBL_ALL_SUBMODULES}) - list(FIND INCLUDE_SUBMODULE_PATHS "${NBL_SUBMODULE_NAME}" NBL_FOUND) - - if("${NBL_FOUND}" STREQUAL "-1") - list(APPEND NBL_CONFIG_SETUP_CMD "-c;submodule.${NBL_SUBMODULE_NAME}.update=none") # filter submodules - only those on the INCLUDE_SUBMODULE_PATHS list will be updated when recursive update is requested, all left will be skipped - endif() - endforeach() -endif() - -execute_process(COMMAND "${GIT_EXECUTABLE}" ${NBL_CONFIG_SETUP_CMD} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} --recursive ${NBL_SHALLOW} ${NBL_FORCE} - WORKING_DIRECTORY "${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}" -) -]=] -) - file(WRITE "${_NBL_UPDATE_IMPL_CMAKE_FILE_}" "${NBL_IMPL_SCRIPT}") - - macro(NBL_WRAPPER_COMMAND_INCLUSIVE GIT_RELATIVE_ENTRY INCLUDE_SUBMODULE_PATHS) - string(APPEND _NBL_UPDATE_SUBMODULES_COMMANDS_ "\"${CMAKE_COMMAND}\" \"-DGIT_RELATIVE_ENTRY=${GIT_RELATIVE_ENTRY}\" \"-DINCLUDE_SUBMODULE_PATHS=${INCLUDE_SUBMODULE_PATHS}\" -P \"${_NBL_UPDATE_IMPL_CMAKE_FILE_}\"\n") + if(NBL_FORCE_ON_UPDATE_GIT_SUBMODULE) + list(APPEND NBL_UPDATE_OPTIONS --force) + endif() + + if(NOT NBL_BUILD_EXAMPLES) + list(APPEND NBL_CONFIG_SUBMODULE -c submodule.\"examples_tests\".update=none) + endif() + + macro(NBL_GIT_COMMAND) + execute_process(COMMAND "${GIT_EXECUTABLE}" ${ARGV}) endmacro() + + if(NBL_SYNC_ON_UPDATE_GIT_SUBMODULE) + message(STATUS "Syncing Public submodules") + NBL_GIT_COMMAND(${NBL_CONFIG_SUBMODULE} submodule sync --recursive WORKING_DIRECTORY "${NBL_ROOT_PATH}") + endif() - if(NBL_UPDATE_GIT_SUBMODULE) - execute_process(COMMAND ${CMAKE_COMMAND} -E echo "All submodules are about to get updated and initialized in repository because 
NBL_UPDATE_GIT_SUBMODULE is turned ON!") - - include("${THIRD_PARTY_SOURCE_DIR}/boost/dep/wave.cmake") - - macro(NBL_IMPL_INIT_COMMON_SUBMODULES) - # 3rdparty except boost & gltf - set(NBL_3RDPARTY_MODULES_TO_SKIP - 3rdparty/boost/superproject # a lot of submodules we don't use - 3rdparty/glTFSampleModels # more then 2GB waste of space (disk + .gitmodules data) - ) - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./3rdparty TRUE "${NBL_3RDPARTY_MODULES_TO_SKIP}") - - # boost's 3rdparties, special case - # TODO: fork boost and update .gitmodules to cover only libs we want to use - set(NBL_BOOST_LIBS_TO_INIT ${NBL_BOOST_LIBS} wave numeric_conversion) # wave and all of its deps, numeric_conversion is nested in conversion submodule (for some reason boostdep tool doesn't output it properly) - foreach(NBL_TARGET ${NBL_BOOST_LIBS_TO_INIT}) - list(APPEND NBL_BOOST_SUBMODULES_TO_INIT ${NBL_TARGET}) - endforeach() - NBL_WRAPPER_COMMAND_INCLUSIVE(3rdparty/boost/superproject "${NBL_BOOST_SUBMODULES_TO_INIT}") - - # tests - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./tests FALSE "") - - # docker - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./docker FALSE "") + message(STATUS "Updating Public submodules") + NBL_GIT_COMMAND(-c url.https://github.com/.insteadOf=git@github.com: ${NBL_CONFIG_SUBMODULE} submodule update --init --recursive ${NBL_UPDATE_OPTIONS} WORKING_DIRECTORY "${NBL_ROOT_PATH}") + + if(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE) + # NOTE: your git must be installed with default Git Bash as shell + # otherwise it *may* fail, whether it works depends on your agent setup + + find_package(GitBash REQUIRED) + + macro(NBL_GIT_BASH_COMMAND) + execute_process(COMMAND "${GIT_BASH_EXECUTABLE}" "-c" ${ARGV}) endmacro() - - NBL_IMPL_INIT_COMMON_SUBMODULES() - - if(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE) - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./examples_tests TRUE "") - else() - # NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./ci TRUE "") TODO: enable it once we merge Ditt, etc - - # examples and their media - 
if(NBL_BUILD_EXAMPLES) - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./examples_tests TRUE "") - endif() - endif() - - file(WRITE "${_NBL_UPDATE_SUBMODULES_CMD_FILE_}" "${_NBL_UPDATE_SUBMODULES_COMMANDS_}") - - if(WIN32) - if(NBL_UPDATE_GIT_SUBMODULE_NO_SEPARATE_SHELL) - set(UPDATE_COMMAND - nbl-update-submodules.cmd - ) - - execute_process(COMMAND ${UPDATE_COMMAND} - WORKING_DIRECTORY "${NBL_ROOT_PATH_BINARY}" - RESULT_VARIABLE _NBL_TMP_RET_CODE_ - ) - else() - find_package(GitBash REQUIRED) - - execute_process(COMMAND "${GIT_BASH_EXECUTABLE}" "-c" + + message(STATUS "Updating Private submodules") + string(REPLACE ";" " " NBL_UPDATE_OPTIONS "${NBL_UPDATE_OPTIONS}") + set(LOG_FILE "${NBL_ROOT_PATH_BINARY}/nbl-update-private-submodules.log") + set(BASH_CMD [=[ >&2 echo "" clear -./nbl-update-submodules.cmd 2>&1 | tee nbl-update-submodules.log -sleep 1 +{ + echo "=== $(date) :: Starting private submodule update ===" + git -c submodule.Ditt-Reference-Scenes.update=checkout -C @NBL_ROOT_PATH@/examples_tests/media submodule update --init Ditt-Reference-Scenes @NBL_UPDATE_OPTIONS@ + # more private submodule here + + echo "=== $(date) :: Created @LOG_FILE@ in your build directory. ===" + echo "=== $(date) :: Finished private submodule update ===" +} 2>&1 | tee @LOG_FILE@ clear -tput setaf 2; echo -e "Submodules have been updated! -Created nbl-update-submodules.log in your build directory." 
]=] - WORKING_DIRECTORY ${NBL_ROOT_PATH_BINARY} - OUTPUT_VARIABLE _NBL_TMP_OUTPUT_ - RESULT_VARIABLE _NBL_TMP_RET_CODE_ - OUTPUT_STRIP_TRAILING_WHITESPACE - ERROR_STRIP_TRAILING_WHITESPACE - ) - - unset(_NBL_TMP_OUTPUT_) - unset(_NBL_TMP_RET_CODE_) - - message(STATUS "Generated \"${NBL_ROOT_PATH_BINARY}/nbl-update-submodules.log\"") - endif() - - message(STATUS "Submodules have been updated!") - else() - execute_process(COMMAND "${_NBL_UPDATE_SUBMODULES_CMD_FILE_}") - endif() - else() - execute_process(COMMAND ${CMAKE_COMMAND} -E echo "NBL_UPDATE_GIT_SUBMODULE is turned OFF therefore submodules won't get updated.") + ) + string(CONFIGURE "${BASH_CMD}" BASH_CMD) + NBL_GIT_BASH_COMMAND("${BASH_CMD}" OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_STRIP_TRAILING_WHITESPACE RESULT_VARIABLE RES) + file(READ "${LOG_FILE}" LOG_CONTENT) + message(STATUS "${LOG_CONTENT}") endif() -endfunction() - -NBL_UPDATE_SUBMODULES() \ No newline at end of file +endblock() +endif() \ No newline at end of file From bf9390018f84be9c762eb6c152fe17a993b4e015 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 15 May 2025 19:42:46 +0200 Subject: [PATCH 129/346] use fetch.parallel=0 in CMake update --- cmake/submodules/update.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/submodules/update.cmake b/cmake/submodules/update.cmake index 5d2474330e..412cdf04e0 100644 --- a/cmake/submodules/update.cmake +++ b/cmake/submodules/update.cmake @@ -52,7 +52,7 @@ block() endif() message(STATUS "Updating Public submodules") - NBL_GIT_COMMAND(-c url.https://github.com/.insteadOf=git@github.com: ${NBL_CONFIG_SUBMODULE} submodule update --init --recursive ${NBL_UPDATE_OPTIONS} WORKING_DIRECTORY "${NBL_ROOT_PATH}") + NBL_GIT_COMMAND(-c fetch.parallel=0 -c url.https://github.com/.insteadOf=git@github.com: ${NBL_CONFIG_SUBMODULE} submodule update --init --recursive ${NBL_UPDATE_OPTIONS} WORKING_DIRECTORY "${NBL_ROOT_PATH}") if(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE) # NOTE: your 
git must be installed with default Git Bash as shell From 55d89c5c2e3be03e178af923f0b70dc3420f63d4 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 16 May 2025 10:09:41 +0700 Subject: [PATCH 130/346] no need to store locals in reduce --- .../nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 1043decd73..add3acc687 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -94,20 +94,20 @@ struct reduce using params_lv1_t = subgroup2::ArithmeticParams; BinOp binop; - vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 0 scan subgroup2::reduction reduction0; [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); - scan_local[idx] = reduction0(scan_local[idx]); + vector_lv0_t scan_local; + dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + scan_local = reduction0(scan_local); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan 
(reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -227,20 +227,20 @@ struct reduce using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; - vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 0 scan subgroup2::reduction reduction0; [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); - scan_local[idx] = reduction0(scan_local[idx]); + vector_lv0_t scan_local; + dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + scan_local = reduction0(scan_local); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); From 4e4f26e994a2ca5c5009ba3768b0121b627f50bd Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 16 May 2025 11:18:51 +0700 Subject: [PATCH 131/346] added workgroup accessor concepts, refactor accessor usage --- examples_tests | 2 +- .../accessors/workgroup_arithmetic.hlsl | 57 ++++++++++++++++ .../builtin/hlsl/workgroup2/arithmetic.hlsl | 7 +- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 66 +++++++++---------- src/nbl/builtin/CMakeLists.txt | 9 +++ 5 files 
changed, 104 insertions(+), 37 deletions(-) create mode 100644 include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl diff --git a/examples_tests b/examples_tests index 1de31ddfd7..e828dc49ef 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 1de31ddfd725009bd650f1fe80f1c4a8c2e6a14a +Subproject commit e828dc49ef0a223dcbb8b4af8d722974747f29ee diff --git a/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl new file mode 100644 index 0000000000..de5e5a3c35 --- /dev/null +++ b/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl @@ -0,0 +1,57 @@ +#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ +#define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ + +#include "nbl/builtin/hlsl/concepts.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +#define NBL_CONCEPT_NAME ArithmeticSharedMemoryAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T) +#define NBL_CONCEPT_PARAM_0 (accessor, T) +#define NBL_CONCEPT_PARAM_1 (index, uint32_t) +#define NBL_CONCEPT_PARAM_2 (val, uint32_t) +NBL_CONCEPT_BEGIN(3) +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) +); +#undef val +#undef index +#undef accessor +#include + +#define NBL_CONCEPT_NAME ArithmeticDataAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T) +#define NBL_CONCEPT_PARAM_0 (accessor, T) +#define NBL_CONCEPT_PARAM_1 
(index, uint32_t) +#define NBL_CONCEPT_PARAM_2 (val, uint32_t) +NBL_CONCEPT_BEGIN(3) +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) +); +#undef val +#undef index +#undef accessor +#include + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl index 3b4a028d2c..d0a26cdf94 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -8,6 +8,7 @@ #include "nbl/builtin/hlsl/functional.hlsl" #include "nbl/builtin/hlsl/workgroup/ballot.hlsl" #include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" +#include "nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl" #include "nbl/builtin/hlsl/workgroup2/shared_scan.hlsl" @@ -21,7 +22,7 @@ namespace workgroup2 template struct reduction { - template + template && ArithmeticSharedMemoryAccessor) static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::reduce fn; @@ -32,7 +33,7 @@ struct reduction template struct inclusive_scan { - template + template && ArithmeticSharedMemoryAccessor) static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::scan fn; @@ -43,7 +44,7 @@ struct inclusive_scan template struct exclusive_scan { - template + template && ArithmeticSharedMemoryAccessor) static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::scan fn; diff --git 
a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index add3acc687..d53bfd6000 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -43,9 +43,9 @@ struct reduce subgroup2::reduction reduction; vector_t value; - dataAccessor.get(workgroup::SubgroupContiguousIndex(), value); + dataAccessor.template get(workgroup::SubgroupContiguousIndex(), value); value = reduction(value); - dataAccessor.set(workgroup::SubgroupContiguousIndex(), value); + dataAccessor.template set(workgroup::SubgroupContiguousIndex(), value); } }; @@ -63,7 +63,7 @@ struct scan using params_t = subgroup2::ArithmeticParams; vector_t value; - dataAccessor.get(workgroup::SubgroupContiguousIndex(), value); + dataAccessor.template get(workgroup::SubgroupContiguousIndex(), value); if (Exclusive) { subgroup2::exclusive_scan excl_scan; @@ -74,7 +74,7 @@ struct scan subgroup2::inclusive_scan incl_scan; value = incl_scan(value); } - dataAccessor.set(workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? + dataAccessor.template set(workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? 
} }; @@ -101,13 +101,13 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t scan_local; - dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); scan_local = reduction0(scan_local); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -119,9 +119,9 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); + scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = reduction1(lv1_val); - scratchAccessor.set(invocationIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + scratchAccessor.template set(invocationIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -130,8 +130,8 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { scalar_t reduce_val; - scratchAccessor.get(glsl::gl_SubgroupInvocationID(),reduce_val); - dataAccessor.set(idx * 
Config::WorkgroupSize + virtualInvocationIndex, reduce_val); + scratchAccessor.template get(glsl::gl_SubgroupInvocationID(),reduce_val); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, hlsl::promote(reduce_val)); } } }; @@ -158,13 +158,13 @@ struct scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = inclusiveScan0(scan_local[idx]); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.template set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -177,10 +177,10 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+prevIndex,lv1_val[i]); + scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+prevIndex,lv1_val[i]); vector_lv1_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv1_val, bool(invocationIndex)); shiftedInput = inclusiveScan1(shiftedInput); - scratchAccessor.set(invocationIndex, shiftedInput[Config::ItemsPerInvocation_1-1]); + scratchAccessor.template 
set(invocationIndex, shiftedInput[Config::ItemsPerInvocation_1-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -190,7 +190,7 @@ struct scan { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); scalar_t left; - scratchAccessor.get(virtualSubgroupID,left); + scratchAccessor.template get(virtualSubgroupID,left); if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(scan_local[idx][Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); @@ -204,7 +204,7 @@ struct scan for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) scan_local[idx][i] = binop(left, scan_local[idx][i]); } - dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); } } }; @@ -234,13 +234,13 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t scan_local; - dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); scan_local = reduction0(scan_local); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } 
scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -252,12 +252,12 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); + scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = reduction1(lv1_val); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t bankedIndex = (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); - scratchAccessor.set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + scratchAccessor.template set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -269,9 +269,9 @@ struct reduce vector_lv2_t lv2_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv2_val[i]); + scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv2_val[i]); lv2_val = reduction2(lv2_val); - scratchAccessor.set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); + scratchAccessor.template set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -280,8 +280,8 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { scalar_t reduce_val; - scratchAccessor.get(glsl::gl_SubgroupInvocationID(),reduce_val); - dataAccessor.set(idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); + scratchAccessor.template get(glsl::gl_SubgroupInvocationID(),reduce_val); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); } } }; @@ -310,13 +310,13 @@ struct scan [unroll] for 
(uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); scan_local[idx] = inclusiveScan0(scan_local[idx]); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.template set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -329,12 +329,12 @@ struct scan vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); + scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = inclusiveScan1(lv1_val); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t bankedIndex = (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); - scratchAccessor.set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -347,7 +347,7 @@ struct scan const uint32_t prevIndex = 
invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.get(lv1_smem_size+i*Config::SubgroupSize+prevIndex,lv2_val[i]); + scratchAccessor.template get(lv1_smem_size+i*Config::SubgroupSize+prevIndex,lv2_val[i]); vector_lv2_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(invocationIndex)); shiftedInput = inclusiveScan2(shiftedInput); @@ -356,10 +356,10 @@ struct scan for (uint32_t i = 0; i < Config::SubgroupsPerVirtualWorkgroup; i++) { scalar_t last_val; - scratchAccessor.get((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i),last_val); + scratchAccessor.template get((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i),last_val); scalar_t val = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(i)); val = binop(last_val, shiftedInput[Config::ItemsPerInvocation_2-1]); - scratchAccessor.set((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i), last_val); + scratchAccessor.template set((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i), last_val); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -370,7 +370,7 @@ struct scan { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const scalar_t left; - scratchAccessor.get(virtualSubgroupID, left); + scratchAccessor.template get(virtualSubgroupID, left); if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(scan_local[idx][Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); @@ -384,7 +384,7 @@ struct scan for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) scan_local[idx][i] = binop(left, scan_local[idx][i]); } - dataAccessor.set(idx * Config::WorkgroupSize + 
virtualInvocationIndex, scan_local[idx]); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); } } }; diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index 9333a0d3b4..a6405a3c99 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -330,6 +330,10 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/basic.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/arithmetic_portability.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/arithmetic_portability_impl.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/fft.hlsl") +#subgroup2 +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup2/ballot.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup2/arithmetic_portability.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup2/arithmetic_portability_impl.hlsl") #shared header between C++ and HLSL LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/surface_transform.h") #workgroup @@ -341,6 +345,10 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/fft.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scratch_size.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shared_scan.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shuffle.hlsl") +#workgroup2 +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/arithmetic_config.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/arithmetic.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/shared_scan.hlsl") #Extensions LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/default.vert.hlsl") @@ -362,6 +370,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/loadable_i 
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/mip_mapped.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/storable_image.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/fft.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/workgroup_arithmetic.hlsl") #tgmath LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath/impl.hlsl") From 6884d4548e758c6591b7b291e2895457de4a36ab Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 16 May 2025 18:19:55 +0700 Subject: [PATCH 132/346] Add non const computeDependants to IAsset and its child classes --- include/nbl/asset/IAsset.h | 8 ++- include/nbl/asset/ICPUAccelerationStructure.h | 5 ++ include/nbl/asset/ICPUAnimationLibrary.h | 9 +++ include/nbl/asset/ICPUBuffer.h | 5 ++ include/nbl/asset/ICPUBufferView.h | 14 ++++- include/nbl/asset/ICPUComputePipeline.h | 36 +++++++---- include/nbl/asset/ICPUDescriptorSet.h | 1 + include/nbl/asset/ICPUDescriptorSetLayout.h | 28 ++++++--- include/nbl/asset/ICPUGraphicsPipeline.h | 23 +++++-- include/nbl/asset/ICPUImage.h | 5 ++ include/nbl/asset/ICPUImageView.h | 14 ++++- include/nbl/asset/ICPUMesh.h | 5 ++ include/nbl/asset/ICPUMeshBuffer.h | 5 ++ include/nbl/asset/ICPUPipeline.h | 2 +- include/nbl/asset/ICPUPipelineCache.h | 5 ++ include/nbl/asset/ICPUPipelineLayout.h | 13 ++++ include/nbl/asset/ICPURayTracingPipeline.h | 26 +++++--- include/nbl/asset/ICPURenderpass.h | 5 ++ .../asset/ICPURenderpassIndependentPipeline.h | 5 ++ include/nbl/asset/ICPUSampler.h | 5 ++ include/nbl/asset/ICPUSkeleton.h | 16 ++++- include/nbl/asset/IShader.h | 15 ++++- src/nbl/asset/ICPUDescriptorSet.cpp | 62 +++++++++++-------- 23 files changed, 248 insertions(+), 64 deletions(-) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index c3950c4912..0e91b99c36 100644 --- a/include/nbl/asset/IAsset.h +++ 
b/include/nbl/asset/IAsset.h @@ -158,7 +158,13 @@ class IAsset : virtual public core::IReferenceCounted virtual core::unordered_set computeDependants() const = 0; - virtual bool valid() const = 0; + virtual core::unordered_set computeDependants() = 0; + + virtual bool valid() const + { + //TODO(kevinyu): Temporary set this to true to make changes compile. Will revisit this later for each asset + return true; + } protected: inline IAsset() = default; diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index affd165667..3ac794a888 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -141,6 +141,11 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo return {}; } + inline core::unordered_set computeDependants() override + { + return {}; + } + inline core::blake3_hash_t computeContentHash() const override { if (!missingContent()) diff --git a/include/nbl/asset/ICPUAnimationLibrary.h b/include/nbl/asset/ICPUAnimationLibrary.h index 5fea370b63..8a6cdaf52a 100644 --- a/include/nbl/asset/ICPUAnimationLibrary.h +++ b/include/nbl/asset/ICPUAnimationLibrary.h @@ -100,6 +100,15 @@ class ICPUAnimationLibrary final : public IAnimationLibrary, public { return { m_keyframeStorageBinding.buffer.get(), m_timestampStorageBinding.buffer.get(), m_animationStorageRange.buffer.get() }; } + + private: + + template + requires(std::same_as, ICPUAnimationLibrary>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + return core::unordered_set{ self->m_keyframeStorageBinding.buffer.get(), self->m_timestampStorageBinding.buffer.get(), self->m_animationStorageRange.buffer.get() }; + } }; } diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h index 2d495ef02e..0ad1d7bf48 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -80,6 
+80,11 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed return {}; } + inline core::unordered_set computeDependants() override + { + return {}; + } + inline core::blake3_hash_t computeContentHash() const override { core::blake3_hasher hasher; diff --git a/include/nbl/asset/ICPUBufferView.h b/include/nbl/asset/ICPUBufferView.h index 7f3f676695..55d50356c1 100644 --- a/include/nbl/asset/ICPUBufferView.h +++ b/include/nbl/asset/ICPUBufferView.h @@ -30,7 +30,12 @@ class ICPUBufferView : public IBufferView, public IAsset inline core::unordered_set computeDependants() const override { - return { m_buffer.get() }; + return computeDependantsImpl(this); + } + + inline core::unordered_set computeDependants() override + { + return computeDependantsImpl(this); } ICPUBuffer* getUnderlyingBuffer() @@ -54,6 +59,13 @@ class ICPUBufferView : public IBufferView, public IAsset protected: virtual ~ICPUBufferView() = default; + private: + template + requires(std::same_as, ICPUBufferView>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + return core::unordered_set{ self->m_buffer.get() }; + } }; } diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 8d8b343a3d..f6b689857f 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -25,31 +25,28 @@ class ICPUComputePipeline final : public ICPUPipeline(retval,core::dont_grab); } - inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final - { - auto newPipeline = new ICPUComputePipeline(layout.get()); - newPipeline->m_specInfo = m_specInfo.clone(depth); - return core::smart_refctd_ptr(newPipeline, core::dont_grab); - } - constexpr static inline auto AssetType = ET_COMPUTE_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } //! 
- virtual core::unordered_set computeDependants() const override + inline core::unordered_set computeDependants() const override + { + return computeDependantsImpl(this); + } + + inline core::unordered_set computeDependants() override { - return {m_layout.get(), m_specInfo.shader.get()}; + return computeDependantsImpl(this); } - inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) const override final + inline std::span getSpecInfo(hlsl::ShaderStage stage) const override final { - if (stage==hlsl::ShaderStage::ESS_COMPUTE && isMutable()) + if (stage==hlsl::ShaderStage::ESS_COMPUTE) return {&m_specInfo,1}; return {}; } - - inline virtual bool valid() const override final + inline bool valid() const override { if (!m_layout) return false; if (!m_layout->valid()) return false; @@ -64,10 +61,23 @@ class ICPUComputePipeline final : public ICPUPipeline clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + { + auto newPipeline = new ICPUComputePipeline(layout.get()); + newPipeline->m_specInfo = m_specInfo.clone(depth); + return core::smart_refctd_ptr(newPipeline, core::dont_grab); + } + explicit ICPUComputePipeline(const ICPUPipelineLayout* layout): base_t(layout, {}) {} + template + requires(std::same_as, ICPUComputePipeline>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + return core::unordered_set{ self->m_layout.get(), self->m_specInfo.shader.get() }; + } }; } diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index 77640b8f9f..c8a6f68d22 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -78,6 +78,7 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet clone(uint32_t _depth = ~0u) const override; core::unordered_set computeDependants() const override; + core::unordered_set computeDependants() override; protected: virtual ~ICPUDescriptorSet() = default; diff 
--git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index 2ddf1e26be..b2c06792d6 100644 --- a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -59,18 +59,32 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public core::unordered_set computeDependants() const override { - if (!m_immutableSamplers) return {}; - core::unordered_set dependants; - for (const auto& sampler: m_immutableSamplers) - { - dependants.insert(sampler.get()); - } - return dependants; + return computeDependantsImpl(this); + } + + core::unordered_set computeDependants() override + { + return computeDependantsImpl(this); } protected: virtual ~ICPUDescriptorSetLayout() = default; + + private: + template + requires(std::same_as, ICPUDescriptorSetLayout>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + core::unordered_set dependants; + if (!self->m_immutableSamplers) return dependants; + for (const auto& sampler: self->m_immutableSamplers) + { + dependants.insert(sampler.get()); + } + return dependants; + } + }; } diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 0629f82f1c..dcdcfb495e 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -43,12 +43,14 @@ class ICPUGraphicsPipeline final : public ICPUPipeline computeDependants() const override + inline core::unordered_set computeDependants() const override { - core::unordered_set dependants = { m_layout.get(), m_renderpass.get()}; - for (const auto& info : m_specInfos) - if (info.shader) dependants.insert(info.shader.get()); - return dependants; + return computeDependantsImpl(this); + } + + inline core::unordered_set computeDependants() override + { + return computeDependantsImpl(this); } inline SCachedCreationParams& getCachedCreationParams() @@ -69,6 +71,7 @@ class 
ICPUGraphicsPipeline final : public ICPUPipelinevalid())return false; // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-dynamicRendering-06576 if (!m_renderpass || m_params.subpassIx >= m_renderpass->getSubpassCount()) return false; @@ -108,6 +111,16 @@ class ICPUGraphicsPipeline final : public ICPUPipeline(hlsl::ShaderStage::ESS_VERTEX + index); } + + template + requires(std::same_as, ICPUGraphicsPipeline>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + core::unordered_set dependants = { self->m_layout.get(), self->m_renderpass.get()}; + for (const auto& info : self->m_specInfos) + if (info.shader) dependants.insert(info.shader.get()); + return dependants; + } }; } diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index 2527fd1ecb..b732e50492 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -51,6 +51,11 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed return {}; } + inline core::unordered_set computeDependants() override + { + return {}; + } + core::blake3_hash_t computeContentHash() const override; // Having regions specififed to upload is optional! So to have content missing we must have regions but no buffer content diff --git a/include/nbl/asset/ICPUImageView.h b/include/nbl/asset/ICPUImageView.h index 6b3d562a60..9639df6eb9 100644 --- a/include/nbl/asset/ICPUImageView.h +++ b/include/nbl/asset/ICPUImageView.h @@ -51,7 +51,12 @@ class ICPUImageView final : public IImageView, public IAsset inline core::unordered_set computeDependants() const override { - return { params.image.get() }; + return computeDependantsImpl(this); + } + + inline core::unordered_set computeDependants() override + { + return computeDependantsImpl(this); } //! 
@@ -70,6 +75,13 @@ class ICPUImageView final : public IImageView, public IAsset protected: virtual ~ICPUImageView() = default; + private: + template + requires(std::same_as, ICPUImageView>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + return core::unordered_set{ self->params.image.get() }; + } }; } diff --git a/include/nbl/asset/ICPUMesh.h b/include/nbl/asset/ICPUMesh.h index 2648900ccc..e9aaf53ba4 100644 --- a/include/nbl/asset/ICPUMesh.h +++ b/include/nbl/asset/ICPUMesh.h @@ -87,6 +87,11 @@ class ICPUMesh final : public IMesh, public IAsset return {}; } + inline core::unordered_set computeDependants() override + { + return {}; + } + protected: private: diff --git a/include/nbl/asset/ICPUMeshBuffer.h b/include/nbl/asset/ICPUMeshBuffer.h index 61e9168a98..c44d055c18 100644 --- a/include/nbl/asset/ICPUMeshBuffer.h +++ b/include/nbl/asset/ICPUMeshBuffer.h @@ -617,6 +617,11 @@ class ICPUMeshBuffer final : public IMeshBuffer computeDependants() override + { + return {}; + } + }; } diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index ae2c64372d..8fe7e38391 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -131,7 +131,7 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe inline std::span getSpecInfo(hlsl::ShaderStage stage) { if (!isMutable()) return {}; - const auto specInfo = static_cast(this)->getSpecInfo(stage); + const auto specInfo = const_cast(this)->getSpecInfo(stage); return { const_cast(specInfo.data()), specInfo.size() }; } diff --git a/include/nbl/asset/ICPUPipelineCache.h b/include/nbl/asset/ICPUPipelineCache.h index 6fc019ce7f..0ff912603d 100644 --- a/include/nbl/asset/ICPUPipelineCache.h +++ b/include/nbl/asset/ICPUPipelineCache.h @@ -65,6 +65,11 @@ class ICPUPipelineCache final : public IPreHashed return {}; } + inline core::unordered_set computeDependants() override + { + return 
{}; + } + // inline core::blake3_hash_t computeContentHash() const override { diff --git a/include/nbl/asset/ICPUPipelineLayout.h b/include/nbl/asset/ICPUPipelineLayout.h index 994d480b17..e755a22f07 100644 --- a/include/nbl/asset/ICPUPipelineLayout.h +++ b/include/nbl/asset/ICPUPipelineLayout.h @@ -79,6 +79,19 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout + requires(std::same_as, ICPUPipelineLayout>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + core::unordered_set dependants; + for (auto i = 0; i < self->m_descSetLayouts.size(); i++) + { + if (!self->m_descSetLayouts[i]) continue; + dependants.insert(self->m_descSetLayouts[i].get()); + } + return dependants; + } + }; } diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 5d975fa4dc..2b04a2f41b 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -57,14 +57,11 @@ class ICPURayTracingPipeline final : public ICPUPipeline computeDependants() const override final { - core::unordered_set dependants; - dependants.insert(m_raygen.shader.get()); - for (const auto& missInfo : m_misses) dependants.insert(missInfo.shader.get()); - for (const auto& anyHitInfo : m_hitGroups.anyHits) dependants.insert(anyHitInfo.shader.get()); - for (const auto& closestHitInfo : m_hitGroups.closestHits) dependants.insert(closestHitInfo.shader.get()); - for (const auto& intersectionInfo : m_hitGroups.intersections) dependants.insert(intersectionInfo.shader.get()); - for (const auto& callableInfo : m_callables) dependants.insert(callableInfo.shader.get()); - return dependants; + return computeDependantsImpl(this); + } + + virtual core::unordered_set computeDependants() override final { + return computeDependantsImpl(this); } inline virtual std::span getSpecInfo(hlsl::ShaderStage stage) const override final @@ -108,6 +105,19 @@ class
ICPURayTracingPipeline final : public ICPUPipeline + requires(std::same_as, ICPURayTracingPipeline>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + core::unordered_set dependants; + dependants.insert(self->m_raygen.shader.get()); + for (const auto& missInfo : self->m_misses) dependants.insert(missInfo.shader.get()); + for (const auto& anyHitInfo : self->m_hitGroups.anyHits) dependants.insert(anyHitInfo.shader.get()); + for (const auto& closestHitInfo : self->m_hitGroups.closestHits) dependants.insert(closestHitInfo.shader.get()); + for (const auto& intersectionInfo : self->m_hitGroups.intersections) dependants.insert(intersectionInfo.shader.get()); + for (const auto& callableInfo : self->m_callables) dependants.insert(callableInfo.shader.get()); + return dependants; + } }; } diff --git a/include/nbl/asset/ICPURenderpass.h b/include/nbl/asset/ICPURenderpass.h index bbb2e5003f..9cc73af881 100644 --- a/include/nbl/asset/ICPURenderpass.h +++ b/include/nbl/asset/ICPURenderpass.h @@ -43,6 +43,11 @@ class ICPURenderpass : public IRenderpass, public IAsset return {}; } + inline core::unordered_set computeDependants() override + { + return {}; + } + protected: inline ICPURenderpass(const SCreationParams& _params, const SCreationParamValidationResult& _validation) : IRenderpass(_params, _validation) {} inline ~ICPURenderpass() = default; diff --git a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index 8638a4965b..628785d2ab 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -71,6 +71,11 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, return {}; } + inline core::unordered_set computeDependants() override + { + return {}; + } + // inline const SCachedCreationParams& getCachedCreationParams() const {return 
IRenderpassIndependentPipeline::getCachedCreationParams();} inline SCachedCreationParams& getCachedCreationParams() diff --git a/include/nbl/asset/ICPUSampler.h b/include/nbl/asset/ICPUSampler.h index 46cac56ee0..ed11e7695d 100644 --- a/include/nbl/asset/ICPUSampler.h +++ b/include/nbl/asset/ICPUSampler.h @@ -73,6 +73,11 @@ class ICPUSampler : public ISampler, public IAsset { return {}; } + + inline core::unordered_set computeDependants() override + { + return {}; + } }; } diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h index ce03a9be54..51be7acc5a 100644 --- a/include/nbl/asset/ICPUSkeleton.h +++ b/include/nbl/asset/ICPUSkeleton.h @@ -81,9 +81,23 @@ class ICPUSkeleton final : public ISkeleton, public IAsset inline core::unordered_set computeDependants() const override { - return { m_defaultTransforms.buffer.get(), m_parentJointIDs.buffer.get() }; + return computeDependantsImpl(this); } + inline core::unordered_set computeDependants() override + { + return computeDependantsImpl(this); + } + + private: + template + requires(std::same_as, ICPUSkeleton>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + core::unordered_set dependants; + dependants.insert(self->m_defaultTransforms.buffer.get()); dependants.insert(self->m_parentJointIDs.buffer.get()); + return dependants; + } }; } diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index 5abd7d1980..59286e219d 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -52,7 +52,12 @@ class IShader : public IAsset inline core::unordered_set computeDependants() const override { - return { m_code.get() }; + return computeDependantsImpl(this); + } + + inline core::unordered_set computeDependants() override + { + return computeDependantsImpl(this); } // @@ -101,6 +106,14 @@ class IShader : public IAsset std::string m_filepathHint; core::smart_refctd_ptr m_code; E_CONTENT_TYPE m_contentType; + + private: + template
+ requires(std::same_as, IShader>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + return core::unordered_set{self->m_code.get()}; + } }; } diff --git a/src/nbl/asset/ICPUDescriptorSet.cpp b/src/nbl/asset/ICPUDescriptorSet.cpp index a298fea491..a95074fdb7 100644 --- a/src/nbl/asset/ICPUDescriptorSet.cpp +++ b/src/nbl/asset/ICPUDescriptorSet.cpp @@ -108,35 +108,47 @@ core::smart_refctd_ptr ICPUDescriptorSet::clone(uint32_t _depth) const return cp; } -core::unordered_set ICPUDescriptorSet::computeDependants() const -{ - core::unordered_set dependants = { m_layout.get() }; - for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) - { - if (!m_descriptorInfos[i]) continue; - const auto size = m_descriptorInfos[i]->size(); - for (auto desc_i = 0u; desc_i < size; desc_i++) +template + requires(std::same_as, ICPUDescriptorSet>) +static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + core::unordered_set dependants = { self->m_layout.get() }; + for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) { - auto* desc = m_descriptorInfos[i]->operator[](desc_i).desc.get(); - if (!desc) continue; - switch (IDescriptor::GetTypeCategory(static_cast(i))) + if (!self->m_descriptorInfos[i]) continue; + const auto size = self->m_descriptorInfos[i]->size(); + for (auto desc_i = 0u; desc_i < size; desc_i++) { - case IDescriptor::EC_BUFFER: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_SAMPLER: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_IMAGE: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_BUFFER_VIEW: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_ACCELERATION_STRUCTURE: - dependants.insert(static_cast(desc)); - default: - break; + auto* desc = self->m_descriptorInfos[i]->operator[](desc_i).desc.get(); + if (!desc) continue; + switch 
(IDescriptor::GetTypeCategory(static_cast(i))) + { + case IDescriptor::EC_BUFFER: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_SAMPLER: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_IMAGE: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_BUFFER_VIEW: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_ACCELERATION_STRUCTURE: + dependants.insert(static_cast(desc)); + default: + break; + } } } - } - return dependants; + return dependants; +} + +core::unordered_set ICPUDescriptorSet::computeDependants() const +{ + return computeDependantsImpl(this); +} + +core::unordered_set ICPUDescriptorSet::computeDependants() +{ + return computeDependantsImpl(this); } } \ No newline at end of file From 2ac65f64277bb9bf3c9288104e36f32e639421f2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 16 May 2025 18:26:59 +0700 Subject: [PATCH 133/346] Refactor anyDependantDiscardedContents and discardDependantsContents --- include/nbl/asset/IPreHashed.h | 56 +++++++++++++--------------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/include/nbl/asset/IPreHashed.h b/include/nbl/asset/IPreHashed.h index 4ffda209df..86e1841f61 100644 --- a/include/nbl/asset/IPreHashed.h +++ b/include/nbl/asset/IPreHashed.h @@ -41,36 +41,31 @@ class IPreHashed : public IAsset static inline void discardDependantsContents(const std::span roots) { - struct stack_entry_t - { - const IAsset* asset; - core::unordered_set unvisitedChilds; - }; - core::stack stack; - core::unordered_set alreadyVisited; - auto push = [&stack,&alreadyVisited](const IAsset* node) -> void + core::stack stack; + core::unordered_set alreadyVisited; // whether we have push the node to the stack + core::unordered_set alreadyDescended; // whether we have push the children to the stack + auto push = [&stack,&alreadyVisited](IAsset* node) -> void { if (!node) return; const auto [dummy,inserted] = alreadyVisited.insert(node); if (inserted) - stack.push({ .asset = 
node, .unvisitedChilds = node->computeDependants()}); + stack.push(node); }; for (const auto& root : roots) push(root); while (!stack.empty()) { - auto& entry = stack.top(); - if (entry.unvisitedChilds.size() > 0) + auto* entry = stack.top(); + const auto [dummy, inserted] = alreadyDescended.insert(entry); + if (inserted) { - auto dep = *entry.unvisitedChilds.begin(); - entry.unvisitedChilds.erase(entry.unvisitedChilds.begin()); - push(dep); - } - else + core::unordered_set dependants = entry->computeDependants(); + for (auto* dependant : dependants) push(dependant); + } else { // post order traversal does discard - auto* isPrehashed = dynamic_cast(entry.asset); + auto* isPrehashed = dynamic_cast(entry); if (isPrehashed) isPrehashed->discardContent(); stack.pop(); @@ -79,13 +74,9 @@ class IPreHashed : public IAsset } static inline bool anyDependantDiscardedContents(const IAsset* root) { - struct stack_entry_t - { - const IAsset* asset; - core::unordered_set unvisitedChilds; - }; - core::stack stack; - core::unordered_set alreadyVisited; + core::stack stack; + core::unordered_set alreadyVisited; // whether we have push the node to the stack + core::unordered_set alreadyDescended; // whether we have push the children to the stack auto push = [&stack,&alreadyVisited](const IAsset* node) -> bool { if (!node) @@ -96,7 +87,7 @@ class IPreHashed : public IAsset auto* isPrehashed = dynamic_cast(node); if (isPrehashed && isPrehashed->missingContent()) return true; - stack.push({ .asset = node, .unvisitedChilds = node->computeDependants() }); + stack.push(node); } return false; }; @@ -104,16 +95,13 @@ class IPreHashed : public IAsset return true; while (!stack.empty()) { - auto& entry = stack.top(); - auto& unvisitedChilds = entry.unvisitedChilds; - if (unvisitedChilds.size() > 0) + auto* entry = stack.top(); + const auto [dummy, inserted] = alreadyDescended.insert(entry); + if (inserted) { - auto dep = *unvisitedChilds.begin(); - 
unvisitedChilds.erase(unvisitedChilds.begin()); - if (push(dep)) - return true; - } - else + core::unordered_set dependants = entry->computeDependants(); + for (auto* dependant : dependants) push(dependant); + } else stack.pop(); } return false; From 209ecf3478357de8f9b2d3d892971d9464305d34 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 19 May 2025 14:50:39 +0200 Subject: [PATCH 134/346] correct policy setup, propagate to all 3rdparty projects; silents some warnings and fixes an issue with bz2 error only on first configure run --- 3rdparty/CMakeLists.txt | 3 +++ CMakeLists.txt | 22 +++++++++++++++------- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index ffbf8e4cbd..5bd2d6859f 100755 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -4,6 +4,9 @@ include(../cmake/common.cmake) +project(Nabla-3rdparty LANGUAGES CXX C) +enable_language(C CXX ASM ASM_NASM) + option(NBL_FORCE_RELEASE_3RDPARTY "Force map 3rdaprty's configuration regardless Nabla configuration to Release" OFF) option(NBL_FORCE_RELWITHDEBINFO_3RDPARTY "Force map 3rdaprty's configuration regardless Nabla configuration to RelWithDebInfo" OFF) diff --git a/CMakeLists.txt b/CMakeLists.txt index a63d30a89d..f24877148b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,16 +1,24 @@ # Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. # This file is part of the "Nabla Engine". 
# For conditions of distribution and use, see copyright notice in nabla.h.in or nabla.h - cmake_minimum_required(VERSION 3.31) -# TODO: Yas - once we deploy 4.x we will fire `cmake_policy` instead of manually picking policies + +# TODO: Yas - once we deploy 4.x we will fire `cmake_policy(VERSION [...])` instead of manually picking policies # https://cmake.org/cmake/help/latest/command/cmake_minimum_required.html#policy-version # also we should update deps which throw warnings about < 3.10 compatibility -cmake_policy(SET CMP0003 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0077.html#cmp0077 -cmake_policy(SET CMP0077 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0077.html#cmp0077 -cmake_policy(SET CMP0112 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0112.html#cmp0112 -cmake_policy(SET CMP0141 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0141.html#policy:CMP0141 -cmake_policy(SET CMP0118 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0118.html#policy:CMP0118 + +macro(NBL_POLICY P S) +if(POLICY ${P}) + cmake_policy(SET ${P} ${S}) + set(CMAKE_POLICY_DEFAULT_${P} ${S}) +endif() +endmacro() + +NBL_POLICY(CMP0003 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0003.html#cmp0003 +NBL_POLICY(CMP0077 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0077.html#cmp0077 +NBL_POLICY(CMP0112 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0112.html#cmp0112 +NBL_POLICY(CMP0141 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0141.html#policy:CMP0141 +NBL_POLICY(CMP0118 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0118.html#policy:CMP0118 set(NBL_BUILD_ANDROID OFF) From ae27b7df8f593880432360f4796386800b2f0c59 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 19 May 2025 14:51:47 +0200 Subject: [PATCH 135/346] enforce some constraints, stop the double instantiation of `Triangles` and `AABBs` with `const BufferType` and `BufferType` --- include/nbl/asset/IAccelerationStructure.h | 30 ++++++--- 
include/nbl/video/IGPUAccelerationStructure.h | 20 +++--- include/nbl/video/ILogicalDevice.h | 66 ++++++++++++------- src/nbl/video/CVulkanAccelerationStructure.h | 10 +-- src/nbl/video/CVulkanLogicalDevice.h | 42 ++++++------ src/nbl/video/IGPUAccelerationStructure.cpp | 4 +- src/nbl/video/utilities/CAssetConverter.cpp | 41 ++---------- 7 files changed, 104 insertions(+), 109 deletions(-) diff --git a/include/nbl/asset/IAccelerationStructure.h b/include/nbl/asset/IAccelerationStructure.h index 0efe6781ae..a29d27b828 100644 --- a/include/nbl/asset/IAccelerationStructure.h +++ b/include/nbl/asset/IAccelerationStructure.h @@ -88,19 +88,32 @@ class IBottomLevelAccelerationStructure : public IAccelerationStructure NO_DUPLICATE_ANY_HIT_INVOCATION_BIT = 0x1u<<1u, }; + enum class GeometryType : uint8_t + { + Triangles = 0, + AABBs = 1, + // Later: LSS and friends + Count = 2 + }; + // Note that in Vulkan strides are 64-bit value but restricted to be 32-bit in range - template requires std::is_base_of_v + template requires (!std::is_const_v && std::is_base_of_v) struct Triangles { public: - using buffer_t = std::remove_const_t; - constexpr static inline bool Host = std::is_same_v; + using buffer_t = BufferType; + constexpr static inline GeometryType Type = GeometryType::Triangles; + + private: + constexpr static inline bool HostTransform = std::is_same_v; + + public: // we make our life easier by not taking pointers to single matrix values - using transform_t = std::conditional_t>; + using transform_t = std::conditional_t>; inline bool hasTransform() const { - if constexpr (Host) + if constexpr (HostTransform) return !core::isnan(transform[0][0]); else return bool(transform.buffer); @@ -122,17 +135,18 @@ class IBottomLevelAccelerationStructure : public IAccelerationStructure private: constexpr static transform_t __transform_initializer() { - if constexpr (Host) + if constexpr (HostTransform) return hlsl::float32_t3x4(std::numeric_limits::quiet_NaN()); return {}; } }; // - 
template requires std::is_base_of_v + template requires (!std::is_const_v && std::is_base_of_v) struct AABBs { - using buffer_t = std::remove_const_t; + using buffer_t = BufferType; + constexpr static inline GeometryType Type = GeometryType::Triangles; // for `MOTION_BIT` you don't get a second buffer for AABBs at different times because linear interpolation of AABBs doesn't work asset::SBufferBinding data = {}; diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h index af541bdccb..b7c1858130 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -45,7 +45,7 @@ class IGPUAccelerationStructure : public IBackendObject #endif //! builds - template + template requires (!std::is_const_v && std::is_base_of_v) struct BuildInfo { public: @@ -112,7 +112,7 @@ class IGPUAccelerationStructure : public IBackendObject IGPUAccelerationStructure* dst = nullptr; COPY_MODE mode = COPY_MODE::CLONE; }; - template + template requires (!std::is_const_v && std::is_base_of_v) struct CopyToMemoryInfo { const IGPUAccelerationStructure* src = nullptr; @@ -121,7 +121,7 @@ class IGPUAccelerationStructure : public IBackendObject }; using DeviceCopyToMemoryInfo = CopyToMemoryInfo; using HostCopyToMemoryInfo = CopyToMemoryInfo; - template + template requires (!std::is_const_v && std::is_base_of_v) struct CopyFromMemoryInfo { asset::SBufferBinding src = nullptr; @@ -181,7 +181,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat using DirectBuildRangeRangeInfos = const BuildRangeInfo* const*; using MaxInputCounts = const uint32_t* const; - template + template requires (!std::is_const_v && std::is_base_of_v) struct BuildInfo final : IGPUAccelerationStructure::BuildInfo { private: @@ -203,7 +203,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat NBL_API2 uint32_t valid(const T* const buildRangeInfosOrMaxPrimitiveCounts) 
const; // really expensive to call, `valid` only calls it when `_NBL_DEBUG` is defined - inline bool validGeometry(size_t& totalPrims, const AABBs& geometry, const BuildRangeInfo& buildRangeInfo) const + inline bool validGeometry(size_t& totalPrims, const AABBs& geometry, const BuildRangeInfo& buildRangeInfo) const { constexpr size_t AABBalignment = 8ull; // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureBuildRangeInfoKHR-primitiveOffset-03659 @@ -222,7 +222,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat totalPrims += buildRangeInfo.primitiveCount; return true; } - inline bool validGeometry(size_t& totalPrims, const Triangles& geometry, const BuildRangeInfo& buildRangeInfo) const + inline bool validGeometry(size_t& totalPrims, const Triangles& geometry, const BuildRangeInfo& buildRangeInfo) const { // if (!dstAS->validVertexFormat(geometry.vertexFormat)) @@ -306,7 +306,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat *(oit++) = core::smart_refctd_ptr(srcAS); *(oit++) = core::smart_refctd_ptr(dstAS); - if (buildFlags.hasFlags(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) + if (buildFlags.hasFlags(asset::IBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) { for (auto i=0u; i* triangles = nullptr; - const AABBs* aabbs; + const Triangles* triangles = nullptr; + const AABBs* aabbs; }; }; using DeviceBuildInfo = BuildInfo; @@ -393,7 +393,7 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr using DirectBuildRangeRangeInfos = const BuildRangeInfo*; using MaxInputCounts = const uint32_t; - template + template requires (!std::is_const_v && std::is_base_of_v) struct BuildInfo final : IGPUAccelerationStructure::BuildInfo { private: diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 8ad3b839ab..b23afa2679 100644 --- 
a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -412,19 +412,20 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe }; // fun fact: you can use garbage/invalid pointers/offset for the Device/Host addresses of the per-geometry data, just make sure what was supposed to be null is null template requires nbl::is_any_of_v, - IGPUBottomLevelAccelerationStructure::Triangles, - IGPUBottomLevelAccelerationStructure::AABBs, - IGPUBottomLevelAccelerationStructure::AABBs + asset::IBottomLevelAccelerationStructure::Triangles, + asset::IBottomLevelAccelerationStructure::Triangles, + asset::IBottomLevelAccelerationStructure::AABBs, + asset::IBottomLevelAccelerationStructure::AABBs > inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes( - const core::bitflag flags, + const bool hostBuild, + const core::bitflag flags, const bool motionBlur, const std::span geometries, const uint32_t* const pMaxPrimitiveCounts ) const { - if (invalidFeaturesForASBuild(motionBlur)) + if (invalidFeaturesForASBuild(hostBuild,motionBlur)) { NBL_LOG_ERROR("Required features are not enabled"); return {}; @@ -455,13 +456,29 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe uint32_t primsFree = limits.maxAccelerationStructurePrimitiveCount; for (auto i=0u; i,Geometry>) + const auto& geom = geometries[i]; + if constexpr (Geometry::Type==asset::IBottomLevelAccelerationStructure::GeometryType::Triangles) { - // TODO: do we check `maxVertex`, `vertexStride` and `indexType` for validity? 
+ if (flags.hasFlags(asset::IBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) + { + NBL_LOG_ERROR("Primitive type is Triangles but build flag says BLAS build is AABBs"); + return {}; + } + if (!getPhysicalDevice()->getBufferFormatUsages()[geom.vertexFormat].accelerationStructureVertex) + { + NBL_LOG_ERROR("Vertex Format %d not supported as Acceleration Structure Vertex Position Input on this Device",geom.vertexFormat); + return {}; + } + // TODO: do we check `maxVertex`, `vertexStride` and `indexType` for validity } - if constexpr (std::is_same_v,Geometry>) + if constexpr (Geometry::Type==asset::IBottomLevelAccelerationStructure::GeometryType::AABBs) { - // TODO: check stride and geometry flags for validity? + if (!flags.hasFlags(asset::IBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) + { + NBL_LOG_ERROR("Primitive type is AABB but build flag says BLAS build is not AABBs"); + return {}; + } + // TODO: check stride and geometry flags for validity } if (pMaxPrimitiveCounts[i] > primsFree) { @@ -471,16 +488,16 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe primsFree -= pMaxPrimitiveCounts[i]; } - return getAccelerationStructureBuildSizes_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes( const bool hostBuild, - const core::bitflag flags, + const core::bitflag flags, const bool motionBlur, const uint32_t maxInstanceCount ) const { - if (invalidFeaturesForASBuild(motionBlur)) + if (invalidFeaturesForASBuild(hostBuild,motionBlur)) { NBL_LOG_ERROR("Required features are not enabled"); return {}; @@ -504,7 +521,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } // little utility template - inline AccelerationStructureBuildSizes 
getAccelerationStructureBuildSizes(const core::bitflag flags, const bool motionBlur, const uint32_t maxInstanceCount) const + inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes(const core::bitflag flags, const bool motionBlur, const uint32_t maxInstanceCount) const { return getAccelerationStructureBuildSizes(std::is_same_v,asset::ICPUBuffer>,flags,motionBlur,maxInstanceCount); } @@ -1070,20 +1087,20 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual core::smart_refctd_ptr createTopLevelAccelerationStructure_impl(IGPUTopLevelAccelerationStructure::SCreationParams&& params) = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag 
flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( const bool hostBuild, const core::bitflag flags, @@ -1333,8 +1350,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } return false; } - template - bool invalidFeaturesForASBuild(const bool motionBlur) const + bool invalidFeaturesForASBuild(const bool hostBuild, const bool motionBlur) const { // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkGetAccelerationStructureBuildSizesKHR-accelerationStructure-08933 if (!m_enabledFeatures.accelerationStructure) @@ -1343,7 +1359,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return true; } // not sure of VUID - if (std::is_same_v && !m_enabledFeatures.accelerationStructureHostCommands) + if (hostBuild && !m_enabledFeatures.accelerationStructureHostCommands) { NBL_LOG_ERROR("Feature `acceleration structure` host commands is not enabled"); return true; diff --git a/src/nbl/video/CVulkanAccelerationStructure.h b/src/nbl/video/CVulkanAccelerationStructure.h index b6c06f158d..42fefaa6d1 100644 --- a/src/nbl/video/CVulkanAccelerationStructure.h +++ b/src/nbl/video/CVulkanAccelerationStructure.h @@ -118,7 +118,7 @@ inline VkGeometryFlagsKHR getVkGeometryFlagsFrom(const IGPUBottomLevelAccelerati // The srcAccelerationStructure, dstAccelerationStructure, and mode members of pBuildInfo are ignored. 
Any VkDeviceOrHostAddressKHR members of pBuildInfo are ignored by this command static const VkDeviceOrHostAddressConstKHR NullAddress = { 0x0ull }; template -void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase) +void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase) { static const VkDeviceOrHostAddressConstKHR DummyNonNullAddress = { 0xdeadbeefBADC0FFEull }; @@ -129,7 +129,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles(triangles.indexType); - outBase.geometry.triangles.indexData = QueryOnly ? NullAddress:getVkDeviceOrHostAddress(triangles.indexData); + outBase.geometry.triangles.indexData = triangles.indexType==asset::E_INDEX_TYPE::EIT_UNKNOWN || QueryOnly ? NullAddress:getVkDeviceOrHostAddress(triangles.indexData); // except that the hostAddress member of VkAccelerationStructureGeometryTrianglesDataKHR::transformData will be examined to check if it is NULL. 
if (!triangles.hasTransform()) outBase.geometry.triangles.transformData = NullAddress; @@ -145,7 +145,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles -void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase, VkAccelerationStructureGeometryMotionTrianglesDataNV* &p_vertexMotion) +void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase, VkAccelerationStructureGeometryMotionTrianglesDataNV* &p_vertexMotion) { getVkASGeometryFrom(triangles,outBase); if (triangles.vertexData[1].buffer) @@ -158,7 +158,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles -void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::AABBs& aabbs, VkAccelerationStructureGeometryKHR& outBase) +void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::AABBs& aabbs, VkAccelerationStructureGeometryKHR& outBase) { outBase = {VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_KHR,nullptr,VK_GEOMETRY_TYPE_AABBS_KHR}; outBase.geometry.aabbs = {VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_AABBS_DATA_KHR,nullptr}; @@ -221,7 +221,7 @@ inline VkAccelerationStructureBuildGeometryInfoKHR getVkASBuildGeometryInfo(cons for (auto j=0u; j flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const override { - return getAccelerationStructureBuildSizes_impl_impl_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl_impl_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const 
std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const override { - return getAccelerationStructureBuildSizes_impl_impl_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl_impl_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const override { - return getAccelerationStructureBuildSizes_impl_impl_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl_impl_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const override { - return getAccelerationStructureBuildSizes_impl_impl_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl_impl_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } template inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl_impl_impl( - const core::bitflag flags, const bool motionBlur, + const bool hostBuild, const core::bitflag flags, const bool motionBlur, const std::span geometries, const uint32_t* const pMaxPrimitiveCounts ) const { - constexpr bool IsAABB = 
std::is_same_v>; + constexpr bool IsTriangle = Geometry::Type==asset::IBottomLevelAccelerationStructure::GeometryType::Triangles; core::vector vk_geometries(geometries.size()); - core::vector vk_triangleMotions(IsAABB ? 0u:geometries.size()); + core::vector vk_triangleMotions(IsTriangle ? geometries.size():0u); auto outTriangleMotions = vk_triangleMotions.data(); for (auto i=0u; i(geometries[i],vk_geometries[i]); - else + if constexpr (IsTriangle) getVkASGeometryFrom(geometries[i],vk_geometries[i],outTriangleMotions); + else + getVkASGeometryFrom(geometries[i],vk_geometries[i]); } - return getAccelerationStructureBuildSizes_impl_impl( - std::is_same_v,false, - getVkASBuildFlagsFrom(flags,motionBlur), - vk_geometries,pMaxPrimitiveCounts - ); + return getAccelerationStructureBuildSizes_impl_impl(hostBuild,false,getVkASBuildFlagsFrom(flags,motionBlur),vk_geometries,pMaxPrimitiveCounts); } AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( diff --git a/src/nbl/video/IGPUAccelerationStructure.cpp b/src/nbl/video/IGPUAccelerationStructure.cpp index b975742436..e994123616 100644 --- a/src/nbl/video/IGPUAccelerationStructure.cpp +++ b/src/nbl/video/IGPUAccelerationStructure.cpp @@ -5,7 +5,7 @@ namespace nbl::video { -template +template requires (!std::is_const_v && std::is_base_of_v) bool IGPUAccelerationStructure::BuildInfo::invalid(const IGPUAccelerationStructure* const src, const IGPUAccelerationStructure* const dst) const { // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkBuildAccelerationStructuresIndirectKHR-dstAccelerationStructure-03800 @@ -61,7 +61,7 @@ bool IGPUAccelerationStructure::BuildInfo::invalid(const IGPUAcceler //extern template class IGPUAccelerationStructure::BuildInfo; -template +template requires (!std::is_const_v && std::is_base_of_v) template// requires nbl::is_any_of_v,uint32_t,IGPUBottomLevelAccelerationStructure::BuildRangeInfo>,IGPUBottomLevelAccelerationStructure::BuildRangeInfo> 
uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const T* const buildRangeInfosOrMaxPrimitiveCounts) const { diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 285a1dce1d..5d16c5bb9b 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2853,47 +2853,16 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult else { const uint32_t* pPrimitiveCounts = as->getGeometryPrimitiveCounts().data(); - // the code here is not pretty, but DRY-ing is of this is for later -// TODO: ILogicalDevice needs code to query build sizes of ICPUBottomLevelAccelerationStructure geometries! if (buildFlags.hasFlags(ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) { - const auto geoms = as->getAABBGeometries(); - if (patch.hostBuild) - { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() - }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pPrimitiveCounts); - } - else - { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() - }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pPrimitiveCounts); - } - // TODO: check if the strides need to be aligned to 4 bytes for AABBs + sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,as->getAABBGeometries(),pPrimitiveCounts); for (const auto& geom : geoms) if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount) incrementBuildSize(aabbCount*geom.stride,alignof(float)); } else { - const auto geoms = as->getTriangleGeometries(); - if (patch.hostBuild) - { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() - }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pPrimitiveCounts); - } - else - { - const std::span> cpuGeoms = { - 
reinterpret_cast*>(geoms.data()),geoms.size() - }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pPrimitiveCounts); - } + sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,as->getTriangleGeometries(),pPrimitiveCounts); for (const auto& geom : geoms) if (const auto triCount=*(pPrimitiveCounts++); triCount) { @@ -4683,8 +4652,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul allocSizes.reserve(asCount); // BLAS and TLAS specific things core::vector geometryRangeInfo; - core::vector> triangles; - core::vector> aabbs; + core::vector> triangles; + core::vector> aabbs; core::vector> trackedBLASes; if constexpr (IsTLAS) trackedBLASes.reserve(asCount); @@ -5034,7 +5003,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul .geometryFlags = geom.geometryFlags }); } - buildInfo.aabbs = reinterpret_cast* const&>(aabbsOffset); + buildInfo.aabbs = reinterpret_cast* const&>(aabbsOffset); } else { From 292f792e65e066a1189c7991e1021e29ab9656f9 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 19 May 2025 15:32:02 +0200 Subject: [PATCH 136/346] update DXC pointer (to Clang fixes merge commit) --- 3rdparty/dxc/dxc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index 4621c707ed..71f2766da9 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit 4621c707ed774ab8382391f6434810ebecd37111 +Subproject commit 71f2766da918d33d34fefac270fdee983a06dd20 From b3b2b0301fc36ef8df8ee01df6de1a0080713b05 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 19 May 2025 15:51:49 +0200 Subject: [PATCH 137/346] post-merge updates, correct IBottomLevelAccelerationStructure::BUILD_FLAGS's initial casts --- include/nbl/asset/IAccelerationStructure.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/nbl/asset/IAccelerationStructure.h 
b/include/nbl/asset/IAccelerationStructure.h index d251dd3077..eac16d8d32 100644 --- a/include/nbl/asset/IAccelerationStructure.h +++ b/include/nbl/asset/IAccelerationStructure.h @@ -59,11 +59,11 @@ class IBottomLevelAccelerationStructure : public IAccelerationStructure // build flags, we don't expose flags that don't make sense for certain levels enum class BUILD_FLAGS : uint16_t { - ALLOW_UPDATE_BIT = base_build_flags_t::ALLOW_UPDATE_BIT, - ALLOW_COMPACTION_BIT = base_build_flags_t::ALLOW_COMPACTION_BIT, - PREFER_FAST_TRACE_BIT = base_build_flags_t::PREFER_FAST_TRACE_BIT, - PREFER_FAST_BUILD_BIT = base_build_flags_t::PREFER_FAST_BUILD_BIT, - LOW_MEMORY_BIT = base_build_flags_t::LOW_MEMORY_BIT, + ALLOW_UPDATE_BIT = static_cast(base_build_flags_t::ALLOW_UPDATE_BIT), + ALLOW_COMPACTION_BIT = static_cast(base_build_flags_t::ALLOW_COMPACTION_BIT), + PREFER_FAST_TRACE_BIT = static_cast(base_build_flags_t::PREFER_FAST_TRACE_BIT), + PREFER_FAST_BUILD_BIT = static_cast(base_build_flags_t::PREFER_FAST_BUILD_BIT), + LOW_MEMORY_BIT = static_cast(base_build_flags_t::LOW_MEMORY_BIT), // Synthetic flag we use to indicate that the build data are AABBs instead of triangles, we've taken away the per-geometry choice thanks to: // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureBuildGeometryInfoKHR-type-03792 GEOMETRY_TYPE_IS_AABB_BIT = 0x1u<<5u, From 6dda1e265afd4fe32e128a71edef62c2bf2d729c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Jos=C3=A9=20Letterio?= <40742817+Fletterio@users.noreply.github.com> Date: Mon, 19 May 2025 20:06:50 -0300 Subject: [PATCH 138/346] Add a bunch of missing `const` in demote_promote_writer_readers_lock.h --- .../nbl/system/demote_promote_writer_readers_lock.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/nbl/system/demote_promote_writer_readers_lock.h b/include/nbl/system/demote_promote_writer_readers_lock.h index 6823c26c27..5447e65f3e 100644 --- 
a/include/nbl/system/demote_promote_writer_readers_lock.h +++ b/include/nbl/system/demote_promote_writer_readers_lock.h @@ -271,7 +271,7 @@ class demote_promote_writer_readers_lock_debug struct DefaultPreemptionCheck { - bool operator()(state_lock_value_t oldState) + bool operator()(const state_lock_value_t oldState) { return false; } @@ -361,13 +361,13 @@ class dpwr_lock_guard_base /** * @brief Checks whether this guard is currently locking the lock `lk` */ - bool hasLocked(dpwr_lock_t& lk) const + bool hasLocked(const dpwr_lock_t& lk) const { return m_lock == &lk; } protected: - dpwr_lock_guard_base(dpwr_lock_t& lk) noexcept : m_lock(&lk) {} + dpwr_lock_guard_base(const dpwr_lock_t& lk) noexcept : m_lock(&lk) {} dpwr_lock_t* m_lock; }; @@ -385,7 +385,7 @@ class dpwr_read_lock_guard_debug : public impl::dpwr_lock_guard_base; using dpwr_write_lock_guard_debug_t = dpwr_write_lock_guard_debug; - dpwr_read_lock_guard_debug(dpwr_lock_t& lk, std::adopt_lock_t) : base_t(lk) {} + dpwr_read_lock_guard_debug(const dpwr_lock_t& lk, std::adopt_lock_t) : base_t(lk) {} explicit dpwr_read_lock_guard_debug(dpwr_lock_t& lk) : dpwr_read_lock_guard_debug(lk, std::adopt_lock_t()) { this->m_lock->read_lock(); @@ -406,7 +406,7 @@ class dpwr_write_lock_guard_debug : public impl::dpwr_lock_guard_base; using dpwr_read_lock_guard_debug_t = dpwr_read_lock_guard_debug; - dpwr_write_lock_guard_debug(dpwr_lock_t& lk, std::adopt_lock_t) : base_t(lk) {} + dpwr_write_lock_guard_debug(const dpwr_lock_t& lk, std::adopt_lock_t) : base_t(lk) {} explicit dpwr_write_lock_guard_debug(dpwr_lock_t& lk) : dpwr_write_lock_guard_debug(lk, std::adopt_lock_t()) { this->m_lock->write_lock(); From 004c95adc9a3b1a002200d059738f30aede4c3f1 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 20 May 2025 12:05:48 +0700 Subject: [PATCH 139/346] fixed minor bug --- examples_tests | 2 +- include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/examples_tests b/examples_tests index e828dc49ef..f4af3edc1c 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit e828dc49ef0a223dcbb8b4af8d722974747f29ee +Subproject commit f4af3edc1cd8d152f6c67bd15577b2595cb2a43f diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 88ff328e05..12f65420ca 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -19,7 +19,7 @@ template struct virtual_wg_size_log2 { static_assert(WorkgroupSizeLog2>=SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); - static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2+4, "WorkgroupSize cannot be larger than SubgroupSize*16"); + // static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2+4, "WorkgroupSize cannot be larger than SubgroupSize*16"); NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v+SubgroupSizeLog2; }; From 9bd76f904b05b835f4f8ea42396ac1b5419e26c3 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 20 May 2025 09:26:19 +0200 Subject: [PATCH 140/346] add docker/msvc-winsdk submodule --- .gitmodules | 5 ++++- docker/msvc-winsdk | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) create mode 160000 docker/msvc-winsdk diff --git a/.gitmodules b/.gitmodules index 584ff16d65..00482441de 100644 --- a/.gitmodules +++ b/.gitmodules @@ -117,4 +117,7 @@ url = git@github.com:Devsh-Graphics-Programming/Compiler-Explorer-Docker.git [submodule "3rdparty/glm"] path = 3rdparty/glm - url = git@github.com:Devsh-Graphics-Programming/glm.git \ No newline at end of file + url = git@github.com:Devsh-Graphics-Programming/glm.git +[submodule "docker/msvc-winsdk"] + path = docker/msvc-winsdk 
+ url = ../docker-nanoserver-msvc-winsdk diff --git a/docker/msvc-winsdk b/docker/msvc-winsdk new file mode 160000 index 0000000000..8aa9e767ec --- /dev/null +++ b/docker/msvc-winsdk @@ -0,0 +1 @@ +Subproject commit 8aa9e767ec60aa77f477ac6cf41728e997dcc950 From 0abbb21ad5f414980480e0c2f4135d631d8cc1c2 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 20 May 2025 11:19:55 +0200 Subject: [PATCH 141/346] get stuff to compile again --- include/nbl/asset/IAccelerationStructure.h | 7 ++----- src/nbl/video/CVulkanAccelerationStructure.h | 4 ++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/include/nbl/asset/IAccelerationStructure.h b/include/nbl/asset/IAccelerationStructure.h index a29d27b828..665135f695 100644 --- a/include/nbl/asset/IAccelerationStructure.h +++ b/include/nbl/asset/IAccelerationStructure.h @@ -103,11 +103,8 @@ class IBottomLevelAccelerationStructure : public IAccelerationStructure public: using buffer_t = BufferType; constexpr static inline GeometryType Type = GeometryType::Triangles; - - private: - constexpr static inline bool HostTransform = std::is_same_v; - public: + constexpr static inline bool HostTransform = std::is_same_v; // we make our life easier by not taking pointers to single matrix values using transform_t = std::conditional_t>; @@ -146,7 +143,7 @@ class IBottomLevelAccelerationStructure : public IAccelerationStructure struct AABBs { using buffer_t = BufferType; - constexpr static inline GeometryType Type = GeometryType::Triangles; + constexpr static inline GeometryType Type = GeometryType::AABBs; // for `MOTION_BIT` you don't get a second buffer for AABBs at different times because linear interpolation of AABBs doesn't work asset::SBufferBinding data = {}; diff --git a/src/nbl/video/CVulkanAccelerationStructure.h b/src/nbl/video/CVulkanAccelerationStructure.h index 42fefaa6d1..8041927fa2 100644 --- a/src/nbl/video/CVulkanAccelerationStructure.h +++ b/src/nbl/video/CVulkanAccelerationStructure.h @@ -137,7 +137,7 @@ void 
getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles(triangles.transform); @@ -147,7 +147,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase, VkAccelerationStructureGeometryMotionTrianglesDataNV* &p_vertexMotion) { - getVkASGeometryFrom(triangles,outBase); + getVkASGeometryFrom(triangles,outBase); if (triangles.vertexData[1].buffer) { p_vertexMotion->sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_MOTION_TRIANGLES_DATA_NV; From ccacddbc5b2ca1bed787e38fdf50a459606e5376 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 20 May 2025 16:49:30 +0700 Subject: [PATCH 142/346] store temporaries with data accessor --- examples_tests | 2 +- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 40 +++++++++++-------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/examples_tests b/examples_tests index f4af3edc1c..44c34a8a65 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit f4af3edc1cd8d152f6c67bd15577b2595cb2a43f +Subproject commit 44c34a8a65866fb6304c12032efd08e2338c7116 diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index d53bfd6000..8bfd8b0194 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -151,20 +151,21 @@ struct scan using params_lv1_t = subgroup2::ArithmeticParams; BinOp binop; - vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); subgroup2::inclusive_scan inclusiveScan0; // level 0 scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.template get(idx * Config::WorkgroupSize + 
virtualInvocationIndex, scan_local[idx]); - scan_local[idx] = inclusiveScan0(scan_local[idx]); + vector_lv0_t value; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + value = inclusiveScan0(value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.template set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -188,23 +189,26 @@ struct scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { + vector_lv0_t value; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); scalar_t left; scratchAccessor.template get(virtualSubgroupID,left); if (Exclusive) { - scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(scan_local[idx][Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); + scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) - scan_local[idx][Config::ItemsPerInvocation_0-i-1] 
= binop(left, hlsl::mix(scan_local[idx][Config::ItemsPerInvocation_0-i-2], left_last_elem, (Config::ItemsPerInvocation_0-i-1==0))); + value[Config::ItemsPerInvocation_0-i-1] = binop(left, hlsl::mix(value[Config::ItemsPerInvocation_0-i-2], left_last_elem, (Config::ItemsPerInvocation_0-i-1==0))); } else { [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) - scan_local[idx][i] = binop(left, scan_local[idx][i]); + value[i] = binop(left, value[i]); } - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); } } }; @@ -303,20 +307,21 @@ struct scan using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; - vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); subgroup2::inclusive_scan inclusiveScan0; // level 0 scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); - scan_local[idx] = inclusiveScan0(scan_local[idx]); + vector_lv0_t value; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + value = inclusiveScan0(value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.template set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan 
(reduction) to level 1 scan + scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -368,23 +373,26 @@ struct scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { + vector_lv0_t value; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const scalar_t left; scratchAccessor.template get(virtualSubgroupID, left); if (Exclusive) { - scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(scan_local[idx][Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); + scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) - scan_local[idx][Config::ItemsPerInvocation_0-i-1] = binop(left, hlsl::mix(scan_local[idx][Config::ItemsPerInvocation_0-i-2], left_last_elem, (Config::ItemsPerInvocation_0-i-1==0))); + value[Config::ItemsPerInvocation_0-i-1] = binop(left, hlsl::mix(value[Config::ItemsPerInvocation_0-i-2], left_last_elem, (Config::ItemsPerInvocation_0-i-1==0))); } else { [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) - scan_local[idx][i] = binop(left, scan_local[idx][i]); + value[i] = binop(left, value[i]); } - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); } } }; From d69cd6026556b57552f2edc7ad82aa9795089591 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 20 May 2025 12:28:50 +0200 Subject: [PATCH 
143/346] correct the calculation of scratch memory needed, and avoid deadlock (allocation failure) due to worst case fragmentation also fix one memory freeing bug --- src/nbl/video/utilities/CAssetConverter.cpp | 99 ++++++++++++++------- 1 file changed, 65 insertions(+), 34 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 5d16c5bb9b..7c325cb17d 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2828,13 +2828,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult const auto buildFlags = patch.getBuildFlags(as); const auto outIx = i+entry.second.firstCopyIx; const auto uniqueCopyGroupID = conversionRequests.gpuObjUniqueCopyGroupIDs[outIx]; - // prevent CPU hangs by making sure allocator big enough to service us in worst case but with best case allocator (no other allocations, clean alloc) + // prevent CPU hangs by making sure allocator big enough to service us in worst case const auto minScratchAllocSize = patch.hostBuild ? 
inputs.scratchForHostASBuildMinAllocSize:inputs.scratchForDeviceASBuildMinAllocSize; - uint64_t buildSize = 0; uint32_t buildAlignment = 4; - auto incrementBuildSize = [minScratchAllocSize,&buildSize,&buildAlignment](const uint64_t size, const uint32_t alignment)->void + uint64_t buildSize = 0; + auto incrementBuildSize = [minScratchAllocSize,&buildSize](const uint64_t size, const uint32_t alignment)->void { - buildSize = core::alignUp(buildSize,alignment)+hlsl::max(size,minScratchAllocSize); - buildAlignment = hlsl::max(buildAlignment,alignment); + // account for fragmentation and misalignment + buildSize += hlsl::max(size,minScratchAllocSize)+hlsl::max(minScratchAllocSize,alignment)*2; }; ILogicalDevice::AccelerationStructureBuildSizes sizes = {}; const auto hashAsU64 = reinterpret_cast(entry.first.data); @@ -2855,35 +2855,45 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult const uint32_t* pPrimitiveCounts = as->getGeometryPrimitiveCounts().data(); if (buildFlags.hasFlags(ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) { - sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,as->getAABBGeometries(),pPrimitiveCounts); + const auto geoms = as->getAABBGeometries(); + sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,geoms,pPrimitiveCounts); for (const auto& geom : geoms) if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount) incrementBuildSize(aabbCount*geom.stride,alignof(float)); } else { - sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,as->getTriangleGeometries(),pPrimitiveCounts); + const auto geoms = as->getTriangleGeometries(); + sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,geoms,pPrimitiveCounts); for (const auto& geom : geoms) if (const auto triCount=*(pPrimitiveCounts++); triCount) { auto size = geom.vertexStride*(geom.vertexData[1] ? 
2:1)*geom.maxVertex; + uint16_t alignment = hlsl::max(0x1u<(alignof(float),alignment); + } + uint16_t indexSize = 0; switch (geom.indexType) { case E_INDEX_TYPE::EIT_16BIT: - alignment = alignof(uint16_t); + indexSize = sizeof(uint16_t); break; case E_INDEX_TYPE::EIT_32BIT: - alignment = alignof(uint32_t); + indexSize = sizeof(uint32_t); break; default: break; } - if (alignment) - size = core::alignUp(size,alignment)+triCount*3*alignment; - incrementBuildSize(size,hlsl::max(alignment,geom.vertexStride)); + if (indexSize) + { + size = core::alignUp(size,indexSize)+triCount*3*indexSize; + alignment = hlsl::max(indexSize,alignment); + } + inputs.logger.log("%p Triangle Data Size %d Align %d",system::ILogger::ELL_DEBUG,as,size,alignment); + incrementBuildSize(size,alignment); } } } @@ -2896,8 +2906,9 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult ); continue; } - // scratch gets allocated first - buildSize = core::alignUp(hlsl::max(sizes.buildScratchSize,minScratchAllocSize),buildAlignment)+buildSize; + // + incrementBuildSize(sizes.buildScratchSize,device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment); + inputs.logger.log("%p Scratch Size %d Combined %d",system::ILogger::ELL_DEBUG,as,sizes.buildScratchSize,buildSize); // we need to save the buffer in a side-channel for later auto& out = accelerationStructureParams[IsTLAS][entry.second.firstCopyIx+i]; @@ -4718,12 +4729,14 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul { submitsNeeded |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; // queue up a deferred allocation - params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data(),params.compute->getFutureScratchSemaphore()); + if (oldAllocCount) + params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data(),params.compute->getFutureScratchSemaphore()); } else { // release right away - 
params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data()); + if (oldAllocCount) + params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data()); for (const auto& info : buildInfos) { const auto stagingFound = findInStaging.template operator()(info.dstAS); @@ -4766,7 +4779,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul auto allocCount = 0; auto deallocSrc = core::makeRAIIExiter([¶ms,&allocOffsets,&allocSizes,&alignments,&allocCount]()->void { - const auto beginIx = allocSizes.size()-alignments.size(); + const auto beginIx = allocSizes.size()-allocCount; // if got to end of loop queue up the release of memory, otherwise release right away if (allocCount) params.scratchForDeviceASBuild->multi_deallocate(allocCount,allocOffsets.data()+beginIx,allocSizes.data()+beginIx); @@ -4837,42 +4850,60 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul if (const auto triCount=*(pPrimitiveCounts++); triCount) { auto size = geom.vertexStride*(geom.vertexData[1] ? 
2:1)*geom.maxVertex; + uint16_t alignment = hlsl::max(0x1u<(alignof(float),alignment); + } + uint16_t indexSize = 0u; switch (geom.indexType) { case E_INDEX_TYPE::EIT_16BIT: - alignment = alignof(uint16_t); + indexSize = alignof(uint16_t); break; case E_INDEX_TYPE::EIT_32BIT: - alignment = alignof(uint32_t); + indexSize = alignof(uint32_t); break; default: break; } - if (alignment) - size = core::alignUp(size,alignment)+triCount*3*alignment; + if (indexSize) + { + size = core::alignUp(size,indexSize)+triCount*3*indexSize; + alignment = hlsl::max(indexSize,alignment); + } allocSizes.push_back(size); - alignments.push_back(hlsl::max(alignment,geom.vertexStride)); + alignments.push_back(alignment); + const auto tmp = asToBuild.second.scratchSize; + logger.log("%p Triangle Data Size %d Align %d Scratch Size %d",system::ILogger::ELL_DEBUG,canonical.get(),size,alignment,tmp); } } } allocOffsets.resize(allocSizes.size(),scratch_allocator_t::invalid_value); // allocate out scratch or submit overflow, if fail then flush and keep trying till space is made - auto* const offsets = allocOffsets.data()+allocOffsets.size()-alignments.size(); - const auto* const sizes = allocSizes.data()+allocSizes.size()-alignments.size(); + auto* offsets = allocOffsets.data()+allocOffsets.size()-alignments.size(); + const auto* sizes = allocSizes.data()+allocSizes.size()-alignments.size(); + logger.log("%p Combined Size %d",system::ILogger::ELL_DEBUG,canonical.get(),std::accumulate(sizes,sizes+alignments.size(),0)); for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(alignments.size(),offsets,sizes,alignments.data())!=0; t++) - if (t==1) // don't flush right away cause allocator not defragmented yet { - recordBuildCommands(); - // if writing to scratch directly, flush the writes - if (!flushRanges.empty()) + if (t==1) // don't flush right away cause allocator not defragmented yet { - device->flushMappedMemoryRanges(flushRanges); - flushRanges.clear(); + recordBuildCommands(); + // 
the submit overflow deallocates old offsets and erases them from the temp arrays, pointer changes + offsets = allocOffsets.data(); + sizes = allocSizes.data(); + // if writing to scratch directly, flush the writes + if (!flushRanges.empty()) + { + device->flushMappedMemoryRanges(flushRanges); + flushRanges.clear(); + } + drainCompute(); } - drainCompute(); + // we may be preventing ourselves from allocating memory, with one successful allocation still being alive and fragmenting our allocator + params.scratchForDeviceASBuild->multi_deallocate(alignments.size(),offsets,sizes); + std::fill_n(offsets,alignments.size(),scratch_allocator_t::invalid_value); } // now upon a failure, our allocations will need to be deallocated allocCount = alignments.size(); @@ -5055,7 +5086,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul outGeom.indexType = geom.indexType; outGeom.geometryFlags = geom.geometryFlags; } - buildInfo.triangles = reinterpret_cast* const&>(trianglesOffset); + buildInfo.triangles = reinterpret_cast* const&>(trianglesOffset); } success = pPrimitiveCounts==primitiveCounts.data()+primitiveCounts.size(); rangeInfos.push_back(reinterpret_cast(geometryRangeInfoOffset)); From 5c13a932887a527d8e53d201c4d96aca84994d05 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 20 May 2025 17:41:39 +0700 Subject: [PATCH 144/346] Remove impl_valid and rework SSpecializatioNValidationResult --- include/nbl/asset/IRayTracingPipeline.h | 21 +- include/nbl/video/IGPUComputePipeline.h | 20 +- include/nbl/video/IGPUGraphicsPipeline.h | 49 ++-- include/nbl/video/IGPUPipeline.h | 19 ++ include/nbl/video/IGPURayTracingPipeline.h | 234 ++++++-------------- include/nbl/video/SPipelineCreationParams.h | 49 ++-- 6 files changed, 152 insertions(+), 240 deletions(-) diff --git a/include/nbl/asset/IRayTracingPipeline.h b/include/nbl/asset/IRayTracingPipeline.h index 50ab7ba3f3..82b47f1fcb 100644 --- a/include/nbl/asset/IRayTracingPipeline.h +++ 
b/include/nbl/asset/IRayTracingPipeline.h @@ -24,10 +24,27 @@ class IRayTracingPipelineBase : public virtual core::IReferenceCounted template class IRayTracingPipeline : public IPipeline, public IRayTracingPipelineBase { - using base_creation_params_t = IPipeline; - public: + #define base_flag(F) static_cast(IPipelineBase::FLAGS::F) + enum class CreationFlags : uint64_t + { + NONE = base_flag(NONE), + DISABLE_OPTIMIZATIONS = base_flag(DISABLE_OPTIMIZATIONS), + ALLOW_DERIVATIVES = base_flag(ALLOW_DERIVATIVES), + FAIL_ON_PIPELINE_COMPILE_REQUIRED = base_flag(FAIL_ON_PIPELINE_COMPILE_REQUIRED), + EARLY_RETURN_ON_FAILURE = base_flag(EARLY_RETURN_ON_FAILURE), + SKIP_BUILT_IN_PRIMITIVES = 1<<12, + SKIP_AABBS = 1<<13, + NO_NULL_ANY_HIT_SHADERS = 1<<14, + NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, + NO_NULL_MISS_SHADERS = 1<<16, + NO_NULL_INTERSECTION_SHADERS = 1<<17, + ALLOW_MOTION = 1<<20, + }; + #undef base_flag + using FLAGS = CreationFlags; + inline const SCachedCreationParams& getCachedCreationParams() const { return m_params; } protected: diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 065c567ee2..2eb03cf2da 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -47,21 +47,19 @@ class IGPUComputePipeline : public IGPUPipelinesize()>0x7fffffff) - return {}; - count = static_cast(shader.entries->size()); - } - return {.count=dataSize ? 
count:0,.dataSize=static_cast(dataSize)}; + SSpecializationValidationResult retval = { + .count = 0, + .dataSize = 0, + }; + + if (!shader.accumulateSpecializationValidationResult(&retval)) + return {}; + + return retval; } IGPUPipelineLayout* layout = nullptr; diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index f5d6e40275..ae8924a1ab 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -32,15 +32,17 @@ class IGPUGraphicsPipeline : public IGPUPipeline - inline bool impl_valid(ExtraLambda&& extra) const + inline SSpecializationValidationResult valid() const { if (!layout) - return false; + return {}; + SSpecializationValidationResult retval = {.count=0,.dataSize=0}; + if (!layout) + return {}; // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-dynamicRendering-06576 if (!renderpass || cached.subpassIx>=renderpass->getSubpassCount()) - return false; + return {}; // TODO: check rasterization samples, etc. //rp->getCreationParameters().subpasses[i] @@ -49,41 +51,18 @@ class IGPUGraphicsPipeline : public IGPUPipelinebool - { - const auto dataSize = info.valid(); - if (dataSize<0) - return false; - else if (dataSize==0) - return true; - - const size_t count = info.entries ? info.entries->size():0x80000000ull; - if (count>0x7fffffff) - return {}; - retval += {.count=dataSize ? 
static_cast(count):0,.dataSize=static_cast(dataSize)}; - return retval; - }); - if (!valid) + if (!hasRequiredStages(stagePresence, cached.primitiveAssembly.primitiveType)) return {}; return retval; } diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index fc4bc8d219..ff6d97f17b 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -7,6 +7,7 @@ #define _NBL_VIDEO_I_GPU_PIPELINE_H_INCLUDED_ #include "nbl/video/IGPUPipelineLayout.h" +#include "nbl/video/SPipelineCreationParams.h" #include "nbl/asset/IPipeline.h" namespace nbl::video @@ -69,6 +70,24 @@ class IGPUPipelineBase { return static_cast(specData); } + bool accumulateSpecializationValidationResult(SSpecializationValidationResult* retval) const + { + const auto dataSize = valid(); + if (dataSize < 0) + return false; + if (dataSize == 0) + return true; + + const size_t count = entries ? entries->size() : 0x80000000ull; + if (count > 0x7fffffff) + return false; + *retval += { + .count = dataSize ? 
static_cast(count) : 0, + .dataSize = static_cast(dataSize), + }; + return *retval; + } + const asset::IShader* shader = nullptr; std::string_view entryPoint = ""; diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index 2a6701c9e6..f7a92252f7 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -15,118 +15,9 @@ class IGPURayTracingPipeline : public IGPUPipeline; public: - struct SCreationParams + struct SCreationParams : public SPipelineCreationParams { - #define base_flag(F) static_cast(IPipelineBase::FLAGS::F) - enum class FLAGS : uint64_t - { - NONE = base_flag(NONE), - DISABLE_OPTIMIZATIONS = base_flag(DISABLE_OPTIMIZATIONS), - ALLOW_DERIVATIVES = base_flag(ALLOW_DERIVATIVES), - FAIL_ON_PIPELINE_COMPILE_REQUIRED = base_flag(FAIL_ON_PIPELINE_COMPILE_REQUIRED), - EARLY_RETURN_ON_FAILURE = base_flag(EARLY_RETURN_ON_FAILURE), - SKIP_BUILT_IN_PRIMITIVES = 1<<12, - SKIP_AABBS = 1<<13, - NO_NULL_ANY_HIT_SHADERS = 1<<14, - NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, - NO_NULL_MISS_SHADERS = 1<<16, - NO_NULL_INTERSECTION_SHADERS = 1<<17, - ALLOW_MOTION = 1<<20, - }; - #undef base_flag - - protected: - template - inline bool impl_valid(ExtraLambda&& extra) const - { - if (!m_layout) return false; - - for (const auto info : shaders) - { - if (info.shader) - { - if (!extra(info)) - return false; - const auto stage = info.stage; - if ((stage & ~hlsl::ShaderStage::ESS_ALL_RAY_TRACING) != 0) - return false; - if (!std::has_single_bit>(stage)) - return false; - } - else - { - // every shader must not be null. use SIndex::Unused to represent unused shader. 
- return false; - } - } - - auto getShaderStage = [this](size_t index) -> hlsl::ShaderStage - { - return shaders[index].stage; - }; - - auto isValidShaderIndex = [this, getShaderStage](size_t index, hlsl::ShaderStage expectedStage, bool is_unused_shader_forbidden) -> bool - { - if (index == SShaderGroupsParams::SIndex::Unused) - return !is_unused_shader_forbidden; - if (index >= shaders.size()) - return false; - if (getShaderStage(index) != expectedStage) - return false; - return true; - }; - - if (!isValidShaderIndex(shaderGroups.raygen.index, hlsl::ShaderStage::ESS_RAYGEN, true)) - { - return false; - } - - for (const auto& shaderGroup : shaderGroups.hits) - { - // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03470 - if (!isValidShaderIndex(shaderGroup.anyHit, - hlsl::ShaderStage::ESS_ANY_HIT, - bool(flags & FLAGS::NO_NULL_ANY_HIT_SHADERS))) - return false; - - // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03471 - if (!isValidShaderIndex(shaderGroup.closestHit, - hlsl::ShaderStage::ESS_CLOSEST_HIT, - bool(flags & FLAGS::NO_NULL_CLOSEST_HIT_SHADERS))) - return false; - - if (!isValidShaderIndex(shaderGroup.intersection, - hlsl::ShaderStage::ESS_INTERSECTION, - false)) - return false; - } - - for (const auto& shaderGroup : shaderGroups.misses) - { - if (!isValidShaderIndex(shaderGroup.index, - hlsl::ShaderStage::ESS_MISS, - false)) - return false; - } - - for (const auto& shaderGroup : shaderGroups.callables) - { - if (!isValidShaderIndex(shaderGroup.index, hlsl::ShaderStage::ESS_CALLABLE, false)) - return false; - } - return true; - } - - public: - inline bool valid() const - { - return impl_valid([](const SShaderSpecInfo& info)->bool - { - if (!info.valid()) - return false; - return false; - }); - } + using FLAGS = pipeline_t::FLAGS; struct SShaderGroupsParams { @@ -149,50 +40,12 @@ class IGPURayTracingPipeline : public IGPUPipeline flags = 
FLAGS::NONE; - }; - - - struct SShaderGroupHandle - { - private: - uint8_t data[video::SPhysicalDeviceLimits::ShaderGroupHandleSize]; - }; - static_assert(sizeof(SShaderGroupHandle) == video::SPhysicalDeviceLimits::ShaderGroupHandleSize); - - struct SHitGroupStackSize - { - uint16_t closestHit; - uint16_t anyHit; - uint16_t intersection; - }; - - using SGeneralShaderGroupContainer = core::smart_refctd_dynamic_array; - using SHitShaderGroupContainer = core::smart_refctd_dynamic_array; - - struct SCreationParams final : SPipelineCreationParams - { - #define base_flag(F) static_cast(IPipelineBase::CreationFlags::F) - enum class FLAGS : uint64_t - { - NONE = base_flag(NONE), - DISABLE_OPTIMIZATIONS = base_flag(DISABLE_OPTIMIZATIONS), - ALLOW_DERIVATIVES = base_flag(ALLOW_DERIVATIVES), - FAIL_ON_PIPELINE_COMPILE_REQUIRED = base_flag(FAIL_ON_PIPELINE_COMPILE_REQUIRED), - EARLY_RETURN_ON_FAILURE = base_flag(EARLY_RETURN_ON_FAILURE), - SKIP_BUILT_IN_PRIMITIVES = 1<<12, - SKIP_AABBS = 1<<13, - NO_NULL_ANY_HIT_SHADERS = 1<<14, - NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, - NO_NULL_MISS_SHADERS = 1<<16, - NO_NULL_INTERSECTION_SHADERS = 1<<17, - ALLOW_MOTION = 1<<20, - }; - #undef base_flag inline SSpecializationValidationResult valid() const { @@ -200,31 +53,76 @@ class IGPURayTracingPipeline : public IGPUPipelinebool + + if (!shaderGroups.raygen.accumulateSpecializationValidationResult(&retval)) + return {}; + + for (const auto& shaderGroup : shaderGroups.hits) { - const auto dataSize = info.valid(); - if (dataSize<0) - return false; - else if (dataSize==0) - return true; - - const size_t count = info.entries ? info.entries->size():0x80000000ull; - if (count>0x7fffffff) + if (shaderGroup.intersection.shader) + { + if (!shaderGroup.intersection.accumulateSpecializationValidationResult(&retval)) return {}; - retval += {.count=dataSize ? 
static_cast(count):0,.dataSize=static_cast(dataSize)}; - return retval; - }); - if (!valid) - return {}; + } + + if (shaderGroup.closestHit.shader) + { + if (!shaderGroup.closestHit.accumulateSpecializationValidationResult(&retval)) + return {}; + } + + // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03470 + if (flags & FLAGS::NO_NULL_ANY_HIT_SHADERS && !shaderGroup.anyHit.shader) + return {}; + + if (shaderGroup.anyHit.shader) + { + if (!shaderGroup.anyHit.accumulateSpecializationValidationResult(&retval)) + return {}; + } + + // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03471 + if (flags & FLAGS::NO_NULL_CLOSEST_HIT_SHADERS && !shaderGroup.intersection.shader) + return {}; + } + + for (const auto& miss : shaderGroups.misses) + { + if (miss.shader) + { + if (!miss.accumulateSpecializationValidationResult(&retval)) + return {}; + } + } + + for (const auto& callable : shaderGroups.callables) + { + if (callable.shader) + { + if (!callable.accumulateSpecializationValidationResult(&retval)) + return {}; + } + } + return retval; } + }; - inline std::span getShaders() const { return shaders; } + struct SShaderGroupHandle + { + private: + uint8_t data[video::SPhysicalDeviceLimits::ShaderGroupHandleSize]; + }; + static_assert(sizeof(SShaderGroupHandle) == video::SPhysicalDeviceLimits::ShaderGroupHandleSize); - IGPUPipelineLayout* layout = nullptr; + struct SHitGroupStackSize + { + uint16_t closestHit; + uint16_t anyHit; + uint16_t intersection; }; inline core::bitflag getCreationFlags() const { return m_flags; } diff --git a/include/nbl/video/SPipelineCreationParams.h b/include/nbl/video/SPipelineCreationParams.h index 489bff4343..3a25560ae4 100644 --- a/include/nbl/video/SPipelineCreationParams.h +++ b/include/nbl/video/SPipelineCreationParams.h @@ -11,6 +11,31 @@ namespace nbl::video { +struct SSpecializationValidationResult +{ + constexpr static 
inline uint32_t Invalid = ~0u; + inline operator bool() const + { + return count!=Invalid && dataSize!=Invalid; + } + + inline SSpecializationValidationResult& operator+=(const SSpecializationValidationResult& other) + { + // TODO: check for overflow before adding + if (*this && other) + { + count += other.count; + dataSize += other.dataSize; + } + else + *this = {}; + return *this; + } + + uint32_t count = Invalid; + uint32_t dataSize = Invalid; +}; + // For now, due to API design we implicitly satisfy: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-08771 // to: @@ -18,30 +43,6 @@ namespace nbl::video template struct SPipelineCreationParams { - struct SSpecializationValidationResult - { - constexpr static inline uint32_t Invalid = ~0u; - inline operator bool() const - { - return count!=Invalid && dataSize!=Invalid; - } - - inline SSpecializationValidationResult& operator+=(const SSpecializationValidationResult& other) - { - // TODO: check for overflow before adding - if (*this && other) - { - count += other.count; - dataSize += other.dataSize; - } - else - *this = {}; - return *this; - } - - uint32_t count = Invalid; - uint32_t dataSize = Invalid; - }; constexpr static inline int32_t NotDerivingFromPreviousPipeline = -1; inline bool isDerivative() const From 483a788162180baef274e5e8afab4ef1f922893e Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 20 May 2025 14:31:44 +0200 Subject: [PATCH 145/346] add docker/msvc-winsdk submodule, update build presets & dxc/CMakeLists.txt --- 3rdparty/dxc/CMakeLists.txt | 26 ++++++----- CMakePresets.json | 88 +++---------------------------------- docker/msvc-winsdk | 2 +- 3 files changed, 22 insertions(+), 94 deletions(-) diff --git a/3rdparty/dxc/CMakeLists.txt b/3rdparty/dxc/CMakeLists.txt index ed2528c922..9432b4df07 100644 --- a/3rdparty/dxc/CMakeLists.txt +++ b/3rdparty/dxc/CMakeLists.txt @@ -41,6 +41,7 @@ 
list(APPEND NBL_DXC_CMAKE_OPTIONS "-DSPIRV_SKIP_EXECUTABLES:BOOL=ON") list(APPEND NBL_DXC_CMAKE_OPTIONS "-DHLSL_ENABLE_DEBUG_ITERATORS:BOOL=ON") list(APPEND NBL_DXC_CMAKE_OPTIONS "-DDXC_SPIRV_TOOLS_DIR=${DXC_SPIRV_TOOLS_DIR}") list(APPEND NBL_DXC_CMAKE_OPTIONS "-DDXC_SPIRV_HEADERS_DIR=${DXC_SPIRV_HEADERS_DIR}") +list(APPEND NBL_DXC_CMAKE_OPTIONS "-DDXC_ENABLE_ETW=OFF") if(NOT NBL_IS_MULTI_CONFIG) list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}") @@ -85,18 +86,23 @@ endif() set(DXC_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/build" CACHE INTERNAL "") -if(MSVC AND (NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja Multi-Config" AND NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja")) - execute_process(COMMAND "${CMAKE_COMMAND}" -C "${CMAKE_CURRENT_SOURCE_DIR}/dxc/cmake/caches/PredefinedParams.cmake" -S "${CMAKE_CURRENT_SOURCE_DIR}/dxc" -B "${DXC_BUILD_DIR}" -G "${CMAKE_GENERATOR}" "-Ax64" -T "${CMAKE_GENERATOR_TOOLSET}" ${NBL_DXC_CMAKE_OPTIONS} - RESULT_VARIABLE DXC_CMAKE_RESULT - OUTPUT_VARIABLE DXC_CMAKE_STREAM_PIPE - ) -else() - execute_process(COMMAND "${CMAKE_COMMAND}" -C "${CMAKE_CURRENT_SOURCE_DIR}/dxc/cmake/caches/PredefinedParams.cmake" -S "${CMAKE_CURRENT_SOURCE_DIR}/dxc" -B "${DXC_BUILD_DIR}" -G "${CMAKE_GENERATOR}" -T "${CMAKE_GENERATOR_TOOLSET}" ${NBL_DXC_CMAKE_OPTIONS} - RESULT_VARIABLE DXC_CMAKE_RESULT - OUTPUT_VARIABLE DXC_CMAKE_STREAM_PIPE - ) +if(NOT CMAKE_GENERATOR MATCHES "Ninja*") + list(APPEND NBL_DXC_CMAKE_OPTIONS -Ax64) +endif() + +if(CMAKE_GENERATOR_TOOLSET) + list(APPEND NBL_DXC_CMAKE_OPTIONS -T "${CMAKE_GENERATOR_TOOLSET}") endif() +if(CMAKE_TOOLCHAIN_FILE) + list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") +endif() + +execute_process(COMMAND "${CMAKE_COMMAND}" -C "${CMAKE_CURRENT_SOURCE_DIR}/dxc/cmake/caches/PredefinedParams.cmake" -S "${CMAKE_CURRENT_SOURCE_DIR}/dxc" -B "${DXC_BUILD_DIR}" -G "${CMAKE_GENERATOR}" ${NBL_DXC_CMAKE_OPTIONS} + RESULT_VARIABLE DXC_CMAKE_RESULT + OUTPUT_VARIABLE 
DXC_CMAKE_STREAM_PIPE +) + if(NOT "${DXC_CMAKE_RESULT}" STREQUAL "0") message(FATAL_ERROR "${DXC_CMAKE_STREAM_PIPE}") endif() diff --git a/CMakePresets.json b/CMakePresets.json index da28fc1aff..032d9ad45e 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -2,8 +2,8 @@ "version": 6, "cmakeMinimumRequired": { "major": 3, - "minor": 29, - "patch": 2 + "minor": 31, + "patch": 0 }, "configurePresets": [ { @@ -90,37 +90,17 @@ { "name": "ci-configure-static-msvc", "inherits": "ci-configure-static-windows-base", - "displayName": "[CI]: Static library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Configure as static library with Visual Studio 17 2022 generator and MSVC v143 toolset", - "generator": "Visual Studio 17 2022", - "toolset": "v143" - }, - { - "name": "ci-configure-dynamic-msvc", - "inherits": "ci-configure-dynamic-windows-base", - "displayName": "[CI]: Dynamic library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Configure as dynamic library with Visual Studio 17 2022 generator and MSVC v143 toolset", - "generator": "Visual Studio 17 2022", - "toolset": "v143" - }, - { - "name": "ci-configure-static-ninja-multi", - "inherits": "ci-configure-static-windows-base", - "displayName": "[CI]: Static library target, Ninja multi-config generator", - "description": "Configure as static library with Ninja multi-config generator", "generator": "Ninja Multi-Config", "cacheVariables": { - "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-msvc-toolchain.cmake" } }, { - "name": "ci-configure-dynamic-ninja-multi", + "name": "ci-configure-dynamic-msvc", "inherits": "ci-configure-dynamic-windows-base", - "displayName": "[CI]: Dynamic library target, Ninja multi-config generator", - "description": "Configure as dynamic library with Ninja multi-config generator", "generator": "Ninja Multi-Config", "cacheVariables": { - "CMAKE_EXPORT_COMPILE_COMMANDS": 
"ON" + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-msvc-toolchain.cmake" } }, { @@ -319,8 +299,6 @@ "configurePreset": "ci-configure-static-msvc", "inheritConfigureEnvironment": true, "inherits": "build-windows-base", - "displayName": "[CI]: Static library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Build Nabla as static library with Visual Studio 17 2022 generator and MSVC v143 toolset", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", @@ -332,34 +310,6 @@ "configurePreset": "ci-configure-dynamic-msvc", "inheritConfigureEnvironment": true, "inherits": "build-windows-base", - "displayName": "[CI]: Dynamic library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Build Nabla as dynamic library with Visual Studio 17 2022 generator and MSVC v143 toolset", - "condition": { - "type": "equals", - "lhs": "$env{NBL_CI_MODE}", - "rhs": "ON" - } - }, - { - "name": "ci-build-static-ninja-multi", - "configurePreset": "ci-configure-static-ninja-multi", - "inheritConfigureEnvironment": true, - "inherits": "build-windows-base", - "displayName": "[CI]: Static library target, Ninja multi-config generator", - "description": "Build Nabla as static library with Ninja multi-config generator", - "condition": { - "type": "equals", - "lhs": "$env{NBL_CI_MODE}", - "rhs": "ON" - } - }, - { - "name": "ci-build-dynamic-ninja-multi", - "configurePreset": "ci-configure-dynamic-ninja-multi", - "inheritConfigureEnvironment": true, - "inherits": "build-windows-base", - "displayName": "[CI]: Dynamic library target, Ninja multi-config generator", - "description": "Build Nabla as dynamic library with Ninja multi-config generator", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", @@ -466,8 +416,6 @@ "name": "ci-package-static-msvc", "inherits": "ci-package-windows-base", "configurePreset": "ci-configure-static-msvc", - "displayName": "[CI]: Static library target, Visual Studio 17 2022 
generator, MSVC v143 toolset", - "description": "Package Nabla as static library compiled with Visual Studio 17 2022 generator and MSVC v143 toolset", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", @@ -478,32 +426,6 @@ "name": "ci-package-dynamic-msvc", "inherits": "ci-package-windows-base", "configurePreset": "ci-configure-dynamic-msvc", - "displayName": "[CI]: Dynamic library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Package Nabla as dynamic library compiled with Visual Studio 17 2022 generator and MSVC v143 toolset", - "condition": { - "type": "equals", - "lhs": "$env{NBL_CI_MODE}", - "rhs": "ON" - } - }, - { - "name": "ci-package-static-ninja-multi", - "inherits": "ci-package-windows-base", - "configurePreset": "ci-configure-static-ninja-multi", - "displayName": "[CI]: Static library target, Ninja multi-config generator", - "description": "Package Nabla as static library compiled with Ninja multi-config generator", - "condition": { - "type": "equals", - "lhs": "$env{NBL_CI_MODE}", - "rhs": "ON" - } - }, - { - "name": "ci-package-dynamic-ninja-multi", - "inherits": "ci-package-windows-base", - "configurePreset": "ci-configure-dynamic-ninja-multi", - "displayName": "[CI]: Dynamic library target, Ninja multi-config generator", - "description": "Package Nabla as dynamic library compiled with Ninja multi-config generator", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", diff --git a/docker/msvc-winsdk b/docker/msvc-winsdk index 8aa9e767ec..8aa6a18115 160000 --- a/docker/msvc-winsdk +++ b/docker/msvc-winsdk @@ -1 +1 @@ -Subproject commit 8aa9e767ec60aa77f477ac6cf41728e997dcc950 +Subproject commit 8aa6a1811528e82982b2f462515ff9a0e2947e72 From fe75f42223a3b182f66390d2919a60797f430f13 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 20 May 2025 21:19:44 +0200 Subject: [PATCH 146/346] update docker/msvc-winsdk submodule, minor .env file change --- docker/.env | 2 -- docker/ci-windows.env | 2 ++ 
docker/msvc-winsdk | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) delete mode 100644 docker/.env create mode 100644 docker/ci-windows.env diff --git a/docker/.env b/docker/.env deleted file mode 100644 index 623184f422..0000000000 --- a/docker/.env +++ /dev/null @@ -1,2 +0,0 @@ -THIS_PROJECT_WORKING_DIRECTORY=C:\docker -THIS_PROJECT_NABLA_DIRECTORY=C:/Users/ContainerAdministrator/Nabla/bind \ No newline at end of file diff --git a/docker/ci-windows.env b/docker/ci-windows.env new file mode 100644 index 0000000000..ea89ce43c7 --- /dev/null +++ b/docker/ci-windows.env @@ -0,0 +1,2 @@ +NBL_CI_MODE=ON +NBL_CI_BUILD_DIRECTORY=C:\mount\nabla\build-ct \ No newline at end of file diff --git a/docker/msvc-winsdk b/docker/msvc-winsdk index 8aa6a18115..831515f599 160000 --- a/docker/msvc-winsdk +++ b/docker/msvc-winsdk @@ -1 +1 @@ -Subproject commit 8aa6a1811528e82982b2f462515ff9a0e2947e72 +Subproject commit 831515f59919fbe97653804a5fc634aeb36d360e From 6eecd13616fb95596754b3aad1e4629ad6c4eaa2 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 20 May 2025 21:58:23 +0200 Subject: [PATCH 147/346] update CMakePresets.json, add docker/ninja.env --- CMakePresets.json | 3 ++- docker/ninja.env | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 docker/ninja.env diff --git a/CMakePresets.json b/CMakePresets.json index 032d9ad45e..c6396e4154 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -20,7 +20,8 @@ "NBL_EXPLICIT_MODULE_LOAD_LOG": "ON", "NBL_CPACK_NO_BUILD_DIRECTORY_MODULES": "ON", "NBL_RUN_TESTS": "ON", - "NBL_CPACK_CI": "ON" + "NBL_CPACK_CI": "ON", + "GIT_FAIL_IF_NONZERO_EXIT": "OFF" } }, { diff --git a/docker/ninja.env b/docker/ninja.env new file mode 100644 index 0000000000..9c6e70104c --- /dev/null +++ b/docker/ninja.env @@ -0,0 +1 @@ +NINJA_STATUS="[%r jobs, %f/%t edges, %oe/s, elapsed %ws]: " \ No newline at end of file From 9c596770659f4f8c2c6247c9c56cfbc57c311227 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 21 May 
2025 10:57:48 +0700 Subject: [PATCH 148/346] minor fixes --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 8bfd8b0194..9744798c6f 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -121,7 +121,9 @@ struct reduce for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); lv1_val = reduction1(lv1_val); - scratchAccessor.template set(invocationIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + + if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + scratchAccessor.template set(0, lv1_val[Config::ItemsPerInvocation_1-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -130,7 +132,7 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { scalar_t reduce_val; - scratchAccessor.template get(glsl::gl_SubgroupInvocationID(),reduce_val); + scratchAccessor.template get(0,reduce_val); dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, hlsl::promote(reduce_val)); } } @@ -179,9 +181,9 @@ struct scan [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+prevIndex,lv1_val[i]); - vector_lv1_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv1_val, bool(invocationIndex)); - shiftedInput = inclusiveScan1(shiftedInput); - scratchAccessor.template set(invocationIndex, shiftedInput[Config::ItemsPerInvocation_1-1]); + lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); + lv1_val = inclusiveScan1(lv1_val); + scratchAccessor.template set(invocationIndex, 
lv1_val[Config::ItemsPerInvocation_1-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -284,7 +286,7 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { scalar_t reduce_val; - scratchAccessor.template get(glsl::gl_SubgroupInvocationID(),reduce_val); + scratchAccessor.template get(0,reduce_val); dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); } } @@ -353,8 +355,8 @@ struct scan [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) scratchAccessor.template get(lv1_smem_size+i*Config::SubgroupSize+prevIndex,lv2_val[i]); - vector_lv2_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(invocationIndex)); - shiftedInput = inclusiveScan2(shiftedInput); + lv2_val[0] = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val[0], bool(invocationIndex)); + vector_lv2_t shiftedScan = inclusiveScan2(lv2_val); // combine with level 1, only last element of each [unroll] @@ -363,7 +365,7 @@ struct scan scalar_t last_val; scratchAccessor.template get((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i),last_val); scalar_t val = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(i)); - val = binop(last_val, shiftedInput[Config::ItemsPerInvocation_2-1]); + val = binop(last_val, shiftedScan[Config::ItemsPerInvocation_2-1]); scratchAccessor.template set((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i), last_val); } } From eb442624fbd1c2b1f9e8b38b73714f107a0eead7 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 21 May 2025 13:55:17 +0700 Subject: [PATCH 149/346] moved indexing functionality to config struct --- .../hlsl/workgroup2/arithmetic_config.hlsl | 10 ++++++++ .../builtin/hlsl/workgroup2/shared_scan.hlsl | 24 +++++++++---------- 2 files changed, 22 insertions(+), 12 
deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 12f65420ca..5263a3fec8 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -75,6 +75,16 @@ struct ArithmeticConfiguration static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementCount = conditional_value::value + SubgroupSize*ItemsPerInvocation_1>::value; + + static uint32_t virtualSubgroupID(const uint32_t id, const uint32_t offset) + { + return offset * (WorkgroupSize >> SubgroupSizeLog2) + id; + } + + static uint32_t sharedMemCoalescedIndex(const uint32_t id, const uint32_t itemsPerInvocation) + { + return (id & (itemsPerInvocation-1)) * SubgroupsPerVirtualWorkgroup + (id/itemsPerInvocation); + } }; template diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 9744798c6f..af4fb7f44d 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -105,8 +105,8 @@ struct reduce scan_local = reduction0(scan_local); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { - const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); + const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + 
(virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -165,8 +165,8 @@ struct scan dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { - const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); + const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -194,7 +194,7 @@ struct scan vector_lv0_t value; dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); scalar_t left; scratchAccessor.template get(virtualSubgroupID,left); if (Exclusive) @@ -244,8 +244,8 @@ struct reduce scan_local = reduction0(scan_local); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { - const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * 
Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); + const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -262,7 +262,7 @@ struct reduce lv1_val = reduction1(lv1_val); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { - const uint32_t bankedIndex = (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(invocationIndex, Config::ItemsPerInvocation_2); // (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); scratchAccessor.template set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } @@ -321,8 +321,8 @@ struct scan dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { - const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); + const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); scratchAccessor.template set(bankedIndex, 
value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -340,7 +340,7 @@ struct scan lv1_val = inclusiveScan1(lv1_val); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { - const uint32_t bankedIndex = (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } @@ -378,7 +378,7 @@ struct scan vector_lv0_t value; dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); // idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); const scalar_t left; scratchAccessor.template get(virtualSubgroupID, left); if (Exclusive) From 573ce446790c3d56e71c1783668ddc3d75d1c2f1 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 21 May 2025 15:02:00 +0700 Subject: [PATCH 150/346] reduction returns value instead of saving directly to storage --- examples_tests | 2 +- .../builtin/hlsl/workgroup2/arithmetic.hlsl | 8 ++-- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 43 +++++++++++-------- 3 files changed, 31 insertions(+), 22 deletions(-) diff --git a/examples_tests b/examples_tests index 44c34a8a65..0ccd26fc93 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 44c34a8a65866fb6304c12032efd08e2338c7116 +Subproject commit 0ccd26fc93d22587219b12291f855929949cef74 diff --git 
a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl index d0a26cdf94..e4a71bdffc 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -22,11 +22,13 @@ namespace workgroup2 template struct reduction { - template && ArithmeticSharedMemoryAccessor) - static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + using scalar_t = typename BinOp::type_t; + + template && ArithmeticSharedMemoryAccessor) + static scalar_t __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::reduce fn; - fn.template __call(dataAccessor, scratchAccessor); + return fn.template __call(dataAccessor, scratchAccessor); } }; diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index af4fb7f44d..7a4d4764f4 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -36,7 +36,7 @@ struct reduce // doesn't use scratch smem, need as param? 
template - void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { using config_t = subgroup2::Configuration; using params_t = subgroup2::ArithmeticParams; @@ -45,7 +45,8 @@ struct reduce vector_t value; dataAccessor.template get(workgroup::SubgroupContiguousIndex(), value); value = reduction(value); - dataAccessor.template set(workgroup::SubgroupContiguousIndex(), value); + return value[0]; + // dataAccessor.template set(workgroup::SubgroupContiguousIndex(), value); } }; @@ -87,7 +88,7 @@ struct reduce using vector_lv1_t = vector; template - void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { using config_t = subgroup2::Configuration; using params_lv0_t = subgroup2::ArithmeticParams; @@ -128,13 +129,16 @@ struct reduce scratchAccessor.workgroupExecutionAndMemoryBarrier(); // set as last element in scan (reduction) - [unroll] - for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) - { - scalar_t reduce_val; - scratchAccessor.template get(0,reduce_val); - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, hlsl::promote(reduce_val)); - } + // [unroll] + // for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + // { + // scalar_t reduce_val; + // scratchAccessor.template get(0,reduce_val); + // dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, hlsl::promote(reduce_val)); + // } + scalar_t reduce_val; + scratchAccessor.template get(0,reduce_val); + return reduce_val; } }; @@ -225,7 +229,7 @@ struct reduce using vector_lv2_t = vector; template - void 
__call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { using config_t = subgroup2::Configuration; using params_lv0_t = subgroup2::ArithmeticParams; @@ -282,13 +286,16 @@ struct reduce scratchAccessor.workgroupExecutionAndMemoryBarrier(); // set as last element in scan (reduction) - [unroll] - for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) - { - scalar_t reduce_val; - scratchAccessor.template get(0,reduce_val); - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); - } + // [unroll] + // for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + // { + // scalar_t reduce_val; + // scratchAccessor.template get(0,reduce_val); + // dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); + // } + scalar_t reduce_val; + scratchAccessor.template get(0,reduce_val); + return reduce_val; } }; From 487c3deb108e145652d0c374ce7ff44c67a0d3ff Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 21 May 2025 10:52:22 +0200 Subject: [PATCH 151/346] Create .github/workflows/build-nabla.yml, update CMakePresets.json for ClangCL vendor, leave minor comments to top CMakeLists.txt --- .github/workflows/build-nabla.yml | 98 +++++++++++++++++++++++++++++++ CMakeLists.txt | 4 +- CMakePresets.json | 40 ++++++++++++- docker/ninja.env | 2 +- 4 files changed, 140 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/build-nabla.yml diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml new file mode 100644 index 0000000000..967953aeef --- /dev/null +++ b/.github/workflows/build-nabla.yml @@ -0,0 +1,98 @@ +name: Build Nabla Workflow + +on: + push: + pull_request: + 
workflow_dispatch: + +jobs: + build-windows: + runs-on: windows-2022 + + env: + image: ghcr.io/devsh-graphics-programming/docker-nanoserver-msvc-winsdk + entry: pwsh.exe + cmd: -NoLogo -NoProfile -ExecutionPolicy Bypass + mount: C:\mount\nabla + binary: C:\mount\nabla\build-ct + install: C:\mount\nabla\build-ct\install + + strategy: + fail-fast: false + matrix: + vendor: [msvc, clangcl] + config: [Release, Debug, RelWithDebInfo] + tag: ['17.13.6'] + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: 'true' + + - name: Set prefix + id: set-prefix + run: | + echo "prefix=run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" >> $GITHUB_OUTPUT + + - name: Pull Image + run: docker pull "${{ env.image }}:${{ matrix.tag }}" + + - name: Run Container + run: | + docker run \ + --entrypoint ${{ env.entry }} -di --isolation process \ + --env-file .\docker\ci-windows.env \ + --env-file .\docker\ninja.env \ + --name orphan \ + -v "${{ github.workspace }}:${{ env.mount }}" \ + -w "${{ env.mount }}" \ + "${{ env.image }}:${{ matrix.tag }}" \ + ${{ env.cmd }} + + - name: Inspect Container + run: docker inspect orphan + + - name: Container -- Configure Project with CMake + run: | + docker exec orphan \ + ${{ env.entry }} ${{ env.cmd }} cmake \ + --preset ci-configure-dynamic-${{ matrix.vendor }} \ + --profiling-output=profiling/cmake-profiling.json \ + --profiling-format=google-trace + + - name: Container -- Build NSC + run: | + docker exec orphan \ + ${{ env.entry }} ${{ env.cmd }} cmake --build \ + --preset ci-build-dynamic-${{ matrix.vendor }} \ + -t nsc --config ${{ matrix.config }} + + - name: Container -- Install NSC + run: | + docker exec orphan \ + ${{ env.entry }} ${{ env.cmd }} cmake --install \ + ${{ env.binary }} --config ${{ matrix.config }} \ + --component Runtimes --prefix ${{ env.install }} + + docker exec orphan \ + ${{ env.entry }} ${{ env.cmd }} cmake --install \ + ${{ env.binary }} --config ${{ matrix.config }} \ + 
--component Executables --prefix ${{ env.install }} + + - name: Package workflow artifacts + run: | + tar -cvf "${{ steps.set-prefix.outputs.prefix }}-profiling.tar" -C profiling . + tar -cvf "${{ steps.set-prefix.outputs.prefix }}-install.tar" -C ${{ env.install }} . + + - name: Upload profiling artifacts + uses: actions/upload-artifact@v4 + with: + name: ${{ steps.set-prefix.outputs.prefix }}-profiling + path: ${{ steps.set-prefix.outputs.prefix }}-profiling.tar + + - name: Upload install artifacts + uses: actions/upload-artifact@v4 + with: + name: ${{ steps.set-prefix.outputs.prefix }}-install + path: ${{ steps.set-prefix.outputs.prefix }}-install.tar \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index f24877148b..c6664f8085 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +# Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O. # This file is part of the "Nabla Engine". 
# For conditions of distribution and use, see copyright notice in nabla.h.in or nabla.h cmake_minimum_required(VERSION 3.31) @@ -33,7 +33,9 @@ if(MSVC) link_libraries(delayimp) endif() +# TODO: TO BE KILLED, keep both in one tree option(NBL_STATIC_BUILD "" OFF) # ON for static builds, OFF for shared + option(NBL_COMPILER_DYNAMIC_RUNTIME "" ON) option(NBL_SANITIZE_ADDRESS OFF) diff --git a/CMakePresets.json b/CMakePresets.json index c6396e4154..ad3ae50b6d 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -10,7 +10,6 @@ "name": "ci-configure-base", "hidden": true, "cacheVariables": { - "NBL_CI_MODE": "ON", "NBL_UPDATE_GIT_SUBMODULE": "OFF", "NBL_COMPILE_WITH_CUDA": "OFF", "NBL_BUILD_OPTIX": "OFF", @@ -19,7 +18,6 @@ "_NBL_COMPILE_WITH_OPEN_EXR_": "ON", "NBL_EXPLICIT_MODULE_LOAD_LOG": "ON", "NBL_CPACK_NO_BUILD_DIRECTORY_MODULES": "ON", - "NBL_RUN_TESTS": "ON", "NBL_CPACK_CI": "ON", "GIT_FAIL_IF_NONZERO_EXIT": "OFF" } @@ -104,6 +102,22 @@ "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-msvc-toolchain.cmake" } }, + { + "name": "ci-configure-static-clangcl", + "inherits": "ci-configure-static-windows-base", + "generator": "Ninja Multi-Config", + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-clangcl-toolchain.cmake" + } + }, + { + "name": "ci-configure-dynamic-clangcl", + "inherits": "ci-configure-dynamic-windows-base", + "generator": "Ninja Multi-Config", + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-clangcl-toolchain.cmake" + } + }, { "name": "user-configure-base", "hidden": true, @@ -316,6 +330,28 @@ "lhs": "$env{NBL_CI_MODE}", "rhs": "ON" } + }, + { + "name": "ci-build-static-clangcl", + "configurePreset": "ci-configure-static-clangcl", + "inheritConfigureEnvironment": true, + "inherits": "build-windows-base", + "condition": { + "type": "equals", + "lhs": "$env{NBL_CI_MODE}", + "rhs": "ON" + } + }, + { + "name": "ci-build-dynamic-clangcl", + 
"configurePreset": "ci-configure-dynamic-clangcl", + "inheritConfigureEnvironment": true, + "inherits": "build-windows-base", + "condition": { + "type": "equals", + "lhs": "$env{NBL_CI_MODE}", + "rhs": "ON" + } }, { "name": "user-build-static-msvc", diff --git a/docker/ninja.env b/docker/ninja.env index 9c6e70104c..6d52cbd701 100644 --- a/docker/ninja.env +++ b/docker/ninja.env @@ -1 +1 @@ -NINJA_STATUS="[%r jobs, %f/%t edges, %oe/s, elapsed %ws]: " \ No newline at end of file +NINJA_STATUS=[%r jobs, %f/%t edges, %oe/s, elapsed %ws]: \ No newline at end of file From 473cdcd3e3c75d98ecfa783899fd71d61d03a4a3 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Wed, 21 May 2025 11:03:41 +0200 Subject: [PATCH 152/346] Update build-nabla.yml, adjust CLI escape chars to pwsh --- .github/workflows/build-nabla.yml | 57 ++++++++++++++++--------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 967953aeef..691b28f316 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -36,48 +36,49 @@ jobs: echo "prefix=run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" >> $GITHUB_OUTPUT - name: Pull Image - run: docker pull "${{ env.image }}:${{ matrix.tag }}" + run: | + docker pull "${{ env.image }}:${{ matrix.tag }}" - name: Run Container run: | - docker run \ - --entrypoint ${{ env.entry }} -di --isolation process \ - --env-file .\docker\ci-windows.env \ - --env-file .\docker\ninja.env \ - --name orphan \ - -v "${{ github.workspace }}:${{ env.mount }}" \ - -w "${{ env.mount }}" \ - "${{ env.image }}:${{ matrix.tag }}" \ + docker run ` + --entrypoint ${{ env.entry }} -di --isolation process ` + --env-file .\docker\ci-windows.env ` + --env-file .\docker\ninja.env ` + --name orphan ` + -v "${{ github.workspace }}:${{ env.mount }}" ` + -w "${{ env.mount }}" ` + "${{ env.image }}:${{ 
matrix.tag }}" ` ${{ env.cmd }} - name: Inspect Container - run: docker inspect orphan + run: | + docker inspect orphan - - name: Container -- Configure Project with CMake + - name: Container – Configure Project with CMake run: | - docker exec orphan \ - ${{ env.entry }} ${{ env.cmd }} cmake \ - --preset ci-configure-dynamic-${{ matrix.vendor }} \ - --profiling-output=profiling/cmake-profiling.json \ + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} cmake ` + --preset ci-configure-dynamic-${{ matrix.vendor }} ` + --profiling-output=profiling/cmake-profiling.json ` --profiling-format=google-trace - - name: Container -- Build NSC + - name: Container – Build NSC run: | - docker exec orphan \ - ${{ env.entry }} ${{ env.cmd }} cmake --build \ - --preset ci-build-dynamic-${{ matrix.vendor }} \ + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} cmake --build ` + --preset ci-build-dynamic-${{ matrix.vendor }} ` -t nsc --config ${{ matrix.config }} - - name: Container -- Install NSC + - name: Container – Install NSC run: | - docker exec orphan \ - ${{ env.entry }} ${{ env.cmd }} cmake --install \ - ${{ env.binary }} --config ${{ matrix.config }} \ + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} cmake --install ` + ${{ env.binary }} --config ${{ matrix.config }} ` --component Runtimes --prefix ${{ env.install }} - - docker exec orphan \ - ${{ env.entry }} ${{ env.cmd }} cmake --install \ - ${{ env.binary }} --config ${{ matrix.config }} \ + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} cmake --install ` + ${{ env.binary }} --config ${{ matrix.config }} ` --component Executables --prefix ${{ env.install }} - name: Package workflow artifacts @@ -95,4 +96,4 @@ jobs: uses: actions/upload-artifact@v4 with: name: ${{ steps.set-prefix.outputs.prefix }}-install - path: ${{ steps.set-prefix.outputs.prefix }}-install.tar \ No newline at end of file + path: ${{ steps.set-prefix.outputs.prefix }}-install.tar From a1a7b6a07d45f5baa9020960af83a2a92e947ecc 
Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Wed, 21 May 2025 11:22:56 +0200 Subject: [PATCH 153/346] Update build-nabla.yml, add unpack packages step, correct container CLI steps --- .github/workflows/build-nabla.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 691b28f316..649c00d441 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -55,10 +55,15 @@ jobs: run: | docker inspect orphan + - name: Container – Unpack Packages + run: | + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} C:\unpack.ps1 + - name: Container – Configure Project with CMake run: | docker exec orphan ` - ${{ env.entry }} ${{ env.cmd }} cmake ` + ${{ env.entry }} ${{ env.cmd }} -Command cmake ` --preset ci-configure-dynamic-${{ matrix.vendor }} ` --profiling-output=profiling/cmake-profiling.json ` --profiling-format=google-trace @@ -66,18 +71,18 @@ jobs: - name: Container – Build NSC run: | docker exec orphan ` - ${{ env.entry }} ${{ env.cmd }} cmake --build ` + ${{ env.entry }} ${{ env.cmd }} -Command cmake --build ` --preset ci-build-dynamic-${{ matrix.vendor }} ` -t nsc --config ${{ matrix.config }} - name: Container – Install NSC run: | docker exec orphan ` - ${{ env.entry }} ${{ env.cmd }} cmake --install ` + ${{ env.entry }} ${{ env.cmd }} -Command cmake --install ` ${{ env.binary }} --config ${{ matrix.config }} ` --component Runtimes --prefix ${{ env.install }} docker exec orphan ` - ${{ env.entry }} ${{ env.cmd }} cmake --install ` + ${{ env.entry }} ${{ env.cmd }} -Command cmake --install ` ${{ env.binary }} --config ${{ matrix.config }} ` --component Executables --prefix ${{ env.install }} From 23d18b61a7c943ba63d175a3c2508a53d4029210 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Wed, 21 May 2025 11:32:10 +0200 
Subject: [PATCH 154/346] Update build-nabla.yml, recurse submodules, use profiling directory --- .github/workflows/build-nabla.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 649c00d441..bd50b79499 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -28,7 +28,7 @@ jobs: - name: Checkout uses: actions/checkout@v4 with: - submodules: 'true' + submodules: 'recursive' - name: Set prefix id: set-prefix @@ -62,6 +62,7 @@ jobs: - name: Container – Configure Project with CMake run: | + mkdir profiling docker exec orphan ` ${{ env.entry }} ${{ env.cmd }} -Command cmake ` --preset ci-configure-dynamic-${{ matrix.vendor }} ` From 49ca655e7f11fbc8db64d1c7adb6658938251058 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 21 May 2025 16:42:28 +0700 Subject: [PATCH 155/346] fixes to 2-level scan indexing --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 28 ++++++------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 7a4d4764f4..eca7ababd2 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -128,14 +128,6 @@ struct reduce } scratchAccessor.workgroupExecutionAndMemoryBarrier(); - // set as last element in scan (reduction) - // [unroll] - // for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) - // { - // scalar_t reduce_val; - // scratchAccessor.template get(0,reduce_val); - // dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, hlsl::promote(reduce_val)); - // } scalar_t reduce_val; scratchAccessor.template get(0,reduce_val); return reduce_val; @@ -187,7 +179,9 @@ struct scan scratchAccessor.template 
get(i*Config::SubgroupsPerVirtualWorkgroup+prevIndex,lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); - scratchAccessor.template set(invocationIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template set(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -199,14 +193,16 @@ struct scan dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); + const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); scalar_t left; - scratchAccessor.template get(virtualSubgroupID,left); + scratchAccessor.template get(bankedIndex,left); if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) - value[Config::ItemsPerInvocation_0-i-1] = binop(left, hlsl::mix(value[Config::ItemsPerInvocation_0-i-2], left_last_elem, (Config::ItemsPerInvocation_0-i-1==0))); + for (uint32_t i = Config::ItemsPerInvocation_0-1; i > 0; i--) + value[i] = binop(left, value[i-1]); + value[0] = binop(left, left_last_elem); } else { @@ -285,14 +281,6 @@ struct reduce } scratchAccessor.workgroupExecutionAndMemoryBarrier(); - // set as last element in scan (reduction) - // [unroll] - // for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) - // { - // scalar_t reduce_val; - // scratchAccessor.template get(0,reduce_val); - // dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); - // } scalar_t reduce_val; 
scratchAccessor.template get(0,reduce_val); return reduce_val; From 756f90dfbe77f6532b9abe5d198c19d5f303bbe3 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Wed, 21 May 2025 12:00:48 +0200 Subject: [PATCH 156/346] Update build-nabla.yml, go unprotected 0x45 to avoid scans slowing down builds --- .github/workflows/build-nabla.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index bd50b79499..3faa976c8a 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -25,6 +25,18 @@ jobs: tag: ['17.13.6'] steps: + - name: Environment Setup + run: | + Add-MpPreference -ExclusionPath "${{ github.workspace }}" + Add-MpPreference -ExclusionExtension "*.*" + Add-MpPreference -ExclusionProcess "docker.exe" + Add-MpPreference -ExclusionProcess "dockerd.exe" + Set-MpPreference -RemediationScheduleDay 8 + Set-MpPreference -DisableRealtimeMonitoring $true + Set-MpPreference -DisableRemovableDriveScanning $true + Set-MpPreference -DisableArchiveScanning $true + Set-MpPreference -DisableScanningMappedNetworkDrivesForFullScan $true + - name: Checkout uses: actions/checkout@v4 with: From e2ea8d46fa069f72995c38a9ef2d33090daf68b9 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Wed, 21 May 2025 13:19:04 +0200 Subject: [PATCH 157/346] Update build-nabla.yml, correct install directory --- .github/workflows/build-nabla.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 3faa976c8a..0af67ba08d 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -15,7 +15,7 @@ jobs: cmd: -NoLogo -NoProfile -ExecutionPolicy Bypass mount: C:\mount\nabla binary: C:\mount\nabla\build-ct - install: C:\mount\nabla\build-ct\install + install: build-ct\install 
strategy: fail-fast: false From 66a49ab55ea090db5fdd9d3dc9b9408cdf384fa7 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Wed, 21 May 2025 14:31:23 +0200 Subject: [PATCH 158/346] Update build-nabla.yml, update shell for prefix setup --- .github/workflows/build-nabla.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 0af67ba08d..be333c8f7b 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -44,8 +44,9 @@ jobs: - name: Set prefix id: set-prefix + shell: bash run: | - echo "prefix=run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" >> $GITHUB_OUTPUT + echo "prefix=run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" >> "$GITHUB_OUTPUT" - name: Pull Image run: | From 731f0776abf094ba22af69db0c401fb3fe0f85ec Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 21 May 2025 15:40:45 +0200 Subject: [PATCH 159/346] fix various sync bugs in AS building --- src/nbl/video/utilities/CAssetConverter.cpp | 38 ++++++++++++--------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 7c325cb17d..b357e2e2bb 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2868,7 +2868,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (const auto& geom : geoms) if (const auto triCount=*(pPrimitiveCounts++); triCount) { - auto size = geom.vertexStride*(geom.vertexData[1] ? 2:1)*geom.maxVertex; + auto size = geom.vertexStride*(geom.vertexData[1] ? 
2:1)*(geom.maxVertex+1); uint16_t alignment = hlsl::max(0x1u< SReserveResult size = core::alignUp(size,indexSize)+triCount*3*indexSize; alignment = hlsl::max(indexSize,alignment); } - inputs.logger.log("%p Triangle Data Size %d Align %d",system::ILogger::ELL_DEBUG,as,size,alignment); + //inputs.logger.log("%p Triangle Data Size %d Align %d",system::ILogger::ELL_DEBUG,as,size,alignment); incrementBuildSize(size,alignment); } } @@ -2908,7 +2908,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } // incrementBuildSize(sizes.buildScratchSize,device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment); - inputs.logger.log("%p Scratch Size %d Combined %d",system::ILogger::ELL_DEBUG,as,sizes.buildScratchSize,buildSize); + //inputs.logger.log("%p Scratch Size %d Combined %d",system::ILogger::ELL_DEBUG,as,sizes.buildScratchSize,buildSize); // we need to save the buffer in a side-channel for later auto& out = accelerationStructureParams[IsTLAS][entry.second.firstCopyIx+i]; @@ -4632,7 +4632,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul auto* scratchBuffer = params.scratchForDeviceASBuild->getBuffer(); core::vector flushRanges; const bool manualFlush = scratchBuffer->getBoundMemory().memory->haveToMakeVisible(); - if (manualFlush) // TLAS builds do max 2 writes each and BLAS do much more anyway + if (deviceASBuildScratchPtr && manualFlush) // TLAS builds do max 2 writes each and BLAS do much more anyway flushRanges.reserve(asCount*2); // lambdas! 
auto streamDataToScratch = [&](const size_t offset, const size_t size,IUtilities::IUpstreamingDataProducer& callback) -> bool @@ -4644,10 +4644,14 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul flushRanges.emplace_back(scratchBuffer->getBoundMemory().memory,offset,size,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); return true; } - else if (const SBufferRange range={.offset=offset,.size=size,.buffer=smart_refctd_ptr(scratchBuffer)}; params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,callback)) - return true; else - return false; + { + const SBufferRange range={.offset=offset,.size=size,.buffer=smart_refctd_ptr(scratchBuffer)}; + const bool retval = params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,callback); + // current recording buffer may have changed + xferCmdBuf = params.transfer->getCommandBufferForRecording(); + return retval; + } }; // core::vector buildInfos; @@ -4849,7 +4853,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul for (const auto& geom : canonical->getTriangleGeometries()) if (const auto triCount=*(pPrimitiveCounts++); triCount) { - auto size = geom.vertexStride*(geom.vertexData[1] ? 2:1)*geom.maxVertex; + auto size = geom.vertexStride*(geom.vertexData[1] ? 
2:1)*(geom.maxVertex+1); uint16_t alignment = hlsl::max(0x1u< CAssetConverter::convert_impl(SReserveResul allocSizes.push_back(size); alignments.push_back(alignment); const auto tmp = asToBuild.second.scratchSize; - logger.log("%p Triangle Data Size %d Align %d Scratch Size %d",system::ILogger::ELL_DEBUG,canonical.get(),size,alignment,tmp); + //logger.log("%p Triangle Data Size %d Align %d Scratch Size %d",system::ILogger::ELL_DEBUG,canonical.get(),size,alignment,tmp); } } } @@ -4884,7 +4888,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // allocate out scratch or submit overflow, if fail then flush and keep trying till space is made auto* offsets = allocOffsets.data()+allocOffsets.size()-alignments.size(); const auto* sizes = allocSizes.data()+allocSizes.size()-alignments.size(); - logger.log("%p Combined Size %d",system::ILogger::ELL_DEBUG,canonical.get(),std::accumulate(sizes,sizes+alignments.size(),0)); + //logger.log("%p Combined Size %d",system::ILogger::ELL_DEBUG,canonical.get(),std::accumulate(sizes,sizes+alignments.size(),0)); for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(alignments.size(),offsets,sizes,alignments.data())!=0; t++) { if (t==1) // don't flush right away cause allocator not defragmented yet @@ -5042,8 +5046,10 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul if (const auto triCount=*(pPrimitiveCounts++); triCount) { auto& outGeom = triangles.emplace_back(); - auto offset = *(offsetIt++); - auto size = geom.vertexStride*geom.maxVertex; + const auto origSize = *(sizeIt++); + const auto origOffset = *(offsetIt++); + auto offset = origOffset; + auto size = geom.vertexStride*(geom.maxVertex+1); for (auto i=0; i<2; i++) if (geom.vertexData[i]) // could assert that it must be true for i==0 { @@ -5073,11 +5079,13 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul size = triCount*3*alignment; memcpyCallback.data = 
reinterpret_cast(geom.indexData.buffer->getPointer())+geom.indexData.offset; success = streamDataToScratch(offset,size,memcpyCallback); + offset += size; break; } default: break; } + assert(offset-origOffset<=origSize); if (!success) break; outGeom.maxVertex = geom.maxVertex; @@ -5091,8 +5099,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul success = pPrimitiveCounts==primitiveCounts.data()+primitiveCounts.size(); rangeInfos.push_back(reinterpret_cast(geometryRangeInfoOffset)); } - // current recording buffer may have changed - xferCmdBuf = params.transfer->getCommandBufferForRecording(); if (!success) { rangeInfos.resize(buildInfos.size()); @@ -5161,7 +5167,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul ) { // clean AS builds, pipeline barrier, query reset and writes need to get executed before we start waiting on the results - drainCompute(); + drainBoth(); // get queries core::vector sizes(compactions.size()); if (!device->getQueryPoolResults(queryPool.get(),0,compactions.size(),sizes.data(),sizeof(size_t),bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT)|IQueryPool::RESULTS_FLAGS::_64_BIT)) @@ -5301,7 +5307,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul }; // submit because we want to launch BLAS builds in a separate submit, so the scratch semaphore can signal and free the scratch and more is available for TLAS builds if (pipelineBarrier(computeCmdBuf,{.memBarriers={&readBLASInTLASBuildBarrier,1}},"Failed to sync BLAS with TLAS build!")) - drainCompute(); + drainBoth(); else failedBLASBarrier = true; } From 11813217a82561331b5f53ad20b21a66e9ce9506 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 21 May 2025 16:14:16 +0200 Subject: [PATCH 160/346] make BLAS tracking actually work --- include/nbl/video/IGPUAccelerationStructure.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h index 
b7c1858130..32ad54159a 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -672,7 +672,7 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // this gets called when execution is sure to happen 100%, e.g. not during command recording but during submission inline build_ver_t registerNextBuildVer() { - return m_pendingBuildVer++; + return ++m_pendingBuildVer; } // using blas_smart_ptr_t = core::smart_refctd_ptr; From a639145bb2071855f83b4f2139c3a08203f09353 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 22 May 2025 11:56:29 +0700 Subject: [PATCH 161/346] fixes to 3-level scan and minor stuff --- .../hlsl/workgroup2/arithmetic_config.hlsl | 7 +-- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 62 ++++++++++++------- 2 files changed, 42 insertions(+), 27 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 5263a3fec8..04cbcaef4d 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -61,8 +61,8 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; // must have at least enough level 0 outputs to feed a single subgroup - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroup = uint16_t(0x1u) << SubgroupsPerVirtualWorkgroupLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t _SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t _SubgroupsPerVirtualWorkgroup = uint16_t(0x1u) << _SubgroupsPerVirtualWorkgroupLog2; using virtual_wg_t = impl::virtual_wg_size_log2; NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; @@ -83,7 +83,7 @@ struct ArithmeticConfiguration static uint32_t 
sharedMemCoalescedIndex(const uint32_t id, const uint32_t itemsPerInvocation) { - return (id & (itemsPerInvocation-1)) * SubgroupsPerVirtualWorkgroup + (id/itemsPerInvocation); + return (id & (itemsPerInvocation-1)) * SubgroupSize + (id/itemsPerInvocation); } }; @@ -96,7 +96,6 @@ struct is_configuration > : bool_constant { template NBL_CONSTEXPR bool is_configuration_v = is_configuration::value; - } } } diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index eca7ababd2..d44271a260 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -120,7 +120,7 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); + scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); lv1_val = reduction1(lv1_val); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) @@ -176,12 +176,12 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+prevIndex,lv1_val[i]); + scratchAccessor.template get(i*Config::SubgroupSize+prevIndex,lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); + scratchAccessor.template set(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -258,7 +258,7 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template 
get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); + scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); lv1_val = reduction1(lv1_val); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { @@ -275,7 +275,7 @@ struct reduce vector_lv2_t lv2_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv2_val[i]); + scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv2_val[i]); lv2_val = reduction2(lv2_val); scratchAccessor.template set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); } @@ -324,15 +324,20 @@ struct scan scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 1 scan - const uint32_t lv1_smem_size = Config::SubgroupsPerVirtualWorkgroup*Config::ItemsPerInvocation_1; + const uint32_t lv1_smem_size = Config::SubgroupsSize*Config::ItemsPerInvocation_1; subgroup2::inclusive_scan inclusiveScan1; if (glsl::gl_SubgroupID() < lv1_smem_size) { vector_lv1_t lv1_val; + const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); + scratchAccessor.template get(i*Config::SubgroupSize+prevIndex,lv1_val[i]); + lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template set(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) { const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); @@ -351,21 +356,30 @@ struct scan for 
(uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) scratchAccessor.template get(lv1_smem_size+i*Config::SubgroupSize+prevIndex,lv2_val[i]); lv2_val[0] = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val[0], bool(invocationIndex)); - vector_lv2_t shiftedScan = inclusiveScan2(lv2_val); - - // combine with level 1, only last element of each + lv2_val = inclusiveScan2(lv2_val); [unroll] - for (uint32_t i = 0; i < Config::SubgroupsPerVirtualWorkgroup; i++) - { - scalar_t last_val; - scratchAccessor.template get((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i),last_val); - scalar_t val = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(i)); - val = binop(last_val, shiftedScan[Config::ItemsPerInvocation_2-1]); - scratchAccessor.template set((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i), last_val); - } + for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) + scratchAccessor.template set(lv1_smem_size+i*Config::SubgroupSize+invocationIndex,lv2_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); + // combine with level 1 + if (glsl::gl_SubgroupID() < lv1_smem_size) + { + vector_lv1_t lv1_val; + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); + + scalar_t lv2_scan; + const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + scratchAccessor.template set(lv1_smem_size+bankedIndex, lv2_scan); + + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template set(i*Config::SubgroupSize+invocationIndex, binop(lv1_val[i],lv2_scan)); + } + // combine with level 0 [unroll] for (uint32_t 
idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) @@ -373,15 +387,17 @@ struct scan vector_lv0_t value; dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); // idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); - const scalar_t left; - scratchAccessor.template get(virtualSubgroupID, left); + const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); + const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); + scalar_t left; + scratchAccessor.template get(bankedIndex,left); if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) - value[Config::ItemsPerInvocation_0-i-1] = binop(left, hlsl::mix(value[Config::ItemsPerInvocation_0-i-2], left_last_elem, (Config::ItemsPerInvocation_0-i-1==0))); + for (uint32_t i = Config::ItemsPerInvocation_0-1; i > 0; i--) + value[i] = binop(left, value[i-1]); + value[0] = binop(left, left_last_elem); } else { From 7751359a78b5ba7dad595aa04515c4fce3042bf1 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 22 May 2025 15:14:50 +0700 Subject: [PATCH 162/346] some minor fixes --- examples_tests | 2 +- include/nbl/builtin/hlsl/subgroup2/ballot.hlsl | 2 ++ include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl | 5 +---- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/examples_tests b/examples_tests index 0ccd26fc93..13ae89f7d3 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 0ccd26fc93d22587219b12291f855929949cef74 +Subproject commit 13ae89f7d3fc666124486b5e18f13922995d3569 diff --git 
a/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl b/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl index 52ae6de2d9..3b511126b4 100644 --- a/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl +++ b/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl @@ -4,6 +4,8 @@ #ifndef _NBL_BUILTIN_HLSL_SUBGROUP2_BALLOT_INCLUDED_ #define _NBL_BUILTIN_HLSL_SUBGROUP2_BALLOT_INCLUDED_ +#include "nbl/builtin/hlsl/glsl_compat/subgroup_ballot.hlsl" + namespace nbl { namespace hlsl diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 04cbcaef4d..512641abb8 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -22,6 +22,7 @@ struct virtual_wg_size_log2 // static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2+4, "WorkgroupSize cannot be larger than SubgroupSize*16"); NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v+SubgroupSizeLog2; + // must have at least enough level 0 outputs to feed a single subgroup }; template @@ -60,10 +61,6 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; - // must have at least enough level 0 outputs to feed a single subgroup - NBL_CONSTEXPR_STATIC_INLINE uint16_t _SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; - NBL_CONSTEXPR_STATIC_INLINE uint16_t _SubgroupsPerVirtualWorkgroup = uint16_t(0x1u) << _SubgroupsPerVirtualWorkgroupLog2; - using virtual_wg_t = impl::virtual_wg_size_log2; NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << virtual_wg_t::value; From 
9f43c02bab2d70e4f59ce7a2f50b9580e2583691 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 22 May 2025 15:28:25 +0700 Subject: [PATCH 163/346] Return Subgroup size to IPipelineBase --- include/nbl/asset/ICPUGraphicsPipeline.h | 28 ++++++++++++------------ include/nbl/asset/ICPUPipeline.h | 2 ++ include/nbl/asset/IComputePipeline.h | 18 +-------------- include/nbl/asset/IPipeline.h | 15 +++++++++++++ 4 files changed, 32 insertions(+), 31 deletions(-) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index dcdcfb495e..4a7ee3b695 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -26,20 +26,6 @@ class ICPUGraphicsPipeline final : public ICPUPipeline(retval,core::dont_grab); } - inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final - { - auto* newPipeline = new ICPUGraphicsPipeline(layout.get()); - newPipeline->m_params = m_params; - newPipeline->m_renderpass = m_renderpass; - - for (auto specInfo_i = 0u; specInfo_i < m_specInfos.size(); specInfo_i++) - { - newPipeline->m_specInfos[specInfo_i] = m_specInfos[specInfo_i].clone(depth); - } - - return core::smart_refctd_ptr(newPipeline, core::dont_grab); - } - constexpr static inline auto AssetType = ET_GRAPHICS_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } @@ -121,6 +107,20 @@ class ICPUGraphicsPipeline final : public ICPUPipeline clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + { + auto* newPipeline = new ICPUGraphicsPipeline(layout.get()); + newPipeline->m_params = m_params; + newPipeline->m_renderpass = m_renderpass; + + for (auto specInfo_i = 0u; specInfo_i < m_specInfos.size(); specInfo_i++) + { + newPipeline->m_specInfos[specInfo_i] = m_specInfos[specInfo_i].clone(depth); + } + + return core::smart_refctd_ptr(newPipeline, core::dont_grab); + } }; } diff --git a/include/nbl/asset/ICPUPipeline.h 
b/include/nbl/asset/ICPUPipeline.h index 8fe7e38391..435aca5d40 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -70,6 +70,8 @@ class ICPUPipelineBase core::smart_refctd_ptr shader = nullptr; std::string entryPoint = ""; + IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize = IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement + // Container choice implicitly satisfies: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 core::unordered_map entries; diff --git a/include/nbl/asset/IComputePipeline.h b/include/nbl/asset/IComputePipeline.h index 4f439d7100..9ccef877c3 100644 --- a/include/nbl/asset/IComputePipeline.h +++ b/include/nbl/asset/IComputePipeline.h @@ -9,26 +9,10 @@ namespace nbl::asset class IComputePipelineBase : public virtual core::IReferenceCounted { public: - // Nabla requires device's reported subgroup size to be between 4 and 128 - enum class SUBGROUP_SIZE : uint8_t - { - // No constraint but probably means `gl_SubgroupSize` is Dynamically Uniform - UNKNOWN = 0, - // Allows the Subgroup Uniform `gl_SubgroupSize` to be non-Dynamically Uniform and vary between Device's min and max - VARYING = 1, - // The rest we encode as log2(x) of the required value - REQUIRE_4 = 2, - REQUIRE_8 = 3, - REQUIRE_16 = 4, - REQUIRE_32 = 5, - REQUIRE_64 = 6, - REQUIRE_128 = 7 - }; struct SCachedCreationParams final { - SUBGROUP_SIZE requiredSubgroupSize : 3 = SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement - uint8_t requireFullSubgroups : 1 = false; + uint8_t requireFullSubgroups = false; }; }; diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h index eb64de0b0d..c458c34afe 100644 --- a/include/nbl/asset/IPipeline.h +++ b/include/nbl/asset/IPipeline.h @@ -105,6 +105,21 @@ class IPipelineBase }; using FLAGS = CreationFlags; + // Nabla requires device's reported subgroup size to 
be between 4 and 128 + enum class SUBGROUP_SIZE : uint8_t + { + // No constraint but probably means `gl_SubgroupSize` is Dynamically Uniform + UNKNOWN = 0, + // Allows the Subgroup Uniform `gl_SubgroupSize` to be non-Dynamically Uniform and vary between Device's min and max + VARYING = 1, + // The rest we encode as log2(x) of the required value + REQUIRE_4 = 2, + REQUIRE_8 = 3, + REQUIRE_16 = 4, + REQUIRE_32 = 5, + REQUIRE_64 = 6, + REQUIRE_128 = 7 + }; }; template From bae94c58e8c73a7b111d0edfa4017a8770803809 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 22 May 2025 15:29:56 +0700 Subject: [PATCH 164/346] Fix missing bracket for getLayout --- include/nbl/asset/ICPUPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 435aca5d40..c7fe9b49e0 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -125,7 +125,7 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe core::smart_refctd_ptr layout; if (_depth > 0u) - layout = core::smart_refctd_ptr_static_cast(getLayout->clone(_depth-1u)); + layout = core::smart_refctd_ptr_static_cast(getLayout()->clone(_depth - 1u)); return clone_impl(std::move(layout), _depth); } From 0d8fe94aefe5d820c43d26d9a6235951f2969c6b Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 22 May 2025 15:30:17 +0700 Subject: [PATCH 165/346] Return Subgroup Size to every SShaderSpecInfo --- include/nbl/video/IGPUPipeline.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index ff6d97f17b..f9a32786bf 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -91,6 +91,9 @@ class IGPUPipelineBase { const asset::IShader* shader = nullptr; std::string_view entryPoint = ""; + asset::IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize = asset::IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 
8 means no requirement + + // Container choice implicitly satisfies: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 const core::unordered_map* entries; From 4ed04c83eb5162747c7dce9514c9e445b4ffd941 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 22 May 2025 15:30:36 +0700 Subject: [PATCH 166/346] Fix stagePresence typo --- include/nbl/video/IGPUGraphicsPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index ae8924a1ab..c44ef5ceb1 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -53,7 +53,7 @@ class IGPUGraphicsPipeline : public IGPUPipeline Date: Thu, 22 May 2025 15:30:56 +0700 Subject: [PATCH 167/346] Move clone_impl to private --- include/nbl/asset/ICPURayTracingPipeline.h | 44 +++++++++++----------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 2b04a2f41b..ed2c5d2409 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -31,27 +31,7 @@ class ICPURayTracingPipeline final : public ICPUPipeline(retval,core::dont_grab); } - inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final - { - auto newPipeline = new ICPURayTracingPipeline(layout.get()); - newPipeline->m_raygen = m_raygen.clone(depth); - - auto cloneSpecInfos = [depth](const core::vector& specInfos) -> core::vector { - core::vector results; - results.resize(specInfos.size()); - for (auto specInfo_i = 0u; specInfo_i < specInfos.size(); specInfo_i++) - results[specInfo_i] = specInfos[specInfo_i].clone(depth); - return results; - }; - newPipeline->m_misses = cloneSpecInfos(m_misses); - newPipeline->m_hitGroups.anyHits = 
cloneSpecInfos(m_hitGroups.anyHits); - newPipeline->m_hitGroups.closestHits = cloneSpecInfos(m_hitGroups.closestHits); - newPipeline->m_hitGroups.intersections = cloneSpecInfos(m_hitGroups.intersections); - newPipeline->m_callables = cloneSpecInfos(m_callables); - - newPipeline->m_params = m_params; - return core::smart_refctd_ptr(newPipeline); - } + constexpr static inline auto AssetType = ET_RAYTRACING_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } @@ -118,6 +98,28 @@ class ICPURayTracingPipeline final : public ICPUPipelinem_callables) dependants.insert(callableInfo.shader.get()); return dependants; } + + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + { + auto newPipeline = new ICPURayTracingPipeline(layout.get()); + newPipeline->m_raygen = m_raygen.clone(depth); + + auto cloneSpecInfos = [depth](const core::vector& specInfos) -> core::vector { + core::vector results; + results.resize(specInfos.size()); + for (auto specInfo_i = 0u; specInfo_i < specInfos.size(); specInfo_i++) + results[specInfo_i] = specInfos[specInfo_i].clone(depth); + return results; + }; + newPipeline->m_misses = cloneSpecInfos(m_misses); + newPipeline->m_hitGroups.anyHits = cloneSpecInfos(m_hitGroups.anyHits); + newPipeline->m_hitGroups.closestHits = cloneSpecInfos(m_hitGroups.closestHits); + newPipeline->m_hitGroups.intersections = cloneSpecInfos(m_hitGroups.intersections); + newPipeline->m_callables = cloneSpecInfos(m_callables); + + newPipeline->m_params = m_params; + return core::smart_refctd_ptr(newPipeline); + } }; } From c01392c93f7f7490131b7bc9bb4aa56b2140ba34 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 22 May 2025 15:31:16 +0700 Subject: [PATCH 168/346] Implement getSpecInfoVec for ICPURayTracingPipeline --- include/nbl/asset/ICPURayTracingPipeline.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/include/nbl/asset/ICPURayTracingPipeline.h 
b/include/nbl/asset/ICPURayTracingPipeline.h index ed2c5d2409..5819099887 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -65,6 +65,28 @@ class ICPURayTracingPipeline final : public ICPUPipeline& getSpecInfoVec(hlsl::ShadeStage stage) + { + if (!isMutable()) return {}; + switch (stage) + { + // raygen is not stored as vector so we can't return it here. Use getSpecInfo + case hlsl::ShaderStage::ESS_MISS: + return m_misses; + case hlsl::ShaderStage::ESS_ANY_HIT: + return m_hitGroups.anyHits; + case hlsl::ShaderStage::ESS_CLOSEST_HIT: + return m_hitGroups.closestHits; + case hlsl::ShaderStage::ESS_INTERSECTION: + return m_hitGroups.intersections; + case hlsl::ShaderStage::ESS_CALLABLE: + return m_callables; + + } + return {}; + } + + inline virtual bool valid() const override final { // TODO(kevinyu): Fix this temporary dummy code From fd6f527f55b6cea8f4912642c92cb9fc572aa41a Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 22 May 2025 17:03:32 +0700 Subject: [PATCH 169/346] latest example --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 13ae89f7d3..a8774db88d 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 13ae89f7d3fc666124486b5e18f13922995d3569 +Subproject commit a8774db88d1d08d0a3fe9f2a30e7dc376120493a From 9a3cc695fbcb7508c0266fe7798ced0b18f9e9ed Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 22 May 2025 12:28:28 +0200 Subject: [PATCH 170/346] default AS patch constructor default values so patches merge correctly --- include/nbl/video/utilities/CAssetConverter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index 182b025ada..682b3887a0 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -175,7 +175,7 @@ class 
CAssetConverter : public core::IReferenceCounted //! select build flags uint8_t allowUpdate : 1 = false; uint8_t allowCompaction : 1 = false; - BuildPreference preference : 2 = BuildPreference::Invalid; + BuildPreference preference : 2 = BuildPreference::None; uint8_t lowMemory : 1 = false; //! things that control the build uint8_t hostBuild : 1 = false; // DO NOT USE, will get overriden to false anyway @@ -187,7 +187,7 @@ class CAssetConverter : public core::IReferenceCounted template std::pair combine_impl(const CRTP& _this, const CRTP& other) const { - if (_this.preference!=other.preference || _this.preference==BuildPreference::Invalid) + if (_this.preference!=other.preference && _this.preference!=BuildPreference::None && other.preference!=BuildPreference::None) return {false,_this}; CRTP retval = _this; retval.isMotion |= other.isMotion; From 7b3c0edd4c40380caec7735f3f903483c156bfed Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 22 May 2025 18:35:51 +0700 Subject: [PATCH 171/346] Fix getSpecInfoVec --- include/nbl/asset/ICPURayTracingPipeline.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 5819099887..8be23ffe64 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -65,25 +65,25 @@ class ICPURayTracingPipeline final : public ICPUPipeline& getSpecInfoVec(hlsl::ShadeStage stage) + inline core::vector* getSpecInfoVec(hlsl::ShaderStage stage) { - if (!isMutable()) return {}; + if (!isMutable()) return nullptr; switch (stage) { // raygen is not stored as vector so we can't return it here. 
Use getSpecInfo case hlsl::ShaderStage::ESS_MISS: - return m_misses; + return &m_misses; case hlsl::ShaderStage::ESS_ANY_HIT: - return m_hitGroups.anyHits; + return &m_hitGroups.anyHits; case hlsl::ShaderStage::ESS_CLOSEST_HIT: - return m_hitGroups.closestHits; + return &m_hitGroups.closestHits; case hlsl::ShaderStage::ESS_INTERSECTION: - return m_hitGroups.intersections; + return &m_hitGroups.intersections; case hlsl::ShaderStage::ESS_CALLABLE: - return m_callables; + return &m_callables; } - return {}; + return nullptr; } From 96db32b8bcc55dd9dfc49d1ec9117fec4f329fdd Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 22 May 2025 18:36:09 +0700 Subject: [PATCH 172/346] Implement ICPURayTracingPipeline valid --- include/nbl/asset/ICPURayTracingPipeline.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 8be23ffe64..618c851883 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -89,7 +89,9 @@ class ICPURayTracingPipeline final : public ICPUPipelinevalid()) return false; + if (m_raygen.valid() == SShaderSpecInfo::INVALID_SPEC_INFO) return false; return true; } From 02c0d94b54e2ed0df8597e6157e5a73e54cbf94d Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 22 May 2025 14:17:28 +0200 Subject: [PATCH 173/346] forgot to overwrite staging cache XD --- src/nbl/video/utilities/CAssetConverter.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index b357e2e2bb..c69d373656 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -5276,6 +5276,9 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul auto& resultOutput = std::get>(reservations.m_gpuObjects); resultOutput[foundIx->second].value = compactedAS; } + // overwrite staging cache + auto pFound = 
findInStaging.template operator()(srcAS); + pFound->second.gpuRef = compactedAS; // insert into compaction map retval[srcAS] = std::move(compactedAS); } From 98f3153a21c755b30b0a9c89c28734ff1216426f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 23 May 2025 13:11:55 +0700 Subject: [PATCH 174/346] Fix ICPUSkeleton.h computeDependants --- include/nbl/asset/ICPUSkeleton.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h index 51be7acc5a..a29adbabbc 100644 --- a/include/nbl/asset/ICPUSkeleton.h +++ b/include/nbl/asset/ICPUSkeleton.h @@ -94,9 +94,7 @@ class ICPUSkeleton final : public ISkeleton, public IAsset requires(std::same_as, ICPUSkeleton>) static auto computeDependantsImpl(Self* self) { using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - core::unordered_set dependants; - return { self->m_defaultTransforms.buffer.get(), self->m_parentJointIDs.buffer.get() }; - return dependants; + return core::unordered_set{ self->m_defaultTransforms.buffer.get(), self->m_parentJointIDs.buffer.get() }; } }; From 30f35af1f9fdd14a48994862e3214c08d8c38710 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 23 May 2025 13:12:17 +0700 Subject: [PATCH 175/346] Small fixes --- include/nbl/asset/ICPUAccelerationStructure.h | 2 +- include/nbl/asset/ICPUDescriptorSetLayout.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 3ac794a888..73365cbfce 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -272,7 +272,7 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA inline core::unordered_set computeDependants() const override { core::unordered_set dependants; - for (const auto& instance : m_instances) + for (const auto& instance : *m_instances) 
dependants.insert(instance.getBase().blas.get()); return dependants; } diff --git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index b2c06792d6..aea1520b6f 100644 --- a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -78,7 +78,7 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; core::unordered_set dependants; if (!self->m_immutableSamplers) return dependants; - for (const auto& sampler: self->m_immutableSamplers) + for (const auto& sampler: *self->m_immutableSamplers) { dependants.insert(sampler.get()); } From 2983ff09b649e586867ebc417869f4422bd9a764 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 23 May 2025 13:12:32 +0700 Subject: [PATCH 176/346] Remove redundant final specifier --- include/nbl/asset/ICPUComputePipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index f6b689857f..27d16461a2 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -39,7 +39,7 @@ class ICPUComputePipeline final : public ICPUPipeline getSpecInfo(hlsl::ShaderStage stage) const override final + inline std::span getSpecInfo(hlsl::ShaderStage stage) const override { if (stage==hlsl::ShaderStage::ESS_COMPUTE) return {&m_specInfo,1}; From e218e7770e0c92c543f1ea017cd1204ac0375002 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 23 May 2025 13:12:55 +0700 Subject: [PATCH 177/346] Remove const so it can be cast to IAsset* --- include/nbl/asset/IPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h index c458c34afe..d2a85c42fb 100644 --- a/include/nbl/asset/IPipeline.h +++ b/include/nbl/asset/IPipeline.h @@ -133,7 +133,7 @@ class IPipeline : public IPipelineBase inline 
IPipeline(core::smart_refctd_ptr&& _layout) : m_layout(std::move(_layout)) {} - core::smart_refctd_ptr m_layout; + core::smart_refctd_ptr m_layout; }; } From b58e486d505f9dd2f030322b16a34e029e2964c1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 23 May 2025 13:13:15 +0700 Subject: [PATCH 178/346] Fix RenderpassIndependentPipeline --- include/nbl/asset/ICPURenderpassIndependentPipeline.h | 6 ++++++ include/nbl/asset/IRenderpassIndependentPipeline.h | 5 ----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index 628785d2ab..fbff6ee312 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -19,6 +19,12 @@ namespace nbl::asset class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, public IAsset { public: + struct SCreationParams + { + std::span shaders = {}; + SCachedCreationParams cached = {}; + }; + //(TODO) it is true however it causes DSs to not be cached when ECF_DONT_CACHE_TOP_LEVEL is set which isnt really intuitive constexpr static inline uint32_t DESC_SET_HIERARCHYLEVELS_BELOW = 0u; // TODO: @Crisspl HOW ON EARTH DOES THIS MAKE SENSE!? 
diff --git a/include/nbl/asset/IRenderpassIndependentPipeline.h b/include/nbl/asset/IRenderpassIndependentPipeline.h index 7f33b6abc4..feeaff7c99 100644 --- a/include/nbl/asset/IRenderpassIndependentPipeline.h +++ b/include/nbl/asset/IRenderpassIndependentPipeline.h @@ -28,11 +28,6 @@ class IRenderpassIndependentPipeline SRasterizationParams rasterization = {}; SBlendParams blend = {}; }; - struct SCreationParams - { - std::span shaders = {}; - SCachedCreationParams cached = {}; - }; inline const SCachedCreationParams& getCachedCreationParams() const {return m_cachedParams;} From 1f3a4775530484bca85ddf8dc46b5e0bc0c46aa1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 23 May 2025 13:15:23 +0700 Subject: [PATCH 179/346] Fix SpirvIntrospector --- include/nbl/asset/ICPUPipeline.h | 8 +++++--- include/nbl/asset/utils/CSPIRVIntrospector.h | 4 ++-- src/nbl/asset/utils/CSPIRVIntrospector.cpp | 20 ++++++++++---------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index c7fe9b49e0..9674b872e0 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -72,9 +72,10 @@ class ICPUPipelineBase IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize = IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement + using spec_constant_map_t = core::unordered_map; // Container choice implicitly satisfies: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 - core::unordered_map entries; + spec_constant_map_t entries; // By requiring Nabla Core Profile features we implicitly satisfy: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02784 // 
https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02785 @@ -92,7 +93,7 @@ class ICPUPipelineBase } }; - virtual std::span getSpecInfo(const hlsl::ShaderStage stage) const = 0; + virtual std::span getSpecInfo(hlsl::ShaderStage stage) const = 0; }; @@ -130,7 +131,8 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe return clone_impl(std::move(layout), _depth); } - inline std::span getSpecInfo(hlsl::ShaderStage stage) + // Note(kevinyu): For some reason overload resolution cannot find this function when I name id getSpecInfo. It always use the const variant. Will check on it later. + inline std::span getSpecInfoMut(hlsl::ShaderStage stage) { if (!isMutable()) return {}; const auto specInfo = const_cast(this)->getSpecInfo(stage); diff --git a/include/nbl/asset/utils/CSPIRVIntrospector.h b/include/nbl/asset/utils/CSPIRVIntrospector.h index 3d6455e020..fa497f08aa 100644 --- a/include/nbl/asset/utils/CSPIRVIntrospector.h +++ b/include/nbl/asset/utils/CSPIRVIntrospector.h @@ -582,7 +582,7 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable } // returns true if successfully added all the info to self, false if incompatible with what's already in our pipeline or incomplete (e.g. missing spec constants) - bool merge(const CStageIntrospectionData* stageData, const IPipelineBase::SShaderSpecInfo::spec_constant_map_t* specConstants=nullptr); + bool merge(const CStageIntrospectionData* stageData, const ICPUPipelineBase::SShaderSpecInfo::spec_constant_map_t* specConstants=nullptr); // core::smart_refctd_dynamic_array createPushConstantRangesFromIntrospection(core::smart_refctd_ptr& introspection); @@ -643,7 +643,7 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable } //! 
creates pipeline for a single IShader - core::smart_refctd_ptr createApproximateComputePipelineFromIntrospection(const IPipelineBase::SShaderSpecInfo& info, core::smart_refctd_ptr&& layout=nullptr); + core::smart_refctd_ptr createApproximateComputePipelineFromIntrospection(const ICPUPipelineBase::SShaderSpecInfo& info, core::smart_refctd_ptr&& layout=nullptr); #if 0 // wait until Renderpass Indep completely gone and Graphics Pipeline is used in a new way && Graphics Pipeline Libraries struct CShaderStages diff --git a/src/nbl/asset/utils/CSPIRVIntrospector.cpp b/src/nbl/asset/utils/CSPIRVIntrospector.cpp index 8b43c676b7..214ffdddbb 100644 --- a/src/nbl/asset/utils/CSPIRVIntrospector.cpp +++ b/src/nbl/asset/utils/CSPIRVIntrospector.cpp @@ -3,6 +3,8 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/asset/utils/CSPIRVIntrospector.h" + +#include "nbl/asset/ICPUPipeline.h" #include "nbl/asset/utils/spvUtils.h" #include "nbl_spirv_cross/spirv_parser.hpp" @@ -106,15 +108,15 @@ static CSPIRVIntrospector::CStageIntrospectionData::VAR_TYPE spvcrossType2E_TYPE } } -core::smart_refctd_ptr CSPIRVIntrospector::createApproximateComputePipelineFromIntrospection(const IPipelineBase::SShaderSpecInfo& info, core::smart_refctd_ptr&& layout/* = nullptr*/) +core::smart_refctd_ptr CSPIRVIntrospector::createApproximateComputePipelineFromIntrospection(const ICPUPipelineBase::SShaderSpecInfo& info, core::smart_refctd_ptr&& layout/* = nullptr*/) { - if (info.stage!=IShader::E_SHADER_STAGE::ESS_COMPUTE || info.valid()==IPipelineBase::SShaderSpecInfo::INVALID_SPEC_INFO) + if (info.valid()==ICPUPipelineBase::SShaderSpecInfo::INVALID_SPEC_INFO) return nullptr; CStageIntrospectionData::SParams params; params.entryPoint = info.entryPoint; params.shader = core::smart_refctd_ptr(info.shader); - params.stage = info.stage; + params.stage = hlsl::ShaderStage::ESS_COMPUTE; auto introspection = introspect(params); @@ -174,15 +176,13 @@ core::smart_refctd_ptr 
CSPIRVIntrospector::createApproximat layout = pplnIntrospectData->createApproximatePipelineLayoutFromIntrospection(introspection); } - ICPUComputePipeline::SCreationParams pplnCreationParams; - pplnCreationParams.layout = layout.get(); - pplnCreationParams.shader = info; - pplnCreationParams.layout = layout.get(); - return ICPUComputePipeline::create(pplnCreationParams); + auto pipeline = ICPUComputePipeline::create(layout.get()); + pipeline->getSpecInfoMut(hlsl::ShaderStage::ESS_COMPUTE)[0] = info; + return pipeline; } // returns true if successfully added all the info to self, false if incompatible with what's already in our pipeline or incomplete (e.g. missing spec constants) -NBL_API2 bool CSPIRVIntrospector::CPipelineIntrospectionData::merge(const CSPIRVIntrospector::CStageIntrospectionData* stageData, const IPipelineBase::SShaderSpecInfo::spec_constant_map_t* specConstants) +NBL_API2 bool CSPIRVIntrospector::CPipelineIntrospectionData::merge(const CSPIRVIntrospector::CStageIntrospectionData* stageData, const ICPUPipelineBase::SShaderSpecInfo::spec_constant_map_t* specConstants) { if (!stageData) return false; @@ -218,7 +218,7 @@ NBL_API2 bool CSPIRVIntrospector::CPipelineIntrospectionData::merge(const CSPIRV if (specConstantFound == specConstants->end()) return false; - descInfo.count = specConstantFound->second; + descInfo.count = (specConstantFound->second.size() != 0); } else { From 5b6e20e8f27af143870735f30ddd82068c2a8503 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 23 May 2025 10:55:07 +0200 Subject: [PATCH 180/346] keep a pending TLAS build BLAS tracking set linked list Make the Acceleration Structure Copy Structs strongly typed --- include/nbl/video/IGPUAccelerationStructure.h | 181 +++++++++++------- include/nbl/video/IGPUCommandBuffer.h | 51 ++++- include/nbl/video/ILogicalDevice.h | 30 ++- include/nbl/video/IQueue.h | 7 +- src/nbl/video/CVulkanAccelerationStructure.h | 33 ---- src/nbl/video/CVulkanCommandBuffer.cpp | 21 +- 
src/nbl/video/CVulkanCommandBuffer.h | 6 +- src/nbl/video/CVulkanLogicalDevice.cpp | 21 +- src/nbl/video/CVulkanLogicalDevice.h | 6 +- src/nbl/video/IGPUCommandBuffer.cpp | 49 +++-- src/nbl/video/IQueue.cpp | 32 ++-- src/nbl/video/utilities/CAssetConverter.cpp | 2 +- 12 files changed, 268 insertions(+), 171 deletions(-) diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h index 32ad54159a..68b4c1940b 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -98,39 +98,6 @@ class IGPUAccelerationStructure : public IBackendObject } }; - // copies - enum class COPY_MODE : uint8_t - { - CLONE = 0, - COMPACT = 1, - SERIALIZE = 2, - DESERIALIZE = 3, - }; - struct CopyInfo - { - const IGPUAccelerationStructure* src = nullptr; - IGPUAccelerationStructure* dst = nullptr; - COPY_MODE mode = COPY_MODE::CLONE; - }; - template requires (!std::is_const_v && std::is_base_of_v) - struct CopyToMemoryInfo - { - const IGPUAccelerationStructure* src = nullptr; - asset::SBufferBinding dst = nullptr; - COPY_MODE mode = COPY_MODE::SERIALIZE; - }; - using DeviceCopyToMemoryInfo = CopyToMemoryInfo; - using HostCopyToMemoryInfo = CopyToMemoryInfo; - template requires (!std::is_const_v && std::is_base_of_v) - struct CopyFromMemoryInfo - { - asset::SBufferBinding src = nullptr; - IGPUAccelerationStructure* dst = nullptr; - COPY_MODE mode = COPY_MODE::DESERIALIZE; - }; - using DeviceCopyFromMemoryInfo = CopyFromMemoryInfo; - using HostCopyFromMemoryInfo = CopyFromMemoryInfo; - // this will return false also if your deferred operation is not ready yet, so please use in combination with `isPending()` virtual bool wasCopySuccessful(const IDeferredOperation* const deferredOp) = 0; @@ -176,6 +143,30 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat inline bool usesMotion() const override {return m_params.flags.hasFlags(SCreationParams::FLAGS::MOTION_BIT);} 
+ // copies + struct CopyInfo + { + const IGPUBottomLevelAccelerationStructure* src = nullptr; + IGPUAccelerationStructure* dst = nullptr; + bool compact = false; + }; + template requires (!std::is_const_v && std::is_base_of_v) + struct CopyToMemoryInfo + { + const IGPUBottomLevelAccelerationStructure* src = nullptr; + asset::SBufferBinding dst = nullptr; + }; + using DeviceCopyToMemoryInfo = CopyToMemoryInfo; + using HostCopyToMemoryInfo = CopyToMemoryInfo; + template requires (!std::is_const_v && std::is_base_of_v) + struct CopyFromMemoryInfo + { + asset::SBufferBinding src = nullptr; + IGPUBottomLevelAccelerationStructure* dst = nullptr; + }; + using DeviceCopyFromMemoryInfo = CopyFromMemoryInfo; + using HostCopyFromMemoryInfo = CopyFromMemoryInfo; + // read the comments in the .hlsl file, AABB builds ignore certain fields using BuildRangeInfo = hlsl::acceleration_structures::bottom_level::BuildRangeInfo; // TODO: rename to GeometryRangeInfo, and make `BuildRangeInfo = const GeometryRangeInfo*` using DirectBuildRangeRangeInfos = const BuildRangeInfo* const*; @@ -388,6 +379,34 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // inline uint32_t getMaxInstanceCount() const {return m_maxInstanceCount;} + // copies + struct CopyInfo + { + const IGPUTopLevelAccelerationStructure* src = nullptr; + IGPUTopLevelAccelerationStructure* dst = nullptr; + bool compact = false; + }; + template requires (!std::is_const_v && std::is_base_of_v) + struct CopyToMemoryInfo + { + const IGPUTopLevelAccelerationStructure* src = nullptr; + asset::SBufferBinding dst = nullptr; + // [optional] Query the tracked BLASes + core::smart_refctd_dynamic_array> trackedBLASes = nullptr; + }; + using DeviceCopyToMemoryInfo = CopyToMemoryInfo; + using HostCopyToMemoryInfo = CopyToMemoryInfo; + template requires (!std::is_const_v && std::is_base_of_v) + struct CopyFromMemoryInfo + { + asset::SBufferBinding src = nullptr; + IGPUTopLevelAccelerationStructure* dst = 
nullptr; + // [optional] Provide info about what BLAS references to hold onto after the copy. For performance make sure the list is compact (without repeated elements). + std::span trackedBLASes = {}; + }; + using DeviceCopyFromMemoryInfo = CopyFromMemoryInfo; + using HostCopyFromMemoryInfo = CopyFromMemoryInfo; + // read the comments in the .hlsl file using BuildRangeInfo = hlsl::acceleration_structures::top_level::BuildRangeInfo; using DirectBuildRangeRangeInfos = const BuildRangeInfo*; @@ -677,61 +696,87 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // using blas_smart_ptr_t = core::smart_refctd_ptr; // returns number of tracked BLASes if `tracked==nullptr` otherwise writes `*count` tracked BLASes from `first` into `*tracked` - inline build_ver_t getTrackedBLASes(uint32_t* count, blas_smart_ptr_t* tracked, const uint32_t first=0) const + inline void getPendingBuildTrackedBLASes(uint32_t* count, blas_smart_ptr_t* tracked, const build_ver_t buildVer) const { if (!count) - return 0; + return; // stop multiple threads messing with us std::lock_guard lk(m_trackingLock); - const uint32_t toWrite = std::min(std::max(m_trackedBLASes.size(),first)-first,tracked ? 
(*count):0xffFFffFFu); - *count = toWrite; - if (tracked && toWrite) - { - auto it = m_trackedBLASes.begin(); - // cmon its an unordered map, iterator should have operator += - for (auto i=0; isize():0; + if (!tracked || !pBLASes) + return; + for (auto it=pBLASes->begin(); it!=pBLASes->end(); it++) + *(tracked++) = *(it++); } - // Useful if TLAS got built externally as well, returns if there were no later builds that preempted us setting the result here + // Useful if TLAS got built externally as well template - inline bool setTrackedBLASes(const Iterator begin, const Iterator end, const build_ver_t buildVer) + inline void insertTrackedBLASes(const Iterator begin, const Iterator end, const build_ver_t buildVer) { + if (buildVer==0) + return; // stop multiple threads messing with us std::lock_guard lk(m_trackingLock); - // stop out of order callbacks - if (buildVer<=m_completedBuildVer) - return false; - m_completedBuildVer = buildVer; - // release already tracked BLASes - m_trackedBLASes.clear(); - // sanity check, TODO: this should be an atomic_max on the `m_pendingBuildVer` - if (m_completedBuildVer>m_pendingBuildVer) - m_pendingBuildVer = m_completedBuildVer; + // insert in the right order + auto prev = m_pendingBuilds.before_begin(); + for (auto it=std::next(prev); it!=m_pendingBuilds.end()&&it->ordinal>buildVer; prev=it++) {} + auto inserted = m_pendingBuilds.emplace_after(prev); // now fill the contents - m_trackedBLASes.insert(begin,end); - return true; + inserted->BLASes.insert(begin,end); + inserted->ordinal = buildVer; + } + template + inline build_ver_t pushTrackedBLASes(const Iterator begin, const Iterator end) + { + const auto buildVer = registerNextBuildVer(); + insertTrackedBLASes(begin,end,buildVer); + return buildVer; } - // a little utility to make sure nothing from this build version and before gets tracked - inline bool clearTrackedBLASes(const build_ver_t buildVer) + // a little utility to make sure nothing from before this build version gets 
tracked + inline void clearTrackedBLASes(const build_ver_t buildVer) { - return setTrackedBLASes(nullptr,nullptr,buildVer); + // stop multiple threads messing with us + std::lock_guard lk(m_trackingLock); + clearTrackedBLASes_impl(buildVer); } protected: inline IGPUTopLevelAccelerationStructure(core::smart_refctd_ptr&& dev, SCreationParams&& params) : Base(), IGPUAccelerationStructure(std::move(dev),std::move(params)), - m_maxInstanceCount(params.maxInstanceCount),m_trackedBLASes() {} - + m_maxInstanceCount(params.maxInstanceCount) {} const uint32_t m_maxInstanceCount; + + private: + friend class IGPUCommandBuffer; + inline const core::unordered_set* getPendingBuildTrackedBLASes(const build_ver_t buildVer) const + { + const auto found = std::find_if(m_pendingBuilds.begin(),m_pendingBuilds.end(),[buildVer](const auto& item)->bool{return item.ordinal==buildVer;}); + if (found==m_pendingBuilds.end()) + return nullptr; + return &found->BLASes; + } + inline void clearTrackedBLASes_impl(const build_ver_t buildVer) + { + // find first element less or equal to `buildVer` + auto prev = m_pendingBuilds.before_begin(); + for (auto it=std::next(prev); it!=m_pendingBuilds.end()&&it->ordinal>=buildVer; prev=it++) {} + m_pendingBuilds.erase_after(prev,m_pendingBuilds.end()); + } + + std::atomic m_pendingBuildVer = 0; // TODO: maybe replace with new readers/writers lock mutable std::mutex m_trackingLock; - std::atomic m_pendingBuildVer = 0; - build_ver_t m_completedBuildVer = 0; - core::unordered_set m_trackedBLASes; + // TODO: this definitely needs improving with MultiEventTimelines (which also can track deferred Host ops) but then one needs to track semaphore signal-wait deps so we know what "state copy" a compaction wants + // Deferred Op must complete AFTER a submit, otherwise race condition. 
+ // If we make a linked list of pending builds, then we just need to pop completed builds (traverse until current found) + struct STrackingInfo + { + core::unordered_set BLASes; + // when the build got + build_ver_t ordinal; + }; + // a little misleading, the element is the most recently completed one + core::forward_list m_pendingBuilds; }; } diff --git a/include/nbl/video/IGPUCommandBuffer.h b/include/nbl/video/IGPUCommandBuffer.h index d5a3fac0af..98d98ab98a 100644 --- a/include/nbl/video/IGPUCommandBuffer.h +++ b/include/nbl/video/IGPUCommandBuffer.h @@ -321,9 +321,12 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject } //! acceleration structure transfers - bool copyAccelerationStructure(const IGPUAccelerationStructure::CopyInfo& copyInfo); - bool copyAccelerationStructureToMemory(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& copyInfo); - bool copyAccelerationStructureFromMemory(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo); + template requires std::is_base_of_v + bool copyAccelerationStructure(const AccelerationStructure::CopyInfo& copyInfo); + template requires std::is_base_of_v + bool copyAccelerationStructureToMemory(const AccelerationStructure::DeviceCopyToMemoryInfo& copyInfo); + template requires std::is_base_of_v + bool copyAccelerationStructureFromMemory(const AccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo); //! 
state setup bool bindComputePipeline(const IGPUComputePipeline* const pipeline); @@ -549,7 +552,31 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject bool executeCommands(const uint32_t count, IGPUCommandBuffer* const* const cmdbufs); // in case you want the commandbuffer to hold onto things as long as its not RESET - bool recordReferences(const std::span refs); + template + inline bool recordReferences(Iterator begin, const Iterator end) + { + auto oit = reserveReferences(std::distance(begin,end)); + if (oit) + while (begin!=end) + *(oit++) = core::smart_refctd_ptr(*(begin++)); + return oit; + } + inline bool recordReferences(const std::span refs) {return recordReferences(refs.begin(),refs.end());} + + // in case you want the commandbuffer to overwrite the BLAS tracking, e.g. you recorded TLAS building commands directly using `getNativeHandle()` to get the commandbuffer + template + inline bool recordBLASReferenceOverwrite(IGPUTopLevelAccelerationStructure* tlas, Iterator beginBLASes, const Iterator endBLASes) + { + const auto size = std::distance(beginBLASes,endBLASes); + auto oit = reserveReferences(size); + if (oit) + { + m_TLASToBLASReferenceSets[tlas] = {oit,size}; + while (beginBLASes!=endBLASes) + *(oit++) = core::smart_refctd_ptr(*(beginBLASes++)); + } + return oit; + } virtual bool insertDebugMarker(const char* name, const core::vector4df_SIMD& color = core::vector4df_SIMD(1.0, 1.0, 1.0, 1.0)) = 0; virtual bool beginDebugMarker(const char* name, const core::vector4df_SIMD& color = core::vector4df_SIMD(1.0, 1.0, 1.0, 1.0)) = 0; @@ -640,9 +667,9 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject const uint64_t* const pIndirectOffsets, const uint32_t* const pIndirectStrides, const uint32_t* const pMaxInstanceCounts ) = 0; - virtual bool copyAccelerationStructure_impl(const IGPUAccelerationStructure::CopyInfo& copyInfo) = 0; - virtual bool copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& 
copyInfo) = 0; - virtual bool copyAccelerationStructureFromMemory_impl(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) = 0; + virtual bool copyAccelerationStructure_impl(const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact) = 0; + virtual bool copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) = 0; + virtual bool copyAccelerationStructureFromMemory_impl(const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) = 0; virtual bool bindComputePipeline_impl(const IGPUComputePipeline* const pipeline) = 0; virtual bool bindGraphicsPipeline_impl(const IGPUGraphicsPipeline* const pipeline) = 0; @@ -875,12 +902,13 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject template requires nbl::is_any_of_v bool invalidDrawIndirectCount(const asset::SBufferBinding& indirectBinding, const asset::SBufferBinding& countBinding, const uint32_t maxDrawCount, const uint32_t stride); + core::smart_refctd_ptr* reserveReferences(const uint32_t size); // This bound descriptor set record doesn't include the descriptor sets whose layout has _any_ one of its bindings // created with IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT // or IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT. core::unordered_map m_boundDescriptorSetsRecord; - + // If the user wants the builds to be tracking, and make the TLAS remember the BLASes that have been built into it. // NOTE: We know that a TLAS may be rebuilt multiple times per frame on purpose and not only the final BLASes need to be kept alive till submission finishes. // However, the Command Pool already tracks resources referenced in the Build Infos, so we only need pointers into those records. 
@@ -905,6 +933,13 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject NBL_ENUM_ADD_BITWISE_OPERATORS(IGPUCommandBuffer::USAGE); #ifndef _NBL_VIDEO_I_GPU_COMMAND_BUFFER_CPP_ +extern template bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUBottomLevelAccelerationStructure::CopyInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUTopLevelAccelerationStructure::CopyInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyToMemoryInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyToMemoryInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); + extern template uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, IGPUBottomLevelAccelerationStructure::DirectBuildRangeRangeInfos, const IGPUBuffer* const ); diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index b23afa2679..0e36c9ace1 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -592,7 +592,20 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe { auto tlas = set.first; // we know the build is completed immediately after performing it, so we get our pending stamp then - tlas->setTrackedBLASes(set.second.begin(),set.second.end(),tlas->registerNextBuildVer()); + // ideally we should get our build version when the work of the deferred op gets executed for the first time + using iterator = decltype(set.second)::iterator; + struct CustomIterator + { + inline bool operator!=(const CustomIterator& other) const {return ptr!=other.ptr;} + + 
inline CustomIterator operator++() {return {ptr++};} + + inline const IGPUBottomLevelAccelerationStructure* operator*() const {return dynamic_cast(ptr->get());} + + iterator ptr; + }; + const auto buildVer = tlas->pushTrackedBLASes({set.second.begin()},{set.second.end()}); + tlas->clearTrackedBLASes(buildVer); } } @@ -657,7 +670,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return writeAccelerationStructuresProperties_impl(accelerationStructures,type,data,stride); } // Host-side copy, DEFERRAL IS NOT OPTIONAL - inline bool copyAccelerationStructure(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::CopyInfo& copyInfo) + template requires std::is_base_of_v + inline bool copyAccelerationStructure(IDeferredOperation* const deferredOperation, const AccelerationStructure::CopyInfo& copyInfo) { if (!acquireDeferredOperation(deferredOperation)) { @@ -679,7 +693,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return result!=DEFERRABLE_RESULT::SOME_ERROR; } - inline bool copyAccelerationStructureToMemory(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyToMemoryInfo& copyInfo) + template requires std::is_base_of_v + inline bool copyAccelerationStructureToMemory(IDeferredOperation* const deferredOperation, const AccelerationStructure::HostCopyToMemoryInfo& copyInfo) { if (!acquireDeferredOperation(deferredOperation)) { @@ -704,7 +719,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe }); return result!=DEFERRABLE_RESULT::SOME_ERROR; } - inline bool copyAccelerationStructureFromMemory(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyFromMemoryInfo& copyInfo) + template requires std::is_base_of_v + inline bool copyAccelerationStructureFromMemory(IDeferredOperation* const deferredOperation, const AccelerationStructure::HostCopyFromMemoryInfo& copyInfo) { if 
(!acquireDeferredOperation(deferredOperation)) { @@ -1122,9 +1138,9 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe const IGPUTopLevelAccelerationStructure::BuildRangeInfo* const pBuildRangeInfos, const uint32_t totalGeometryCount ) = 0; virtual bool writeAccelerationStructuresProperties_impl(const std::span accelerationStructures, const IQueryPool::TYPE type, size_t* data, const size_t stride) = 0; - virtual DEFERRABLE_RESULT copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::CopyInfo& copyInfo) = 0; - virtual DEFERRABLE_RESULT copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyToMemoryInfo& copyInfo) = 0; - virtual DEFERRABLE_RESULT copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyFromMemoryInfo& copyInfo) = 0; + virtual DEFERRABLE_RESULT copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact) = 0; + virtual DEFERRABLE_RESULT copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) = 0; + virtual DEFERRABLE_RESULT copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) = 0; virtual core::smart_refctd_ptr createShader_impl(const asset::ICPUShader* spirvShader) = 0; diff --git a/include/nbl/video/IQueue.h b/include/nbl/video/IQueue.h index 28336b15cc..c52e30517f 100644 --- a/include/nbl/video/IQueue.h +++ b/include/nbl/video/IQueue.h @@ -125,12 +125,7 @@ class IQueue : public core::Interface, public core::Unmovable class DeferredSubmitCallback final { // - struct STLASBuildMetadata - { - core::unordered_set m_BLASes; - 
uint32_t m_buildVer; - }; - core::unordered_map m_TLASToBLASReferenceSets; + core::unordered_map m_TLASBuilds; // using smart_ptr = core::smart_refctd_ptr; core::smart_refctd_dynamic_array m_resources; diff --git a/src/nbl/video/CVulkanAccelerationStructure.h b/src/nbl/video/CVulkanAccelerationStructure.h index 8041927fa2..4c0d67eee1 100644 --- a/src/nbl/video/CVulkanAccelerationStructure.h +++ b/src/nbl/video/CVulkanAccelerationStructure.h @@ -54,21 +54,6 @@ class CVulkanTopLevelAccelerationStructure final : public CVulkanAccelerationStr using Base::Base; }; - -//! all these utilities cannot be nested because of the complex inheritance between `IGPUAccelerationStructure` and the Vulkan classes -inline VkCopyAccelerationStructureModeKHR getVkCopyAccelerationStructureModeFrom(const IGPUAccelerationStructure::COPY_MODE in) -{ - return static_cast(in); -} -inline VkCopyAccelerationStructureInfoKHR getVkCopyAccelerationStructureInfoFrom(const IGPUAccelerationStructure::CopyInfo& copyInfo) -{ - VkCopyAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; - info.src = *reinterpret_cast(copyInfo.src->getNativeHandle()); - info.dst = *reinterpret_cast(copyInfo.dst->getNativeHandle()); - info.mode = getVkCopyAccelerationStructureModeFrom(copyInfo.mode); - return info; -} - template concept Buffer = is_any_of_v,IGPUBuffer,asset::ICPUBuffer>; @@ -91,24 +76,6 @@ inline DeviceOrHostAddress getVkDeviceOrHostAddress(const asset::SBu } return addr; } -template -inline VkCopyAccelerationStructureToMemoryInfoKHR getVkCopyAccelerationStructureToMemoryInfoFrom(const IGPUAccelerationStructure::CopyToMemoryInfo& copyInfo) -{ - VkCopyAccelerationStructureToMemoryInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_TO_MEMORY_INFO_KHR,nullptr }; - info.src = *reinterpret_cast(copyInfo.src->getNativeHandle()); - info.dst = getVkDeviceOrHostAddress(copyInfo.dst); - info.mode = getVkCopyAccelerationStructureModeFrom(copyInfo.mode); - 
return info; -} -template -inline VkCopyMemoryToAccelerationStructureInfoKHR getVkCopyMemoryToAccelerationStructureInfoFrom(const IGPUAccelerationStructure::CopyFromMemoryInfo& copyInfo) -{ - VkCopyMemoryToAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_MEMORY_TO_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; - info.src = getVkDeviceOrHostAddress(copyInfo.src); - info.dst = *reinterpret_cast(copyInfo.dst->getNativeHandle()); - info.mode = getVkCopyAccelerationStructureModeFrom(copyInfo.mode); - return info; -} inline VkGeometryFlagsKHR getVkGeometryFlagsFrom(const IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS in) { diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index b569a5fde2..b53c3c1537 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -377,22 +377,31 @@ bool CVulkanCommandBuffer::copyImage_impl(const IGPUImage* const srcImage, const } -bool CVulkanCommandBuffer::copyAccelerationStructure_impl(const IGPUAccelerationStructure::CopyInfo& copyInfo) +bool CVulkanCommandBuffer::copyAccelerationStructure_impl(const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact) { - const auto info = getVkCopyAccelerationStructureInfoFrom(copyInfo); + VkCopyAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; + info.src = *reinterpret_cast(src->getNativeHandle()); + info.dst = *reinterpret_cast(dst->getNativeHandle()); + info.mode = compact ? 
VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_KHR:VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR; getFunctionTable().vkCmdCopyAccelerationStructureKHR(m_cmdbuf,&info); return true; } -bool CVulkanCommandBuffer::copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) +bool CVulkanCommandBuffer::copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) { - const auto info = getVkCopyAccelerationStructureToMemoryInfoFrom(copyInfo); + VkCopyAccelerationStructureToMemoryInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_TO_MEMORY_INFO_KHR,nullptr }; + info.src = *reinterpret_cast(src->getNativeHandle()); + info.dst = getVkDeviceOrHostAddress(dst); + info.mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_SERIALIZE_KHR; getFunctionTable().vkCmdCopyAccelerationStructureToMemoryKHR(m_cmdbuf,&info); return true; } -bool CVulkanCommandBuffer::copyAccelerationStructureFromMemory_impl(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) +bool CVulkanCommandBuffer::copyAccelerationStructureFromMemory_impl(const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) { - const auto info = getVkCopyMemoryToAccelerationStructureInfoFrom(copyInfo); + VkCopyMemoryToAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_MEMORY_TO_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; + info.src = getVkDeviceOrHostAddress(src); + info.dst = *reinterpret_cast(dst->getNativeHandle()); + info.mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_DESERIALIZE_KHR; getFunctionTable().vkCmdCopyMemoryToAccelerationStructureKHR(m_cmdbuf,&info); return true; } diff --git a/src/nbl/video/CVulkanCommandBuffer.h b/src/nbl/video/CVulkanCommandBuffer.h index 634d8c4f2b..f31a79387d 100644 --- a/src/nbl/video/CVulkanCommandBuffer.h +++ b/src/nbl/video/CVulkanCommandBuffer.h @@ -177,9 +177,9 @@ class CVulkanCommandBuffer final : public IGPUCommandBuffer return true; } - bool 
copyAccelerationStructure_impl(const IGPUAccelerationStructure::CopyInfo& copyInfo) override; - bool copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) override; - bool copyAccelerationStructureFromMemory_impl(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) override; + bool copyAccelerationStructure_impl(const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact); + bool copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst); + bool copyAccelerationStructureFromMemory_impl(const asset::SBufferBinding& src, IGPUAccelerationStructure* dst); bool bindComputePipeline_impl(const IGPUComputePipeline* const pipeline) override; bool bindGraphicsPipeline_impl(const IGPUGraphicsPipeline* const pipeline) override; diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 2e30a18269..b27760699c 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -499,21 +499,30 @@ bool CVulkanLogicalDevice::writeAccelerationStructuresProperties_impl(const std: return m_devf.vk.vkWriteAccelerationStructuresPropertiesKHR(m_vkdev,vk_accelerationStructures.size(),vk_accelerationStructures.data(),static_cast(type),stride*accelerationStructures.size(),data,stride); } -auto CVulkanLogicalDevice::copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::CopyInfo& copyInfo) -> DEFERRABLE_RESULT +auto CVulkanLogicalDevice::copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact) -> DEFERRABLE_RESULT { - const auto info = getVkCopyAccelerationStructureInfoFrom(copyInfo); + VkCopyAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; + info.src = 
*reinterpret_cast(src->getNativeHandle()); + info.dst = *reinterpret_cast(dst->getNativeHandle()); + info.mode = compact ? VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_KHR:VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR; return getDeferrableResultFrom(m_devf.vk.vkCopyAccelerationStructureKHR(m_vkdev,static_cast(deferredOperation)->getInternalObject(),&info)); } -auto CVulkanLogicalDevice::copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyToMemoryInfo& copyInfo) -> DEFERRABLE_RESULT +auto CVulkanLogicalDevice::copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) -> DEFERRABLE_RESULT { - const auto info = getVkCopyAccelerationStructureToMemoryInfoFrom(copyInfo); + VkCopyAccelerationStructureToMemoryInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_TO_MEMORY_INFO_KHR,nullptr }; + info.src = *reinterpret_cast(src->getNativeHandle()); + info.dst = getVkDeviceOrHostAddress(dst); + info.mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_SERIALIZE_KHR; return getDeferrableResultFrom(m_devf.vk.vkCopyAccelerationStructureToMemoryKHR(m_vkdev,static_cast(deferredOperation)->getInternalObject(),&info)); } -auto CVulkanLogicalDevice::copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyFromMemoryInfo& copyInfo) -> DEFERRABLE_RESULT +auto CVulkanLogicalDevice::copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) -> DEFERRABLE_RESULT { - const auto info = getVkCopyMemoryToAccelerationStructureInfoFrom(copyInfo); + VkCopyMemoryToAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_MEMORY_TO_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; + info.src = getVkDeviceOrHostAddress(src); + info.dst = 
*reinterpret_cast(dst->getNativeHandle()); + info.mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_DESERIALIZE_KHR; return getDeferrableResultFrom(m_devf.vk.vkCopyMemoryToAccelerationStructureKHR(m_vkdev,static_cast(deferredOperation)->getInternalObject(),&info)); } diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index 0c5666fae5..06f95a4fc5 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -261,9 +261,9 @@ class CVulkanLogicalDevice final : public ILogicalDevice return getDeferrableResultFrom(m_devf.vk.vkBuildAccelerationStructuresKHR(m_vkdev,static_cast(deferredOperation)->getInternalObject(),infoCount,vk_buildGeomsInfos.data(),vk_ppBuildRangeInfos)); } bool writeAccelerationStructuresProperties_impl(const std::span accelerationStructures, const IQueryPool::TYPE type, size_t* data, const size_t stride) override; - DEFERRABLE_RESULT copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::CopyInfo& copyInfo) override; - DEFERRABLE_RESULT copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyToMemoryInfo& copyInfo) override; - DEFERRABLE_RESULT copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyFromMemoryInfo& copyInfo) override; + DEFERRABLE_RESULT copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact) override; + DEFERRABLE_RESULT copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) override; + DEFERRABLE_RESULT copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) override; // 
shaders core::smart_refctd_ptr createShader_impl(const asset::ICPUShader* spirvShader) override; diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index 6bde593097..5d3c889798 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -864,8 +864,8 @@ template uint32_t IGPUCommandBuffer::buildAccelerationStructures_common, IGPUTopLevelAccelerationStructure::MaxInputCounts* const, const IGPUBuffer* const ); - -bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUAccelerationStructure::CopyInfo& copyInfo) +template requires std::is_base_of_v +bool IGPUCommandBuffer::copyAccelerationStructure(const AccelerationStructure::CopyInfo& copyInfo) { if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::TRANSFER_BIT,RENDERPASS_SCOPE::OUTSIDE)) return false; @@ -888,10 +888,18 @@ bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUAccelerationStructur } m_noCommands = false; - return copyAccelerationStructure_impl(copyInfo); + const bool retval = copyAccelerationStructure_impl(copyInfo.src,copyInfo.dst,copyInfo.compact); + if constexpr (std::is_same_v) + { +// if (copyInfo.buildVer) + } + return retval; } +template bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUBottomLevelAccelerationStructure::CopyInfo&); +template bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUTopLevelAccelerationStructure::CopyInfo&); -bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) +template requires std::is_base_of_v +bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const AccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) { if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::TRANSFER_BIT,RENDERPASS_SCOPE::OUTSIDE)) return false; @@ -911,10 +919,17 @@ bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUAcceleration } m_noCommands = false; - return 
copyAccelerationStructureToMemory_impl(copyInfo); + const bool retval = copyAccelerationStructureToMemory_impl(copyInfo.src,copyInfo.dst); + if constexpr (std::is_same_v) + { + } + return retval; } +template bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyToMemoryInfo&); +template bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyToMemoryInfo&); -bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) +template requires std::is_base_of_v +bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const AccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) { if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::TRANSFER_BIT,RENDERPASS_SCOPE::OUTSIDE)) return false; @@ -934,8 +949,14 @@ bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUAccelerati } m_noCommands = false; - return copyAccelerationStructureFromMemory_impl(copyInfo); + const bool retval = copyAccelerationStructureFromMemory_impl(copyInfo.src,copyInfo.dst); + if constexpr (std::is_same_v) + { + } + return retval; } +template bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); +template bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); bool IGPUCommandBuffer::bindComputePipeline(const IGPUComputePipeline* const pipeline) @@ -2078,22 +2099,18 @@ bool IGPUCommandBuffer::executeCommands(const uint32_t count, IGPUCommandBuffer* return executeCommands_impl(count,cmdbufs); } -bool IGPUCommandBuffer::recordReferences(const std::span refs) +core::smart_refctd_ptr* IGPUCommandBuffer::reserveReferences(const uint32_t size) { if 
(!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::GRAPHICS_BIT|queue_flags_t::TRANSFER_BIT|queue_flags_t::SPARSE_BINDING_BIT)) - return false; + return nullptr; - auto cmd = m_cmdpool->m_commandListPool.emplace(m_commandList,refs.size()); + auto cmd = m_cmdpool->m_commandListPool.emplace(m_commandList,size); if (!cmd) { NBL_LOG_ERROR("out of host memory!"); - return false; + return nullptr; } - auto oit = cmd->getVariableCountResources(); - for (const auto& ref : refs) - *(oit++) = core::smart_refctd_ptr(ref); - - return true; + return cmd->getVariableCountResources(); } } \ No newline at end of file diff --git a/src/nbl/video/IQueue.cpp b/src/nbl/video/IQueue.cpp index e7612cc8d1..f5a4130825 100644 --- a/src/nbl/video/IQueue.cpp +++ b/src/nbl/video/IQueue.cpp @@ -156,12 +156,20 @@ IQueue::DeferredSubmitCallback::DeferredSubmitCallback(const SSubmitInfo& info) for (const auto& refSet : cb.cmdbuf->m_TLASToBLASReferenceSets) { const auto tlas = refSet.first; + using iterator = decltype(refSet.second)::iterator; + struct CustomIterator + { + inline bool operator!=(const CustomIterator& other) const {return ptr!=other.ptr;} + + inline CustomIterator operator++() {return {ptr++};} + + inline const IGPUBottomLevelAccelerationStructure* operator*() const {return dynamic_cast(ptr->get());} + + iterator ptr; + }; + const auto buildVer = tlas->pushTrackedBLASes({refSet.second.begin()},{refSet.second.end()}); // in theory could assert no duplicate entries, but thats obvious - auto& out = m_TLASToBLASReferenceSets[tlas]; - out.m_BLASes.reserve(refSet.second.size()); - for (const auto& refCtd : refSet.second) - out.m_BLASes.emplace(dynamic_cast(refCtd.get())); - out.m_buildVer = tlas->registerNextBuildVer(); + m_TLASBuilds[tlas] = buildVer; } } // We don't hold the last signal semaphore, because the timeline does as an Event trigger. 
@@ -174,10 +182,10 @@ IQueue::DeferredSubmitCallback::DeferredSubmitCallback(const SSubmitInfo& info) IQueue::DeferredSubmitCallback& IQueue::DeferredSubmitCallback::operator=(DeferredSubmitCallback&& other) { - m_TLASToBLASReferenceSets = std::move(other.m_TLASToBLASReferenceSets); + m_TLASBuilds = std::move(other.m_TLASBuilds); m_resources = std::move(other.m_resources); m_callback = std::move(other.m_callback); - other.m_TLASToBLASReferenceSets = {}; + other.m_TLASBuilds.clear(); other.m_resources = nullptr; other.m_callback = {}; return *this; @@ -186,13 +194,9 @@ IQueue::DeferredSubmitCallback& IQueue::DeferredSubmitCallback::operator=(Deferr // always exhaustive poll, because we need to get rid of resources ASAP void IQueue::DeferredSubmitCallback::operator()() { - // first update tracking info (needs resources alive) - for (const auto& refSet : m_TLASToBLASReferenceSets) - { - const auto tlas = refSet.first; - const auto& blases = refSet.second.m_BLASes; - tlas->setTrackedBLASes(blases.begin(),blases.end(),refSet.second.m_buildVer); - } + // all builds started before ours will now get overwritten (not exactly true, but without a better tracking system, this is the best we can do for now) + for (const auto& build : m_TLASBuilds) + build.first->clearTrackedBLASes(build.second); // then free all resources m_resources = nullptr; // then execute the callback diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index c69d373656..4d09a31eac 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -5261,7 +5261,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul compactedAS->setObjectDebugName(debugName.c_str()); } // record compaction - if (!computeCmdBuf->cmdbuf->copyAccelerationStructure({.src=srcAS,.dst=compactedAS.get(),.mode=IGPUAccelerationStructure::COPY_MODE::COMPACT})) + if 
(!computeCmdBuf->cmdbuf->copyAccelerationStructure({.src=srcAS,.dst=compactedAS.get(),.compact=true})) { logFail("record Acceleration Structure compaction",compactedAS.get()); continue; From 0f42726948f4c389a84d0bc68ab84f82b377c987 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 23 May 2025 12:04:27 +0200 Subject: [PATCH 181/346] implemented BLAS tracking for TLAS device-side copies --- include/nbl/video/IGPUAccelerationStructure.h | 13 ++-- include/nbl/video/IGPUCommandBuffer.h | 30 ++++++-- src/nbl/video/IGPUCommandBuffer.cpp | 22 +++--- src/nbl/video/IQueue.cpp | 74 +++++++++++++++---- 4 files changed, 105 insertions(+), 34 deletions(-) diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h index 68b4c1940b..1b851093e2 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -379,6 +379,9 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // inline uint32_t getMaxInstanceCount() const {return m_maxInstanceCount;} + // + using blas_smart_ptr_t = core::smart_refctd_ptr; + // copies struct CopyInfo { @@ -392,7 +395,7 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr const IGPUTopLevelAccelerationStructure* src = nullptr; asset::SBufferBinding dst = nullptr; // [optional] Query the tracked BLASes - core::smart_refctd_dynamic_array> trackedBLASes = nullptr; + core::smart_refctd_dynamic_array trackedBLASes = nullptr; }; using DeviceCopyToMemoryInfo = CopyToMemoryInfo; using HostCopyToMemoryInfo = CopyToMemoryInfo; @@ -693,8 +696,6 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr { return ++m_pendingBuildVer; } - // - using blas_smart_ptr_t = core::smart_refctd_ptr; // returns number of tracked BLASes if `tracked==nullptr` otherwise writes `*count` tracked BLASes from `first` into `*tracked` inline void getPendingBuildTrackedBLASes(uint32_t* count, 
blas_smart_ptr_t* tracked, const build_ver_t buildVer) const { @@ -703,10 +704,12 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // stop multiple threads messing with us std::lock_guard lk(m_trackingLock); auto pBLASes = getPendingBuildTrackedBLASes(buildVer); + const auto origCount = *count; *count = pBLASes ? pBLASes->size():0; if (!tracked || !pBLASes) return; - for (auto it=pBLASes->begin(); it!=pBLASes->end(); it++) + auto it = pBLASes->begin(); + for (auto i = 0; i* getPendingBuildTrackedBLASes(const build_ver_t buildVer) const { const auto found = std::find_if(m_pendingBuilds.begin(),m_pendingBuilds.end(),[buildVer](const auto& item)->bool{return item.ordinal==buildVer;}); diff --git a/include/nbl/video/IGPUCommandBuffer.h b/include/nbl/video/IGPUCommandBuffer.h index 98d98ab98a..e1e672e838 100644 --- a/include/nbl/video/IGPUCommandBuffer.h +++ b/include/nbl/video/IGPUCommandBuffer.h @@ -571,7 +571,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject auto oit = reserveReferences(size); if (oit) { - m_TLASToBLASReferenceSets[tlas] = {oit,size}; + m_TLASTrackingOps.emplace_back(TLASTrackingWrite{.src={oit,size},.dst=tlas}); while (beginBLASes!=endBLASes) *(oit++) = core::smart_refctd_ptr(*(beginBLASes++)); } @@ -750,7 +750,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject m_state = STATE::INITIAL; m_boundDescriptorSetsRecord.clear(); - m_TLASToBLASReferenceSets.clear(); + m_TLASTrackingOps.clear(); m_boundGraphicsPipeline= nullptr; m_boundComputePipeline= nullptr; m_boundRayTracingPipeline= nullptr; @@ -768,7 +768,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject { deleteCommandList(); m_boundDescriptorSetsRecord.clear(); - m_TLASToBLASReferenceSets.clear(); + m_TLASTrackingOps.clear(); m_boundGraphicsPipeline= nullptr; m_boundComputePipeline= nullptr; m_boundRayTracingPipeline= nullptr; @@ -909,10 +909,26 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject // or 
IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT. core::unordered_map m_boundDescriptorSetsRecord; - // If the user wants the builds to be tracking, and make the TLAS remember the BLASes that have been built into it. - // NOTE: We know that a TLAS may be rebuilt multiple times per frame on purpose and not only the final BLASes need to be kept alive till submission finishes. - // However, the Command Pool already tracks resources referenced in the Build Infos, so we only need pointers into those records. - core::unordered_map>> m_TLASToBLASReferenceSets; + // If the user wants the builds and copies to be tracking, and make the TLAS remember the BLASes that have been built into it. + // The Command Pool already tracks resources referenced in the Build Infos or Copies From Memory (Deserializations), so we only need pointers into those records. + struct TLASTrackingWrite + { + std::span> src; + IGPUTopLevelAccelerationStructure* dst; + }; + struct TLASTrackingCopy + { + const IGPUTopLevelAccelerationStructure* src; + IGPUTopLevelAccelerationStructure* dst; + }; + struct TLASTrackingRead + { + const IGPUTopLevelAccelerationStructure* src; + // For a copy to memory (Serialization), we need to dump the BLASes references + core::smart_refctd_dynamic_array dst; + }; + // operations as they'll be performed in order + core::vector> m_TLASTrackingOps; const IGPUGraphicsPipeline* m_boundGraphicsPipeline; const IGPUComputePipeline* m_boundComputePipeline; diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index 5d3c889798..40c5ea1e3b 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -842,10 +842,7 @@ uint32_t IGPUCommandBuffer::buildAccelerationStructures_common(const std::span) { const auto blasCount = info.trackedBLASes.size(); - if (blasCount) - m_TLASToBLASReferenceSets[info.dstAS] = {oit-blasCount,blasCount}; - else - m_TLASToBLASReferenceSets[info.dstAS] = 
{}; + m_TLASTrackingOps.emplace_back(TLASTrackingWrite{.src={oit-blasCount,blasCount},.dst=info.dstAS}); } } @@ -890,9 +887,7 @@ bool IGPUCommandBuffer::copyAccelerationStructure(const AccelerationStructure::C m_noCommands = false; const bool retval = copyAccelerationStructure_impl(copyInfo.src,copyInfo.dst,copyInfo.compact); if constexpr (std::is_same_v) - { -// if (copyInfo.buildVer) - } + m_TLASTrackingOps.emplace_back(TLASTrackingCopy{.src=copyInfo.src,.dst=copyInfo.dst}); return retval; } template bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUBottomLevelAccelerationStructure::CopyInfo&); @@ -921,8 +916,7 @@ bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const AccelerationStru m_noCommands = false; const bool retval = copyAccelerationStructureToMemory_impl(copyInfo.src,copyInfo.dst); if constexpr (std::is_same_v) - { - } + m_TLASTrackingOps.emplace_back(TLASTrackingRead{.src=copyInfo.src,.dst=copyInfo.trackedBLASes}); return retval; } template bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyToMemoryInfo&); @@ -952,6 +946,16 @@ bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const AccelerationSt const bool retval = copyAccelerationStructureFromMemory_impl(copyInfo.src,copyInfo.dst); if constexpr (std::is_same_v) { + const auto size = copyInfo.trackedBLASes.size(); + auto oit = reserveReferences(size); + if (oit) + { + m_TLASTrackingOps.emplace_back(TLASTrackingWrite{.src={oit,size},.dst=copyInfo.dst}); + for (const auto& blas : copyInfo.trackedBLASes) + *(oit++) = core::smart_refctd_ptr(blas); + } + else + NBL_LOG_ERROR("out of host memory for BLAS tracking references, TLAS will be copied from memory without BLAS tracking data!"); } return retval; } diff --git a/src/nbl/video/IQueue.cpp b/src/nbl/video/IQueue.cpp index f5a4130825..256233dc91 100644 --- a/src/nbl/video/IQueue.cpp +++ b/src/nbl/video/IQueue.cpp @@ -149,27 +149,75 @@ 
IQueue::DeferredSubmitCallback::DeferredSubmitCallback(const SSubmitInfo& info) auto outRes = m_resources->data(); for (const auto& sema : info.waitSemaphores) *(outRes++) = smart_ptr(sema.semaphore); + // track our own versions + core::unordered_map m_readTLASVersions; + // get the TLAS BLAS tracking info and assign a pending build version number + for (const auto& cb : info.commandBuffers) + for (const auto& var : cb.cmdbuf->m_TLASTrackingOps) + { + const IGPUTopLevelAccelerationStructure* src = nullptr; + switch (var.index()) + { + case 1: + src = std::get<1>(var).src; + break; + case 2: + src = std::get<2>(var).src; + break; + } + if (src) + m_readTLASVersions.insert({src,src->getPendingBuildVer()}); + } for (const auto& cb : info.commandBuffers) { *(outRes++) = smart_ptr(cb.cmdbuf); - // get the TLAS BLAS tracking info and assign a pending build version number - for (const auto& refSet : cb.cmdbuf->m_TLASToBLASReferenceSets) + for (const auto& var : cb.cmdbuf->m_TLASTrackingOps) + switch (var.index()) { - const auto tlas = refSet.first; - using iterator = decltype(refSet.second)::iterator; - struct CustomIterator + case 0: { - inline bool operator!=(const CustomIterator& other) const {return ptr!=other.ptr;} + const IGPUCommandBuffer::TLASTrackingWrite& op = std::get<0>(var); + using iterator = decltype(op.src)::iterator; + struct CustomIterator + { + inline bool operator!=(const CustomIterator& other) const { return ptr != other.ptr; } - inline CustomIterator operator++() {return {ptr++};} + inline CustomIterator operator++() { return { ptr++ }; } - inline const IGPUBottomLevelAccelerationStructure* operator*() const {return dynamic_cast(ptr->get());} + inline const IGPUBottomLevelAccelerationStructure* operator*() const { return dynamic_cast(ptr->get()); } - iterator ptr; - }; - const auto buildVer = tlas->pushTrackedBLASes({refSet.second.begin()},{refSet.second.end()}); - // in theory could assert no duplicate entries, but thats obvious - m_TLASBuilds[tlas] 
= buildVer; + iterator ptr; + }; + m_readTLASVersions[op.dst] = m_TLASBuilds[op.dst] = op.dst->pushTrackedBLASes({op.src.begin()},{op.src.end()}); + break; + } + case 1: + { + const IGPUCommandBuffer::TLASTrackingCopy& op = std::get<1>(var); + // not sure if even legal, but it would deadlock us + if (op.src==op.dst) + break; + const auto ver = m_readTLASVersions.find(op.src)->second; + // stop multiple threads messing with us + std::lock_guard lk(op.src->m_trackingLock); + const auto* pSrcBLASes = op.src->getPendingBuildTrackedBLASes(ver); + assert(pSrcBLASes); + m_readTLASVersions[op.dst] = m_TLASBuilds[op.dst] = op.dst->pushTrackedBLASes(pSrcBLASes->begin(),pSrcBLASes->end()); + break; + } + case 2: + { + const IGPUCommandBuffer::TLASTrackingRead& op = std::get<2>(var); + const auto ver = m_readTLASVersions.find(op.src)->second; + uint32_t count = op.dst->size(); + op.src->getPendingBuildTrackedBLASes(&count,op.dst->data(),ver); + if (count>op.dst->size()) + cb.cmdbuf->getOriginDevice()->getLogger()->log("BLAS output array too small, should be %d, only wrote out %d BLAS references to destination",system::ILogger::ELL_ERROR,count,op.dst->size()); + break; + } + default: + assert(false); + break; } } // We don't hold the last signal semaphore, because the timeline does as an Event trigger. 
From 302710fd4a5255b84f6e495b6e0dd398a8b45296 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 23 May 2025 12:53:17 +0200 Subject: [PATCH 182/346] clean up a bit and implement BLAS tracking info for Host Copies --- include/nbl/video/IGPUAccelerationStructure.h | 11 ++ include/nbl/video/ILogicalDevice.h | 101 +++++++++++++++--- include/nbl/video/IQueue.h | 2 +- src/nbl/video/IQueue.cpp | 23 ++-- 4 files changed, 104 insertions(+), 33 deletions(-) diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h index 1b851093e2..1bb4fb0c66 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -750,6 +750,17 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr const uint32_t m_maxInstanceCount; private: + struct DynamicUpCastingSpanIterator + { + inline bool operator!=(const DynamicUpCastingSpanIterator& other) const {return ptr!=other.ptr;} + + inline DynamicUpCastingSpanIterator operator++() {return {ptr++};} + + inline const IGPUBottomLevelAccelerationStructure* operator*() const {return dynamic_cast(ptr->get());} + + std::span>::iterator ptr; + }; + friend class ILogicalDevice; friend class IQueue; inline const core::unordered_set* getPendingBuildTrackedBLASes(const build_ver_t buildVer) const { diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 0e36c9ace1..34036e2ffc 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -593,18 +593,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe auto tlas = set.first; // we know the build is completed immediately after performing it, so we get our pending stamp then // ideally we should get our build version when the work of the deferred op gets executed for the first time - using iterator = decltype(set.second)::iterator; - struct CustomIterator - { - inline bool operator!=(const 
CustomIterator& other) const {return ptr!=other.ptr;} - - inline CustomIterator operator++() {return {ptr++};} - - inline const IGPUBottomLevelAccelerationStructure* operator*() const {return dynamic_cast(ptr->get());} - - iterator ptr; - }; - const auto buildVer = tlas->pushTrackedBLASes({set.second.begin()},{set.second.end()}); + const auto buildVer = tlas->pushTrackedBLASes({set.second.begin()},{set.second.end()}); tlas->clearTrackedBLASes(buildVer); } } @@ -622,10 +611,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe if constexpr (IsTLAS) { const auto blasCount = info.trackedBLASes.size(); - if (blasCount) - callback.m_TLASToBLASReferenceSets[info.dstAS] = {oit-blasCount,blasCount}; - else - callback.m_TLASToBLASReferenceSets[info.dstAS] = {}; + callback.m_TLASToBLASReferenceSets[info.dstAS] = {oit-blasCount,blasCount}; } } if constexpr (IsTLAS) @@ -685,10 +671,42 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } auto result = copyAccelerationStructure_impl(deferredOperation,copyInfo); if (result==DEFERRABLE_RESULT::DEFERRED) + { deferredOperation->m_resourceTracking.insert(deferredOperation->m_resourceTracking.begin(),{ core::smart_refctd_ptr(copyInfo.src), core::smart_refctd_ptr(copyInfo.dst) }); + constexpr bool IsTLAS = std::is_same_v; + if constexpr (IsTLAS) + { + struct TLASCallback + { + // upon completion set the BLASes tracked + inline void operator()(IDeferredOperation*) const + { + // not sure if even legal, but it would deadlock us + if (src==dst) + return; + uint32_t buildVer; + { + // stop multiple threads messing with us + std::lock_guard lk(src->m_trackingLock); + // we know the build is completed immediately after performing it, so we get our pending stamp then + // ideally we should get the BLAS set from the Source TLAS when the work of the deferred op gets executed for the first time + const auto* pSrcBLASes = src->getPendingBuildTrackedBLASes(src->getPendingBuildVer()); 
+ const std::span emptySpan = {}; + buildVer = pSrcBLASes ? dst->pushTrackedBLASes(pSrcBLASes->begin(),pSrcBLASes->end()):dst->pushTrackedBLASes(emptySpan.begin(),emptySpan.end()); + } + dst->clearTrackedBLASes(buildVer); + } + + // the rawpointers are already smartpointers in whatever else the `fillTracking` declared above writes + const IGPUTopLevelAccelerationStructure* src; + IGPUTopLevelAccelerationStructure* dst; + } callback = {.src=copyInfo.src,.dst=copyInfo.dst}; + deferredOperation->m_callback = std::move(callback); + } + } return result!=DEFERRABLE_RESULT::SOME_ERROR; @@ -713,10 +731,39 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } auto result = copyAccelerationStructureToMemory_impl(deferredOperation,copyInfo); if (result==DEFERRABLE_RESULT::DEFERRED) + { deferredOperation->m_resourceTracking.insert(deferredOperation->m_resourceTracking.begin(),{ core::smart_refctd_ptr(copyInfo.src), core::smart_refctd_ptr(copyInfo.dst.buffer) }); + constexpr bool IsTLAS = std::is_same_v; + if constexpr (IsTLAS) + { + struct TLASCallback + { + // upon completion set the BLASes tracked + inline void operator()(IDeferredOperation*) const + { + // stop multiple threads messing with us + std::lock_guard lk(src->m_trackingLock); + // we know the build is completed immediately after performing it, so we get our pending stamp then + // ideally we should get the BLAS set from the Source TLAS when the work of the deferred op gets executed for the first time + const auto ver = src->getPendingBuildVer(); + uint32_t count = dst->size(); + src->getPendingBuildTrackedBLASes(&count,dst->data(),ver); + if (count>dst->size()) + logger->log("BLAS output array too small, should be %d, only wrote out %d BLAS references to destination",system::ILogger::ELL_ERROR,count,dst->size()); + } + + // device keeps it alive for entire lifetime of the callback + system::ILogger* logger; + // the rawpointers are already smartpointers in whatever else the 
`fillTracking` declared above writes + const IGPUTopLevelAccelerationStructure* src; + core::smart_refctd_dynamic_array dst; + } callback = {.logger=m_logger.get(),.src=copyInfo.src,.dst=copyInfo.trackedBLASes}; + deferredOperation->m_callback = std::move(callback); + } + } return result!=DEFERRABLE_RESULT::SOME_ERROR; } template requires std::is_base_of_v @@ -739,10 +786,32 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } auto result = copyAccelerationStructureFromMemory_impl(deferredOperation,copyInfo); if (result==DEFERRABLE_RESULT::DEFERRED) + { deferredOperation->m_resourceTracking.insert(deferredOperation->m_resourceTracking.begin(),{ core::smart_refctd_ptr(copyInfo.src.buffer), core::smart_refctd_ptr(copyInfo.dst) }); + constexpr bool IsTLAS = std::is_same_v; + if constexpr (IsTLAS) + { + const size_t offset = deferredOperation->m_resourceTracking.size(); + deferredOperation->m_resourceTracking.insert(deferredOperation->m_resourceTracking.end(),copyInfo.trackedBLASes.begin(),copyInfo.trackedBLASes.end()); + struct TLASCallback + { + // upon completion set the BLASes tracked + inline void operator()(IDeferredOperation*) const + { + const auto buildVer = dst->pushTrackedBLASes({src->begin()},{src->end()}); + dst->clearTrackedBLASes(buildVer); + } + + // the rawpointers are already smartpointers in whatever else the `fillTracking` declared above writes + std::span> src; + IGPUTopLevelAccelerationStructure* dst; + } callback = {.src={deferredOperation->m_resourceTracking.data()+offset,copyInfo.trackedBLASes.size()},.dst=copyInfo.dst}; + deferredOperation->m_callback = std::move(callback); + } + } return result!=DEFERRABLE_RESULT::SOME_ERROR; } diff --git a/include/nbl/video/IQueue.h b/include/nbl/video/IQueue.h index c52e30517f..63073beb33 100644 --- a/include/nbl/video/IQueue.h +++ b/include/nbl/video/IQueue.h @@ -125,7 +125,7 @@ class IQueue : public core::Interface, public core::Unmovable class DeferredSubmitCallback final 
{ // - core::unordered_map m_TLASBuilds; + core::unordered_map m_TLASOverwrites; // using smart_ptr = core::smart_refctd_ptr; core::smart_refctd_dynamic_array m_resources; diff --git a/src/nbl/video/IQueue.cpp b/src/nbl/video/IQueue.cpp index 256233dc91..108f76183c 100644 --- a/src/nbl/video/IQueue.cpp +++ b/src/nbl/video/IQueue.cpp @@ -177,18 +177,9 @@ IQueue::DeferredSubmitCallback::DeferredSubmitCallback(const SSubmitInfo& info) case 0: { const IGPUCommandBuffer::TLASTrackingWrite& op = std::get<0>(var); - using iterator = decltype(op.src)::iterator; - struct CustomIterator - { - inline bool operator!=(const CustomIterator& other) const { return ptr != other.ptr; } - - inline CustomIterator operator++() { return { ptr++ }; } - inline const IGPUBottomLevelAccelerationStructure* operator*() const { return dynamic_cast(ptr->get()); } - - iterator ptr; - }; - m_readTLASVersions[op.dst] = m_TLASBuilds[op.dst] = op.dst->pushTrackedBLASes({op.src.begin()},{op.src.end()}); + using iterator = decltype(op.src)::iterator; + m_readTLASVersions[op.dst] = m_TLASOverwrites[op.dst] = op.dst->pushTrackedBLASes({op.src.begin()},{op.src.end()}); break; } case 1: @@ -201,8 +192,8 @@ IQueue::DeferredSubmitCallback::DeferredSubmitCallback(const SSubmitInfo& info) // stop multiple threads messing with us std::lock_guard lk(op.src->m_trackingLock); const auto* pSrcBLASes = op.src->getPendingBuildTrackedBLASes(ver); - assert(pSrcBLASes); - m_readTLASVersions[op.dst] = m_TLASBuilds[op.dst] = op.dst->pushTrackedBLASes(pSrcBLASes->begin(),pSrcBLASes->end()); + const std::span emptySpan = {}; + m_readTLASVersions[op.dst] = m_TLASOverwrites[op.dst] = pSrcBLASes ? 
op.dst->pushTrackedBLASes(pSrcBLASes->begin(),pSrcBLASes->end()):op.dst->pushTrackedBLASes(emptySpan.begin(),emptySpan.end()); break; } case 2: @@ -230,10 +221,10 @@ IQueue::DeferredSubmitCallback::DeferredSubmitCallback(const SSubmitInfo& info) IQueue::DeferredSubmitCallback& IQueue::DeferredSubmitCallback::operator=(DeferredSubmitCallback&& other) { - m_TLASBuilds = std::move(other.m_TLASBuilds); + m_TLASOverwrites = std::move(other.m_TLASOverwrites); m_resources = std::move(other.m_resources); m_callback = std::move(other.m_callback); - other.m_TLASBuilds.clear(); + other.m_TLASOverwrites.clear(); other.m_resources = nullptr; other.m_callback = {}; return *this; @@ -243,7 +234,7 @@ IQueue::DeferredSubmitCallback& IQueue::DeferredSubmitCallback::operator=(Deferr void IQueue::DeferredSubmitCallback::operator()() { // all builds started before ours will now get overwritten (not exactly true, but without a better tracking system, this is the best we can do for now) - for (const auto& build : m_TLASBuilds) + for (const auto& build : m_TLASOverwrites) build.first->clearTrackedBLASes(build.second); // then free all resources m_resources = nullptr; From 5813d1067919568c86ca8203fa8760237e82f381 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 23 May 2025 13:59:50 +0200 Subject: [PATCH 183/346] finish const correctness of Descriptor Set Layouts in Pipelines --- include/nbl/video/ILogicalDevice.h | 8 ++++---- src/nbl/video/CVulkanLogicalDevice.cpp | 10 +++++----- src/nbl/video/CVulkanLogicalDevice.h | 4 ++-- src/nbl/video/CVulkanPipelineLayout.h | 4 ++-- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index c2f2605d0b..c84461ef9f 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -837,8 +837,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe // Create a pipeline layout (@see ICPUPipelineLayout) 
core::smart_refctd_ptr createPipelineLayout( const std::span pcRanges={}, - core::smart_refctd_ptr&& _layout0=nullptr, core::smart_refctd_ptr&& _layout1=nullptr, - core::smart_refctd_ptr&& _layout2=nullptr, core::smart_refctd_ptr&& _layout3=nullptr + core::smart_refctd_ptr&& _layout0=nullptr, core::smart_refctd_ptr&& _layout1=nullptr, + core::smart_refctd_ptr&& _layout2=nullptr, core::smart_refctd_ptr&& _layout3=nullptr ) { if ((_layout0 && !_layout0->wasCreatedBy(this))) @@ -1217,8 +1217,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual core::smart_refctd_ptr createDescriptorSetLayout_impl(const std::span bindings, const uint32_t maxSamplersCount) = 0; virtual core::smart_refctd_ptr createPipelineLayout_impl( const std::span pcRanges, - core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, - core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3 + core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, + core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3 ) = 0; virtual core::smart_refctd_ptr createDescriptorPool_impl(const IDescriptorPool::SCreateInfo& createInfo) = 0; diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index b27760699c..bb2d6d6cb4 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -597,13 +597,13 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createDesc core::smart_refctd_ptr CVulkanLogicalDevice::createPipelineLayout_impl( const std::span pcRanges, - core::smart_refctd_ptr&& layout0, - core::smart_refctd_ptr&& layout1, - core::smart_refctd_ptr&& layout2, - core::smart_refctd_ptr&& layout3 + core::smart_refctd_ptr&& layout0, + core::smart_refctd_ptr&& layout1, + core::smart_refctd_ptr&& layout2, + core::smart_refctd_ptr&& layout3 ) { - const core::smart_refctd_ptr tmp[] = { layout0, layout1, layout2, layout3 }; + const core::smart_refctd_ptr tmp[] = { 
layout0, layout1, layout2, layout3 }; VkDescriptorSetLayout vk_dsLayouts[asset::ICPUPipelineLayout::DESCRIPTOR_SET_COUNT]; uint32_t nonNullSetLayoutCount = ~0u; diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index 06f95a4fc5..6386bdfa7c 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -272,8 +272,8 @@ class CVulkanLogicalDevice final : public ILogicalDevice core::smart_refctd_ptr createDescriptorSetLayout_impl(const std::span bindings, const uint32_t maxSamplersCount) override; core::smart_refctd_ptr createPipelineLayout_impl( const std::span pcRanges, - core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, - core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3 + core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, + core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3 ) override; // descriptor sets diff --git a/src/nbl/video/CVulkanPipelineLayout.h b/src/nbl/video/CVulkanPipelineLayout.h index d89d2a493c..ef46226fdb 100644 --- a/src/nbl/video/CVulkanPipelineLayout.h +++ b/src/nbl/video/CVulkanPipelineLayout.h @@ -15,8 +15,8 @@ class CVulkanPipelineLayout : public IGPUPipelineLayout public: CVulkanPipelineLayout( const ILogicalDevice* dev, const std::span _pcRanges, - core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, - core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3, + core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, + core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3, const VkPipelineLayout vk_layout ) : IGPUPipelineLayout( core::smart_refctd_ptr(dev), From e4487ba3d92735c0a7bb587e23e87ef03607c2ea Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 23 May 2025 14:17:53 +0200 Subject: [PATCH 184/346] small lifetime issue fix --- src/nbl/video/utilities/CAssetConverter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 4d09a31eac..548c049bfe 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1955,7 +1955,6 @@ class GetDependantVisit : public GetDependantVisitBase(extraArgs...); if constexpr (std::is_same_v) @@ -1985,6 +1984,7 @@ class GetDependantVisit : public GetDependantVisitBase Date: Fri, 23 May 2025 14:31:42 +0200 Subject: [PATCH 185/346] fix device_jit_traits generation --- src/nbl/device/gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/device/gen.py b/src/nbl/device/gen.py index 253d529b3d..88174cb3c2 100644 --- a/src/nbl/device/gen.py +++ b/src/nbl/device/gen.py @@ -120,7 +120,7 @@ args.jit_traits_output_path, buildTraitsHeader, type="JIT Members", - template="oss << \"NBL_CONSTEXPR_STATIC_INLINE {} {} = ({})\" + CJITIncludeLoader::to_string({}.{});", + template="oss << \"NBL_CONSTEXPR_STATIC_INLINE {} {} = ({})\" + CJITIncludeLoader::to_string({}.{}) << \";\\n\";", limits_json=limits, features_json=features, format_params=["type", "name", "type", "json_type", "cpp_name"], From ad96f8abf35e2face03e81148e90cafae91d25df Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 23 May 2025 14:33:44 +0200 Subject: [PATCH 186/346] pre merge submodule update --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 06bf814d56..69ba991ea4 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 06bf814d56648d1468256f5231f2b772a5bd3263 +Subproject commit 69ba991ea4827c80d008a31256785f4c4c60f12d From d042f42597fb6e12f9be04bb045145934de09d08 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 23 May 2025 20:26:30 +0700 Subject: [PATCH 187/346] Add some utility function to IGPURayTracingPipeline SShaderGroup --- include/nbl/video/IGPURayTracingPipeline.h | 28 ++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff 
--git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index f7a92252f7..66e3a01072 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -38,6 +38,34 @@ class IGPURayTracingPipeline : public IGPUPipeline Date: Fri, 23 May 2025 20:26:47 +0700 Subject: [PATCH 188/346] Fix debloat logic in logical device --- include/nbl/video/ILogicalDevice.h | 10 +- src/nbl/video/ILogicalDevice.cpp | 177 +++++++++++++++++++++++++---- 2 files changed, 158 insertions(+), 29 deletions(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 49364f3a54..ab0d5bea06 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -1097,7 +1097,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual core::smart_refctd_ptr createFramebuffer_impl(IGPUFramebuffer::SCreationParams&& params) = 0; template - inline CreationParams::SSpecializationValidationResult commonCreatePipelines(IGPUPipelineCache* const pipelineCache, const std::span params, ExtraLambda&& extra) + inline SSpecializationValidationResult commonCreatePipelines(IGPUPipelineCache* const pipelineCache, const std::span params, ExtraLambda&& extra) { if (pipelineCache && !pipelineCache->wasCreatedBy(this)) { @@ -1110,7 +1110,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return {}; } - typename CreationParams::SSpecializationValidationResult retval = {.count=0,.dataSize=0}; + SSpecializationValidationResult retval = {.count=0,.dataSize=0}; for (auto i=0; i createInfos, core::smart_refctd_ptr* const output, - const IGPUComputePipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) = 0; virtual void createGraphicsPipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output, - 
const IGPUGraphicsPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) = 0; virtual void createRayTracingPipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPURayTracingPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) = 0; virtual core::smart_refctd_ptr createQueryPool_impl(const IQueryPool::SCreationParams& params) = 0; diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 26cfc4c6a8..d43ef7c58c 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -7,11 +7,70 @@ using namespace nbl; using namespace nbl::video; -static void debloatShaders(const asset::ISPIRVDebloater& debloater, std::span shaderSpecs, core::vector>& outShaders, asset::IPipelineBase::SShaderSpecInfo* outShaderSpecInfos, system::logger_opt_ptr logger = nullptr) +class SpirvDebloatTask +{ + public: + using EntryPoints = core::set; + + SpirvDebloatTask(asset::ISPIRVDebloater* debloater, system::logger_opt_ptr logger) : m_debloater(debloater), m_logger(logger) + { + + } + + void insertEntryPoint(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, hlsl::ShaderStage stage) + { + const auto* shader = shaderSpec.shader; + auto it = m_entryPointsMap.find(shader); + if (it == m_entryPointsMap.end() || it->first != shader) + it = m_entryPointsMap.emplace_hint(it, shader, EntryPoints()); + it->second.insert({ .name = shaderSpec.entryPoint, .stage = stage }); + } + + IGPUPipelineBase::SShaderSpecInfo debloat(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, core::vector>& outShaders) + { + const auto* shader = shaderSpec.shader; + const auto& entryPoints = m_entryPointsMap[shader]; + + auto debloatedShaderSpec = shaderSpec; + if (shader != nullptr) + { + if (!m_debloatedShadersMap.contains(shader)) + { + const auto 
outShadersData = outShaders.data(); + outShaders.push_back(m_debloater->debloat(shader, entryPoints, m_logger)); + assert(outShadersData == outShaders.data()); + m_debloatedShadersMap.emplace(shader, outShaders.back().get()); + } + const auto debloatedShader = m_debloatedShadersMap[shader]; + debloatedShaderSpec.shader = debloatedShader; + } + return debloatedShaderSpec; + } + + private: + core::map m_entryPointsMap; + core::map m_debloatedShadersMap; + asset::ISPIRVDebloater* m_debloater; + const system::logger_opt_ptr m_logger; +}; + +using DebloaterEntryPoints = core::set; +static void insertEntryPoint(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, hlsl::ShaderStage stage, + core::map entryPointsMap) +{ + const auto* shader = shaderSpec.shader; + auto it = entryPointsMap.find(shader); + if (it == entryPointsMap.end() || it->first != shader) + it = entryPointsMap.emplace_hint(it, shader, DebloaterEntryPoints()); + it->second.insert({ .name = shaderSpec.entryPoint, .stage = stage }); +}; + +static void debloatShaders(const asset::ISPIRVDebloater& debloater, std::span shaderSpecs, core::vector>& outShaders, IGPUPipelineBase::SShaderSpecInfo* outShaderSpecInfos, system::logger_opt_ptr logger = nullptr) { using EntryPoints = core::set; core::map entryPointsMap; + // collect all entry points first before we debloat for (const auto& shaderSpec : shaderSpecs) { const auto* shader = shaderSpec.shader; @@ -781,10 +840,10 @@ asset::ICPUPipelineCache::SCacheKey ILogicalDevice::getPipelineCacheKey() const bool ILogicalDevice::createComputePipelines(IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output) { std::fill_n(output,params.size(),nullptr); - IGPUComputePipeline::SCreationParams::SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache,params,[this](const asset::IPipelineBase::SShaderSpecInfo& info)->bool + SSpecializationValidationResult specConstantValidation = 
commonCreatePipelines(pipelineCache,params,[this](const IGPUPipelineBase::SShaderSpecInfo& info)->bool { // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02755 - if (info.requiredSubgroupSize>=asset::IPipelineBase::SShaderSpecInfo::SUBGROUP_SIZE::REQUIRE_4 && !getPhysicalDeviceLimits().requiredSubgroupSizeStages.hasFlags(info.stage)) + if (info.requiredSubgroupSize>=asset::IPipelineBase::SUBGROUP_SIZE::REQUIRE_4 && !getPhysicalDeviceLimits().requiredSubgroupSizeStages.hasFlags(hlsl::ShaderStage::ESS_COMPUTE)) { NBL_LOG_ERROR("Invalid shader stage"); return false; @@ -808,7 +867,11 @@ bool ILogicalDevice::createComputePipelines(IGPUPipelineCache* const pipelineCac for (auto ix = 0u; ix < params.size(); ix++) { const auto& ci = params[ix]; - debloatShaders(*m_spirvDebloater.get(), ci.getShaders(), debloatedShaders, &newParams[ix].shader, m_logger); + const core::set entryPoints = { asset::ISPIRVDebloater::EntryPoint{.name = ci.shader.entryPoint, .stage = hlsl::ShaderStage::ESS_COMPUTE} }; + debloatedShaders.push_back(m_spirvDebloater->debloat(ci.shader.shader, entryPoints, m_logger)); + auto debloatedShaderSpec = ci.shader; + debloatedShaderSpec.shader = debloatedShaders.back().get(); + newParams[ix].shader = debloatedShaderSpec; } createComputePipelines_impl(pipelineCache,newParams,output,specConstantValidation); @@ -834,12 +897,10 @@ bool ILogicalDevice::createGraphicsPipelines( ) { std::fill_n(output, params.size(), nullptr); - IGPUGraphicsPipeline::SCreationParams::SSpecializationValidationResult specConstantValidation = commonCreatePipelines(nullptr, params, - [this](const asset::IPipelineBase::SShaderSpecInfo& info)->bool + SSpecializationValidationResult specConstantValidation = commonCreatePipelines(nullptr, params, + [this](const IGPUPipelineBase::SShaderSpecInfo& info)->bool { - if (info.stage != hlsl::ShaderStage::ESS_VERTEX) - return true; - return 
info.shader; + return info.shader != nullptr; } ); if (!specConstantValidation) @@ -858,9 +919,6 @@ bool ILogicalDevice::createGraphicsPipelines( core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling debloatedShaders.reserve(shaderCount); - core::vector debloatedShaderSpecs(shaderCount); - auto outShaderSpecs = debloatedShaderSpecs.data(); - for (auto ix = 0u; ix < params.size(); ix++) { const auto& ci = params[ix]; @@ -953,9 +1011,19 @@ bool ILogicalDevice::createGraphicsPipelines( } } } + + SpirvDebloatTask debloatTask(m_spirvDebloater.get(), m_logger); + debloatTask.insertEntryPoint(ci.vertexShader, hlsl::ShaderStage::ESS_VERTEX); + debloatTask.insertEntryPoint(ci.tesselationControlShader, hlsl::ShaderStage::ESS_TESSELLATION_CONTROL); + debloatTask.insertEntryPoint(ci.tesselationEvaluationShader, hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION); + debloatTask.insertEntryPoint(ci.geometryShader, hlsl::ShaderStage::ESS_GEOMETRY); + debloatTask.insertEntryPoint(ci.fragmentShader, hlsl::ShaderStage::ESS_FRAGMENT); - newParams[ix].shaders = std::span(outShaderSpecs, ci.getShaders().size()); - debloatShaders(*m_spirvDebloater.get(), ci.getShaders(), debloatedShaders, outShaderSpecs, m_logger); + newParams[ix].vertexShader = debloatTask.debloat(ci.vertexShader, debloatedShaders); + newParams[ix].tesselationControlShader = debloatTask.debloat(ci.tesselationControlShader, debloatedShaders); + newParams[ix].tesselationEvaluationShader = debloatTask.debloat(ci.tesselationEvaluationShader, debloatedShaders); + newParams[ix].geometryShader = debloatTask.debloat(ci.geometryShader, debloatedShaders); + newParams[ix].fragmentShader = debloatTask.debloat(ci.fragmentShader, debloatedShaders); } createGraphicsPipelines_impl(pipelineCache, newParams, output, specConstantValidation); @@ -980,7 +1048,7 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline 
core::smart_refctd_ptr* const output) { std::fill_n(output,params.size(),nullptr); - IGPURayTracingPipeline::SCreationParams::SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache,params,[this](const asset::IPipelineBase::SShaderSpecInfo& info)->bool + SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache,params,[this](const IGPUPipelineBase::SShaderSpecInfo& info)->bool { return true; }); @@ -1028,15 +1096,43 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline } core::vector newParams(params.begin(), params.end()); - const auto shaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) + const auto raygenCount = params.size(); // assume every param have raygen + const auto missShaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) { - return sum + param.getShaders().size(); + return sum + param.shaderGroups.getMissShaderCount(); }); + const auto hitShaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) + { + return sum + param.shaderGroups.getHitShaderCount(); + }); + const auto callableShaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) + { + return sum + param.shaderGroups.getCallableShaderCount(); + }); + const auto shaderCount = raygenCount + missShaderCount + hitShaderCount + callableShaderCount; core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling debloatedShaders.reserve(shaderCount); - core::vector debloatedShaderSpecs(shaderCount); - auto outShaderSpecs = debloatedShaderSpecs.data(); + const auto missGroupCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) + { + return sum + param.shaderGroups.misses.size(); + }); + const auto hitGroupCount = std::accumulate(params.begin(), 
params.end(), 0, [](uint32_t sum, auto& param) + { + return sum + param.shaderGroups.hits.size(); + }); + const auto callableGroupCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) + { + return sum + param.shaderGroups.callables.size(); + }); + + + core::vector debloatedMissSpecs(missGroupCount); + auto debloatedMissSpecData = debloatedMissSpecs.data(); + core::vector debloatedHitSpecs(hitGroupCount); + auto debloatedHitSpecData = debloatedHitSpecs.data(); + core::vector debloatedCallableSpecs(callableGroupCount); + auto debloatedCallableSpecData = debloatedCallableSpecs.data(); const auto& limits = getPhysicalDeviceLimits(); for (auto ix = 0u; ix < params.size(); ix++) @@ -1050,14 +1146,47 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline NBL_LOG_ERROR("Invalid maxRecursionDepth. maxRecursionDepth(%u) exceed the limits(%u)", param.cached.maxRecursionDepth, limits.maxRayRecursionDepth); return false; } - if (param.getShaders().empty()) + + SpirvDebloatTask debloatTask(m_spirvDebloater.get(), m_logger); + debloatTask.insertEntryPoint(param.shaderGroups.raygen, hlsl::ShaderStage::ESS_RAYGEN); + for (const auto& miss : param.shaderGroups.misses) + debloatTask.insertEntryPoint(miss, hlsl::ShaderStage::ESS_MISS); + for (const auto& hit : param.shaderGroups.hits) { - NBL_LOG_ERROR("Pipeline must have at least one shader."); - return false; + debloatTask.insertEntryPoint(hit.closestHit, hlsl::ShaderStage::ESS_CLOSEST_HIT); + debloatTask.insertEntryPoint(hit.anyHit, hlsl::ShaderStage::ESS_ANY_HIT); + debloatTask.insertEntryPoint(hit.intersection, hlsl::ShaderStage::ESS_INTERSECTION); + } + for (const auto& callable : param.shaderGroups.callables) + debloatTask.insertEntryPoint(callable, hlsl::ShaderStage::ESS_CALLABLE); + + newParams[ix] = param; + newParams[ix].shaderGroups.raygen = debloatTask.debloat(param.shaderGroups.raygen, debloatedShaders); + + newParams[ix].shaderGroups.misses = { 
debloatedMissSpecData, param.shaderGroups.misses.size() }; + for (const auto& miss: param.shaderGroups.misses) + { + *debloatedMissSpecData = debloatTask.debloat(miss, debloatedShaders); + debloatedMissSpecData++; } - newParams[ix].shaders = std::span(outShaderSpecs, param.getShaders().size()); - debloatShaders(*m_spirvDebloater.get(), param.getShaders(), debloatedShaders, outShaderSpecs, m_logger); + newParams[ix].shaderGroups.hits = { debloatedHitSpecData, param.shaderGroups.hits.size() }; + for (const auto& hit: param.shaderGroups.hits) + { + *debloatedHitSpecData = { + .closestHit = debloatTask.debloat(hit.closestHit, debloatedShaders), + .intersection = debloatTask.debloat(hit.intersection, debloatedShaders), + .anyHit = debloatTask.debloat(hit.anyHit, debloatedShaders), + }; + debloatedHitSpecData++; + } + + newParams[ix].shaderGroups.callables = { debloatedCallableSpecData, param.shaderGroups.callables.size() }; + for (const auto& callable: param.shaderGroups.callables) + { + *debloatedCallableSpecData = debloatTask.debloat(callable, debloatedShaders); + debloatedCallableSpecData++; + } } createRayTracingPipelines_impl(pipelineCache, newParams,output,specConstantValidation); From f1fe0899869a762e377751ec9e85b68cde83e7f9 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 23 May 2025 20:57:12 +0700 Subject: [PATCH 189/346] Remove unused funciton in ILogicalDevice.cpp --- src/nbl/video/ILogicalDevice.cpp | 52 -------------------------------- 1 file changed, 52 deletions(-) diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index d43ef7c58c..7714219836 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -54,58 +54,6 @@ class SpirvDebloatTask const system::logger_opt_ptr m_logger; }; -using DebloaterEntryPoints = core::set; -static void insertEntryPoint(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, hlsl::ShaderStage stage, - core::map entryPointsMap) -{ - const auto* shader = 
shaderSpec.shader; - auto it = entryPointsMap.find(shader); - if (it == entryPointsMap.end() || it->first != shader) - it = entryPointsMap.emplace_hint(it, shader, DebloaterEntryPoints()); - it->second.insert({ .name = shaderSpec.entryPoint, .stage = stage }); -}; - -static void debloatShaders(const asset::ISPIRVDebloater& debloater, std::span shaderSpecs, core::vector>& outShaders, IGPUPipelineBase::SShaderSpecInfo* outShaderSpecInfos, system::logger_opt_ptr logger = nullptr) -{ - using EntryPoints = core::set; - core::map entryPointsMap; - - - // collect all entry points first before we debloat - for (const auto& shaderSpec : shaderSpecs) { - const auto* shader = shaderSpec.shader; - auto it = entryPointsMap.find(shader); - if (it == entryPointsMap.end() || it->first != shader) - it = entryPointsMap.emplace_hint(it, shader, EntryPoints()); - it->second.insert({ .name = shaderSpec.entryPoint, .stage = shaderSpec.stage }); - } - - core::map debloatedShaders; - for (const auto& shaderSpec: shaderSpecs) - { - const auto* shader = shaderSpec.shader; - const auto& entryPoints = entryPointsMap[shader]; - - auto debloatedShaderSpec = shaderSpec; - if (shader != nullptr) - { - if (!debloatedShaders.contains(shader)) - { - const auto outShadersData = outShaders.data(); - outShaders.push_back(debloater.debloat(shader, entryPoints, logger)); - assert(outShadersData == outShaders.data()); - debloatedShaders.emplace(shader, outShaders.back().get()); - } - const auto debloatedShader = debloatedShaders[shader]; - debloatedShaderSpec.shader = debloatedShader; - } - *outShaderSpecInfos = debloatedShaderSpec; - - outShaderSpecInfos++; - } - -} - ILogicalDevice::ILogicalDevice(core::smart_refctd_ptr&& api, const IPhysicalDevice* const physicalDevice, const SCreationParams& params, const bool runningInRenderdoc) : m_api(api), m_physicalDevice(physicalDevice), m_enabledFeatures(params.featuresToEnable), m_compilerSet(params.compilerSet), m_logger(m_physicalDevice->getDebugCallback() ? 
m_physicalDevice->getDebugCallback()->getLogger() : nullptr), From 89f499dde20b7e1ccaad32c0d2dbb3ba637433bc Mon Sep 17 00:00:00 2001 From: devsh Date: Sun, 25 May 2025 19:56:28 +0200 Subject: [PATCH 190/346] get the explicitly instantiated templated methods --- src/nbl/video/IGPUCommandBuffer.cpp | 32 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index 40c5ea1e3b..bba06c424a 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -235,8 +235,8 @@ bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo #endif // _NBL_DEBUG return false; } -template bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo&) const; -template bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo&) const; +template NBL_API2 bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo&) const; +template NBL_API2 bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo&) const; bool IGPUCommandBuffer::setEvent(IEvent* _event, const SEventDependencyInfo& depInfo) { @@ -848,16 +848,16 @@ uint32_t IGPUCommandBuffer::buildAccelerationStructures_common(const std::span( +template NBL_API2 uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, IGPUBottomLevelAccelerationStructure::DirectBuildRangeRangeInfos, const IGPUBuffer* const ); -template uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( +template NBL_API2 uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, IGPUBottomLevelAccelerationStructure::MaxInputCounts* const, const IGPUBuffer* const ); -template uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( +template NBL_API2 uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, IGPUTopLevelAccelerationStructure::DirectBuildRangeRangeInfos, const IGPUBuffer* const ); -template uint32_t 
IGPUCommandBuffer::buildAccelerationStructures_common( +template NBL_API2 uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, IGPUTopLevelAccelerationStructure::MaxInputCounts* const, const IGPUBuffer* const ); @@ -890,8 +890,8 @@ bool IGPUCommandBuffer::copyAccelerationStructure(const AccelerationStructure::C m_TLASTrackingOps.emplace_back(TLASTrackingCopy{.src=copyInfo.src,.dst=copyInfo.dst}); return retval; } -template bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUBottomLevelAccelerationStructure::CopyInfo&); -template bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUTopLevelAccelerationStructure::CopyInfo&); +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUBottomLevelAccelerationStructure::CopyInfo&); +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUTopLevelAccelerationStructure::CopyInfo&); template requires std::is_base_of_v bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const AccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) @@ -919,8 +919,8 @@ bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const AccelerationStru m_TLASTrackingOps.emplace_back(TLASTrackingRead{.src=copyInfo.src,.dst=copyInfo.trackedBLASes}); return retval; } -template bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyToMemoryInfo&); -template bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyToMemoryInfo&); +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyToMemoryInfo&); +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyToMemoryInfo&); template requires std::is_base_of_v bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const 
AccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) @@ -959,8 +959,8 @@ bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const AccelerationSt } return retval; } -template bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); -template bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); bool IGPUCommandBuffer::bindComputePipeline(const IGPUComputePipeline* const pipeline) @@ -1686,8 +1686,8 @@ bool IGPUCommandBuffer::invalidDrawIndirect(const asset::SBufferBinding(const asset::SBufferBinding&, const uint32_t, uint32_t); -template bool IGPUCommandBuffer::invalidDrawIndirect(const asset::SBufferBinding&, const uint32_t, uint32_t); +template NBL_API2 bool IGPUCommandBuffer::invalidDrawIndirect(const asset::SBufferBinding&, const uint32_t, uint32_t); +template NBL_API2 bool IGPUCommandBuffer::invalidDrawIndirect(const asset::SBufferBinding&, const uint32_t, uint32_t); template requires nbl::is_any_of_v bool IGPUCommandBuffer::invalidDrawIndirectCount(const asset::SBufferBinding& indirectBinding, const asset::SBufferBinding& countBinding, const uint32_t maxDrawCount, const uint32_t stride) @@ -1705,8 +1705,8 @@ bool IGPUCommandBuffer::invalidDrawIndirectCount(const asset::SBufferBinding(const asset::SBufferBinding&, const asset::SBufferBinding&, const uint32_t, const uint32_t); -template bool IGPUCommandBuffer::invalidDrawIndirectCount(const asset::SBufferBinding&, const asset::SBufferBinding&, const uint32_t, const uint32_t); +template NBL_API2 bool IGPUCommandBuffer::invalidDrawIndirectCount(const 
asset::SBufferBinding&, const asset::SBufferBinding&, const uint32_t, const uint32_t); +template NBL_API2 bool IGPUCommandBuffer::invalidDrawIndirectCount(const asset::SBufferBinding&, const asset::SBufferBinding&, const uint32_t, const uint32_t); bool IGPUCommandBuffer::drawIndirect(const asset::SBufferBinding& binding, const uint32_t drawCount, const uint32_t stride) { From 499c10dbaf33ef3c9509b153404df937e1a67dee Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 26 May 2025 10:08:01 +0200 Subject: [PATCH 191/346] make asset converter work properly in absence of transfer SIntendedSubmitInfo but when compute calls are done/needed --- src/nbl/video/utilities/CAssetConverter.cpp | 44 ++++++++++++--------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 548c049bfe..4aa631c746 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -4037,7 +4037,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // whenever transfer needs to do a submit overflow because it ran out of memory for streaming, we can already submit the recorded compute shader dispatches auto computeCmdBuf = shouldDoSomeCompute ? 
params.compute->getCommandBufferForRecording():nullptr; - auto drainCompute = [¶ms,&computeCmdBuf](const std::span extraSignal={})->auto + auto drainCompute = [¶ms,shouldDoSomeTransfer,&computeCmdBuf](const std::span extraSignal={})->auto { if (!computeCmdBuf || computeCmdBuf->cmdbuf->empty()) return IQueue::RESULT::SUCCESS; @@ -4045,15 +4045,18 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul auto& waitSemaphoreSpan = params.compute->waitSemaphores; std::unique_ptr patchedWaits; // the transfer scratch semaphore value, is from the last submit, not the future value we're enqueing all the deferred memory releases with - if (waitSemaphoreSpan.empty()) - waitSemaphoreSpan = {¶ms.transfer->scratchSemaphore,1}; - else + if (shouldDoSomeTransfer) { - const auto origCount = waitSemaphoreSpan.size(); - patchedWaits.reset(new IQueue::SSubmitInfo::SSemaphoreInfo[origCount+1]); - std::copy(waitSemaphoreSpan.begin(),waitSemaphoreSpan.end(),patchedWaits.get()); - patchedWaits[origCount] = params.transfer->scratchSemaphore; - waitSemaphoreSpan = {patchedWaits.get(),origCount+1}; + if (waitSemaphoreSpan.empty()) + waitSemaphoreSpan = {¶ms.transfer->scratchSemaphore,1}; + else + { + const auto origCount = waitSemaphoreSpan.size(); + patchedWaits.reset(new IQueue::SSubmitInfo::SSemaphoreInfo[origCount+1]); + std::copy(waitSemaphoreSpan.begin(),waitSemaphoreSpan.end(),patchedWaits.get()); + patchedWaits[origCount] = params.transfer->scratchSemaphore; + waitSemaphoreSpan = {patchedWaits.get(),origCount+1}; + } } // don't worry about resetting old `waitSemaphores` because they get cleared to an empty span after overflow submit IQueue::RESULT res = params.compute->submit(computeCmdBuf,extraSignal); @@ -4067,14 +4070,18 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul }; // We want to be doing Host operations while stalled for GPU, compose our overflow callback on top of what's already there, only if we need to ofc - auto origXferStallCallback = 
params.transfer->overflowCallback; - params.transfer->overflowCallback = [device,&hostUploadBuffers,&origXferStallCallback,&drainCompute](const ISemaphore::SWaitInfo& tillScratchResettable)->void + std::function origXferStallCallback; + if (shouldDoSomeTransfer) { - drainCompute(); - if (origXferStallCallback) - origXferStallCallback(tillScratchResettable); - hostUploadBuffers([device,&tillScratchResettable]()->bool{return device->waitForSemaphores({&tillScratchResettable,1},false,0)==ISemaphore::WAIT_RESULT::TIMEOUT;}); - }; + origXferStallCallback = std::move(params.transfer->overflowCallback); + params.transfer->overflowCallback = [device,&hostUploadBuffers,&origXferStallCallback,&drainCompute](const ISemaphore::SWaitInfo& tillScratchResettable)->void + { + drainCompute(); + if (origXferStallCallback) + origXferStallCallback(tillScratchResettable); + hostUploadBuffers([device,&tillScratchResettable]()->bool{return device->waitForSemaphores({&tillScratchResettable,1},false,0)==ISemaphore::WAIT_RESULT::TIMEOUT;}); + }; + } // when overflowing compute resources, we need to submit the Xfer before submitting Compute auto drainBoth = [¶ms,&xferCmdBuf,&drainCompute](const std::span extraSignal={})->auto { @@ -4149,7 +4156,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return true; }; - // because of the layout transitions + // because of the layout transitions (TODO: conditional when host_image_copy gets implemented) params.transfer->scratchSemaphore.stageMask |= PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS; // TODO:: Shall we rewrite? e.g. 
we upload everything first, extra submit for QFOT pipeline barrier & transition in overflow callback, then record compute commands, and submit them, plus their final QFOTs // Lets analyze sync cases: @@ -5337,7 +5344,8 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul retval.set({params.transfer->scratchSemaphore.semaphore,params.transfer->scratchSemaphore.value}); } // reset original callback - params.transfer->overflowCallback = origXferStallCallback; + if (bool(origXferStallCallback)) + params.transfer->overflowCallback = std::move(origXferStallCallback); // Its too dangerous to leave an Intended Transfer Submit hanging around that needs to be submitted for Compute to make forward progress outside of this utility, // and doing transfer-signals-after-compute-wait timeline sema tricks are not and option because: From 31e4e084291b7922b660866d8fe1307b8ff07ffa Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 26 May 2025 10:08:23 +0200 Subject: [PATCH 192/346] update examples_tests --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 69ba991ea4..e30938c261 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 69ba991ea4827c80d008a31256785f4c4c60f12d +Subproject commit e30938c2615dd5d3ab69cadca3ba11d1e03f8233 From 0e9d9323ccab52eebb70ddbc02e1ef03ab7bf76f Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 26 May 2025 10:24:30 +0200 Subject: [PATCH 193/346] save work --- .github/workflows/build-nabla.yml | 4 +- docker/compiler-explorer | 2 +- docker/msvc-winsdk | 2 +- src/nbl/device/gen.py | 2 +- src/nbl/video/CJITIncludeLoader.cpp | 1 - tools/nsc/CMakeLists.txt | 551 +++++++--------------------- 6 files changed, 136 insertions(+), 426 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 967953aeef..79b5d7aabb 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ 
-82,8 +82,8 @@ jobs: - name: Package workflow artifacts run: | - tar -cvf "${{ steps.set-prefix.outputs.prefix }}-profiling.tar" -C profiling . - tar -cvf "${{ steps.set-prefix.outputs.prefix }}-install.tar" -C ${{ env.install }} . + tar -cvf "${{ steps.set-prefix.outputs.prefix }}-profiling.tar" profiling + tar -cvf "${{ steps.set-prefix.outputs.prefix }}-install.tar" ${{ env.install }} - name: Upload profiling artifacts uses: actions/upload-artifact@v4 diff --git a/docker/compiler-explorer b/docker/compiler-explorer index e7d3e6ce85..45866dfa87 160000 --- a/docker/compiler-explorer +++ b/docker/compiler-explorer @@ -1 +1 @@ -Subproject commit e7d3e6ce85d4b87bd9afadc5b2ba8c268ccbeb51 +Subproject commit 45866dfa8782404fc121f25ce15ad0626b474db0 diff --git a/docker/msvc-winsdk b/docker/msvc-winsdk index 831515f599..d91a96faed 160000 --- a/docker/msvc-winsdk +++ b/docker/msvc-winsdk @@ -1 +1 @@ -Subproject commit 831515f59919fbe97653804a5fc634aeb36d360e +Subproject commit d91a96faede2933ec02a18b94141fbed549929c0 diff --git a/src/nbl/device/gen.py b/src/nbl/device/gen.py index 253d529b3d..88174cb3c2 100644 --- a/src/nbl/device/gen.py +++ b/src/nbl/device/gen.py @@ -120,7 +120,7 @@ args.jit_traits_output_path, buildTraitsHeader, type="JIT Members", - template="oss << \"NBL_CONSTEXPR_STATIC_INLINE {} {} = ({})\" + CJITIncludeLoader::to_string({}.{});", + template="oss << \"NBL_CONSTEXPR_STATIC_INLINE {} {} = ({})\" + CJITIncludeLoader::to_string({}.{}) << \";\\n\";", limits_json=limits, features_json=features, format_params=["type", "name", "type", "json_type", "cpp_name"], diff --git a/src/nbl/video/CJITIncludeLoader.cpp b/src/nbl/video/CJITIncludeLoader.cpp index a9f27e5afd..1fcbcb0505 100644 --- a/src/nbl/video/CJITIncludeLoader.cpp +++ b/src/nbl/video/CJITIncludeLoader.cpp @@ -20,7 +20,6 @@ auto CJITIncludeLoader::getInclude(const system::path& searchPath, const std::st std::string CJITIncludeLoader::collectDeviceCaps(const SPhysicalDeviceLimits& limits, const 
SPhysicalDeviceFeatures& features) { #include "nbl/video/device_capabilities_traits_jit.h" - std::string start = R"===( #ifndef _NBL_BUILTIN_HLSL_JIT_DEVICE_CAPABILITIES_INCLUDED_ #define _NBL_BUILTIN_HLSL_JIT_DEVICE_CAPABILITIES_INCLUDED_ diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index bb45442982..efe7741f4e 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -6,6 +6,7 @@ set(GODBOLT_BINARY_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/compiler-explorer") set(GODBOLT_BINARY_PRETEST_DIRECTORY "${GODBOLT_BINARY_DIRECTORY}/pre-test") set(NBL_NSC_COMPILE_DIRECTORY "${GODBOLT_BINARY_PRETEST_DIRECTORY}/.compile/$") set(NBL_NSC_PREINSTALL_DIRECTORY "${GODBOLT_BINARY_PRETEST_DIRECTORY}/.preinstall") +make_directory("${NBL_NSC_PREINSTALL_DIRECTORY}") set(NBL_DOCKER_CT_NSC_VOLUME_SOURCE "${GODBOLT_BINARY_DIRECTORY}/install") @@ -56,323 +57,138 @@ add_test(NAME NBL_NSC_DUMP_BUILD_INFO_TEST if(NBL_ENABLE_DOCKER_INTEGRATION) -find_program(DOCKER_EXE - NAMES docker - REQUIRED -) +find_program(CTEST_EXE NAMES ctest REQUIRED) +find_program(DOCKER_EXE NAMES docker REQUIRED) + +find_file(DXIL_DLL NAMES dxil.dll HINTS "$ENV{CMAKE_WINDOWS_KITS_10_DIR}/Redist/D3D/x64" "C:/Program Files (x86)/Windows Kits/10/Redist/D3D/x64" REQUIRED) +cmake_path(GET DXIL_DLL PARENT_PATH DXIL_DIR) +cmake_path(NATIVE_PATH DXIL_DIR NORMALIZE DXIL_DIR) + +find_file(ICU_DLL NAMES icu.dll HINTS REQUIRED) +cmake_path(GET ICU_DLL PARENT_PATH ICU_DIR) +cmake_path(NATIVE_PATH ICU_DIR NORMALIZE ICU_DIR) +set(ICU_GLOBALIZATION_DIR "C:/Windows/Globalization/ICU") +find_file(ICUDTL_DAT NAMES icudtl.dat HINTS "${ICU_GLOBALIZATION_DIR}" REQUIRED) + +find_file(UCRTBASED_DLL NAMES ucrtbased.dll HINTS ${UCRTBASED_DLL_DIR} REQUIRED) +cmake_path(GET UCRTBASED_DLL PARENT_PATH UCRTBASED_DIR) +cmake_path(NATIVE_PATH UCRTBASED_DIR NORMALIZE UCRTBASED_DIR) + +find_program(SPIRV_DIS_EXE NAMES spirv-dis HINTS "${VULKAN_SDK}/Bin" REQUIRED) +cmake_path(GET SPIRV_DIS_EXE PARENT_PATH SPIRV_DIS_DIR) 
+cmake_path(NATIVE_PATH SPIRV_DIS_DIR NORMALIZE SPIRV_DIS_DIR) + +cmake_path(NATIVE_PATH MSVC_REDIST_DIR NORMALIZE TOOLSET_REDIST_PATH) -find_program(SPIRV_DIS_EXE - NAMES spirv-dis - HINTS "$ENV{VULKAN_SDK_INSTALL_DIRECTORY}/Bin" - HINTS "$ENV{VK_SDK_PATH}/Bin" - HINTS "$ENV{VULKAN_SDK}/Bin" - REQUIRED +file(GLOB_RECURSE VC_MODULES LIST_DIRECTORIES false + "${TOOLSET_REDIST_PATH}/x64/*.CRT/*.dll" + "${TOOLSET_REDIST_PATH}/debug_nonredist/x64/*.DebugCRT/*.dll" ) -cmake_path(GET Vulkan_INCLUDE_DIR PARENT_PATH VULKAN_SDK_INSTALL_DIRECTORY) -get_filename_component(VULKAN_SDK_VERSION "${VULKAN_SDK_INSTALL_DIRECTORY}" NAME) +foreach(MODULE ${VC_MODULES}) + get_filename_component(DIR ${MODULE} DIRECTORY) + cmake_path(NATIVE_PATH DIR NORMALIZE DIR) + list(APPEND VC_MODULE_DIRS ${DIR}) +endforeach() -if(NOT EXISTS "${VULKAN_SDK_INSTALL_DIRECTORY}") - message(FATAL_ERROR "Internal error, VULKAN_SDK_INSTALL_DIRECTORY doesn't exist") +if(NOT VC_MODULE_DIRS) + message(FATAL_ERROR "Failed to GLOB for VC Redist modules!") endif() -find_program(CTEST_EXE - NAMES ctest - REQUIRED -) +set(CT_RUNTIMES C:/pack/runtimes) +cmake_path(NATIVE_PATH CT_RUNTIMES NORMALIZE CT_RUNTIMES) +set(HOST_MOUNT_DIRS ${VC_MODULE_DIRS} ${SPIRV_DIS_DIR} ${UCRTBASED_DIR} ${DXIL_DIR} ${ICU_DIR}) +list(REMOVE_DUPLICATES HOST_MOUNT_DIRS) -set(NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}/hlsl.local.properties.cmake") +set(ix 0) +foreach(DIR ${HOST_MOUNT_DIRS}) + set(TARGET_MOUNT_DIR "${CT_RUNTIMES}/system/${ix}") + cmake_path(NATIVE_PATH TARGET_MOUNT_DIR NORMALIZE TARGET_MOUNT_DIR) -set(NBL_DOCKER_CT_NSC_VOLUME_TARGET "C:\\\\nsc\\\\install") -string(GENEX_STRIP "${NBL_PACKAGE_RUNTIME_EXE_DIR_PATH}" NBL_RELATIVE_ENTRY) -set(NSC_RELEASE_BUILD_INFO "${NBL_NSC_PREINSTALL_DIRECTORY}/${NBL_RELATIVE_ENTRY}/${NBL_NSC_BUILD_INFO_FILENAME}") -set(NSC_RELWITHDEBINFO_BUILD_INFO "${NBL_NSC_PREINSTALL_DIRECTORY}/relwithdebinfo/${NBL_RELATIVE_ENTRY}/${NBL_NSC_BUILD_INFO_FILENAME}") 
-set(NSC_DEBUG_BUILD_INFO "${NBL_NSC_PREINSTALL_DIRECTORY}/debug/${NBL_RELATIVE_ENTRY}/${NBL_NSC_BUILD_INFO_FILENAME}") -cmake_path(NATIVE_PATH NSC_RELEASE_BUILD_INFO NORMALIZE NSC_RELEASE_BUILD_INFO) -cmake_path(NATIVE_PATH NSC_RELWITHDEBINFO_BUILD_INFO NORMALIZE NSC_RELWITHDEBINFO_BUILD_INFO) -cmake_path(NATIVE_PATH NSC_DEBUG_BUILD_INFO NORMALIZE NSC_DEBUG_BUILD_INFO) - -set(NBL_INSTALL_DIRECTORY "${NBL_DOCKER_CT_NSC_VOLUME_TARGET}") -cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_TARGET NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_TARGET) + list(APPEND DOCKER_CLI_ARGS -v "${DIR}:${TARGET_MOUNT_DIR}:ro") + list(APPEND CT_MOUNT_DIRS "${TARGET_MOUNT_DIR}") + + math(EXPR ix "${ix} + 1" OUTPUT_FORMAT DECIMAL) +endforeach() +set(NBL_DOCKER_CT_NSC_VOLUME_TARGET "${CT_RUNTIMES}/Nabla") set(NBL_BUILD_INFO_POSTPROCESS_COMMAND "${CMAKE_COMMAND}" "-DNBL_EXECUTABLE_PATH=${NBL_NSC_PREINSTALL_TARGET_EXE_FILEPATH}" "-DNBL_BUILD_INFO=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" "-DNBL_OUTPUT_FILE=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" - "-DNBL_OUTPUT_EXE_OVERRIDE=$" # as in CT, it's *not* host exe location! 
+ "-DNBL_OUTPUT_EXE_OVERRIDE=$" -P "${NBL_ROOT_PATH}/cmake/scripts/nbl/nablaBuildInfo.cmake" ) +cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_SOURCE NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_SOURCE) +cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_TARGET NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_TARGET) +cmake_path(NATIVE_PATH NBL_NSC_PREINSTALL_DIRECTORY NORMALIZE NBL_NSC_PREINSTALL_DIRECTORY) +list(APPEND DOCKER_CLI_ARGS -v "${NBL_NSC_PREINSTALL_DIRECTORY}:${NBL_DOCKER_CT_NSC_VOLUME_TARGET}") +#list(APPEND DOCKER_CLI_ARGS -v "${ICU_GLOBALIZATION_DIR}:${ICU_GLOBALIZATION_DIR}:ro") -cmake_path(GET SPIRV_DIS_EXE PARENT_PATH VULKAN_SDK_BIN_DIRECTORY) -cmake_path(NATIVE_PATH VULKAN_SDK_BIN_DIRECTORY NORMALIZE VULKAN_SDK_BIN_DIRECTORY) -cmake_path(GET SPIRV_DIS_EXE FILENAME SPIRV_DIS_EXE) -set(CT_SPIRV_DIS_EXE "C:\\vulkan\\${VULKAN_SDK_VERSION}\\bin\\${SPIRV_DIS_EXE}") -cmake_path(NATIVE_PATH CT_SPIRV_DIS_EXE NORMALIZE CT_SPIRV_DIS_EXE) - +set(NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}/hlsl.local.properties.cmake") +string(GENEX_STRIP "${NBL_PACKAGE_RUNTIME_EXE_DIR_PATH}" NBL_RELATIVE_ENTRY) +set(OUTPUT_CONFIG_FILE $) set(NBL_CE_GENERATE_CONFIG_COMMAND "${CMAKE_COMMAND}" - "-DSPIRV_DIS_EXE=${CT_SPIRV_DIS_EXE}" - "-DNSC_RELEASE_BUILD_INFO=${NSC_RELEASE_BUILD_INFO}" - "-DNSC_RELWITHDEBINFO_BUILD_INFO=${NSC_RELWITHDEBINFO_BUILD_INFO}" - "-DNSC_DEBUG_BUILD_INFO=${NSC_DEBUG_BUILD_INFO}" - "-DOUTPUT_CONFIG_FILE=${NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT}" + "-DSPIRV_DIS_EXE=spirv-dis.exe" + "-DNSC_RELEASE_BUILD_INFO=$" + "-DNSC_RELWITHDEBINFO_BUILD_INFO=$" + "-DNSC_DEBUG_BUILD_INFO=$" + "-DOUTPUT_CONFIG_FILE=${OUTPUT_CONFIG_FILE}" -P "${CMAKE_CURRENT_SOURCE_DIR}/ce-generate-config.cmake" ) -set(NBL_DOCKER_CE_DOCKER_CTX "${NBL_ROOT_PATH}/docker/compiler-explorer") -set(NBL_DOCKER_CE_DOCKERFILE_BASE "${NBL_DOCKER_CE_DOCKER_CTX}/Dockerfile") -set(NBL_DOCKER_CE_COMPOSE_BASE "${NBL_DOCKER_CE_DOCKER_CTX}/compose.yml") -cmake_path(NATIVE_PATH 
NBL_DOCKER_CE_COMPOSE_BASE NORMALIZE NBL_DOCKER_CE_COMPOSE_BASE) -set(NBL_DOCKER_CE_COMPOSE_TARGET "${GODBOLT_BINARY_DIRECTORY}/.dev-compose.yml") - -include(InstallRequiredSystemLibraries) - -string(REPLACE "v" "VC" TARGET_DCRT ${CMAKE_VS_PLATFORM_TOOLSET}) -set(DEBUG_CRT_RELATIVE debug_nonredist/x64/Microsoft.${TARGET_DCRT}.DebugCRT) -set(DEBUG_CRT_DIRECTORY_SOURCE "${MSVC_REDIST_DIR}/${DEBUG_CRT_RELATIVE}") -cmake_path(NATIVE_PATH MSVC_REDIST_DIR NORMALIZE NBL_REDIST_DIR) - -if(NOT EXISTS "${DEBUG_CRT_DIRECTORY_SOURCE}") - message(FATAL_ERROR "DEBUG_CRT_DIRECTORY_SOURCE = \"${DEBUG_CRT_DIRECTORY_SOURCE}\" doesn't exist!") -endif() - -set(DEBUG_CRT_DIRECTORY_TARGET "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}/.nonredist") -file(MAKE_DIRECTORY "${DEBUG_CRT_DIRECTORY_TARGET}") -file(GLOB CRT_FILES "${DEBUG_CRT_DIRECTORY_SOURCE}/*") - -find_file(UCRTBASED_DLL_PATH - NAMES ucrtbased.dll - REQUIRED -) - -# TODO: (***) ---> THIS GOES TO /docker to CMakeLists.txt file! +set(CT_ENV_FILE "${CMAKE_CURRENT_BINARY_DIR}/.env") +string(CONFIGURE [=[ +CT_MOUNT_DIRS=@CT_MOUNT_DIRS@ +NBL_INSTALL_DIRECTORY=@NBL_DOCKER_CT_NSC_VOLUME_TARGET@ +NBL_EXPLICIT_MODULE_LOAD_LOG=ON +]=] ENV_CONTENT @ONLY) +file(WRITE "${CT_ENV_FILE}" "${ENV_CONTENT}") +list(APPEND DOCKER_CLI_ARGS --env-file "${CT_ENV_FILE}") -set(BASE_IMAGE mcr.microsoft.com/windows/servercore:ltsc2022-amd64) # NOTE: HARDCODED CURRENTLY - -string(TOLOWER "dr.devsh.eu/nabla/cmake-host-dev-env/${CMAKE_SYSTEM_NAME}/package/vulkan:latest" DOCKER_VULKAN_TAG) -string(TOLOWER "dr.devsh.eu/nabla/cmake-host-dev-env/${CMAKE_SYSTEM_NAME}/toolset/redist/${CMAKE_CXX_COMPILER_ID}/crt:latest" DOCKER_CRT_TAG) -string(TOLOWER "dr.devsh.eu/nabla/cmake-host-dev-env/${CMAKE_SYSTEM_NAME}/build/${CMAKE_CXX_COMPILER_ID}/devel-compiler-explorer-nsc:latest" DOCKER_DEVEL_TAG) - -cmake_path(NATIVE_PATH MSVC_REDIST_DIR NORMALIZE TOOLSET_REDIST_PATH) -get_filename_component(REDIST_CRT_TOOLSET_VERSION "${TOOLSET_REDIST_PATH}" NAME) - -function(GEN_DOCKER_CONTENT 
_CTX_ _OUTPUT_DIRECTORY_ _EXTRA_DOCKERFILE_CONTENT_ _DOCKER_IGNORE_CONTENT_ _S_NAME_ _CT_NAME_ _IMAGE_NAME_ _WITH_BUILD_) - -set(_OUTPUT_D_PATH_ "${_OUTPUT_DIRECTORY_}/Dockerfile") -set(_OUTPUT_C_PATH_ "${_OUTPUT_DIRECTORY_}/compose.yml") - -string(CONFIGURE "${_EXTRA_DOCKERFILE_CONTENT_}" _EXTRA_DOCKERFILE_CONTENT_EVAL_ @ONLY) -string(CONFIGURE "${_DOCKER_IGNORE_CONTENT_}" _DOCKER_IGNORE_CONTENT_EVAL_ @ONLY) - -unset(DOCKER_CONTENT) -string(APPEND DOCKER_CONTENT -[=[ -# escape=` - -ARG BASE_IMAGE=@BASE_IMAGE@ -FROM ${BASE_IMAGE} -SHELL ["cmd", "/S", "/C"] -@_EXTRA_DOCKERFILE_CONTENT_EVAL_@ -]=] -) - -string(CONFIGURE "${DOCKER_CONTENT}" DOCKER_CONTENT @ONLY) -file(WRITE "${_OUTPUT_D_PATH_}" "${DOCKER_CONTENT}") - -set(_CTX_TARGET_ "${_OUTPUT_DIRECTORY_}/.ctx") - -if("${_CTX_}" STREQUAL "") - -else() - if(NOT EXISTS "${_CTX_}") - message(FATAL_ERROR "Invalid source context directory doesn't exist! _CTX_: \"${_CTX_}\"") - endif() - - file(COPY "${_CTX_}" DESTINATION "${_CTX_TARGET_}") -endif() - -set(_OUTPUT_I_PATH_ "${_CTX_TARGET_}/.dockerignore") - -unset(COMPOSE_CONTENT) -string(APPEND COMPOSE_CONTENT -[=[ -services: - @_S_NAME_@: - build: - context: ./.ctx - dockerfile: "@_OUTPUT_D_PATH_@" - image: @_IMAGE_NAME_@ - container_name: @_CT_NAME_@ - networks: - docker_default: - -networks: - docker_default: - external: true -]=] -) - -string(CONFIGURE "${COMPOSE_CONTENT}" COMPOSE_CONTENT @ONLY) -file(WRITE "${_OUTPUT_C_PATH_}" "${COMPOSE_CONTENT}") -file(WRITE "${_OUTPUT_I_PATH_}" "${_DOCKER_IGNORE_CONTENT_EVAL_}") - -if(_WITH_BUILD_) - execute_process(COMMAND "${DOCKER_EXE}" compose -f "${_OUTPUT_C_PATH_}" build) -endif() -endfunction() - -# Vulkan -set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/docker/vulkan") -set(CT_VULKAN_TARGET vulkan) -GEN_DOCKER_CONTENT("${VULKAN_SDK_INSTALL_DIRECTORY}" "${OUTPUT_DIRECTORY}" -[=[ -COPY ./ "@CT_VULKAN_TARGET@" - -ENV VULKAN_SDK="C:/@CT_VULKAN_TARGET@" -ENV VULKAN_SDK_VERSION="@VULKAN_SDK_VERSION@" -LABEL 
VULKAN_SDK="C:/@CT_VULKAN_TARGET@" -LABEL VULKAN_SDK_VERSION="@VULKAN_SDK_VERSION@" -]=] -[=[ -* -!@VULKAN_SDK_VERSION@/Bin/*.dll -!@VULKAN_SDK_VERSION@/Bin/*spirv*.exe -]=] -nabla-dev-env-vulkan -nabla.dev.env.vulkan -${DOCKER_VULKAN_TAG} -ON -) - -# CRT -set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/docker/crt") -set(CT_TOOLSET_REDIST_TARGET toolset_redist) -make_directory("${OUTPUT_DIRECTORY}/.ctx") -file(COPY "${UCRTBASED_DLL_PATH}" DESTINATION "${OUTPUT_DIRECTORY}/.ctx") -GEN_DOCKER_CONTENT("${TOOLSET_REDIST_PATH}" "${OUTPUT_DIRECTORY}" -[=[ -COPY ./ "/@CT_TOOLSET_REDIST_TARGET@" - -ENV REDIST_CRT_TOOLSET_VERSION="@REDIST_CRT_TOOLSET_VERSION@" -ENV TOOLSET_REDIST_PATH="C:/@CT_TOOLSET_REDIST_TARGET@" -LABEL REDIST_CRT_TOOLSET_VERSION="@REDIST_CRT_TOOLSET_VERSION@" -LABEL TOOLSET_REDIST_PATH="C:/@CT_TOOLSET_REDIST_TARGET@" -]=] -[=[ -* -!ucrtbased.dll -!@REDIST_CRT_TOOLSET_VERSION@/vc_redist.x64.exe -!@REDIST_CRT_TOOLSET_VERSION@/@DEBUG_CRT_RELATIVE@/*.dll -]=] -nabla-dev-env-crt -nabla.dev.env.crt -${DOCKER_CRT_TAG} -ON -) - -# Devel, combined -set(BASE_IMAGE dr.devsh.eu/compiler-explorer/windows) - -# NOTE to self: could be all done with single docker file & compose file but buildkit works bad with windows driver, yet need to wait for stuff to be implemented -set(DEVEL_CTX "${CMAKE_CURRENT_BINARY_DIR}/docker/devel") -set(CT_REDIST_DIR "${CT_TOOLSET_REDIST_TARGET}/${REDIST_CRT_TOOLSET_VERSION}") -set(CT_NONREDIST_CTR_DIR "${CT_REDIST_DIR}/${DEBUG_CRT_RELATIVE}") -cmake_path(NATIVE_PATH CT_REDIST_DIR NORMALIZE CT_REDIST_DIR) -cmake_path(NATIVE_PATH CT_NONREDIST_CTR_DIR NORMALIZE CT_NONREDIST_CTR_DIR) -set(DEVEL_DOCKERFILE "${DEVEL_CTX}/Dockerfile") - -GEN_DOCKER_CONTENT("" "${DEVEL_CTX}" -[=[ - -COPY --link --from=@DOCKER_VULKAN_TAG@ /@CT_VULKAN_TARGET@ /@CT_VULKAN_TARGET@ -COPY --link --from=@DOCKER_CRT_TAG@ /@CT_TOOLSET_REDIST_TARGET@ /@CT_TOOLSET_REDIST_TARGET@ - -# TODO -# RUN .\@CT_REDIST_DIR@\vc_redist.x64.exe /quiet /install -RUN xcopy 
.\@CT_NONREDIST_CTR_DIR@\*.dll %SystemRoot%\System32 /Y -RUN xcopy .\@CT_TOOLSET_REDIST_TARGET@\ucrtbased.dll %SystemRoot%\System32 /Y - -]=] -[=[ - -]=] -nabla-dev-env-nsc -nabla.dev.env.nsc -${DOCKER_DEVEL_TAG} -OFF -) - -# <---(***) - -set(NABLA_DEV_ENV_CT_NAME dev.nabla.env.${CMAKE_SYSTEM_NAME}.${CMAKE_CXX_COMPILER_ID}.base) -string(TOLOWER "${NABLA_DEV_ENV_CT_NAME}" NABLA_DEV_ENV_CT_NAME) - -set(COMPOSE_NSC_DEV_SERVICE compiler-explorer-nsc-dev) -string(TOLOWER "dr.devsh.eu/nabla/cmake-host-dev-env/${CMAKE_SYSTEM_NAME}/build/${CMAKE_CXX_COMPILER_ID}/compiler-explorer-nsc:latest" COMPOSE_NSC_DEV_IMAGE) -string(TOLOWER "dr.devsh.eu/compiler-explorer/production/windows/nsc/orphan-production-test:latest" COMPOSE_NSC_ORPHAN_PRODUCTION_TEST_IMAGE) -string(TOLOWER "dr.devsh.eu/compiler-explorer/production/windows/nsc/orphan-prodution-cache:latest" COMPOSE_NSC_PRODUCTION_CACHE_IMAGE) -string(TOLOWER "dr.devsh.eu/compiler-explorer/production/windows/nsc:latest" COMPOSE_NSC_PRODUCTION_IMAGE) - -string(APPEND COMPOSE_CONTENT -[=[ -services: - @COMPOSE_NSC_DEV_SERVICE@: - container_name: dev.ce.nsc.dev - extends: - file: @NBL_DOCKER_CE_COMPOSE_BASE@ - service: compiler-explorer - build: - context: ./.ctx - dockerfile: @DEVEL_DOCKERFILE@ - image: @COMPOSE_NSC_DEV_IMAGE@ - environment: - NBL_INSTALL_DIRECTORY: "@NBL_INSTALL_DIRECTORY@" - NBL_EXPLICIT_MODULE_LOAD_LOG: "ON" - entrypoint: - - "cmd" - - "/c" - - > - copy C:\\nsc\\install\\hlsl.local.properties.cmake %GIT_GODBOLT_REPOSITORY_PATH%\\etc\\config\\hlsl.local.properties - && npm --prefix %GIT_GODBOLT_REPOSITORY_PATH% run dev -- --language hlsl - volumes: - - type: bind - source: .\install - target: @NBL_DOCKER_CT_NSC_VOLUME_TARGET@ - read_only: true - -networks: - docker_default: - external: true -]=] -) - -string(CONFIGURE "${COMPOSE_CONTENT}" COMPOSE_CONTENT @ONLY) -file(WRITE "${NBL_DOCKER_CE_COMPOSE_TARGET}" "${COMPOSE_CONTENT}") -make_directory("${GODBOLT_BINARY_DIRECTORY}/.ctx") - 
-function(_PROMOTE_PROCESS_ISOLATION_ KERNEL BASES VAR) +set(CT_SETUP_FILE "${CMAKE_CURRENT_BINARY_DIR}/setup.bat") +string(CONFIGURE [=[ +@echo off +set "PATH=%PATH%;%CT_MOUNT_DIRS%" +setx PATH "%PATH%" /M +node --no-warnings --no-deprecation --import=tsx ./app.js --language hlsl +]=] SETUP_CONTENT @ONLY) +file(WRITE "${CT_SETUP_FILE}" "${SETUP_CONTENT}") +list(APPEND DOCKER_CLI_ARGS) + +function(PROMOTE_PROCESS_ISOLATION HOST_KERNEL BASE VAR) set(${VAR} True) - set(ix 0) - list(LENGTH BASES LEN) - - while(ix LESS ${LEN}) - list(GET BASES ${ix} BASE) - - execute_process(COMMAND "${DOCKER_EXE}" inspect --format={{.OsVersion}} ${BASE} RESULT_VARIABLE EXIT_LEVEL OUTPUT_VARIABLE TARGET_KERNEL OUTPUT_STRIP_TRAILING_WHITESPACE) - - if(${EXIT_LEVEL} EQUAL 0) - if(${KERNEL} VERSION_LESS ${TARGET_KERNEL}) - set(${VAR} False PARENT_SCOPE) - message(STATUS "While inspecting ${BASE} - host Kernel ${KERNEL} too low to use container process isolation (target ${TARGET_KERNEL}), falling back to HyperV. Please update your host OS.") - return() - endif() - math(EXPR ix "${ix} + 1") - else() - message(STATUS "Docker image ${BASE} not found locally, pulling...") - execute_process(COMMAND "${DOCKER_EXE}" pull ${BASE}) - endif() - endwhile() + + macro(INSPECT IMAGE) + execute_process(COMMAND "${DOCKER_EXE}" inspect --format={{.OsVersion}} ${IMAGE} + RESULT_VARIABLE EXIT_LEVEL + OUTPUT_VARIABLE TARGET_KERNEL + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + endmacro() + + macro(TO_PROCESS IMAGE TARGET_KERNEL) + if(${HOST_KERNEL} VERSION_LESS ${TARGET_KERNEL}) + set(${VAR} False) + message(STATUS "Host kernel \"${HOST_KERNEL}\" version too low to promote process isolation for \"${IMAGE}\" [${TARGET_KERNEL}] and requires falling back to HyperV. 
Please update your host OS.") + else() + message(STATUS "Promoting \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation with host kernel [${HOST_KERNEL}] version") + endif() + endmacro() + + INSPECT(${BASE}) + + if(${EXIT_LEVEL} EQUAL 0) + TO_PROCESS(${BASE} ${TARGET_KERNEL}) + else() + message(STATUS "\"${BASE}\" not found in local registry, pulling...") + execute_process(COMMAND "${DOCKER_EXE}" pull ${BASE}) + + INSPECT(${BASE}) + TO_PROCESS(${BASE} ${TARGET_KERNEL}) + endif() set(${VAR} ${${VAR}} PARENT_SCOPE) endfunction() @@ -380,151 +196,46 @@ endfunction() execute_process(COMMAND cmd /C ver OUTPUT_VARIABLE PIPE OUTPUT_STRIP_TRAILING_WHITESPACE) string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+" HOST_KERNEL "${PIPE}") -set(BASES - mcr.microsoft.com/windows/nanoserver:ltsc2022 - mcr.microsoft.com/powershell:lts-nanoserver-ltsc2022 -) - -_PROMOTE_PROCESS_ISOLATION_("${HOST_KERNEL}" "${BASES}" PROMOTE_TO_PROCESS) - -function(_BUILD_IMAGE_ DOCKERFILE CTX TAG) - set(CMD "${DOCKER_EXE}" build) - if(PROMOTE_TO_PROCESS) - list(APPEND CMD --isolation "process") - endif() - list(APPEND CMD -t ${TAG} -f "${DOCKERFILE}" .) - - execute_process(COMMAND ${CMD} WORKING_DIRECTORY "${CTX}") -endfunction() - -_BUILD_IMAGE_("${NBL_DOCKER_CE_DOCKERFILE_BASE}" "${NBL_DOCKER_CE_DOCKER_CTX}" godbolt/base/windows) -_BUILD_IMAGE_("${DEVEL_DOCKERFILE}" "${DEVEL_CTX}" godbolt/devel/windows) - -message(FATAL_ERROR "STOP TEST, PROMOTE_TO_PROCESS = ${PROMOTE_TO_PROCESS}") - -string(APPEND BAT_PRODUCTION_INSTALL -[=[ -@echo off -setlocal - -set BASE_PATH=C:\ - -xcopy "%BASE_PATH%target" "%BASE_PATH%nsc\install" /s /e /h /i /y /f -if %ERRORLEVEL% neq 0 ( - echo [ERROR] Failed to copy C:\target to C:\nsc\install - exit /b %ERRORLEVEL% -) - -if "%GIT_GODBOLT_REPOSITORY_PATH%"=="" ( - echo [ERROR] Environment variable GIT_GODBOLT_REPOSITORY_PATH is not set! 
- exit /b 1 -) +set(BASE_IMAGE ghcr.io/devsh-graphics-programming/compiler-explorer-docker:nano-2022) +PROMOTE_PROCESS_ISOLATION(${HOST_KERNEL} ${BASE_IMAGE} USE_PROCESS_ISOLATION) -copy "%BASE_PATH%nsc\install\hlsl.local.properties.cmake" "%GIT_GODBOLT_REPOSITORY_PATH%\etc\config\hlsl.local.properties" -if %ERRORLEVEL% neq 0 ( - echo [ERROR] Failed to copy HLSL properties file - exit /b %ERRORLEVEL% -) - -echo [SUCCESS] All production files copied successfully. -exit /b 0 -]=] -) - -string(CONFIGURE "${BAT_PRODUCTION_INSTALL}" BAT_PRODUCTION_INSTALL @ONLY) -file(WRITE "${NBL_DOCKER_CT_NSC_INSTALL_BAT}" "${BAT_PRODUCTION_INSTALL}") +if(USE_PROCESS_ISOLATION) + list(APPEND DOCKER_CLI_ARGS --isolation process) +endif() +set(ORPHAN nsc-orphan) set(NBL_CE_URL http://localhost:80) set(NBL_CE_HEALTHY_CHECK_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/ce_healthy_check.py") set(NBL_CE_ENDPOINT_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/endpoint.py") set(NBL_NSC_BASIC_HLSL_JPAYLOAD "${CMAKE_CURRENT_SOURCE_DIR}/docker/godbolt/hlsl-basic-compile-payload.json") -add_custom_target(run-compiler-explorer - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Performing Pre-Test..." +add_custom_target(run-compiler-explorer ALL + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Clearing NSC orphans.." + COMMAND "${DOCKER_EXE}" rm -f ${ORPHAN} || "${CMAKE_COMMAND}" -E true + + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Executing CTests.." COMMAND "${CTEST_EXE}" -C $ --stop-on-failure COMMAND ${NBL_BUILD_INFO_POSTPROCESS_COMMAND} - COMMAND "${DOCKER_EXE}" compose -f "${NBL_DOCKER_CE_COMPOSE_TARGET}" stop ${COMPOSE_NSC_DEV_SERVICE} COMMAND ${NBL_CE_GENERATE_CONFIG_COMMAND} - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "OK! Performing executables hot-swap..." 
- COMMAND "${CMAKE_COMMAND}" -E copy_directory "${NBL_NSC_PREINSTALL_DIRECTORY}" "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}" - COMMAND "${DOCKER_EXE}" compose -f "${NBL_DOCKER_CE_COMPOSE_TARGET}" up -d ${COMPOSE_NSC_DEV_SERVICE} - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Checking health of Compiler Explorer service..." - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --interval 10 --ticks 25 + + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Running new NSC orphan container.." + COMMAND "${DOCKER_EXE}" run -di -p 80:10240 --name ${ORPHAN} --entrypoint cmd ${DOCKER_CLI_ARGS} ${BASE_IMAGE} + COMMAND "${DOCKER_EXE}" cp "${OUTPUT_CONFIG_FILE}" ${ORPHAN}:C:\\Compiler-Explorer\\etc\\config\\hlsl.local.properties + COMMAND "${DOCKER_EXE}" cp "${CT_SETUP_FILE}" ${ORPHAN}:C:\\setup.cmd + COMMAND "${DOCKER_EXE}" exec -d ${ORPHAN} C:\\setup.cmd + COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --interval 5 --ticks 25 COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "Compiler Explorer is running, type \"localhost\" in your browser!" - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Post-Checking if NSC is able to compile basic shader file..." + + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Post-Checking if NSC container is able to compile basic shader input..." COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_$>_upstream/compile --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" - COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "OK! NSC is healthy." + COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "OK! NSC container is healthy." 
+ WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" VERBATIM USES_TERMINAL ) -add_custom_target(is-compiler-explorer-running - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --ticks 1 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compilers - VERBATIM - USES_TERMINAL -) - -# Production NSC image -set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/docker/nsc-production") -set(BASE_IMAGE "${COMPOSE_NSC_ORPHAN_PRODUCTION_TEST_IMAGE}") -set(NBL_DOCKER_TMP_PRODUCTION_TARGET "C:\\target") -GEN_DOCKER_CONTENT("" "${OUTPUT_DIRECTORY}" -[=[ -LABEL maintainer="Arkadiusz Lachowicz " ` - org.opencontainers.image.authors="Arkadiusz Lachowicz " ` - org.opencontainers.image.title="Compiler Explorer with Nabla Shader Compilers in Docker" ` - org.opencontainers.image.description="Docker image to run Compiler Explorer instance with Nabla Shader Compilers" ` - org.opencontainers.image.url="https://github.com/Devsh-Graphics-Programming/Nabla" ` - org.opencontainers.image.source="https://github.com/Devsh-Graphics-Programming/Nabla" ` - org.opencontainers.image.documentation="https://github.com/Devsh-Graphics-Programming/Nabla/tree/master/tools/nsc/docker" - -ENTRYPOINT ["powershell.exe", "-ExecutionPolicy", "Bypass", "-Command", "npm", "--prefix", "$env:GIT_GODBOLT_REPOSITORY_PATH", "start", "--", "--language", "hlsl"] -]=] -[=[ - -]=] -nsc-ce-production-cache-webpack -nsc.ce.production.cache.webpack -${COMPOSE_NSC_PRODUCTION_CACHE_IMAGE} -OFF -) - -set(NBL_CE_URL http://localhost:6969) - -add_custom_target(create-production-compiler-explorer - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Removing any remaining pre-test orphan containers..." 
- COMMAND "${DOCKER_EXE}" rm -f production-ce-orphan-run-test || "${CMAKE_COMMAND}" -E true - COMMAND "${DOCKER_EXE}" rm -f production-ce-orphan-cache-webpack || "${CMAKE_COMMAND}" -E true - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Creating pre-test production image..." - COMMAND "${DOCKER_EXE}" run -dit -v "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}:${NBL_DOCKER_TMP_PRODUCTION_TARGET}" --name production-ce-orphan-run-test --entrypoint "cmd" "${COMPOSE_NSC_DEV_IMAGE}" - COMMAND "${DOCKER_EXE}" exec production-ce-orphan-run-test "${NBL_DOCKER_TMP_PRODUCTION_TARGET}\\${NBL_DOCKER_INSTALL_BAT_FILENAME}" - COMMAND "${DOCKER_EXE}" stop production-ce-orphan-run-test - COMMAND "${DOCKER_EXE}" commit -m "Copy NSC install redists" production-ce-orphan-run-test "${COMPOSE_NSC_ORPHAN_PRODUCTION_TEST_IMAGE}" - COMMAND "${DOCKER_EXE}" compose build - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Running pre-test production image, caching webpack & running final checks..." - COMMAND "${DOCKER_EXE}" run -dit -p 6969:10240 --name production-ce-orphan-cache-webpack "${COMPOSE_NSC_PRODUCTION_CACHE_IMAGE}" - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --interval 10 --ticks 35 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compilers --disable-cookies --timeout 69 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_release_upstream/compile --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" --disable-cookies --timeout 69 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_relwithdebinfo_upstream/compile --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" --disable-cookies --timeout 69 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_debug_upstream/compile --method POST --json 
"${NBL_NSC_BASIC_HLSL_JPAYLOAD}" --disable-cookies --timeout 69 - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Passed all tests! Creating final production image..." - COMMAND "${DOCKER_EXE}" stop production-ce-orphan-cache-webpack - COMMAND "${DOCKER_EXE}" commit -m "Perform tests, cache webpack build" production-ce-orphan-cache-webpack "${COMPOSE_NSC_PRODUCTION_IMAGE}" - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "Created final `${COMPOSE_NSC_PRODUCTION_IMAGE}` production image!" - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "To run the production image, execute: 'docker run -p 80:10240 ${COMPOSE_NSC_PRODUCTION_IMAGE}'," - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "'docker run -p 80:10240 ${COMPOSE_NSC_PRODUCTION_IMAGE}'." - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "The production image can be pushed safely to the public registry." - WORKING_DIRECTORY "${OUTPUT_DIRECTORY}" - VERBATIM - USES_TERMINAL -) - add_dependencies(run-compiler-explorer nsc) set_target_properties(run-compiler-explorer PROPERTIES FOLDER "Godbolt") -set_target_properties(is-compiler-explorer-running PROPERTIES FOLDER "Godbolt") -set_target_properties(create-production-compiler-explorer PROPERTIES FOLDER "Godbolt") endif() \ No newline at end of file From 2eea2b0fa9f98a6c77ef519c8f9f1ae44e98eb49 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:21:35 +0700 Subject: [PATCH 194/346] Fix layout constness on IComputePipeline --- include/nbl/asset/IComputePipeline.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/IComputePipeline.h b/include/nbl/asset/IComputePipeline.h index 9ccef877c3..ba4d245473 100644 --- a/include/nbl/asset/IComputePipeline.h +++ b/include/nbl/asset/IComputePipeline.h @@ -26,8 +26,8 @@ class IComputePipeline : public IPipeline, public IComputePi inline const SCachedCreationParams& getCachedCreationParams() const { return m_params; } protected: - explicit 
IComputePipeline(const PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : - IPipeline(core::smart_refctd_ptr(layout)), + explicit IComputePipeline(PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : + IPipeline(core::smart_refctd_ptr(layout)), m_params(cachedParams) {} From 969bcb821ee38d7333a36b513d6f28e0ba1248fa Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:21:57 +0700 Subject: [PATCH 195/346] Fix ICPUAcclerationStructure to use computeDependantsImpl --- include/nbl/asset/ICPUAccelerationStructure.h | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 73365cbfce..3370e31cab 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -271,10 +271,12 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA inline core::unordered_set computeDependants() const override { - core::unordered_set dependants; - for (const auto& instance : *m_instances) - dependants.insert(instance.getBase().blas.get()); - return dependants; + return computeDependantsImpl(this); + } + + inline core::unordered_set computeDependants() override + { + return computeDependantsImpl(this); } // @@ -375,6 +377,16 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA core::smart_refctd_dynamic_array m_instances = nullptr; hlsl::acceleration_structures::top_level::BuildRangeInfo m_buildRangeInfo; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_BUILD_BIT; + + template + requires(std::same_as, ICPUTopLevelAccelerationStructure>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + core::unordered_set dependants; + for (const auto& instance : *self->m_instances) + dependants.insert(instance.getBase().blas.get()); + return 
dependants; + } }; } From 3e963393781dae658d90942c158a417ed2742aed Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:22:32 +0700 Subject: [PATCH 196/346] Fix ICPUAnimationLibrary to use computeDependantsImpl --- include/nbl/asset/ICPUAnimationLibrary.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPUAnimationLibrary.h b/include/nbl/asset/ICPUAnimationLibrary.h index 8a6cdaf52a..1663447b73 100644 --- a/include/nbl/asset/ICPUAnimationLibrary.h +++ b/include/nbl/asset/ICPUAnimationLibrary.h @@ -98,7 +98,12 @@ class ICPUAnimationLibrary final : public IAnimationLibrary, public inline core::unordered_set computeDependants() const override { - return { m_keyframeStorageBinding.buffer.get(), m_timestampStorageBinding.buffer.get(), m_animationStorageRange.buffer.get() }; + return computeDependantsImpl(this); + } + + inline core::unordered_set computeDependants() override + { + return computeDependantsImpl(this); } private: From 6de189d2a04c9bf3070ebe40153cbf3c38f405dc Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:23:01 +0700 Subject: [PATCH 197/346] Remove layout constness from ICPUComputePipeline --- include/nbl/asset/ICPUComputePipeline.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 27d16461a2..b940c2ae48 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -19,7 +19,7 @@ class ICPUComputePipeline final : public ICPUPipeline create(const ICPUPipelineLayout* layout) + static core::smart_refctd_ptr create(ICPUPipelineLayout* layout) { auto retval = new ICPUComputePipeline(layout); return core::smart_refctd_ptr(retval,core::dont_grab); @@ -61,14 +61,14 @@ class ICPUComputePipeline final : public ICPUPipeline clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + inline core::smart_refctd_ptr 
clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { auto newPipeline = new ICPUComputePipeline(layout.get()); newPipeline->m_specInfo = m_specInfo.clone(depth); return core::smart_refctd_ptr(newPipeline, core::dont_grab); } - explicit ICPUComputePipeline(const ICPUPipelineLayout* layout): + explicit ICPUComputePipeline(ICPUPipelineLayout* layout): base_t(layout, {}) {} From b0fe0904ef1bfe1bc7459e50628bd851ef0a5d39 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:23:35 +0700 Subject: [PATCH 198/346] Move ICPUDescriptorSet computeDependantsImpl to header --- include/nbl/asset/ICPUDescriptorSet.h | 33 +++++++++++++++++++++++++++ src/nbl/asset/ICPUDescriptorSet.cpp | 33 --------------------------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index c8a6f68d22..2498a438ca 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -87,6 +87,39 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet m_descriptorInfos[static_cast(IDescriptor::E_TYPE::ET_COUNT)]; + + template + requires(std::same_as, ICPUDescriptorSet>) + static auto computeDependantsImpl(Self* self) { + using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + core::unordered_set dependants = { self->m_layout.get() }; + for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) + { + if (!self->m_descriptorInfos[i]) continue; + const auto size = self->m_descriptorInfos[i]->size(); + for (auto desc_i = 0u; desc_i < size; desc_i++) + { + auto* desc = self->m_descriptorInfos[i]->operator[](desc_i).desc.get(); + if (!desc) continue; + switch (IDescriptor::GetTypeCategory(static_cast(i))) + { + case IDescriptor::EC_BUFFER: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_SAMPLER: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_IMAGE: + dependants.insert(static_cast(desc)); + 
case IDescriptor::EC_BUFFER_VIEW: + dependants.insert(static_cast(desc)); + case IDescriptor::EC_ACCELERATION_STRUCTURE: + dependants.insert(static_cast(desc)); + default: + break; + } + } + } + return dependants; + } }; } diff --git a/src/nbl/asset/ICPUDescriptorSet.cpp b/src/nbl/asset/ICPUDescriptorSet.cpp index a95074fdb7..730f0847f2 100644 --- a/src/nbl/asset/ICPUDescriptorSet.cpp +++ b/src/nbl/asset/ICPUDescriptorSet.cpp @@ -108,39 +108,6 @@ core::smart_refctd_ptr ICPUDescriptorSet::clone(uint32_t _depth) const return cp; } -template - requires(std::same_as, ICPUDescriptorSet>) -static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - core::unordered_set dependants = { self->m_layout.get() }; - for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) - { - if (!self->m_descriptorInfos[i]) continue; - const auto size = self->m_descriptorInfos[i]->size(); - for (auto desc_i = 0u; desc_i < size; desc_i++) - { - auto* desc = self->m_descriptorInfos[i]->operator[](desc_i).desc.get(); - if (!desc) continue; - switch (IDescriptor::GetTypeCategory(static_cast(i))) - { - case IDescriptor::EC_BUFFER: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_SAMPLER: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_IMAGE: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_BUFFER_VIEW: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_ACCELERATION_STRUCTURE: - dependants.insert(static_cast(desc)); - default: - break; - } - } - } - return dependants; -} - core::unordered_set ICPUDescriptorSet::computeDependants() const { return computeDependantsImpl(this); From 1d764ec0ca5c16787ec470c7dd1192b292b638eb Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:24:11 +0700 Subject: [PATCH 199/346] Remove layout constness from cpu graphics pipeline --- include/nbl/asset/ICPUGraphicsPipeline.h | 6 +++--- include/nbl/asset/IGraphicsPipeline.h | 4 ++-- 2 
files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 4a7ee3b695..4a1520880d 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -20,7 +20,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline create(const ICPUPipelineLayout* layout) + static core::smart_refctd_ptr create(ICPUPipelineLayout* layout) { auto retval = new ICPUGraphicsPipeline(layout); return core::smart_refctd_ptr(retval,core::dont_grab); @@ -79,7 +79,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline m_specInfos; private: - explicit ICPUGraphicsPipeline(const ICPUPipelineLayout* layout) + explicit ICPUGraphicsPipeline(ICPUPipelineLayout* layout) : base_t(layout, {}, {}) {} @@ -108,7 +108,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { auto* newPipeline = new ICPUGraphicsPipeline(layout.get()); newPipeline->m_params = m_params; diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index 090a368c2f..5b445afae5 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -109,8 +109,8 @@ class IGraphicsPipeline : public IPipeline, public IGraphics } protected: - explicit IGraphicsPipeline(const PipelineLayoutType* layout, const SCachedCreationParams& cachedParams, const renderpass_t* renderpass) : - IPipeline(core::smart_refctd_ptr(layout)), + explicit IGraphicsPipeline(PipelineLayoutType* layout, const SCachedCreationParams& cachedParams, renderpass_t* renderpass) : + IPipeline(core::smart_refctd_ptr(layout)), m_params(cachedParams), m_renderpass(core::smart_refctd_ptr(renderpass)) {} From 377f25d5e90cb85de3dcc11e6bdbb5d7129c59d2 Mon Sep 17 00:00:00 2001 From: 
kevyuu Date: Mon, 26 May 2025 19:24:30 +0700 Subject: [PATCH 200/346] Remove layout constness from cpu pipeline --- include/nbl/asset/ICPUPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 9674b872e0..069c9fc35e 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -144,7 +144,7 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe using PipelineNonAssetBase::PipelineNonAssetBase; virtual ~ICPUPipeline() = default; - virtual core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const = 0; + virtual core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const = 0; }; From 8809bdad28ace63bf92a87364c645018db772fe1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:25:06 +0700 Subject: [PATCH 201/346] Use computeDependantsImpl in cpu pipeline layout --- include/nbl/asset/ICPUPipelineLayout.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/nbl/asset/ICPUPipelineLayout.h b/include/nbl/asset/ICPUPipelineLayout.h index e755a22f07..4b668c1472 100644 --- a/include/nbl/asset/ICPUPipelineLayout.h +++ b/include/nbl/asset/ICPUPipelineLayout.h @@ -32,12 +32,12 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout computeDependants() const override { - core::unordered_set dependants; - for (auto i = 0; i < m_descSetLayouts.size(); i++) - { - if (m_descSetLayouts[i]) continue; - dependants.insert(m_descSetLayouts[i].get()); - } + return computeDependantsImpl(this); + } + + inline core::unordered_set computeDependants() override + { + return computeDependantsImpl(this); } // From 3c0b3ba8e30128c32c62e28f65ae77357d4bb0fb Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:25:26 +0700 Subject: [PATCH 202/346] Fix argument pack passing on IGPUPipeline --- include/nbl/video/IGPUPipeline.h | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index f9a32786bf..f2e9b79fef 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -116,7 +116,7 @@ class IGPUPipeline : public IBackendObject, public PipelineNonBackendObjectBase, template explicit IGPUPipeline(core::smart_refctd_ptr&& device, Args&&... args) : - PipelineNonBackendObjectBase(std::forward(args...)), IBackendObject(std::move(device)) + PipelineNonBackendObjectBase(std::forward(args)...), IBackendObject(std::move(device)) {} virtual ~IGPUPipeline() = default; From 53b45ec0db44f8e9fc8a7bf9e5a8dbfb70d6ad0f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:25:53 +0700 Subject: [PATCH 203/346] Remove layout constness from cpu ray tracing pipeline --- include/nbl/asset/ICPURayTracingPipeline.h | 2 +- include/nbl/asset/IRayTracingPipeline.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 618c851883..1296d8359a 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -123,7 +123,7 @@ class ICPURayTracingPipeline final : public ICPUPipeline clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { auto newPipeline = new ICPURayTracingPipeline(layout.get()); newPipeline->m_raygen = m_raygen.clone(depth); diff --git a/include/nbl/asset/IRayTracingPipeline.h b/include/nbl/asset/IRayTracingPipeline.h index 82b47f1fcb..b97d8d7002 100644 --- a/include/nbl/asset/IRayTracingPipeline.h +++ b/include/nbl/asset/IRayTracingPipeline.h @@ -48,8 +48,8 @@ class IRayTracingPipeline : public IPipeline, public IRayTra inline const SCachedCreationParams& getCachedCreationParams() const { return 
m_params; } protected: - explicit IRayTracingPipeline(const PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : - IPipeline(core::smart_refctd_ptr(layout)), + explicit IRayTracingPipeline(PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : + IPipeline(core::smart_refctd_ptr(layout)), m_params(cachedParams) {} From e249931d74f676781980f77b3aab9a5d4da45be1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:26:17 +0700 Subject: [PATCH 204/346] Add cached parameter to SCreationParams for gpu compute pipeline --- include/nbl/video/IGPUComputePipeline.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 2eb03cf2da..36813699c0 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -65,6 +65,7 @@ class IGPUComputePipeline : public IGPUPipeline flags = FLAGS::NONE; + SCachedCreationParams cached = {}; SShaderSpecInfo shader = {}; }; @@ -75,7 +76,7 @@ class IGPUComputePipeline : public IGPUPipeline(params.layout->getOriginDevice()), core::smart_refctd_ptr(params.layout)), m_flags(params.flags) + IGPUPipeline(core::smart_refctd_ptr(params.layout->getOriginDevice()), params.layout, params.cached), m_flags(params.flags) {} virtual ~IGPUComputePipeline() = default; From 006dd7d32bea9197354c9c0ce49e49a8bf4c9c81 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:26:36 +0700 Subject: [PATCH 205/346] Remove layout constness on IPipeline --- include/nbl/asset/IPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h index d2a85c42fb..eb54542403 100644 --- a/include/nbl/asset/IPipeline.h +++ b/include/nbl/asset/IPipeline.h @@ -130,7 +130,7 @@ class IPipeline : public IPipelineBase protected: - inline IPipeline(core::smart_refctd_ptr&& _layout) + inline IPipeline(core::smart_refctd_ptr&& 
_layout) : m_layout(std::move(_layout)) {} core::smart_refctd_ptr m_layout; From 389c358beb5b42cf47a8fc5712c337d5882afc8c Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 19:26:50 +0700 Subject: [PATCH 206/346] Fix IGPURayTracingPipeline construction --- include/nbl/video/IGPURayTracingPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index 66e3a01072..beaecd772a 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -170,7 +170,7 @@ class IGPURayTracingPipeline : public IGPUPipeline(params.layout->getOriginDevice()), params), + IGPURayTracingPipeline(const SCreationParams& params) : IGPUPipeline(core::smart_refctd_ptr(params.layout->getOriginDevice()), params.layout, params.cached), m_flags(params.flags) {} From 81df19b259b2a2a83a411b9cfd55c62361b58769 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 20:24:52 +0700 Subject: [PATCH 207/346] Fix debloatedHitSpecData error in ILogicalDevice --- src/nbl/video/ILogicalDevice.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 7714219836..62e364a71a 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -1123,8 +1123,8 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline { *debloatedHitSpecData = { .closestHit = debloatTask.debloat(hit.closestHit, debloatedShaders), - .intersection = debloatTask.debloat(hit.intersection, debloatedShaders), .anyHit = debloatTask.debloat(hit.anyHit, debloatedShaders), + .intersection = debloatTask.debloat(hit.intersection, debloatedShaders), }; debloatedHitSpecData++; } From 2d97ce83df891174013197e7127bd8bbd54106de Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 26 May 2025 20:25:10 +0700 Subject: [PATCH 208/346] Fix CComputeBlit --- 
include/nbl/asset/IComputePipeline.h | 1 + include/nbl/video/utilities/CComputeBlit.h | 2 +- src/nbl/video/utilities/CComputeBlit.cpp | 20 +++++++++++--------- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/include/nbl/asset/IComputePipeline.h b/include/nbl/asset/IComputePipeline.h index ba4d245473..2cb38b39f1 100644 --- a/include/nbl/asset/IComputePipeline.h +++ b/include/nbl/asset/IComputePipeline.h @@ -24,6 +24,7 @@ class IComputePipeline : public IPipeline, public IComputePi public: inline const SCachedCreationParams& getCachedCreationParams() const { return m_params; } + inline SCachedCreationParams& getCachedCreationParams() { return m_params; } protected: explicit IComputePipeline(PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : diff --git a/include/nbl/video/utilities/CComputeBlit.h b/include/nbl/video/utilities/CComputeBlit.h index 9a02915187..66f6871dc6 100644 --- a/include/nbl/video/utilities/CComputeBlit.h +++ b/include/nbl/video/utilities/CComputeBlit.h @@ -67,7 +67,7 @@ class CComputeBlit : public core::IReferenceCounted // required CAssetConverter* converter; // in theory we _could_ accept either pipeline layout type (or just the base) and make the CPU one back from the GPU - const asset::ICPUPipelineLayout* layout; + asset::ICPUPipelineLayout* layout; // must be Uniform Texel Buffer descriptor type hlsl::SBindingInfo kernelWeights; // must be Sampled Image descriptor type diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp index 4c3bbaa03c..edac6e1f5c 100644 --- a/src/nbl/video/utilities/CComputeBlit.cpp +++ b/src/nbl/video/utilities/CComputeBlit.cpp @@ -39,7 +39,7 @@ auto CComputeBlit::createAndCachePipelines(const SPipelinesCreateInfo& info) -> const auto sharedMemoryPerInvocation = core::max(singlePixelStorage*4,info.sharedMemoryPerInvocation); retval.sharedMemorySize = sharedMemoryPerInvocation*retval.workgroupSize; - const auto* layout = info.layout; + auto* 
layout = info.layout; // const auto common = [&]()->std::string @@ -77,14 +77,16 @@ struct ConstevalParameters source->setContentHash(source->computeContentHash()); } - ICPUComputePipeline::SCreationParams params = {}; - params.layout = layout; - params.shader.entryPoint = "main"; - params.shader.shader = shader.get(); - params.shader.requiredSubgroupSize = static_cast(findMSB(limits.maxSubgroupSize)); - // needed for the prefix and reductions to work - params.shader.requireFullSubgroups = true; - return ICPUComputePipeline::create(params); + auto pipeline = ICPUComputePipeline::create(layout); + pipeline->getSpecInfoMut(ESS_COMPUTE)[0] = { + .shader = shader, + .entryPoint = "main", + .requiredSubgroupSize = static_cast(findMSB(limits.maxSubgroupSize)), + }; + pipeline->getCachedCreationParams() = { + .requireFullSubgroups = true, + }; + return pipeline; }; // create blit pipeline cpuPplns[0] = createPipeline("nbl/builtin/hlsl/blit/default_blit.comp.hlsl"); From 7917918f64409e25052e99d74e1b1343df8d1565 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 26 May 2025 15:39:41 +0200 Subject: [PATCH 209/346] finalize NSC image production from CMake, leave a few comments regarding HyperV runner --- tools/nsc/CMakeLists.txt | 132 +++++++++++++++++++++------------------ 1 file changed, 70 insertions(+), 62 deletions(-) diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index efe7741f4e..158fd5caf8 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -9,6 +9,8 @@ set(NBL_NSC_PREINSTALL_DIRECTORY "${GODBOLT_BINARY_PRETEST_DIRECTORY}/.preinstal make_directory("${NBL_NSC_PREINSTALL_DIRECTORY}") set(NBL_DOCKER_CT_NSC_VOLUME_SOURCE "${GODBOLT_BINARY_DIRECTORY}/install") +set(NBL_DOCKER_CTX_DIR "${GODBOLT_BINARY_DIRECTORY}/.ctx") +make_directory("${NBL_DOCKER_CTX_DIR}") set(NBL_DOCKER_INSTALL_BAT_FILENAME install-production.bat) set(NBL_DOCKER_CT_NSC_INSTALL_BAT "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}/${NBL_DOCKER_INSTALL_BAT_FILENAME}") 
@@ -57,27 +59,24 @@ add_test(NAME NBL_NSC_DUMP_BUILD_INFO_TEST if(NBL_ENABLE_DOCKER_INTEGRATION) +set(BASE_IMAGE ghcr.io/devsh-graphics-programming/compiler-explorer-docker:nano-2022) + find_program(CTEST_EXE NAMES ctest REQUIRED) find_program(DOCKER_EXE NAMES docker REQUIRED) find_file(DXIL_DLL NAMES dxil.dll HINTS "$ENV{CMAKE_WINDOWS_KITS_10_DIR}/Redist/D3D/x64" "C:/Program Files (x86)/Windows Kits/10/Redist/D3D/x64" REQUIRED) -cmake_path(GET DXIL_DLL PARENT_PATH DXIL_DIR) -cmake_path(NATIVE_PATH DXIL_DIR NORMALIZE DXIL_DIR) find_file(ICU_DLL NAMES icu.dll HINTS REQUIRED) -cmake_path(GET ICU_DLL PARENT_PATH ICU_DIR) -cmake_path(NATIVE_PATH ICU_DIR NORMALIZE ICU_DIR) set(ICU_GLOBALIZATION_DIR "C:/Windows/Globalization/ICU") find_file(ICUDTL_DAT NAMES icudtl.dat HINTS "${ICU_GLOBALIZATION_DIR}" REQUIRED) find_file(UCRTBASED_DLL NAMES ucrtbased.dll HINTS ${UCRTBASED_DLL_DIR} REQUIRED) -cmake_path(GET UCRTBASED_DLL PARENT_PATH UCRTBASED_DIR) -cmake_path(NATIVE_PATH UCRTBASED_DIR NORMALIZE UCRTBASED_DIR) find_program(SPIRV_DIS_EXE NAMES spirv-dis HINTS "${VULKAN_SDK}/Bin" REQUIRED) cmake_path(GET SPIRV_DIS_EXE PARENT_PATH SPIRV_DIS_DIR) cmake_path(NATIVE_PATH SPIRV_DIS_DIR NORMALIZE SPIRV_DIS_DIR) +include(InstallRequiredSystemLibraries) cmake_path(NATIVE_PATH MSVC_REDIST_DIR NORMALIZE TOOLSET_REDIST_PATH) file(GLOB_RECURSE VC_MODULES LIST_DIRECTORIES false @@ -85,33 +84,55 @@ file(GLOB_RECURSE VC_MODULES LIST_DIRECTORIES false "${TOOLSET_REDIST_PATH}/debug_nonredist/x64/*.DebugCRT/*.dll" ) -foreach(MODULE ${VC_MODULES}) - get_filename_component(DIR ${MODULE} DIRECTORY) - cmake_path(NATIVE_PATH DIR NORMALIZE DIR) - list(APPEND VC_MODULE_DIRS ${DIR}) -endforeach() - -if(NOT VC_MODULE_DIRS) +if(NOT VC_MODULES) message(FATAL_ERROR "Failed to GLOB for VC Redist modules!") endif() +make_directory("${NBL_DOCKER_CTX_DIR}/Runtimes") +make_directory("${NBL_DOCKER_CTX_DIR}/Nabla") +execute_process( + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${DXIL_DLL}" 
"${NBL_DOCKER_CTX_DIR}/Runtimes" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${ICU_DLL}" "${NBL_DOCKER_CTX_DIR}/Runtimes" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${UCRTBASED_DLL}" "${NBL_DOCKER_CTX_DIR}/Runtimes" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${SPIRV_DIS_EXE}" "${NBL_DOCKER_CTX_DIR}/Runtimes" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different ${VC_MODULES} "${NBL_DOCKER_CTX_DIR}/Runtimes" + COMMAND "${CMAKE_COMMAND}" -E copy_directory_if_different ${ICU_GLOBALIZATION_DIR} "${NBL_DOCKER_CTX_DIR}/Globalization/ICU" +) + set(CT_RUNTIMES C:/pack/runtimes) cmake_path(NATIVE_PATH CT_RUNTIMES NORMALIZE CT_RUNTIMES) -set(HOST_MOUNT_DIRS ${VC_MODULE_DIRS} ${SPIRV_DIS_DIR} ${UCRTBASED_DIR} ${DXIL_DIR} ${ICU_DIR}) -list(REMOVE_DUPLICATES HOST_MOUNT_DIRS) -set(ix 0) -foreach(DIR ${HOST_MOUNT_DIRS}) - set(TARGET_MOUNT_DIR "${CT_RUNTIMES}/system/${ix}") - cmake_path(NATIVE_PATH TARGET_MOUNT_DIR NORMALIZE TARGET_MOUNT_DIR) +set(NBL_DOCKER_CT_NSC_VOLUME_TARGET "${CT_RUNTIMES}/Nabla") +cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_SOURCE NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_SOURCE) +cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_TARGET NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_TARGET) +cmake_path(NATIVE_PATH NBL_NSC_PREINSTALL_DIRECTORY NORMALIZE NBL_NSC_PREINSTALL_DIRECTORY) + +string(CONFIGURE [=[ +# syntax=docker/dockerfile:1 +# escape=` +FROM @BASE_IMAGE@ +USER ContainerAdministrator - list(APPEND DOCKER_CLI_ARGS -v "${DIR}:${TARGET_MOUNT_DIR}:ro") - list(APPEND CT_MOUNT_DIRS "${TARGET_MOUNT_DIR}") +COPY Runtimes/ C:/Windows/System32/ +COPY Globalization/ICU/ C:/Windows/Globalization/ICU/ - math(EXPR ix "${ix} + 1" OUTPUT_FORMAT DECIMAL) -endforeach() +COPY Nabla/ @NBL_DOCKER_CT_NSC_VOLUME_TARGET@ +COPY hlsl.local.properties.cmake C:/Compiler-Explorer/etc/config/hlsl.local.properties + +ENV NBL_INSTALL_DIRECTORY=@NBL_DOCKER_CT_NSC_VOLUME_TARGET@ ` +NBL_EXPLICIT_MODULE_LOAD_LOG=ON + +WORKDIR C:/Compiler-Explorer +ENTRYPOINT ["node", 
"--no-warnings", "--no-deprecation", "--import=tsx", "./app.js", "--language", "hlsl"] +]=] INSTRUCTIONS @ONLY) + +set(DOCKERFILE "${NBL_DOCKER_CTX_DIR}/Dockerfile") +file(WRITE "${DOCKERFILE}" "${INSTRUCTIONS}") + +if(NOT DEFINED NSC_IMAGE_NAME) + set(NSC_IMAGE_NAME nano/godbolt/nsc) +endif() -set(NBL_DOCKER_CT_NSC_VOLUME_TARGET "${CT_RUNTIMES}/Nabla") set(NBL_BUILD_INFO_POSTPROCESS_COMMAND "${CMAKE_COMMAND}" "-DNBL_EXECUTABLE_PATH=${NBL_NSC_PREINSTALL_TARGET_EXE_FILEPATH}" @@ -120,13 +141,8 @@ set(NBL_BUILD_INFO_POSTPROCESS_COMMAND "-DNBL_OUTPUT_EXE_OVERRIDE=$" -P "${NBL_ROOT_PATH}/cmake/scripts/nbl/nablaBuildInfo.cmake" ) -cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_SOURCE NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_SOURCE) -cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_TARGET NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_TARGET) -cmake_path(NATIVE_PATH NBL_NSC_PREINSTALL_DIRECTORY NORMALIZE NBL_NSC_PREINSTALL_DIRECTORY) -list(APPEND DOCKER_CLI_ARGS -v "${NBL_NSC_PREINSTALL_DIRECTORY}:${NBL_DOCKER_CT_NSC_VOLUME_TARGET}") -#list(APPEND DOCKER_CLI_ARGS -v "${ICU_GLOBALIZATION_DIR}:${ICU_GLOBALIZATION_DIR}:ro") -set(NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}/hlsl.local.properties.cmake") +set(NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT "${NBL_DOCKER_CTX_DIR}/hlsl.local.properties.cmake") string(GENEX_STRIP "${NBL_PACKAGE_RUNTIME_EXE_DIR_PATH}" NBL_RELATIVE_ENTRY) set(OUTPUT_CONFIG_FILE $) set(NBL_CE_GENERATE_CONFIG_COMMAND @@ -139,25 +155,6 @@ set(NBL_CE_GENERATE_CONFIG_COMMAND -P "${CMAKE_CURRENT_SOURCE_DIR}/ce-generate-config.cmake" ) -set(CT_ENV_FILE "${CMAKE_CURRENT_BINARY_DIR}/.env") -string(CONFIGURE [=[ -CT_MOUNT_DIRS=@CT_MOUNT_DIRS@ -NBL_INSTALL_DIRECTORY=@NBL_DOCKER_CT_NSC_VOLUME_TARGET@ -NBL_EXPLICIT_MODULE_LOAD_LOG=ON -]=] ENV_CONTENT @ONLY) -file(WRITE "${CT_ENV_FILE}" "${ENV_CONTENT}") -list(APPEND DOCKER_CLI_ARGS --env-file "${CT_ENV_FILE}") - -set(CT_SETUP_FILE "${CMAKE_CURRENT_BINARY_DIR}/setup.bat") -string(CONFIGURE [=[ -@echo off -set 
"PATH=%PATH%;%CT_MOUNT_DIRS%" -setx PATH "%PATH%" /M -node --no-warnings --no-deprecation --import=tsx ./app.js --language hlsl -]=] SETUP_CONTENT @ONLY) -file(WRITE "${CT_SETUP_FILE}" "${SETUP_CONTENT}") -list(APPEND DOCKER_CLI_ARGS) - function(PROMOTE_PROCESS_ISOLATION HOST_KERNEL BASE VAR) set(${VAR} True) @@ -172,9 +169,9 @@ function(PROMOTE_PROCESS_ISOLATION HOST_KERNEL BASE VAR) macro(TO_PROCESS IMAGE TARGET_KERNEL) if(${HOST_KERNEL} VERSION_LESS ${TARGET_KERNEL}) set(${VAR} False) - message(STATUS "Host kernel \"${HOST_KERNEL}\" version too low to promote process isolation for \"${IMAGE}\" [${TARGET_KERNEL}] and requires falling back to HyperV. Please update your host OS.") + message(STATUS "Host kernel \"${HOST_KERNEL}\" version too low to promote process isolation with \"${IMAGE}\" [${TARGET_KERNEL}] and requires falling back to HyperV. Please update your host OS.") else() - message(STATUS "Promoting \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation with host kernel [${HOST_KERNEL}] version") + message(STATUS "\"${IMAGE}\" [${TARGET_KERNEL}] can be promoted to process isolation with host kernel [${HOST_KERNEL}] version") endif() endmacro() @@ -195,12 +192,16 @@ endfunction() execute_process(COMMAND cmd /C ver OUTPUT_VARIABLE PIPE OUTPUT_STRIP_TRAILING_WHITESPACE) string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+" HOST_KERNEL "${PIPE}") - -set(BASE_IMAGE ghcr.io/devsh-graphics-programming/compiler-explorer-docker:nano-2022) PROMOTE_PROCESS_ISOLATION(${HOST_KERNEL} ${BASE_IMAGE} USE_PROCESS_ISOLATION) if(USE_PROCESS_ISOLATION) - list(APPEND DOCKER_CLI_ARGS --isolation process) + set(ISOLATION --isolation process) +else() + # TODO: we will need to use GET_RUNTIME_DEPENDENCIES which uses objdump + # https://cmake.org/cmake/help/latest/command/file.html#get-runtime-dependencies + # to collect *all* required deps and copy (FROM at least server core) to destination + # image, it will fail currently if we fully isolate it with VM due to lack of certain DLLs 
+ message(FATAL_ERROR "HyperV is NOT supported! Update your OS!") # yet endif() set(ORPHAN nsc-orphan) @@ -210,20 +211,27 @@ set(NBL_CE_ENDPOINT_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/endpoint.py") set(NBL_NSC_BASIC_HLSL_JPAYLOAD "${CMAKE_CURRENT_SOURCE_DIR}/docker/godbolt/hlsl-basic-compile-payload.json") add_custom_target(run-compiler-explorer ALL - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Clearing NSC orphans.." + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Killing remaining NSC orphans" COMMAND "${DOCKER_EXE}" rm -f ${ORPHAN} || "${CMAKE_COMMAND}" -E true - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Executing CTests.." + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Executing CTests" COMMAND "${CTEST_EXE}" -C $ --stop-on-failure + + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Generating NSC build info" COMMAND ${NBL_BUILD_INFO_POSTPROCESS_COMMAND} + + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Generating NSC godbolt config" COMMAND ${NBL_CE_GENERATE_CONFIG_COMMAND} - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Running new NSC orphan container.." 
- COMMAND "${DOCKER_EXE}" run -di -p 80:10240 --name ${ORPHAN} --entrypoint cmd ${DOCKER_CLI_ARGS} ${BASE_IMAGE} - COMMAND "${DOCKER_EXE}" cp "${OUTPUT_CONFIG_FILE}" ${ORPHAN}:C:\\Compiler-Explorer\\etc\\config\\hlsl.local.properties - COMMAND "${DOCKER_EXE}" cp "${CT_SETUP_FILE}" ${ORPHAN}:C:\\setup.cmd - COMMAND "${DOCKER_EXE}" exec -d ${ORPHAN} C:\\setup.cmd - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --interval 5 --ticks 25 + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Updating NSC package context" + COMMAND "${CMAKE_COMMAND}" -E copy_directory_if_different "${NBL_NSC_PREINSTALL_DIRECTORY}" "${NBL_DOCKER_CTX_DIR}/Nabla" + + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Building NSC Godbolt image" + COMMAND "${DOCKER_EXE}" build ${ISOLATION} -f "${DOCKERFILE}" -t ${NSC_IMAGE_NAME} "${NBL_DOCKER_CTX_DIR}" + + COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Running new NSC orphan container" + COMMAND "${DOCKER_EXE}" run -di -p 80:10240 ${ISOLATION} --name ${ORPHAN} ${NSC_IMAGE_NAME} + COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --interval 5 --ticks 12 COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "Compiler Explorer is running, type \"localhost\" in your browser!" COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Post-Checking if NSC container is able to compile basic shader input..." From 27c50d70a595db1a7ca6f029bc7122cb32e438ec Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 26 May 2025 17:00:11 +0200 Subject: [PATCH 210/346] compression! 
--- tools/nsc/CMakeLists.txt | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index 158fd5caf8..157c4fa646 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -99,7 +99,7 @@ execute_process( COMMAND "${CMAKE_COMMAND}" -E copy_directory_if_different ${ICU_GLOBALIZATION_DIR} "${NBL_DOCKER_CTX_DIR}/Globalization/ICU" ) -set(CT_RUNTIMES C:/pack/runtimes) +set(CT_RUNTIMES C:/runtimes) cmake_path(NATIVE_PATH CT_RUNTIMES NORMALIZE CT_RUNTIMES) set(NBL_DOCKER_CT_NSC_VOLUME_TARGET "${CT_RUNTIMES}/Nabla") @@ -110,20 +110,34 @@ cmake_path(NATIVE_PATH NBL_NSC_PREINSTALL_DIRECTORY NORMALIZE NBL_NSC_PREINSTALL string(CONFIGURE [=[ # syntax=docker/dockerfile:1 # escape=` -FROM @BASE_IMAGE@ -USER ContainerAdministrator -COPY Runtimes/ C:/Windows/System32/ -COPY Globalization/ICU/ C:/Windows/Globalization/ICU/ +# ---------------- COMPRESS STEP ---------------- +FROM @BASE_IMAGE@ as compress + +COPY --link Runtimes/ C:/pack/Windows/System32/ +COPY --link Globalization/ICU/ C:/pack/Windows/Globalization/ICU/ +COPY --link Nabla/ C:/pack/runtimes/Nabla/ + +ARG IMPL_COMPRESSION_OPTIONS=-T0 +ARG IMPL_COMPRESSION_LEVEL=3 + +WORKDIR C:\pack +RUN ` +tar -cf - Windows | zstd %IMPL_COMPRESSION_OPTIONS% -%IMPL_COMPRESSION_LEVEL% -o windows-artifacts.tar.zst && ` +tar -cf - runtimes | zstd %IMPL_COMPRESSION_OPTIONS% -%IMPL_COMPRESSION_LEVEL% -o nabla-artifacts.tar.zst + +# ---------------- FINAL IMAGE ---------------- +FROM @BASE_IMAGE@ -COPY Nabla/ @NBL_DOCKER_CT_NSC_VOLUME_TARGET@ +COPY --link --from=compress ["C:/pack/windows-artifacts.tar.zst", "C:/pack/"] +COPY --link --from=compress ["C:/pack/nabla-artifacts.tar.zst", "C:/pack/"] COPY hlsl.local.properties.cmake C:/Compiler-Explorer/etc/config/hlsl.local.properties ENV NBL_INSTALL_DIRECTORY=@NBL_DOCKER_CT_NSC_VOLUME_TARGET@ ` NBL_EXPLICIT_MODULE_LOAD_LOG=ON WORKDIR C:/Compiler-Explorer -ENTRYPOINT ["node", 
"--no-warnings", "--no-deprecation", "--import=tsx", "./app.js", "--language", "hlsl"] +ENTRYPOINT ["C:\\unpack.bat", "&&", "node", "--no-warnings", "--no-deprecation", "--import=tsx", "./app.js", "--language", "hlsl"] ]=] INSTRUCTIONS @ONLY) set(DOCKERFILE "${NBL_DOCKER_CTX_DIR}/Dockerfile") From e3848cef01a6d243d2474c0a195d81204d016406 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 26 May 2025 23:17:19 +0200 Subject: [PATCH 211/346] Make docker in docker build, adjust tools/nsc/CMakeLists.txt, update build-nabla.yml to produce NSC image from builder container --- .github/workflows/build-nabla.yml | 7 +- CMakePresets.json | 6 +- tools/nsc/CMakeLists.txt | 147 +++++++++++++++++------------- 3 files changed, 96 insertions(+), 64 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 7dc8759e84..a194734472 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -54,6 +54,11 @@ jobs: - name: Run Container run: | + $ctx = docker context show + $dockerHost = (docker context inspect $ctx | ConvertFrom-Json).Endpoints.docker.Host + $pipeName = [regex]::Match($dockerHost, '/pipe/(?.+)$').Groups['n'].Value + $pipeHost = "\\.\pipe\$pipeName" + docker run ` --entrypoint ${{ env.entry }} -di --isolation process ` --env-file .\docker\ci-windows.env ` @@ -87,7 +92,7 @@ jobs: docker exec orphan ` ${{ env.entry }} ${{ env.cmd }} -Command cmake --build ` --preset ci-build-dynamic-${{ matrix.vendor }} ` - -t nsc --config ${{ matrix.config }} + -t run-compiler-explorer --config ${{ matrix.config }} - name: Container – Install NSC run: | diff --git a/CMakePresets.json b/CMakePresets.json index ad3ae50b6d..359ec6fb02 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -91,7 +91,8 @@ "inherits": "ci-configure-static-windows-base", "generator": "Ninja Multi-Config", "cacheVariables": { - "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-msvc-toolchain.cmake" + 
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-msvc-toolchain.cmake", + "NBL_ENABLE_DOCKER_INTEGRATION": "ON" } }, { @@ -99,7 +100,8 @@ "inherits": "ci-configure-dynamic-windows-base", "generator": "Ninja Multi-Config", "cacheVariables": { - "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-msvc-toolchain.cmake" + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-msvc-toolchain.cmake", + "NBL_ENABLE_DOCKER_INTEGRATION": "ON" } }, { diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index 157c4fa646..b0fec5b7f2 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -66,10 +66,7 @@ find_program(DOCKER_EXE NAMES docker REQUIRED) find_file(DXIL_DLL NAMES dxil.dll HINTS "$ENV{CMAKE_WINDOWS_KITS_10_DIR}/Redist/D3D/x64" "C:/Program Files (x86)/Windows Kits/10/Redist/D3D/x64" REQUIRED) -find_file(ICU_DLL NAMES icu.dll HINTS REQUIRED) -set(ICU_GLOBALIZATION_DIR "C:/Windows/Globalization/ICU") -find_file(ICUDTL_DAT NAMES icudtl.dat HINTS "${ICU_GLOBALIZATION_DIR}" REQUIRED) - +set(ICU_GLOBALIZATION_DIR C:\\Windows\\Globalization\\ICU) find_file(UCRTBASED_DLL NAMES ucrtbased.dll HINTS ${UCRTBASED_DLL_DIR} REQUIRED) find_program(SPIRV_DIS_EXE NAMES spirv-dis HINTS "${VULKAN_SDK}/Bin" REQUIRED) @@ -77,6 +74,15 @@ cmake_path(GET SPIRV_DIS_EXE PARENT_PATH SPIRV_DIS_DIR) cmake_path(NATIVE_PATH SPIRV_DIS_DIR NORMALIZE SPIRV_DIS_DIR) include(InstallRequiredSystemLibraries) + +if(NOT MSVC_REDIST_DIR) + if(MSVC_REDIST_BASE) # fallback to our CI toolset + set(MSVC_REDIST_DIR "${MSVC_REDIST_BASE}") + else() + message(FATAL_ERROR "Could not find MSVC_REDIST_DIR, define yourself!") + endif() +endif() + cmake_path(NATIVE_PATH MSVC_REDIST_DIR NORMALIZE TOOLSET_REDIST_PATH) file(GLOB_RECURSE VC_MODULES LIST_DIRECTORIES false @@ -92,11 +98,9 @@ make_directory("${NBL_DOCKER_CTX_DIR}/Runtimes") make_directory("${NBL_DOCKER_CTX_DIR}/Nabla") execute_process( COMMAND "${CMAKE_COMMAND}" -E 
copy_if_different "${DXIL_DLL}" "${NBL_DOCKER_CTX_DIR}/Runtimes" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${ICU_DLL}" "${NBL_DOCKER_CTX_DIR}/Runtimes" COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${UCRTBASED_DLL}" "${NBL_DOCKER_CTX_DIR}/Runtimes" COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${SPIRV_DIS_EXE}" "${NBL_DOCKER_CTX_DIR}/Runtimes" COMMAND "${CMAKE_COMMAND}" -E copy_if_different ${VC_MODULES} "${NBL_DOCKER_CTX_DIR}/Runtimes" - COMMAND "${CMAKE_COMMAND}" -E copy_directory_if_different ${ICU_GLOBALIZATION_DIR} "${NBL_DOCKER_CTX_DIR}/Globalization/ICU" ) set(CT_RUNTIMES C:/runtimes) @@ -107,6 +111,7 @@ cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_SOURCE NORMALIZE NBL_DOCKER_CT_N cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_TARGET NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_TARGET) cmake_path(NATIVE_PATH NBL_NSC_PREINSTALL_DIRECTORY NORMALIZE NBL_NSC_PREINSTALL_DIRECTORY) +set(CORE_IMAGE mcr.microsoft.com/windows/servercore:ltsc2022) string(CONFIGURE [=[ # syntax=docker/dockerfile:1 # escape=` @@ -114,8 +119,9 @@ string(CONFIGURE [=[ # ---------------- COMPRESS STEP ---------------- FROM @BASE_IMAGE@ as compress +COPY --link --from=@CORE_IMAGE@ C:/Windows/System32/icu.dll C:/pack/Windows/System32/ +COPY --link --from=@CORE_IMAGE@ C:/Windows/Globalization/ICU/ C:/pack/Windows/Globalization/ICU/ COPY --link Runtimes/ C:/pack/Windows/System32/ -COPY --link Globalization/ICU/ C:/pack/Windows/Globalization/ICU/ COPY --link Nabla/ C:/pack/runtimes/Nabla/ ARG IMPL_COMPRESSION_OPTIONS=-T0 @@ -134,7 +140,8 @@ COPY --link --from=compress ["C:/pack/nabla-artifacts.tar.zst", "C:/pack/"] COPY hlsl.local.properties.cmake C:/Compiler-Explorer/etc/config/hlsl.local.properties ENV NBL_INSTALL_DIRECTORY=@NBL_DOCKER_CT_NSC_VOLUME_TARGET@ ` -NBL_EXPLICIT_MODULE_LOAD_LOG=ON +NBL_EXPLICIT_MODULE_LOAD_LOG=ON ` +ICU_DATA=C:\Windows\Globalization\ICU WORKDIR C:/Compiler-Explorer ENTRYPOINT ["C:\\unpack.bat", "&&", "node", "--no-warnings", "--no-deprecation", 
"--import=tsx", "./app.js", "--language", "hlsl"] @@ -147,27 +154,9 @@ if(NOT DEFINED NSC_IMAGE_NAME) set(NSC_IMAGE_NAME nano/godbolt/nsc) endif() -set(NBL_BUILD_INFO_POSTPROCESS_COMMAND - "${CMAKE_COMMAND}" - "-DNBL_EXECUTABLE_PATH=${NBL_NSC_PREINSTALL_TARGET_EXE_FILEPATH}" - "-DNBL_BUILD_INFO=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" - "-DNBL_OUTPUT_FILE=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" - "-DNBL_OUTPUT_EXE_OVERRIDE=$" - -P "${NBL_ROOT_PATH}/cmake/scripts/nbl/nablaBuildInfo.cmake" -) - set(NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT "${NBL_DOCKER_CTX_DIR}/hlsl.local.properties.cmake") string(GENEX_STRIP "${NBL_PACKAGE_RUNTIME_EXE_DIR_PATH}" NBL_RELATIVE_ENTRY) set(OUTPUT_CONFIG_FILE $) -set(NBL_CE_GENERATE_CONFIG_COMMAND - "${CMAKE_COMMAND}" - "-DSPIRV_DIS_EXE=spirv-dis.exe" - "-DNSC_RELEASE_BUILD_INFO=$" - "-DNSC_RELWITHDEBINFO_BUILD_INFO=$" - "-DNSC_DEBUG_BUILD_INFO=$" - "-DOUTPUT_CONFIG_FILE=${OUTPUT_CONFIG_FILE}" - -P "${CMAKE_CURRENT_SOURCE_DIR}/ce-generate-config.cmake" -) function(PROMOTE_PROCESS_ISOLATION HOST_KERNEL BASE VAR) set(${VAR} True) @@ -209,52 +198,88 @@ string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+" HOST_KERNEL "${PIPE}") PROMOTE_PROCESS_ISOLATION(${HOST_KERNEL} ${BASE_IMAGE} USE_PROCESS_ISOLATION) if(USE_PROCESS_ISOLATION) - set(ISOLATION --isolation process) + set(ISOLATION "--isolation process") else() - # TODO: we will need to use GET_RUNTIME_DEPENDENCIES which uses objdump + # NOTE: we would need to use GET_RUNTIME_DEPENDENCIES which uses objdump # https://cmake.org/cmake/help/latest/command/file.html#get-runtime-dependencies - # to collect *all* required deps and copy (FROM at least server core) to destination + # to collect *all* missing deps and copy (FROM at least server core) to destination nano # image, it will fail currently if we fully isolate it with VM due to lack of certain DLLs - message(FATAL_ERROR "HyperV is NOT supported! Update your OS!") # yet + message(FATAL_ERROR "HyperV is NOT supported! 
Update your OS!") endif() set(ORPHAN nsc-orphan) -set(NBL_CE_URL http://localhost:80) +set(NBL_CE_URL http://${ORPHAN}:10240) set(NBL_CE_HEALTHY_CHECK_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/ce_healthy_check.py") set(NBL_CE_ENDPOINT_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/endpoint.py") set(NBL_NSC_BASIC_HLSL_JPAYLOAD "${CMAKE_CURRENT_SOURCE_DIR}/docker/godbolt/hlsl-basic-compile-payload.json") -add_custom_target(run-compiler-explorer ALL - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Killing remaining NSC orphans" - COMMAND "${DOCKER_EXE}" rm -f ${ORPHAN} || "${CMAKE_COMMAND}" -E true - - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Executing CTests" - COMMAND "${CTEST_EXE}" -C $ --stop-on-failure - - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Generating NSC build info" - COMMAND ${NBL_BUILD_INFO_POSTPROCESS_COMMAND} - - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Generating NSC godbolt config" - COMMAND ${NBL_CE_GENERATE_CONFIG_COMMAND} - - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Updating NSC package context" - COMMAND "${CMAKE_COMMAND}" -E copy_directory_if_different "${NBL_NSC_PREINSTALL_DIRECTORY}" "${NBL_DOCKER_CTX_DIR}/Nabla" - - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Building NSC Godbolt image" - COMMAND "${DOCKER_EXE}" build ${ISOLATION} -f "${DOCKERFILE}" -t ${NSC_IMAGE_NAME} "${NBL_DOCKER_CTX_DIR}" - - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Running new NSC orphan container" - COMMAND "${DOCKER_EXE}" run -di -p 80:10240 ${ISOLATION} --name ${ORPHAN} ${NSC_IMAGE_NAME} - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --interval 5 --ticks 12 - COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "Compiler Explorer is running, type \"localhost\" in your browser!" - - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Post-Checking if NSC container is able to compile basic shader input..." 
- COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_$>_upstream/compile --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" - COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "OK! NSC container is healthy." +# to avoid "too long input" errors we proxy build instructions to CMake script and write it to build directory +string(CONFIGURE [=[ +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Killing remaining NSC orphans") +execute_process(COMMAND "${DOCKER_EXE}" rm -f "${ORPHAN}") + +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Executing CTests") +execute_process(COMMAND "${CTEST_EXE}" -C "$" --stop-on-failure WORKING_DIRECTORY "@CMAKE_CURRENT_BINARY_DIR@" + COMMAND_ERROR_IS_FATAL ANY) + +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Generating NSC build info") +execute_process(COMMAND "${CMAKE_COMMAND}" + "-DNBL_EXECUTABLE_PATH=${NBL_NSC_PREINSTALL_TARGET_EXE_FILEPATH}" + "-DNBL_BUILD_INFO=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" + "-DNBL_OUTPUT_FILE=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" + "-DNBL_OUTPUT_EXE_OVERRIDE=$" + -P "${NBL_ROOT_PATH}/cmake/scripts/nbl/nablaBuildInfo.cmake" + COMMAND_ERROR_IS_FATAL ANY) + +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Generating NSC godbolt config") +execute_process(COMMAND "${CMAKE_COMMAND}" + "-DSPIRV_DIS_EXE=spirv-dis.exe" + "-DNSC_RELEASE_BUILD_INFO=$" + "-DNSC_RELWITHDEBINFO_BUILD_INFO=$" + "-DNSC_DEBUG_BUILD_INFO=$" + "-DOUTPUT_CONFIG_FILE=${OUTPUT_CONFIG_FILE}" + -P "${CMAKE_CURRENT_SOURCE_DIR}/ce-generate-config.cmake" + COMMAND_ERROR_IS_FATAL ANY) + +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Updating NSC package context") +execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory_if_different + "$" + "${NBL_DOCKER_CTX_DIR}/Nabla" + COMMAND_ERROR_IS_FATAL ANY) + +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Building NSC Godbolt image") +execute_process(COMMAND "${DOCKER_EXE}" build ${ISOLATION} + -f 
"${DOCKERFILE}" + -t ${NSC_IMAGE_NAME} + "${NBL_DOCKER_CTX_DIR}" + COMMAND_ERROR_IS_FATAL ANY) + +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Running new NSC orphan container") +execute_process(COMMAND "${DOCKER_EXE}" run -di -p 80:10240 ${ISOLATION} + --name "${ORPHAN}" ${NSC_IMAGE_NAME} + COMMAND_ERROR_IS_FATAL ANY) + +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Health‐check") +execute_process(COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" + --url "${NBL_CE_URL}" --interval 5 --ticks 12 + COMMAND_ERROR_IS_FATAL ANY) + +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Post‐Checking basic shader compile") +execute_process(COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" + --url "${NBL_CE_URL}" + --endpoint /api/compiler/nsc_$>_upstream/compile + --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" + COMMAND_ERROR_IS_FATAL ANY) + +execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "OK! NSC container is healthy.") +]=] INSTRUCTIONS) + +file(GENERATE OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/run-compiler-explorer-$.cmake" CONTENT "${INSTRUCTIONS}") - WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" - VERBATIM - USES_TERMINAL +add_custom_target(run-compiler-explorer ALL + COMMAND "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_BINARY_DIR}/run-compiler-explorer-$.cmake" + VERBATIM + COMMAND_EXPAND_LISTS ) add_dependencies(run-compiler-explorer nsc) From 350c6a3604999abb23d133c8affa3a456181dfdc Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 11:25:25 +0700 Subject: [PATCH 212/346] more util funcs in config, fix some calculations --- examples_tests | 2 +- .../hlsl/workgroup2/arithmetic_config.hlsl | 48 ++++++++--------- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 52 +++++++++---------- 3 files changed, 50 insertions(+), 52 deletions(-) diff --git a/examples_tests b/examples_tests index bb3a901b5d..2a85f4e091 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 
bb3a901b5de72b78246af20072f4489960287204 +Subproject commit 2a85f4e0911185a85df31f798b92e6902db3383e diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 512641abb8..8ecbe4b5dc 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -19,9 +19,9 @@ template struct virtual_wg_size_log2 { static_assert(WorkgroupSizeLog2>=SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); - // static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2+4, "WorkgroupSize cannot be larger than SubgroupSize*16"); + static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2*3+4, "WorkgroupSize cannot be larger than (SubgroupSize^3)*16"); NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v+SubgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v; // must have at least enough level 0 outputs to feed a single subgroup }; @@ -33,24 +33,6 @@ struct items_per_invocation NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value, ItemsPerInvocationProductLog2>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v; }; - -// explicit specializations for cases that don't fit -#define SPECIALIZE_VIRTUAL_WG_SIZE_CASE(WGLOG2, SGLOG2, LEVELS, VALUE) template<>\ -struct virtual_wg_size_log2\ -{\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = LEVELS;\ - NBL_CONSTEXPR_STATIC_INLINE uint16_t value = VALUE;\ -};\ - -SPECIALIZE_VIRTUAL_WG_SIZE_CASE(11,4,3,12); -SPECIALIZE_VIRTUAL_WG_SIZE_CASE(7,7,1,7); -SPECIALIZE_VIRTUAL_WG_SIZE_CASE(6,6,1,6); -SPECIALIZE_VIRTUAL_WG_SIZE_CASE(5,5,1,5); -SPECIALIZE_VIRTUAL_WG_SIZE_CASE(4,4,1,4); 
-SPECIALIZE_VIRTUAL_WG_SIZE_CASE(3,3,1,3); -SPECIALIZE_VIRTUAL_WG_SIZE_CASE(2,2,1,2); - -#undef SPECIALIZE_VIRTUAL_WG_SIZE_CASE } template @@ -71,16 +53,32 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2; static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); - NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementCount = conditional_value::value + SubgroupSize*ItemsPerInvocation_1>::value; + NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value::value + SubgroupSize*ItemsPerInvocation_1 + >::value; + + static bool electLast() + { + return glsl::gl_SubgroupInvocationID()==SubgroupSize-1; + } + + static uint32_t virtualSubgroupID(const uint32_t subgroupID, const uint32_t virtualIdx) + { + return virtualIdx * (WorkgroupSize >> SubgroupSizeLog2) + subgroupID; + } - static uint32_t virtualSubgroupID(const uint32_t id, const uint32_t offset) + static uint32_t sharedCoalescedIndexNextLevel(const uint32_t subgroupID, const uint32_t itemsPerInvocation) { - return offset * (WorkgroupSize >> SubgroupSizeLog2) + id; + return (subgroupID & (itemsPerInvocation-1)) * SubgroupSize + (subgroupID/itemsPerInvocation); } - static uint32_t sharedMemCoalescedIndex(const uint32_t id, const uint32_t itemsPerInvocation) + static uint32_t sharedCoalescedIndexByComponent(const uint32_t invocationIndex, const uint32_t component) { - return (id & (itemsPerInvocation-1)) * SubgroupSize + (id/itemsPerInvocation); + return component * SubgroupSize + invocationIndex; } }; diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index d44271a260..dd309e0e12 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -104,10 +104,10 @@ struct reduce vector_lv0_t scan_local; dataAccessor.template get(idx * Config::WorkgroupSize + 
virtualInvocationIndex, scan_local); scan_local = reduction0(scan_local); - if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + if (Config::electLast()) { const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -120,10 +120,10 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); + scratchAccessor.template get(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]); lv1_val = reduction1(lv1_val); - if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + if (Config::electLast()) scratchAccessor.template set(0, lv1_val[Config::ItemsPerInvocation_1-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -159,10 +159,10 @@ struct scan dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); value = inclusiveScan0(value); dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + if (Config::electLast()) { const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = 
Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -176,12 +176,12 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(i*Config::SubgroupSize+prevIndex,lv1_val[i]); + scratchAccessor.template get(Config::sharedCoalescedIndexByComponent(prevIndex, i),lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); + scratchAccessor.template set(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -193,7 +193,7 @@ struct scan dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); scalar_t left; scratchAccessor.template get(bankedIndex,left); if (Exclusive) @@ -242,10 +242,10 @@ struct reduce vector_lv0_t scan_local; dataAccessor.template 
get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); scan_local = reduction0(scan_local); - if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + if (Config::electLast()) { const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -258,11 +258,11 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); + scratchAccessor.template get(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]); lv1_val = reduction1(lv1_val); - if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + if (Config::electLast()) { - const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(invocationIndex, Config::ItemsPerInvocation_2); // (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(invocationIndex, Config::ItemsPerInvocation_2); // (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); scratchAccessor.template set(bankedIndex, 
lv1_val[Config::ItemsPerInvocation_1-1]); } } @@ -275,7 +275,7 @@ struct reduce vector_lv2_t lv2_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv2_val[i]); + scratchAccessor.template get(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv2_val[i]); lv2_val = reduction2(lv2_val); scratchAccessor.template set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); } @@ -314,10 +314,10 @@ struct scan dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); value = inclusiveScan0(value); dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + if (Config::electLast()) { const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -332,15 +332,15 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(i*Config::SubgroupSize+prevIndex,lv1_val[i]); + scratchAccessor.template get(Config::sharedCoalescedIndexByComponent(prevIndex, i),lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); - if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + scratchAccessor.template 
set(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]); + if (Config::electLast()) { - const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } @@ -354,12 +354,12 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(lv1_smem_size+i*Config::SubgroupSize+prevIndex,lv2_val[i]); + scratchAccessor.template get(lv1_smem_size+Config::sharedCoalescedIndexByComponent(prevIndex, i),lv2_val[i]); lv2_val[0] = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val[0], bool(invocationIndex)); lv2_val = inclusiveScan2(lv2_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template set(lv1_smem_size+i*Config::SubgroupSize+invocationIndex,lv2_val[i]); + scratchAccessor.template set(lv1_smem_size+Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv2_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -372,12 +372,12 @@ struct scan scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); scalar_t lv2_scan; - const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = 
Config::sharedCoalescedIndexNextLevel(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); scratchAccessor.template set(lv1_smem_size+bankedIndex, lv2_scan); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(i*Config::SubgroupSize+invocationIndex, binop(lv1_val[i],lv2_scan)); + scratchAccessor.template set(Config::sharedCoalescedIndexByComponent(invocationIndex, i), binop(lv1_val[i],lv2_scan)); } // combine with level 0 @@ -388,7 +388,7 @@ struct scan dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); scalar_t left; scratchAccessor.template get(bankedIndex,left); if (Exclusive) From 14e5d15b830376e91de7066e233bdf0108230863 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 12:24:17 +0700 Subject: [PATCH 213/346] added generic data/shared mem accessors --- .../builtin/hlsl/concepts/accessors/fft.hlsl | 44 ++------------ .../accessors/generic_shared_data.hlsl | 59 +++++++++++++++++++ 2 files changed, 64 insertions(+), 39 deletions(-) create mode 100644 include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl diff --git a/include/nbl/builtin/hlsl/concepts/accessors/fft.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/fft.hlsl index 262cb3c0c7..9088b0c7b4 100644 --- a/include/nbl/builtin/hlsl/concepts/accessors/fft.hlsl +++ b/include/nbl/builtin/hlsl/concepts/accessors/fft.hlsl @@ -1,7 +1,7 @@ #ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_FFT_INCLUDED_ #define 
_NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_FFT_INCLUDED_ -#include "nbl/builtin/hlsl/concepts.hlsl" +#include "nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl" #include "nbl/builtin/hlsl/fft/common.hlsl" namespace nbl @@ -17,49 +17,15 @@ namespace fft // * void set(uint32_t index, in uint32_t value); // * void workgroupExecutionAndMemoryBarrier(); -#define NBL_CONCEPT_NAME FFTSharedMemoryAccessor -#define NBL_CONCEPT_TPLT_PRM_KINDS (typename) -#define NBL_CONCEPT_TPLT_PRM_NAMES (T) -#define NBL_CONCEPT_PARAM_0 (accessor, T) -#define NBL_CONCEPT_PARAM_1 (index, uint32_t) -#define NBL_CONCEPT_PARAM_2 (val, uint32_t) -NBL_CONCEPT_BEGIN(3) -#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 -#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 -#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 -NBL_CONCEPT_END( - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) -); -#undef val -#undef index -#undef accessor -#include - +template +NBL_BOOL_CONCEPT FFTSharedMemoryAccessor = concepts::accessors::GenericSharedMemoryAccessor; // The Accessor (for a small FFT) MUST provide the following methods: // * void get(uint32_t index, NBL_REF_ARG(complex_t) value); // * void set(uint32_t index, in complex_t value); -#define NBL_CONCEPT_NAME FFTAccessor -#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename) -#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(Scalar) -#define NBL_CONCEPT_PARAM_0 (accessor, T) -#define NBL_CONCEPT_PARAM_1 (index, uint32_t) -#define NBL_CONCEPT_PARAM_2 (val, complex_t) -NBL_CONCEPT_BEGIN(3) -#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 -#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 -#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 -NBL_CONCEPT_END( - 
((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set >(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get >(index, val)), is_same_v, void)) -); -#undef val -#undef index -#undef accessor -#include +template +NBL_BOOL_CONCEPT FFTAccessor = concepts::accessors::GenericDataAccessor,I>; } } diff --git a/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl new file mode 100644 index 0000000000..4e6b974249 --- /dev/null +++ b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl @@ -0,0 +1,59 @@ +#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ +#define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ + +#include "nbl/builtin/hlsl/concepts.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace concepts +{ +namespace accessors +{ + +#define NBL_CONCEPT_NAME GenericSharedMemoryAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V)(I) +#define NBL_CONCEPT_PARAM_0 (accessor, T) +#define NBL_CONCEPT_PARAM_1 (index, I) +#define NBL_CONCEPT_PARAM_2 (val, V) +NBL_CONCEPT_BEGIN(3) +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) +); +#undef val +#undef index +#undef accessor +#include + +#define NBL_CONCEPT_NAME GenericDataAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V)(I) +#define NBL_CONCEPT_PARAM_0 (accessor, T) +#define NBL_CONCEPT_PARAM_1 (index, 
I) +#define NBL_CONCEPT_PARAM_2 (val, V) +NBL_CONCEPT_BEGIN(3) +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) +); +#undef val +#undef index +#undef accessor +#include + +} +} +} +} + +#endif From f07329e42145deff72b832faf4bf07b6ada39e5e Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 13:47:25 +0700 Subject: [PATCH 214/346] fix include guard --- .../builtin/hlsl/concepts/accessors/generic_shared_data.hlsl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl index 4e6b974249..db71228162 100644 --- a/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl +++ b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl @@ -1,5 +1,5 @@ -#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ -#define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ +#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_GENERIC_SHARED_DATA_INCLUDED_ +#define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_GENERIC_SHARED_DATA_INCLUDED_ #include "nbl/builtin/hlsl/concepts.hlsl" From 48a7d161aeb5b921cb5211465ec2d4cbcc177fe9 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 13:49:40 +0700 Subject: [PATCH 215/346] changes to arithmetic accessor concepts --- examples_tests | 2 +- .../accessors/workgroup_arithmetic.hlsl | 38 ++++++------------- .../builtin/hlsl/workgroup2/arithmetic.hlsl | 12 +++--- 3 files changed, 19 insertions(+), 33 deletions(-) diff --git a/examples_tests b/examples_tests index 2a85f4e091..99f6dfe5b4 160000 --- a/examples_tests +++ 
b/examples_tests @@ -1 +1 @@ -Subproject commit 2a85f4e0911185a85df31f798b92e6902db3383e +Subproject commit 99f6dfe5b4345cc8bbe7ff2ab2353993e395d3bd diff --git a/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl index de5e5a3c35..cbccbec034 100644 --- a/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl @@ -1,7 +1,7 @@ #ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ #define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ -#include "nbl/builtin/hlsl/concepts.hlsl" +#include "nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl" namespace nbl { @@ -10,46 +10,30 @@ namespace hlsl namespace workgroup2 { -#define NBL_CONCEPT_NAME ArithmeticSharedMemoryAccessor -#define NBL_CONCEPT_TPLT_PRM_KINDS (typename) -#define NBL_CONCEPT_TPLT_PRM_NAMES (T) -#define NBL_CONCEPT_PARAM_0 (accessor, T) -#define NBL_CONCEPT_PARAM_1 (index, uint32_t) -#define NBL_CONCEPT_PARAM_2 (val, uint32_t) -NBL_CONCEPT_BEGIN(3) -#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 -#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 -#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 -NBL_CONCEPT_END( - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) -); -#undef val -#undef index -#undef accessor -#include +template +NBL_BOOL_CONCEPT ArithmeticSharedMemoryAccessor = concepts::accessors::GenericSharedMemoryAccessor; -#define NBL_CONCEPT_NAME ArithmeticDataAccessor -#define NBL_CONCEPT_TPLT_PRM_KINDS (typename) -#define NBL_CONCEPT_TPLT_PRM_NAMES (T) +#define NBL_CONCEPT_NAME ArithmeticReadOnlyDataAccessor +#define 
NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V) #define NBL_CONCEPT_PARAM_0 (accessor, T) #define NBL_CONCEPT_PARAM_1 (index, uint32_t) -#define NBL_CONCEPT_PARAM_2 (val, uint32_t) +#define NBL_CONCEPT_PARAM_2 (val, V) NBL_CONCEPT_BEGIN(3) #define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 #define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 #define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 NBL_CONCEPT_END( - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) ); #undef val #undef index #undef accessor #include +template +NBL_BOOL_CONCEPT ArithmeticDataAccessor = concepts::accessors::GenericDataAccessor; + } } } diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl index e4a71bdffc..6702504fa8 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -6,8 +6,6 @@ #include "nbl/builtin/hlsl/functional.hlsl" -#include "nbl/builtin/hlsl/workgroup/ballot.hlsl" -#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" #include "nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl" #include "nbl/builtin/hlsl/workgroup2/shared_scan.hlsl" @@ -24,7 +22,7 @@ struct reduction { using scalar_t = typename BinOp::type_t; - template && ArithmeticSharedMemoryAccessor) + template && ArithmeticSharedMemoryAccessor) static scalar_t __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::reduce fn; @@ -35,7 +33,9 @@ struct reduction template struct inclusive_scan { - template && ArithmeticSharedMemoryAccessor) + using scalar_t = 
typename BinOp::type_t; + + template && ArithmeticSharedMemoryAccessor) static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::scan fn; @@ -46,7 +46,9 @@ struct inclusive_scan template struct exclusive_scan { - template && ArithmeticSharedMemoryAccessor) + using scalar_t = typename BinOp::type_t; + + template && ArithmeticSharedMemoryAccessor) static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::scan fn; From 20a54be14f624eb59e7030b2d14294f224e87750 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 15:23:28 +0700 Subject: [PATCH 216/346] concept macro for checking types --- include/nbl/builtin/hlsl/concepts.hlsl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/nbl/builtin/hlsl/concepts.hlsl b/include/nbl/builtin/hlsl/concepts.hlsl index 7fd725dc2b..4b82955bb7 100644 --- a/include/nbl/builtin/hlsl/concepts.hlsl +++ b/include/nbl/builtin/hlsl/concepts.hlsl @@ -33,6 +33,7 @@ namespace concepts #define NBL_CONCEPT_REQ_EXPR 1 // #define NBL_CONCEPT_REQ_EXPR_RET_TYPE 2 +#define NBL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT 3 //! Now diverge @@ -64,8 +65,9 @@ concept NBL_CONCEPT_NAME = requires BOOST_PP_EXPR_IF(LOCAL_PARAM_COUNT,(BOOST_PP #define NBL_IMPL_CONCEPT_REQ_TYPE(...) typename __VA_ARGS__; #define NBL_IMPL_CONCEPT_REQ_EXPR(...) __VA_ARGS__; #define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) {E}; C; +#define NBL_IMPL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT(C,...) 
C< __VA_ARGS__ >; // -#define NBL_IMPL_CONCEPT (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE) +#define NBL_IMPL_CONCEPT (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE,NBL_IMPL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT) // #define NBL_IMPL_CONCEPT_END_DEF(r,unused,i,e) NBL_EVAL(BOOST_PP_TUPLE_ELEM(BOOST_PP_SEQ_HEAD(e),NBL_IMPL_CONCEPT) BOOST_PP_SEQ_TAIL(e)) // @@ -95,8 +97,9 @@ concept NBL_CONCEPT_NAME = requires BOOST_PP_EXPR_IF(LOCAL_PARAM_COUNT,(BOOST_PP #define NBL_IMPL_CONCEPT_REQ_TYPE(...) ::nbl::hlsl::make_void_t #define NBL_IMPL_CONCEPT_REQ_EXPR(...) ::nbl::hlsl::make_void_t #define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) ::nbl::hlsl::enable_if_t > +#define NBL_IMPL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT(C,...) ::nbl::hlsl::enable_if_t > // -#define NBL_IMPL_CONCEPT_SFINAE (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE) +#define NBL_IMPL_CONCEPT_SFINAE (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE,NBL_IMPL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT) // #define NBL_IMPL_CONCEPT_END_DEF(r,unused,i,e) template \ struct BOOST_PP_CAT(__requirement,i) : ::nbl::hlsl::false_type {}; \ From d83ac5cbf9301b173c8199118f0d9937c80e5186 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 15:41:20 +0700 Subject: [PATCH 217/346] revert concept macro addition --- include/nbl/builtin/hlsl/concepts.hlsl | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/include/nbl/builtin/hlsl/concepts.hlsl b/include/nbl/builtin/hlsl/concepts.hlsl index 4b82955bb7..7fd725dc2b 100644 --- a/include/nbl/builtin/hlsl/concepts.hlsl +++ b/include/nbl/builtin/hlsl/concepts.hlsl @@ -33,7 +33,6 @@ namespace concepts #define NBL_CONCEPT_REQ_EXPR 1 // #define NBL_CONCEPT_REQ_EXPR_RET_TYPE 2 -#define NBL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT 3 //! 
Now diverge @@ -65,9 +64,8 @@ concept NBL_CONCEPT_NAME = requires BOOST_PP_EXPR_IF(LOCAL_PARAM_COUNT,(BOOST_PP #define NBL_IMPL_CONCEPT_REQ_TYPE(...) typename __VA_ARGS__; #define NBL_IMPL_CONCEPT_REQ_EXPR(...) __VA_ARGS__; #define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) {E}; C; -#define NBL_IMPL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT(C,...) C< __VA_ARGS__ >; // -#define NBL_IMPL_CONCEPT (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE,NBL_IMPL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT) +#define NBL_IMPL_CONCEPT (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE) // #define NBL_IMPL_CONCEPT_END_DEF(r,unused,i,e) NBL_EVAL(BOOST_PP_TUPLE_ELEM(BOOST_PP_SEQ_HEAD(e),NBL_IMPL_CONCEPT) BOOST_PP_SEQ_TAIL(e)) // @@ -97,9 +95,8 @@ concept NBL_CONCEPT_NAME = requires BOOST_PP_EXPR_IF(LOCAL_PARAM_COUNT,(BOOST_PP #define NBL_IMPL_CONCEPT_REQ_TYPE(...) ::nbl::hlsl::make_void_t #define NBL_IMPL_CONCEPT_REQ_EXPR(...) ::nbl::hlsl::make_void_t #define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) ::nbl::hlsl::enable_if_t > -#define NBL_IMPL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT(C,...) 
::nbl::hlsl::enable_if_t > // -#define NBL_IMPL_CONCEPT_SFINAE (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE,NBL_IMPL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT) +#define NBL_IMPL_CONCEPT_SFINAE (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE) // #define NBL_IMPL_CONCEPT_END_DEF(r,unused,i,e) template \ struct BOOST_PP_CAT(__requirement,i) : ::nbl::hlsl::false_type {}; \ From 00787bf305da99a9a13580dbe39faf95ddf05d72 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 15:42:38 +0700 Subject: [PATCH 218/346] added generic read/write accessors --- .../accessors/generic_shared_data.hlsl | 46 +++++++++++++------ 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl index db71228162..cc22595444 100644 --- a/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl +++ b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl @@ -16,15 +16,15 @@ namespace accessors #define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(typename) #define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V)(I) #define NBL_CONCEPT_PARAM_0 (accessor, T) -#define NBL_CONCEPT_PARAM_1 (index, I) -#define NBL_CONCEPT_PARAM_2 (val, V) +#define NBL_CONCEPT_PARAM_1 (val, V) +#define NBL_CONCEPT_PARAM_2 (index, I) NBL_CONCEPT_BEGIN(3) #define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 -#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 -#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 NBL_CONCEPT_END( - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), 
is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) ); #undef val @@ -32,25 +32,45 @@ NBL_CONCEPT_END( #undef accessor #include -#define NBL_CONCEPT_NAME GenericDataAccessor +#define NBL_CONCEPT_NAME GenericReadAccessor #define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(typename) #define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V)(I) #define NBL_CONCEPT_PARAM_0 (accessor, T) -#define NBL_CONCEPT_PARAM_1 (index, I) -#define NBL_CONCEPT_PARAM_2 (val, V) +#define NBL_CONCEPT_PARAM_1 (val, V) +#define NBL_CONCEPT_PARAM_2 (index, I) NBL_CONCEPT_BEGIN(3) #define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 -#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 -#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 NBL_CONCEPT_END( - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) ); #undef val #undef index #undef accessor #include +#define NBL_CONCEPT_NAME GenericWriteAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V)(I) +#define NBL_CONCEPT_PARAM_0 (accessor, T) +#define NBL_CONCEPT_PARAM_1 (val, V) +#define NBL_CONCEPT_PARAM_2 (index, I) +NBL_CONCEPT_BEGIN(3) +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) +); +#undef val +#undef index +#undef accessor +#include + +template +NBL_BOOL_CONCEPT 
GenericDataAccessor = GenericWriteAccessor && GenericWriteAccessor; + } } } From c0dfc1eeddac4378dd8fc836ddb71efe7e9ee5b3 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 15:43:37 +0700 Subject: [PATCH 219/346] more refactor for accessor concept changes --- .../accessors/workgroup_arithmetic.hlsl | 21 +--- .../hlsl/workgroup2/arithmetic_config.hlsl | 8 +- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 105 +++++++++--------- 3 files changed, 59 insertions(+), 75 deletions(-) diff --git a/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl index cbccbec034..267342634f 100644 --- a/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl @@ -10,26 +10,11 @@ namespace hlsl namespace workgroup2 { -template +template NBL_BOOL_CONCEPT ArithmeticSharedMemoryAccessor = concepts::accessors::GenericSharedMemoryAccessor; -#define NBL_CONCEPT_NAME ArithmeticReadOnlyDataAccessor -#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename) -#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V) -#define NBL_CONCEPT_PARAM_0 (accessor, T) -#define NBL_CONCEPT_PARAM_1 (index, uint32_t) -#define NBL_CONCEPT_PARAM_2 (val, V) -NBL_CONCEPT_BEGIN(3) -#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 -#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 -#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 -NBL_CONCEPT_END( - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) -); -#undef val -#undef index -#undef accessor -#include +template +NBL_BOOL_CONCEPT ArithmeticReadOnlyDataAccessor = concepts::accessors::GenericReadAccessor; template NBL_BOOL_CONCEPT ArithmeticDataAccessor = concepts::accessors::GenericDataAccessor; diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 
8ecbe4b5dc..7611036a49 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -66,17 +66,17 @@ struct ArithmeticConfiguration return glsl::gl_SubgroupInvocationID()==SubgroupSize-1; } - static uint32_t virtualSubgroupID(const uint32_t subgroupID, const uint32_t virtualIdx) + static uint32_t virtualSubgroupID(const uint32_t subgroupID, const uint32_t workgroupInVirtualIndex) { - return virtualIdx * (WorkgroupSize >> SubgroupSizeLog2) + subgroupID; + return workgroupInVirtualIndex * (WorkgroupSize >> SubgroupSizeLog2) + subgroupID; } - static uint32_t sharedCoalescedIndexNextLevel(const uint32_t subgroupID, const uint32_t itemsPerInvocation) + static uint32_t sharedStoreIndex(const uint32_t subgroupID, const uint32_t itemsPerInvocation) { return (subgroupID & (itemsPerInvocation-1)) * SubgroupSize + (subgroupID/itemsPerInvocation); } - static uint32_t sharedCoalescedIndexByComponent(const uint32_t invocationIndex, const uint32_t component) + static uint32_t sharedLoadIndex(const uint32_t invocationIndex, const uint32_t component) { return component * SubgroupSize + invocationIndex; } diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index dd309e0e12..96b2ffdd97 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -33,7 +33,7 @@ struct reduce { using scalar_t = typename BinOp::type_t; using vector_t = vector; // data accessor needs to be this type - // doesn't use scratch smem, need as param? 
+ // doesn't use scratch smem, should be NOOP accessor template scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) @@ -43,10 +43,8 @@ struct reduce subgroup2::reduction reduction; vector_t value; - dataAccessor.template get(workgroup::SubgroupContiguousIndex(), value); - value = reduction(value); - return value[0]; - // dataAccessor.template set(workgroup::SubgroupContiguousIndex(), value); + dataAccessor.template get(glsl::gl_SubgroupInvocationID(), value); + return reduction(value); } }; @@ -55,7 +53,7 @@ struct scan { using scalar_t = typename BinOp::type_t; using vector_t = vector; // data accessor needs to be this type - // doesn't use scratch smem, need as param? + // doesn't use scratch smem, should be NOOP accessor template void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) @@ -64,7 +62,7 @@ struct scan using params_t = subgroup2::ArithmeticParams; vector_t value; - dataAccessor.template get(workgroup::SubgroupContiguousIndex(), value); + dataAccessor.template get(glsl::gl_SubgroupInvocationID(), value); if (Exclusive) { subgroup2::exclusive_scan excl_scan; @@ -75,7 +73,7 @@ struct scan subgroup2::inclusive_scan incl_scan; value = incl_scan(value); } - dataAccessor.template set(workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? 
+ dataAccessor.template set(glsl::gl_SubgroupInvocationID(), value); } }; @@ -102,13 +100,13 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t scan_local; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); scan_local = reduction0(scan_local); if (Config::electLast()) { const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -120,16 +118,16 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]); + scratchAccessor.template get(Config::sharedLoadIndex(invocationIndex, i),lv1_val[i]); lv1_val = reduction1(lv1_val); if (Config::electLast()) - scratchAccessor.template set(0, lv1_val[Config::ItemsPerInvocation_1-1]); + scratchAccessor.template set(0, 
lv1_val[Config::ItemsPerInvocation_1-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); scalar_t reduce_val; - scratchAccessor.template get(0,reduce_val); + scratchAccessor.template get(0,reduce_val); return reduce_val; } }; @@ -156,14 +154,14 @@ struct scan for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); value = inclusiveScan0(value); - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); if (Config::electLast()) { const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -176,12 +174,12 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template 
get(Config::sharedCoalescedIndexByComponent(prevIndex, i),lv1_val[i]); + scratchAccessor.template get(Config::sharedLoadIndex(prevIndex, i),lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]); + scratchAccessor.template set(Config::sharedLoadIndex(invocationIndex, i),lv1_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -190,12 +188,12 @@ struct scan for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); scalar_t left; - scratchAccessor.template get(bankedIndex,left); + scratchAccessor.template get(bankedIndex,left); if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); @@ -210,7 +208,7 @@ struct scan for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) value[i] = binop(left, value[i]); } - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); } } }; @@ -240,30 +238,31 @@ struct reduce for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < 
Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t scan_local; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); scan_local = reduction0(scan_local); if (Config::electLast()) { const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); - scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 1 scan + const uint32_t lv1_smem_size = Config::SubgroupsSize*Config::ItemsPerInvocation_1; subgroup2::reduction reduction1; - if (glsl::gl_SubgroupID() < Config::SubgroupSizeLog2*Config::ItemsPerInvocation_1) + if (glsl::gl_SubgroupID() < Config::SubgroupSize*Config::ItemsPerInvocation_2) { vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]); + scratchAccessor.template get(Config::sharedLoadIndex(invocationIndex, i),lv1_val[i]); lv1_val = reduction1(lv1_val); if (Config::electLast()) { - const uint32_t bankedIndex = 
Config::sharedCoalescedIndexNextLevel(invocationIndex, Config::ItemsPerInvocation_2); // (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); - scratchAccessor.template set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + const uint32_t bankedIndex = Config::sharedStoreIndex(invocationIndex, Config::ItemsPerInvocation_2); // (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); + scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -275,14 +274,14 @@ struct reduce vector_lv2_t lv2_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv2_val[i]); + scratchAccessor.template get(lv1_smem_size+Config::sharedLoadIndex(invocationIndex, i),lv2_val[i]); lv2_val = reduction2(lv2_val); - scratchAccessor.template set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); + scratchAccessor.template set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); scalar_t reduce_val; - scratchAccessor.template get(0,reduce_val); + scratchAccessor.template get(0,reduce_val); return reduce_val; } }; @@ -311,14 +310,14 @@ struct scan for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); value = inclusiveScan0(value); - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template set(idx * 
Config::WorkgroupSize + virtualInvocationIndex, value); if (Config::electLast()) { const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); - scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); + scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -332,16 +331,16 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::sharedCoalescedIndexByComponent(prevIndex, i),lv1_val[i]); + scratchAccessor.template get(Config::sharedLoadIndex(prevIndex, i),lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]); + scratchAccessor.template set(Config::sharedLoadIndex(invocationIndex, i),lv1_val[i]); if (Config::electLast()) { - const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); - scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + const uint32_t bankedIndex = Config::sharedStoreIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & 
(Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -354,12 +353,12 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(lv1_smem_size+Config::sharedCoalescedIndexByComponent(prevIndex, i),lv2_val[i]); - lv2_val[0] = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val[0], bool(invocationIndex)); + scratchAccessor.template get(lv1_smem_size+Config::sharedLoadIndex(prevIndex, i),lv2_val[i]); + lv2_val[0] = hlsl::mix(BinOp::identity, lv2_val[0], bool(invocationIndex)); lv2_val = inclusiveScan2(lv2_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template set(lv1_smem_size+Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv2_val[i]); + scratchAccessor.template set(lv1_smem_size+Config::sharedLoadIndex(invocationIndex, i),lv2_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -369,15 +368,15 @@ struct scan vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); + scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); scalar_t lv2_scan; - const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); - scratchAccessor.template set(lv1_smem_size+bankedIndex, lv2_scan); + const uint32_t bankedIndex = Config::sharedStoreIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * 
Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + scratchAccessor.template set(lv1_smem_size+bankedIndex, lv2_scan); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(Config::sharedCoalescedIndexByComponent(invocationIndex, i), binop(lv1_val[i],lv2_scan)); + scratchAccessor.template set(Config::sharedLoadIndex(invocationIndex, i), binop(lv1_val[i],lv2_scan)); } // combine with level 0 @@ -385,12 +384,12 @@ struct scan for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); scalar_t left; - scratchAccessor.template get(bankedIndex,left); + scratchAccessor.template get(bankedIndex,left); if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); @@ -405,7 +404,7 @@ struct scan for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) value[i] = binop(left, value[i]); } - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); } } }; From 55840a3063fb64ef79f84ffc51b6392fbed1530e Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 16:13:50 +0700 Subject: [PATCH 220/346] don't pass scalar_t as index type --- examples_tests | 2 +- 
include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples_tests b/examples_tests index 99f6dfe5b4..3d898943fb 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 99f6dfe5b4345cc8bbe7ff2ab2353993e395d3bd +Subproject commit 3d898943fb9bd4690aa3b92b7a80f5a61198f0de diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl index 6702504fa8..643f8d123e 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -22,7 +22,7 @@ struct reduction { using scalar_t = typename BinOp::type_t; - template && ArithmeticSharedMemoryAccessor) + template && ArithmeticSharedMemoryAccessor) static scalar_t __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::reduce fn; @@ -35,7 +35,7 @@ struct inclusive_scan { using scalar_t = typename BinOp::type_t; - template && ArithmeticSharedMemoryAccessor) + template && ArithmeticSharedMemoryAccessor) static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::scan fn; @@ -48,7 +48,7 @@ struct exclusive_scan { using scalar_t = typename BinOp::type_t; - template && ArithmeticSharedMemoryAccessor) + template && ArithmeticSharedMemoryAccessor) static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { impl::scan fn; From d758ff7474aecd42c1ec11769482fed9e70b0d9e Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 16:31:39 +0700 Subject: [PATCH 221/346] refactor accessor to match accessor template --- examples_tests | 2 +- include/nbl/builtin/hlsl/memory_accessor.hlsl | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples_tests b/examples_tests index 3d898943fb..3d63ed7328 160000 --- a/examples_tests +++ b/examples_tests 
@@ -1 +1 @@ -Subproject commit 3d898943fb9bd4690aa3b92b7a80f5a61198f0de +Subproject commit 3d63ed732838c3073dfb7993d3eb1305fb5882be diff --git a/include/nbl/builtin/hlsl/memory_accessor.hlsl b/include/nbl/builtin/hlsl/memory_accessor.hlsl index 99ec0736a4..2194b1e917 100644 --- a/include/nbl/builtin/hlsl/memory_accessor.hlsl +++ b/include/nbl/builtin/hlsl/memory_accessor.hlsl @@ -112,8 +112,8 @@ struct StructureOfArrays : impl::StructureOfArraysBase - enable_if_t get(const index_t ix, NBL_REF_ARG(T) value) + template + enable_if_t get(const I ix, NBL_REF_ARG(T) value) { NBL_CONSTEXPR uint64_t SubElementCount = sizeof(T)/sizeof(access_t); // `vector` for now, we'll use `array` later when `bit_cast` gets fixed @@ -123,8 +123,8 @@ struct StructureOfArrays : impl::StructureOfArraysBase >(aux); } - template - enable_if_t set(const index_t ix, NBL_CONST_REF_ARG(T) value) + template + enable_if_t set(const I ix, NBL_CONST_REF_ARG(T) value) { NBL_CONSTEXPR uint64_t SubElementCount = sizeof(T)/sizeof(access_t); // `vector` for now, we'll use `array` later when `bit_cast` gets fixed @@ -209,11 +209,11 @@ struct Offset : impl::OffsetBase BaseAccessor accessor; - template - void set(index_t idx, T value) {accessor.set(idx+base_t::offset,value); } + template + void set(I idx, T value) {accessor.set(idx+base_t::offset,value); } - template - void get(index_t idx, NBL_REF_ARG(T) value) {accessor.get(idx+base_t::offset,value);} + template + void get(I idx, NBL_REF_ARG(T) value) {accessor.get(idx+base_t::offset,value);} template enable_if_t< From b062ede97571b771c36f2a674045367baee901f7 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 17:18:41 +0700 Subject: [PATCH 222/346] simplified indexing functions --- .../hlsl/workgroup2/arithmetic_config.hlsl | 15 +++++++-- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 33 ++++++++----------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl 
b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 7611036a49..e02c74e80b 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -71,9 +71,20 @@ struct ArithmeticConfiguration return workgroupInVirtualIndex * (WorkgroupSize >> SubgroupSizeLog2) + subgroupID; } - static uint32_t sharedStoreIndex(const uint32_t subgroupID, const uint32_t itemsPerInvocation) + template + static uint32_t sharedStoreIndex(const uint32_t subgroupID) { - return (subgroupID & (itemsPerInvocation-1)) * SubgroupSize + (subgroupID/itemsPerInvocation); + if (level<2) + return (subgroupID & (ItemsPerInvocation_1-1)) * SubgroupSize + (subgroupID/ItemsPerInvocation_1); + else + return (subgroupID & (ItemsPerInvocation_2-1)) * SubgroupSize + (subgroupID/ItemsPerInvocation_2); + } + + template + static uint32_t sharedStoreIndexFromVirtualIndex(const uint32_t subgroupID, const uint32_t workgroupInVirtualIndex) + { + const uint32_t virtualID = virtualSubgroupID(subgroupID, workgroupInVirtualIndex); + return sharedStoreIndex(virtualID); } static uint32_t sharedLoadIndex(const uint32_t invocationIndex, const uint32_t component) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 96b2ffdd97..418c3219f4 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -104,8 +104,7 @@ struct reduce scan_local = reduction0(scan_local); if (Config::electLast()) { - const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::template 
sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -159,8 +158,7 @@ struct scan dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); if (Config::electLast()) { - const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -174,7 +172,7 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::sharedLoadIndex(prevIndex, i),lv1_val[i]); + scratchAccessor.template get(Config::sharedLoadIndex(invocationIndex, i)-1,lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] @@ -190,8 +188,7 @@ struct scan vector_lv0_t value; dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); scalar_t left; scratchAccessor.template get(bankedIndex,left); if (Exclusive) @@ -242,8 +239,7 @@ struct reduce scan_local = 
reduction0(scan_local); if (Config::electLast()) { - const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -261,7 +257,7 @@ struct reduce lv1_val = reduction1(lv1_val); if (Config::electLast()) { - const uint32_t bankedIndex = Config::sharedStoreIndex(invocationIndex, Config::ItemsPerInvocation_2); // (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(invocationIndex); scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } @@ -276,7 +272,8 @@ struct reduce for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) scratchAccessor.template get(lv1_smem_size+Config::sharedLoadIndex(invocationIndex, i),lv2_val[i]); lv2_val = reduction2(lv2_val); - scratchAccessor.template set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); + if (Config::electLast()) + scratchAccessor.template set(0, lv2_val[Config::ItemsPerInvocation_2-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -315,8 +312,7 @@ struct scan dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); if (Config::electLast()) { - const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, 
Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } @@ -331,7 +327,7 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::sharedLoadIndex(prevIndex, i),lv1_val[i]); + scratchAccessor.template get(Config::sharedLoadIndex(invocationIndex, i)-1,lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] @@ -339,7 +335,7 @@ struct scan scratchAccessor.template set(Config::sharedLoadIndex(invocationIndex, i),lv1_val[i]); if (Config::electLast()) { - const uint32_t bankedIndex = Config::sharedStoreIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(glsl::gl_SubgroupID()); scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } @@ -353,7 +349,7 @@ struct scan const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(lv1_smem_size+Config::sharedLoadIndex(prevIndex, i),lv2_val[i]); + scratchAccessor.template get(lv1_smem_size+Config::sharedLoadIndex(invocationIndex, i)-1,lv2_val[i]); lv2_val[0] = hlsl::mix(BinOp::identity, lv2_val[0], bool(invocationIndex)); lv2_val = inclusiveScan2(lv2_val); [unroll] @@ -371,7 +367,7 @@ struct scan scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); scalar_t lv2_scan; - const uint32_t bankedIndex = 
Config::sharedStoreIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(glsl::gl_SubgroupID()); scratchAccessor.template set(lv1_smem_size+bankedIndex, lv2_scan); [unroll] @@ -386,8 +382,7 @@ struct scan vector_lv0_t value; dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx); - const uint32_t bankedIndex = Config::sharedStoreIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); + const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); scalar_t left; scratchAccessor.template get(bankedIndex,left); if (Exclusive) From add176bb73b9e7b5a643ac15962b7c74ff754e92 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 27 May 2025 12:54:24 +0200 Subject: [PATCH 223/346] update NSC image creation to not violate Microsoft EULA, update .github/workflows/build-nabla.yml --- .github/workflows/build-nabla.yml | 5 +- tools/nsc/CMakeLists.txt | 76 +++++++++++++++++++------------ 2 files changed, 51 insertions(+), 30 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index a194734472..de1194d34b 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -20,7 +20,10 @@ jobs: strategy: fail-fast: false matrix: - vendor: [msvc, clangcl] + # vendor: [msvc, clangcl] + # TODO: Yas please fix ClangCL, we have a few new compile errors + # if we build MSVC then build "run-compiler-explorer" target, for ClangCL build just "nsc" + vendor: [msvc] config: [Release, Debug, RelWithDebInfo] tag: ['17.13.6'] diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index b0fec5b7f2..0fad4987be 100644 --- 
a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -73,12 +73,11 @@ find_program(SPIRV_DIS_EXE NAMES spirv-dis HINTS "${VULKAN_SDK}/Bin" REQUIRED) cmake_path(GET SPIRV_DIS_EXE PARENT_PATH SPIRV_DIS_DIR) cmake_path(NATIVE_PATH SPIRV_DIS_DIR NORMALIZE SPIRV_DIS_DIR) -include(InstallRequiredSystemLibraries) - -if(NOT MSVC_REDIST_DIR) - if(MSVC_REDIST_BASE) # fallback to our CI toolset - set(MSVC_REDIST_DIR "${MSVC_REDIST_BASE}") - else() +if(MSVC_REDIST_BASE) # fallback to our toolset + set(MSVC_REDIST_DIR "${MSVC_REDIST_BASE}") +else() + include(InstallRequiredSystemLibraries) + if(NOT MSVC_REDIST_DIR) message(FATAL_ERROR "Could not find MSVC_REDIST_DIR, define yourself!") endif() endif() @@ -111,7 +110,6 @@ cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_SOURCE NORMALIZE NBL_DOCKER_CT_N cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_TARGET NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_TARGET) cmake_path(NATIVE_PATH NBL_NSC_PREINSTALL_DIRECTORY NORMALIZE NBL_NSC_PREINSTALL_DIRECTORY) -set(CORE_IMAGE mcr.microsoft.com/windows/servercore:ltsc2022) string(CONFIGURE [=[ # syntax=docker/dockerfile:1 # escape=` @@ -119,8 +117,6 @@ string(CONFIGURE [=[ # ---------------- COMPRESS STEP ---------------- FROM @BASE_IMAGE@ as compress -COPY --link --from=@CORE_IMAGE@ C:/Windows/System32/icu.dll C:/pack/Windows/System32/ -COPY --link --from=@CORE_IMAGE@ C:/Windows/Globalization/ICU/ C:/pack/Windows/Globalization/ICU/ COPY --link Runtimes/ C:/pack/Windows/System32/ COPY --link Nabla/ C:/pack/runtimes/Nabla/ @@ -140,11 +136,14 @@ COPY --link --from=compress ["C:/pack/nabla-artifacts.tar.zst", "C:/pack/"] COPY hlsl.local.properties.cmake C:/Compiler-Explorer/etc/config/hlsl.local.properties ENV NBL_INSTALL_DIRECTORY=@NBL_DOCKER_CT_NSC_VOLUME_TARGET@ ` -NBL_EXPLICIT_MODULE_LOAD_LOG=ON ` -ICU_DATA=C:\Windows\Globalization\ICU +NBL_EXPLICIT_MODULE_LOAD_LOG=ON WORKDIR C:/Compiler-Explorer -ENTRYPOINT ["C:\\unpack.bat", "&&", "node", "--no-warnings", "--no-deprecation", "--import=tsx", 
"./app.js", "--language", "hlsl"] +ENTRYPOINT [ ` + "C:\\unpack.bat", "&&", ` + "copy", "C:\\mount\\Windows\\System32\\icu.dll", "C:\\Windows\\System32\\icu.dll", "&&", ` + "node", "--no-warnings", "--no-deprecation", "--import=tsx", "./app.js", "--language", "hlsl" ` +] ]=] INSTRUCTIONS @ONLY) set(DOCKERFILE "${NBL_DOCKER_CTX_DIR}/Dockerfile") @@ -197,16 +196,28 @@ execute_process(COMMAND cmd /C ver OUTPUT_VARIABLE PIPE OUTPUT_STRIP_TRAILING_WH string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+" HOST_KERNEL "${PIPE}") PROMOTE_PROCESS_ISOLATION(${HOST_KERNEL} ${BASE_IMAGE} USE_PROCESS_ISOLATION) -if(USE_PROCESS_ISOLATION) - set(ISOLATION "--isolation process") -else() +if(NOT USE_PROCESS_ISOLATION) # NOTE: we would need to use GET_RUNTIME_DEPENDENCIES which uses objdump # https://cmake.org/cmake/help/latest/command/file.html#get-runtime-dependencies # to collect *all* missing deps and copy (FROM at least server core) to destination nano # image, it will fail currently if we fully isolate it with VM due to lack of certain DLLs + # BUT it means violating EULA, hence we are not going to support it, also (**) message(FATAL_ERROR "HyperV is NOT supported! 
Update your OS!") endif() +set(CORE_IMAGE mcr.microsoft.com/windows/servercore:ltsc2022) +set(ICU_DIR C:\\Windows\\Globalization\\ICU) +set(ICU_DLL C:\\Windows\\System32\\icu.dll) +if(NOT EXISTS ${ICU_DIR} OR NOT EXISTS ${ICU_DLL}) + # fallback for CI purposes, NOTE: we do NOT distribute those in final image as we have host runner requirements (**) + message(STATUS "\"${ICU_DIR}\" or \"${ICU_DLL}\ not found, fallback: copying them to the runner from \"${CORE_IMAGE}\"") + execute_process(COMMAND "${DOCKER_EXE}" rm -f nano-orphan RESULT_VARIABLE res) + execute_process(COMMAND "${DOCKER_EXE}" run -di --isolation process --name nano-orphan --entrypoint cmd ${CORE_IMAGE} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND "${DOCKER_EXE}" cp nano-orphan:${ICU_DIR} ${ICU_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND "${DOCKER_EXE}" cp nano-orphan:${ICU_DLL} ${ICU_DLL} COMMAND_ERROR_IS_FATAL ANY) + message(STATUS "Fallback completed, runner patched!") +endif() + set(ORPHAN nsc-orphan) set(NBL_CE_URL http://${ORPHAN}:10240) set(NBL_CE_HEALTHY_CHECK_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/ce_healthy_check.py") @@ -215,14 +226,14 @@ set(NBL_NSC_BASIC_HLSL_JPAYLOAD "${CMAKE_CURRENT_SOURCE_DIR}/docker/godbolt/hlsl # to avoid "too long input" errors we proxy build instructions to CMake script and write it to build directory string(CONFIGURE [=[ -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Killing remaining NSC orphans") -execute_process(COMMAND "${DOCKER_EXE}" rm -f "${ORPHAN}") +message(STATUS "Killing remaining NSC orphans") +execute_process(COMMAND "${DOCKER_EXE}" rm -f "${ORPHAN}" RESULT_VARIABLE res) -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Executing CTests") +message(STATUS "Executing CTests") execute_process(COMMAND "${CTEST_EXE}" -C "$" --stop-on-failure WORKING_DIRECTORY "@CMAKE_CURRENT_BINARY_DIR@" COMMAND_ERROR_IS_FATAL ANY) -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Generating NSC build info") +message(STATUS 
"Generating NSC build info") execute_process(COMMAND "${CMAKE_COMMAND}" "-DNBL_EXECUTABLE_PATH=${NBL_NSC_PREINSTALL_TARGET_EXE_FILEPATH}" "-DNBL_BUILD_INFO=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" @@ -231,7 +242,7 @@ execute_process(COMMAND "${CMAKE_COMMAND}" -P "${NBL_ROOT_PATH}/cmake/scripts/nbl/nablaBuildInfo.cmake" COMMAND_ERROR_IS_FATAL ANY) -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Generating NSC godbolt config") +message(STATUS "Generating NSC godbolt config") execute_process(COMMAND "${CMAKE_COMMAND}" "-DSPIRV_DIS_EXE=spirv-dis.exe" "-DNSC_RELEASE_BUILD_INFO=$" @@ -241,37 +252,44 @@ execute_process(COMMAND "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_SOURCE_DIR}/ce-generate-config.cmake" COMMAND_ERROR_IS_FATAL ANY) -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Updating NSC package context") +message(STATUS "Updating NSC package context") execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory_if_different "$" "${NBL_DOCKER_CTX_DIR}/Nabla" COMMAND_ERROR_IS_FATAL ANY) -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Building NSC Godbolt image") -execute_process(COMMAND "${DOCKER_EXE}" build ${ISOLATION} +message(STATUS "Building NSC Godbolt image") +execute_process(COMMAND "${DOCKER_EXE}" build --isolation process -f "${DOCKERFILE}" -t ${NSC_IMAGE_NAME} "${NBL_DOCKER_CTX_DIR}" COMMAND_ERROR_IS_FATAL ANY) -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Running new NSC orphan container") -execute_process(COMMAND "${DOCKER_EXE}" run -di -p 80:10240 ${ISOLATION} - --name "${ORPHAN}" ${NSC_IMAGE_NAME} +message(STATUS "Running new NSC orphan container") +execute_process(COMMAND "${DOCKER_EXE}" run -di -p 80:10240 --isolation process + --name "${ORPHAN}" + -v $ + -v $ + ${NSC_IMAGE_NAME} COMMAND_ERROR_IS_FATAL ANY) -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Health‐check") +message(STATUS "Healthy check") execute_process(COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --interval 5 --ticks 12 
COMMAND_ERROR_IS_FATAL ANY) -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "Post‐Checking basic shader compile") +message(STATUS "Post Basic NSC shader compile check") execute_process(COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_$>_upstream/compile --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" COMMAND_ERROR_IS_FATAL ANY) -execute_process(COMMAND "${CMAKE_COMMAND}" -E echo "OK! NSC container is healthy.") +message(STATUS "Printing NSC container logs") +execute_process(COMMAND "${DOCKER_EXE}" logs "${ORPHAN}" COMMAND_ERROR_IS_FATAL ANY) + +message(STATUS "OK! NSC container is healthy.") +message(STATUS "Type \"localhost\" in your browser to use NSC with Godbolt!") ]=] INSTRUCTIONS) file(GENERATE OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/run-compiler-explorer-$.cmake" CONTENT "${INSTRUCTIONS}") From c6d23bd2adbf9e1d9dfef08213cb16f44581e364 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 27 May 2025 13:30:32 +0200 Subject: [PATCH 224/346] mount named pipeline and use as docker host, update .github/workflows/build-nabla.yml --- .github/workflows/build-nabla.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index de1194d34b..94263a89e8 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -68,6 +68,7 @@ jobs: --env-file .\docker\ninja.env ` --name orphan ` -v "${{ github.workspace }}:${{ env.mount }}" ` + -v "${pipeHost}:\\.\pipe\dockerd" -e "DOCKER_HOST=npipe:////./pipe/dockerd" ` -w "${{ env.mount }}" ` "${{ env.image }}:${{ matrix.tag }}" ` ${{ env.cmd }} From 68095dac5c6095f6209e9a915553ac6a0dc424e5 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 27 May 2025 19:42:41 +0700 Subject: [PATCH 225/346] Fix error in ILogicalDevice.cpp due to removed getShaders method --- src/nbl/video/ILogicalDevice.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 62e364a71a..d9e1479d2e 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -862,7 +862,7 @@ bool ILogicalDevice::createGraphicsPipelines( core::vector newParams(params.begin(), params.end()); const auto shaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) { - return sum + param.getShaders().size(); + return sum + param.getShaderCount(); }); core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling debloatedShaders.reserve(shaderCount); From 98e17598f15b50e8b82f1e2eeb02e88dec1d4a2f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 27 May 2025 19:43:04 +0700 Subject: [PATCH 226/346] Fix all errors in CVulkanLogicalDevice --- src/nbl/video/CVulkanLogicalDevice.cpp | 140 ++++++++++++++----------- src/nbl/video/CVulkanLogicalDevice.h | 6 +- 2 files changed, 82 insertions(+), 64 deletions(-) diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 792ab719eb..6050b7a7a0 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -1035,7 +1035,9 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createFramebuffer_ // TODO: Change this to pass SPIR-V directly! 
VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( - const asset::IPipelineBase::SShaderSpecInfo& specInfo, + const video::IGPUPipelineBase::SShaderSpecInfo& specInfo, + hlsl::ShaderStage stage, + bool requireFullSubgroups, VkShaderModuleCreateInfo* &outShaderModule, std::string* &outEntryPoints, VkPipelineShaderStageRequiredSubgroupSizeCreateInfo* &outRequiredSubgroupSize, @@ -1054,8 +1056,6 @@ VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( // TODO: VkShaderModuleValidationCacheCreateInfoEXT from VK_EXT_validation_cache // TODO: VkPipelineRobustnessCreateInfoEXT from VK_EXT_pipeline_robustness (allows per-pipeline control of robustness) - const auto stage = specInfo.stage; - (*outEntryPoints) = specInfo.entryPoint; const auto entryPointName = outEntryPoints->c_str(); outEntryPoints++; @@ -1076,8 +1076,8 @@ VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( { outSpecMapEntry->constantID = entry.first; outSpecMapEntry->offset = std::distance(specDataBegin,outSpecData); - outSpecMapEntry->size = entry.second.size; - memcpy(outSpecData,entry.second.data,outSpecMapEntry->size); + outSpecMapEntry->size = entry.second.size(); + memcpy(outSpecData, entry.second.data(), outSpecMapEntry->size); outSpecData += outSpecMapEntry->size; outSpecMapEntry++; } @@ -1098,7 +1098,7 @@ VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( outShaderModule++; // Implicit: https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 - using subgroup_size_t = std::remove_reference_t::SUBGROUP_SIZE; + using subgroup_size_t = asset::IPipelineBase::SUBGROUP_SIZE; if (specInfo.requiredSubgroupSize>=subgroup_size_t::REQUIRE_4) { *ppNext = outRequiredSubgroupSize; @@ -1110,7 +1110,7 @@ VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( else retval.flags = 0; - if (specInfo.requireFullSubgroups) + if (requireFullSubgroups) { 
assert(stage==hlsl::ShaderStage::ESS_COMPUTE/*TODO: Or Mesh Or Task*/); retval.flags |= VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT; @@ -1141,7 +1141,7 @@ void CVulkanLogicalDevice::createComputePipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPUComputePipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) { const VkPipelineCache vk_pipelineCache = pipelineCache ? static_cast(pipelineCache)->getInternalObject():VK_NULL_HANDLE; @@ -1168,7 +1168,7 @@ void CVulkanLogicalDevice::createComputePipelines_impl( { initPipelineCreateInfo(outCreateInfo,info); const auto& spec = info.shader; - outCreateInfo->stage = getVkShaderStageCreateInfoFrom(spec, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo, outSpecMapEntry, outSpecData); + outCreateInfo->stage = getVkShaderStageCreateInfoFrom(spec, hlsl::ShaderStage::ESS_COMPUTE, info.cached.requireFullSubgroups, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo, outSpecMapEntry, outSpecData); outCreateInfo++; } auto vk_pipelines = reinterpret_cast(output); @@ -1187,7 +1187,7 @@ void CVulkanLogicalDevice::createComputePipelines_impl( ); debugNameBuilder.str(""); const auto& specInfo = createInfos[i].shader; - debugNameBuilder << specInfo.shader->getFilepathHint() << "(" << specInfo.entryPoint << "," << specInfo.stage << ")\n"; + debugNameBuilder << specInfo.shader->getFilepathHint() << "(" << specInfo.entryPoint << "," << hlsl::ShaderStage::ESS_COMPUTE << ")\n"; } } else @@ -1198,7 +1198,7 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPUGraphicsPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) { auto getVkStencilOpStateFrom 
= [](const asset::SStencilOpParams& params)->VkStencilOpState @@ -1300,14 +1300,20 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( { initPipelineCreateInfo(outCreateInfo,info); outCreateInfo->pStages = outShaderStage; - for (const auto& spec : info.shaders) + auto processSpecShader = [&](IGPUPipelineBase::SShaderSpecInfo spec, hlsl::ShaderStage shaderStage) { if (spec.shader) { - *(outShaderStage++) = getVkShaderStageCreateInfoFrom(spec, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo, outSpecMapEntry, outSpecData); - outCreateInfo->stageCount = std::distancepStages)>(outCreateInfo->pStages, outShaderStage); + *(outShaderStage++) = getVkShaderStageCreateInfoFrom(spec, shaderStage, false, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo, outSpecMapEntry, outSpecData); + outCreateInfo->stageCount = std::distancepStages)>(outCreateInfo->pStages, outShaderStage); } - } + }; + processSpecShader(info.vertexShader, hlsl::ShaderStage::ESS_VERTEX); + processSpecShader(info.tesselationControlShader, hlsl::ShaderStage::ESS_TESSELLATION_CONTROL); + processSpecShader(info.tesselationEvaluationShader, hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION); + processSpecShader(info.geometryShader, hlsl::ShaderStage::ESS_GEOMETRY); + processSpecShader(info.fragmentShader, hlsl::ShaderStage::ESS_FRAGMENT); + // when dealing with mesh shaders, the vertex input and assembly state will be null { { @@ -1342,17 +1348,13 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( } outCreateInfo->pInputAssemblyState = outInputAssembly++; } - for (const auto& spec : info.shaders) - if (spec.shader) + + if (info.tesselationControlShader.shader || info.tesselationEvaluationShader.shader) { - const auto stage = spec.stage; - if (stage==hlsl::ShaderStage::ESS_TESSELLATION_CONTROL || stage==hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION) - { - outTessellation->patchControlPoints = info.cached.primitiveAssembly.tessPatchVertCount; - 
outCreateInfo->pTessellationState = outTessellation++; - break; - } + outTessellation->patchControlPoints = info.cached.primitiveAssembly.tessPatchVertCount; + outCreateInfo->pTessellationState = outTessellation++; } + const auto& raster = info.cached.rasterization; { outViewport->viewportCount = raster.viewportCount; @@ -1432,16 +1434,22 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( { for (size_t i=0ull; i(createInfos[i],vk_pipeline); debugNameBuilder.str(""); - for (const auto& shader: createInfos[i].shaders) + auto buildDebugName = [&](const IGPUPipelineBase::SShaderSpecInfo& spec, hlsl::ShaderStage stage) { - if (shader.shader != nullptr) - debugNameBuilder <getFilepathHint() << "(" << shader.entryPoint << "," << shader.stage << ")\n"; - } + if (spec.shader != nullptr) + debugNameBuilder <getFilepathHint() << "(" << spec.entryPoint << "," << stage << ")\n"; + }; + buildDebugName(createInfo.vertexShader, hlsl::ESS_VERTEX); + buildDebugName(createInfo.tesselationControlShader, hlsl::ESS_TESSELLATION_CONTROL); + buildDebugName(createInfo.tesselationEvaluationShader, hlsl::ESS_TESSELLATION_EVALUATION); + buildDebugName(createInfo.geometryShader, hlsl::ESS_GEOMETRY); + buildDebugName(createInfo.fragmentShader, hlsl::ESS_FRAGMENT); output[i]->setObjectDebugName(debugNameBuilder.str().c_str()); } } @@ -1453,12 +1461,11 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPURayTracingPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) { - using SShaderGroupParams = asset::IRayTracingPipelineBase::SShaderGroupsParams; - using SGeneralShaderGroup = asset::IRayTracingPipelineBase::SGeneralShaderGroup; - using SHitShaderGroup = asset::IRayTracingPipelineBase::SHitShaderGroup; + using SShaderGroupParams = 
IGPURayTracingPipeline::SCreationParams::SShaderGroupsParams; + using SHitShaderGroup = SShaderGroupParams::SHitGroup; const auto dynamicStates = std::array{ VK_DYNAMIC_STATE_RAY_TRACING_PIPELINE_STACK_SIZE_KHR }; const VkPipelineDynamicStateCreateInfo vk_dynamicStateCreateInfo = { @@ -1473,7 +1480,7 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( size_t maxShaderStages = 0; for (const auto& info : createInfos) - maxShaderStages += info.shaders.size(); + maxShaderStages += info.shaderGroups.getShaderCount(); size_t maxShaderGroups = 0; for (const auto& info : createInfos) maxShaderGroups += info.shaderGroups.getShaderGroupCount(); @@ -1498,40 +1505,51 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( auto outSpecInfo = vk_specializationInfos.data(); auto outSpecMapEntry = vk_specializationMapEntry.data(); auto outSpecData = specializationData.data(); - auto getVkShaderIndex = [](uint32_t index) { return index == SShaderGroupParams::SIndex::Unused ? VK_SHADER_UNUSED_KHR : index; }; - auto getGeneralVkRayTracingShaderGroupCreateInfo = [getVkShaderIndex](SGeneralShaderGroup group) -> VkRayTracingShaderGroupCreateInfoKHR + + for (const auto& info : createInfos) { - return { - .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR, - .pNext = nullptr, - .type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR, - .generalShader = getVkShaderIndex(group.index), - .closestHitShader = VK_SHADER_UNUSED_KHR, - .anyHitShader = VK_SHADER_UNUSED_KHR, - .intersectionShader = VK_SHADER_UNUSED_KHR, + core::unordered_map shaderIndexes; + auto getVkShaderIndex = [&](const asset::IShader* shader) + { return shader == nullptr ? 
VK_SHADER_UNUSED_KHR : shaderIndexes[shader]; }; + + auto getGeneralVkRayTracingShaderGroupCreateInfo = [getVkShaderIndex](IGPUPipelineBase::SShaderSpecInfo spec) -> VkRayTracingShaderGroupCreateInfoKHR + { + return { + .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR, + .pNext = nullptr, + .type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR, + .generalShader = getVkShaderIndex(spec.shader), + .closestHitShader = VK_SHADER_UNUSED_KHR, + .anyHitShader = VK_SHADER_UNUSED_KHR, + .intersectionShader = VK_SHADER_UNUSED_KHR, + }; }; - }; - auto getHitVkRayTracingShaderGroupCreateInfo = [getVkShaderIndex](SHitShaderGroup group) -> VkRayTracingShaderGroupCreateInfoKHR - { - return { - .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR, - .pNext = nullptr, - .type = group.intersection == SShaderGroupParams::SIndex::Unused ? - VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR : VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR, - .generalShader = VK_SHADER_UNUSED_KHR, - .closestHitShader = getVkShaderIndex(group.closestHit), - .anyHitShader = getVkShaderIndex(group.anyHit), - .intersectionShader = getVkShaderIndex(group.intersection), + auto getHitVkRayTracingShaderGroupCreateInfo = [getVkShaderIndex](SHitShaderGroup group) -> VkRayTracingShaderGroupCreateInfoKHR + { + return { + .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR, + .pNext = nullptr, + .type = group.intersection.shader == nullptr ? 
+ VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR : VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR, + .generalShader = VK_SHADER_UNUSED_KHR, + .closestHitShader = getVkShaderIndex(group.closestHit.shader), + .anyHitShader = getVkShaderIndex(group.anyHit.shader), + .intersectionShader = getVkShaderIndex(group.intersection.shader), + }; }; - }; - for (const auto& info : createInfos) - { + initPipelineCreateInfo(outCreateInfo,info); outCreateInfo->pStages = outShaderStage; - for (const auto& specInfo : info.shaders) + auto processSpecInfo = [&](const IGPUPipelineBase::SShaderSpecInfo& spec, hlsl::ShaderStage shaderStage) { - *(outShaderStage++) = getVkShaderStageCreateInfoFrom(specInfo, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo,outSpecMapEntry,outSpecData); - } + if (!spec.shader) return; + if (shaderIndexes.find(spec.shader) == shaderIndexes.end()) + { + shaderIndexes.insert({ spec.shader, static_cast(std::distance(outShaderStage, vk_shaderStage.data()))}); + *(outShaderStage++) = getVkShaderStageCreateInfoFrom(spec, shaderStage, false, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo,outSpecMapEntry,outSpecData); + } + }; + processSpecInfo(info.shaderGroups.raygen, hlsl::ESS_RAYGEN); outCreateInfo->stageCount = std::distancepStages)>(outCreateInfo->pStages,outShaderStage); assert(outCreateInfo->stageCount != 0); diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index 93d45dcc32..f5cda084c5 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -289,20 +289,20 @@ class CVulkanLogicalDevice final : public ILogicalDevice IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPUComputePipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) override; void createGraphicsPipelines_impl( 
IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output, - const IGPUGraphicsPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) override; void createRayTracingPipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output, - const IGPURayTracingPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) override; // queries From 59ccb2240ac7e80ca752b5efc0cd254913e468f6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 27 May 2025 19:43:19 +0700 Subject: [PATCH 227/346] Add get shader count for creationParams --- include/nbl/video/IGPUGraphicsPipeline.h | 11 +++++++++++ include/nbl/video/IGPURayTracingPipeline.h | 5 +++++ 2 files changed, 16 insertions(+) diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index c44ef5ceb1..806ee337c3 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -78,6 +78,17 @@ class IGPUGraphicsPipeline : public IGPUPipeline flags = FLAGS::NONE; + + inline uint32_t getShaderCount() const + { + uint32_t count = 0; + count += (vertexShader.shader != nullptr); + count += (tesselationControlShader.shader != nullptr); + count += (tesselationEvaluationShader.shader != nullptr); + count += (geometryShader.shader != nullptr); + count += (fragmentShader.shader != nullptr); + return count; + } }; inline core::bitflag getCreationFlags() const {return m_flags;} diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index beaecd772a..3bcd4537f3 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -66,6 +66,11 @@ class IGPURayTracingPipeline : public IGPUPipeline Date: Tue, 27 May 2025 14:50:09 +0200 Subject: [PATCH 228/346] 
update validation of kernel version & promote to process logic --- tools/nsc/CMakeLists.txt | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index 0fad4987be..11b78ab4a3 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -157,29 +157,34 @@ set(NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT "${NBL_DOCKER_CTX_DIR}/hlsl.local.prop string(GENEX_STRIP "${NBL_PACKAGE_RUNTIME_EXE_DIR_PATH}" NBL_RELATIVE_ENTRY) set(OUTPUT_CONFIG_FILE $) -function(PROMOTE_PROCESS_ISOLATION HOST_KERNEL BASE VAR) +function(PROMOTE_PROCESS_ISOLATION BASE VAR) set(${VAR} True) macro(INSPECT IMAGE) - execute_process(COMMAND "${DOCKER_EXE}" inspect --format={{.OsVersion}} ${IMAGE} - RESULT_VARIABLE EXIT_LEVEL + execute_process(COMMAND "${DOCKER_EXE}" inspect --format={{.OsVersion}} ${IMAGE} + RESULT_VARIABLE INSPECTION_OK OUTPUT_VARIABLE TARGET_KERNEL OUTPUT_STRIP_TRAILING_WHITESPACE ) endmacro() macro(TO_PROCESS IMAGE TARGET_KERNEL) - if(${HOST_KERNEL} VERSION_LESS ${TARGET_KERNEL}) - set(${VAR} False) - message(STATUS "Host kernel \"${HOST_KERNEL}\" version too low to promote process isolation with \"${IMAGE}\" [${TARGET_KERNEL}] and requires falling back to HyperV. Please update your host OS.") + execute_process(COMMAND "${DOCKER_EXE}" run --rm --isolation process --entrypoint cmd ${BASE} /K + RESULT_VARIABLE PROCESS_ISOLATION_OK + OUTPUT_QUIET ERROR_QUIET + ) + + if(${PROCESS_ISOLATION_OK} EQUAL 0) + message(STATUS "Promoting \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation") else() - message(STATUS "\"${IMAGE}\" [${TARGET_KERNEL}] can be promoted to process isolation with host kernel [${HOST_KERNEL}] version") + set(${VAR} False) + message(STATUS "Cannot promote \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation, requires falling back to HyperV. 
Please update your docker host OS.") endif() endmacro() INSPECT(${BASE}) - if(${EXIT_LEVEL} EQUAL 0) + if(${INSPECTION_OK} EQUAL 0) TO_PROCESS(${BASE} ${TARGET_KERNEL}) else() message(STATUS "\"${BASE}\" not found in local registry, pulling...") @@ -192,9 +197,7 @@ function(PROMOTE_PROCESS_ISOLATION HOST_KERNEL BASE VAR) set(${VAR} ${${VAR}} PARENT_SCOPE) endfunction() -execute_process(COMMAND cmd /C ver OUTPUT_VARIABLE PIPE OUTPUT_STRIP_TRAILING_WHITESPACE) -string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+" HOST_KERNEL "${PIPE}") -PROMOTE_PROCESS_ISOLATION(${HOST_KERNEL} ${BASE_IMAGE} USE_PROCESS_ISOLATION) +PROMOTE_PROCESS_ISOLATION(${BASE_IMAGE} USE_PROCESS_ISOLATION) if(NOT USE_PROCESS_ISOLATION) # NOTE: we would need to use GET_RUNTIME_DEPENDENCIES which uses objdump From bc9befbced6d8489ddc09b84b97ba63207a86985 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 27 May 2025 20:05:53 +0700 Subject: [PATCH 229/346] Move shader stage validation out of commonCreatePipelines --- include/nbl/video/ILogicalDevice.h | 69 +----------------------------- src/nbl/video/ILogicalDevice.cpp | 59 ++++++++++++++++--------- 2 files changed, 41 insertions(+), 87 deletions(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index ab0d5bea06..0ad882a71e 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -1096,8 +1096,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual core::smart_refctd_ptr createRenderpass_impl(const IGPURenderpass::SCreationParams& params, IGPURenderpass::SCreationParamValidationResult&& validation) = 0; virtual core::smart_refctd_ptr createFramebuffer_impl(IGPUFramebuffer::SCreationParams&& params) = 0; - template - inline SSpecializationValidationResult commonCreatePipelines(IGPUPipelineCache* const pipelineCache, const std::span params, ExtraLambda&& extra) + template + inline SSpecializationValidationResult 
commonCreatePipelines(IGPUPipelineCache* const pipelineCache, const std::span params) { if (pipelineCache && !pipelineCache->wasCreatedBy(this)) { @@ -1149,71 +1149,6 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return {}; } - const auto& features = getEnabledFeatures(); - for (auto info : ci.getShaders()) - if (info.shader) - { - const asset::IShader::E_SHADER_STAGE shaderStage = info.stage; - - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-00704 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-00705 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-02091 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-02092 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-00706 - switch (shaderStage) - { - case hlsl::ShaderStage::ESS_TESSELLATION_CONTROL: [[fallthrough]]; - case hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION: - if (!features.tessellationShader) - { - NBL_LOG_ERROR("Cannot create IGPUShader for %p, Tessellation Shader feature not enabled!", info.shader); - return {}; - } - break; - case hlsl::ShaderStage::ESS_GEOMETRY: - if (!features.geometryShader) - { - NBL_LOG_ERROR("Cannot create IGPUShader for %p, Geometry Shader feature not enabled!", info.shader); - return {}; - } - break; - case hlsl::ShaderStage::ESS_ALL_OR_LIBRARY: [[fallthrough]]; - case hlsl::ShaderStage::ESS_VERTEX: [[fallthrough]]; - case hlsl::ShaderStage::ESS_FRAGMENT: [[fallthrough]]; - case hlsl::ShaderStage::ESS_COMPUTE: - break; - // unsupported yet - 
case hlsl::ShaderStage::ESS_TASK: [[fallthrough]]; - case hlsl::ShaderStage::ESS_MESH: - NBL_LOG_ERROR("Unsupported (yet) shader stage"); - return {}; - break; - case hlsl::ShaderStage::ESS_RAYGEN: [[fallthrough]]; - case hlsl::ShaderStage::ESS_ANY_HIT: [[fallthrough]]; - case hlsl::ShaderStage::ESS_CLOSEST_HIT: [[fallthrough]]; - case hlsl::ShaderStage::ESS_MISS: [[fallthrough]]; - case hlsl::ShaderStage::ESS_INTERSECTION: [[fallthrough]]; - case hlsl::ShaderStage::ESS_CALLABLE: - if (!features.rayTracingPipeline) - { - NBL_LOG_ERROR("Cannot create IGPUShader for %p, Raytracing Pipeline feature not enabled!", info.shader); - return {}; - } - break; - default: - // Implicit unsupported stages or weird multi-bit stage enum values - NBL_LOG_ERROR("Unknown Shader Stage %d", shaderStage); - return {}; - break; - } - - if (!extra(info)) - { - NBL_LOG_ERROR("Invalid shader were specified (params[%d])", i); - return {}; - } - } - retval += validation; } return retval; diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index d9e1479d2e..c019be84a7 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -788,16 +788,8 @@ asset::ICPUPipelineCache::SCacheKey ILogicalDevice::getPipelineCacheKey() const bool ILogicalDevice::createComputePipelines(IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output) { std::fill_n(output,params.size(),nullptr); - SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache,params,[this](const IGPUPipelineBase::SShaderSpecInfo& info)->bool - { - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02755 - if (info.requiredSubgroupSize>=asset::IPipelineBase::SUBGROUP_SIZE::REQUIRE_4 && !getPhysicalDeviceLimits().requiredSubgroupSizeStages.hasFlags(hlsl::ShaderStage::ESS_COMPUTE)) - { - NBL_LOG_ERROR("Invalid 
shader stage"); - return false; - } - return true; - }); + SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache, params); + if (!specConstantValidation) { NBL_LOG_ERROR("Invalid parameters were given"); @@ -815,6 +807,14 @@ bool ILogicalDevice::createComputePipelines(IGPUPipelineCache* const pipelineCac for (auto ix = 0u; ix < params.size(); ix++) { const auto& ci = params[ix]; + + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02755 + if (ci.shader.requiredSubgroupSize>=asset::IPipelineBase::SUBGROUP_SIZE::REQUIRE_4 && !getPhysicalDeviceLimits().requiredSubgroupSizeStages.hasFlags(hlsl::ShaderStage::ESS_COMPUTE)) + { + NBL_LOG_ERROR("Invalid shader stage"); + return false; + } + const core::set entryPoints = { asset::ISPIRVDebloater::EntryPoint{.name = ci.shader.entryPoint, .stage = hlsl::ShaderStage::ESS_COMPUTE} }; debloatedShaders.push_back(m_spirvDebloater->debloat(ci.shader.shader, entryPoints, m_logger)); auto debloatedShaderSpec = ci.shader; @@ -845,12 +845,7 @@ bool ILogicalDevice::createGraphicsPipelines( ) { std::fill_n(output, params.size(), nullptr); - SSpecializationValidationResult specConstantValidation = commonCreatePipelines(nullptr, params, - [this](const IGPUPipelineBase::SShaderSpecInfo& info)->bool - { - return info.shader != nullptr; - } - ); + SSpecializationValidationResult specConstantValidation = commonCreatePipelines(nullptr, params); if (!specConstantValidation) { NBL_LOG_ERROR("Invalid parameters were given"); @@ -870,6 +865,27 @@ bool ILogicalDevice::createGraphicsPipelines( for (auto ix = 0u; ix < params.size(); ix++) { const auto& ci = params[ix]; + + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-00704 + // 
https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-00705 + if (ci.tesselationControlShader.shader) + { + NBL_LOG_ERROR("Cannot create IGPUShader for %p, Tessellation Shader feature not enabled!", ci.tesselationControlShader.shader); + return false; + } + + if (ci.tesselationEvaluationShader.shader) + { + NBL_LOG_ERROR("Cannot create IGPUShader for %p, Tessellation Shader feature not enabled!", ci.tesselationEvaluationShader.shader); + return false; + } + + if (ci.geometryShader.shader) + { + NBL_LOG_ERROR("Cannot create IGPUShader for %p, Geometry Shader feature not enabled!", ci.geometryShader.shader); + return false; + } + auto renderpass = ci.renderpass; if (!renderpass->wasCreatedBy(this)) { @@ -996,10 +1012,7 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline core::smart_refctd_ptr* const output) { std::fill_n(output,params.size(),nullptr); - SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache,params,[this](const IGPUPipelineBase::SShaderSpecInfo& info)->bool - { - return true; - }); + SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache,params); if (!specConstantValidation) { NBL_LOG_ERROR("Invalid parameters were given"); @@ -1020,6 +1033,12 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline const bool skipAABBs = bool(param.flags & IGPURayTracingPipeline::SCreationParams::FLAGS::SKIP_AABBS); const bool skipBuiltin = bool(param.flags & IGPURayTracingPipeline::SCreationParams::FLAGS::SKIP_BUILT_IN_PRIMITIVES); + if (!features.rayTracingPipeline) + { + NBL_LOG_ERROR("Raytracing Pipeline feature not enabled!"); + return {}; + } + // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-rayTraversalPrimitiveCulling-03597 if (skipAABBs && skipBuiltin) { From 
3f5708e5f6abd295d8be64f4f9135dcab80f1741 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 27 May 2025 17:11:43 +0200 Subject: [PATCH 230/346] let override publish CE port & control its URL depending on NBL_DOCKER_DIND_BUILD, update CMakePresets.json --- CMakePresets.json | 4 +++- tools/nsc/CMakeLists.txt | 18 ++++++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index 359ec6fb02..ae56cf1739 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -19,7 +19,9 @@ "NBL_EXPLICIT_MODULE_LOAD_LOG": "ON", "NBL_CPACK_NO_BUILD_DIRECTORY_MODULES": "ON", "NBL_CPACK_CI": "ON", - "GIT_FAIL_IF_NONZERO_EXIT": "OFF" + "GIT_FAIL_IF_NONZERO_EXIT": "OFF", + "NBL_DOCKER_DIND_BUILD": "ON", + "NBL_CE_PUBLISH_PORT": "10240" } }, { diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index 11b78ab4a3..55db4ce14a 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -222,7 +222,17 @@ if(NOT EXISTS ${ICU_DIR} OR NOT EXISTS ${ICU_DLL}) endif() set(ORPHAN nsc-orphan) -set(NBL_CE_URL http://${ORPHAN}:10240) + +if(NOT DEFINED NBL_CE_PUBLISH_PORT) + set(NBL_CE_PUBLISH_PORT 80) +endif() + +if(NBL_DOCKER_DIND_BUILD) + set(NBL_CE_URL http://${ORPHAN}:${NBL_CE_PUBLISH_PORT}) +else() + set(NBL_CE_URL http://localhost:${NBL_CE_PUBLISH_PORT}) +endif() + set(NBL_CE_HEALTHY_CHECK_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/ce_healthy_check.py") set(NBL_CE_ENDPOINT_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/endpoint.py") set(NBL_NSC_BASIC_HLSL_JPAYLOAD "${CMAKE_CURRENT_SOURCE_DIR}/docker/godbolt/hlsl-basic-compile-payload.json") @@ -269,8 +279,8 @@ execute_process(COMMAND "${DOCKER_EXE}" build --isolation process COMMAND_ERROR_IS_FATAL ANY) message(STATUS "Running new NSC orphan container") -execute_process(COMMAND "${DOCKER_EXE}" run -di -p 80:10240 --isolation process - --name "${ORPHAN}" +execute_process(COMMAND "${DOCKER_EXE}" run -di -p ${NBL_CE_PUBLISH_PORT}:10240 --isolation process + 
--name "${ORPHAN}" --network docker_default -v $ -v $ ${NSC_IMAGE_NAME} @@ -292,7 +302,7 @@ message(STATUS "Printing NSC container logs") execute_process(COMMAND "${DOCKER_EXE}" logs "${ORPHAN}" COMMAND_ERROR_IS_FATAL ANY) message(STATUS "OK! NSC container is healthy.") -message(STATUS "Type \"localhost\" in your browser to use NSC with Godbolt!") +message(STATUS "Type \"${NBL_CE_URL}\" in your browser to use NSC with Godbolt!") ]=] INSTRUCTIONS) file(GENERATE OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/run-compiler-explorer-$.cmake" CONTENT "${INSTRUCTIONS}") From b81fb12b95561e9c6822b20ddac6d02d7fc4ee23 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 27 May 2025 17:16:48 +0200 Subject: [PATCH 231/346] and specify network for builder, update .github/workflows/build-nabla.yml --- .github/workflows/build-nabla.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 94263a89e8..30f1156096 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -66,7 +66,7 @@ jobs: --entrypoint ${{ env.entry }} -di --isolation process ` --env-file .\docker\ci-windows.env ` --env-file .\docker\ninja.env ` - --name orphan ` + --name orphan --network docker_default ` -v "${{ github.workspace }}:${{ env.mount }}" ` -v "${pipeHost}:\\.\pipe\dockerd" -e "DOCKER_HOST=npipe:////./pipe/dockerd" ` -w "${{ env.mount }}" ` From fcbfa5c56380414582ae381c5cad3f04028f34dc Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 27 May 2025 17:31:18 +0200 Subject: [PATCH 232/346] add "create default network" step to actions --- .github/workflows/build-nabla.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 30f1156096..d079b8bcaf 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -55,6 +55,13 @@ jobs: run: | docker pull "${{ env.image }}:${{ 
matrix.tag }}" + - name: Create default network + run: | + if (-not (docker network ls --format '{{.Name}}' | Where-Object { $_ -eq 'docker_default' })) { + docker network create --driver nat docker_default + if ($LASTEXITCODE -ne 0) { exit 1 } + } + - name: Run Container run: | $ctx = docker context show From 472aa0ba6f98bed8a8d3996bececb514e1473046 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 28 May 2025 10:50:00 +0700 Subject: [PATCH 233/346] more fixes to indexing --- .../hlsl/workgroup2/arithmetic_config.hlsl | 15 +++++++++++++-- .../nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index e02c74e80b..1587f919cc 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -46,6 +46,11 @@ struct ArithmeticConfiguration using virtual_wg_t = impl::virtual_wg_size_log2; NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << virtual_wg_t::value; + static_assert(VirtualWorkgropupSize<=WorkgroupSize*SubgroupSize) + + NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = uint16_t(0x1u) << __SubgroupsPerVirtualWorkgroupLog2; + using items_per_invoc_t = impl::items_per_invocation; // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? 
doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = items_per_invoc_t::value0; @@ -74,10 +79,16 @@ struct ArithmeticConfiguration template static uint32_t sharedStoreIndex(const uint32_t subgroupID) { + uint32_t offsetBySubgroup; + if (level == LevelCount-1) + offsetBySubgroup = SubgroupSize; + else + offsetBySubgroup = __SubgroupsPerVirtualWorkgroup; + if (level<2) - return (subgroupID & (ItemsPerInvocation_1-1)) * SubgroupSize + (subgroupID/ItemsPerInvocation_1); + return (subgroupID & (ItemsPerInvocation_1-1)) * offsetBySubgroup + (subgroupID/ItemsPerInvocation_1); else - return (subgroupID & (ItemsPerInvocation_2-1)) * SubgroupSize + (subgroupID/ItemsPerInvocation_2); + return (subgroupID & (ItemsPerInvocation_2-1)) * offsetBySubgroup + (subgroupID/ItemsPerInvocation_2); } template diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 418c3219f4..99238851eb 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -321,7 +321,7 @@ struct scan // level 1 scan const uint32_t lv1_smem_size = Config::SubgroupsSize*Config::ItemsPerInvocation_1; subgroup2::inclusive_scan inclusiveScan1; - if (glsl::gl_SubgroupID() < lv1_smem_size) + if (glsl::gl_SubgroupID() < Config::SubgroupsSize*Config::ItemsPerInvocation_2) { vector_lv1_t lv1_val; const uint32_t prevIndex = invocationIndex-1; From c483941b09f804fada57491a4f69ffdb27518df2 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 28 May 2025 11:38:18 +0700 Subject: [PATCH 234/346] share level 0 scan between 2-level and 3-level scans (and reduce) --- .../hlsl/workgroup2/arithmetic_config.hlsl | 2 +- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 93 ++++++++----------- 2 files changed, 40 insertions(+), 55 deletions(-) diff --git 
a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 1587f919cc..75947ea97c 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -46,7 +46,7 @@ struct ArithmeticConfiguration using virtual_wg_t = impl::virtual_wg_size_log2; NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << virtual_wg_t::value; - static_assert(VirtualWorkgropupSize<=WorkgroupSize*SubgroupSize) + static_assert(VirtualWorkgroupSize<=WorkgroupSize*SubgroupSize); NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = uint16_t(0x1u) << __SubgroupsPerVirtualWorkgroupLog2; diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 99238851eb..195431c5d3 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -85,22 +85,17 @@ struct reduce using vector_lv0_t = vector; // data accessor needs to be this type using vector_lv1_t = vector; - template - scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + template + static void __doLevel0(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { - using config_t = subgroup2::Configuration; - using params_lv0_t = subgroup2::ArithmeticParams; - using params_lv1_t = subgroup2::ArithmeticParams; - BinOp binop; - const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 0 scan - subgroup2::reduction reduction0; + subgroup2::reduction reduction0; [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / 
Config::WorkgroupSize; idx++) { - vector_lv0_t scan_local; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + vector_t scan_local; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); scan_local = reduction0(scan_local); if (Config::electLast()) { @@ -109,7 +104,19 @@ struct reduce } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); + } + + template + scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv0_t = subgroup2::ArithmeticParams; + using params_lv1_t = subgroup2::ArithmeticParams; + BinOp binop; + __doLevel0(dataAccessor, scratchAccessor); + + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan subgroup2::reduction reduction1; if (glsl::gl_SubgroupID() == 0) @@ -138,24 +145,19 @@ struct scan using vector_lv0_t = vector; // data accessor needs to be this type using vector_lv1_t = vector; - template - void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + template + static void __doLevel0(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { - using config_t = subgroup2::Configuration; - using params_lv0_t = subgroup2::ArithmeticParams; - using params_lv1_t = subgroup2::ArithmeticParams; - BinOp binop; - const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); - subgroup2::inclusive_scan inclusiveScan0; + subgroup2::inclusive_scan inclusiveScan0; // level 0 scan [unroll] for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { - vector_lv0_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + vector_t value; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); value = 
inclusiveScan0(value); - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); if (Config::electLast()) { const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); @@ -163,7 +165,19 @@ struct scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); + } + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv0_t = subgroup2::ArithmeticParams; + using params_lv1_t = subgroup2::ArithmeticParams; + BinOp binop; + + __doLevel0(dataAccessor, scratchAccessor); + + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan subgroup2::inclusive_scan inclusiveScan1; if (glsl::gl_SubgroupID() == 0) @@ -228,23 +242,9 @@ struct reduce using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; - const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); - // level 0 scan - subgroup2::reduction reduction0; - [unroll] - for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) - { - vector_lv0_t scan_local; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); - scan_local = reduction0(scan_local); - if (Config::electLast()) - { - const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); - scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan - } - } - scratchAccessor.workgroupExecutionAndMemoryBarrier(); + reduce::template __doLevel0(dataAccessor, scratchAccessor); + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan const uint32_t 
lv1_smem_size = Config::SubgroupsSize*Config::ItemsPerInvocation_1; subgroup2::reduction reduction1; @@ -300,24 +300,9 @@ struct scan using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; - const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); - subgroup2::inclusive_scan inclusiveScan0; - // level 0 scan - [unroll] - for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) - { - vector_lv0_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - value = inclusiveScan0(value); - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - if (Config::electLast()) - { - const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); - scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan - } - } - scratchAccessor.workgroupExecutionAndMemoryBarrier(); + scan::template __doLevel0(dataAccessor, scratchAccessor); + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan const uint32_t lv1_smem_size = Config::SubgroupsSize*Config::ItemsPerInvocation_1; subgroup2::inclusive_scan inclusiveScan1; From 951ff99bc2ab1be385010c06ca3ba8ad236f2b2c Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 28 May 2025 12:11:14 +0700 Subject: [PATCH 235/346] reduce duplicate vars in config --- .../builtin/hlsl/workgroup2/arithmetic_config.hlsl | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 75947ea97c..c0e105e700 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -15,20 +15,23 @@ namespace workgroup2 namespace 
impl { -template +template struct virtual_wg_size_log2 { + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2; static_assert(WorkgroupSizeLog2>=SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2*3+4, "WorkgroupSize cannot be larger than (SubgroupSize^3)*16"); + NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v; // must have at least enough level 0 outputs to feed a single subgroup }; -template +template struct items_per_invocation { - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocationProductLog2 = mpl::max_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocationProductLog2 = mpl::max_v; NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = BaseItemsPerInvocation; NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value, ItemsPerInvocationProductLog2>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v; @@ -51,7 +54,7 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = uint16_t(0x1u) << __SubgroupsPerVirtualWorkgroupLog2; - using items_per_invoc_t = impl::items_per_invocation; + using items_per_invoc_t = impl::items_per_invocation; // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? 
doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = items_per_invoc_t::value0; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = items_per_invoc_t::value1; From 1f64763acb7cb41c8cde1d5a65ca9316d3da34cb Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 28 May 2025 10:06:25 +0200 Subject: [PATCH 236/346] add labeling to image creation steps, update actions to upload image workflow artifacts --- .github/workflows/build-nabla.yml | 34 +++-- tools/nsc/CMakeLists.txt | 241 +++++++++++++++++------------- 2 files changed, 165 insertions(+), 110 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index d079b8bcaf..e15d1a5ab2 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -39,6 +39,11 @@ jobs: Set-MpPreference -DisableRemovableDriveScanning $true Set-MpPreference -DisableArchiveScanning $true Set-MpPreference -DisableScanningMappedNetworkDrivesForFullScan $true + + if (-not (docker network ls --format '{{.Name}}' | Where-Object { $_ -eq 'docker_default' })) { + docker network create --driver nat docker_default + if ($LASTEXITCODE -ne 0) { exit 1 } + } - name: Checkout uses: actions/checkout@v4 @@ -47,21 +52,17 @@ jobs: - name: Set prefix id: set-prefix - shell: bash run: | - echo "prefix=run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" >> "$GITHUB_OUTPUT" + $prefix = "run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" + $nscTargetTaggedImage = "ghcr.io/$env:GITHUB_REPOSITORY:nsc-godbolt-build-${{ matrix.vendor }}-${{ matrix.config }}-${{ matrix.tag }}".ToLower() + + "prefix=$prefix" >> $env:GITHUB_OUTPUT + "nscTargetTaggedImage=$nscTargetTaggedImage" >> $env:GITHUB_OUTPUT - name: Pull Image run: | docker pull "${{ env.image }}:${{ matrix.tag }}" - - name: Create default 
network - run: | - if (-not (docker network ls --format '{{.Name}}' | Where-Object { $_ -eq 'docker_default' })) { - docker network create --driver nat docker_default - if ($LASTEXITCODE -ne 0) { exit 1 } - } - - name: Run Container run: | $ctx = docker context show @@ -97,6 +98,7 @@ jobs: --preset ci-configure-dynamic-${{ matrix.vendor }} ` --profiling-output=profiling/cmake-profiling.json ` --profiling-format=google-trace + -DNSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }} - name: Container – Build NSC run: | @@ -116,11 +118,23 @@ jobs: ${{ env.binary }} --config ${{ matrix.config }} ` --component Executables --prefix ${{ env.install }} - - name: Package workflow artifacts + - name: Container – Save NSC Image + run: | + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} -Command docker ` + save ${{ steps.set-prefix.outputs.nscTargetTaggedImage }} | zstd -T0 -3 -f -o ${{ steps.set-prefix.outputs.prefix }}-nsc-godbolt-image.tar.zst + + - name: Package left workflow artifacts run: | tar -cvf "${{ steps.set-prefix.outputs.prefix }}-profiling.tar" profiling tar -cvf "${{ steps.set-prefix.outputs.prefix }}-install.tar" ${{ env.install }} + - name: Upload NSC Godbolt Image artifact + uses: actions/upload-artifact@v4 + with: + name: ${{ steps.set-prefix.outputs.prefix }}-nsc-godbolt-image + path: ${{ steps.set-prefix.outputs.prefix }}-nsc-godbolt-image.tar.zst + - name: Upload profiling artifacts uses: actions/upload-artifact@v4 with: diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index 55db4ce14a..d3b8bdf94a 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -59,11 +59,82 @@ add_test(NAME NBL_NSC_DUMP_BUILD_INFO_TEST if(NBL_ENABLE_DOCKER_INTEGRATION) +find_program(DOCKER_EXE NAMES docker REQUIRED) set(BASE_IMAGE ghcr.io/devsh-graphics-programming/compiler-explorer-docker:nano-2022) +set(CORE_IMAGE mcr.microsoft.com/windows/servercore:ltsc2022) -find_program(CTEST_EXE NAMES ctest REQUIRED) 
-find_program(DOCKER_EXE NAMES docker REQUIRED) +function(PROMOTE_PROCESS_ISOLATION BASE VAR) + set(${VAR} True) + + macro(INSPECT IMAGE) + execute_process(COMMAND "${DOCKER_EXE}" inspect --format={{.OsVersion}} ${IMAGE} + RESULT_VARIABLE INSPECTION_OK + OUTPUT_VARIABLE TARGET_KERNEL + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + endmacro() + + macro(TO_PROCESS IMAGE TARGET_KERNEL) + execute_process(COMMAND "${DOCKER_EXE}" run --rm --isolation process --entrypoint cmd ${BASE} /K + RESULT_VARIABLE PROCESS_ISOLATION_OK + OUTPUT_QUIET ERROR_QUIET + ) + + if(${PROCESS_ISOLATION_OK} EQUAL 0) + message(STATUS "Promoting \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation") + else() + set(${VAR} False) + message(STATUS "Cannot promote \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation, requires falling back to HyperV. Please update your docker host OS.") + endif() + endmacro() + + INSPECT(${BASE}) + + if(${INSPECTION_OK} EQUAL 0) + TO_PROCESS(${BASE} ${TARGET_KERNEL}) + else() + message(STATUS "\"${BASE}\" not found in local registry, pulling...") + execute_process(COMMAND "${DOCKER_EXE}" pull ${BASE}) + + INSPECT(${BASE}) + TO_PROCESS(${BASE} ${TARGET_KERNEL}) + endif() + + set(${VAR} ${${VAR}} PARENT_SCOPE) +endfunction() + +PROMOTE_PROCESS_ISOLATION(${BASE_IMAGE} USE_PROCESS_ISOLATION) + +if(NOT USE_PROCESS_ISOLATION) + # NOTE: we would need to use GET_RUNTIME_DEPENDENCIES which uses objdump + # https://cmake.org/cmake/help/latest/command/file.html#get-runtime-dependencies + # to collect *all* missing deps and copy (FROM at least server core) to destination nano + # image, it will fail currently if we fully isolate it with VM due to lack of certain DLLs + # BUT it means violating EULA, hence we are not going to support it, also (**) + message(FATAL_ERROR "HyperV is NOT supported! 
Update your OS!") +endif() + +function(GET_LABEL BASE_IMAGE LABEL VAR) + set(FORMAT "{{ index .Config.Labels \"${LABEL}\" }}") + execute_process(COMMAND ${DOCKER_EXE} inspect --format=${FORMAT} ${BASE_IMAGE} + OUTPUT_VARIABLE OUT + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_VARIABLE ERR + RESULT_VARIABLE RES + ) + if(NOT RES EQUAL 0) + message(WARNING "Could not get \"${LABEL}\" label from \"${BASE_IMAGE}\" image, it doesn't exist!") + endif() + + set(${VAR} "${OUT}" PARENT_SCOPE) +endfunction() + +GET_LABEL(${BASE_IMAGE} org.opencontainers.image.title ORG_LABEL_TITLE) +GET_LABEL(${BASE_IMAGE} org.opencontainers.image.source ORG_LABEL_SOURCE) +GET_LABEL(${BASE_IMAGE} org.opencontainers.image.description ORG_LABEL_DESCRIPTION) + +find_program(CTEST_EXE NAMES ctest REQUIRED) find_file(DXIL_DLL NAMES dxil.dll HINTS "$ENV{CMAKE_WINDOWS_KITS_10_DIR}/Redist/D3D/x64" "C:/Program Files (x86)/Windows Kits/10/Redist/D3D/x64" REQUIRED) set(ICU_GLOBALIZATION_DIR C:\\Windows\\Globalization\\ICU) @@ -144,6 +215,11 @@ ENTRYPOINT [ ` "copy", "C:\\mount\\Windows\\System32\\icu.dll", "C:\\Windows\\System32\\icu.dll", "&&", ` "node", "--no-warnings", "--no-deprecation", "--import=tsx", "./app.js", "--language", "hlsl" ` ] + +LABEL org.opencontainers.image.title="[Nabla Shader Compiler (NSC)]: @ORG_LABEL_TITLE@" +LABEL org.opencontainers.image.source=https://github.com/Devsh-Graphics-Programming/Nabla +LABEL org.opencontainers.image.description="[Nabla Shader Compiler (NSC)]: @ORG_LABEL_DESCRIPTION@" + ]=] INSTRUCTIONS @ONLY) set(DOCKERFILE "${NBL_DOCKER_CTX_DIR}/Dockerfile") @@ -157,58 +233,6 @@ set(NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT "${NBL_DOCKER_CTX_DIR}/hlsl.local.prop string(GENEX_STRIP "${NBL_PACKAGE_RUNTIME_EXE_DIR_PATH}" NBL_RELATIVE_ENTRY) set(OUTPUT_CONFIG_FILE $) -function(PROMOTE_PROCESS_ISOLATION BASE VAR) - set(${VAR} True) - - macro(INSPECT IMAGE) - execute_process(COMMAND "${DOCKER_EXE}" inspect --format={{.OsVersion}} ${IMAGE} - RESULT_VARIABLE INSPECTION_OK - 
OUTPUT_VARIABLE TARGET_KERNEL - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - endmacro() - - macro(TO_PROCESS IMAGE TARGET_KERNEL) - execute_process(COMMAND "${DOCKER_EXE}" run --rm --isolation process --entrypoint cmd ${BASE} /K - RESULT_VARIABLE PROCESS_ISOLATION_OK - OUTPUT_QUIET ERROR_QUIET - ) - - if(${PROCESS_ISOLATION_OK} EQUAL 0) - message(STATUS "Promoting \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation") - else() - set(${VAR} False) - message(STATUS "Cannot promote \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation, requires falling back to HyperV. Please update your docker host OS.") - endif() - endmacro() - - INSPECT(${BASE}) - - if(${INSPECTION_OK} EQUAL 0) - TO_PROCESS(${BASE} ${TARGET_KERNEL}) - else() - message(STATUS "\"${BASE}\" not found in local registry, pulling...") - execute_process(COMMAND "${DOCKER_EXE}" pull ${BASE}) - - INSPECT(${BASE}) - TO_PROCESS(${BASE} ${TARGET_KERNEL}) - endif() - - set(${VAR} ${${VAR}} PARENT_SCOPE) -endfunction() - -PROMOTE_PROCESS_ISOLATION(${BASE_IMAGE} USE_PROCESS_ISOLATION) - -if(NOT USE_PROCESS_ISOLATION) - # NOTE: we would need to use GET_RUNTIME_DEPENDENCIES which uses objdump - # https://cmake.org/cmake/help/latest/command/file.html#get-runtime-dependencies - # to collect *all* missing deps and copy (FROM at least server core) to destination nano - # image, it will fail currently if we fully isolate it with VM due to lack of certain DLLs - # BUT it means violating EULA, hence we are not going to support it, also (**) - message(FATAL_ERROR "HyperV is NOT supported! 
Update your OS!") -endif() - -set(CORE_IMAGE mcr.microsoft.com/windows/servercore:ltsc2022) set(ICU_DIR C:\\Windows\\Globalization\\ICU) set(ICU_DLL C:\\Windows\\System32\\icu.dll) if(NOT EXISTS ${ICU_DIR} OR NOT EXISTS ${ICU_DLL}) @@ -240,75 +264,92 @@ set(NBL_NSC_BASIC_HLSL_JPAYLOAD "${CMAKE_CURRENT_SOURCE_DIR}/docker/godbolt/hlsl # to avoid "too long input" errors we proxy build instructions to CMake script and write it to build directory string(CONFIGURE [=[ message(STATUS "Killing remaining NSC orphans") -execute_process(COMMAND "${DOCKER_EXE}" rm -f "${ORPHAN}" RESULT_VARIABLE res) +execute_process(COMMAND "@DOCKER_EXE@" + rm -f "@ORPHAN@" + RESULT_VARIABLE res +) message(STATUS "Executing CTests") -execute_process(COMMAND "${CTEST_EXE}" -C "$" --stop-on-failure WORKING_DIRECTORY "@CMAKE_CURRENT_BINARY_DIR@" - COMMAND_ERROR_IS_FATAL ANY) +execute_process(COMMAND "@CTEST_EXE@" + -C "$" --stop-on-failure + WORKING_DIRECTORY "@CMAKE_CURRENT_BINARY_DIR@" + COMMAND_ERROR_IS_FATAL ANY +) message(STATUS "Generating NSC build info") -execute_process(COMMAND "${CMAKE_COMMAND}" - "-DNBL_EXECUTABLE_PATH=${NBL_NSC_PREINSTALL_TARGET_EXE_FILEPATH}" - "-DNBL_BUILD_INFO=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" - "-DNBL_OUTPUT_FILE=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" - "-DNBL_OUTPUT_EXE_OVERRIDE=$" - -P "${NBL_ROOT_PATH}/cmake/scripts/nbl/nablaBuildInfo.cmake" - COMMAND_ERROR_IS_FATAL ANY) +execute_process(COMMAND "@CMAKE_COMMAND@" + "-DNBL_EXECUTABLE_PATH=@NBL_NSC_PREINSTALL_TARGET_EXE_FILEPATH@" + "-DNBL_BUILD_INFO=@NBL_NSC_PREINSTALL_TARGET_BUILD_INFO@" + "-DNBL_OUTPUT_FILE=@NBL_NSC_PREINSTALL_TARGET_BUILD_INFO@" + "-DNBL_OUTPUT_EXE_OVERRIDE=$" + -P "@NBL_ROOT_PATH@/cmake/scripts/nbl/nablaBuildInfo.cmake" + COMMAND_ERROR_IS_FATAL ANY +) message(STATUS "Generating NSC godbolt config") -execute_process(COMMAND "${CMAKE_COMMAND}" - "-DSPIRV_DIS_EXE=spirv-dis.exe" - "-DNSC_RELEASE_BUILD_INFO=$" - "-DNSC_RELWITHDEBINFO_BUILD_INFO=$" - "-DNSC_DEBUG_BUILD_INFO=$" - 
"-DOUTPUT_CONFIG_FILE=${OUTPUT_CONFIG_FILE}" - -P "${CMAKE_CURRENT_SOURCE_DIR}/ce-generate-config.cmake" - COMMAND_ERROR_IS_FATAL ANY) +execute_process(COMMAND "@CMAKE_COMMAND@" + "-DSPIRV_DIS_EXE=spirv-dis.exe" + "-DNSC_RELEASE_BUILD_INFO=$" + "-DNSC_RELWITHDEBINFO_BUILD_INFO=$" + "-DNSC_DEBUG_BUILD_INFO=$" + "-DOUTPUT_CONFIG_FILE=@OUTPUT_CONFIG_FILE@" + -P "@CMAKE_CURRENT_SOURCE_DIR@/ce-generate-config.cmake" + COMMAND_ERROR_IS_FATAL ANY +) message(STATUS "Updating NSC package context") -execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory_if_different - "$" - "${NBL_DOCKER_CTX_DIR}/Nabla" - COMMAND_ERROR_IS_FATAL ANY) +execute_process(COMMAND "@CMAKE_COMMAND@" -E copy_directory_if_different + "$" + "@NBL_DOCKER_CTX_DIR@/Nabla" + COMMAND_ERROR_IS_FATAL ANY +) message(STATUS "Building NSC Godbolt image") -execute_process(COMMAND "${DOCKER_EXE}" build --isolation process - -f "${DOCKERFILE}" - -t ${NSC_IMAGE_NAME} - "${NBL_DOCKER_CTX_DIR}" - COMMAND_ERROR_IS_FATAL ANY) +string(TIMESTAMP BUILD_TIMESTAMP "%Y-%m-%dT%H:%M:%SZ" UTC) +execute_process(COMMAND "@DOCKER_EXE@" build --isolation process + --label=org.opencontainers.image.created="${BUILD_TIMESTAMP}" + -f "@DOCKERFILE@" -t @NSC_IMAGE_NAME@ "@NBL_DOCKER_CTX_DIR@" + COMMAND_ERROR_IS_FATAL ANY +) message(STATUS "Running new NSC orphan container") -execute_process(COMMAND "${DOCKER_EXE}" run -di -p ${NBL_CE_PUBLISH_PORT}:10240 --isolation process - --name "${ORPHAN}" --network docker_default - -v $ - -v $ - ${NSC_IMAGE_NAME} - COMMAND_ERROR_IS_FATAL ANY) +execute_process(COMMAND "@DOCKER_EXE@" run -di -p @NBL_CE_PUBLISH_PORT@:10240 --isolation process + --name "@ORPHAN@" --network docker_default + -v $ + -v $ + @NSC_IMAGE_NAME@ + COMMAND_ERROR_IS_FATAL ANY +) message(STATUS "Healthy check") -execute_process(COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" - --url "${NBL_CE_URL}" --interval 5 --ticks 12 - COMMAND_ERROR_IS_FATAL ANY) +execute_process(COMMAND "@_Python3_EXECUTABLE@" 
"@NBL_CE_HEALTHY_CHECK_PY@" + --url "@NBL_CE_URL@" --interval 5 --ticks 12 + COMMAND_ERROR_IS_FATAL ANY +) message(STATUS "Post Basic NSC shader compile check") -execute_process(COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" - --url "${NBL_CE_URL}" - --endpoint /api/compiler/nsc_$>_upstream/compile - --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" - COMMAND_ERROR_IS_FATAL ANY) +execute_process(COMMAND "@_Python3_EXECUTABLE@" "@NBL_CE_ENDPOINT_PY@" + --url "@NBL_CE_URL@" + --endpoint /api/compiler/nsc_$>_upstream/compile + --method POST --json "@NBL_NSC_BASIC_HLSL_JPAYLOAD@" + COMMAND_ERROR_IS_FATAL ANY +) message(STATUS "Printing NSC container logs") -execute_process(COMMAND "${DOCKER_EXE}" logs "${ORPHAN}" COMMAND_ERROR_IS_FATAL ANY) +execute_process(COMMAND "@DOCKER_EXE@" + logs "@ORPHAN@" + COMMAND_ERROR_IS_FATAL ANY +) message(STATUS "OK! NSC container is healthy.") -message(STATUS "Type \"${NBL_CE_URL}\" in your browser to use NSC with Godbolt!") -]=] INSTRUCTIONS) +message(STATUS "Type \"@NBL_CE_URL@\" in your browser to use NSC with Godbolt!") +]=] INSTRUCTIONS @ONLY) -file(GENERATE OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/run-compiler-explorer-$.cmake" CONTENT "${INSTRUCTIONS}") +set(SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/run-compiler-explorer-$.cmake") +file(GENERATE OUTPUT ${SCRIPT_FILE} CONTENT "${INSTRUCTIONS}") add_custom_target(run-compiler-explorer ALL - COMMAND "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_BINARY_DIR}/run-compiler-explorer-$.cmake" + COMMAND "${CMAKE_COMMAND}" -P ${SCRIPT_FILE} VERBATIM COMMAND_EXPAND_LISTS ) From d5318514a57d0d25f7c1710cc079092a33516afd Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 28 May 2025 10:36:32 +0200 Subject: [PATCH 237/346] correct passing vars in shell --- .github/workflows/build-nabla.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index e15d1a5ab2..d5d2a2b8d6 100644 --- 
a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -52,9 +52,12 @@ jobs: - name: Set prefix id: set-prefix + shell: pwsh run: | $prefix = "run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" - $nscTargetTaggedImage = "ghcr.io/$env:GITHUB_REPOSITORY:nsc-godbolt-build-${{ matrix.vendor }}-${{ matrix.config }}-${{ matrix.tag }}".ToLower() + $repo = $env:GITHUB_REPOSITORY + $tag = "nsc-godbolt-build-${{ matrix.vendor }}-${{ matrix.config }}-${{ matrix.tag }}" + $nscTargetTaggedImage = "ghcr.io/$repo:$tag".ToLower() "prefix=$prefix" >> $env:GITHUB_OUTPUT "nscTargetTaggedImage=$nscTargetTaggedImage" >> $env:GITHUB_OUTPUT @@ -98,7 +101,7 @@ jobs: --preset ci-configure-dynamic-${{ matrix.vendor }} ` --profiling-output=profiling/cmake-profiling.json ` --profiling-format=google-trace - -DNSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }} + "-DNSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }}" - name: Container – Build NSC run: | From 2074c138b71e2af95242d030b4fce5742a313027 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 28 May 2025 10:49:35 +0200 Subject: [PATCH 238/346] post fixes to actions, use ${} to delimit var name --- .github/workflows/build-nabla.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index d5d2a2b8d6..e022353652 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -44,11 +44,6 @@ jobs: docker network create --driver nat docker_default if ($LASTEXITCODE -ne 0) { exit 1 } } - - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: 'recursive' - name: Set prefix id: set-prefix @@ -57,10 +52,15 @@ jobs: $prefix = "run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" $repo = $env:GITHUB_REPOSITORY $tag = "nsc-godbolt-build-${{ matrix.vendor }}-${{ matrix.config }}-${{ matrix.tag }}" - 
$nscTargetTaggedImage = "ghcr.io/$repo:$tag".ToLower() + $nscTargetTaggedImage = "ghcr.io/${repo}:${tag}".ToLower() "prefix=$prefix" >> $env:GITHUB_OUTPUT "nscTargetTaggedImage=$nscTargetTaggedImage" >> $env:GITHUB_OUTPUT + + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: 'recursive' - name: Pull Image run: | From 353c46775da80c04678b8a732be9e1af7950b233 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 28 May 2025 11:33:39 +0200 Subject: [PATCH 239/346] ahh typo! --- .github/workflows/build-nabla.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index e022353652..a7b0fe3e68 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -100,7 +100,7 @@ jobs: ${{ env.entry }} ${{ env.cmd }} -Command cmake ` --preset ci-configure-dynamic-${{ matrix.vendor }} ` --profiling-output=profiling/cmake-profiling.json ` - --profiling-format=google-trace + --profiling-format=google-trace ` "-DNSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }}" - name: Container – Build NSC From b8d53cccb83629454bbb48ab827e0172a03bb26f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 28 May 2025 16:48:27 +0700 Subject: [PATCH 240/346] Fix vulkan ray tracing creation --- include/nbl/video/CVulkanRayTracingPipeline.h | 5 ++- src/nbl/video/CVulkanComputePipeline.h | 5 ++- src/nbl/video/CVulkanLogicalDevice.cpp | 3 +- src/nbl/video/CVulkanRayTracingPipeline.cpp | 33 ++++++++++--------- src/nbl/video/ILogicalDevice.cpp | 6 ++-- 5 files changed, 27 insertions(+), 25 deletions(-) diff --git a/include/nbl/video/CVulkanRayTracingPipeline.h b/include/nbl/video/CVulkanRayTracingPipeline.h index 82d8c777b6..a9bc476f43 100644 --- a/include/nbl/video/CVulkanRayTracingPipeline.h +++ b/include/nbl/video/CVulkanRayTracingPipeline.h @@ -41,10 +41,13 @@ class CVulkanRayTracingPipeline final : public IGPURayTracingPipeline const VkPipeline 
m_vkPipeline; ShaderGroupHandleContainer m_shaderGroupHandles; - uint16_t m_raygenStackSize; core::smart_refctd_dynamic_array m_missStackSizes; core::smart_refctd_dynamic_array m_hitGroupStackSizes; core::smart_refctd_dynamic_array m_callableStackSizes; + uint32_t m_missGroupCount; + uint32_t m_hitGroupCount; + uint32_t m_callableGroupCount; + uint16_t m_raygenStackSize; uint32_t getRaygenIndex() const; uint32_t getMissBaseIndex() const; diff --git a/src/nbl/video/CVulkanComputePipeline.h b/src/nbl/video/CVulkanComputePipeline.h index 76fb346e30..89077f9a9a 100644 --- a/src/nbl/video/CVulkanComputePipeline.h +++ b/src/nbl/video/CVulkanComputePipeline.h @@ -15,10 +15,9 @@ class CVulkanComputePipeline final : public IGPUComputePipeline { public: CVulkanComputePipeline( - core::smart_refctd_ptr&& _layout, - const core::bitflag _flags, + const SCreationParams& params, const VkPipeline pipeline - ) : IGPUComputePipeline(std::move(_layout),_flags), m_pipeline(pipeline) {} + ) : IGPUComputePipeline(params), m_pipeline(pipeline) {} inline const void* getNativeHandle() const override { return &m_pipeline; } diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 6050b7a7a0..216fefcef9 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -1182,8 +1182,7 @@ void CVulkanLogicalDevice::createComputePipelines_impl( // break the lifetime cause of the aliasing std::uninitialized_default_construct_n(output+i,1); output[i] = core::make_smart_refctd_ptr( - core::smart_refctd_ptr(info.layout), - info.flags,vk_pipeline + info,vk_pipeline ); debugNameBuilder.str(""); const auto& specInfo = createInfos[i].shader; diff --git a/src/nbl/video/CVulkanRayTracingPipeline.cpp b/src/nbl/video/CVulkanRayTracingPipeline.cpp index a107d3bbed..960d78428a 100644 --- a/src/nbl/video/CVulkanRayTracingPipeline.cpp +++ b/src/nbl/video/CVulkanRayTracingPipeline.cpp @@ -15,17 +15,17 @@ namespace nbl::video 
ShaderGroupHandleContainer&& shaderGroupHandles) : IGPURayTracingPipeline(params), m_vkPipeline(vk_pipeline), + m_shaderGroupHandles(std::move(shaderGroupHandles)), m_missStackSizes(core::make_refctd_dynamic_array(params.shaderGroups.misses.size())), m_hitGroupStackSizes(core::make_refctd_dynamic_array(params.shaderGroups.hits.size())), - m_callableStackSizes(core::make_refctd_dynamic_array(params.shaderGroups.hits.size())), - m_shaderGroupHandles(std::move(shaderGroupHandles)) + m_callableStackSizes(core::make_refctd_dynamic_array(params.shaderGroups.hits.size())) { const auto* vulkanDevice = static_cast(getOriginDevice()); auto* vk = vulkanDevice->getFunctionTable(); - auto getVkShaderGroupStackSize = [&](uint32_t baseGroupIx, uint32_t shaderGroupIx, uint32_t shaderIx, VkShaderGroupShaderKHR shaderType) -> uint16_t + auto getVkShaderGroupStackSize = [&](uint32_t baseGroupIx, uint32_t shaderGroupIx, const asset::IShader* shader, VkShaderGroupShaderKHR shaderType) -> uint16_t { - if (shaderIx == SShaderGroupsParams::SIndex::Unused) + if (shader == nullptr) return 0; return vk->vk.vkGetRayTracingShaderGroupStackSizeKHR( @@ -36,14 +36,17 @@ namespace nbl::video ); }; - m_raygenStackSize = getVkShaderGroupStackSize(getRaygenIndex(), 0, params.shaderGroups.raygen.index, VK_SHADER_GROUP_SHADER_GENERAL_KHR); + m_callableGroupCount = params.shaderGroups.callables.size(); + m_missGroupCount = params.shaderGroups.misses.size(); + m_hitGroupCount = params.shaderGroups.hits.size(); + m_raygenStackSize = getVkShaderGroupStackSize(getRaygenIndex(), 0, params.shaderGroups.raygen.shader, VK_SHADER_GROUP_SHADER_GENERAL_KHR); for (size_t shaderGroupIx = 0; shaderGroupIx < params.shaderGroups.misses.size(); shaderGroupIx++) { m_missStackSizes->operator[](shaderGroupIx) = getVkShaderGroupStackSize( getMissBaseIndex(), shaderGroupIx, - params.shaderGroups.misses[shaderGroupIx].index, + params.shaderGroups.misses[shaderGroupIx].shader, VK_SHADER_GROUP_SHADER_GENERAL_KHR); } @@ -52,9 
+55,9 @@ namespace nbl::video const auto& hitGroup = params.shaderGroups.hits[shaderGroupIx]; const auto baseIndex = getHitBaseIndex(); m_hitGroupStackSizes->operator[](shaderGroupIx) = SHitGroupStackSize{ - .closestHit = getVkShaderGroupStackSize(baseIndex,shaderGroupIx, hitGroup.closestHit, VK_SHADER_GROUP_SHADER_CLOSEST_HIT_KHR), - .anyHit = getVkShaderGroupStackSize(baseIndex, shaderGroupIx, hitGroup.anyHit,VK_SHADER_GROUP_SHADER_ANY_HIT_KHR), - .intersection = getVkShaderGroupStackSize(baseIndex, shaderGroupIx, hitGroup.intersection, VK_SHADER_GROUP_SHADER_INTERSECTION_KHR), + .closestHit = getVkShaderGroupStackSize(baseIndex,shaderGroupIx, hitGroup.closestHit.shader, VK_SHADER_GROUP_SHADER_CLOSEST_HIT_KHR), + .anyHit = getVkShaderGroupStackSize(baseIndex, shaderGroupIx, hitGroup.anyHit.shader,VK_SHADER_GROUP_SHADER_ANY_HIT_KHR), + .intersection = getVkShaderGroupStackSize(baseIndex, shaderGroupIx, hitGroup.intersection.shader, VK_SHADER_GROUP_SHADER_INTERSECTION_KHR), }; } @@ -63,7 +66,7 @@ namespace nbl::video m_callableStackSizes->operator[](shaderGroupIx) = getVkShaderGroupStackSize( getCallableBaseIndex(), shaderGroupIx, - params.shaderGroups.callables[shaderGroupIx].index, + params.shaderGroups.callables[shaderGroupIx].shader, VK_SHADER_GROUP_SHADER_GENERAL_KHR); } } @@ -83,19 +86,19 @@ namespace nbl::video std::span CVulkanRayTracingPipeline::getMissHandles() const { const auto baseIndex = getMissBaseIndex(); - return std::span(m_shaderGroupHandles->begin() + baseIndex, m_missShaderGroups->size()); + return std::span(m_shaderGroupHandles->begin() + baseIndex, m_missGroupCount); } std::span CVulkanRayTracingPipeline::getHitHandles() const { const auto baseIndex = getHitBaseIndex(); - return std::span(m_shaderGroupHandles->begin() + baseIndex, m_hitShaderGroups->size()); + return std::span(m_shaderGroupHandles->begin() + baseIndex, m_hitGroupCount); } std::span CVulkanRayTracingPipeline::getCallableHandles() const { const auto baseIndex = 
getCallableBaseIndex(); - return std::span(m_shaderGroupHandles->begin() + baseIndex, m_callableShaderGroups->size()); + return std::span(m_shaderGroupHandles->begin() + baseIndex, m_callableGroupCount); } uint16_t CVulkanRayTracingPipeline::getRaygenStackSize() const @@ -159,13 +162,13 @@ namespace nbl::video uint32_t CVulkanRayTracingPipeline::getHitBaseIndex() const { // one raygen group + miss groups before this groups - return 1 + m_missShaderGroups->size(); + return 1 + m_missGroupCount; } uint32_t CVulkanRayTracingPipeline::getCallableBaseIndex() const { // one raygen group + miss groups + hit groups before this groups - return 1 + m_missShaderGroups->size() + m_hitShaderGroups->size(); + return 1 + m_missGroupCount + m_hitGroupCount; } } diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index c019be84a7..0056cc3a2a 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -797,10 +797,8 @@ bool ILogicalDevice::createComputePipelines(IGPUPipelineCache* const pipelineCac } core::vector newParams(params.begin(), params.end()); - const auto shaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) - { - return sum + param.getShaders().size(); - }); + const auto shaderCount = params.size(); + core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling debloatedShaders.reserve(shaderCount); From f26201e29746ecbd7fb126a9fab50cccad70b6e6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 28 May 2025 16:48:39 +0700 Subject: [PATCH 241/346] Another fix to CCOmputeBlit --- src/nbl/video/utilities/CComputeBlit.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp index edac6e1f5c..a402df2137 100644 --- a/src/nbl/video/utilities/CComputeBlit.cpp +++ b/src/nbl/video/utilities/CComputeBlit.cpp @@ -66,7 
+66,7 @@ struct ConstevalParameters }(); auto createPipeline = [&limits,layout,&common](const char* mainPath)->smart_refctd_ptr { - auto shader = make_smart_refctd_ptr( + auto shader = make_smart_refctd_ptr( (common+"\n#include \""+mainPath+"\"\n").c_str(), IShader::E_CONTENT_TYPE::ECT_HLSL, mainPath From b18c83425b548aa5cd2fca4f7d5f80127a099766 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 28 May 2025 12:02:19 +0200 Subject: [PATCH 242/346] pass NSC_IMAGE_NAME with ENV as it glitches when using CMake CLI due to . and / chars --- .github/workflows/build-nabla.yml | 4 ++-- tools/nsc/CMakeLists.txt | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index a7b0fe3e68..f93c0c270f 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -77,6 +77,7 @@ jobs: --entrypoint ${{ env.entry }} -di --isolation process ` --env-file .\docker\ci-windows.env ` --env-file .\docker\ninja.env ` + --env "NSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }}" ` --name orphan --network docker_default ` -v "${{ github.workspace }}:${{ env.mount }}" ` -v "${pipeHost}:\\.\pipe\dockerd" -e "DOCKER_HOST=npipe:////./pipe/dockerd" ` @@ -100,8 +101,7 @@ jobs: ${{ env.entry }} ${{ env.cmd }} -Command cmake ` --preset ci-configure-dynamic-${{ matrix.vendor }} ` --profiling-output=profiling/cmake-profiling.json ` - --profiling-format=google-trace ` - "-DNSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }}" + --profiling-format=google-trace - name: Container – Build NSC run: | diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index d3b8bdf94a..bcdcbca531 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -225,7 +225,9 @@ LABEL org.opencontainers.image.description="[Nabla Shader Compiler (NSC)]: @ORG_ set(DOCKERFILE "${NBL_DOCKER_CTX_DIR}/Dockerfile") file(WRITE "${DOCKERFILE}" "${INSTRUCTIONS}") 
-if(NOT DEFINED NSC_IMAGE_NAME) +if(DEFINED ENV{NSC_IMAGE_NAME}) + set(NSC_IMAGE_NAME "$ENV{NSC_IMAGE_NAME}") +else() set(NSC_IMAGE_NAME nano/godbolt/nsc) endif() From 7e6af2471090f2a6e19121e5570a3423d1ee0bcf Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 28 May 2025 15:21:30 +0200 Subject: [PATCH 243/346] upload NSC image to Github Container Registry on master push, create compose.yml for deploys --- .github/workflows/build-nabla.yml | 22 +++++++++++++++++++++- compose.yml | 16 ++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 compose.yml diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index f93c0c270f..e2ce30cc05 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -53,10 +53,15 @@ jobs: $repo = $env:GITHUB_REPOSITORY $tag = "nsc-godbolt-build-${{ matrix.vendor }}-${{ matrix.config }}-${{ matrix.tag }}" $nscTargetTaggedImage = "ghcr.io/${repo}:${tag}".ToLower() + $nscTargetTaggedImageLatest = "ghcr.io/${repo}:nsc-godbolt-latest".ToLower() + + $shouldPushImage = ("${{ github.ref }}" -eq "refs/heads/master" -and "${{ matrix.vendor }}" -eq "msvc") "prefix=$prefix" >> $env:GITHUB_OUTPUT "nscTargetTaggedImage=$nscTargetTaggedImage" >> $env:GITHUB_OUTPUT - + "nscTargetTaggedImageLatest=$nscTargetTaggedImageLatest" >> $env:GITHUB_OUTPUT + "shouldPushImage=$shouldPushImage" >> $env:GITHUB_OUTPUT + - name: Checkout uses: actions/checkout@v4 with: @@ -137,6 +142,7 @@ jobs: with: name: ${{ steps.set-prefix.outputs.prefix }}-nsc-godbolt-image path: ${{ steps.set-prefix.outputs.prefix }}-nsc-godbolt-image.tar.zst + compression-level: 0 - name: Upload profiling artifacts uses: actions/upload-artifact@v4 @@ -149,3 +155,17 @@ jobs: with: name: ${{ steps.set-prefix.outputs.prefix }}-install path: ${{ steps.set-prefix.outputs.prefix }}-install.tar + + - name: Login to GHCR + if: steps.set-prefix.outputs.shouldPushImage == 'True' + run: echo "${{ secrets.CR_PAT 
}}" | docker login ghcr.io -u $env:GITHUB_ACTOR --password-stdin + + - name: Tag Latest image + if: steps.set-prefix.outputs.shouldPushImage == 'True' + run: | + docker tag ${{ steps.set-prefix.outputs.nscTargetTaggedImage }} ${{ steps.set-prefix.outputs.nscTargetTaggedImageLatest }} + + - name: Push images to GHCR + if: steps.set-prefix.outputs.shouldPushImage == 'True' + run: | + docker push ${{ steps.set-prefix.outputs.nscTargetTaggedImageLatest }} \ No newline at end of file diff --git a/compose.yml b/compose.yml new file mode 100644 index 0000000000..8d6f1bc64a --- /dev/null +++ b/compose.yml @@ -0,0 +1,16 @@ +services: + nsc: + container_name: nsc-godbolt + image: ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-latest + isolation: process + ports: + - "80:10240" + volumes: + - type: bind + source: C:\Windows\Globalization\ICU + target: C:\Windows\Globalization\ICU + read_only: true + - type: bind + source: C:\Windows\System32 + target: C:\mount\Windows\System32 + read_only: true \ No newline at end of file From e5b229ac6dd960fabfec8b83c8af8f5bdab41620 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 28 May 2025 15:36:47 +0200 Subject: [PATCH 244/346] lock on push, update .github/workflows/build-nabla.yml --- .github/workflows/build-nabla.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index e2ce30cc05..9f1e203f1e 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -5,6 +5,10 @@ on: pull_request: workflow_dispatch: +concurrency: + group: push-lock-${{ github.ref }} + cancel-in-progress: true + jobs: build-windows: runs-on: windows-2022 From 9328fd434a07e8ef24b382ba6a21dd37b671cb5d Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 28 May 2025 16:48:20 +0200 Subject: [PATCH 245/346] update shouldPushImage logic --- .github/workflows/build-nabla.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff 
--git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 9f1e203f1e..67fc9c4401 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -59,8 +59,14 @@ jobs: $nscTargetTaggedImage = "ghcr.io/${repo}:${tag}".ToLower() $nscTargetTaggedImageLatest = "ghcr.io/${repo}:nsc-godbolt-latest".ToLower() - $shouldPushImage = ("${{ github.ref }}" -eq "refs/heads/master" -and "${{ matrix.vendor }}" -eq "msvc") + $shouldPushImage = ( + "${{ github.ref }}" -eq "refs/heads/master" -and + "${{ matrix.vendor }}" -eq "msvc" -and + "${{ matrix.config }}" -eq "Release" + ) + Write-Host "::notice::Should push image? $shouldPushImage" + "prefix=$prefix" >> $env:GITHUB_OUTPUT "nscTargetTaggedImage=$nscTargetTaggedImage" >> $env:GITHUB_OUTPUT "nscTargetTaggedImageLatest=$nscTargetTaggedImageLatest" >> $env:GITHUB_OUTPUT From 127c6d9593baa2dc950d9c76c80bf405ae6c76f2 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 29 May 2025 17:29:27 +0700 Subject: [PATCH 246/346] some fixes to indexing --- examples_tests | 2 +- .../hlsl/workgroup2/arithmetic_config.hlsl | 6 ++++- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 25 ++++++++----------- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/examples_tests b/examples_tests index 3d63ed7328..f202ef5632 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 3d63ed732838c3073dfb7993d3eb1305fb5882be +Subproject commit f202ef563249c172d4a6c699379c6793ae939863 diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index c0e105e700..2f1a8b06a0 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -101,9 +101,13 @@ struct ArithmeticConfiguration return sharedStoreIndex(virtualID); } + template static uint32_t sharedLoadIndex(const uint32_t invocationIndex, const uint32_t component) { - return 
component * SubgroupSize + invocationIndex; + if (level == LevelCount-1) + return component * SubgroupSize + invocationIndex; + else + return component * __SubgroupsPerVirtualWorkgroup + invocationIndex; } }; diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 195431c5d3..1d386835b9 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -124,7 +124,7 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::sharedLoadIndex(invocationIndex, i),lv1_val[i]); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); lv1_val = reduction1(lv1_val); if (Config::electLast()) @@ -183,15 +183,14 @@ struct scan if (glsl::gl_SubgroupID() == 0) { vector_lv1_t lv1_val; - const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::sharedLoadIndex(invocationIndex, i)-1,lv1_val[i]); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i)-1,lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(Config::sharedLoadIndex(invocationIndex, i),lv1_val[i]); + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -253,11 +252,11 @@ struct reduce vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::sharedLoadIndex(invocationIndex, i),lv1_val[i]); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); lv1_val 
= reduction1(lv1_val); if (Config::electLast()) { - const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(invocationIndex); + const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(glsl::gl_SubgroupID()); scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } @@ -270,7 +269,7 @@ struct reduce vector_lv2_t lv2_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(lv1_smem_size+Config::sharedLoadIndex(invocationIndex, i),lv2_val[i]); + scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); lv2_val = reduction2(lv2_val); if (Config::electLast()) scratchAccessor.template set(0, lv2_val[Config::ItemsPerInvocation_2-1]); @@ -309,15 +308,14 @@ struct scan if (glsl::gl_SubgroupID() < Config::SubgroupsSize*Config::ItemsPerInvocation_2) { vector_lv1_t lv1_val; - const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::sharedLoadIndex(invocationIndex, i)-1,lv1_val[i]); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i)-1,lv1_val[i]); lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(Config::sharedLoadIndex(invocationIndex, i),lv1_val[i]); + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); if (Config::electLast()) { const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(glsl::gl_SubgroupID()); @@ -331,15 +329,14 @@ struct scan if (glsl::gl_SubgroupID() == 0) { vector_lv2_t lv2_val; - const uint32_t prevIndex = invocationIndex-1; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template 
get(lv1_smem_size+Config::sharedLoadIndex(invocationIndex, i)-1,lv2_val[i]); + scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i)-1,lv2_val[i]); lv2_val[0] = hlsl::mix(BinOp::identity, lv2_val[0], bool(invocationIndex)); lv2_val = inclusiveScan2(lv2_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template set(lv1_smem_size+Config::sharedLoadIndex(invocationIndex, i),lv2_val[i]); + scratchAccessor.template set(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -357,7 +354,7 @@ struct scan [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(Config::sharedLoadIndex(invocationIndex, i), binop(lv1_val[i],lv2_scan)); + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i), binop(lv1_val[i],lv2_scan)); } // combine with level 0 From 52c7db99f99c3f349eb29675941184c606ff7269 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Thu, 29 May 2025 14:05:05 +0200 Subject: [PATCH 247/346] Update tools/nsc/docker/README.md --- tools/nsc/docker/README.md | 87 +++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 7 deletions(-) diff --git a/tools/nsc/docker/README.md b/tools/nsc/docker/README.md index 21f8f4e06d..a18fe48c1f 100644 --- a/tools/nsc/docker/README.md +++ b/tools/nsc/docker/README.md @@ -1,16 +1,89 @@ -# NSC Docker Godbolt +# NSC & Godbolt integration -## Run NSC tool straight from build directory in compiler explorer docker container! +## Run Compiler Explorer with NSC tool in docker container! -Currently only Windows platform with target *x86_64* architecture is supported. Tested with Hyper-V isolation mode. 
+https://github.com/user-attachments/assets/8d409477-92e4-4238-b5e5-637cfbdf7263 -### Requirements +## Requirements -- [***Docker Desktop***](https://www.docker.com/products/docker-desktop/) +- Configured [***Docker***](https://docs.docker.com/desktop/setup/install/windows-install/) for Windows Containers +- [Windows, Windows Server Core or Windows Server]() with **minumum** x86_64 10.0.20348 build (2022 distributions) -### How To +> [!TIP] +> type `cmd /ver` to see your build version -Switch docker to windows containers, configure CMake with `NBL_ENABLE_DOCKER_INTEGRATION` option (recommended Visual Studio generator) & build `run-compiler-explorer` target. After the build completes type `localhost` in your browser. +> [!CAUTION] +> Hyper-V is **NOT** supported, you must run NSC Godbolt container as process + +## How to run image + +> [!IMPORTANT] +> If using Docker Desktop - first make sure you have switched to `Containers for Windows`, see image bellow. If you are CLI user and have client & daemon headless then use appropriate windows build context. ![Containers for Windows](https://user-images.githubusercontent.com/65064509/152947300-affca592-35a7-4e4c-a7fc-2055ce1ba528.png) +> [!CAUTION] +> Examples bellow use `docker compose` to run the image but if you want to `docker run` then make sure to mount required system directories and expose port otherwise will fail, see the compose file for more details + +### from container registry + +execute + +```powershell +curl -L https://raw.githubusercontent.com/Devsh-Graphics-Programming/Nabla/master/compose.yml | docker compose -f - up +``` + +or in Nabla checkout + +```powershell +docker compose up +``` + +and type `localhost` in your browser. + +### from Nabla pipeline workflow artifacts + +> [!NOTE] +> We publish container images to the GitHub Container Registry that include **only the Release variant** of NSC executables built with **MSVC**. +> However, our CI pipelines **build and test all configurations**. 
Compressed images for each configuration are uploaded as **workflow artifacts**. +> Look for artifacts named: +> `-msvc--nsc-godbolt-image` + +> [!NOTE] +> To decompress image artifact you need [zstd]() + +Download workflow image artifact, unzip and + +```powershell +zstd -d < -msvc--nsc-godbolt-image.tar.zst | docker load +``` + +
+Docker load example (click to expand) + +``` +C:\Users\anastaziuk\Desktop\DevshGraphicsProgramming\Nabla\tools\nsc\docker>zstd -d < run-windows-17.13.6-msvc-Debug-nsc-godbolt-image.tar.zst | docker load +b2ebf78c3627: Loading layer [==================================================>] 3.149MB/3.149MB +4c201e14cc01: Loading layer [==================================================>] 77.4MB/77.4MB +68a216251b8f: Loading layer [==================================================>] 61.95kB/61.95kB +7a4e13ca4c4e: Loading layer [==================================================>] 52.74kB/52.74kB +634001f55b21: Loading layer [==================================================>] 52.74kB/52.74kB +6a609178bb9a: Loading layer [==================================================>] 52.74kB/52.74kB +3d7afb042308: Loading layer [==================================================>] 52.74kB/52.74kB +ca034d7bc58a: Loading layer [==================================================>] 52.74kB/52.74kB +55b4134a1ae9: Loading layer [==================================================>] 52.74kB/52.74kB +0648adff3faa: Loading layer [==================================================>] 52.74kB/52.74kB +Loaded image: ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-build-msvc-debug-17.13.6 +``` + +
+ +copy `compose.yml` in Nabla root directory to eg. `override-compose.yml`, replace it's `image` field value with loaded image name (eg. `ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-build-msvc-debug-17.13.6` like in the example) and execute + +``` +docker compose -f override-compose.yml up +``` + +## How to build image + +Configure CMake with `NBL_ENABLE_DOCKER_INTEGRATION` and build `run-compiler-explorer` target. From 531784f6daa447884f7d155c9f2dffc4f0abb85e Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Thu, 29 May 2025 14:08:59 +0200 Subject: [PATCH 248/346] post tools/nsc/docker/README.md updates --- tools/nsc/docker/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/nsc/docker/README.md b/tools/nsc/docker/README.md index a18fe48c1f..afd8b0f8b7 100644 --- a/tools/nsc/docker/README.md +++ b/tools/nsc/docker/README.md @@ -23,7 +23,7 @@ https://github.com/user-attachments/assets/8d409477-92e4-4238-b5e5-637cfbdf7263 ![Containers for Windows](https://user-images.githubusercontent.com/65064509/152947300-affca592-35a7-4e4c-a7fc-2055ce1ba528.png) > [!CAUTION] -> Examples bellow use `docker compose` to run the image but if you want to `docker run` then make sure to mount required system directories and expose port otherwise will fail, see the compose file for more details +> Examples bellow use `docker compose` to run the image but if you want to `docker run` then make sure to mount required system directories and expose port otherwise it will fail in runtime, see the [compose]() file for more details ### from container registry @@ -78,12 +78,14 @@ Loaded image: ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-build-msvc-de -copy `compose.yml` in Nabla root directory to eg. `override-compose.yml`, replace it's `image` field value with loaded image name (eg. 
`ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-build-msvc-debug-17.13.6` like in the example) and execute +copy `compose.yml` in Nabla root directory to eg. `override-compose.yml`, replace it's `image` field value with loaded image name (eg. `ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-build-msvc-debug-17.13.6` like in the example) then execute ``` docker compose -f override-compose.yml up ``` +and type `localhost` in your browser. + ## How to build image Configure CMake with `NBL_ENABLE_DOCKER_INTEGRATION` and build `run-compiler-explorer` target. From edac59f31c8edfdafd05db4e7961bb5c14435713 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 29 May 2025 20:44:35 +0700 Subject: [PATCH 249/346] Fix AssetConvert to use the current SpecInfo --- src/nbl/video/utilities/CAssetConverter.cpp | 75 +++++++++++---------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index d1615a4637..ce46d5a9a8 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -519,8 +519,8 @@ class AssetVisitor : public CRTP if (!layout || !descend(layout,{layout})) return false; const auto& specInfo = asset->getSpecInfo(); - const auto* shader = specInfo.shader; - if (!shader || !descend(shader,{shader},specInfo)) + const auto* shader = specInfo.shader.get(); + if (!shader || !descend(shader,{shader},specInfo, hlsl::ESS_COMPUTE)) return false; return true; } @@ -536,8 +536,8 @@ class AssetVisitor : public CRTP using stage_t = hlsl::ShaderStage; for (stage_t stage : {stage_t::ESS_VERTEX,stage_t::ESS_TESSELLATION_CONTROL,stage_t::ESS_TESSELLATION_EVALUATION,stage_t::ESS_GEOMETRY,stage_t::ESS_FRAGMENT}) { - const auto& specInfo = asset->getSpecInfo(stage); - const auto* shader = specInfo.shader; + const auto& specInfo = asset->getSpecInfos(stage); + const auto* shader = specInfo[0].shader.get(); if (!shader) { if 
(stage==stage_t::ESS_VERTEX) // required @@ -545,7 +545,7 @@ class AssetVisitor : public CRTP CRTP::template nullOptional(); continue; } - if (!descend(shader,{shader},specInfo)) + if (!descend(shader,{shader},specInfo[0], stage)) return false; } return true; @@ -1035,25 +1035,18 @@ class HashVisit : public CAssetConverter::CHashCache::hash_impl_base auto argTuple = std::tuple(extraArgs...); const auto& arg0 = std::get<0>(argTuple); // hash the spec info - if constexpr (std::is_same_v) + if constexpr (std::is_same_v) { + const auto stage = std::get<1>(argTuple); hasher << arg0.entryPoint; - hasher << arg0.stage; + hasher << stage; hasher << arg0.requiredSubgroupSize; - switch (arg0.stage) + if (!arg0.entries.empty()) { - case hlsl::ShaderStage::ESS_COMPUTE: - hasher << arg0.requireFullSubgroups; - break; - default: - break; - } - if (arg0.entries) - { - for (const auto& specConstant : *arg0.entries) + for (const auto& specConstant : arg0.entries) { hasher << specConstant.first; - hasher.update(specConstant.second.data, specConstant.second.size); + hasher.update(specConstant.second.data(), specConstant.second.size()); } } } @@ -1303,6 +1296,8 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_tgetCachedCreationParams(); + hasher << params.requireFullSubgroups; return true; } bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t lookup) @@ -1718,16 +1713,14 @@ class GetDependantVisit : public GetDependantVisitBase::value*/sizeof(IShader::E_SHADER_STAGE)*8> specInfo = {}; + ICPUPipelineBase::SShaderSpecInfo specInfo = {}; protected: bool descend_impl( @@ -1743,18 +1736,16 @@ class GetDependantVisit : public GetDependantVisitBase& user, const CAssetConverter::patch_t& userPatch, - const instance_t& dep, const CAssetConverter::patch_t& soloPatch, const IPipelineBase::SShaderSpecInfo& inSpecInfo + const instance_t& dep, const CAssetConverter::patch_t& soloPatch, const ICPUPipelineBase::SShaderSpecInfo& inSpecInfo, hlsl::ShaderStage stage ) { 
auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; - getSpecInfo(inSpecInfo.stage) = { - .shader = depObj.get(), + getSpecInfo() = ICPUPipelineBase::SShaderSpecInfo{ + .shader = depObj, .entryPoint = inSpecInfo.entryPoint, // warning: its a `string_view` now! - .stage = inSpecInfo.stage, .requiredSubgroupSize = inSpecInfo.requiredSubgroupSize, - .requireFullSubgroups = inSpecInfo.requireFullSubgroups, .entries = inSpecInfo.entries }; return true; @@ -1775,7 +1766,7 @@ class GetDependantVisit : public GetDependantVisitBase::value*/sizeof(IShader::E_SHADER_STAGE)*8> specInfo = {}; + std::array::value*/sizeof(IShader::E_SHADER_STAGE)*8> specInfo = {}; // optionals (done this way because inheritance chain with templated class hides protected methods) IGPURenderpass* renderpass = nullptr; @@ -1793,18 +1784,16 @@ class GetDependantVisit : public GetDependantVisitBase& user, const CAssetConverter::patch_t& userPatch, - const instance_t& dep, const CAssetConverter::patch_t& soloPatch, const IPipelineBase::SShaderSpecInfo& inSpecInfo + const instance_t& dep, const CAssetConverter::patch_t& soloPatch, const ICPUPipelineBase::SShaderSpecInfo& inSpecInfo, hlsl::ShaderStage stage ) { auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; - getSpecInfo(inSpecInfo.stage) = { - .shader = depObj.get(), + getSpecInfo(stage) = { + .shader = depObj, .entryPoint = inSpecInfo.entryPoint, // warning: its a `string_view` now! - .stage = inSpecInfo.stage, .requiredSubgroupSize = inSpecInfo.requiredSubgroupSize, - .requireFullSubgroups = 0, .entries = inSpecInfo.entries }; return true; @@ -3120,12 +3109,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult continue; // ILogicalDevice::createComputePipelines is rather aggressive on the spec constant validation, so we create one pipeline at a time core::smart_refctd_ptr ppln; + IGPUPipelineBase::SShaderEntryMap entryMap; { // no derivatives, special flags, etc. 
IGPUComputePipeline::SCreationParams params = {}; params.layout = visitor.layout; // while there are patches possible for shaders, the only patch which can happen here is changing a stage from UNKNOWN to COMPUTE - params.shader = visitor.getSpecInfo(IShader::E_SHADER_STAGE::ESS_COMPUTE); + params.shader = IGPUPipelineBase::SShaderSpecInfo::create(visitor.getSpecInfo(), entryMap); device->createComputePipelines(inputs.pipelineCache,{¶ms,1},&ppln); } assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); @@ -3148,7 +3138,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } if constexpr (std::is_same_v) { - core::vector tmpSpecInfo; + core::vector tmpSpecInfo; tmpSpecInfo.reserve(5); for (auto& entry : conversionRequests) { @@ -3170,6 +3160,12 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { // no derivatives, special flags, etc. IGPUGraphicsPipeline::SCreationParams params = {}; + using SShaderEntryMap = IGPUPipelineBase::SShaderEntryMap; + SShaderEntryMap vertexEntryMap; + SShaderEntryMap tesselationControlEntryMap; + SShaderEntryMap tesselationEvaluationEntryMap; + SShaderEntryMap geometryEntryMap; + SShaderEntryMap fragmentEntryMap; bool depNotFound = false; { params.layout = visitor.layout; @@ -3183,7 +3179,12 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if (info.shader) tmpSpecInfo.push_back(std::move(info)); } - params.shaders = tmpSpecInfo; + using GPUShaderSpecInfo = IGPUPipelineBase::SShaderSpecInfo; + params.vertexShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_VERTEX), vertexEntryMap); + params.tesselationControlShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_TESSELLATION_CONTROL), tesselationControlEntryMap); + params.tesselationEvaluationShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_TESSELLATION_EVALUATION), tesselationEvaluationEntryMap); + params.geometryShader = 
GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_GEOMETRY), geometryEntryMap); + params.fragmentShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_FRAGMENT), fragmentEntryMap); } params.cached = asset->getCachedCreationParams(); device->createGraphicsPipelines(inputs.pipelineCache,{¶ms,1},&ppln); From a31cc66ddf18268a14c82e8410cf72ff95e161b9 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 29 May 2025 20:45:33 +0700 Subject: [PATCH 250/346] Small fixes to asset and video --- include/nbl/asset/ICPUComputePipeline.h | 14 +++++++++++- include/nbl/asset/ICPUGraphicsPipeline.h | 4 ++-- include/nbl/asset/ICPUPipeline.h | 6 ++--- .../asset/ICPURenderpassIndependentPipeline.h | 4 ++-- include/nbl/asset/IComputePipeline.h | 1 - include/nbl/video/IGPUPipeline.h | 22 +++++++++++++++++-- 6 files changed, 40 insertions(+), 11 deletions(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index b940c2ae48..69bffe2bba 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -39,13 +39,25 @@ class ICPUComputePipeline final : public ICPUPipeline getSpecInfo(hlsl::ShaderStage stage) const override + inline std::span getSpecInfos(hlsl::ShaderStage stage) const override { if (stage==hlsl::ShaderStage::ESS_COMPUTE) return {&m_specInfo,1}; return {}; } + inline SShaderSpecInfo& getSpecInfo() + { + return m_specInfo; + } + + inline const SShaderSpecInfo& getSpecInfo() const + { + return m_specInfo; + } + + inline SCachedCreationParams& getCachedCreationParamsMut() { return m_params; } + inline bool valid() const override { if (!m_layout) return false; diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 4a1520880d..a17bebe87d 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -39,13 +39,13 @@ class ICPUGraphicsPipeline final : public ICPUPipeline getSpecInfo(hlsl::ShaderStage 
stage) const override final + inline virtual std::span getSpecInfos(hlsl::ShaderStage stage) const override final { const auto stageIndex = stageToIndex(stage); if (stageIndex != -1) diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 069c9fc35e..0642acb676 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -93,7 +93,7 @@ class ICPUPipelineBase } }; - virtual std::span getSpecInfo(hlsl::ShaderStage stage) const = 0; + virtual std::span getSpecInfos(hlsl::ShaderStage stage) const = 0; }; @@ -131,11 +131,11 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe return clone_impl(std::move(layout), _depth); } - // Note(kevinyu): For some reason overload resolution cannot find this function when I name id getSpecInfo. It always use the const variant. Will check on it later. + // Note(kevinyu): For some reason overload resolution cannot find this function when I name id getSpecInfos. It always use the const variant. Will check on it later. 
inline std::span getSpecInfoMut(hlsl::ShaderStage stage) { if (!isMutable()) return {}; - const auto specInfo = const_cast(this)->getSpecInfo(stage); + const auto specInfo = const_cast(this)->getSpecInfos(stage); return { const_cast(specInfo.data()), specInfo.size() }; } diff --git a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index fbff6ee312..83536e0c54 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -105,7 +105,7 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, #if 0 // The getters are weird because the shader pointer needs patching - inline IShader::SSpecInfo getSpecInfo(const hlsl::ShaderStage stage) + inline IShader::SSpecInfo getSpecInfos(const hlsl::ShaderStage stage) { assert(isMutable()); const auto stageIx = hlsl::findLSB(stage); @@ -113,7 +113,7 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, return {}; return m_infos[stageIx]; } - inline IShader::SSpecInfo getSpecInfo(const hlsl::ShaderStage stage) const + inline IShader::SSpecInfo getSpecInfos(const hlsl::ShaderStage stage) const { const auto stageIx = hlsl::findLSB(stage); if (stageIx<0 || stageIx>=GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) diff --git a/include/nbl/asset/IComputePipeline.h b/include/nbl/asset/IComputePipeline.h index 2cb38b39f1..ba4d245473 100644 --- a/include/nbl/asset/IComputePipeline.h +++ b/include/nbl/asset/IComputePipeline.h @@ -24,7 +24,6 @@ class IComputePipeline : public IPipeline, public IComputePi public: inline const SCachedCreationParams& getCachedCreationParams() const { return m_params; } - inline SCachedCreationParams& getCachedCreationParams() { return m_params; } protected: explicit IComputePipeline(PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : diff --git a/include/nbl/video/IGPUPipeline.h 
b/include/nbl/video/IGPUPipeline.h index f2e9b79fef..0b56b87ee9 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -8,6 +8,7 @@ #include "nbl/video/IGPUPipelineLayout.h" #include "nbl/video/SPipelineCreationParams.h" +#include "nbl/asset/ICPUPipeline.h" #include "nbl/asset/IPipeline.h" namespace nbl::video @@ -17,6 +18,7 @@ class IGPUPipelineBase { public: struct SShaderSpecInfo { + //! Structure specifying a specialization map entry /* Note that if specialization constant ID is used @@ -93,18 +95,34 @@ class IGPUPipelineBase { asset::IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize = asset::IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement - // Container choice implicitly satisfies: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 - const core::unordered_map* entries; + using entry_map_t = core::unordered_map; + const entry_map_t* entries; // By requiring Nabla Core Profile features we implicitly satisfy: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02784 // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02785 // Also because our API is sane, it satisfies the following by construction: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 + + static inline SShaderSpecInfo create(const asset::ICPUPipelineBase::SShaderSpecInfo& cpuSpecInfo, entry_map_t& outEntries) + { + SShaderSpecInfo specInfo; + specInfo.shader = cpuSpecInfo.shader.get(); + specInfo.entryPoint = cpuSpecInfo.entryPoint; + specInfo.requiredSubgroupSize = cpuSpecInfo.requiredSubgroupSize; + for (const auto&[key, value] : cpuSpecInfo.entries) + 
{ + outEntries.insert({ key, { value.data(), value.size() } }); + } + specInfo.entries = &outEntries; + return specInfo; + }; }; + using SShaderEntryMap = SShaderSpecInfo::entry_map_t; + }; // Common Base class for pipelines From 08ece5d33d07391a85be35905d508dc2359efb6e Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 29 May 2025 20:49:19 +0700 Subject: [PATCH 251/346] Fix CComputeBlit --- src/nbl/video/utilities/CComputeBlit.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp index a402df2137..ade127b790 100644 --- a/src/nbl/video/utilities/CComputeBlit.cpp +++ b/src/nbl/video/utilities/CComputeBlit.cpp @@ -83,7 +83,7 @@ struct ConstevalParameters .entryPoint = "main", .requiredSubgroupSize = static_cast(findMSB(limits.maxSubgroupSize)), }; - pipeline->getCachedCreationParams() = { + pipeline->getCachedCreationParamsMut() = { .requireFullSubgroups = true, }; return pipeline; From 75530d4bc6e3297613a841f2d2b7929814a6d720 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Thu, 29 May 2025 19:58:08 +0200 Subject: [PATCH 252/346] Create run-nsc.yml workflow --- .github/workflows /run-nsc.yml | 206 +++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 .github/workflows /run-nsc.yml diff --git a/.github/workflows /run-nsc.yml b/.github/workflows /run-nsc.yml new file mode 100644 index 0000000000..07be0d44e9 --- /dev/null +++ b/.github/workflows /run-nsc.yml @@ -0,0 +1,206 @@ +name: Run NSC Godbolt Container + +on: + workflow_dispatch: + inputs: + run_id: + description: "The id of the workflow run where the desired download artifact was uploaded from" + required: true + build_config: + description: "Build configuration (Release / RelWithDebInfo / Debug)" + required: true + default: "Release" + type: choice + options: + - Release + - RelWithDebInfo + - Debug + withDiscordMSG: + 
description: "Send Discord message after tunnel is up" + required: true + default: true + type: boolean + +jobs: + run-container: + runs-on: windows-2022 + env: + DISCORD_WEBHOOK: ${{ secrets.DC_ACTIONS_WEBHOOK }} + + steps: + - name: Environment Setup + run: | + Add-MpPreference -ExclusionPath "${{ github.workspace }}" + Add-MpPreference -ExclusionExtension "*.*" + Add-MpPreference -ExclusionProcess "docker.exe" + Add-MpPreference -ExclusionProcess "dockerd.exe" + Set-MpPreference -RemediationScheduleDay 8 + Set-MpPreference -DisableRealtimeMonitoring $true + Set-MpPreference -DisableRemovableDriveScanning $true + Set-MpPreference -DisableArchiveScanning $true + Set-MpPreference -DisableScanningMappedNetworkDrivesForFullScan $true + + if (-not (docker network ls --format '{{.Name}}' | Where-Object { $_ -eq 'docker_default' })) { + docker network create --driver nat docker_default + if ($LASTEXITCODE -ne 0) { exit 1 } + } + + - name: Download NSC Godbolt artifact + uses: actions/download-artifact@v4 + with: + run-id: ${{ inputs.run_id }} + pattern: run-windows-*-msvc-${{ inputs.build_config }}-nsc-godbolt-image + path: artifact + merge-multiple: true + github-token: ${{ secrets.READ_PAT }} + repository: Devsh-Graphics-Programming/Nabla + + - name: Decompress .tar.zst + run: | + Get-ChildItem artifact -Filter *.tar.zst | ForEach-Object { + $output = $_.FullName -replace '\.zst$', '' + zstd -d "$($_.FullName)" -o "$output" + } + + - name: Load Docker image + run: | + $image = Get-ChildItem artifact -Filter *.tar | Select-Object -First 1 + docker load -i $image.FullName + + - name: Generate and run Docker Compose with matched image + run: | + $imageName = docker image ls --format "{{.Repository}}:{{.Tag}}" | + Where-Object { $_ -like "ghcr.io/devsh-graphics-programming/nabla:nsc-*" } | + Select-Object -First 1 + + if (-not $imageName) { + Write-Error "Could not find image with tag matching ghcr.io/devsh-graphics-programming/nabla:nsc-*" + exit 1 + } + + Write-Host 
"Found image: $imageName" + + @" + services: + nsc: + container_name: nsc-godbolt + image: $imageName + isolation: process + ports: + - "10240:10240" + volumes: + - type: bind + source: C:\Windows\Globalization\ICU + target: C:\Windows\Globalization\ICU + read_only: true + - type: bind + source: C:\Windows\System32 + target: C:\mount\Windows\System32 + read_only: true + networks: + - docker_default + + networks: + docker_default: + external: true + "@ | Set-Content compose.generated.yml + + docker compose -f compose.generated.yml up -d + + - name: Wait for local server on port 10240 + run: | + $maxRetries = 24 + $retryDelay = 5 + $success = $false + + for ($i = 0; $i -lt $maxRetries; $i++) { + try { + $response = Invoke-WebRequest -Uri "http://localhost:10240" -UseBasicParsing -TimeoutSec 5 + if ($response.StatusCode -eq 200) { + Write-Host "Local server is up and responding." + $success = $true + break + } else { + Write-Host "Received HTTP $($response.StatusCode), retrying..." + } + } catch { + Write-Host "Local server not responding yet, retrying..." + } + Start-Sleep -Seconds $retryDelay + } + + if (-not $success) { + Write-Error "Local server on port 10240 did not respond within timeout." 
+ exit 1 + } + + - name: Print Container Logs + run: | + docker logs nsc-godbolt + + - name: Download cloudflared + run: | + Invoke-WebRequest -Uri https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe -OutFile cloudflared.exe + + - name: Start tunnel + run: | + Start-Process -NoNewWindow -FilePath .\cloudflared.exe -ArgumentList "tunnel", "--url", "http://localhost:10240", "--logfile", "cf.log" + + $tries = 60 + $url = $null + + while ($tries -gt 0) { + if (Test-Path cf.log) { + $log = Get-Content cf.log + foreach ($line in $log) { + if ($line -match 'https:\/\/[a-zA-Z0-9\-]+\.trycloudflare\.com') { + $url = $Matches[0] + Write-Host "::notice title=Tunnel URL::$url" + break + } + } + if ($url) { break } + } + Start-Sleep -Seconds 1 + $tries -= 1 + } + + if (-not $url) { + Write-Error "Could not get tunnel URL from cloudflared log" + exit 1 + } + + $webhookUrl = "$env:DISCORD_WEBHOOK" + $runId = "${{ inputs.run_id }}" + $actor = "$env:GITHUB_ACTOR" + $startTime = (Get-Date -Format "yyyy-MM-dd HH:mm:ss") + $composedURL = "https://github.com/Devsh-Graphics-Programming/Nabla/actions/runs/$runId" + $workflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + $sendDiscord = "${{ inputs.withDiscordMSG }}" -eq "true" + + $description = @" + - tunnel opened for 5 hours, click [here](<$url>) to connect + - workflow [logs #${{ github.run_id }}](<$workflowRunURL>) + - image downloaded from [run #$runId](<$composedURL>) + - dispatched by $actor + "@ + + $payload = @{ + embeds = @( + @{ + title = "Running NSC Godbolt Container" + description = $description + color = 15844367 + footer = @{ + text = "sent from GitHub Actions runner" + } + timestamp = (Get-Date).ToString("o") + } + ) + } | ConvertTo-Json -Depth 10 + + if ($sendDiscord) { + Invoke-RestMethod -Uri $webhookUrl -Method Post -ContentType 'application/json' -Body $payload + } + + Start-Sleep -Seconds 18000 From 
4ec5bac4273992780aec843975b90bd77c1b8a5f Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Thu, 29 May 2025 20:04:44 +0200 Subject: [PATCH 253/346] Delete .github/workflows directory --- .github/workflows /run-nsc.yml | 206 --------------------------------- 1 file changed, 206 deletions(-) delete mode 100644 .github/workflows /run-nsc.yml diff --git a/.github/workflows /run-nsc.yml b/.github/workflows /run-nsc.yml deleted file mode 100644 index 07be0d44e9..0000000000 --- a/.github/workflows /run-nsc.yml +++ /dev/null @@ -1,206 +0,0 @@ -name: Run NSC Godbolt Container - -on: - workflow_dispatch: - inputs: - run_id: - description: "The id of the workflow run where the desired download artifact was uploaded from" - required: true - build_config: - description: "Build configuration (Release / RelWithDebInfo / Debug)" - required: true - default: "Release" - type: choice - options: - - Release - - RelWithDebInfo - - Debug - withDiscordMSG: - description: "Send Discord message after tunnel is up" - required: true - default: true - type: boolean - -jobs: - run-container: - runs-on: windows-2022 - env: - DISCORD_WEBHOOK: ${{ secrets.DC_ACTIONS_WEBHOOK }} - - steps: - - name: Environment Setup - run: | - Add-MpPreference -ExclusionPath "${{ github.workspace }}" - Add-MpPreference -ExclusionExtension "*.*" - Add-MpPreference -ExclusionProcess "docker.exe" - Add-MpPreference -ExclusionProcess "dockerd.exe" - Set-MpPreference -RemediationScheduleDay 8 - Set-MpPreference -DisableRealtimeMonitoring $true - Set-MpPreference -DisableRemovableDriveScanning $true - Set-MpPreference -DisableArchiveScanning $true - Set-MpPreference -DisableScanningMappedNetworkDrivesForFullScan $true - - if (-not (docker network ls --format '{{.Name}}' | Where-Object { $_ -eq 'docker_default' })) { - docker network create --driver nat docker_default - if ($LASTEXITCODE -ne 0) { exit 1 } - } - - - name: Download NSC Godbolt artifact - uses: 
actions/download-artifact@v4 - with: - run-id: ${{ inputs.run_id }} - pattern: run-windows-*-msvc-${{ inputs.build_config }}-nsc-godbolt-image - path: artifact - merge-multiple: true - github-token: ${{ secrets.READ_PAT }} - repository: Devsh-Graphics-Programming/Nabla - - - name: Decompress .tar.zst - run: | - Get-ChildItem artifact -Filter *.tar.zst | ForEach-Object { - $output = $_.FullName -replace '\.zst$', '' - zstd -d "$($_.FullName)" -o "$output" - } - - - name: Load Docker image - run: | - $image = Get-ChildItem artifact -Filter *.tar | Select-Object -First 1 - docker load -i $image.FullName - - - name: Generate and run Docker Compose with matched image - run: | - $imageName = docker image ls --format "{{.Repository}}:{{.Tag}}" | - Where-Object { $_ -like "ghcr.io/devsh-graphics-programming/nabla:nsc-*" } | - Select-Object -First 1 - - if (-not $imageName) { - Write-Error "Could not find image with tag matching ghcr.io/devsh-graphics-programming/nabla:nsc-*" - exit 1 - } - - Write-Host "Found image: $imageName" - - @" - services: - nsc: - container_name: nsc-godbolt - image: $imageName - isolation: process - ports: - - "10240:10240" - volumes: - - type: bind - source: C:\Windows\Globalization\ICU - target: C:\Windows\Globalization\ICU - read_only: true - - type: bind - source: C:\Windows\System32 - target: C:\mount\Windows\System32 - read_only: true - networks: - - docker_default - - networks: - docker_default: - external: true - "@ | Set-Content compose.generated.yml - - docker compose -f compose.generated.yml up -d - - - name: Wait for local server on port 10240 - run: | - $maxRetries = 24 - $retryDelay = 5 - $success = $false - - for ($i = 0; $i -lt $maxRetries; $i++) { - try { - $response = Invoke-WebRequest -Uri "http://localhost:10240" -UseBasicParsing -TimeoutSec 5 - if ($response.StatusCode -eq 200) { - Write-Host "Local server is up and responding." 
- $success = $true - break - } else { - Write-Host "Received HTTP $($response.StatusCode), retrying..." - } - } catch { - Write-Host "Local server not responding yet, retrying..." - } - Start-Sleep -Seconds $retryDelay - } - - if (-not $success) { - Write-Error "Local server on port 10240 did not respond within timeout." - exit 1 - } - - - name: Print Container Logs - run: | - docker logs nsc-godbolt - - - name: Download cloudflared - run: | - Invoke-WebRequest -Uri https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe -OutFile cloudflared.exe - - - name: Start tunnel - run: | - Start-Process -NoNewWindow -FilePath .\cloudflared.exe -ArgumentList "tunnel", "--url", "http://localhost:10240", "--logfile", "cf.log" - - $tries = 60 - $url = $null - - while ($tries -gt 0) { - if (Test-Path cf.log) { - $log = Get-Content cf.log - foreach ($line in $log) { - if ($line -match 'https:\/\/[a-zA-Z0-9\-]+\.trycloudflare\.com') { - $url = $Matches[0] - Write-Host "::notice title=Tunnel URL::$url" - break - } - } - if ($url) { break } - } - Start-Sleep -Seconds 1 - $tries -= 1 - } - - if (-not $url) { - Write-Error "Could not get tunnel URL from cloudflared log" - exit 1 - } - - $webhookUrl = "$env:DISCORD_WEBHOOK" - $runId = "${{ inputs.run_id }}" - $actor = "$env:GITHUB_ACTOR" - $startTime = (Get-Date -Format "yyyy-MM-dd HH:mm:ss") - $composedURL = "https://github.com/Devsh-Graphics-Programming/Nabla/actions/runs/$runId" - $workflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" - $sendDiscord = "${{ inputs.withDiscordMSG }}" -eq "true" - - $description = @" - - tunnel opened for 5 hours, click [here](<$url>) to connect - - workflow [logs #${{ github.run_id }}](<$workflowRunURL>) - - image downloaded from [run #$runId](<$composedURL>) - - dispatched by $actor - "@ - - $payload = @{ - embeds = @( - @{ - title = "Running NSC Godbolt Container" - description = $description - color = 15844367 - 
footer = @{ - text = "sent from GitHub Actions runner" - } - timestamp = (Get-Date).ToString("o") - } - ) - } | ConvertTo-Json -Depth 10 - - if ($sendDiscord) { - Invoke-RestMethod -Uri $webhookUrl -Method Post -ContentType 'application/json' -Body $payload - } - - Start-Sleep -Seconds 18000 From 104422f7752b4b49d9e2f938f64d52c1528337a9 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Thu, 29 May 2025 20:05:10 +0200 Subject: [PATCH 254/346] Create run-nsc.yml --- .github/workflows/run-nsc.yml | 206 ++++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 .github/workflows/run-nsc.yml diff --git a/.github/workflows/run-nsc.yml b/.github/workflows/run-nsc.yml new file mode 100644 index 0000000000..07be0d44e9 --- /dev/null +++ b/.github/workflows/run-nsc.yml @@ -0,0 +1,206 @@ +name: Run NSC Godbolt Container + +on: + workflow_dispatch: + inputs: + run_id: + description: "The id of the workflow run where the desired download artifact was uploaded from" + required: true + build_config: + description: "Build configuration (Release / RelWithDebInfo / Debug)" + required: true + default: "Release" + type: choice + options: + - Release + - RelWithDebInfo + - Debug + withDiscordMSG: + description: "Send Discord message after tunnel is up" + required: true + default: true + type: boolean + +jobs: + run-container: + runs-on: windows-2022 + env: + DISCORD_WEBHOOK: ${{ secrets.DC_ACTIONS_WEBHOOK }} + + steps: + - name: Environment Setup + run: | + Add-MpPreference -ExclusionPath "${{ github.workspace }}" + Add-MpPreference -ExclusionExtension "*.*" + Add-MpPreference -ExclusionProcess "docker.exe" + Add-MpPreference -ExclusionProcess "dockerd.exe" + Set-MpPreference -RemediationScheduleDay 8 + Set-MpPreference -DisableRealtimeMonitoring $true + Set-MpPreference -DisableRemovableDriveScanning $true + Set-MpPreference -DisableArchiveScanning $true + Set-MpPreference 
-DisableScanningMappedNetworkDrivesForFullScan $true + + if (-not (docker network ls --format '{{.Name}}' | Where-Object { $_ -eq 'docker_default' })) { + docker network create --driver nat docker_default + if ($LASTEXITCODE -ne 0) { exit 1 } + } + + - name: Download NSC Godbolt artifact + uses: actions/download-artifact@v4 + with: + run-id: ${{ inputs.run_id }} + pattern: run-windows-*-msvc-${{ inputs.build_config }}-nsc-godbolt-image + path: artifact + merge-multiple: true + github-token: ${{ secrets.READ_PAT }} + repository: Devsh-Graphics-Programming/Nabla + + - name: Decompress .tar.zst + run: | + Get-ChildItem artifact -Filter *.tar.zst | ForEach-Object { + $output = $_.FullName -replace '\.zst$', '' + zstd -d "$($_.FullName)" -o "$output" + } + + - name: Load Docker image + run: | + $image = Get-ChildItem artifact -Filter *.tar | Select-Object -First 1 + docker load -i $image.FullName + + - name: Generate and run Docker Compose with matched image + run: | + $imageName = docker image ls --format "{{.Repository}}:{{.Tag}}" | + Where-Object { $_ -like "ghcr.io/devsh-graphics-programming/nabla:nsc-*" } | + Select-Object -First 1 + + if (-not $imageName) { + Write-Error "Could not find image with tag matching ghcr.io/devsh-graphics-programming/nabla:nsc-*" + exit 1 + } + + Write-Host "Found image: $imageName" + + @" + services: + nsc: + container_name: nsc-godbolt + image: $imageName + isolation: process + ports: + - "10240:10240" + volumes: + - type: bind + source: C:\Windows\Globalization\ICU + target: C:\Windows\Globalization\ICU + read_only: true + - type: bind + source: C:\Windows\System32 + target: C:\mount\Windows\System32 + read_only: true + networks: + - docker_default + + networks: + docker_default: + external: true + "@ | Set-Content compose.generated.yml + + docker compose -f compose.generated.yml up -d + + - name: Wait for local server on port 10240 + run: | + $maxRetries = 24 + $retryDelay = 5 + $success = $false + + for ($i = 0; $i -lt $maxRetries; 
$i++) { + try { + $response = Invoke-WebRequest -Uri "http://localhost:10240" -UseBasicParsing -TimeoutSec 5 + if ($response.StatusCode -eq 200) { + Write-Host "Local server is up and responding." + $success = $true + break + } else { + Write-Host "Received HTTP $($response.StatusCode), retrying..." + } + } catch { + Write-Host "Local server not responding yet, retrying..." + } + Start-Sleep -Seconds $retryDelay + } + + if (-not $success) { + Write-Error "Local server on port 10240 did not respond within timeout." + exit 1 + } + + - name: Print Container Logs + run: | + docker logs nsc-godbolt + + - name: Download cloudflared + run: | + Invoke-WebRequest -Uri https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe -OutFile cloudflared.exe + + - name: Start tunnel + run: | + Start-Process -NoNewWindow -FilePath .\cloudflared.exe -ArgumentList "tunnel", "--url", "http://localhost:10240", "--logfile", "cf.log" + + $tries = 60 + $url = $null + + while ($tries -gt 0) { + if (Test-Path cf.log) { + $log = Get-Content cf.log + foreach ($line in $log) { + if ($line -match 'https:\/\/[a-zA-Z0-9\-]+\.trycloudflare\.com') { + $url = $Matches[0] + Write-Host "::notice title=Tunnel URL::$url" + break + } + } + if ($url) { break } + } + Start-Sleep -Seconds 1 + $tries -= 1 + } + + if (-not $url) { + Write-Error "Could not get tunnel URL from cloudflared log" + exit 1 + } + + $webhookUrl = "$env:DISCORD_WEBHOOK" + $runId = "${{ inputs.run_id }}" + $actor = "$env:GITHUB_ACTOR" + $startTime = (Get-Date -Format "yyyy-MM-dd HH:mm:ss") + $composedURL = "https://github.com/Devsh-Graphics-Programming/Nabla/actions/runs/$runId" + $workflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + $sendDiscord = "${{ inputs.withDiscordMSG }}" -eq "true" + + $description = @" + - tunnel opened for 5 hours, click [here](<$url>) to connect + - workflow [logs #${{ github.run_id }}](<$workflowRunURL>) + - image downloaded 
from [run #$runId](<$composedURL>) + - dispatched by $actor + "@ + + $payload = @{ + embeds = @( + @{ + title = "Running NSC Godbolt Container" + description = $description + color = 15844367 + footer = @{ + text = "sent from GitHub Actions runner" + } + timestamp = (Get-Date).ToString("o") + } + ) + } | ConvertTo-Json -Depth 10 + + if ($sendDiscord) { + Invoke-RestMethod -Uri $webhookUrl -Method Post -ContentType 'application/json' -Body $payload + } + + Start-Sleep -Seconds 18000 From 90d3579660fbe8f914e1009cc778490bbe5c456a Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 30 May 2025 11:10:54 +0700 Subject: [PATCH 255/346] fix scans for level 1+ --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 1d386835b9..e4c23ee555 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -179,15 +179,15 @@ struct scan const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan - subgroup2::inclusive_scan inclusiveScan1; + subgroup2::exclusive_scan exclusiveScan1; if (glsl::gl_SubgroupID() == 0) { vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i)-1,lv1_val[i]); - lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); - lv1_val = inclusiveScan1(lv1_val); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + // lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); + lv1_val = exclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); @@ 
-304,15 +304,16 @@ struct scan const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan const uint32_t lv1_smem_size = Config::SubgroupsSize*Config::ItemsPerInvocation_1; - subgroup2::inclusive_scan inclusiveScan1; - if (glsl::gl_SubgroupID() < Config::SubgroupsSize*Config::ItemsPerInvocation_2) + const uint32_t lv1_num_invoc = Config::SubgroupsSize*Config::ItemsPerInvocation_2; + subgroup2::exclusive_scan exclusiveScan1; + if (glsl::gl_SubgroupID() < lv1_num_invoc) { vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i)-1,lv1_val[i]); - lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); - lv1_val = inclusiveScan1(lv1_val); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + // lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); + lv1_val = exclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); @@ -325,15 +326,15 @@ struct scan scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 2 scan - subgroup2::inclusive_scan inclusiveScan2; + subgroup2::exclusive_scan exclusiveScan2; if (glsl::gl_SubgroupID() == 0) { vector_lv2_t lv2_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i)-1,lv2_val[i]); + scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); lv2_val[0] = hlsl::mix(BinOp::identity, lv2_val[0], bool(invocationIndex)); - lv2_val = inclusiveScan2(lv2_val); + lv2_val = exclusiveScan2(lv2_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) scratchAccessor.template 
set(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); @@ -341,7 +342,7 @@ struct scan scratchAccessor.workgroupExecutionAndMemoryBarrier(); // combine with level 1 - if (glsl::gl_SubgroupID() < lv1_smem_size) + if (glsl::gl_SubgroupID() < lv1_num_invoc) { vector_lv1_t lv1_val; [unroll] From 203c03a8f52b4cec36f88d6566fdff6d67534b53 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 30 May 2025 14:17:42 +0700 Subject: [PATCH 256/346] some indexing fixes for 3-level reduce/scan --- .../builtin/hlsl/workgroup2/arithmetic_config.hlsl | 13 +++++++------ .../nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 6 +++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 2f1a8b06a0..c7832c360a 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -51,9 +51,6 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << virtual_wg_t::value; static_assert(VirtualWorkgroupSize<=WorkgroupSize*SubgroupSize); - NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; - NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = uint16_t(0x1u) << __SubgroupsPerVirtualWorkgroupLog2; - using items_per_invoc_t = impl::items_per_invocation; // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? 
doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = items_per_invoc_t::value0; @@ -61,12 +58,16 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2; static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); + NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroupLog2 = mpl::max_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroup = uint16_t(0x1u) << __ItemsPerVirtualWorkgroupLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = __ItemsPerVirtualWorkgroup / ItemsPerInvocation_1; + NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value::value + SubgroupSize*ItemsPerInvocation_1 + SubgroupSize*ItemsPerInvocation_2+__ItemsPerVirtualWorkgroup, + SubgroupSize*ItemsPerInvocation_1 + >::value >::value; static bool electLast() diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index e4c23ee555..af37908292 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -245,7 +245,7 @@ struct reduce const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan - const uint32_t lv1_smem_size = Config::SubgroupsSize*Config::ItemsPerInvocation_1; + const uint32_t lv1_smem_size = Config::__ItemsPerVirtualWorkgroup; subgroup2::reduction reduction1; if (glsl::gl_SubgroupID() < Config::SubgroupSize*Config::ItemsPerInvocation_2) { @@ -303,8 +303,8 @@ struct scan const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan - const uint32_t lv1_smem_size = Config::SubgroupsSize*Config::ItemsPerInvocation_1; - const uint32_t lv1_num_invoc = 
Config::SubgroupsSize*Config::ItemsPerInvocation_2; + const uint32_t lv1_smem_size = Config::__ItemsPerVirtualWorkgroup; + const uint32_t lv1_num_invoc = Config::SubgroupSize*Config::ItemsPerInvocation_2; subgroup2::exclusive_scan exclusiveScan1; if (glsl::gl_SubgroupID() < lv1_num_invoc) { From 0b163078f8363129a3b34a293f0f1286d2e82791 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 30 May 2025 15:57:21 +0700 Subject: [PATCH 257/346] fix 3-level scan downsweep step --- examples_tests | 2 +- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/examples_tests b/examples_tests index f202ef5632..93b78108b4 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit f202ef563249c172d4a6c699379c6793ae939863 +Subproject commit 93b78108b433cfb85407c5f6816adc4c58b0fb7b diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index af37908292..de55a131b8 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -305,7 +305,7 @@ struct scan // level 1 scan const uint32_t lv1_smem_size = Config::__ItemsPerVirtualWorkgroup; const uint32_t lv1_num_invoc = Config::SubgroupSize*Config::ItemsPerInvocation_2; - subgroup2::exclusive_scan exclusiveScan1; + subgroup2::inclusive_scan inclusiveScan1; if (glsl::gl_SubgroupID() < lv1_num_invoc) { vector_lv1_t lv1_val; @@ -313,7 +313,7 @@ struct scan for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); // lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); - lv1_val = exclusiveScan1(lv1_val); + lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, 
i),lv1_val[i]); @@ -333,7 +333,7 @@ struct scan [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); - lv2_val[0] = hlsl::mix(BinOp::identity, lv2_val[0], bool(invocationIndex)); + // lv2_val[0] = hlsl::mix(BinOp::identity, lv2_val[0], bool(invocationIndex)); lv2_val = exclusiveScan2(lv2_val); [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) @@ -347,16 +347,20 @@ struct scan vector_lv1_t lv1_val; [unroll] for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(i*Config::SubgroupSize+invocationIndex,lv1_val[i]); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i), lv1_val[i]); + + const scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(lv1_val[Config::ItemsPerInvocation_1-1],1), bool(glsl::gl_SubgroupInvocationID())); scalar_t lv2_scan; const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(glsl::gl_SubgroupID()); - scratchAccessor.template set(lv1_smem_size+bankedIndex, lv2_scan); + scratchAccessor.template get(lv1_smem_size+bankedIndex, lv2_scan); [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i), binop(lv1_val[i],lv2_scan)); + for (uint32_t i = Config::ItemsPerInvocation_1-1; i > 0; i--) + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i), binop(lv1_val[i-1],lv2_scan)); + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, 0), binop(left_last_elem,lv2_scan)); } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); // combine with level 0 [unroll] From aab868be8dca650a1a037a016382691831def6b6 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Fri, 30 May 2025 18:49:34 +0200 Subject: [PATCH 
258/346] Update run-nsc.yml, add reverse proxy, require authentication to connect to tested NSC Godbolt instance, add options to set timeout and default to 1h to not violate Github ToS we do not allow for public connections and restrict to members of DevshGraphicsProgramming only --- .github/workflows/run-nsc.yml | 105 ++++++++++++++++++++++++++-------- 1 file changed, 81 insertions(+), 24 deletions(-) diff --git a/.github/workflows/run-nsc.yml b/.github/workflows/run-nsc.yml index 07be0d44e9..c886256a83 100644 --- a/.github/workflows/run-nsc.yml +++ b/.github/workflows/run-nsc.yml @@ -15,6 +15,17 @@ on: - Release - RelWithDebInfo - Debug + tunnelDurationHours: + description: "Hours amount the restricted tunnel should stay up" + required: true + default: "1" + type: choice + options: + - "1" + - "2" + - "3" + - "4" + - "5" withDiscordMSG: description: "Send Discord message after tunnel is up" required: true @@ -44,7 +55,47 @@ jobs: docker network create --driver nat docker_default if ($LASTEXITCODE -ne 0) { exit 1 } } - + + $sendDiscord = "${{ inputs.withDiscordMSG }}" -eq "true" + Write-Host "::notice::Should send discord message? 
$sendDiscord" + + - name: Download Restricted Reverse Proxy binaries, setup NGINX config + run: | + Invoke-WebRequest -Uri https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe -OutFile cloudflared.exe + Invoke-WebRequest -Uri "https://nginx.org/download/nginx-1.24.0.zip" -OutFile nginx.zip + Expand-Archive nginx.zip -DestinationPath nginx + + Remove-Item -Recurse -Force "nginx/nginx-1.24.0/conf" + New-Item -ItemType Directory -Path "nginx/nginx-1.24.0/conf" -Force | Out-Null + + '${{ secrets.NSC_BASIC_AUTH_HTPASSWD }}' | Out-File nginx/nginx-1.24.0/conf/.htpasswd -Encoding ascii + $htpasswdPath = (Resolve-Path "nginx/nginx-1.24.0/conf/.htpasswd").Path -replace '\\', '/' + + @" + events {} + + http { + server { + listen 10241; + + location / { + auth_basic "Restricted Compiler Explorer access for Development & NSC Artifact Tests, downloaded from Nabla actions pipeline"; + auth_basic_user_file "$htpasswdPath"; + + proxy_pass http://127.0.0.1:10240; + proxy_set_header Host `$host; + proxy_set_header X-Real-IP `$remote_addr; + } + } + } + "@ | Out-File nginx/nginx-1.24.0/conf/nginx.conf -Encoding ascii + + Write-Host "::group::Generated nginx.conf" + Get-Content nginx/nginx-1.24.0/conf/nginx.conf + Write-Host "::endgroup::" + + & "nginx/nginx-1.24.0/nginx.exe" -t -p "nginx/nginx-1.24.0" -c "conf/nginx.conf" + - name: Download NSC Godbolt artifact uses: actions/download-artifact@v4 with: @@ -107,7 +158,7 @@ jobs: docker compose -f compose.generated.yml up -d - - name: Wait for local server on port 10240 + - name: Wait for NSC container response on port run: | $maxRetries = 24 $retryDelay = 5 @@ -117,34 +168,35 @@ jobs: try { $response = Invoke-WebRequest -Uri "http://localhost:10240" -UseBasicParsing -TimeoutSec 5 if ($response.StatusCode -eq 200) { - Write-Host "Local server is up and responding." + Write-Host "NSC container is up listening on port 10240 and responding." 
$success = $true break } else { Write-Host "Received HTTP $($response.StatusCode), retrying..." } } catch { - Write-Host "Local server not responding yet, retrying..." + Write-Host "NSC container is not responding on port 10240, retrying..." } Start-Sleep -Seconds $retryDelay } if (-not $success) { - Write-Error "Local server on port 10240 did not respond within timeout." + Write-Error "No response from NSC container on port 10240, timeout." exit 1 } - - name: Print Container Logs + - name: Print NSC container logs run: | docker logs nsc-godbolt - - name: Download cloudflared + - name: Start Restricted Tunnel + env: + DISCORD_ENABLED: ${{ inputs.withDiscordMSG }} + TUNNEL_DURATION_HOURS: ${{ inputs.tunnelDurationHours }} run: | - Invoke-WebRequest -Uri https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe -OutFile cloudflared.exe - - - name: Start tunnel - run: | - Start-Process -NoNewWindow -FilePath .\cloudflared.exe -ArgumentList "tunnel", "--url", "http://localhost:10240", "--logfile", "cf.log" + Start-Process -NoNewWindow -FilePath .\nginx\nginx-1.24.0\nginx.exe -ArgumentList '-p', (Join-Path $PWD 'nginx/nginx-1.24.0'), '-c', 'conf/nginx.conf' + Start-Process -NoNewWindow -FilePath .\cloudflared.exe -ArgumentList "tunnel", "--url", "http://localhost:10241", "--logfile", "cf.log" + netstat -an | findstr 10241 $tries = 60 $url = $null @@ -164,23 +216,27 @@ jobs: Start-Sleep -Seconds 1 $tries -= 1 } - + if (-not $url) { Write-Error "Could not get tunnel URL from cloudflared log" exit 1 } $webhookUrl = "$env:DISCORD_WEBHOOK" - $runId = "${{ inputs.run_id }}" + $runId = "$env:GITHUB_RUN_ID" $actor = "$env:GITHUB_ACTOR" - $startTime = (Get-Date -Format "yyyy-MM-dd HH:mm:ss") $composedURL = "https://github.com/Devsh-Graphics-Programming/Nabla/actions/runs/$runId" - $workflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" - $sendDiscord = "${{ inputs.withDiscordMSG }}" -eq "true" + 
$workflowRunURL = "https://github.com/$env:GITHUB_REPOSITORY/actions/runs/$runId" + $sendDiscord = "$env:DISCORD_ENABLED" -eq "true" + $hours = [int]$env:TUNNEL_DURATION_HOURS + $duration = $hours * 3600 + + Write-Host "Blocking job for $hours hours" $description = @" - - tunnel opened for 5 hours, click [here](<$url>) to connect - - workflow [logs #${{ github.run_id }}](<$workflowRunURL>) + - tunnel opened for $hours hours, click [here](<$url>) to connect + - requires authentication + - workflow [logs #$runId](<$workflowRunURL>) - image downloaded from [run #$runId](<$composedURL>) - dispatched by $actor "@ @@ -191,16 +247,17 @@ jobs: title = "Running NSC Godbolt Container" description = $description color = 15844367 - footer = @{ - text = "sent from GitHub Actions runner" - } + footer = @{ text = "sent from GitHub Actions runner" } timestamp = (Get-Date).ToString("o") } ) } | ConvertTo-Json -Depth 10 - + if ($sendDiscord) { + Write-Host "Sending Discord webhook..." Invoke-RestMethod -Uri $webhookUrl -Method Post -ContentType 'application/json' -Body $payload + } else { + Write-Host "Discord webhook disabled" } - Start-Sleep -Seconds 18000 + Start-Sleep -Seconds $duration From 068fc26b724832047ec3038a33fddd5c39fbc1fe Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Fri, 30 May 2025 19:35:02 +0200 Subject: [PATCH 259/346] Update run-nsc.yml, typo I did --- .github/workflows/run-nsc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-nsc.yml b/.github/workflows/run-nsc.yml index c886256a83..e52bbae0fd 100644 --- a/.github/workflows/run-nsc.yml +++ b/.github/workflows/run-nsc.yml @@ -225,8 +225,8 @@ jobs: $webhookUrl = "$env:DISCORD_WEBHOOK" $runId = "$env:GITHUB_RUN_ID" $actor = "$env:GITHUB_ACTOR" + $workflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" $composedURL = 
"https://github.com/Devsh-Graphics-Programming/Nabla/actions/runs/$runId" - $workflowRunURL = "https://github.com/$env:GITHUB_REPOSITORY/actions/runs/$runId" $sendDiscord = "$env:DISCORD_ENABLED" -eq "true" $hours = [int]$env:TUNNEL_DURATION_HOURS $duration = $hours * 3600 From 5bf733671bead2de6216f048005d89c038ea3376 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Fri, 30 May 2025 19:52:12 +0200 Subject: [PATCH 260/346] Update run-nsc.yml, some updates to dc logs --- .github/workflows/run-nsc.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/run-nsc.yml b/.github/workflows/run-nsc.yml index e52bbae0fd..456e0d3054 100644 --- a/.github/workflows/run-nsc.yml +++ b/.github/workflows/run-nsc.yml @@ -223,10 +223,11 @@ jobs: } $webhookUrl = "$env:DISCORD_WEBHOOK" - $runId = "$env:GITHUB_RUN_ID" + $thisWorkflowRunID = "${{ github.run_id }}" + $artifactWorkflowRunID = "${{ inputs.run_id }}" + $thisWorkflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/$thisWorkflowRunID" + $artifactWorkflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/$artifactWorkflowRunID" $actor = "$env:GITHUB_ACTOR" - $workflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" - $composedURL = "https://github.com/Devsh-Graphics-Programming/Nabla/actions/runs/$runId" $sendDiscord = "$env:DISCORD_ENABLED" -eq "true" $hours = [int]$env:TUNNEL_DURATION_HOURS $duration = $hours * 3600 @@ -236,8 +237,8 @@ jobs: $description = @" - tunnel opened for $hours hours, click [here](<$url>) to connect - requires authentication - - workflow [logs #$runId](<$workflowRunURL>) - - image downloaded from [run #$runId](<$composedURL>) + - workflow [logs #$thisWorkflowRunID](<$thisWorkflowRunURL>) + - image downloaded from [run #$artifactWorkflowRunID](<$artifactWorkflowRunURL>) - dispatched by $actor "@ From 
83991b9190173efcf2192e601da161a92058ab20 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 2 Jun 2025 10:28:26 +0700 Subject: [PATCH 261/346] added tuple.hlsl --- include/nbl/builtin/hlsl/tuple.hlsl | 61 +++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 include/nbl/builtin/hlsl/tuple.hlsl diff --git a/include/nbl/builtin/hlsl/tuple.hlsl b/include/nbl/builtin/hlsl/tuple.hlsl new file mode 100644 index 0000000000..a9c26090ea --- /dev/null +++ b/include/nbl/builtin/hlsl/tuple.hlsl @@ -0,0 +1,61 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_TUPLE_INCLUDED_ +#define _NBL_BUILTIN_HLSL_TUPLE_INCLUDED_ + +#include "nbl/builtin/hlsl/type_traits.hlsl" + +namespace nbl +{ +namespace hlsl +{ + +template // TODO: in the future use BOOST_PP to make this +struct tuple +{ + T0 t0; + T1 t1; + T2 t2; +}; + +template +struct tuple_element; + +template +struct tuple +{ + T0 t0; +}; + +template +struct tuple +{ + T0 t0; + T1 t1; +}; +// specializations for less and less void elements + +// base case +template +struct tuple_element<0,tuple > +{ + using type = Head; +}; + +template +struct tuple_element<1,tuple > +{ + using type = Head; +}; + +template +struct tuple_element<2,tuple > +{ + using type = Head; +}; + +} +} + +#endif From 209adb4f51d5646c7545a1615b4635b821921e13 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 2 Jun 2025 11:31:47 +0700 Subject: [PATCH 262/346] added some comments to config funcs for future debugging --- examples_tests | 2 +- .../builtin/hlsl/workgroup2/arithmetic_config.hlsl | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/examples_tests b/examples_tests index 93b78108b4..3a3aaa9fce 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 93b78108b433cfb85407c5f6816adc4c58b0fb7b +Subproject commit 
3a3aaa9fce04cda7726170e2128124d466252a27 diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index c7832c360a..90b46b8c07 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -5,6 +5,7 @@ #define _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_CONFIG_INCLUDED_ #include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/tuple.hlsl" namespace nbl { @@ -52,16 +53,16 @@ struct ArithmeticConfiguration static_assert(VirtualWorkgroupSize<=WorkgroupSize*SubgroupSize); using items_per_invoc_t = impl::items_per_invocation; - // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression + using ItemsPerInvocation = tuple,integral_constant,integral_constant >; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = items_per_invoc_t::value0; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = items_per_invoc_t::value1; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2; - static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroupLog2 = mpl::max_v; NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroup = uint16_t(0x1u) << __ItemsPerVirtualWorkgroupLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = __ItemsPerVirtualWorkgroup / ItemsPerInvocation_1; + // user specified the shared mem size of uint32_ts NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value> SubgroupSizeLog2) + subgroupID; } + // get a coalesced index to store for the next level in shared mem, e.g. 
level 0 -> level 1 + // specify the next level to store values for in template param + // at level==LevelCount-1, it is guaranteed to have SubgroupSize elements template static uint32_t sharedStoreIndex(const uint32_t subgroupID) { @@ -102,6 +108,7 @@ struct ArithmeticConfiguration return sharedStoreIndex(virtualID); } + // get the coalesced index in shared mem at the current level template static uint32_t sharedLoadIndex(const uint32_t invocationIndex, const uint32_t component) { From 9cdaa9fd385ffc54c48d973ce11640f3a24b64f1 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 2 Jun 2025 13:37:44 +0200 Subject: [PATCH 263/346] change NSC package name, add badges creation & deploy --- .github/workflows/build-nabla.yml | 76 +++++++++++++++++++++++++++++-- .github/workflows/run-nsc.yml | 4 +- README.md | 12 +++-- compose.yml | 2 +- 4 files changed, 83 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 67fc9c4401..cedecb3b92 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -5,6 +5,9 @@ on: pull_request: workflow_dispatch: +permissions: + contents: read + concurrency: group: push-lock-${{ github.ref }} cancel-in-progress: true @@ -54,10 +57,10 @@ jobs: shell: pwsh run: | $prefix = "run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" - $repo = $env:GITHUB_REPOSITORY - $tag = "nsc-godbolt-build-${{ matrix.vendor }}-${{ matrix.config }}-${{ matrix.tag }}" - $nscTargetTaggedImage = "ghcr.io/${repo}:${tag}".ToLower() - $nscTargetTaggedImageLatest = "ghcr.io/${repo}:nsc-godbolt-latest".ToLower() + $package = "nabla-shader-compiler-godbolt" + $tag = "build-${{ matrix.vendor }}-${{ matrix.config }}-${{ matrix.tag }}" + $nscTargetTaggedImage = "ghcr.io/${package}:${tag}".ToLower() + $nscTargetTaggedImageLatest = "ghcr.io/${package}:latest".ToLower() $shouldPushImage = ( "${{ github.ref }}" -eq "refs/heads/master" -and @@ -178,4 +181,67 @@ 
jobs: - name: Push images to GHCR if: steps.set-prefix.outputs.shouldPushImage == 'True' run: | - docker push ${{ steps.set-prefix.outputs.nscTargetTaggedImageLatest }} \ No newline at end of file + docker push ${{ steps.set-prefix.outputs.nscTargetTaggedImageLatest }} + + update-badges: + name: Update Build & Image Badges + if: ${{ always() && github.ref == 'refs/heads/master' }} + needs: build-windows + runs-on: windows-2022 + permissions: + contents: write + + steps: + - name: Create Build Badge + run: | + $jobStatus = "${{ needs.build-windows.result }}" + $buildMsg = if ($jobStatus -eq "success") { "passing" } else { "failing" } + $buildColor = if ($jobStatus -eq "success") { "brightgreen" } else { "red" } + + $buildBadge = @{ + schemaVersion = 1 + label = "build" + message = $buildMsg + color = $buildColor + } | ConvertTo-Json -Depth 2 + + $buildPath = ".badge-public/nabla" + New-Item -ItemType Directory -Path $buildPath -Force | Out-Null + $buildBadge | Set-Content -Path "$buildPath/build.json" -Encoding utf8 + + - name: Create Image Size Badge + run: | + $image = "ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-latest" + $manifest = docker manifest inspect $image | ConvertFrom-Json + + if ($manifest.manifests) { + $totalSize = ($manifest.manifests | Measure-Object -Property size -Sum).Sum + } elseif ($manifest.layers) { + $totalSize = ($manifest.layers | Measure-Object -Property size -Sum).Sum + } else { + Write-Error "No valid size information found in manifest." 
+ exit 1 + } + + $sizeMB = [Math]::Round($totalSize / 1MB, 2) + $size = "$sizeMB MB" + + $imageBadge = @{ + schemaVersion = 1 + label = $image + message = $size + color = "blue" + } | ConvertTo-Json -Depth 2 + + $imagePath = ".badge-public/packages/nabla-shader-compiler-nsc" + New-Item -ItemType Directory -Path $imagePath -Force | Out-Null + $imageBadge | Set-Content -Path "$imagePath/image-badge.json" -Encoding utf8 + + - name: Deploy Badges + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_branch: badges + publish_dir: .badge-public + keep_files: true + commit_message: "[CI] badges update" \ No newline at end of file diff --git a/.github/workflows/run-nsc.yml b/.github/workflows/run-nsc.yml index 456e0d3054..d5f9f74c2b 100644 --- a/.github/workflows/run-nsc.yml +++ b/.github/workflows/run-nsc.yml @@ -121,11 +121,11 @@ jobs: - name: Generate and run Docker Compose with matched image run: | $imageName = docker image ls --format "{{.Repository}}:{{.Tag}}" | - Where-Object { $_ -like "ghcr.io/devsh-graphics-programming/nabla:nsc-*" } | + Where-Object { $_ -like "ghcr.io/devsh-graphics-programming/nabla-shader-compiler-godbolt:build-*" } | Select-Object -First 1 if (-not $imageName) { - Write-Error "Could not find image with tag matching ghcr.io/devsh-graphics-programming/nabla:nsc-*" + Write-Error "Could not find image with tag matching ghcr.io/devsh-graphics-programming/nabla-shader-compiler-godbolt:build-*" exit 1 } diff --git a/README.md b/README.md index 2b85c9c460..f49fede7d7 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,15 @@
Click to see the source
-
- Click to see the source -
+ +

+ + Build Status + + License: Apache 2.0 + + Join our Discord +

# Table of Contents diff --git a/compose.yml b/compose.yml index 8d6f1bc64a..3f32e8d1b5 100644 --- a/compose.yml +++ b/compose.yml @@ -1,7 +1,7 @@ services: nsc: container_name: nsc-godbolt - image: ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-latest + image: ghcr.io/devsh-graphics-programming/nabla-shader-compiler-godbolt:latest isolation: process ports: - "80:10240" From 67a9a07d43c41d4f7746342a53c8e5731c6d5dd3 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Mon, 2 Jun 2025 14:35:44 +0200 Subject: [PATCH 264/346] Update build-nabla.yml, typo --- .github/workflows/build-nabla.yml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index cedecb3b92..e116e3f5ec 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -57,10 +57,11 @@ jobs: shell: pwsh run: | $prefix = "run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" + $owner = "${{ github.repository_owner }}" $package = "nabla-shader-compiler-godbolt" $tag = "build-${{ matrix.vendor }}-${{ matrix.config }}-${{ matrix.tag }}" - $nscTargetTaggedImage = "ghcr.io/${package}:${tag}".ToLower() - $nscTargetTaggedImageLatest = "ghcr.io/${package}:latest".ToLower() + $nscTargetTaggedImage = "ghcr.io/${owner}/${package}:${tag}".ToLower() + $nscTargetTaggedImageLatest = "ghcr.io/${owner}/${package}:latest".ToLower() $shouldPushImage = ( "${{ github.ref }}" -eq "refs/heads/master" -and @@ -211,6 +212,10 @@ jobs: - name: Create Image Size Badge run: | + $owner = "${{ github.repository_owner }}" + $package = "nabla-shader-compiler-godbolt" + $nscTargetTaggedImageLatest = "ghcr.io/${owner}/${package}:latest".ToLower() + $image = "ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-latest" $manifest = docker manifest inspect $image | ConvertFrom-Json @@ -244,4 +249,4 @@ jobs: publish_branch: badges publish_dir: 
.badge-public keep_files: true - commit_message: "[CI] badges update" \ No newline at end of file + commit_message: "[CI] badges update" From 9178ec3ae3edc9c9e8a1f37817b5eb568f16d408 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 2 Jun 2025 15:57:08 +0200 Subject: [PATCH 265/346] update badge labels, discord links and tools/nsc/docker/README.md --- .github/workflows/build-nabla.yml | 7 ++++--- README.md | 2 +- tools/nsc/docker/README.md | 14 ++++++++++++++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index e116e3f5ec..8988fe6df6 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -193,6 +193,9 @@ jobs: contents: write steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Create Build Badge run: | $jobStatus = "${{ needs.build-windows.result }}" @@ -214,9 +217,7 @@ jobs: run: | $owner = "${{ github.repository_owner }}" $package = "nabla-shader-compiler-godbolt" - $nscTargetTaggedImageLatest = "ghcr.io/${owner}/${package}:latest".ToLower() - - $image = "ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-latest" + $image = "ghcr.io/${owner}/${package}:latest".ToLower() $manifest = docker manifest inspect $image | ConvertFrom-Json if ($manifest.manifests) { diff --git a/README.md b/README.md index f49fede7d7..a696846b30 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Build Status License: Apache 2.0 - + Join our Discord

diff --git a/tools/nsc/docker/README.md b/tools/nsc/docker/README.md index afd8b0f8b7..d44eea9f81 100644 --- a/tools/nsc/docker/README.md +++ b/tools/nsc/docker/README.md @@ -4,6 +4,17 @@ https://github.com/user-attachments/assets/8d409477-92e4-4238-b5e5-637cfbdf7263 +

+ + Image Status + + Build Status + + License: Apache 2.0 + + Join our Discord +

+ ## Requirements - Configured [***Docker***](https://docs.docker.com/desktop/setup/install/windows-install/) for Windows Containers @@ -12,6 +23,9 @@ https://github.com/user-attachments/assets/8d409477-92e4-4238-b5e5-637cfbdf7263 > [!TIP] > type `cmd /ver` to see your build version +> [!WARNING] +> You cannot run it on Windows Home Edition as it doesn't have `Containers` feature, visit Microsoft [docs]() for more details + > [!CAUTION] > Hyper-V is **NOT** supported, you must run NSC Godbolt container as process From 7d77d30baacb673d7f1ca63e0e015ea984d8455d Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 3 Jun 2025 12:10:18 +0700 Subject: [PATCH 266/346] change indexing to uint16_t --- .../hlsl/workgroup2/arithmetic_config.hlsl | 28 ++-- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 137 +++++++++--------- 2 files changed, 84 insertions(+), 81 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 90b46b8c07..a9fdcfe0a4 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -58,15 +58,19 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = items_per_invoc_t::value1; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroupLog2 = mpl::max_v; - NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroup = uint16_t(0x1u) << __ItemsPerVirtualWorkgroupLog2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = __ItemsPerVirtualWorkgroup / ItemsPerInvocation_1; + // NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroupLog2 = mpl::max_v; + // NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroup = uint16_t(0x1u) << __ItemsPerVirtualWorkgroupLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelInputCount_1 = 
conditional_value>SubgroupSizeLog2), SubgroupSize>, + SubgroupSize*ItemsPerInvocation_1>::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelInputCount_2 = conditional_value::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = LevelInputCount_1 / ItemsPerInvocation_1; // user specified the shared mem size of uint32_ts NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value::value >::value; @@ -78,7 +82,7 @@ struct ArithmeticConfiguration // gets a subgroupID as if each workgroup has (VirtualWorkgroupSize/SubgroupSize) subgroups // each subgroup does work (VirtualWorkgroupSize/WorkgroupSize) times, the index denoted by workgroupInVirtualIndex - static uint32_t virtualSubgroupID(const uint32_t subgroupID, const uint32_t workgroupInVirtualIndex) + static uint16_t virtualSubgroupID(const uint16_t subgroupID, const uint16_t workgroupInVirtualIndex) { return workgroupInVirtualIndex * (WorkgroupSize >> SubgroupSizeLog2) + subgroupID; } @@ -87,30 +91,30 @@ struct ArithmeticConfiguration // specify the next level to store values for in template param // at level==LevelCount-1, it is guaranteed to have SubgroupSize elements template - static uint32_t sharedStoreIndex(const uint32_t subgroupID) + static uint16_t sharedStoreIndex(const uint16_t subgroupID) { - uint32_t offsetBySubgroup; + uint16_t offsetBySubgroup; if (level == LevelCount-1) offsetBySubgroup = SubgroupSize; else offsetBySubgroup = __SubgroupsPerVirtualWorkgroup; if (level<2) - return (subgroupID & (ItemsPerInvocation_1-1)) * offsetBySubgroup + (subgroupID/ItemsPerInvocation_1); + return (subgroupID & (ItemsPerInvocation_1-uint16_t(1u))) * offsetBySubgroup + (subgroupID/ItemsPerInvocation_1); else - return (subgroupID & (ItemsPerInvocation_2-1)) * offsetBySubgroup + (subgroupID/ItemsPerInvocation_2); + return (subgroupID & (ItemsPerInvocation_2-uint16_t(1u))) * offsetBySubgroup + (subgroupID/ItemsPerInvocation_2); } template - static uint32_t 
sharedStoreIndexFromVirtualIndex(const uint32_t subgroupID, const uint32_t workgroupInVirtualIndex) + static uint16_t sharedStoreIndexFromVirtualIndex(const uint16_t subgroupID, const uint16_t workgroupInVirtualIndex) { - const uint32_t virtualID = virtualSubgroupID(subgroupID, workgroupInVirtualIndex); + const uint16_t virtualID = virtualSubgroupID(subgroupID, workgroupInVirtualIndex); return sharedStoreIndex(virtualID); } // get the coalesced index in shared mem at the current level template - static uint32_t sharedLoadIndex(const uint32_t invocationIndex, const uint32_t component) + static uint16_t sharedLoadIndex(const uint16_t invocationIndex, const uint16_t component) { if (level == LevelCount-1) return component * SubgroupSize + invocationIndex; diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index de55a131b8..78ed124baf 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -43,7 +43,7 @@ struct reduce subgroup2::reduction reduction; vector_t value; - dataAccessor.template get(glsl::gl_SubgroupInvocationID(), value); + dataAccessor.template get(uint16_t(glsl::gl_SubgroupInvocationID()), value); return reduction(value); } }; @@ -62,7 +62,7 @@ struct scan using params_t = subgroup2::ArithmeticParams; vector_t value; - dataAccessor.template get(glsl::gl_SubgroupInvocationID(), value); + dataAccessor.template get(uint16_t(glsl::gl_SubgroupInvocationID()), value); if (Exclusive) { subgroup2::exclusive_scan excl_scan; @@ -73,7 +73,7 @@ struct scan subgroup2::inclusive_scan incl_scan; value = incl_scan(value); } - dataAccessor.template set(glsl::gl_SubgroupInvocationID(), value); + dataAccessor.template set(uint16_t(glsl::gl_SubgroupInvocationID()), value); } }; @@ -88,19 +88,19 @@ struct reduce template static void __doLevel0(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { - const 
uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 0 scan subgroup2::reduction reduction0; [unroll] - for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_t scan_local; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); scan_local = reduction0(scan_local); if (Config::electLast()) { - const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); - scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()), idx); + scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -116,19 +116,19 @@ struct reduce __doLevel0(dataAccessor, scratchAccessor); - const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan subgroup2::reduction reduction1; if (glsl::gl_SubgroupID() == 0) { vector_lv1_t lv1_val; [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, 
i),lv1_val[i]); lv1_val = reduction1(lv1_val); if (Config::electLast()) - scratchAccessor.template set(0, lv1_val[Config::ItemsPerInvocation_1-1]); + scratchAccessor.template set(0, lv1_val[Config::ItemsPerInvocation_1-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -148,20 +148,20 @@ struct scan template static void __doLevel0(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { - const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); subgroup2::inclusive_scan inclusiveScan0; // level 0 scan [unroll] - for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); value = inclusiveScan0(value); - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); if (Config::electLast()) { - const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); - scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()), idx); + scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -177,48 +177,48 @@ struct scan 
__doLevel0(dataAccessor, scratchAccessor); - const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan subgroup2::exclusive_scan exclusiveScan1; if (glsl::gl_SubgroupID() == 0) { vector_lv1_t lv1_val; [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); // lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = exclusiveScan1(lv1_val); [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); // combine with level 0 [unroll] - for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()), idx); scalar_t left; - scratchAccessor.template get(bankedIndex,left); + scratchAccessor.template get(bankedIndex,left); if 
(Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); [unroll] - for (uint32_t i = Config::ItemsPerInvocation_0-1; i > 0; i--) + for (uint16_t i = Config::ItemsPerInvocation_0-1; i > 0; i--) value[i] = binop(left, value[i-1]); value[0] = binop(left, left_last_elem); } else { [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + for (uint16_t i = 0; i < Config::ItemsPerInvocation_0; i++) value[i] = binop(left, value[i]); } - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); } } }; @@ -243,21 +243,21 @@ struct reduce reduce::template __doLevel0(dataAccessor, scratchAccessor); - const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan const uint32_t lv1_smem_size = Config::__ItemsPerVirtualWorkgroup; subgroup2::reduction reduction1; - if (glsl::gl_SubgroupID() < Config::SubgroupSize*Config::ItemsPerInvocation_2) + if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) { vector_lv1_t lv1_val; [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); lv1_val = reduction1(lv1_val); if (Config::electLast()) { - const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(glsl::gl_SubgroupID()); - scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID())); + scratchAccessor.template 
set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -268,16 +268,16 @@ struct reduce { vector_lv2_t lv2_val; [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) + scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); lv2_val = reduction2(lv2_val); if (Config::electLast()) - scratchAccessor.template set(0, lv2_val[Config::ItemsPerInvocation_2-1]); + scratchAccessor.template set(0, lv2_val[Config::ItemsPerInvocation_2-1]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); scalar_t reduce_val; - scratchAccessor.template get(0,reduce_val); + scratchAccessor.template get(0,reduce_val); return reduce_val; } }; @@ -301,26 +301,25 @@ struct scan scan::template __doLevel0(dataAccessor, scratchAccessor); - const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan - const uint32_t lv1_smem_size = Config::__ItemsPerVirtualWorkgroup; - const uint32_t lv1_num_invoc = Config::SubgroupSize*Config::ItemsPerInvocation_2; + const uint32_t lv1_smem_size = Config::LevelInputCount_1; subgroup2::inclusive_scan inclusiveScan1; - if (glsl::gl_SubgroupID() < lv1_num_invoc) + if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) { vector_lv1_t lv1_val; [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); // lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); 
lv1_val = inclusiveScan1(lv1_val); [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); if (Config::electLast()) { - const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(glsl::gl_SubgroupID()); - scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID())); + scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -331,62 +330,62 @@ struct scan { vector_lv2_t lv2_val; [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) + scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); // lv2_val[0] = hlsl::mix(BinOp::identity, lv2_val[0], bool(invocationIndex)); lv2_val = exclusiveScan2(lv2_val); [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template set(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) + scratchAccessor.template set(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); // combine with level 1 - if (glsl::gl_SubgroupID() < lv1_num_invoc) + if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) { vector_lv1_t lv1_val; [unroll] - for (uint32_t i = 0; i < 
Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i), lv1_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i), lv1_val[i]); const scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(lv1_val[Config::ItemsPerInvocation_1-1],1), bool(glsl::gl_SubgroupInvocationID())); scalar_t lv2_scan; - const uint32_t bankedIndex = Config::template sharedStoreIndex<2>(glsl::gl_SubgroupID()); - scratchAccessor.template get(lv1_smem_size+bankedIndex, lv2_scan); + const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID())); + scratchAccessor.template get(lv1_smem_size+bankedIndex, lv2_scan); [unroll] - for (uint32_t i = Config::ItemsPerInvocation_1-1; i > 0; i--) - scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i), binop(lv1_val[i-1],lv2_scan)); - scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, 0), binop(left_last_elem,lv2_scan)); + for (uint16_t i = Config::ItemsPerInvocation_1-1; i > 0; i--) + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i), binop(lv1_val[i-1],lv2_scan)); + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, 0), binop(left_last_elem,lv2_scan)); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); // combine with level 0 [unroll] - for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { vector_lv0_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - 
const uint32_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); scalar_t left; - scratchAccessor.template get(bankedIndex,left); + scratchAccessor.template get(bankedIndex,left); if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); [unroll] - for (uint32_t i = Config::ItemsPerInvocation_0-1; i > 0; i--) + for (uint16_t i = Config::ItemsPerInvocation_0-1; i > 0; i--) value[i] = binop(left, value[i-1]); value[0] = binop(left, left_last_elem); } else { [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + for (uint16_t i = 0; i < Config::ItemsPerInvocation_0; i++) value[i] = binop(left, value[i]); } - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); } } }; From 7b15a544161cd8a6fb2011dac615928922d42c92 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 3 Jun 2025 15:49:02 +0700 Subject: [PATCH 267/346] do inclusive scan on upsweep and shift left on downsweep --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 41 ++++++++++--------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 78ed124baf..d473e466b9 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -179,15 +179,14 @@ struct scan const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan - subgroup2::exclusive_scan exclusiveScan1; + subgroup2::inclusive_scan inclusiveScan1; if (glsl::gl_SubgroupID() == 0) { vector_lv1_t lv1_val; [unroll] for (uint16_t i = 0; i < 
Config::ItemsPerInvocation_1; i++) scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); - // lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); - lv1_val = exclusiveScan1(lv1_val); + lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); @@ -201,9 +200,12 @@ struct scan vector_lv0_t value; dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()), idx); + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()-1u), idx); scalar_t left; - scratchAccessor.template get(bankedIndex,left); + if (idx != 0 || glsl::gl_SubgroupID() != 0) + scratchAccessor.template get(bankedIndex,left); + else + left = BinOp::identity; if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); @@ -245,7 +247,7 @@ struct reduce const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan - const uint32_t lv1_smem_size = Config::__ItemsPerVirtualWorkgroup; + const uint32_t lv1_smem_size = Config::LevelInputCount_1; subgroup2::reduction reduction1; if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) { @@ -311,7 +313,6 @@ struct scan [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); - // lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex)); lv1_val = inclusiveScan1(lv1_val); [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) @@ -325,15 +326,14 @@ struct scan 
scratchAccessor.workgroupExecutionAndMemoryBarrier(); // level 2 scan - subgroup2::exclusive_scan exclusiveScan2; + subgroup2::inclusive_scan inclusiveScan2; if (glsl::gl_SubgroupID() == 0) { vector_lv2_t lv2_val; [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); - // lv2_val[0] = hlsl::mix(BinOp::identity, lv2_val[0], bool(invocationIndex)); - lv2_val = exclusiveScan2(lv2_val); + lv2_val = inclusiveScan2(lv2_val); [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) scratchAccessor.template set(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); @@ -344,20 +344,18 @@ struct scan if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) { vector_lv1_t lv1_val; + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex-uint16_t(1u), Config::ItemsPerInvocation_1-uint16_t(1u)), lv1_val[0]); [unroll] - for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i), lv1_val[i]); - - const scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(lv1_val[Config::ItemsPerInvocation_1-1],1), bool(glsl::gl_SubgroupInvocationID())); + for (uint16_t i = 1; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i-uint16_t(1u)), lv1_val[i]); scalar_t lv2_scan; - const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID())); + const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID()-1u)); scratchAccessor.template get(lv1_smem_size+bankedIndex, lv2_scan); [unroll] - for (uint16_t i = Config::ItemsPerInvocation_1-1; i > 0; i--) - scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i), 
binop(lv1_val[i-1],lv2_scan)); - scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, 0), binop(left_last_elem,lv2_scan)); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i--) + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i), binop(lv1_val[i],lv2_scan)); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -368,9 +366,12 @@ struct scan vector_lv0_t value; dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(glsl::gl_SubgroupID(), idx); + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()-1u), idx); scalar_t left; - scratchAccessor.template get(bankedIndex,left); + if (idx != 0 || glsl::gl_SubgroupID() != 0) + scratchAccessor.template get(bankedIndex,left); + else + left = BinOp::identity; if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); From 37aa99baee12a87bcb351d74988e7a6349317e6e Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 3 Jun 2025 16:46:32 +0700 Subject: [PATCH 268/346] some adjustments to config and func usages --- .../builtin/hlsl/workgroup2/arithmetic.hlsl | 6 ++-- .../hlsl/workgroup2/arithmetic_config.hlsl | 28 +++++++++---------- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 14 ++++------ 3 files changed, 23 insertions(+), 25 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl index 643f8d123e..62a9fb7bef 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -17,7 +17,7 @@ namespace hlsl namespace workgroup2 { -template +template) struct reduction { using scalar_t = typename BinOp::type_t; @@ -30,7 +30,7 @@ struct 
reduction } }; -template +template) struct inclusive_scan { using scalar_t = typename BinOp::type_t; @@ -43,7 +43,7 @@ struct inclusive_scan } }; -template +template) struct exclusive_scan { using scalar_t = typename BinOp::type_t; diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index a9fdcfe0a4..e2cf846d6c 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -36,6 +36,8 @@ struct items_per_invocation NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = BaseItemsPerInvocation; NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value, ItemsPerInvocationProductLog2>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v; + + using ItemsPerInvocation = tuple,integral_constant,integral_constant >; }; } @@ -53,26 +55,24 @@ struct ArithmeticConfiguration static_assert(VirtualWorkgroupSize<=WorkgroupSize*SubgroupSize); using items_per_invoc_t = impl::items_per_invocation; - using ItemsPerInvocation = tuple,integral_constant,integral_constant >; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = items_per_invoc_t::value0; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = items_per_invoc_t::value1; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2; + static_assert(ItemsPerInvocation_2<=4, "4 level scan would have been needed with this config!"); - // NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroupLog2 = mpl::max_v; - // NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroup = uint16_t(0x1u) << __ItemsPerVirtualWorkgroupLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelInputCount_1 = conditional_value>SubgroupSizeLog2), SubgroupSize>, SubgroupSize*ItemsPerInvocation_1>::value; NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelInputCount_2 = conditional_value::value; 
NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = LevelInputCount_1 / ItemsPerInvocation_1; - // user specified the shared mem size of uint32_ts + // user specified the shared mem size of Scalars NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value::value + LevelInputCount_2, + 0 + >::value + LevelInputCount_1 >::value; static bool electLast() @@ -90,8 +90,8 @@ struct ArithmeticConfiguration // get a coalesced index to store for the next level in shared mem, e.g. level 0 -> level 1 // specify the next level to store values for in template param // at level==LevelCount-1, it is guaranteed to have SubgroupSize elements - template - static uint16_t sharedStoreIndex(const uint16_t subgroupID) + template0 && level + template0 && level + template0 && level const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan - const uint32_t lv1_smem_size = Config::LevelInputCount_1; subgroup2::reduction reduction1; if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) { @@ -259,7 +258,7 @@ struct reduce if (Config::electLast()) { const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID())); - scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + scratchAccessor.template set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -271,7 +270,7 @@ struct reduce vector_lv2_t lv2_val; [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); + scratchAccessor.template get(Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); lv2_val = reduction2(lv2_val); if (Config::electLast()) scratchAccessor.template set(0, lv2_val[Config::ItemsPerInvocation_2-1]); @@ -305,7 +304,6 @@ struct scan const uint16_t invocationIndex = 
workgroup::SubgroupContiguousIndex(); // level 1 scan - const uint32_t lv1_smem_size = Config::LevelInputCount_1; subgroup2::inclusive_scan inclusiveScan1; if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) { @@ -320,7 +318,7 @@ struct scan if (Config::electLast()) { const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID())); - scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + scratchAccessor.template set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -332,11 +330,11 @@ struct scan vector_lv2_t lv2_val; [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); + scratchAccessor.template get(Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); lv2_val = inclusiveScan2(lv2_val); [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template set(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); + scratchAccessor.template set(Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -351,7 +349,7 @@ struct scan scalar_t lv2_scan; const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID()-1u)); - scratchAccessor.template get(lv1_smem_size+bankedIndex, lv2_scan); + scratchAccessor.template get(bankedIndex, lv2_scan); [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i--) From eaffe98a29f5a0968bcecb2add6cb27db91d6602 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Tue, 3 Jun 2025 14:43:02 +0200 Subject: [PATCH 269/346] Update compose.yml --- compose.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git 
a/compose.yml b/compose.yml index 3f32e8d1b5..f9444275f4 100644 --- a/compose.yml +++ b/compose.yml @@ -13,4 +13,9 @@ services: - type: bind source: C:\Windows\System32 target: C:\mount\Windows\System32 - read_only: true \ No newline at end of file + read_only: true + +networks: + default: + external: true + name: docker_default From d514007886f35ec82d26d40ab15debcb36548324 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Tue, 3 Jun 2025 15:28:03 +0200 Subject: [PATCH 270/346] Update compose.yml, restart: always to boot after host wakes up --- compose.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/compose.yml b/compose.yml index f9444275f4..c80bdb4319 100644 --- a/compose.yml +++ b/compose.yml @@ -14,6 +14,7 @@ services: source: C:\Windows\System32 target: C:\mount\Windows\System32 read_only: true + restart: always networks: default: From f09ca19f5fff3a089f3ff91be780ab7dbcfd8f98 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Tue, 3 Jun 2025 16:15:38 +0200 Subject: [PATCH 271/346] Update build-nabla.yml, add deploy-production job --- .github/workflows/build-nabla.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 8988fe6df6..3e8e0b4dd0 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -251,3 +251,19 @@ jobs: publish_dir: .badge-public keep_files: true commit_message: "[CI] badges update" + + deploy-production: + name: Deploy to production host + if: ${{ always() && github.ref == 'refs/heads/master' }} + needs: build-windows + runs-on: ubuntu-latest + + steps: + - name: Pull latest images, re-run containers + uses: appleboy/ssh-action@v1 + with: + host: ${{ secrets.CE_HOST }} + username: ${{ secrets.CE_USER }} + key: ${{ secrets.CE_KEY }} + script: | + powershell -NoLogo -NoProfile -ExecutionPolicy Bypass -NoExit 
-File C:\Scripts\startup-docker.ps1 From 3b3d45c83c7ae6f1a4ae05a3fdd69844a4b94bf1 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 4 Jun 2025 01:05:20 +0200 Subject: [PATCH 272/346] always set the callback back, because even if it were empty it needs to be empty again --- src/nbl/video/utilities/CAssetConverter.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 4aa631c746..0ef13633da 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -5344,8 +5344,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul retval.set({params.transfer->scratchSemaphore.semaphore,params.transfer->scratchSemaphore.value}); } // reset original callback - if (bool(origXferStallCallback)) - params.transfer->overflowCallback = std::move(origXferStallCallback); + params.transfer->overflowCallback = std::move(origXferStallCallback); // Its too dangerous to leave an Intended Transfer Submit hanging around that needs to be submitted for Compute to make forward progress outside of this utility, // and doing transfer-signals-after-compute-wait timeline sema tricks are not and option because: From da6c3134e342eb37517f78974c8febe5e26ec2ca Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 4 Jun 2025 11:14:06 +0700 Subject: [PATCH 273/346] split out level 0 scans into its own struct --- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 79 +++++++++++-------- 1 file changed, 47 insertions(+), 32 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 4edb5ae9ff..329542fa18 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -77,16 +77,15 @@ struct scan } }; -// 2-level scans +// do level 0 scans for 2- and 3-level scans (same code) template -struct reduce 
+struct reduce_level0 { using scalar_t = typename BinOp::type_t; - using vector_lv0_t = vector; // data accessor needs to be this type - using vector_lv1_t = vector; + using vector_t = vector; // data accessor needs to be this type - template - static void __doLevel0(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + template + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 0 scan @@ -104,7 +103,45 @@ struct reduce } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); + }; +}; + +template +struct scan_level0 +{ + using scalar_t = typename BinOp::type_t; + using vector_t = vector; // data accessor needs to be this type + + template + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + subgroup2::inclusive_scan inclusiveScan0; + // level 0 scan + [unroll] + for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + vector_t value; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + value = inclusiveScan0(value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + if (Config::electLast()) + { + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()), idx); + scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); } +}; + +// 2-level scans +template +struct reduce +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using 
vector_lv1_t = vector; template scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) @@ -114,7 +151,7 @@ struct reduce using params_lv1_t = subgroup2::ArithmeticParams; BinOp binop; - __doLevel0(dataAccessor, scratchAccessor); + reduce_level0::template __call(dataAccessor, scratchAccessor); const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan @@ -145,28 +182,6 @@ struct scan using vector_lv0_t = vector; // data accessor needs to be this type using vector_lv1_t = vector; - template - static void __doLevel0(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) - { - const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); - subgroup2::inclusive_scan inclusiveScan0; - // level 0 scan - [unroll] - for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) - { - vector_t value; - dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - value = inclusiveScan0(value); - dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); - if (Config::electLast()) - { - const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()), idx); - scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan - } - } - scratchAccessor.workgroupExecutionAndMemoryBarrier(); - } - template void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { @@ -175,7 +190,7 @@ struct scan using params_lv1_t = subgroup2::ArithmeticParams; BinOp binop; - __doLevel0(dataAccessor, scratchAccessor); + scan_level0::template __call(dataAccessor, scratchAccessor); const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan @@ -243,7 +258,7 @@ struct 
reduce using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; - reduce::template __doLevel0(dataAccessor, scratchAccessor); + reduce_level0::template __call(dataAccessor, scratchAccessor); const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan @@ -300,7 +315,7 @@ struct scan using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; - scan::template __doLevel0(dataAccessor, scratchAccessor); + scan_level0::template __call(dataAccessor, scratchAccessor); const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan From e230d06aaea58f47d7ec5059990f862c4230c246 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 4 Jun 2025 15:34:40 +0700 Subject: [PATCH 274/346] fixes to 3 level scan --- .../builtin/hlsl/workgroup2/arithmetic_config.hlsl | 6 +++++- include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 12 +++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index e2cf846d6c..aecd489beb 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -116,8 +116,12 @@ struct ArithmeticConfiguration template0 && level if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) { vector_lv1_t lv1_val; - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex-uint16_t(1u), Config::ItemsPerInvocation_1-uint16_t(1u)), lv1_val[0]); [unroll] - for (uint16_t i = 1; i < Config::ItemsPerInvocation_1; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i-uint16_t(1u)), lv1_val[i]); + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i), lv1_val[i]); scalar_t lv2_scan; const uint16_t bankedIndex = Config::template 
sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID()-1u)); - scratchAccessor.template get(bankedIndex, lv2_scan); + if (glsl::gl_SubgroupID() != 0) + scratchAccessor.template get(bankedIndex, lv2_scan); + else + lv2_scan = BinOp::identity; [unroll] - for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i--) + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i), binop(lv1_val[i],lv2_scan)); } scratchAccessor.workgroupExecutionAndMemoryBarrier(); From 3da175daca07a49ffad2672b1d3e74b46221e13b Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 5 Jun 2025 10:53:40 +0700 Subject: [PATCH 275/346] padding to shared mem indexing to avoid bank conflict --- .../hlsl/workgroup2/arithmetic_config.hlsl | 33 ++++++++++--------- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 7 +--- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index aecd489beb..0177863b11 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -70,10 +70,11 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value::value + LevelInputCount_1 >::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t __padding = conditional_value::value; static bool electLast() { @@ -90,22 +91,22 @@ struct ArithmeticConfiguration // get a coalesced index to store for the next level in shared mem, e.g. 
level 0 -> level 1 // specify the next level to store values for in template param // at level==LevelCount-1, it is guaranteed to have SubgroupSize elements - template0 && level// NBL_FUNC_REQUIRES(level>0 && level0 && level// NBL_FUNC_REQUIRES(level>0 && level0 && level// NBL_FUNC_REQUIRES(level>0 && level [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); - if (Config::electLast()) - { - const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID())); - scratchAccessor.template set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); - } } scratchAccessor.workgroupExecutionAndMemoryBarrier(); @@ -345,7 +340,7 @@ struct scan vector_lv2_t lv2_val; [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(((invocationIndex*Config::ItemsPerInvocation_1)+i+1)*Config::SubgroupSize-1, Config::ItemsPerInvocation_1-1),lv2_val[i]); lv2_val = inclusiveScan2(lv2_val); [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) From 32732e784f835787f724593675c9445bd0742ed7 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 5 Jun 2025 12:16:23 +0700 Subject: [PATCH 276/346] fix padding bugs --- include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 79c62399d2..80dec1b85c 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -337,10 +337,11 @@ struct scan subgroup2::inclusive_scan inclusiveScan2; if (glsl::gl_SubgroupID() == 0) { + const uint16_t one = uint16_t(1u); vector_lv2_t lv2_val; 
[unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>(((invocationIndex*Config::ItemsPerInvocation_1)+i+1)*Config::SubgroupSize-1, Config::ItemsPerInvocation_1-1),lv2_val[i]); + scratchAccessor.template get(Config::template sharedLoadIndex<1>((invocationIndex*Config::ItemsPerInvocation_2+i+one)*Config::SubgroupSize-one, Config::ItemsPerInvocation_1-one),lv2_val[i]); lv2_val = inclusiveScan2(lv2_val); [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) From 1fc684d74ce6463944bb7817959992f183d23dc2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 6 Jun 2025 17:36:04 +0700 Subject: [PATCH 277/346] Fix AssetConverter after merge --- src/nbl/video/utilities/CAssetConverter.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index d07e305777..8204a61e27 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2415,8 +2415,12 @@ struct conversions_t if (!deferredAllocator->request(output,constrainMask)) return; } - // set debug names on everything! - setDebugName(conv,output->get(),contentHash,uniqueCopyGroupID); + + if constexpr (!std::is_same_v) + { + // set debug names on everything + setDebugName(conv,output->get(),contentHash,uniqueCopyGroupID); + } } // Since the dfsCache has the original asset pointers as keys, we map in reverse (multiple `instance_t` can map to the same unique content hash and GPU object) @@ -3042,10 +3046,6 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult .writeCache = inputs.writeShaderCache }; - // no one depend on the converted IShaders so we need to hold a smart ptr into them somewhere. 
- // This is to prevent m_stagingCache to hold a dangling pointer into IShader - retval.m_shaders.reserve(gpuObjUniqueCopyGroupIDs.size()); - for (auto& entry : conversionRequests.contentHashToCanonical) for (auto i=0ull; i SReserveResult pruneStaging.template operator()(); pruneStaging.template operator()(); pruneStaging.template operator()(); - pruneStaging.template operator()(); + pruneStaging.template operator()(); pruneStaging.template operator()(); pruneStaging.template operator()(); pruneStaging.template operator()(); @@ -3667,7 +3667,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul }; // wipe gpu item in staging cache (this may drop it as well if it was made for only a root asset == no users) - core::unordered_map outputReverseMap; + core::unordered_map outputReverseMap; core::for_each_in_tuple(reservations.m_gpuObjects,[&outputReverseMap](const auto& gpuObjects)->void { uint32_t i = 0; From 7a2065aacd811cb5a2e56e97fbedc4e5fbfeccb9 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 9 Jun 2025 13:48:39 +0700 Subject: [PATCH 278/346] update to latest example --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 6581ed496d..1710b69862 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 6581ed496d2fc41cae1dc5c9ceba10f3bdfc5135 +Subproject commit 1710b698621796aa767edf7bc940e55e6758c2a8 From 5c2f55b34235ceb4e9e62d37522a78ac9e6c74b0 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 14:27:28 +0700 Subject: [PATCH 279/346] Fix pipeline creation in full screen triangle pass --- .../ext/FullScreenTriangle/FullScreenTriangle.h | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h index 4e7147c904..1abebf23ea 100644 --- a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h +++ 
b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h @@ -40,7 +40,7 @@ struct ProtoPipeline final inline operator bool() const {return m_vxShader.get();} inline core::smart_refctd_ptr createPipeline( - const asset::IPipelineBase::SShaderSpecInfo& fragShader, + const video::IGPUPipelineBase::SShaderSpecInfo& fragShader, video::IGPUPipelineLayout* layout, video::IGPURenderpass* renderpass, const uint32_t subpassIx=0, @@ -58,17 +58,13 @@ struct ProtoPipeline final { const auto orientationAsUint32 = static_cast(swapchainTransform); - asset::IPipelineBase::SShaderSpecInfo::spec_constant_map_t specConstants; - specConstants[0] = {.data=&orientationAsUint32,.size=sizeof(orientationAsUint32)}; - - const asset::IPipelineBase::SShaderSpecInfo shaders[2] = { - {.shader=m_vxShader.get(), .entryPoint = "main" ,.stage = hlsl::ESS_VERTEX,.entries=&specConstants}, - fragShader - }; + IGPUPipelineBase::SShaderEntryMap specConstants; + specConstants[0] = std::span{ reinterpret_cast(&orientationAsUint32), sizeof(orientationAsUint32)}; IGPUGraphicsPipeline::SCreationParams params[1]; params[0].layout = layout; - params[0].shaders = shaders; + params[0].vertexShader = { .shader = m_vxShader.get(), .entryPoint = "main", .entries = &specConstants }; + params[0].fragmentShader = fragShader; params[0].cached = { .vertexInput = {}, // The Full Screen Triangle doesn't use any HW vertex input state .primitiveAssembly = {}, From 03f7bc7548fb97f5a7dd9c950997f14fb13521e1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 14:30:31 +0700 Subject: [PATCH 280/346] Fix descriptor set casting for const counterpart --- include/nbl/asset/ICPUDescriptorSet.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index 2498a438ca..c7f54360ac 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -92,6 +92,13 @@ class NBL_API2 ICPUDescriptorSet 
final : public IDescriptorSet, ICPUDescriptorSet>) static auto computeDependantsImpl(Self* self) { using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; + + using cpu_buffer_ptr_t = std::conditional_t, const ICPUBuffer*, ICPUBuffer*>; + using cpu_sampler_ptr_t = std::conditional_t, const ICPUSampler*, ICPUSampler*>; + using cpu_image_view_ptr_t = std::conditional_t, const ICPUImageView*, ICPUImageView*>; + using cpu_buffer_view_ptr_t = std::conditional_t, const ICPUBufferView*, ICPUBufferView*>; + using cpu_tlas_ptr_t = std::conditional_t, const ICPUTopLevelAccelerationStructure*, ICPUTopLevelAccelerationStructure*>; + core::unordered_set dependants = { self->m_layout.get() }; for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) { @@ -104,15 +111,15 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet(i))) { case IDescriptor::EC_BUFFER: - dependants.insert(static_cast(desc)); + dependants.insert(static_cast(desc)); case IDescriptor::EC_SAMPLER: - dependants.insert(static_cast(desc)); + dependants.insert(static_cast(desc)); case IDescriptor::EC_IMAGE: - dependants.insert(static_cast(desc)); + dependants.insert(static_cast(desc)); case IDescriptor::EC_BUFFER_VIEW: - dependants.insert(static_cast(desc)); + dependants.insert(static_cast(desc)); case IDescriptor::EC_ACCELERATION_STRUCTURE: - dependants.insert(static_cast(desc)); + dependants.insert(static_cast(desc)); default: break; } From aeebe3679492232275ee7dd28255a336d5af0006 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 14:31:27 +0700 Subject: [PATCH 281/346] Fix entries traversal in gpu pipeline --- include/nbl/video/IGPUPipeline.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index 0b56b87ee9..5a160fb2b2 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -61,11 +61,14 @@ class IGPUPipelineBase { // 
https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-module-08987 int64_t specData = 0; - for (const auto& entry : *entries) + if (entries) { - if (!entry.second.size()) - return INVALID_SPEC_INFO; - specData += entry.second.size(); + for (const auto& entry : *entries) + { + if (!entry.second.size()) + return INVALID_SPEC_INFO; + specData += entry.second.size(); + } } if (specData>0x7fffffff) return INVALID_SPEC_INFO; From b65f14fad0ab3405d43fa5b8da313e50a5cb807e Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 14:31:44 +0700 Subject: [PATCH 282/346] move SHitGroup to outside SCreationParams --- include/nbl/video/IGPURayTracingPipeline.h | 13 +++++++------ src/nbl/video/ILogicalDevice.cpp | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index 3bcd4537f3..4b92db329b 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -15,18 +15,19 @@ class IGPURayTracingPipeline : public IGPUPipeline; public: + struct SHitGroup + { + SShaderSpecInfo closestHit; + SShaderSpecInfo anyHit; + SShaderSpecInfo intersection; + }; + struct SCreationParams : public SPipelineCreationParams { using FLAGS = pipeline_t::FLAGS; struct SShaderGroupsParams { - struct SHitGroup - { - SShaderSpecInfo closestHit; - SShaderSpecInfo anyHit; - SShaderSpecInfo intersection; - }; SShaderSpecInfo raygen; std::span misses; diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 0056cc3a2a..cbfee667cf 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -1094,7 +1094,7 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline core::vector debloatedMissSpecs(missGroupCount); auto debloatedMissSpecData = debloatedMissSpecs.data(); - core::vector 
debloatedHitSpecs(hitGroupCount); + core::vector debloatedHitSpecs(hitGroupCount); auto debloatedHitSpecData = debloatedHitSpecs.data(); core::vector debloatedCallableSpecs(callableGroupCount); auto debloatedCallableSpecData = debloatedCallableSpecs.data(); From c5f947947b1d0c07f244fe8ab5f941e176b548bf Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 14:32:33 +0700 Subject: [PATCH 283/346] Fix ray tracing pipeline creation --- src/nbl/video/CVulkanLogicalDevice.cpp | 33 ++++++++++++++++++++------ 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 24f5ae60b2..89f7ab1da3 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -1473,7 +1473,7 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( ) { using SShaderGroupParams = IGPURayTracingPipeline::SCreationParams::SShaderGroupsParams; - using SHitShaderGroup = SShaderGroupParams::SHitGroup; + using SHitShaderGroup = IGPURayTracingPipeline::SHitGroup; const auto dynamicStates = std::array{ VK_DYNAMIC_STATE_RAY_TRACING_PIPELINE_STACK_SIZE_KHR }; const VkPipelineDynamicStateCreateInfo vk_dynamicStateCreateInfo = { @@ -1518,7 +1518,10 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( { core::unordered_map shaderIndexes; auto getVkShaderIndex = [&](const asset::IShader* shader) - { return shader == nullptr ? VK_SHADER_UNUSED_KHR : shaderIndexes[shader]; }; + { + const auto index = shader == nullptr ? 
VK_SHADER_UNUSED_KHR : shaderIndexes[shader]; + return index; + }; auto getGeneralVkRayTracingShaderGroupCreateInfo = [getVkShaderIndex](IGPUPipelineBase::SShaderSpecInfo spec) -> VkRayTracingShaderGroupCreateInfoKHR { @@ -1553,23 +1556,39 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( if (!spec.shader) return; if (shaderIndexes.find(spec.shader) == shaderIndexes.end()) { - shaderIndexes.insert({ spec.shader, static_cast(std::distance(outShaderStage, vk_shaderStage.data()))}); - *(outShaderStage++) = getVkShaderStageCreateInfoFrom(spec, shaderStage, false, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo,outSpecMapEntry,outSpecData); + shaderIndexes.insert({ spec.shader, std::distancepStages)>(outCreateInfo->pStages, outShaderStage)}); + *(outShaderStage) = getVkShaderStageCreateInfoFrom(spec, shaderStage, false, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo,outSpecMapEntry,outSpecData); + outShaderStage++; } }; - processSpecInfo(info.shaderGroups.raygen, hlsl::ESS_RAYGEN); - outCreateInfo->stageCount = std::distancepStages)>(outCreateInfo->pStages,outShaderStage); - assert(outCreateInfo->stageCount != 0); const auto& shaderGroups = info.shaderGroups; outCreateInfo->pGroups = outShaderGroup; + processSpecInfo(info.shaderGroups.raygen, hlsl::ESS_RAYGEN); *(outShaderGroup++) = getGeneralVkRayTracingShaderGroupCreateInfo(shaderGroups.raygen); + for (const auto& shaderGroup : shaderGroups.misses) + { + processSpecInfo(shaderGroup, hlsl::ESS_MISS); *(outShaderGroup++) = getGeneralVkRayTracingShaderGroupCreateInfo(shaderGroup); + } + for (const auto& shaderGroup : shaderGroups.hits) + { + processSpecInfo(shaderGroup.closestHit, hlsl::ESS_CLOSEST_HIT); + processSpecInfo(shaderGroup.anyHit, hlsl::ESS_ANY_HIT); + processSpecInfo(shaderGroup.intersection, hlsl::ESS_INTERSECTION); *(outShaderGroup++) = getHitVkRayTracingShaderGroupCreateInfo(shaderGroup); + } + for (const auto& shaderGroup : shaderGroups.callables) + 
{ + processSpecInfo(shaderGroup, hlsl::ESS_CALLABLE); *(outShaderGroup++) = getGeneralVkRayTracingShaderGroupCreateInfo(shaderGroup); + } + + outCreateInfo->stageCount = std::distancepStages)>(outCreateInfo->pStages,outShaderStage); + assert(outCreateInfo->stageCount != 0); outCreateInfo->groupCount = 1 + shaderGroups.hits.size() + shaderGroups.misses.size() + shaderGroups.callables.size(); outCreateInfo->maxPipelineRayRecursionDepth = info.cached.maxRecursionDepth; if (info.cached.dynamicStackSize) From edefa6724f4ffa6d6c2adf9c968c8d51691358b2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 14:32:51 +0700 Subject: [PATCH 284/346] Fix imgui pass --- src/nbl/ext/ImGui/ImGui.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/nbl/ext/ImGui/ImGui.cpp b/src/nbl/ext/ImGui/ImGui.cpp index b40c7155be..f477e96cdf 100644 --- a/src/nbl/ext/ImGui/ImGui.cpp +++ b/src/nbl/ext/ImGui/ImGui.cpp @@ -342,17 +342,13 @@ core::smart_refctd_ptr UI::createPipeline(SCreation core::smart_refctd_ptr pipeline; { - const IPipelineBase::SShaderSpecInfo specs[] = - { - {.shader = shaders.vertex.get(), .entryPoint = "VSMain", .stage = hlsl::ShaderStage::ESS_VERTEX}, - {.shader = shaders.fragment.get(), .entryPoint = "PSMain", .stage = hlsl::ShaderStage::ESS_FRAGMENT} - }; IGPUGraphicsPipeline::SCreationParams params[1]; { auto& param = params[0u]; + param.vertexShader = { .shader = shaders.vertex.get(), .entryPoint = "VSMain" }; + param.fragmentShader = { .shader = shaders.fragment.get(), .entryPoint = "PSMain" }; param.layout = pipelineLayout.get(); - param.shaders = specs; param.renderpass = creationParams.renderpass.get(); param.cached = { .vertexInput = vertexInputParams, .primitiveAssembly = primitiveAssemblyParams, .rasterization = rasterizationParams, .blend = blendParams, .subpassIx = creationParams.subpassIx }; }; From 67c4d8dac7c82c45d2a8575361ec21e6708f2d3e Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 14:33:09 +0700 Subject: 
[PATCH 285/346] Add assert shader stage --- src/nbl/video/utilities/CAssetConverter.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 41f57e3b11..147a76bdd4 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1040,6 +1040,7 @@ class HashVisit : public CAssetConverter::CHashCache::hash_impl_base { const auto stage = std::get<1>(argTuple); hasher << arg0.entryPoint; + assert(hlsl::bitCount(stage) == 1); hasher << stage; hasher << arg0.requiredSubgroupSize; if (!arg0.entries.empty()) From bc9aed64c6c32414d86101165bf828f39b00a5ed Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 14:36:14 +0700 Subject: [PATCH 286/346] use core::bitflag::hasFlags instead of & --- include/nbl/video/IGPURayTracingPipeline.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index 4b92db329b..7151f8f227 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -109,7 +109,7 @@ class IGPURayTracingPipeline : public IGPUPipeline Date: Mon, 9 Jun 2025 14:39:51 +0700 Subject: [PATCH 287/346] Add whether shader is null when cloning SShaderSpecInfo --- include/nbl/asset/ICPUPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 0642acb676..8b9fec34c4 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -85,7 +85,7 @@ class ICPUPipelineBase SShaderSpecInfo clone(uint32_t depth) const { auto newSpecInfo = *this; - if (depth > 0u) + if (newSpecInfo.shader.get() != nullptr && depth > 0u) { newSpecInfo.shader = core::smart_refctd_ptr_static_cast(this->shader->clone(depth - 1u)); } From 35815d2a8d25b3d2a5e3281f6b2a411d8b8c31f6 Mon Sep 17 00:00:00 2001 
From: kevyuu Date: Mon, 9 Jun 2025 15:23:18 +0700 Subject: [PATCH 288/346] Small improvement on ILogicalDevice --- src/nbl/video/ILogicalDevice.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index cbfee667cf..975151ddbd 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -29,7 +29,9 @@ class SpirvDebloatTask IGPUPipelineBase::SShaderSpecInfo debloat(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, core::vector>& outShaders) { const auto* shader = shaderSpec.shader; - const auto& entryPoints = m_entryPointsMap[shader]; + const auto findResult = m_entryPointsMap.find(shader); + assert(findResult != m_entryPointsMap.end()); + const auto& entryPoints = findResult->second; auto debloatedShaderSpec = shaderSpec; if (shader != nullptr) @@ -1128,14 +1130,14 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline newParams[ix] = param; newParams[ix].shaderGroups.raygen = debloatTask.debloat(param.shaderGroups.raygen, debloatedShaders); - newParams[ix].shaderGroups.misses = { debloatedMissSpecData, param.shaderGroups.misses.size() }; + newParams[ix].shaderGroups.misses = debloatedMissSpecs; for (const auto& miss: param.shaderGroups.misses) { *debloatedMissSpecData = debloatTask.debloat(miss, debloatedShaders); debloatedMissSpecData++; } - newParams[ix].shaderGroups.hits = { debloatedHitSpecData, param.shaderGroups.hits.size() }; + newParams[ix].shaderGroups.hits = debloatedHitSpecs; for (const auto& hit: param.shaderGroups.hits) { *debloatedHitSpecData = { @@ -1146,7 +1148,7 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline debloatedHitSpecData++; } - newParams[ix].shaderGroups.callables = { debloatedCallableSpecData, param.shaderGroups.callables.size() }; + newParams[ix].shaderGroups.callables = debloatedCallableSpecs; for (const auto& callable: 
param.shaderGroups.callables) { *debloatedCallableSpecData = debloatTask.debloat(callable, debloatedShaders); From 66c87a0040ca652ea0bf3cce7825c08b2cc2f0b0 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 15:24:11 +0700 Subject: [PATCH 289/346] Small improvement on SShaderSpecInfo::create --- include/nbl/video/IGPUPipeline.h | 7 ++++--- src/nbl/video/utilities/CAssetConverter.cpp | 12 ++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index 5a160fb2b2..96ee843296 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -109,17 +109,18 @@ class IGPUPipelineBase { // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 - static inline SShaderSpecInfo create(const asset::ICPUPipelineBase::SShaderSpecInfo& cpuSpecInfo, entry_map_t& outEntries) + static inline SShaderSpecInfo create(const asset::ICPUPipelineBase::SShaderSpecInfo& cpuSpecInfo, entry_map_t* outEntries) { SShaderSpecInfo specInfo; specInfo.shader = cpuSpecInfo.shader.get(); specInfo.entryPoint = cpuSpecInfo.entryPoint; specInfo.requiredSubgroupSize = cpuSpecInfo.requiredSubgroupSize; + outEntries->clear(); for (const auto&[key, value] : cpuSpecInfo.entries) { - outEntries.insert({ key, { value.data(), value.size() } }); + outEntries->insert({ key, { value.data(), value.size() } }); } - specInfo.entries = &outEntries; + specInfo.entries = outEntries; return specInfo; }; }; diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 147a76bdd4..b53dc54262 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -3203,7 +3203,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult IGPUComputePipeline::SCreationParams params = {}; params.layout = 
visitor.layout; // while there are patches possible for shaders, the only patch which can happen here is changing a stage from UNKNOWN to COMPUTE - params.shader = IGPUPipelineBase::SShaderSpecInfo::create(visitor.getSpecInfo(), entryMap); + params.shader = IGPUPipelineBase::SShaderSpecInfo::create(visitor.getSpecInfo(), &entryMap); device->createComputePipelines(inputs.pipelineCache,{¶ms,1},&ppln); } conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); @@ -3268,11 +3268,11 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult tmpSpecInfo.push_back(std::move(info)); } using GPUShaderSpecInfo = IGPUPipelineBase::SShaderSpecInfo; - params.vertexShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_VERTEX), vertexEntryMap); - params.tesselationControlShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_TESSELLATION_CONTROL), tesselationControlEntryMap); - params.tesselationEvaluationShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_TESSELLATION_EVALUATION), tesselationEvaluationEntryMap); - params.geometryShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_GEOMETRY), geometryEntryMap); - params.fragmentShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_FRAGMENT), fragmentEntryMap); + params.vertexShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_VERTEX), &vertexEntryMap); + params.tesselationControlShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_TESSELLATION_CONTROL), &tesselationControlEntryMap); + params.tesselationEvaluationShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_TESSELLATION_EVALUATION), &tesselationEvaluationEntryMap); + params.geometryShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_GEOMETRY), &geometryEntryMap); + params.fragmentShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_FRAGMENT), &fragmentEntryMap); } params.cached = asset->getCachedCreationParams(); 
device->createGraphicsPipelines(inputs.pipelineCache,{¶ms,1},&ppln); From 01dced9a007c5e4592aa65d77af9c6c14aab5b72 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 9 Jun 2025 15:24:36 +0700 Subject: [PATCH 290/346] Skip null node --- include/nbl/asset/IPreHashed.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/nbl/asset/IPreHashed.h b/include/nbl/asset/IPreHashed.h index 86e1841f61..94fb9a7d2d 100644 --- a/include/nbl/asset/IPreHashed.h +++ b/include/nbl/asset/IPreHashed.h @@ -46,8 +46,6 @@ class IPreHashed : public IAsset core::unordered_set alreadyDescended; // whether we have push the children to the stack auto push = [&stack,&alreadyVisited](IAsset* node) -> void { - if (!node) - return; const auto [dummy,inserted] = alreadyVisited.insert(node); if (inserted) stack.push(node); From ce77b462813cb4bb18ef26d6c02027514536e55a Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 9 Jun 2025 16:52:17 +0700 Subject: [PATCH 291/346] uncomment some concept requires --- include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 0177863b11..e11e238130 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -91,7 +91,7 @@ struct ArithmeticConfiguration // get a coalesced index to store for the next level in shared mem, e.g. 
level 0 -> level 1 // specify the next level to store values for in template param // at level==LevelCount-1, it is guaranteed to have SubgroupSize elements - template// NBL_FUNC_REQUIRES(level>0 && level0 && level// NBL_FUNC_REQUIRES(level>0 && level0 && level// NBL_FUNC_REQUIRES(level>0 && level0 && level Date: Wed, 11 Jun 2025 14:06:36 +0700 Subject: [PATCH 292/346] Add renderpass to constructor parameter of cpu graphics pipeline --- include/nbl/asset/ICPUGraphicsPipeline.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index a17bebe87d..8e338020ab 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -20,9 +20,9 @@ class ICPUGraphicsPipeline final : public ICPUPipeline create(ICPUPipelineLayout* layout) + static core::smart_refctd_ptr create(ICPUPipelineLayout* layout, ICPURenderpass* renderpass = nullptr) { - auto retval = new ICPUGraphicsPipeline(layout); + auto retval = new ICPUGraphicsPipeline(layout, renderpass); return core::smart_refctd_ptr(retval,core::dont_grab); } @@ -79,8 +79,8 @@ class ICPUGraphicsPipeline final : public ICPUPipeline m_specInfos; private: - explicit ICPUGraphicsPipeline(ICPUPipelineLayout* layout) - : base_t(layout, {}, {}) + explicit ICPUGraphicsPipeline(ICPUPipelineLayout* layout, ICPURenderpass* renderpass) + : base_t(layout, {}, renderpass) {} static inline int8_t stageToIndex(const hlsl::ShaderStage stage) @@ -110,9 +110,8 @@ class ICPUGraphicsPipeline final : public ICPUPipeline clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { - auto* newPipeline = new ICPUGraphicsPipeline(layout.get()); + auto* newPipeline = new ICPUGraphicsPipeline(layout.get(), m_renderpass.get()); newPipeline->m_params = m_params; - newPipeline->m_renderpass = m_renderpass; for (auto specInfo_i = 0u; specInfo_i < m_specInfos.size(); specInfo_i++) { From 
ccecd470a645b56f4b7296bc4dd1115f259c5c87 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 14:07:11 +0700 Subject: [PATCH 293/346] Fix overload error --- include/nbl/asset/ICPUComputePipeline.h | 17 ++++++++++++- include/nbl/asset/ICPUGraphicsPipeline.h | 28 ++++++++++++++++++++- include/nbl/asset/ICPUPipeline.h | 6 +++-- include/nbl/asset/ICPURayTracingPipeline.h | 17 ++++++++++++- src/nbl/asset/utils/CSPIRVIntrospector.cpp | 2 +- src/nbl/video/utilities/CAssetConverter.cpp | 15 +++-------- src/nbl/video/utilities/CComputeBlit.cpp | 4 +-- 7 files changed, 69 insertions(+), 20 deletions(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 69bffe2bba..cc05e6c762 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -15,6 +15,7 @@ namespace nbl::asset //! CPU Version of Compute Pipeline class ICPUComputePipeline final : public ICPUPipeline> { + using pipeline_base_t = IComputePipeline; using base_t = ICPUPipeline>; public: @@ -46,6 +47,11 @@ class ICPUComputePipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) + { + return base_t::getSpecInfos(stage); + } + inline SShaderSpecInfo& getSpecInfo() { return m_specInfo; @@ -56,7 +62,16 @@ class ICPUComputePipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) + { + return base_t::getSpecInfos(stage); + } + + SShaderSpecInfo* getSpecInfo(hlsl::ShaderStage stage) + { + if (!isMutable()) return nullptr; + const auto stageIndex = stageToIndex(stage); + if (stageIndex != -1) + return &m_specInfos[stageIndex]; + return nullptr; + } + + const SShaderSpecInfo* getSpecInfo(hlsl::ShaderStage stage) const + { + const auto stageIndex = stageToIndex(stage); + if (stageIndex != -1) + return &m_specInfos[stageIndex]; + return nullptr; + } inline virtual bool valid() const override final { diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 
8b9fec34c4..7003beeee7 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -132,10 +132,12 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe } // Note(kevinyu): For some reason overload resolution cannot find this function when I name id getSpecInfos. It always use the const variant. Will check on it later. - inline std::span getSpecInfoMut(hlsl::ShaderStage stage) + inline std::span getSpecInfos(hlsl::ShaderStage stage) { if (!isMutable()) return {}; - const auto specInfo = const_cast(this)->getSpecInfos(stage); + const this_t* constPipeline = const_cast(this); + const ICPUPipelineBase* basePipeline = constPipeline; + const auto specInfo = basePipeline->getSpecInfos(stage); return { const_cast(specInfo.data()), specInfo.size() }; } diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 1296d8359a..0c448b06b1 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -44,7 +44,7 @@ class ICPURayTracingPipeline final : public ICPUPipeline getSpecInfo(hlsl::ShaderStage stage) const override final + inline virtual std::span getSpecInfos(hlsl::ShaderStage stage) const override final { switch (stage) { @@ -65,6 +65,11 @@ class ICPURayTracingPipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) + { + return base_t::getSpecInfos(stage); + } + inline core::vector* getSpecInfoVec(hlsl::ShaderStage stage) { if (!isMutable()) return nullptr; @@ -95,6 +100,16 @@ class ICPURayTracingPipeline final : public ICPUPipeline CSPIRVIntrospector::createApproximat } auto pipeline = ICPUComputePipeline::create(layout.get()); - pipeline->getSpecInfoMut(hlsl::ShaderStage::ESS_COMPUTE)[0] = info; + pipeline->getSpecInfos(hlsl::ShaderStage::ESS_COMPUTE)[0] = info; return pipeline; } diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 
b53dc54262..d8ce147820 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -536,8 +536,8 @@ class AssetVisitor : public CRTP using stage_t = hlsl::ShaderStage; for (stage_t stage : {stage_t::ESS_VERTEX,stage_t::ESS_TESSELLATION_CONTROL,stage_t::ESS_TESSELLATION_EVALUATION,stage_t::ESS_GEOMETRY,stage_t::ESS_FRAGMENT}) { - const auto& specInfo = asset->getSpecInfos(stage); - const auto* shader = specInfo[0].shader.get(); + const auto& specInfo = *asset->getSpecInfo(stage); + const auto* shader = specInfo.shader.get(); if (!shader) { if (stage==stage_t::ESS_VERTEX) // required @@ -545,7 +545,7 @@ class AssetVisitor : public CRTP CRTP::template nullOptional(); continue; } - if (!descend(shader,{shader},specInfo[0], stage)) + if (!descend(shader,{shader}, specInfo, stage)) return false; } return true; @@ -3226,8 +3226,6 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult } if constexpr (std::is_same_v) { - core::vector tmpSpecInfo; - tmpSpecInfo.reserve(5); for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUGraphicsPipeline* asset = entry.second.canonicalAsset; @@ -3259,14 +3257,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult params.layout = visitor.layout; params.renderpass = visitor.renderpass; // while there are patches possible for shaders, the only patch which can happen here is changing a stage from UNKNOWN to match the slot here - tmpSpecInfo.clear(); using stage_t = hlsl::ShaderStage; - for (stage_t stage : {stage_t::ESS_VERTEX,stage_t::ESS_TESSELLATION_CONTROL,stage_t::ESS_TESSELLATION_EVALUATION,stage_t::ESS_GEOMETRY,stage_t::ESS_FRAGMENT}) - { - auto& info = visitor.getSpecInfo(stage); - if (info.shader) - tmpSpecInfo.push_back(std::move(info)); - } using GPUShaderSpecInfo = IGPUPipelineBase::SShaderSpecInfo; params.vertexShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_VERTEX), &vertexEntryMap); 
params.tesselationControlShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_TESSELLATION_CONTROL), &tesselationControlEntryMap); diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp index ade127b790..924c337cbe 100644 --- a/src/nbl/video/utilities/CComputeBlit.cpp +++ b/src/nbl/video/utilities/CComputeBlit.cpp @@ -78,12 +78,12 @@ struct ConstevalParameters } auto pipeline = ICPUComputePipeline::create(layout); - pipeline->getSpecInfoMut(ESS_COMPUTE)[0] = { + pipeline->getSpecInfo() = { .shader = shader, .entryPoint = "main", .requiredSubgroupSize = static_cast(findMSB(limits.maxSubgroupSize)), }; - pipeline->getCachedCreationParamsMut() = { + pipeline->getCachedCreationParams() = { .requireFullSubgroups = true, }; return pipeline; From 99f2d49295810801e5f33d5bebb0a8de2a9e38b9 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 14:07:40 +0700 Subject: [PATCH 294/346] Optimize SpirvDebloaterTask to use only one map --- src/nbl/video/ILogicalDevice.cpp | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 975151ddbd..75b36ce889 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -10,7 +10,12 @@ using namespace nbl::video; class SpirvDebloatTask { public: - using EntryPoints = core::set; + using EntryPoints = core::set; + struct ShaderInfo + { + EntryPoints entryPoints; + const asset::IShader* debloatedShaders; + }; SpirvDebloatTask(asset::ISPIRVDebloater* debloater, system::logger_opt_ptr logger) : m_debloater(debloater), m_logger(logger) { @@ -20,38 +25,37 @@ class SpirvDebloatTask void insertEntryPoint(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, hlsl::ShaderStage stage) { const auto* shader = shaderSpec.shader; - auto it = m_entryPointsMap.find(shader); - if (it == m_entryPointsMap.end() || it->first != shader) - it = 
m_entryPointsMap.emplace_hint(it, shader, EntryPoints()); - it->second.insert({ .name = shaderSpec.entryPoint, .stage = stage }); + auto it = m_shaderInfoMap.find(shader); + if (it == m_shaderInfoMap.end() || it->first != shader) + it = m_shaderInfoMap.emplace_hint(it, shader, ShaderInfo{ EntryPoints(), nullptr } ); + it->second.entryPoints.insert({ .name = shaderSpec.entryPoint, .stage = stage }); } IGPUPipelineBase::SShaderSpecInfo debloat(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, core::vector>& outShaders) { const auto* shader = shaderSpec.shader; - const auto findResult = m_entryPointsMap.find(shader); - assert(findResult != m_entryPointsMap.end()); - const auto& entryPoints = findResult->second; + auto findResult = m_shaderInfoMap.find(shader); + assert(findResult != m_shaderInfoMap.end()); + const auto& entryPoints = findResult->second.entryPoints; + auto* debloatedShader = findResult->second.debloatedShaders; auto debloatedShaderSpec = shaderSpec; if (shader != nullptr) { - if (!m_debloatedShadersMap.contains(shader)) + if (debloatedShader == nullptr) { const auto outShadersData = outShaders.data(); outShaders.push_back(m_debloater->debloat(shader, entryPoints, m_logger)); assert(outShadersData == outShaders.data()); - m_debloatedShadersMap.emplace(shader, outShaders.back().get()); + debloatedShader = outShaders.back().get(); } - const auto debloatedShader = m_debloatedShadersMap[shader]; debloatedShaderSpec.shader = debloatedShader; } return debloatedShaderSpec; } private: - core::map m_entryPointsMap; - core::map m_debloatedShadersMap; + core::map m_shaderInfoMap; asset::ISPIRVDebloater* m_debloater; const system::logger_opt_ptr m_logger; }; From efecb7efb59d57a0334bf52913d1f7fe0a73f367 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 14:07:55 +0700 Subject: [PATCH 295/346] Add inline to method in IGPUPipeline --- include/nbl/video/IGPUPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index 96ee843296..c22ad998db 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -75,7 +75,7 @@ class IGPUPipelineBase { return static_cast(specData); } - bool accumulateSpecializationValidationResult(SSpecializationValidationResult* retval) const + inline bool accumulateSpecializationValidationResult(SSpecializationValidationResult* retval) const { const auto dataSize = valid(); if (dataSize < 0) From 1707b84bfd374a7ba43da9e0dc5a5fe8332194e4 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 14:46:03 +0700 Subject: [PATCH 296/346] Move required subgroups size stages checking to commonCreatePipelines --- include/nbl/video/IGPUComputePipeline.h | 9 ++++++++ include/nbl/video/IGPUGraphicsPipeline.h | 17 +++++++++++++++ include/nbl/video/IGPURayTracingPipeline.h | 24 ++++++++++++++++++++++ include/nbl/video/ILogicalDevice.h | 8 ++++++++ src/nbl/video/ILogicalDevice.cpp | 7 ------- 5 files changed, 58 insertions(+), 7 deletions(-) diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 36813699c0..4c7bac1e6a 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -62,6 +62,15 @@ class IGPUComputePipeline : public IGPUPipeline getRequiredSubgroupStages() const + { + if (shader.requiredSubgroupSize >= asset::IPipelineBase::SUBGROUP_SIZE::REQUIRE_4) + { + return hlsl::ESS_COMPUTE; + } + return {}; + } + IGPUPipelineLayout* layout = nullptr; // TODO: Could guess the required flags from SPIR-V introspection of declared caps core::bitflag flags = FLAGS::NONE; diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index 806ee337c3..dd2e587ee4 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -67,6 +67,23 @@ class IGPUGraphicsPipeline : public IGPUPipeline 
getRequiredSubgroupStages() const + { + core::bitflag stages; + auto processSpecInfo = [&](const SShaderSpecInfo& spec, hlsl::ShaderStage stage) + { + if (spec.requiredSubgroupSize >= SUBGROUP_SIZE::REQUIRE_4) { + stages |= stage; + } + }; + processSpecInfo(vertexShader, hlsl::ESS_VERTEX); + processSpecInfo(tesselationControlShader, hlsl::ESS_TESSELLATION_CONTROL); + processSpecInfo(tesselationEvaluationShader, hlsl::ESS_TESSELLATION_EVALUATION); + processSpecInfo(geometryShader, hlsl::ESS_GEOMETRY); + processSpecInfo(fragmentShader, hlsl::ESS_FRAGMENT); + return stages; + } + IGPUPipelineLayout* layout = nullptr; SShaderSpecInfo vertexShader; SShaderSpecInfo tesselationControlShader; diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index 7151f8f227..90060ab883 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -143,6 +143,30 @@ class IGPURayTracingPipeline : public IGPUPipeline getRequiredSubgroupStages() const + { + core::bitflag stages; + auto processSpecInfo = [&](const SShaderSpecInfo& spec, hlsl::ShaderStage stage) + { + if (spec.requiredSubgroupSize >= SUBGROUP_SIZE::REQUIRE_4) { + stages |= stage; + } + }; + processSpecInfo(shaderGroups.raygen, hlsl::ESS_RAYGEN); + for (const auto& miss : shaderGroups.misses) + processSpecInfo(miss, hlsl::ESS_MISS); + for (const auto& hit : shaderGroups.hits) + { + processSpecInfo(hit.closestHit, hlsl::ESS_CLOSEST_HIT); + processSpecInfo(hit.anyHit, hlsl::ESS_ANY_HIT); + processSpecInfo(hit.intersection, hlsl::ESS_INTERSECTION); + } + for (const auto& callable : shaderGroups.callables) + processSpecInfo(callable, hlsl::ESS_CALLABLE); + return stages; + } + }; struct SShaderGroupHandle diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 5976d06eb0..3f4dfa0f05 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -1259,6 +1259,14 @@ class 
NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return {}; } + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02755 + const auto requiredSubgroupSizeStages = getPhysicalDeviceLimits().requiredSubgroupSizeStages; + if (!requiredSubgroupSizeStages.hasFlags(ci.getRequiredSubgroupStages())) + { + NBL_LOG_ERROR("Invalid shader stage"); + return {}; + } + retval += validation; } return retval; diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 75b36ce889..52ca3a55bd 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -812,13 +812,6 @@ bool ILogicalDevice::createComputePipelines(IGPUPipelineCache* const pipelineCac { const auto& ci = params[ix]; - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02755 - if (ci.shader.requiredSubgroupSize>=asset::IPipelineBase::SUBGROUP_SIZE::REQUIRE_4 && !getPhysicalDeviceLimits().requiredSubgroupSizeStages.hasFlags(hlsl::ShaderStage::ESS_COMPUTE)) - { - NBL_LOG_ERROR("Invalid shader stage"); - return false; - } - const core::set entryPoints = { asset::ISPIRVDebloater::EntryPoint{.name = ci.shader.entryPoint, .stage = hlsl::ShaderStage::ESS_COMPUTE} }; debloatedShaders.push_back(m_spirvDebloater->debloat(ci.shader.shader, entryPoints, m_logger)); auto debloatedShaderSpec = ci.shader; From 165eebc56ca14b4afcac3b9180d67112919038fa Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 20:34:41 +0700 Subject: [PATCH 297/346] Implement visitDependents --- include/nbl/asset/IAsset.h | 21 ++++++++++++++ include/nbl/asset/ICPUAccelerationStructure.h | 9 ++++++ include/nbl/asset/ICPUAnimationLibrary.h | 7 +++++ include/nbl/asset/ICPUBuffer.h | 2 ++ include/nbl/asset/ICPUBufferView.h | 6 ++++ include/nbl/asset/ICPUComputePipeline.h | 6 ++++ 
include/nbl/asset/ICPUDescriptorSet.h | 29 +++++++++++++++++++ include/nbl/asset/ICPUDescriptorSetLayout.h | 6 ++++ include/nbl/asset/ICPUGraphicsPipeline.h | 8 +++++ include/nbl/asset/ICPUImage.h | 4 +++ include/nbl/asset/ICPUImageView.h | 5 ++++ include/nbl/asset/ICPUMesh.h | 4 +++ include/nbl/asset/ICPUMeshBuffer.h | 3 ++ include/nbl/asset/ICPUPipelineCache.h | 8 +++-- include/nbl/asset/ICPUPipelineLayout.h | 9 ++++++ include/nbl/asset/ICPURayTracingPipeline.h | 17 +++++++++++ include/nbl/asset/ICPURenderpass.h | 6 ++++ .../asset/ICPURenderpassIndependentPipeline.h | 6 ++++ include/nbl/asset/ICPUSampler.h | 6 ++++ include/nbl/asset/ICPUSkeleton.h | 6 ++++ include/nbl/asset/IShader.h | 5 ++++ 21 files changed, 171 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index 0e91b99c36..cc105f2633 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -156,6 +156,25 @@ class IAsset : virtual public core::IReferenceCounted //! 
inline bool isMutable() const {return m_mutable;} + inline void visitDependents(std::function visit) const + { + visitDependentsImpl([&visit](const IAsset* dep)->bool + { + if (dep) + return visit(dep); + return true; + }); + } + + inline void visitDependents(std::function visit) + { + assert(isMutable()); + visitDependents([&](const IAsset* dependent) -> bool + { + return visit(const_cast(dependent)); + }); + } + virtual core::unordered_set computeDependants() const = 0; virtual core::unordered_set computeDependants() = 0; @@ -174,6 +193,8 @@ class IAsset : virtual public core::IReferenceCounted private: friend IAssetManager; bool m_mutable = true; + + virtual void visitDependentsImpl(std::function visit) const = 0; }; template diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 4e194867e6..8d02b3ac8b 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -135,6 +135,7 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo return cp; } + // Do not report anything as a dependant, we'll simply drop the data instead of discarding its contents inline core::unordered_set computeDependants() const override { @@ -257,6 +258,8 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo core::smart_refctd_dynamic_array> m_AABBGeoms = nullptr; core::smart_refctd_dynamic_array m_geometryPrimitiveCount = nullptr; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_TRACE_BIT; + + inline virtual void visitDependentsImpl(std::function visit) const override {} }; class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelAccelerationStructure @@ -386,6 +389,12 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA for (const auto& instance : *self->m_instances) dependants.insert(instance.getBase().blas.get()); return dependants; + } + + inline virtual void 
visitDependentsImpl(std::function visit) const override + { + for (const auto& instance : *m_instances) + if (!visit(instance.getBase().blas.get())) return; } }; diff --git a/include/nbl/asset/ICPUAnimationLibrary.h b/include/nbl/asset/ICPUAnimationLibrary.h index 1663447b73..33b5b182c9 100644 --- a/include/nbl/asset/ICPUAnimationLibrary.h +++ b/include/nbl/asset/ICPUAnimationLibrary.h @@ -113,6 +113,13 @@ class ICPUAnimationLibrary final : public IAnimationLibrary, public static auto computeDependantsImpl(Self* self) { using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; return core::unordered_set{ self->m_keyframeStorageBinding.buffer.get(), self->m_timestampStorageBinding.buffer.get(), self->m_animationStorageRange.buffer.get() }; + } + + virtual void visitDependentsImpl(std::function visit) const override + { + if (!visit(m_keyframeStorageBinding.buffer.get())) return; + if (!visit(m_timestampStorageBinding.buffer.get())) return; + if (!visit(m_animationStorageRange.buffer.get())) return; } }; diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h index 0ad1d7bf48..94f1dc750a 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -139,6 +139,8 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed discardContent_impl(); } + inline virtual void visitDependentsImpl(std::function visit) const override {} + void* m_data; core::smart_refctd_ptr m_mem_resource; size_t m_alignment; diff --git a/include/nbl/asset/ICPUBufferView.h b/include/nbl/asset/ICPUBufferView.h index 55d50356c1..1741a1f445 100644 --- a/include/nbl/asset/ICPUBufferView.h +++ b/include/nbl/asset/ICPUBufferView.h @@ -28,6 +28,7 @@ class ICPUBufferView : public IBufferView, public IAsset constexpr static inline auto AssetType = ET_BUFFER_VIEW; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } + inline core::unordered_set computeDependants() const override { return computeDependantsImpl(this); @@ -66,6 
+67,11 @@ class ICPUBufferView : public IBufferView, public IAsset using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; return core::unordered_set{ self->m_buffer.get() }; } + + inline virtual void visitDependentsImpl(std::function visit) const override + { + if (!visit(m_buffer.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index cc05e6c762..5dbec00ea4 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -105,6 +105,12 @@ class ICPUComputePipeline final : public ICPUPipeline, const IAsset*, IAsset*>; return core::unordered_set{ self->m_layout.get(), self->m_specInfo.shader.get() }; } + + virtual void visitDependentsImpl(std::function visit) const override + { + if (!visit(m_layout.get())) return; + if (!visit(m_specInfo.shader.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index c7f54360ac..05a7f51f60 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -127,6 +127,35 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet visit) const override + { + for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) + { + if (!m_descriptorInfos[i]) continue; + const auto size = m_descriptorInfos[i]->size(); + for (auto desc_i = 0u; desc_i < size; desc_i++) + { + auto* desc = m_descriptorInfos[i]->operator[](desc_i).desc.get(); + if (!desc) continue; + switch (IDescriptor::GetTypeCategory(static_cast(i))) + { + case IDescriptor::EC_BUFFER: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_SAMPLER: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_IMAGE: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_BUFFER_VIEW: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_ACCELERATION_STRUCTURE: + if (!visit(static_cast(desc))) return; + default: + break; + } + 
} + } + } }; } diff --git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index aea1520b6f..8dce4d9db4 100644 --- a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -85,6 +85,12 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public return dependants; } + inline virtual void visitDependentsImpl(std::function visit) const override + { + if (m_immutableSamplers) return; + for (const auto& sampler : *m_immutableSamplers) + if (!visit(sampler.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index eb4bc0d961..470c5d813b 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -146,6 +146,14 @@ class ICPUGraphicsPipeline final : public ICPUPipeline(newPipeline, core::dont_grab); } + + inline virtual void visitDependentsImpl(std::function visit) const override + { + if (!visit(m_layout.get())) return; + if (!visit(m_renderpass.get())) return; + for (const auto& info : m_specInfos) + if (!visit(info.shader.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index b732e50492..f13d75b76a 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -227,6 +227,10 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed return _a.imageSubresource.mipLevel < _b.imageSubresource.mipLevel; } }; + + inline virtual void visitDependentsImpl(std::function visit) const override + { + } }; } // end namespace nbl::asset diff --git a/include/nbl/asset/ICPUImageView.h b/include/nbl/asset/ICPUImageView.h index 9639df6eb9..74cb143fe6 100644 --- a/include/nbl/asset/ICPUImageView.h +++ b/include/nbl/asset/ICPUImageView.h @@ -82,6 +82,11 @@ class ICPUImageView final : public IImageView, public IAsset using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; return 
core::unordered_set{ self->params.image.get() }; } + + inline virtual void visitDependentsImpl(std::function visit) const override + { + if (!visit(params.image.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUMesh.h b/include/nbl/asset/ICPUMesh.h index e9aaf53ba4..f52db5055e 100644 --- a/include/nbl/asset/ICPUMesh.h +++ b/include/nbl/asset/ICPUMesh.h @@ -96,6 +96,10 @@ class ICPUMesh final : public IMesh, public IAsset private: core::vector> m_meshBuffers; + + inline virtual void visitDependentsImpl(std::function visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUMeshBuffer.h b/include/nbl/asset/ICPUMeshBuffer.h index c44d055c18..9872cc6b10 100644 --- a/include/nbl/asset/ICPUMeshBuffer.h +++ b/include/nbl/asset/ICPUMeshBuffer.h @@ -622,6 +622,9 @@ class ICPUMeshBuffer final : public IMeshBuffer visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUPipelineCache.h b/include/nbl/asset/ICPUPipelineCache.h index 0ff912603d..85ac650a22 100644 --- a/include/nbl/asset/ICPUPipelineCache.h +++ b/include/nbl/asset/ICPUPipelineCache.h @@ -60,12 +60,12 @@ class ICPUPipelineCache final : public IPreHashed return core::make_smart_refctd_ptr(std::move(cache_cp)); } - inline core::unordered_set computeDependants() const override + inline core::unordered_set computeDependants() const override { return {}; } - inline core::unordered_set computeDependants() override + inline core::unordered_set computeDependants() override { return {}; } @@ -102,6 +102,10 @@ class ICPUPipelineCache final : public IPreHashed private: entries_map_t m_cache; + + inline virtual void visitDependentsImpl(std::function visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUPipelineLayout.h b/include/nbl/asset/ICPUPipelineLayout.h index 4b668c1472..c7d835faae 100644 --- a/include/nbl/asset/ICPUPipelineLayout.h +++ b/include/nbl/asset/ICPUPipelineLayout.h @@ -92,6 +92,15 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout visit) 
const override + { + for (auto i = 0; i < m_descSetLayouts.size(); i++) + { + if (m_descSetLayouts[i]) continue; + if (!visit(m_descSetLayouts[i].get())) return; + } + } + }; } diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 0c448b06b1..09101c73ee 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -138,6 +138,23 @@ class ICPURayTracingPipeline final : public ICPUPipeline visit) const override + { + core::unordered_set dependants; + const auto visitOnce = [&](const IAsset* dep) -> bool { + auto [iter, inserted] = dependants.insert(dep); + if (inserted) return visit(dep); + return true; + }; + visitOnce(m_raygen.shader.get()); + for (const auto& missInfo : self->m_misses) visitOnce(missInfo.shader.get()); + for (const auto& anyHitInfo : self->m_hitGroups.anyHits) visitOnce(anyHitInfo.shader.get()); + for (const auto& closestHitInfo : self->m_hitGroups.closestHits) visitOnce(closestHitInfo.shader.get()); + for (const auto& intersectionInfo : self->m_hitGroups.intersections) visitOnce(intersectionInfo.shader.get()); + for (const auto& callableInfo : self->m_callables) visitOnce(callableInfo.shader.get()); + + } + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { auto newPipeline = new ICPURayTracingPipeline(layout.get()); diff --git a/include/nbl/asset/ICPURenderpass.h b/include/nbl/asset/ICPURenderpass.h index 9cc73af881..517ffbe766 100644 --- a/include/nbl/asset/ICPURenderpass.h +++ b/include/nbl/asset/ICPURenderpass.h @@ -52,6 +52,12 @@ class ICPURenderpass : public IRenderpass, public IAsset inline ICPURenderpass(const SCreationParams& _params, const SCreationParamValidationResult& _validation) : IRenderpass(_params, _validation) {} inline ~ICPURenderpass() = default; + private: + + inline virtual void visitDependentsImpl(std::function visit) const override + { + } + }; } diff --git 
a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index 83536e0c54..feb04cd1c4 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -157,6 +157,12 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, std::array,GRAPHICS_SHADER_STAGE_COUNT> m_entries = {}; std::array m_infos = {}; #endif + + private: + + inline virtual void visitDependentsImpl(std::function visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUSampler.h b/include/nbl/asset/ICPUSampler.h index ed11e7695d..d2ef756cad 100644 --- a/include/nbl/asset/ICPUSampler.h +++ b/include/nbl/asset/ICPUSampler.h @@ -78,6 +78,12 @@ class ICPUSampler : public ISampler, public IAsset { return {}; } + + private: + + inline virtual void visitDependentsImpl(std::function visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h index a29adbabbc..fb5c5953e0 100644 --- a/include/nbl/asset/ICPUSkeleton.h +++ b/include/nbl/asset/ICPUSkeleton.h @@ -96,6 +96,12 @@ class ICPUSkeleton final : public ISkeleton, public IAsset using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; return core::unordered_set{ self->m_defaultTransforms.buffer.get(), self->m_parentJointIDs.buffer.get() }; } + + inline virtual void visitDependentsImpl(std::function visit) const override + { + if (!visit(m_defaultTransforms.buffer.get())) return; + if (!visit(m_parentJointIDs.buffer.get())) return; + } }; } diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index 59286e219d..4574ac073a 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -114,6 +114,11 @@ class IShader : public IAsset using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; return core::unordered_set{self->m_code.get()}; } + + inline virtual void visitDependentsImpl(std::function 
visit) const override + { + if (!visit(m_code.get())) return; + } }; } From d0a0245e7619aa1c2ae02672fc09683e0750db30 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 20:35:28 +0700 Subject: [PATCH 298/346] Check shader availability in getRequiredSubgroupStages --- include/nbl/video/IGPUComputePipeline.h | 2 +- include/nbl/video/IGPUGraphicsPipeline.h | 2 +- include/nbl/video/IGPURayTracingPipeline.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 4c7bac1e6a..1b6cbd69f2 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -64,7 +64,7 @@ class IGPUComputePipeline : public IGPUPipeline getRequiredSubgroupStages() const { - if (shader.requiredSubgroupSize >= asset::IPipelineBase::SUBGROUP_SIZE::REQUIRE_4) + if (shader.shader && shader.requiredSubgroupSize >= asset::IPipelineBase::SUBGROUP_SIZE::REQUIRE_4) { return hlsl::ESS_COMPUTE; } diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index dd2e587ee4..7d38ea677e 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -72,7 +72,7 @@ class IGPUGraphicsPipeline : public IGPUPipeline stages; auto processSpecInfo = [&](const SShaderSpecInfo& spec, hlsl::ShaderStage stage) { - if (spec.requiredSubgroupSize >= SUBGROUP_SIZE::REQUIRE_4) { + if (spec.shader && spec.requiredSubgroupSize >= SUBGROUP_SIZE::REQUIRE_4) { stages |= stage; } }; diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index 90060ab883..6d77fc360e 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -149,7 +149,7 @@ class IGPURayTracingPipeline : public IGPUPipeline stages; auto processSpecInfo = [&](const SShaderSpecInfo& spec, hlsl::ShaderStage stage) { - if (spec.requiredSubgroupSize >= 
SUBGROUP_SIZE::REQUIRE_4) { + if (spec.shader && spec.requiredSubgroupSize >= SUBGROUP_SIZE::REQUIRE_4) { stages |= stage; } }; From 2ad3e732084570ec915191b0662783523a72c86a Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 20:52:38 +0700 Subject: [PATCH 299/346] Use visitDependents for discardDependantContents and anyDependantDiscardedContents --- include/nbl/asset/IPreHashed.h | 50 ++++++++++++++-------------------- 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/include/nbl/asset/IPreHashed.h b/include/nbl/asset/IPreHashed.h index 94fb9a7d2d..054bfaee92 100644 --- a/include/nbl/asset/IPreHashed.h +++ b/include/nbl/asset/IPreHashed.h @@ -43,66 +43,56 @@ class IPreHashed : public IAsset { core::stack stack; core::unordered_set alreadyVisited; // whether we have push the node to the stack - core::unordered_set alreadyDescended; // whether we have push the children to the stack - auto push = [&stack,&alreadyVisited](IAsset* node) -> void + auto push = [&stack,&alreadyVisited](IAsset* node) -> bool { const auto [dummy,inserted] = alreadyVisited.insert(node); if (inserted) stack.push(node); + return true; }; for (const auto& root : roots) push(root); while (!stack.empty()) { auto* entry = stack.top(); - const auto [dummy, inserted] = alreadyDescended.insert(entry); - if (inserted) - { - core::unordered_set dependants = entry->computeDependants(); - for (auto* dependant : dependants) push(dependant); - } else - { - // post order traversal does discard - auto* isPrehashed = dynamic_cast(entry); - if (isPrehashed) - isPrehashed->discardContent(); - stack.pop(); - } + stack.pop(); + entry->visitDependents(push); + // post order traversal does discard + auto* isPrehashed = dynamic_cast(entry); + if (isPrehashed) + isPrehashed->discardContent(); } } static inline bool anyDependantDiscardedContents(const IAsset* root) { core::stack stack; core::unordered_set alreadyVisited; // whether we have push the node to the stack - core::unordered_set 
alreadyDescended; // whether we have push the children to the stack - auto push = [&stack,&alreadyVisited](const IAsset* node) -> bool + bool result = false; + auto push = [&stack,&alreadyVisited,&result](const IAsset* node) -> bool { - if (!node) - return false; const auto [dummy,inserted] = alreadyVisited.insert(node); if (inserted) { auto* isPrehashed = dynamic_cast(node); if (isPrehashed && isPrehashed->missingContent()) - return true; + { + stack = {}; + result = true; + return false; + } stack.push(node); } - return false; + return true; }; - if (push(root)) + if (!push(root)) return true; while (!stack.empty()) { auto* entry = stack.top(); - const auto [dummy, inserted] = alreadyDescended.insert(entry); - if (inserted) - { - core::unordered_set dependants = entry->computeDependants(); - for (auto* dependant : dependants) push(dependant); - } else - stack.pop(); + stack.pop(); + entry->visitDependents(push); } - return false; + return result; } protected: From 542bd0675a094eba3170188a50d05a85321e88bc Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 20:52:52 +0700 Subject: [PATCH 300/346] Fix debloat task --- src/nbl/video/ILogicalDevice.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 52ca3a55bd..19dc001d8f 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -37,7 +37,7 @@ class SpirvDebloatTask auto findResult = m_shaderInfoMap.find(shader); assert(findResult != m_shaderInfoMap.end()); const auto& entryPoints = findResult->second.entryPoints; - auto* debloatedShader = findResult->second.debloatedShaders; + auto& debloatedShader = findResult->second.debloatedShaders; auto debloatedShaderSpec = shaderSpec; if (shader != nullptr) From c9597ff77d5aa12ebdd31d27e242163b19f713b6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 20:59:18 +0700 Subject: [PATCH 301/346] Initialize stages to zero. 
--- include/nbl/video/IGPUGraphicsPipeline.h | 2 +- include/nbl/video/IGPURayTracingPipeline.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index 7d38ea677e..6b2201672b 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -69,7 +69,7 @@ class IGPUGraphicsPipeline : public IGPUPipeline getRequiredSubgroupStages() const { - core::bitflag stages; + core::bitflag stages = {}; auto processSpecInfo = [&](const SShaderSpecInfo& spec, hlsl::ShaderStage stage) { if (spec.shader && spec.requiredSubgroupSize >= SUBGROUP_SIZE::REQUIRE_4) { diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index 6d77fc360e..482861dbcc 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -146,7 +146,7 @@ class IGPURayTracingPipeline : public IGPUPipeline getRequiredSubgroupStages() const { - core::bitflag stages; + core::bitflag stages = {}; auto processSpecInfo = [&](const SShaderSpecInfo& spec, hlsl::ShaderStage stage) { if (spec.shader && spec.requiredSubgroupSize >= SUBGROUP_SIZE::REQUIRE_4) { From 046a334819f1b88454afb6cb7ab4de3b6d8906e9 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 11 Jun 2025 21:02:23 +0700 Subject: [PATCH 302/346] More descriptive error --- include/nbl/video/ILogicalDevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 3f4dfa0f05..d8ef2bdef1 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -1263,7 +1263,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe const auto requiredSubgroupSizeStages = getPhysicalDeviceLimits().requiredSubgroupSizeStages; if (!requiredSubgroupSizeStages.hasFlags(ci.getRequiredSubgroupStages())) { - 
NBL_LOG_ERROR("Invalid shader stage"); + NBL_LOG_ERROR("Shader stage is not a valid bit specified in requiredSubgroupSizeStages"); return {}; } From fc1bc51846626a425ec697d53bbacc6273d11159 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 13 Jun 2025 14:08:25 +0700 Subject: [PATCH 303/346] removed redundant stuff, make config more readable --- .../hlsl/workgroup2/arithmetic_config.hlsl | 48 +++++++++++-------- .../builtin/hlsl/workgroup2/shared_scan.hlsl | 38 +++++++-------- src/nbl/builtin/CMakeLists.txt | 1 + 3 files changed, 47 insertions(+), 40 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index e11e238130..419547bfd8 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -55,16 +55,22 @@ struct ArithmeticConfiguration static_assert(VirtualWorkgroupSize<=WorkgroupSize*SubgroupSize); using items_per_invoc_t = impl::items_per_invocation; - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = items_per_invoc_t::value0; - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = items_per_invoc_t::value1; - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2; + using ItemsPerInvocation = typename items_per_invoc_t::ItemsPerInvocation; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = tuple_element<0,ItemsPerInvocation>::type::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = tuple_element<1,ItemsPerInvocation>::type::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = tuple_element<2,ItemsPerInvocation>::type::value; static_assert(ItemsPerInvocation_2<=4, "4 level scan would have been needed with this config!"); NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelInputCount_1 = conditional_value>SubgroupSizeLog2), SubgroupSize>, SubgroupSize*ItemsPerInvocation_1>::value; NBL_CONSTEXPR_STATIC_INLINE 
uint16_t LevelInputCount_2 = conditional_value::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = LevelInputCount_1 / ItemsPerInvocation_1; + NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualInvocationsAtLevel1 = LevelInputCount_1 / ItemsPerInvocation_1; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t __padding = conditional_value::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_1 = conditional_value::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_2 = conditional_value::value; + using ChannelStride = tuple,integral_constant >; // user specified the shared mem size of Scalars NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value::value + LevelInputCount_1 >::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t __padding = conditional_value::value; static bool electLast() { @@ -94,16 +99,21 @@ struct ArithmeticConfiguration template0 && level::type::value; + const uint16_t outChannel = virtualSubgroupID & (ItemsPerNextInvocation-uint16_t(1u)); + const uint16_t outInvocation = virtualSubgroupID/ItemsPerNextInvocation; + const uint16_t localOffset = outChannel * tuple_element::type::value + outInvocation; if (level==2) - return LevelInputCount_1 + ((SubgroupSize-uint16_t(1u))*ItemsPerInvocation_1) + (virtualSubgroupID & (ItemsPerInvocation_2-uint16_t(1u))) * nextLevelInvocationCount + (virtualSubgroupID/ItemsPerInvocation_2); + { + const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize-uint16_t(1u)) * ItemsPerNextInvocation; + return baseOffset + localOffset; + } else - return (virtualSubgroupID & (ItemsPerInvocation_1-uint16_t(1u))) * (nextLevelInvocationCount+__padding) + (virtualSubgroupID/ItemsPerInvocation_1) + virtualSubgroupID/(SubgroupSize*ItemsPerInvocation_1); + { + const uint16_t paddingOffset = virtualSubgroupID/(SubgroupSize*ItemsPerInvocation_1); + return localOffset + paddingOffset; + } } template0 && level0 && level::type::value + invocationIndex; + const uint16_t paddingOffset = 
invocationIndex/SubgroupSize; if (level==2) - return LevelInputCount_1 + ((SubgroupSize-uint16_t(1u))*ItemsPerInvocation_1) + component * levelInvocationCount + invocationIndex + invocationIndex/SubgroupSize; + { + const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize-uint16_t(1u)) * ItemsPerInvocation_1; + return baseOffset + localOffset + paddingOffset; + } else - return component * (levelInvocationCount+__padding) + invocationIndex + invocationIndex/SubgroupSize; + return localOffset + paddingOffset; } }; diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 80dec1b85c..f8242f5ae1 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -84,12 +84,15 @@ struct reduce_level0 using scalar_t = typename BinOp::type_t; using vector_t = vector; // data accessor needs to be this type - template + template static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { + using config_t = subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 0 scan - subgroup2::reduction reduction0; + subgroup2::reduction reduction0; [unroll] for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) { @@ -112,11 +115,14 @@ struct scan_level0 using scalar_t = typename BinOp::type_t; using vector_t = vector; // data accessor needs to be this type - template + template static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { + using config_t = subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); - subgroup2::inclusive_scan inclusiveScan0; + subgroup2::inclusive_scan inclusiveScan0; // 
level 0 scan [unroll] for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) @@ -147,11 +153,10 @@ struct reduce scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { using config_t = subgroup2::Configuration; - using params_lv0_t = subgroup2::ArithmeticParams; using params_lv1_t = subgroup2::ArithmeticParams; BinOp binop; - reduce_level0::template __call(dataAccessor, scratchAccessor); + reduce_level0::template __call(dataAccessor, scratchAccessor); const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan @@ -186,11 +191,10 @@ struct scan void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { using config_t = subgroup2::Configuration; - using params_lv0_t = subgroup2::ArithmeticParams; using params_lv1_t = subgroup2::ArithmeticParams; BinOp binop; - scan_level0::template __call(dataAccessor, scratchAccessor); + scan_level0::template __call(dataAccessor, scratchAccessor); const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan @@ -216,11 +220,9 @@ struct scan dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()-1u), idx); - scalar_t left; + scalar_t left = BinOp::identity; if (idx != 0 || glsl::gl_SubgroupID() != 0) scratchAccessor.template get(bankedIndex,left); - else - left = BinOp::identity; if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); @@ -253,12 +255,11 @@ struct reduce scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { using config_t = subgroup2::Configuration; - using params_lv0_t = 
subgroup2::ArithmeticParams; using params_lv1_t = subgroup2::ArithmeticParams; using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; - reduce_level0::template __call(dataAccessor, scratchAccessor); + reduce_level0::template __call(dataAccessor, scratchAccessor); const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan @@ -310,12 +311,11 @@ struct scan void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) { using config_t = subgroup2::Configuration; - using params_lv0_t = subgroup2::ArithmeticParams; using params_lv1_t = subgroup2::ArithmeticParams; using params_lv2_t = subgroup2::ArithmeticParams; BinOp binop; - scan_level0::template __call(dataAccessor, scratchAccessor); + scan_level0::template __call(dataAccessor, scratchAccessor); const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); // level 1 scan @@ -357,12 +357,10 @@ struct scan for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i), lv1_val[i]); - scalar_t lv2_scan; + scalar_t lv2_scan = BinOp::identity; const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID()-1u)); if (glsl::gl_SubgroupID() != 0) scratchAccessor.template get(bankedIndex, lv2_scan); - else - lv2_scan = BinOp::identity; [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) @@ -378,11 +376,9 @@ struct scan dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()-1u), idx); - scalar_t left; + scalar_t left = BinOp::identity; if (idx != 0 || glsl::gl_SubgroupID() != 0) scratchAccessor.template get(bankedIndex,left); - else - left = BinOp::identity; if (Exclusive) { scalar_t left_last_elem = hlsl::mix(BinOp::identity, 
glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index a6405a3c99..d051c2153b 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -369,6 +369,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/anisotropi LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/loadable_image.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/mip_mapped.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/storable_image.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/generic_shared_data.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/fft.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/workgroup_arithmetic.hlsl") #tgmath From 10b7f508f82f180f1260eb875291f153c7f96b4b Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 13 Jun 2025 15:59:28 +0700 Subject: [PATCH 304/346] fix some bugs, readability fix --- .../builtin/hlsl/workgroup2/arithmetic_config.hlsl | 14 +++++++------- .../nbl/builtin/hlsl/workgroup2/shared_scan.hlsl | 8 ++++++-- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 419547bfd8..f894eac58a 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -68,9 +68,9 @@ struct ArithmeticConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualInvocationsAtLevel1 = LevelInputCount_1 / ItemsPerInvocation_1; NBL_CONSTEXPR_STATIC_INLINE uint16_t __padding = conditional_value::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_1 = conditional_value::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_1 = 
conditional_value::value + __padding; NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_2 = conditional_value::value; - using ChannelStride = tuple,integral_constant >; + using ChannelStride = tuple,integral_constant,integral_constant >; // we don't use stride 0 // user specified the shared mem size of Scalars NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value::type::value; const uint16_t outChannel = virtualSubgroupID & (ItemsPerNextInvocation-uint16_t(1u)); - const uint16_t outInvocation = virtualSubgroupID/ItemsPerNextInvocation; + const uint16_t outInvocation = virtualSubgroupID / ItemsPerNextInvocation; const uint16_t localOffset = outChannel * tuple_element::type::value + outInvocation; if (level==2) { - const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize-uint16_t(1u)) * ItemsPerNextInvocation; + const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize - uint16_t(1u)) * ItemsPerInvocation_1; return baseOffset + localOffset; } else { - const uint16_t paddingOffset = virtualSubgroupID/(SubgroupSize*ItemsPerInvocation_1); + const uint16_t paddingOffset = virtualSubgroupID / (SubgroupSize * ItemsPerInvocation_1); return localOffset + paddingOffset; } } @@ -128,11 +128,11 @@ struct ArithmeticConfiguration static uint16_t sharedLoadIndex(const uint16_t invocationIndex, const uint16_t component) { const uint16_t localOffset = component * tuple_element::type::value + invocationIndex; - const uint16_t paddingOffset = invocationIndex/SubgroupSize; + const uint16_t paddingOffset = invocationIndex / SubgroupSize; if (level==2) { - const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize-uint16_t(1u)) * ItemsPerInvocation_1; + const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize - uint16_t(1u)) * ItemsPerInvocation_1; return baseOffset + localOffset + paddingOffset; } else diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl index 
f8242f5ae1..5b19c55fbd 100644 --- a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -337,11 +337,15 @@ struct scan subgroup2::inclusive_scan inclusiveScan2; if (glsl::gl_SubgroupID() == 0) { - const uint16_t one = uint16_t(1u); + const uint16_t lastChannel = Config::ItemsPerInvocation_1 - uint16_t(1u); vector_lv2_t lv2_val; [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) - scratchAccessor.template get(Config::template sharedLoadIndex<1>((invocationIndex*Config::ItemsPerInvocation_2+i+one)*Config::SubgroupSize-one, Config::ItemsPerInvocation_1-one),lv2_val[i]); + { + const uint16_t inputSubgroupID = invocationIndex * Config::ItemsPerInvocation_2 + i; + const uint16_t inputSubgroupLastInvocation = inputSubgroupID * Config::SubgroupSize + (Config::SubgroupSize - uint16_t(1u)); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(inputSubgroupLastInvocation, lastChannel),lv2_val[i]); + } lv2_val = inclusiveScan2(lv2_val); [unroll] for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) From 50281c67af6b113924eafda8b0cdb97d6af1c836 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 15:09:02 +0700 Subject: [PATCH 305/346] Remove computeDependants interface --- include/nbl/asset/IAsset.h | 4 -- include/nbl/asset/ICPUAccelerationStructure.h | 31 ------------- include/nbl/asset/ICPUAnimationLibrary.h | 17 -------- include/nbl/asset/ICPUBuffer.h | 10 ----- include/nbl/asset/ICPUBufferView.h | 17 -------- include/nbl/asset/ICPUComputePipeline.h | 18 -------- include/nbl/asset/ICPUDescriptorSet.h | 43 ------------------- include/nbl/asset/ICPUDescriptorSetLayout.h | 22 ---------- include/nbl/asset/ICPUGraphicsPipeline.h | 20 --------- include/nbl/asset/ICPUImage.h | 11 ----- include/nbl/asset/ICPUImageView.h | 16 ------- include/nbl/asset/ICPUMesh.h | 11 ----- include/nbl/asset/ICPUMeshBuffer.h | 11 ----- include/nbl/asset/ICPUPipelineCache.h | 10 ----- 
include/nbl/asset/ICPUPipelineLayout.h | 23 ---------- include/nbl/asset/ICPURayTracingPipeline.h | 22 ---------- include/nbl/asset/ICPURenderpass.h | 10 ----- .../asset/ICPURenderpassIndependentPipeline.h | 10 ----- include/nbl/asset/ICPUSampler.h | 10 ----- include/nbl/asset/ICPUSkeleton.h | 16 ------- include/nbl/asset/IShader.h | 16 ------- src/nbl/asset/ICPUDescriptorSet.cpp | 10 ----- 22 files changed, 358 deletions(-) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index cc105f2633..78f96cbbdd 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -175,10 +175,6 @@ class IAsset : virtual public core::IReferenceCounted }); } - virtual core::unordered_set computeDependants() const = 0; - - virtual core::unordered_set computeDependants() = 0; - virtual bool valid() const { //TODO(kevinyu): Temporary set this to true to make changes compile. Will revisit this later for each asset diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 8d02b3ac8b..61a550cd81 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -136,17 +136,6 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo } - // Do not report anything as a dependant, we'll simply drop the data instead of discarding its contents - inline core::unordered_set computeDependants() const override - { - return {}; - } - - inline core::unordered_set computeDependants() override - { - return {}; - } - inline core::blake3_hash_t computeContentHash() const override { if (missingContent()) @@ -272,16 +261,6 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA // ICPUTopLevelAccelerationStructure() = default; - inline core::unordered_set computeDependants() const override - { - return computeDependantsImpl(this); - } - - inline core::unordered_set computeDependants() override - { - return 
computeDependantsImpl(this); - } - // inline auto& getBuildRangeInfo() { @@ -381,16 +360,6 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA hlsl::acceleration_structures::top_level::BuildRangeInfo m_buildRangeInfo; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_BUILD_BIT; - template - requires(std::same_as, ICPUTopLevelAccelerationStructure>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - core::unordered_set dependants; - for (const auto& instance : *self->m_instances) - dependants.insert(instance.getBase().blas.get()); - return dependants; - } - inline virtual void visitDependentsImpl(std::function visit) const override { for (const auto& instance : *m_instances) diff --git a/include/nbl/asset/ICPUAnimationLibrary.h b/include/nbl/asset/ICPUAnimationLibrary.h index 33b5b182c9..490c6b6e2e 100644 --- a/include/nbl/asset/ICPUAnimationLibrary.h +++ b/include/nbl/asset/ICPUAnimationLibrary.h @@ -96,25 +96,8 @@ class ICPUAnimationLibrary final : public IAnimationLibrary, public constexpr static inline auto AssetType = ET_ANIMATION_LIBRARY; inline E_TYPE getAssetType() const override { return AssetType; } - inline core::unordered_set computeDependants() const override - { - return computeDependantsImpl(this); - } - - inline core::unordered_set computeDependants() override - { - return computeDependantsImpl(this); - } - private: - template - requires(std::same_as, ICPUAnimationLibrary>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - return core::unordered_set{ self->m_keyframeStorageBinding.buffer.get(), self->m_timestampStorageBinding.buffer.get(), self->m_animationStorageRange.buffer.get() }; - } - virtual void visitDependentsImpl(std::function visit) const override { if (!visit(m_keyframeStorageBinding.buffer.get())) return; diff --git a/include/nbl/asset/ICPUBuffer.h 
b/include/nbl/asset/ICPUBuffer.h index 94f1dc750a..9b33442a84 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -75,16 +75,6 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed constexpr static inline auto AssetType = ET_BUFFER; inline IAsset::E_TYPE getAssetType() const override final { return AssetType; } - inline core::unordered_set computeDependants() const override - { - return {}; - } - - inline core::unordered_set computeDependants() override - { - return {}; - } - inline core::blake3_hash_t computeContentHash() const override { core::blake3_hasher hasher; diff --git a/include/nbl/asset/ICPUBufferView.h b/include/nbl/asset/ICPUBufferView.h index 1741a1f445..ebe28832f6 100644 --- a/include/nbl/asset/ICPUBufferView.h +++ b/include/nbl/asset/ICPUBufferView.h @@ -28,17 +28,6 @@ class ICPUBufferView : public IBufferView, public IAsset constexpr static inline auto AssetType = ET_BUFFER_VIEW; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - - inline core::unordered_set computeDependants() const override - { - return computeDependantsImpl(this); - } - - inline core::unordered_set computeDependants() override - { - return computeDependantsImpl(this); - } - ICPUBuffer* getUnderlyingBuffer() { assert(isMutable()); @@ -61,12 +50,6 @@ class ICPUBufferView : public IBufferView, public IAsset virtual ~ICPUBufferView() = default; private: - template - requires(std::same_as, ICPUBufferView>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - return core::unordered_set{ self->m_buffer.get() }; - } inline virtual void visitDependentsImpl(std::function visit) const override { diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 5dbec00ea4..61da031b20 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -28,17 +28,6 @@ class 
ICPUComputePipeline final : public ICPUPipeline computeDependants() const override - { - return computeDependantsImpl(this); - } - - inline core::unordered_set computeDependants() override - { - return computeDependantsImpl(this); - } inline std::span getSpecInfos(hlsl::ShaderStage stage) const override { @@ -98,13 +87,6 @@ class ICPUComputePipeline final : public ICPUPipeline - requires(std::same_as, ICPUComputePipeline>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - return core::unordered_set{ self->m_layout.get(), self->m_specInfo.shader.get() }; - } virtual void visitDependentsImpl(std::function visit) const override { diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index 05a7f51f60..ee99b3c9e8 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -77,9 +77,6 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet clone(uint32_t _depth = ~0u) const override; - core::unordered_set computeDependants() const override; - core::unordered_set computeDependants() override; - protected: virtual ~ICPUDescriptorSet() = default; @@ -88,46 +85,6 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet m_descriptorInfos[static_cast(IDescriptor::E_TYPE::ET_COUNT)]; - template - requires(std::same_as, ICPUDescriptorSet>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - - using cpu_buffer_ptr_t = std::conditional_t, const ICPUBuffer*, ICPUBuffer*>; - using cpu_sampler_ptr_t = std::conditional_t, const ICPUSampler*, ICPUSampler*>; - using cpu_image_view_ptr_t = std::conditional_t, const ICPUImageView*, ICPUImageView*>; - using cpu_buffer_view_ptr_t = std::conditional_t, const ICPUBufferView*, ICPUBufferView*>; - using cpu_tlas_ptr_t = std::conditional_t, const ICPUTopLevelAccelerationStructure*, ICPUTopLevelAccelerationStructure*>; - 
- core::unordered_set dependants = { self->m_layout.get() }; - for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) - { - if (!self->m_descriptorInfos[i]) continue; - const auto size = self->m_descriptorInfos[i]->size(); - for (auto desc_i = 0u; desc_i < size; desc_i++) - { - auto* desc = self->m_descriptorInfos[i]->operator[](desc_i).desc.get(); - if (!desc) continue; - switch (IDescriptor::GetTypeCategory(static_cast(i))) - { - case IDescriptor::EC_BUFFER: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_SAMPLER: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_IMAGE: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_BUFFER_VIEW: - dependants.insert(static_cast(desc)); - case IDescriptor::EC_ACCELERATION_STRUCTURE: - dependants.insert(static_cast(desc)); - default: - break; - } - } - } - return dependants; - } - virtual void visitDependentsImpl(std::function visit) const override { for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) diff --git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index 8dce4d9db4..871a58395b 100644 --- a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -57,33 +57,11 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public constexpr static inline auto AssetType = ET_DESCRIPTOR_SET_LAYOUT; inline E_TYPE getAssetType() const override { return AssetType; } - core::unordered_set computeDependants() const override - { - return computeDependantsImpl(this); - } - - core::unordered_set computeDependants() override - { - return computeDependantsImpl(this); - } - protected: virtual ~ICPUDescriptorSetLayout() = default; private: - template - requires(std::same_as, ICPUDescriptorSetLayout>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - core::unordered_set dependants; - if 
(!self->m_immutableSamplers) return dependants; - for (const auto& sampler: *self->m_immutableSamplers) - { - dependants.insert(sampler.get()); - } - return dependants; - } inline virtual void visitDependentsImpl(std::function visit) const override { diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 470c5d813b..14a745f65f 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -29,16 +29,6 @@ class ICPUGraphicsPipeline final : public ICPUPipeline computeDependants() const override - { - return computeDependantsImpl(this); - } - - inline core::unordered_set computeDependants() override - { - return computeDependantsImpl(this); - } - inline const SCachedCreationParams& getCachedCreationParams() const { return pipeline_base_t::getCachedCreationParams(); @@ -124,16 +114,6 @@ class ICPUGraphicsPipeline final : public ICPUPipeline(hlsl::ShaderStage::ESS_VERTEX + index); } - template - requires(std::same_as, ICPUGraphicsPipeline>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - core::unordered_set dependants = { self->m_layout.get(), self->m_renderpass.get()}; - for (const auto& info : self->m_specInfos) - if (info.shader) dependants.insert(info.shader.get()); - return dependants; - } - inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { auto* newPipeline = new ICPUGraphicsPipeline(layout.get(), m_renderpass.get()); diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index f13d75b76a..e3a0d8558f 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -45,17 +45,6 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed constexpr static inline auto AssetType = ET_IMAGE; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - // Do not report buffer as dependant, as 
we will simply drop it instead of discarding its contents! - inline core::unordered_set computeDependants() const override - { - return {}; - } - - inline core::unordered_set computeDependants() override - { - return {}; - } - core::blake3_hash_t computeContentHash() const override; // Having regions specififed to upload is optional! So to have content missing we must have regions but no buffer content diff --git a/include/nbl/asset/ICPUImageView.h b/include/nbl/asset/ICPUImageView.h index 74cb143fe6..f30489bdfd 100644 --- a/include/nbl/asset/ICPUImageView.h +++ b/include/nbl/asset/ICPUImageView.h @@ -49,16 +49,6 @@ class ICPUImageView final : public IImageView, public IAsset constexpr static inline auto AssetType = ET_IMAGE_VIEW; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - inline core::unordered_set computeDependants() const override - { - return computeDependantsImpl(this); - } - - inline core::unordered_set computeDependants() override - { - return computeDependantsImpl(this); - } - //! const SComponentMapping& getComponents() const { return params.components; } SComponentMapping& getComponents() @@ -76,12 +66,6 @@ class ICPUImageView final : public IImageView, public IAsset virtual ~ICPUImageView() = default; private: - template - requires(std::same_as, ICPUImageView>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - return core::unordered_set{ self->params.image.get() }; - } inline virtual void visitDependentsImpl(std::function visit) const override { diff --git a/include/nbl/asset/ICPUMesh.h b/include/nbl/asset/ICPUMesh.h index f52db5055e..2a65dc4e17 100644 --- a/include/nbl/asset/ICPUMesh.h +++ b/include/nbl/asset/ICPUMesh.h @@ -81,17 +81,6 @@ class ICPUMesh final : public IMesh, public IAsset return cp; } - //! 
CLASS IS DEPRECATED ANYWAY - inline core::unordered_set computeDependants() const override - { - return {}; - } - - inline core::unordered_set computeDependants() override - { - return {}; - } - protected: private: diff --git a/include/nbl/asset/ICPUMeshBuffer.h b/include/nbl/asset/ICPUMeshBuffer.h index 9872cc6b10..6f4b7f074c 100644 --- a/include/nbl/asset/ICPUMeshBuffer.h +++ b/include/nbl/asset/ICPUMeshBuffer.h @@ -611,17 +611,6 @@ class ICPUMeshBuffer final : public IMeshBuffer(const_cast(this)->getJointAABBs()); } - //! Class is deprecated anyway. - inline core::unordered_set computeDependants() const override - { - return {}; - } - - inline core::unordered_set computeDependants() override - { - return {}; - } - inline virtual void visitDependentsImpl(std::function visit) const override { } diff --git a/include/nbl/asset/ICPUPipelineCache.h b/include/nbl/asset/ICPUPipelineCache.h index 85ac650a22..217499170a 100644 --- a/include/nbl/asset/ICPUPipelineCache.h +++ b/include/nbl/asset/ICPUPipelineCache.h @@ -60,16 +60,6 @@ class ICPUPipelineCache final : public IPreHashed return core::make_smart_refctd_ptr(std::move(cache_cp)); } - inline core::unordered_set computeDependants() const override - { - return {}; - } - - inline core::unordered_set computeDependants() override - { - return {}; - } - // inline core::blake3_hash_t computeContentHash() const override { diff --git a/include/nbl/asset/ICPUPipelineLayout.h b/include/nbl/asset/ICPUPipelineLayout.h index c7d835faae..cfab4e7360 100644 --- a/include/nbl/asset/ICPUPipelineLayout.h +++ b/include/nbl/asset/ICPUPipelineLayout.h @@ -30,16 +30,6 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout&& _layout2, core::smart_refctd_ptr&& _layout3 ) : IPipelineLayout(_pcRanges,std::move(_layout0),std::move(_layout1),std::move(_layout2),std::move(_layout3)) {} - inline core::unordered_set computeDependants() const override - { - return computeDependantsImpl(this); - } - - inline core::unordered_set 
computeDependants() override - { - return computeDependantsImpl(this); - } - // ICPUDescriptorSetLayout* getDescriptorSetLayout(uint32_t _set) { @@ -79,19 +69,6 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout - requires(std::same_as, ICPUPipelineLayout>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - core::unordered_set dependants; - for (auto i = 0; i < self->m_descSetLayouts.size(); i++) - { - if (self->m_descSetLayouts[i]) continue; - dependants.insert(self->m_descSetLayouts[i].get()); - } - return dependants; - } - inline virtual void visitDependentsImpl(std::function visit) const override { for (auto i = 0; i < m_descSetLayouts.size(); i++) diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 09101c73ee..5e8e55b5e9 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -36,14 +36,6 @@ class ICPURayTracingPipeline final : public ICPUPipeline computeDependants() const override final { - return computeDependantsImpl(this); - } - - virtual core::unordered_set computeDependants() override final { - return computeDependantsImpl(this); - } - inline virtual std::span getSpecInfos(hlsl::ShaderStage stage) const override final { switch (stage) @@ -124,20 +116,6 @@ class ICPURayTracingPipeline final : public ICPUPipeline - requires(std::same_as, ICPURayTracingPipeline>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - core::unordered_set dependants; - dependants.insert(self->m_raygen.shader.get()); - for (const auto& missInfo : self->m_misses) dependants.insert(missInfo.shader.get()); - for (const auto& anyHitInfo : self->m_hitGroups.anyHits) dependants.insert(anyHitInfo.shader.get()); - for (const auto& closestHitInfo : self->m_hitGroups.closestHits) dependants.insert(closestHitInfo.shader.get()); - for 
(const auto& intersectionInfo : self->m_hitGroups.intersections) dependants.insert(intersectionInfo.shader.get()); - for (const auto& callableInfo : self->m_callables) dependants.insert(callableInfo.shader.get()); - return dependants; - } - inline virtual void visitDependentsImpl(std::function visit) const override { core::unordered_set dependants; diff --git a/include/nbl/asset/ICPURenderpass.h b/include/nbl/asset/ICPURenderpass.h index 517ffbe766..39fe388427 100644 --- a/include/nbl/asset/ICPURenderpass.h +++ b/include/nbl/asset/ICPURenderpass.h @@ -38,16 +38,6 @@ class ICPURenderpass : public IRenderpass, public IAsset return ET_RENDERPASS; } - inline core::unordered_set computeDependants() const override - { - return {}; - } - - inline core::unordered_set computeDependants() override - { - return {}; - } - protected: inline ICPURenderpass(const SCreationParams& _params, const SCreationParamValidationResult& _validation) : IRenderpass(_params, _validation) {} inline ~ICPURenderpass() = default; diff --git a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index feb04cd1c4..6db56fa279 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -72,16 +72,6 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, _NBL_STATIC_INLINE_CONSTEXPR auto AssetType = ET_RENDERPASS_INDEPENDENT_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } - inline core::unordered_set computeDependants() const override - { - return {}; - } - - inline core::unordered_set computeDependants() override - { - return {}; - } - // inline const SCachedCreationParams& getCachedCreationParams() const {return IRenderpassIndependentPipeline::getCachedCreationParams();} inline SCachedCreationParams& getCachedCreationParams() diff --git a/include/nbl/asset/ICPUSampler.h b/include/nbl/asset/ICPUSampler.h index 
d2ef756cad..4df7eb9ab5 100644 --- a/include/nbl/asset/ICPUSampler.h +++ b/include/nbl/asset/ICPUSampler.h @@ -69,16 +69,6 @@ class ICPUSampler : public ISampler, public IAsset constexpr static inline auto AssetType = ET_SAMPLER; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - inline core::unordered_set computeDependants() const override - { - return {}; - } - - inline core::unordered_set computeDependants() override - { - return {}; - } - private: inline virtual void visitDependentsImpl(std::function visit) const override diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h index fb5c5953e0..0d6f0d405b 100644 --- a/include/nbl/asset/ICPUSkeleton.h +++ b/include/nbl/asset/ICPUSkeleton.h @@ -79,23 +79,7 @@ class ICPUSkeleton final : public ISkeleton, public IAsset constexpr static inline auto AssetType = ET_SKELETON; inline E_TYPE getAssetType() const override { return AssetType; } - inline core::unordered_set computeDependants() const override - { - return computeDependantsImpl(this); - } - - inline core::unordered_set computeDependants() override - { - return computeDependantsImpl(this); - } - private: - template - requires(std::same_as, ICPUSkeleton>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - return core::unordered_set{ self->m_defaultTransforms.buffer.get(), self->m_parentJointIDs.buffer.get() }; - } inline virtual void visitDependentsImpl(std::function visit) const override { diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index 4574ac073a..8ce332cb99 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -50,16 +50,6 @@ class IShader : public IAsset constexpr static inline auto AssetType = ET_SHADER; inline E_TYPE getAssetType() const override { return AssetType; } - inline core::unordered_set computeDependants() const override - { - return computeDependantsImpl(this); - } - - inline 
core::unordered_set computeDependants() override - { - return computeDependantsImpl(this); - } - // inline core::smart_refctd_ptr clone(uint32_t _depth=~0u) const override { @@ -108,12 +98,6 @@ class IShader : public IAsset E_CONTENT_TYPE m_contentType; private: - template - requires(std::same_as, IShader>) - static auto computeDependantsImpl(Self* self) { - using asset_ptr_t = std::conditional_t, const IAsset*, IAsset*>; - return core::unordered_set{self->m_code.get()}; - } inline virtual void visitDependentsImpl(std::function visit) const override { diff --git a/src/nbl/asset/ICPUDescriptorSet.cpp b/src/nbl/asset/ICPUDescriptorSet.cpp index 730f0847f2..7137edcba5 100644 --- a/src/nbl/asset/ICPUDescriptorSet.cpp +++ b/src/nbl/asset/ICPUDescriptorSet.cpp @@ -108,14 +108,4 @@ core::smart_refctd_ptr ICPUDescriptorSet::clone(uint32_t _depth) const return cp; } -core::unordered_set ICPUDescriptorSet::computeDependants() const -{ - return computeDependantsImpl(this); -} - -core::unordered_set ICPUDescriptorSet::computeDependants() -{ - return computeDependantsImpl(this); -} - } \ No newline at end of file From 6a84bd7cf9ff9301ed43250681df131a69d15ca2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 16:32:43 +0700 Subject: [PATCH 306/346] Rename visitDependentsImpl to visitDependents_impl --- include/nbl/asset/IAsset.h | 4 ++-- include/nbl/asset/ICPUAccelerationStructure.h | 4 ++-- include/nbl/asset/ICPUAnimationLibrary.h | 2 +- include/nbl/asset/ICPUBuffer.h | 2 +- include/nbl/asset/ICPUBufferView.h | 2 +- include/nbl/asset/ICPUComputePipeline.h | 2 +- include/nbl/asset/ICPUDescriptorSet.h | 2 +- include/nbl/asset/ICPUDescriptorSetLayout.h | 2 +- include/nbl/asset/ICPUGraphicsPipeline.h | 2 +- include/nbl/asset/ICPUImage.h | 2 +- include/nbl/asset/ICPUImageView.h | 2 +- include/nbl/asset/ICPUMesh.h | 2 +- include/nbl/asset/ICPUMeshBuffer.h | 2 +- include/nbl/asset/ICPUPipelineCache.h | 2 +- include/nbl/asset/ICPUPipelineLayout.h | 2 +- 
include/nbl/asset/ICPURayTracingPipeline.h | 21 +++++++------------ include/nbl/asset/ICPURenderpass.h | 2 +- .../asset/ICPURenderpassIndependentPipeline.h | 2 +- include/nbl/asset/ICPUSampler.h | 2 +- include/nbl/asset/ICPUSkeleton.h | 2 +- include/nbl/asset/IShader.h | 2 +- 21 files changed, 29 insertions(+), 36 deletions(-) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index 78f96cbbdd..2e45f62bbb 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -158,7 +158,7 @@ class IAsset : virtual public core::IReferenceCounted inline void visitDependents(std::function visit) const { - visitDependentsImpl([&visit](const IAsset* dep)->bool + visitDependents_impl([&visit](const IAsset* dep)->bool { if (dep) return visit(dep); @@ -190,7 +190,7 @@ class IAsset : virtual public core::IReferenceCounted friend IAssetManager; bool m_mutable = true; - virtual void visitDependentsImpl(std::function visit) const = 0; + virtual void visitDependents_impl(std::function visit) const = 0; }; template diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 61a550cd81..3836690bda 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -248,7 +248,7 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo core::smart_refctd_dynamic_array m_geometryPrimitiveCount = nullptr; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_TRACE_BIT; - inline virtual void visitDependentsImpl(std::function visit) const override {} + inline virtual void visitDependents_impl(std::function visit) const override {} }; class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelAccelerationStructure @@ -360,7 +360,7 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA hlsl::acceleration_structures::top_level::BuildRangeInfo m_buildRangeInfo; core::bitflag m_buildFlags = 
BUILD_FLAGS::PREFER_FAST_BUILD_BIT; - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { for (const auto& instance : *m_instances) if (!visit(instance.getBase().blas.get())) return; diff --git a/include/nbl/asset/ICPUAnimationLibrary.h b/include/nbl/asset/ICPUAnimationLibrary.h index 490c6b6e2e..2d620f562c 100644 --- a/include/nbl/asset/ICPUAnimationLibrary.h +++ b/include/nbl/asset/ICPUAnimationLibrary.h @@ -98,7 +98,7 @@ class ICPUAnimationLibrary final : public IAnimationLibrary, public private: - virtual void visitDependentsImpl(std::function visit) const override + virtual void visitDependents_impl(std::function visit) const override { if (!visit(m_keyframeStorageBinding.buffer.get())) return; if (!visit(m_timestampStorageBinding.buffer.get())) return; diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h index 9b33442a84..30232c061a 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -129,7 +129,7 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed discardContent_impl(); } - inline virtual void visitDependentsImpl(std::function visit) const override {} + inline virtual void visitDependents_impl(std::function visit) const override {} void* m_data; core::smart_refctd_ptr m_mem_resource; diff --git a/include/nbl/asset/ICPUBufferView.h b/include/nbl/asset/ICPUBufferView.h index ebe28832f6..512103a9cd 100644 --- a/include/nbl/asset/ICPUBufferView.h +++ b/include/nbl/asset/ICPUBufferView.h @@ -51,7 +51,7 @@ class ICPUBufferView : public IBufferView, public IAsset private: - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { if (!visit(m_buffer.get())) return; } diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 61da031b20..02b56d02ce 
100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -88,7 +88,7 @@ class ICPUComputePipeline final : public ICPUPipeline visit) const override + virtual void visitDependents_impl(std::function visit) const override { if (!visit(m_layout.get())) return; if (!visit(m_specInfo.shader.get())) return; diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index ee99b3c9e8..b0d9786868 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -85,7 +85,7 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet m_descriptorInfos[static_cast(IDescriptor::E_TYPE::ET_COUNT)]; - virtual void visitDependentsImpl(std::function visit) const override + virtual void visitDependents_impl(std::function visit) const override { for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) { diff --git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index 871a58395b..216297a562 100644 --- a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -63,7 +63,7 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public private: - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { if (m_immutableSamplers) return; for (const auto& sampler : *m_immutableSamplers) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 14a745f65f..f39f38f673 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -127,7 +127,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline(newPipeline, core::dont_grab); } - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { if 
(!visit(m_layout.get())) return; if (!visit(m_renderpass.get())) return; diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index e3a0d8558f..847b796da0 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -217,7 +217,7 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed } }; - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUImageView.h b/include/nbl/asset/ICPUImageView.h index f30489bdfd..6338021aed 100644 --- a/include/nbl/asset/ICPUImageView.h +++ b/include/nbl/asset/ICPUImageView.h @@ -67,7 +67,7 @@ class ICPUImageView final : public IImageView, public IAsset private: - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { if (!visit(params.image.get())) return; } diff --git a/include/nbl/asset/ICPUMesh.h b/include/nbl/asset/ICPUMesh.h index 2a65dc4e17..019578775c 100644 --- a/include/nbl/asset/ICPUMesh.h +++ b/include/nbl/asset/ICPUMesh.h @@ -86,7 +86,7 @@ class ICPUMesh final : public IMesh, public IAsset private: core::vector> m_meshBuffers; - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUMeshBuffer.h b/include/nbl/asset/ICPUMeshBuffer.h index 6f4b7f074c..8fc5ae26e9 100644 --- a/include/nbl/asset/ICPUMeshBuffer.h +++ b/include/nbl/asset/ICPUMeshBuffer.h @@ -611,7 +611,7 @@ class ICPUMeshBuffer final : public IMeshBuffer(const_cast(this)->getJointAABBs()); } - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUPipelineCache.h 
b/include/nbl/asset/ICPUPipelineCache.h index 217499170a..a0d4373c6e 100644 --- a/include/nbl/asset/ICPUPipelineCache.h +++ b/include/nbl/asset/ICPUPipelineCache.h @@ -93,7 +93,7 @@ class ICPUPipelineCache final : public IPreHashed private: entries_map_t m_cache; - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUPipelineLayout.h b/include/nbl/asset/ICPUPipelineLayout.h index cfab4e7360..f4f636601c 100644 --- a/include/nbl/asset/ICPUPipelineLayout.h +++ b/include/nbl/asset/ICPUPipelineLayout.h @@ -69,7 +69,7 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { for (auto i = 0; i < m_descSetLayouts.size(); i++) { diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 5e8e55b5e9..955275f819 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -116,21 +116,14 @@ class ICPURayTracingPipeline final : public ICPUPipeline visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { - core::unordered_set dependants; - const auto visitOnce = [&](const IAsset* dep) -> bool { - auto [iter, inserted] = dependants.insert(dep); - if (inserted) return visit(dep); - return true; - }; - visitOnce(m_raygen.shader.get()); - for (const auto& missInfo : self->m_misses) visitOnce(missInfo.shader.get()); - for (const auto& anyHitInfo : self->m_hitGroups.anyHits) visitOnce(anyHitInfo.shader.get()); - for (const auto& closestHitInfo : self->m_hitGroups.closestHits) visitOnce(closestHitInfo.shader.get()); - for (const auto& intersectionInfo : self->m_hitGroups.intersections) visitOnce(intersectionInfo.shader.get()); - for (const auto& callableInfo : self->m_callables) 
visitOnce(callableInfo.shader.get()); - + if (!visit(m_raygen.shader.get())) return; + for (const auto& missInfo : m_misses) if (!visit(missInfo.shader.get())) return; + for (const auto& anyHitInfo : m_hitGroups.anyHits) if (!visit(anyHitInfo.shader.get())) return; + for (const auto& closestHitInfo : m_hitGroups.closestHits) if (!visit(closestHitInfo.shader.get())) return; + for (const auto& intersectionInfo : m_hitGroups.intersections) if (!visit(intersectionInfo.shader.get())) return; + for (const auto& callableInfo : m_callables) if (!visit(callableInfo.shader.get())) return; } inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final diff --git a/include/nbl/asset/ICPURenderpass.h b/include/nbl/asset/ICPURenderpass.h index 39fe388427..7622609789 100644 --- a/include/nbl/asset/ICPURenderpass.h +++ b/include/nbl/asset/ICPURenderpass.h @@ -44,7 +44,7 @@ class ICPURenderpass : public IRenderpass, public IAsset private: - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { } diff --git a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index 6db56fa279..422cf548b4 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -150,7 +150,7 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, private: - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUSampler.h b/include/nbl/asset/ICPUSampler.h index 4df7eb9ab5..6ddf479319 100644 --- a/include/nbl/asset/ICPUSampler.h +++ b/include/nbl/asset/ICPUSampler.h @@ -71,7 +71,7 @@ class ICPUSampler : public ISampler, public IAsset private: -
inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h index 0d6f0d405b..e66293da0c 100644 --- a/include/nbl/asset/ICPUSkeleton.h +++ b/include/nbl/asset/ICPUSkeleton.h @@ -81,7 +81,7 @@ class ICPUSkeleton final : public ISkeleton, public IAsset private: - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { if (!visit(m_defaultTransforms.buffer.get())) return; if (!visit(m_parentJointIDs.buffer.get())) return; diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index 8ce332cb99..3ef14f3e78 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -99,7 +99,7 @@ class IShader : public IAsset private: - inline virtual void visitDependentsImpl(std::function visit) const override + inline virtual void visitDependents_impl(std::function visit) const override { if (!visit(m_code.get())) return; } From d58554e8d03b9bcd6ba3ada56887dfb06b9ae04c Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 19:13:31 +0700 Subject: [PATCH 307/346] Fix visitDependents_impl on some asset --- include/nbl/asset/ICPUAccelerationStructure.h | 1 + include/nbl/asset/ICPUDescriptorSetLayout.h | 2 +- include/nbl/asset/ICPUPipelineLayout.h | 12 +++++++++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 3836690bda..4c837dc91a 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -362,6 +362,7 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA inline virtual void visitDependents_impl(std::function visit) const override { + if (!m_instances) return; for (const 
auto& instance : *m_instances) if (!visit(instance.getBase().blas.get())) return; } diff --git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index 216297a562..19e38a26b2 100644 --- a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -65,7 +65,7 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public inline virtual void visitDependents_impl(std::function visit) const override { - if (m_immutableSamplers) return; + if (!m_immutableSamplers) return; for (const auto& sampler : *m_immutableSamplers) if (!visit(sampler.get())) return; } diff --git a/include/nbl/asset/ICPUPipelineLayout.h b/include/nbl/asset/ICPUPipelineLayout.h index f4f636601c..0684980cf8 100644 --- a/include/nbl/asset/ICPUPipelineLayout.h +++ b/include/nbl/asset/ICPUPipelineLayout.h @@ -66,6 +66,16 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayoutvalid()) return false; + } + return true; + } + protected: virtual ~ICPUPipelineLayout() = default; @@ -73,7 +83,7 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout Date: Sat, 14 Jun 2025 19:15:57 +0700 Subject: [PATCH 308/346] Implement ICPUBuffer valid() --- include/nbl/asset/ICPUBuffer.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h index 30232c061a..66170ac20d 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -110,6 +110,14 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed return true; } + inline virtual bool valid() const override + { + if (!m_data) return false; + if (!m_mem_resource) return false; + // check if alignment is power of two + return (m_alignment > 0 && !(m_alignment & (m_alignment - 1))); + } + protected: inline void discardContent_impl() override { From c3c50b43a861c9ef7d9485fe3f759a21f14e041e Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 19:16:12 +0700 
Subject: [PATCH 309/346] Implement ICPUBufferView::valid() --- include/nbl/asset/ICPUBufferView.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/nbl/asset/ICPUBufferView.h b/include/nbl/asset/ICPUBufferView.h index 512103a9cd..c96f0377f4 100644 --- a/include/nbl/asset/ICPUBufferView.h +++ b/include/nbl/asset/ICPUBufferView.h @@ -46,6 +46,16 @@ class ICPUBufferView : public IBufferView, public IAsset m_size = _size; } + inline virtual bool valid() const override + { + if (!m_buffer) return false; + if (!m_buffer->valid()) return false; + if (m_size <= 0) return false; + if (m_offset >= m_buffer->getSize()) return false; + if (m_size > m_buffer->getSize() - m_offset) return false; + return true; + } + protected: virtual ~ICPUBufferView() = default; From 51e408b002bea254a6526d4b2af6eb913ba98169 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 19:16:25 +0700 Subject: [PATCH 310/346] Implement ICPUDescriptorSet::valid() --- include/nbl/asset/ICPUDescriptorSet.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index b0d9786868..776e4e1409 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -77,6 +77,11 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet clone(uint32_t _depth = ~0u) const override; + inline virtual bool valid() const override { + if (!m_layout || !m_layout->valid()) return false; + return true; + } + protected: virtual ~ICPUDescriptorSet() = default; From 345dbd8ff5a99536c4ed7b2397189888bfa8d359 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 19:16:38 +0700 Subject: [PATCH 311/346] Implement ICPUDescriptorSetLayout::valid() --- include/nbl/asset/ICPUDescriptorSetLayout.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index 19e38a26b2..da249620bc 100644 ---
a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -56,6 +56,10 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public constexpr static inline auto AssetType = ET_DESCRIPTOR_SET_LAYOUT; inline E_TYPE getAssetType() const override { return AssetType; } + inline virtual bool valid() const override + { + return true; // no modification is possible after creation + } protected: virtual ~ICPUDescriptorSetLayout() = default; From 4a4b51d1d4b2df7e09c901a3ac0f3dbc1721827c Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 19:16:49 +0700 Subject: [PATCH 312/346] Implement ICPUImage::valid() --- include/nbl/asset/ICPUImage.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index 847b796da0..78c7c4891f 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -195,6 +195,16 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed return true; } + inline virtual bool valid() const override + { + if (!validateCreationParameters(m_creationParams)) return false; + if (info != m_creationParams.format) return false; + if (!buffer->valid()) return false; + for (const auto& region : regions) + if (!region.isValid()) return false; + return true; + } + protected: inline ICPUImage(const SCreationParams& _params) : IImage(_params) {} virtual ~ICPUImage() = default; From c5d1d85e120a312f959de7c087c27580a4ef7fa8 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 19:17:03 +0700 Subject: [PATCH 313/346] Implement ICPUImageView::valid() --- include/nbl/asset/ICPUImageView.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/nbl/asset/ICPUImageView.h b/include/nbl/asset/ICPUImageView.h index 6338021aed..953651c604 100644 --- a/include/nbl/asset/ICPUImageView.h +++ b/include/nbl/asset/ICPUImageView.h @@ -62,6 +62,17 @@ class ICPUImageView final : public IImageView, public IAsset 
params.subresourceRange.aspectMask = aspect.value; } + inline virtual bool valid() const override + { + if (!validateCreationParameters(params)) return false; + + // image nullptr already checked in validateCreationParameters; + assert(params.image); + if (!params.image->valid()) return false; + + return true; + } + protected: virtual ~ICPUImageView() = default; From 70870997f7fa776fd4c3ef059b702a7f94c9b1d5 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 19:17:16 +0700 Subject: [PATCH 314/346] Implement ICPUPipelineCache::valid() --- include/nbl/asset/ICPUPipelineCache.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/nbl/asset/ICPUPipelineCache.h b/include/nbl/asset/ICPUPipelineCache.h index a0d4373c6e..702b86620e 100644 --- a/include/nbl/asset/ICPUPipelineCache.h +++ b/include/nbl/asset/ICPUPipelineCache.h @@ -83,6 +83,11 @@ class ICPUPipelineCache final : public IPreHashed // const auto& getEntries() const {return m_cache;} + inline virtual bool valid() const override + { + return true; + } + protected: inline void discardContent_impl() override { From 73a17a07bc70ade6d45129f4f752c71d787bdfd4 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 19:21:51 +0700 Subject: [PATCH 315/346] Implement ICPUSampler::valid() --- include/nbl/asset/ICPURenderpass.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/nbl/asset/ICPURenderpass.h b/include/nbl/asset/ICPURenderpass.h index 7622609789..a131b44add 100644 --- a/include/nbl/asset/ICPURenderpass.h +++ b/include/nbl/asset/ICPURenderpass.h @@ -38,6 +38,12 @@ class ICPURenderpass : public IRenderpass, public IAsset return ET_RENDERPASS; } + inline virtual bool valid() const override + { + // no modification is possible after creation. 
parameter is validated when creating renderpass + return true; + } + protected: inline ICPURenderpass(const SCreationParams& _params, const SCreationParamValidationResult& _validation) : IRenderpass(_params, _validation) {} inline ~ICPURenderpass() = default; From c7cff1dcefc61879c0a168bb899e357cdc0e609d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 12:31:57 +0700 Subject: [PATCH 316/346] Remove valid implementation on IAsset and implement valid for all derived class of IAsset --- include/nbl/asset/IAsset.h | 6 +-- include/nbl/asset/ICPUAccelerationStructure.h | 38 +++++++++++++++++++ include/nbl/asset/ICPUAnimationLibrary.h | 1 + include/nbl/asset/ICPUImage.h | 7 ++-- include/nbl/asset/ICPUMesh.h | 10 +++++ include/nbl/asset/ICPUMeshBuffer.h | 5 +++ .../asset/ICPURenderpassIndependentPipeline.h | 5 +++ include/nbl/asset/ICPUSampler.h | 1 + include/nbl/asset/ICPUSkeleton.h | 1 + include/nbl/asset/IShader.h | 8 ++++ 10 files changed, 74 insertions(+), 8 deletions(-) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index 2e45f62bbb..dc77931c25 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -175,11 +175,7 @@ class IAsset : virtual public core::IReferenceCounted }); } - virtual bool valid() const - { - //TODO(kevinyu): Temporary set this to true to make changes compile. 
Will revisit this later for each asset - return true; - } + virtual bool valid() const = 0; protected: inline IAsset() = default; diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 4c837dc91a..2c4933d36c 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -231,6 +231,33 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo return !m_geometryPrimitiveCount || !m_triangleGeoms && !m_AABBGeoms; } + inline virtual bool valid() const override + { + if (!validBuildFlags(m_buildFlags)) return false; + + size_t geometryCount = 0; + if (m_buildFlags.hasFlags(BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) + { + if (!m_AABBGeoms || m_triangleGeoms) return false; + geometryCount = m_AABBGeoms->size(); + } + else + { + if (!m_triangleGeoms || m_AABBGeoms) return false; + geometryCount = m_triangleGeoms->size(); + } + + // https://registry.khronos.org/vulkan/specs/latest/man/html/vkGetAccelerationStructureBuildSizesKHR.html#VUID-vkGetAccelerationStructureBuildSizesKHR-pBuildInfo-03619 + if (geometryCount == 0) { + if (m_geometryPrimitiveCount && m_geometryPrimitiveCount->size() > 0) return false; + } + else + { + if (!m_geometryPrimitiveCount || m_geometryPrimitiveCount->size() != geometryCount) return false; + } + return true; + } + protected: virtual ~ICPUBottomLevelAccelerationStructure() = default; @@ -352,6 +379,17 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA return cp; } + inline virtual bool valid() const override + { + if (!validBuildFlags(m_buildFlags)) return false; + if (!m_instances) return false; + for (const auto& instance : *m_instances) + if (!instance.getBase().blas->valid()) return false; + if (m_buildRangeInfo.instanceCount != m_instances->size()) return false; + if (m_buildRangeInfo.instanceByteOffset % 16 != 0) return false; + return true; + } + protected: virtual 
~ICPUTopLevelAccelerationStructure() = default; diff --git a/include/nbl/asset/ICPUAnimationLibrary.h b/include/nbl/asset/ICPUAnimationLibrary.h index 2d620f562c..bcaae3bf3e 100644 --- a/include/nbl/asset/ICPUAnimationLibrary.h +++ b/include/nbl/asset/ICPUAnimationLibrary.h @@ -95,6 +95,7 @@ class ICPUAnimationLibrary final : public IAnimationLibrary, public constexpr static inline auto AssetType = ET_ANIMATION_LIBRARY; inline E_TYPE getAssetType() const override { return AssetType; } + inline virtual bool valid() const override { return true; } private: diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index 78c7c4891f..01ee3d41e0 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -199,9 +199,10 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed { if (!validateCreationParameters(m_creationParams)) return false; if (info != m_creationParams.format) return false; - if (!buffer->valid()) return false; - for (const auto& region : regions) - if (!region.isValid()) return false; + if (buffer && !buffer->valid()) return false; + if (regions) + for (const auto& region : *regions) + if (!region.isValid()) return false; return true; } diff --git a/include/nbl/asset/ICPUMesh.h b/include/nbl/asset/ICPUMesh.h index 019578775c..0f780ef437 100644 --- a/include/nbl/asset/ICPUMesh.h +++ b/include/nbl/asset/ICPUMesh.h @@ -81,6 +81,16 @@ class ICPUMesh final : public IMesh, public IAsset return cp; } + inline virtual bool valid() const override + { + for (const auto& meshBuffer : m_meshBuffers) + { + if (!meshBuffer) return false; + if (!meshBuffer->valid()) return false; + } + return true; + } + protected: private: diff --git a/include/nbl/asset/ICPUMeshBuffer.h b/include/nbl/asset/ICPUMeshBuffer.h index 8fc5ae26e9..6bd3cd5700 100644 --- a/include/nbl/asset/ICPUMeshBuffer.h +++ b/include/nbl/asset/ICPUMeshBuffer.h @@ -610,7 +610,12 @@ class ICPUMeshBuffer final : public 
IMeshBuffer(const_cast(this)->getJointAABBs()); } + inline virtual bool valid() const override + { + return true; + } + private: inline virtual void visitDependents_impl(std::function visit) const override { } diff --git a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index 422cf548b4..b349aab888 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -93,6 +93,11 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, m_layout = std::move(_layout); } + inline virtual bool valid() const override + { + return m_layout && m_layout->valid(); + } + #if 0 // The getters are weird because the shader pointer needs patching inline IShader::SSpecInfo getSpecInfos(const hlsl::ShaderStage stage) diff --git a/include/nbl/asset/ICPUSampler.h b/include/nbl/asset/ICPUSampler.h index 6ddf479319..8db568f26b 100644 --- a/include/nbl/asset/ICPUSampler.h +++ b/include/nbl/asset/ICPUSampler.h @@ -68,6 +68,7 @@ class ICPUSampler : public ISampler, public IAsset constexpr static inline auto AssetType = ET_SAMPLER; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } + inline virtual bool valid() const override { return true; } private: diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h index e66293da0c..361468d1c5 100644 --- a/include/nbl/asset/ICPUSkeleton.h +++ b/include/nbl/asset/ICPUSkeleton.h @@ -78,6 +78,7 @@ class ICPUSkeleton final : public ISkeleton, public IAsset constexpr static inline auto AssetType = ET_SKELETON; inline E_TYPE getAssetType() const override { return AssetType; } + inline virtual bool valid() const override { return true; } private: diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index 3ef14f3e78..d1ae2e0c86 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -87,6 +87,14 @@ class IShader : public 
IAsset // TODO: `void setContent(core::smart_refctd_ptr&&,const E_CONTENT_TYPE)` + inline virtual bool valid() const override + { + if (!m_code) return false; + if (m_contentType == E_CONTENT_TYPE::ECT_UNKNOWN) return false; + // Note(kevyuu) : Should we check for m_filepathHint if content type is not spirv. What if no pragma includ in the source code. Do we even need m_filepathHint in that case? + return true; + } + // alias for legacy reasons using E_SHADER_STAGE = hlsl::ShaderStage; From 437c19408a3e5900f4a69fbc1f5ed7a9544e18eb Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 16 Jun 2025 15:10:06 +0700 Subject: [PATCH 317/346] use x-macros for config compat between hlsl and cpp --- examples_tests | 2 +- .../hlsl/workgroup2/arithmetic_config.hlsl | 165 +++++++++++++----- .../impl/arithmetic_config_def.hlsl | 34 ++++ .../workgroup2/impl/items_per_invoc_def.hlsl | 8 + .../workgroup2/impl/virtual_wg_size_def.hlsl | 8 + 5 files changed, 176 insertions(+), 41 deletions(-) create mode 100644 include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl create mode 100644 include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl create mode 100644 include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl diff --git a/examples_tests b/examples_tests index 1710b69862..4c10dc1cdb 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 1710b698621796aa767edf7bc940e55e6758c2a8 +Subproject commit 4c10dc1cdba4ab12dfedef97768aa4a10e606213 diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index f894eac58a..6eb6a535fe 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -6,6 +6,7 @@ #include "nbl/builtin/hlsl/cpp_compat.hlsl" #include "nbl/builtin/hlsl/tuple.hlsl" +#include "nbl/builtin/hlsl/mpl.hlsl" namespace nbl { @@ -19,23 +20,37 @@ namespace impl template 
struct virtual_wg_size_log2 { - NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2; + #define DEFINE_ASSIGN(TYPE,ID,...) NBL_CONSTEXPR_STATIC_INLINE TYPE ID = __VA_ARGS__; + #define DEFINE_VIRTUAL_WG_T(ID) ID + #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) mpl::max_v + #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value + #include "impl/virtual_wg_size_def.hlsl" + #undef DEFINE_COND_VAL + #undef DEFINE_MPL_MAX_V + #undef DEFINE_VIRTUAL_WG_T + #undef DEFINE_ASSIGN + + // must have at least enough level 0 outputs to feed a single subgroup static_assert(WorkgroupSizeLog2>=SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2*3+4, "WorkgroupSize cannot be larger than (SubgroupSize^3)*16"); - - NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v; - // must have at least enough level 0 outputs to feed a single subgroup }; template struct items_per_invocation { - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocationProductLog2 = mpl::max_v; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = BaseItemsPerInvocation; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value, ItemsPerInvocationProductLog2>::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v; + #define DEFINE_ASSIGN(TYPE,ID,...) 
NBL_CONSTEXPR_STATIC_INLINE TYPE ID = __VA_ARGS__; + #define DEFINE_VIRTUAL_WG_T(ID) VirtualWorkgroup::ID + #define DEFINE_ITEMS_INVOC_T(ID) ID + #define DEFINE_MPL_MIN_V(TYPE,ARG1,ARG2) mpl::min_v + #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) mpl::max_v + #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value + #include "impl/items_per_invoc_def.hlsl" + #undef DEFINE_COND_VAL + #undef DEFINE_MPL_MAX_V + #undef DEFINE_MPL_MIN_V + #undef DEFINE_ITEMS_INVOC_T + #undef DEFINE_VIRTUAL_WG_T + #undef DEFINE_ASSIGN using ItemsPerInvocation = tuple,integral_constant,integral_constant >; }; @@ -44,47 +59,35 @@ struct items_per_invocation template struct ArithmeticConfiguration { - NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; - - using virtual_wg_t = impl::virtual_wg_size_log2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; - NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << virtual_wg_t::value; - static_assert(VirtualWorkgroupSize<=WorkgroupSize*SubgroupSize); - + using virtual_wg_t = impl::virtual_wg_size_log2<_WorkgroupSizeLog2, _SubgroupSizeLog2>; using items_per_invoc_t = impl::items_per_invocation; using ItemsPerInvocation = typename items_per_invoc_t::ItemsPerInvocation; - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = tuple_element<0,ItemsPerInvocation>::type::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = tuple_element<1,ItemsPerInvocation>::type::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = tuple_element<2,ItemsPerInvocation>::type::value; - static_assert(ItemsPerInvocation_2<=4, "4 level scan would have been needed with this config!"); - 
NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelInputCount_1 = conditional_value>SubgroupSizeLog2), SubgroupSize>, - SubgroupSize*ItemsPerInvocation_1>::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelInputCount_2 = conditional_value::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualInvocationsAtLevel1 = LevelInputCount_1 / ItemsPerInvocation_1; + #define DEFINE_ASSIGN(TYPE,ID,...) NBL_CONSTEXPR_STATIC_INLINE TYPE ID = __VA_ARGS__; + #define DEFINE_VIRTUAL_WG_T(ID) virtual_wg_t::ID + #define DEFINE_ITEMS_INVOC_T(ID) items_per_invoc_t::ID + #define DEFINE_CONFIG_T(ID) ID + #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) mpl::max_v + #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value + #include "impl/arithmetic_config_def.hlsl" + #undef DEFINE_COND_VAL + #undef DEFINE_MPL_MAX_V + #undef DEFINE_CONFIG_T + #undef DEFINE_ITEMS_INVOC_T + #undef DEFINE_VIRTUAL_WG_T + #undef DEFINE_ASSIGN - NBL_CONSTEXPR_STATIC_INLINE uint16_t __padding = conditional_value::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_1 = conditional_value::value + __padding; - NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_2 = conditional_value::value; using ChannelStride = tuple,integral_constant,integral_constant >; // we don't use stride 0 - // user specified the shared mem size of Scalars - NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value::value + LevelInputCount_1 - >::value; + static_assert(VirtualWorkgroupSize<=WorkgroupSize*SubgroupSize); + static_assert(ItemsPerInvocation_2<=4, "4 level scan would have been needed with this config!"); +#ifdef __HLSL_VERSION static bool electLast() { return glsl::gl_SubgroupInvocationID()==SubgroupSize-1; } +#endif // gets a subgroupID as if each workgroup has (VirtualWorkgroupSize/SubgroupSize) subgroups // each subgroup does work (VirtualWorkgroupSize/WorkgroupSize) times, the index denoted by workgroupInVirtualIndex @@ -140,6 +143,88 @@ struct ArithmeticConfiguration } }; +#ifndef 
__HLSL_VERSION +namespace impl +{ +struct SVirtualWGSizeLog2 +{ + static SVirtualWGSizeLog2 create(const uint16_t _WorkgroupSizeLog2, const uint16_t _SubgroupSizeLog2) + { + SVirtualWGSizeLog2 retval; + #define DEFINE_ASSIGN(TYPE,ID,...) retval.ID = __VA_ARGS__; + #define DEFINE_VIRTUAL_WG_T(ID) retval.ID + #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) + #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) + #include "impl/virtual_wg_size_def.hlsl" + #undef DEFINE_COND_VAL + #undef DEFINE_MPL_MAX_V + #undef DEFINE_VIRTUAL_WG_T + #undef DEFINE_ASSIGN + return retval; + } + + #define DEFINE_ASSIGN(TYPE,ID,...) TYPE ID; + #include "impl/virtual_wg_size_def.hlsl" + #undef DEFINE_ASSIGN +}; + +struct SItemsPerInvoc +{ + static SItemsPerInvoc create(const SVirtualWGSizeLog2 virtualWgSizeLog2, const uint16_t BaseItemsPerInvocation) + { + SItemsPerInvoc retval; + #define DEFINE_ASSIGN(TYPE,ID,...) retval.ID = __VA_ARGS__; + #define DEFINE_VIRTUAL_WG_T(ID) virtualWgSizeLog2.ID + #define DEFINE_ITEMS_INVOC_T(ID) retval.ID + #define DEFINE_MPL_MIN_V(TYPE,ARG1,ARG2) hlsl::min(ARG1, ARG2) + #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) + #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) + #include "impl/items_per_invoc_def.hlsl" + #undef DEFINE_COND_VAL + #undef DEFINE_MPL_MAX_V + #undef DEFINE_MPL_MIN_V + #undef DEFINE_ITEMS_INVOC_T + #undef DEFINE_VIRTUAL_WG_T + #undef DEFINE_ASSIGN + return retval; + } + + #define DEFINE_ASSIGN(TYPE,ID,...) 
TYPE ID; + #include "impl/items_per_invoc_def.hlsl" + #undef DEFINE_ASSIGN +}; +} + +struct SArithmeticConfiguration +{ + static SArithmeticConfiguration create(const uint16_t _WorkgroupSizeLog2, const uint16_t _SubgroupSizeLog2, const uint16_t _ItemsPerInvocation) + { + impl::SVirtualWGSizeLog2 virtualWgSizeLog2 = impl::SVirtualWGSizeLog2::create(_WorkgroupSizeLog2, _SubgroupSizeLog2); + impl::SItemsPerInvoc itemsPerInvoc = impl::SItemsPerInvoc::create(virtualWgSizeLog2, _ItemsPerInvocation); + + SArithmeticConfiguration retval; + #define DEFINE_ASSIGN(TYPE,ID,...) retval.ID = __VA_ARGS__; + #define DEFINE_VIRTUAL_WG_T(ID) virtualWgSizeLog2.ID + #define DEFINE_ITEMS_INVOC_T(ID) itemsPerInvoc.ID + #define DEFINE_CONFIG_T(ID) retval.ID + #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) + #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) + #include "impl/arithmetic_config_def.hlsl" + #undef DEFINE_COND_VAL + #undef DEFINE_MPL_MAX_V + #undef DEFINE_CONFIG_T + #undef DEFINE_ITEMS_INVOC_T + #undef DEFINE_VIRTUAL_WG_T + #undef DEFINE_ASSIGN + return retval; + } + + #define DEFINE_ASSIGN(TYPE,ID,...) TYPE ID; + #include "impl/arithmetic_config_def.hlsl" + #undef DEFINE_ASSIGN +}; +#endif + template struct is_configuration : bool_constant {}; diff --git a/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl b/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl new file mode 100644 index 0000000000..4ea6fc010d --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl @@ -0,0 +1,34 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +DEFINE_ASSIGN(uint16_t, WorkgroupSizeLog2, _WorkgroupSizeLog2) +DEFINE_ASSIGN(uint16_t, WorkgroupSize, uint16_t(0x1u) << DEFINE_CONFIG_T(WorkgroupSizeLog2)) +DEFINE_ASSIGN(uint16_t, SubgroupSizeLog2, _SubgroupSizeLog2) +DEFINE_ASSIGN(uint16_t, SubgroupSize, uint16_t(0x1u) << DEFINE_CONFIG_T(SubgroupSizeLog2)) + +DEFINE_ASSIGN(uint16_t, LevelCount, DEFINE_VIRTUAL_WG_T(levels)) +DEFINE_ASSIGN(uint16_t, VirtualWorkgroupSize, uint16_t(0x1u) << DEFINE_VIRTUAL_WG_T(value)) + +DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_0, DEFINE_ITEMS_INVOC_T(value0)) +DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_1, DEFINE_ITEMS_INVOC_T(value1)) +DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_2, DEFINE_ITEMS_INVOC_T(value2)) + +DEFINE_ASSIGN(uint16_t, LevelInputCount_1, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3), + DEFINE_MPL_MAX_V(uint16_t, (DEFINE_CONFIG_T(VirtualWorkgroupSize)>>DEFINE_CONFIG_T(SubgroupSizeLog2)), DEFINE_CONFIG_T(SubgroupSize)), + DEFINE_CONFIG_T(SubgroupSize)*DEFINE_CONFIG_T(ItemsPerInvocation_1))) +DEFINE_ASSIGN(uint16_t, LevelInputCount_2, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3),DEFINE_CONFIG_T(SubgroupSize)*DEFINE_CONFIG_T(ItemsPerInvocation_2),0)) +DEFINE_ASSIGN(uint16_t, VirtualInvocationsAtLevel1, DEFINE_CONFIG_T(LevelInputCount_1) / DEFINE_CONFIG_T(ItemsPerInvocation_1)) + +DEFINE_ASSIGN(uint16_t, __padding, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3),DEFINE_CONFIG_T(SubgroupSize)-1,0)) +DEFINE_ASSIGN(uint16_t, __channelStride_1, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3),DEFINE_CONFIG_T(VirtualInvocationsAtLevel1),DEFINE_CONFIG_T(SubgroupSize)) + DEFINE_CONFIG_T(__padding)) +DEFINE_ASSIGN(uint16_t, __channelStride_2, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3),DEFINE_CONFIG_T(SubgroupSize),0)) + +// user specified the shared mem size of Scalars +DEFINE_ASSIGN(uint32_t, SharedScratchElementCount, 
DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==1), + 0, + DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3), + DEFINE_CONFIG_T(LevelInputCount_2)+(DEFINE_CONFIG_T(SubgroupSize)*DEFINE_CONFIG_T(ItemsPerInvocation_1))-1, + 0 + ) + DEFINE_CONFIG_T(LevelInputCount_1) + )) diff --git a/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl b/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl new file mode 100644 index 0000000000..857b64d774 --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl @@ -0,0 +1,8 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +DEFINE_ASSIGN(uint16_t, ItemsPerInvocationProductLog2, DEFINE_MPL_MAX_V(int16_t,DEFINE_VIRTUAL_WG_T(WorkgroupSizeLog2)-DEFINE_VIRTUAL_WG_T(SubgroupSizeLog2)*DEFINE_VIRTUAL_WG_T(levels),0)) +DEFINE_ASSIGN(uint16_t, value0, BaseItemsPerInvocation) +DEFINE_ASSIGN(uint16_t, value1, uint16_t(0x1u) << DEFINE_COND_VAL(uint16_t,(DEFINE_VIRTUAL_WG_T(levels)==3),DEFINE_MPL_MIN_V(uint16_t,DEFINE_ITEMS_INVOC_T(ItemsPerInvocationProductLog2),2),DEFINE_ITEMS_INVOC_T(ItemsPerInvocationProductLog2))) +DEFINE_ASSIGN(uint16_t, value2, uint16_t(0x1u) << DEFINE_MPL_MAX_V(int16_t,DEFINE_ITEMS_INVOC_T(ItemsPerInvocationProductLog2)-2,0)) \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl b/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl new file mode 100644 index 0000000000..3190ba5df3 --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl @@ -0,0 +1,8 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +DEFINE_ASSIGN(uint16_t, WorkgroupSizeLog2, _WorkgroupSizeLog2) +DEFINE_ASSIGN(uint16_t, SubgroupSizeLog2, _SubgroupSizeLog2) +DEFINE_ASSIGN(uint16_t, levels, DEFINE_COND_VAL(uint16_t,(_WorkgroupSizeLog2>_SubgroupSizeLog2),DEFINE_COND_VAL(uint16_t,(_WorkgroupSizeLog2>_SubgroupSizeLog2*2+2),3,2),1)) +DEFINE_ASSIGN(uint16_t, value, DEFINE_MPL_MAX_V(uint16_t, _SubgroupSizeLog2*DEFINE_VIRTUAL_WG_T(levels), _WorkgroupSizeLog2)) From ae3946e5299f28e064bcd14870c4a6c1eb2f18c0 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 15:37:44 +0700 Subject: [PATCH 318/346] Add comment to some valid logic of top acceleration structure --- include/nbl/asset/ICPUAccelerationStructure.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 2c4933d36c..feddcbb35f 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -386,6 +386,7 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA for (const auto& instance : *m_instances) if (!instance.getBase().blas->valid()) return false; if (m_buildRangeInfo.instanceCount != m_instances->size()) return false; + // https://registry.khronos.org/vulkan/specs/latest/man/html/VkAccelerationStructureBuildRangeInfoKHR.html#VUID-VkAccelerationStructureBuildRangeInfoKHR-primitiveOffset-03660 if (m_buildRangeInfo.instanceByteOffset % 16 != 0) return false; return true; } From ef2ed3ac6b199541fc6f782831f825bc266391db Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 15:39:46 +0700 Subject: [PATCH 319/346] Rename getSpecInfoVec to getSpecInfoVector --- include/nbl/asset/ICPURayTracingPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 955275f819..8e6bdaf8b9 
100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -62,7 +62,7 @@ class ICPURayTracingPipeline final : public ICPUPipeline* getSpecInfoVec(hlsl::ShaderStage stage) + inline core::vector* getSpecInfoVector(hlsl::ShaderStage stage) { if (!isMutable()) return nullptr; switch (stage) From 9c8792594e9588e22450030a9c34bbaa4728924e Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 16:27:26 +0700 Subject: [PATCH 320/346] Fix indentation --- include/nbl/asset/IAsset.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index dc77931c25..b35981ffc7 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -169,10 +169,10 @@ class IAsset : virtual public core::IReferenceCounted inline void visitDependents(std::function visit) { assert(isMutable()); - visitDependents([&](const IAsset* dependent) -> bool - { - return visit(const_cast(dependent)); - }); + visitDependents([&](const IAsset* dependent) -> bool + { + return visit(const_cast(dependent)); + }); } virtual bool valid() const = 0; From 697589ccada856f55b459fe57e7e8a2e9f3f0371 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 16:33:06 +0700 Subject: [PATCH 321/346] Remove virtual from final classes --- include/nbl/asset/ICPUAccelerationStructure.h | 8 ++++---- include/nbl/asset/ICPUAnimationLibrary.h | 4 ++-- include/nbl/asset/ICPUBuffer.h | 4 ++-- include/nbl/asset/ICPUBufferView.h | 4 ++-- include/nbl/asset/ICPUComputePipeline.h | 2 +- include/nbl/asset/ICPUDescriptorSet.h | 4 ++-- include/nbl/asset/ICPUDescriptorSetLayout.h | 4 ++-- include/nbl/asset/ICPUGraphicsPipeline.h | 6 +++--- include/nbl/asset/ICPUImage.h | 4 ++-- include/nbl/asset/ICPUImageView.h | 4 ++-- include/nbl/asset/ICPUMesh.h | 8 ++++---- include/nbl/asset/ICPUMeshBuffer.h | 4 ++-- include/nbl/asset/ICPUPipelineCache.h | 4 ++-- include/nbl/asset/ICPUPipelineLayout.h | 4 ++-- 
include/nbl/asset/ICPURayTracingPipeline.h | 6 +++--- include/nbl/asset/ICPURenderpass.h | 4 ++-- include/nbl/asset/ICPURenderpassIndependentPipeline.h | 4 ++-- include/nbl/asset/ICPUSampler.h | 4 ++-- include/nbl/asset/ICPUSkeleton.h | 4 ++-- 19 files changed, 43 insertions(+), 43 deletions(-) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index feddcbb35f..a4f1e9dec4 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -231,7 +231,7 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo return !m_geometryPrimitiveCount || !m_triangleGeoms && !m_AABBGeoms; } - inline virtual bool valid() const override + inline bool valid() const override { if (!validBuildFlags(m_buildFlags)) return false; @@ -275,7 +275,7 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo core::smart_refctd_dynamic_array m_geometryPrimitiveCount = nullptr; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_TRACE_BIT; - inline virtual void visitDependents_impl(std::function visit) const override {} + inline void visitDependents_impl(std::function visit) const override {} }; class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelAccelerationStructure @@ -379,7 +379,7 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA return cp; } - inline virtual bool valid() const override + inline bool valid() const override { if (!validBuildFlags(m_buildFlags)) return false; if (!m_instances) return false; @@ -399,7 +399,7 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA hlsl::acceleration_structures::top_level::BuildRangeInfo m_buildRangeInfo; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_BUILD_BIT; - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function 
visit) const override { if (!m_instances) return; for (const auto& instance : *m_instances) diff --git a/include/nbl/asset/ICPUAnimationLibrary.h b/include/nbl/asset/ICPUAnimationLibrary.h index bcaae3bf3e..321cefa33b 100644 --- a/include/nbl/asset/ICPUAnimationLibrary.h +++ b/include/nbl/asset/ICPUAnimationLibrary.h @@ -95,11 +95,11 @@ class ICPUAnimationLibrary final : public IAnimationLibrary, public constexpr static inline auto AssetType = ET_ANIMATION_LIBRARY; inline E_TYPE getAssetType() const override { return AssetType; } - inline virtual bool valid() const override { return true; } + inline bool valid() const override { return true; } private: - virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { if (!visit(m_keyframeStorageBinding.buffer.get())) return; if (!visit(m_timestampStorageBinding.buffer.get())) return; diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h index 66170ac20d..46105b3c0e 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -110,7 +110,7 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed return true; } - inline virtual bool valid() const override + inline bool valid() const override { if (!m_data) return false; if (!m_mem_resource) return false; @@ -137,7 +137,7 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed discardContent_impl(); } - inline virtual void visitDependents_impl(std::function visit) const override {} + inline void visitDependents_impl(std::function visit) const override {} void* m_data; core::smart_refctd_ptr m_mem_resource; diff --git a/include/nbl/asset/ICPUBufferView.h b/include/nbl/asset/ICPUBufferView.h index c96f0377f4..8634fd8394 100644 --- a/include/nbl/asset/ICPUBufferView.h +++ b/include/nbl/asset/ICPUBufferView.h @@ -46,7 +46,7 @@ class ICPUBufferView : public IBufferView, public IAsset m_size = _size; } - inline virtual bool 
valid() const override + inline bool valid() const override { if (!m_buffer->valid()) return false; if (m_offset >= m_buffer->getSize()) return false; @@ -61,7 +61,7 @@ class ICPUBufferView : public IBufferView, public IAsset private: - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { if (!visit(m_buffer.get())) return; } diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 02b56d02ce..9b867e3a06 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -88,7 +88,7 @@ class ICPUComputePipeline final : public ICPUPipeline visit) const override + inline void visitDependents_impl(std::function visit) const override { if (!visit(m_layout.get())) return; if (!visit(m_specInfo.shader.get())) return; diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index 776e4e1409..29cfe4cb1d 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -77,7 +77,7 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet clone(uint32_t _depth = ~0u) const override; - inline virtual bool valid() const override { + inline bool valid() const override { if (!m_layout->valid()) return false; return true; } @@ -90,7 +90,7 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet m_descriptorInfos[static_cast(IDescriptor::E_TYPE::ET_COUNT)]; - virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) { diff --git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index da249620bc..a46bb55808 100644 --- a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -56,7 +56,7 @@ 
class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public constexpr static inline auto AssetType = ET_DESCRIPTOR_SET_LAYOUT; inline E_TYPE getAssetType() const override { return AssetType; } - inline virtual bool valid() const override + inline bool valid() const override { return true; // no modification is possible after creation } @@ -67,7 +67,7 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public private: - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { if (!m_immutableSamplers) return; for (const auto& sampler : *m_immutableSamplers) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index f39f38f673..a95a82633c 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -40,7 +40,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) const override final + inline std::span getSpecInfos(hlsl::ShaderStage stage) const override final { const auto stageIndex = stageToIndex(stage); if (stageIndex != -1) @@ -70,7 +70,7 @@ class ICPUGraphicsPipeline final : public ICPUPipelinevalid())return false; @@ -127,7 +127,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline(newPipeline, core::dont_grab); } - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { if (!visit(m_layout.get())) return; if (!visit(m_renderpass.get())) return; diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index 01ee3d41e0..13cbb7ecec 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -195,7 +195,7 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed return true; } - inline virtual bool valid() const override + inline bool valid() const override { if 
(!validateCreationParameters(m_creationParams)) return false; if (info != m_creationParams.format) return false; @@ -228,7 +228,7 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed } }; - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUImageView.h b/include/nbl/asset/ICPUImageView.h index 953651c604..85a0629cc3 100644 --- a/include/nbl/asset/ICPUImageView.h +++ b/include/nbl/asset/ICPUImageView.h @@ -62,7 +62,7 @@ class ICPUImageView final : public IImageView, public IAsset params.subresourceRange.aspectMask = aspect.value; } - inline virtual bool valid() const override + inline bool valid() const override { if (!validateCreationParameters(params)) return false; @@ -78,7 +78,7 @@ class ICPUImageView final : public IImageView, public IAsset private: - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { if (!visit(params.image.get())) return; } diff --git a/include/nbl/asset/ICPUMesh.h b/include/nbl/asset/ICPUMesh.h index 0f780ef437..df647b14a4 100644 --- a/include/nbl/asset/ICPUMesh.h +++ b/include/nbl/asset/ICPUMesh.h @@ -81,7 +81,7 @@ class ICPUMesh final : public IMesh, public IAsset return cp; } - inline virtual bool valid() const override + inline bool valid() const override { for (const auto& meshBuffer : m_meshBuffers) { @@ -96,9 +96,9 @@ class ICPUMesh final : public IMesh, public IAsset private: core::vector> m_meshBuffers; - inline virtual void visitDependents_impl(std::function visit) const override - { - } + inline void visitDependents_impl(std::function visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUMeshBuffer.h b/include/nbl/asset/ICPUMeshBuffer.h index 6bd3cd5700..aa6cbc9429 100644 --- a/include/nbl/asset/ICPUMeshBuffer.h +++ b/include/nbl/asset/ICPUMeshBuffer.h @@ -610,13 
+610,13 @@ class ICPUMeshBuffer final : public IMeshBuffer(const_cast(this)->getJointAABBs()); } - inline virtual bool valid() const override + inline bool valid() const override { return true; } private: - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUPipelineCache.h b/include/nbl/asset/ICPUPipelineCache.h index 702b86620e..c5511f39bb 100644 --- a/include/nbl/asset/ICPUPipelineCache.h +++ b/include/nbl/asset/ICPUPipelineCache.h @@ -83,7 +83,7 @@ class ICPUPipelineCache final : public IPreHashed // const auto& getEntries() const {return m_cache;} - inline virtual bool valid() const override + inline bool valid() const override { return true; } @@ -98,7 +98,7 @@ class ICPUPipelineCache final : public IPreHashed private: entries_map_t m_cache; - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUPipelineLayout.h b/include/nbl/asset/ICPUPipelineLayout.h index 0684980cf8..b30ecc3e10 100644 --- a/include/nbl/asset/ICPUPipelineLayout.h +++ b/include/nbl/asset/ICPUPipelineLayout.h @@ -66,7 +66,7 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout visit) const override + inline void visitDependents_impl(std::function visit) const override { for (auto i = 0; i < m_descSetLayouts.size(); i++) { diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 8e6bdaf8b9..2c157f91e9 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -36,7 +36,7 @@ class ICPURayTracingPipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) const override final + inline std::span getSpecInfos(hlsl::ShaderStage stage) const override final { switch (stage) { @@ -84,7 +84,7 @@ class 
ICPURayTracingPipeline final : public ICPUPipelinevalid()) return false; @@ -116,7 +116,7 @@ class ICPURayTracingPipeline final : public ICPUPipeline visit) const override + inline void visitDependents_impl(std::function visit) const override { if (!visit(m_raygen.shader.get()) return; for (const auto& missInfo : self->m_misses) if (!visit(missInfo.shader.get())) return; diff --git a/include/nbl/asset/ICPURenderpass.h b/include/nbl/asset/ICPURenderpass.h index a131b44add..daaa5c62b0 100644 --- a/include/nbl/asset/ICPURenderpass.h +++ b/include/nbl/asset/ICPURenderpass.h @@ -38,7 +38,7 @@ class ICPURenderpass : public IRenderpass, public IAsset return ET_RENDERPASS; } - inline virtual bool valid() const override + inline bool valid() const override { // no modification is possible after creation. parameter is validated when creating renderpass return true; @@ -50,7 +50,7 @@ class ICPURenderpass : public IRenderpass, public IAsset private: - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { } diff --git a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index b349aab888..3d67af23d0 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -93,7 +93,7 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, m_layout = std::move(_layout); } - inline virtual bool valid() const override + inline bool valid() const override { return m_layout && m_layout->valid(); } @@ -155,7 +155,7 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, private: - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUSampler.h b/include/nbl/asset/ICPUSampler.h index 
8db568f26b..6b2bea5219 100644 --- a/include/nbl/asset/ICPUSampler.h +++ b/include/nbl/asset/ICPUSampler.h @@ -68,11 +68,11 @@ class ICPUSampler : public ISampler, public IAsset constexpr static inline auto AssetType = ET_SAMPLER; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - inline virtual bool valid() const override { return true; } + inline bool valid() const override { return true; } private: - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { } }; diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h index 361468d1c5..1049798268 100644 --- a/include/nbl/asset/ICPUSkeleton.h +++ b/include/nbl/asset/ICPUSkeleton.h @@ -78,11 +78,11 @@ class ICPUSkeleton final : public ISkeleton, public IAsset constexpr static inline auto AssetType = ET_SKELETON; inline E_TYPE getAssetType() const override { return AssetType; } - inline virtual bool valid() const override { return true; } + inline bool valid() const override { return true; } private: - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { if (!visit(m_defaultTransforms.buffer.get())) return; if (!visit(m_parentJointIDs.buffer.get())) return; From 469bf0419ccf56cf225fae1ae03d290794ff1f18 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 16:34:39 +0700 Subject: [PATCH 322/346] Fix indentation --- include/nbl/asset/ICPUAccelerationStructure.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index a4f1e9dec4..a6b148a891 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -275,7 +275,7 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo 
core::smart_refctd_dynamic_array m_geometryPrimitiveCount = nullptr; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_TRACE_BIT; - inline void visitDependents_impl(std::function visit) const override {} + inline void visitDependents_impl(std::function visit) const override {} }; class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelAccelerationStructure @@ -399,11 +399,11 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA hlsl::acceleration_structures::top_level::BuildRangeInfo m_buildRangeInfo; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_BUILD_BIT; - inline void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { - if (!m_instances) return; - for (const auto& instance : *m_instances) - if (!visit(instance.getBase().blas.get())) return; + if (!m_instances) return; + for (const auto& instance : *m_instances) + if (!visit(instance.getBase().blas.get())) return; } }; From 6e23e6e76c4f29dae8d02584f82423cbb3c3cdcf Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 17:03:27 +0700 Subject: [PATCH 323/346] Fix indentation --- include/nbl/asset/ICPUImage.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index 13cbb7ecec..fdbf640557 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -195,16 +195,16 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed return true; } - inline bool valid() const override - { - if (!validateCreationParameters(m_creationParams)) return false; - if (info != m_creationParams.format) return false; - if (buffer && !buffer->valid()) return false; - if (regions) - for (const auto& region : *regions) - if (!region.isValid()) return false; - return true; - } + inline bool valid() const override + { + if 
(!validateCreationParameters(m_creationParams)) return false; + if (info != m_creationParams.format) return false; + if (buffer && !buffer->valid()) return false; + if (regions) + for (const auto& region : *regions) + if (!region.isValid()) return false; + return true; + } protected: inline ICPUImage(const SCreationParams& _params) : IImage(_params) {} From 9c138b7c281ed6610058bf80bbd27035730518f4 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 17:03:35 +0700 Subject: [PATCH 324/346] Fix indentation --- include/nbl/asset/IPreHashed.h | 104 ++++++++++++++++----------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/include/nbl/asset/IPreHashed.h b/include/nbl/asset/IPreHashed.h index 054bfaee92..f7252211e1 100644 --- a/include/nbl/asset/IPreHashed.h +++ b/include/nbl/asset/IPreHashed.h @@ -39,61 +39,61 @@ class IPreHashed : public IAsset discardContent_impl(); } - static inline void discardDependantsContents(const std::span roots) - { - core::stack stack; - core::unordered_set alreadyVisited; // whether we have push the node to the stack - auto push = [&stack,&alreadyVisited](IAsset* node) -> bool - { - const auto [dummy,inserted] = alreadyVisited.insert(node); - if (inserted) - stack.push(node); - return true; - }; - for (const auto& root : roots) - push(root); - while (!stack.empty()) - { - auto* entry = stack.top(); - stack.pop(); - entry->visitDependents(push); - // post order traversal does discard + static inline void discardDependantsContents(const std::span roots) + { + core::vector stack; + core::unordered_set alreadyVisited; // whether we have push the node to the stack + auto push = [&stack,&alreadyVisited](IAsset* node) -> bool + { + const auto [dummy,inserted] = alreadyVisited.insert(node); + if (inserted) + stack.push_back(node); + return true; + }; + for (const auto& root : roots) + push(root); + while (!stack.empty()) + { + auto* entry = stack.back(); + stack.pop_back(); + entry->visitDependents(push); + // pre order 
traversal does discard auto* isPrehashed = dynamic_cast(entry); if (isPrehashed) isPrehashed->discardContent(); - } - } - static inline bool anyDependantDiscardedContents(const IAsset* root) - { - core::stack stack; - core::unordered_set alreadyVisited; // whether we have push the node to the stack - bool result = false; - auto push = [&stack,&alreadyVisited,&result](const IAsset* node) -> bool - { - const auto [dummy,inserted] = alreadyVisited.insert(node); - if (inserted) - { - auto* isPrehashed = dynamic_cast(node); - if (isPrehashed && isPrehashed->missingContent()) - { - stack = {}; - result = true; - return false; - } - stack.push(node); - } - return true; - }; - if (!push(root)) - return true; - while (!stack.empty()) - { - auto* entry = stack.top(); - stack.pop(); - entry->visitDependents(push); - } - return result; - } + } + } + static inline bool anyDependantDiscardedContents(const IAsset* root) + { + core::vector stack; + core::unordered_set alreadyVisited; // whether we have push the node to the stack + bool result = false; + auto push = [&stack,&alreadyVisited,&result](const IAsset* node) -> bool + { + const auto [dummy,inserted] = alreadyVisited.insert(node); + if (inserted) + { + auto* isPrehashed = dynamic_cast(node); + if (isPrehashed && isPrehashed->missingContent()) + { + stack.clear(); + result = true; + return false; + } + stack.push_back(node); + } + return true; + }; + if (!push(root)) + return true; + while (!stack.empty()) + { + auto* entry = stack.back(); + stack.pop_back(); + entry->visitDependents(push); + } + return result; + } protected: inline IPreHashed() = default; From 026d49412acb9e77d2dcdb6b911df5f36d9db63b Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 17:05:27 +0700 Subject: [PATCH 325/346] Fix indentation --- include/nbl/asset/IAsset.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index b35981ffc7..aae73fac2a 100644 --- 
a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -156,15 +156,15 @@ class IAsset : virtual public core::IReferenceCounted //! inline bool isMutable() const {return m_mutable;} - inline void visitDependents(std::function visit) const - { - visitDependents_impl([&visit](const IAsset* dep)->bool + inline void visitDependents(std::function visit) const + { + visitDependents_impl([&visit](const IAsset* dep)->bool { if (dep) return visit(dep); return true; }); - } + } inline void visitDependents(std::function visit) { From 2578abed02a426253e9cded093da1c84397eb020 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 17:57:16 +0700 Subject: [PATCH 326/346] Check raygen shader existence for raytracing pipeline --- include/nbl/video/IGPURayTracingPipeline.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index 482861dbcc..56c7b38c29 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -141,6 +141,8 @@ class IGPURayTracingPipeline : public IGPUPipeline Date: Mon, 16 Jun 2025 17:57:32 +0700 Subject: [PATCH 327/346] Check vertex shader existence for graphics pipeline --- include/nbl/video/IGPUGraphicsPipeline.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index 6b2201672b..7027252b0f 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -64,6 +64,9 @@ class IGPUGraphicsPipeline : public IGPUPipeline Date: Mon, 16 Jun 2025 17:58:34 +0700 Subject: [PATCH 328/346] Remove comment on IShader::valid() --- include/nbl/asset/IShader.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index d1ae2e0c86..25211f5909 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -91,7 +91,6 @@ class IShader : 
public IAsset { if (!m_code) return false; if (m_contentType == E_CONTENT_TYPE::ECT_UNKNOWN) return false; - // Note(kevyuu) : Should we check for m_filepathHint if content type is not spirv. What if no pragma includ in the source code. Do we even need m_filepathHint in that case? return true; } From e2f7b8f59e5010ac6eb2354a4ca9e5cfa0af6eea Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 17:59:39 +0700 Subject: [PATCH 329/346] Remove virtual on IShader::valid and IShader::visitDependents_impl --- include/nbl/asset/IShader.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index 25211f5909..34b93e99c2 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -87,7 +87,7 @@ class IShader : public IAsset // TODO: `void setContent(core::smart_refctd_ptr&&,const E_CONTENT_TYPE)` - inline virtual bool valid() const override + inline bool valid() const override { if (!m_code) return false; if (m_contentType == E_CONTENT_TYPE::ECT_UNKNOWN) return false; @@ -106,7 +106,7 @@ class IShader : public IAsset private: - inline virtual void visitDependents_impl(std::function visit) const override + inline void visitDependents_impl(std::function visit) const override { if (!visit(m_code.get())) return; } From 47900b1bfca77a3dcf238a913432a44b193b4e0a Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 18:00:38 +0700 Subject: [PATCH 330/346] Add final to IShader --- include/nbl/asset/IShader.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index 34b93e99c2..96ff73f3f0 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -27,7 +27,7 @@ namespace nbl::asset The purpose for the class is for storing raw HLSL code to be compiled or already compiled (but unspecialized) SPIR-V code. 
*/ -class IShader : public IAsset +class IShader final : public IAsset { public: enum class E_CONTENT_TYPE : uint8_t From 029cfeb5e7f9eae3caebd572c26c47b04d7806c4 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 16 Jun 2025 18:10:02 +0700 Subject: [PATCH 331/346] improved readability for config, include all new files --- .../hlsl/workgroup2/arithmetic_config.hlsl | 118 ++++++++---------- .../impl/arithmetic_config_def.hlsl | 38 +++--- .../workgroup2/impl/items_per_invoc_def.hlsl | 6 +- .../workgroup2/impl/virtual_wg_size_def.hlsl | 4 +- src/nbl/builtin/CMakeLists.txt | 3 + 5 files changed, 78 insertions(+), 91 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 6eb6a535fe..9a211899cb 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -21,13 +21,11 @@ template struct virtual_wg_size_log2 { #define DEFINE_ASSIGN(TYPE,ID,...) NBL_CONSTEXPR_STATIC_INLINE TYPE ID = __VA_ARGS__; - #define DEFINE_VIRTUAL_WG_T(ID) ID - #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) mpl::max_v - #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value + #define MAX(TYPE,ARG1,ARG2) mpl::max_v + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value #include "impl/virtual_wg_size_def.hlsl" - #undef DEFINE_COND_VAL - #undef DEFINE_MPL_MAX_V - #undef DEFINE_VIRTUAL_WG_T + #undef SELECT + #undef MAX #undef DEFINE_ASSIGN // must have at least enough level 0 outputs to feed a single subgroup @@ -39,17 +37,15 @@ template struct items_per_invocation { #define DEFINE_ASSIGN(TYPE,ID,...) 
NBL_CONSTEXPR_STATIC_INLINE TYPE ID = __VA_ARGS__; - #define DEFINE_VIRTUAL_WG_T(ID) VirtualWorkgroup::ID - #define DEFINE_ITEMS_INVOC_T(ID) ID - #define DEFINE_MPL_MIN_V(TYPE,ARG1,ARG2) mpl::min_v - #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) mpl::max_v - #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value + #define VIRTUAL_WG_SIZE VirtualWorkgroup:: + #define MIN(TYPE,ARG1,ARG2) mpl::min_v + #define MAX(TYPE,ARG1,ARG2) mpl::max_v + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value #include "impl/items_per_invoc_def.hlsl" - #undef DEFINE_COND_VAL - #undef DEFINE_MPL_MAX_V - #undef DEFINE_MPL_MIN_V - #undef DEFINE_ITEMS_INVOC_T - #undef DEFINE_VIRTUAL_WG_T + #undef SELECT + #undef MAX + #undef MIN + #undef VIRTUAL_WG_SIZE #undef DEFINE_ASSIGN using ItemsPerInvocation = tuple,integral_constant,integral_constant >; @@ -64,17 +60,15 @@ struct ArithmeticConfiguration using ItemsPerInvocation = typename items_per_invoc_t::ItemsPerInvocation; #define DEFINE_ASSIGN(TYPE,ID,...) 
NBL_CONSTEXPR_STATIC_INLINE TYPE ID = __VA_ARGS__; - #define DEFINE_VIRTUAL_WG_T(ID) virtual_wg_t::ID - #define DEFINE_ITEMS_INVOC_T(ID) items_per_invoc_t::ID - #define DEFINE_CONFIG_T(ID) ID - #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) mpl::max_v - #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value + #define VIRTUAL_WG_SIZE virtual_wg_t:: + #define ITEMS_PER_INVOC items_per_invoc_t:: + #define MAX(TYPE,ARG1,ARG2) mpl::max_v + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value #include "impl/arithmetic_config_def.hlsl" - #undef DEFINE_COND_VAL - #undef DEFINE_MPL_MAX_V - #undef DEFINE_CONFIG_T - #undef DEFINE_ITEMS_INVOC_T - #undef DEFINE_VIRTUAL_WG_T + #undef SELECT + #undef MAX + #undef ITEMS_PER_INVOC + #undef VIRTUAL_WG_SIZE #undef DEFINE_ASSIGN using ChannelStride = tuple,integral_constant,integral_constant >; // we don't use stride 0 @@ -148,19 +142,15 @@ namespace impl { struct SVirtualWGSizeLog2 { - static SVirtualWGSizeLog2 create(const uint16_t _WorkgroupSizeLog2, const uint16_t _SubgroupSizeLog2) + void init(const uint16_t _WorkgroupSizeLog2, const uint16_t _SubgroupSizeLog2) { - SVirtualWGSizeLog2 retval; - #define DEFINE_ASSIGN(TYPE,ID,...) retval.ID = __VA_ARGS__; - #define DEFINE_VIRTUAL_WG_T(ID) retval.ID - #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) - #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) + #define DEFINE_ASSIGN(TYPE,ID,...) ID = __VA_ARGS__; + #define MAX(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) #include "impl/virtual_wg_size_def.hlsl" - #undef DEFINE_COND_VAL - #undef DEFINE_MPL_MAX_V - #undef DEFINE_VIRTUAL_WG_T + #undef SELECT + #undef MAX #undef DEFINE_ASSIGN - return retval; } #define DEFINE_ASSIGN(TYPE,ID,...) 
TYPE ID; @@ -170,23 +160,19 @@ struct SVirtualWGSizeLog2 struct SItemsPerInvoc { - static SItemsPerInvoc create(const SVirtualWGSizeLog2 virtualWgSizeLog2, const uint16_t BaseItemsPerInvocation) + void init(const SVirtualWGSizeLog2 virtualWgSizeLog2, const uint16_t BaseItemsPerInvocation) { - SItemsPerInvoc retval; - #define DEFINE_ASSIGN(TYPE,ID,...) retval.ID = __VA_ARGS__; - #define DEFINE_VIRTUAL_WG_T(ID) virtualWgSizeLog2.ID - #define DEFINE_ITEMS_INVOC_T(ID) retval.ID - #define DEFINE_MPL_MIN_V(TYPE,ARG1,ARG2) hlsl::min(ARG1, ARG2) - #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) - #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) + #define DEFINE_ASSIGN(TYPE,ID,...) ID = __VA_ARGS__; + #define VIRTUAL_WG_SIZE virtualWgSizeLog2. + #define MIN(TYPE,ARG1,ARG2) hlsl::min(ARG1, ARG2) + #define MAX(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) #include "impl/items_per_invoc_def.hlsl" - #undef DEFINE_COND_VAL - #undef DEFINE_MPL_MAX_V - #undef DEFINE_MPL_MIN_V - #undef DEFINE_ITEMS_INVOC_T - #undef DEFINE_VIRTUAL_WG_T + #undef SELECT + #undef MAX + #undef MIN + #undef VIRTUAL_WG_SIZE #undef DEFINE_ASSIGN - return retval; } #define DEFINE_ASSIGN(TYPE,ID,...) TYPE ID; @@ -197,26 +183,24 @@ struct SItemsPerInvoc struct SArithmeticConfiguration { - static SArithmeticConfiguration create(const uint16_t _WorkgroupSizeLog2, const uint16_t _SubgroupSizeLog2, const uint16_t _ItemsPerInvocation) + void init(const uint16_t _WorkgroupSizeLog2, const uint16_t _SubgroupSizeLog2, const uint16_t _ItemsPerInvocation) { - impl::SVirtualWGSizeLog2 virtualWgSizeLog2 = impl::SVirtualWGSizeLog2::create(_WorkgroupSizeLog2, _SubgroupSizeLog2); - impl::SItemsPerInvoc itemsPerInvoc = impl::SItemsPerInvoc::create(virtualWgSizeLog2, _ItemsPerInvocation); - - SArithmeticConfiguration retval; - #define DEFINE_ASSIGN(TYPE,ID,...) 
retval.ID = __VA_ARGS__; - #define DEFINE_VIRTUAL_WG_T(ID) virtualWgSizeLog2.ID - #define DEFINE_ITEMS_INVOC_T(ID) itemsPerInvoc.ID - #define DEFINE_CONFIG_T(ID) retval.ID - #define DEFINE_MPL_MAX_V(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) - #define DEFINE_COND_VAL(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) + impl::SVirtualWGSizeLog2 virtualWgSizeLog2; + virtualWgSizeLog2.init(_WorkgroupSizeLog2, _SubgroupSizeLog2); + impl::SItemsPerInvoc itemsPerInvoc; + itemsPerInvoc.init(virtualWgSizeLog2, _ItemsPerInvocation); + + #define DEFINE_ASSIGN(TYPE,ID,...) ID = __VA_ARGS__; + #define VIRTUAL_WG_SIZE virtualWgSizeLog2. + #define ITEMS_PER_INVOC itemsPerInvoc. + #define MAX(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) #include "impl/arithmetic_config_def.hlsl" - #undef DEFINE_COND_VAL - #undef DEFINE_MPL_MAX_V - #undef DEFINE_CONFIG_T - #undef DEFINE_ITEMS_INVOC_T - #undef DEFINE_VIRTUAL_WG_T + #undef SELECT + #undef MAX + #undef ITEMS_PER_INVOC + #undef VIRTUAL_WG_SIZE #undef DEFINE_ASSIGN - return retval; } #define DEFINE_ASSIGN(TYPE,ID,...) 
TYPE ID; diff --git a/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl b/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl index 4ea6fc010d..94f54409db 100644 --- a/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl @@ -3,32 +3,32 @@ // For conditions of distribution and use, see copyright notice in nabla.h DEFINE_ASSIGN(uint16_t, WorkgroupSizeLog2, _WorkgroupSizeLog2) -DEFINE_ASSIGN(uint16_t, WorkgroupSize, uint16_t(0x1u) << DEFINE_CONFIG_T(WorkgroupSizeLog2)) +DEFINE_ASSIGN(uint16_t, WorkgroupSize, uint16_t(0x1u) << WorkgroupSizeLog2) DEFINE_ASSIGN(uint16_t, SubgroupSizeLog2, _SubgroupSizeLog2) -DEFINE_ASSIGN(uint16_t, SubgroupSize, uint16_t(0x1u) << DEFINE_CONFIG_T(SubgroupSizeLog2)) +DEFINE_ASSIGN(uint16_t, SubgroupSize, uint16_t(0x1u) << SubgroupSizeLog2) -DEFINE_ASSIGN(uint16_t, LevelCount, DEFINE_VIRTUAL_WG_T(levels)) -DEFINE_ASSIGN(uint16_t, VirtualWorkgroupSize, uint16_t(0x1u) << DEFINE_VIRTUAL_WG_T(value)) +DEFINE_ASSIGN(uint16_t, LevelCount, VIRTUAL_WG_SIZE levels) +DEFINE_ASSIGN(uint16_t, VirtualWorkgroupSize, uint16_t(0x1u) << VIRTUAL_WG_SIZE value) -DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_0, DEFINE_ITEMS_INVOC_T(value0)) -DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_1, DEFINE_ITEMS_INVOC_T(value1)) -DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_2, DEFINE_ITEMS_INVOC_T(value2)) +DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_0, ITEMS_PER_INVOC value0) +DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_1, ITEMS_PER_INVOC value1) +DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_2, ITEMS_PER_INVOC value2) -DEFINE_ASSIGN(uint16_t, LevelInputCount_1, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3), - DEFINE_MPL_MAX_V(uint16_t, (DEFINE_CONFIG_T(VirtualWorkgroupSize)>>DEFINE_CONFIG_T(SubgroupSizeLog2)), DEFINE_CONFIG_T(SubgroupSize)), - DEFINE_CONFIG_T(SubgroupSize)*DEFINE_CONFIG_T(ItemsPerInvocation_1))) -DEFINE_ASSIGN(uint16_t, 
LevelInputCount_2, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3),DEFINE_CONFIG_T(SubgroupSize)*DEFINE_CONFIG_T(ItemsPerInvocation_2),0)) -DEFINE_ASSIGN(uint16_t, VirtualInvocationsAtLevel1, DEFINE_CONFIG_T(LevelInputCount_1) / DEFINE_CONFIG_T(ItemsPerInvocation_1)) +DEFINE_ASSIGN(uint16_t, LevelInputCount_1, SELECT(uint16_t,(LevelCount==3), + MAX(uint16_t, (VirtualWorkgroupSize>>SubgroupSizeLog2), SubgroupSize), + SubgroupSize*ItemsPerInvocation_1)) +DEFINE_ASSIGN(uint16_t, LevelInputCount_2, SELECT(uint16_t,(LevelCount==3),SubgroupSize*ItemsPerInvocation_2,0)) +DEFINE_ASSIGN(uint16_t, VirtualInvocationsAtLevel1, LevelInputCount_1 / ItemsPerInvocation_1) -DEFINE_ASSIGN(uint16_t, __padding, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3),DEFINE_CONFIG_T(SubgroupSize)-1,0)) -DEFINE_ASSIGN(uint16_t, __channelStride_1, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3),DEFINE_CONFIG_T(VirtualInvocationsAtLevel1),DEFINE_CONFIG_T(SubgroupSize)) + DEFINE_CONFIG_T(__padding)) -DEFINE_ASSIGN(uint16_t, __channelStride_2, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3),DEFINE_CONFIG_T(SubgroupSize),0)) +DEFINE_ASSIGN(uint16_t, __padding, SELECT(uint16_t,(LevelCount==3),SubgroupSize-1,0)) +DEFINE_ASSIGN(uint16_t, __channelStride_1, SELECT(uint16_t,(LevelCount==3),VirtualInvocationsAtLevel1,SubgroupSize) + __padding) +DEFINE_ASSIGN(uint16_t, __channelStride_2, SELECT(uint16_t,(LevelCount==3),SubgroupSize,0)) // user specified the shared mem size of Scalars -DEFINE_ASSIGN(uint32_t, SharedScratchElementCount, DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==1), +DEFINE_ASSIGN(uint32_t, SharedScratchElementCount, SELECT(uint16_t,(LevelCount==1), 0, - DEFINE_COND_VAL(uint16_t,(DEFINE_CONFIG_T(LevelCount)==3), - DEFINE_CONFIG_T(LevelInputCount_2)+(DEFINE_CONFIG_T(SubgroupSize)*DEFINE_CONFIG_T(ItemsPerInvocation_1))-1, + SELECT(uint16_t,(LevelCount==3), + LevelInputCount_2+(SubgroupSize*ItemsPerInvocation_1)-1, 0 - ) + 
DEFINE_CONFIG_T(LevelInputCount_1) + ) + LevelInputCount_1 )) diff --git a/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl b/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl index 857b64d774..c32d7ef8bd 100644 --- a/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl @@ -2,7 +2,7 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -DEFINE_ASSIGN(uint16_t, ItemsPerInvocationProductLog2, DEFINE_MPL_MAX_V(int16_t,DEFINE_VIRTUAL_WG_T(WorkgroupSizeLog2)-DEFINE_VIRTUAL_WG_T(SubgroupSizeLog2)*DEFINE_VIRTUAL_WG_T(levels),0)) +DEFINE_ASSIGN(uint16_t, ItemsPerInvocationProductLog2, MAX(int16_t,VIRTUAL_WG_SIZE WorkgroupSizeLog2-VIRTUAL_WG_SIZE SubgroupSizeLog2*VIRTUAL_WG_SIZE levels,0)) DEFINE_ASSIGN(uint16_t, value0, BaseItemsPerInvocation) -DEFINE_ASSIGN(uint16_t, value1, uint16_t(0x1u) << DEFINE_COND_VAL(uint16_t,(DEFINE_VIRTUAL_WG_T(levels)==3),DEFINE_MPL_MIN_V(uint16_t,DEFINE_ITEMS_INVOC_T(ItemsPerInvocationProductLog2),2),DEFINE_ITEMS_INVOC_T(ItemsPerInvocationProductLog2))) -DEFINE_ASSIGN(uint16_t, value2, uint16_t(0x1u) << DEFINE_MPL_MAX_V(int16_t,DEFINE_ITEMS_INVOC_T(ItemsPerInvocationProductLog2)-2,0)) \ No newline at end of file +DEFINE_ASSIGN(uint16_t, value1, uint16_t(0x1u) << SELECT(uint16_t,(VIRTUAL_WG_SIZE levels==3),MIN(uint16_t,ItemsPerInvocationProductLog2,2),ItemsPerInvocationProductLog2)) +DEFINE_ASSIGN(uint16_t, value2, uint16_t(0x1u) << MAX(int16_t,ItemsPerInvocationProductLog2-2,0)) \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl b/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl index 3190ba5df3..e4c4047f1d 100644 --- a/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl @@ -4,5 +4,5 @@ 
DEFINE_ASSIGN(uint16_t, WorkgroupSizeLog2, _WorkgroupSizeLog2) DEFINE_ASSIGN(uint16_t, SubgroupSizeLog2, _SubgroupSizeLog2) -DEFINE_ASSIGN(uint16_t, levels, DEFINE_COND_VAL(uint16_t,(_WorkgroupSizeLog2>_SubgroupSizeLog2),DEFINE_COND_VAL(uint16_t,(_WorkgroupSizeLog2>_SubgroupSizeLog2*2+2),3,2),1)) -DEFINE_ASSIGN(uint16_t, value, DEFINE_MPL_MAX_V(uint16_t, _SubgroupSizeLog2*DEFINE_VIRTUAL_WG_T(levels), _WorkgroupSizeLog2)) +DEFINE_ASSIGN(uint16_t, levels, SELECT(uint16_t,(_WorkgroupSizeLog2>_SubgroupSizeLog2),SELECT(uint16_t,(_WorkgroupSizeLog2>_SubgroupSizeLog2*2+2),3,2),1)) +DEFINE_ASSIGN(uint16_t, value, MAX(uint16_t, _SubgroupSizeLog2*levels, _WorkgroupSizeLog2)) diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index d051c2153b..a3d15744a7 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -347,6 +347,9 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shared_scan.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shuffle.hlsl") #workgroup2 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/arithmetic_config.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/impl/virtual_wg_size_def.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/impl/items_per_invoc_def.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/impl/arithmetic_config_def.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/arithmetic.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/shared_scan.hlsl") #Extensions From 9b340a4df6627b3abd3950312c8629d9c1782fb8 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 16 Jun 2025 13:29:25 +0200 Subject: [PATCH 332/346] set the `examples_tests` submodule back to `master` HEAD as workgroup scan example not ready yet --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 4c10dc1cdb..e30938c261 160000 --- 
a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 4c10dc1cdba4ab12dfedef97768aa4a10e606213 +Subproject commit e30938c2615dd5d3ab69cadca3ba11d1e03f8233 From 5d990a3698ee69e57aad41376e2c445f18197816 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 11:09:14 +0700 Subject: [PATCH 333/346] Rename ISPIRVDebloater to ISPIRVEntryPointTrimmer --- ...VDebloater.h => ISPIRVEntryPointTrimmer.h} | 18 +-- include/nbl/video/ILogicalDevice.h | 4 +- src/nbl/CMakeLists.txt | 2 +- ...loater.cpp => ISPIRVEntryPointTrimmer.cpp} | 26 ++-- src/nbl/video/CVulkanLogicalDevice.cpp | 2 +- src/nbl/video/ILogicalDevice.cpp | 124 +++++++++--------- 6 files changed, 88 insertions(+), 88 deletions(-) rename include/nbl/asset/utils/{ISPIRVDebloater.h => ISPIRVEntryPointTrimmer.h} (72%) rename src/nbl/asset/utils/{ISPIRVDebloater.cpp => ISPIRVEntryPointTrimmer.cpp} (91%) diff --git a/include/nbl/asset/utils/ISPIRVDebloater.h b/include/nbl/asset/utils/ISPIRVEntryPointTrimmer.h similarity index 72% rename from include/nbl/asset/utils/ISPIRVDebloater.h rename to include/nbl/asset/utils/ISPIRVEntryPointTrimmer.h index f5f87956be..a2e24dabab 100644 --- a/include/nbl/asset/utils/ISPIRVDebloater.h +++ b/include/nbl/asset/utils/ISPIRVEntryPointTrimmer.h @@ -1,5 +1,5 @@ -#ifndef _NBL_ASSET_I_SPIRV_DEBLOATER_H_INCLUDED_ -#define _NBL_ASSET_I_SPIRV_DEBLOATER_H_INCLUDED_ +#ifndef _NBL_ASSET_I_SPIRV_ENTRY_POINT_TRIMMER_H_INCLUDED_ +#define _NBL_ASSET_I_SPIRV_ENTRY_POINT_TRIMMER_H_INCLUDED_ #include "nbl/core/declarations.h" @@ -10,14 +10,14 @@ namespace nbl::asset { -class ISPIRVDebloater final : public core::IReferenceCounted +class ISPIRVEntryPointTrimmer final : public core::IReferenceCounted { public: - ISPIRVDebloater(); + ISPIRVEntryPointTrimmer(); struct Result { - core::smart_refctd_ptr spirv; // nullptr if there is some entry point not found or spirv does not need to be debloated + core::smart_refctd_ptr spirv; // nullptr if there is some entry point not found or spirv 
does not need to be trimmed bool isSuccess; inline operator bool() const @@ -45,9 +45,9 @@ class ISPIRVDebloater final : public core::IReferenceCounted } }; - Result debloat(const ICPUBuffer* spirvBuffer, const core::set& entryPoints, system::logger_opt_ptr logger = nullptr) const; + Result trim(const ICPUBuffer* spirvBuffer, const core::set& entryPoints, system::logger_opt_ptr logger = nullptr) const; - inline core::smart_refctd_ptr debloat(const IShader* shader, const core::set& entryPoints, system::logger_opt_ptr logger = nullptr) const + inline core::smart_refctd_ptr trim(const IShader* shader, const core::set& entryPoints, system::logger_opt_ptr logger = nullptr) const { if (shader->getContentType() != IShader::E_CONTENT_TYPE::ECT_SPIRV) { @@ -55,10 +55,10 @@ class ISPIRVDebloater final : public core::IReferenceCounted return nullptr; } const auto buffer = shader->getContent(); - const auto result = debloat(buffer, entryPoints, logger); + const auto result = trim(buffer, entryPoints, logger); if (result && result.spirv.get() == nullptr) { - // when debloat does not happen return original shader + // when trim does not happen return original shader return core::smart_refctd_ptr(shader); } diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index d8ef2bdef1..def3ee0979 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -3,7 +3,7 @@ #include "nbl/asset/asset.h" #include "nbl/asset/utils/ISPIRVOptimizer.h" -#include "nbl/asset/utils/ISPIRVDebloater.h" +#include "nbl/asset/utils/ISPIRVEntryPointTrimmer.h" #include "nbl/asset/utils/CCompilerSet.h" #include "nbl/video/SPhysicalDeviceFeatures.h" @@ -1315,7 +1315,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe uint16_t firstQueueIndex = 0u; }; const std::array m_queueFamilyInfos; - core::smart_refctd_ptr m_spirvDebloater; + core::smart_refctd_ptr m_spirvTrimmer; private: const SPhysicalDeviceLimits& 
getPhysicalDeviceLimits() const; diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index b484464fb3..2dddc74f77 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -162,7 +162,7 @@ set(NBL_ASSET_SOURCES # Shaders ${NBL_ROOT_PATH}/src/nbl/asset/utils/ISPIRVOptimizer.cpp - ${NBL_ROOT_PATH}/src/nbl/asset/utils/ISPIRVDebloater.cpp + ${NBL_ROOT_PATH}/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp ${NBL_ROOT_PATH}/src/nbl/asset/utils/IShaderCompiler.cpp ${NBL_ROOT_PATH}/src/nbl/asset/utils/CGLSLCompiler.cpp ${NBL_ROOT_PATH}/src/nbl/asset/utils/CHLSLCompiler.cpp diff --git a/src/nbl/asset/utils/ISPIRVDebloater.cpp b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp similarity index 91% rename from src/nbl/asset/utils/ISPIRVDebloater.cpp rename to src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp index f05e9d70f5..981133536d 100644 --- a/src/nbl/asset/utils/ISPIRVDebloater.cpp +++ b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp @@ -1,4 +1,4 @@ -#include "nbl/asset/utils/ISPIRVDebloater.h" +#include "nbl/asset/utils/ISPIRVEntryPointTrimmer.h" #include "nbl/asset/utils/ISPIRVOptimizer.h" #include "nbl_spirv_cross/spirv.hpp" @@ -10,7 +10,7 @@ using namespace nbl::asset; static constexpr spv_target_env SPIRV_VERSION = spv_target_env::SPV_ENV_UNIVERSAL_1_6; -ISPIRVDebloater::ISPIRVDebloater() +ISPIRVEntryPointTrimmer::ISPIRVEntryPointTrimmer() { constexpr auto optimizationPasses = std::array{ ISPIRVOptimizer::EOP_DEAD_BRANCH_ELIM, @@ -78,7 +78,7 @@ static bool validate(const uint32_t* binary, uint32_t binarySize, nbl::system::l return core.Validate(binary, binarySize, validatorOptions); } -ISPIRVDebloater::Result ISPIRVDebloater::debloat(const ICPUBuffer* spirvBuffer, const core::set& entryPoints, system::logger_opt_ptr logger) const +ISPIRVEntryPointTrimmer::Result ISPIRVEntryPointTrimmer::trim(const ICPUBuffer* spirvBuffer, const core::set& entryPoints, system::logger_opt_ptr logger) const { const auto* spirv = static_cast(spirvBuffer->getPointer()); 
const auto spirvDwordCount = spirvBuffer->getSize() / 4; @@ -134,7 +134,7 @@ ISPIRVDebloater::Result ISPIRVDebloater::debloat(const ICPUBuffer* spirvBuffer, std::vector minimizedSpirv; core::unordered_set removedEntryPointIds; - bool needDebloat = false; + bool needtrim = false; auto offset = HEADER_SIZE; auto parse_instruction = [](uint32_t instruction) -> std::tuple { @@ -185,16 +185,16 @@ ISPIRVDebloater::Result ISPIRVDebloater::debloat(const ICPUBuffer* spirvBuffer, foundEntryPoint += 1; // a valid spirv will have unique entry points, so this should works } else { - if (needDebloat == false) + if (needtrim == false) { minimizedSpirv.reserve(spirvDwordCount); minimizedSpirv.insert(minimizedSpirv.end(), spirv, spirv + curOffset); - needDebloat = true; + needtrim = true; } removedEntryPointIds.insert(curEntryPointId); continue; } - if (!needDebloat) continue; + if (!needtrim) continue; minimizedSpirv.insert(minimizedSpirv.end(), spirv + curOffset, spirv + offset); } @@ -208,7 +208,7 @@ ISPIRVDebloater::Result ISPIRVDebloater::debloat(const ICPUBuffer* spirvBuffer, }; } - if (!needDebloat) + if (!needtrim) { return { .spirv = nullptr, @@ -236,22 +236,22 @@ ISPIRVDebloater::Result ISPIRVDebloater::debloat(const ICPUBuffer* spirvBuffer, assert(validate(minimizedSpirv.data(), minimizedSpirv.size(), logger)); - auto debloatedSpirv = m_optimizer->optimize(minimizedSpirv.data(), minimizedSpirv.size(), logger); + auto trimmedSpirv = m_optimizer->optimize(minimizedSpirv.data(), minimizedSpirv.size(), logger); #ifdef _NBL_DEBUG logger.log("Before stripping capabilities:", nbl::system::ILogger::ELL_DEBUG); printCapabilities(spirv, spirvDwordCount, logger); logger.log("\n", nbl::system::ILogger::ELL_DEBUG); - const auto* debloatedSpirvBuffer = static_cast(debloatedSpirv->getPointer()); - const auto debloatedSpirvDwordCount = debloatedSpirv->getSize() / 4; + const auto* trimmedSpirvBuffer = static_cast(trimmedSpirv->getPointer()); + const auto trimmedSpirvDwordCount = 
trimmedSpirv->getSize() / 4; logger.log("After stripping capabilities:", nbl::system::ILogger::ELL_DEBUG); - printCapabilities(debloatedSpirvBuffer, debloatedSpirvDwordCount, logger); + printCapabilities(trimmedSpirvBuffer, trimmedSpirvDwordCount, logger); logger.log("\n", nbl::system::ILogger::ELL_DEBUG); #endif return { - .spirv = std::move(debloatedSpirv), + .spirv = std::move(trimmedSpirv), .isSuccess = true, }; diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 89f7ab1da3..9757182bcc 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -1,6 +1,6 @@ #include "nbl/video/CVulkanLogicalDevice.h" -#include "nbl/asset/utils/ISPIRVDebloater.h" +#include "nbl/asset/utils/ISPIRVEntryPointTrimmer.h" #include "nbl/video/CThreadSafeQueueAdapter.h" #include "nbl/video/surface/CSurfaceVulkan.h" diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 19dc001d8f..225a33bec3 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -7,17 +7,17 @@ using namespace nbl; using namespace nbl::video; -class SpirvDebloatTask +class SpirvTrimTask { public: - using EntryPoints = core::set; + using EntryPoints = core::set; struct ShaderInfo { EntryPoints entryPoints; - const asset::IShader* debloatedShaders; + const asset::IShader* trimmedShaders; }; - SpirvDebloatTask(asset::ISPIRVDebloater* debloater, system::logger_opt_ptr logger) : m_debloater(debloater), m_logger(logger) + SpirvTrimTask(asset::ISPIRVEntryPointTrimmer* trimer, system::logger_opt_ptr logger) : m_trimmer(trimer), m_logger(logger) { } @@ -31,39 +31,39 @@ class SpirvDebloatTask it->second.entryPoints.insert({ .name = shaderSpec.entryPoint, .stage = stage }); } - IGPUPipelineBase::SShaderSpecInfo debloat(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, core::vector>& outShaders) + IGPUPipelineBase::SShaderSpecInfo trim(const IGPUPipelineBase::SShaderSpecInfo& 
shaderSpec, core::vector>& outShaders) { const auto* shader = shaderSpec.shader; auto findResult = m_shaderInfoMap.find(shader); assert(findResult != m_shaderInfoMap.end()); const auto& entryPoints = findResult->second.entryPoints; - auto& debloatedShader = findResult->second.debloatedShaders; + auto& trimmedShader = findResult->second.trimmedShaders; - auto debloatedShaderSpec = shaderSpec; + auto trimmedShaderSpec = shaderSpec; if (shader != nullptr) { - if (debloatedShader == nullptr) + if (trimmedShader == nullptr) { const auto outShadersData = outShaders.data(); - outShaders.push_back(m_debloater->debloat(shader, entryPoints, m_logger)); + outShaders.push_back(m_trimmer->trim(shader, entryPoints, m_logger)); assert(outShadersData == outShaders.data()); - debloatedShader = outShaders.back().get(); + trimmedShader = outShaders.back().get(); } - debloatedShaderSpec.shader = debloatedShader; + trimmedShaderSpec.shader = trimmedShader; } - return debloatedShaderSpec; + return trimmedShaderSpec; } private: core::map m_shaderInfoMap; - asset::ISPIRVDebloater* m_debloater; + asset::ISPIRVEntryPointTrimmer* m_trimmer; const system::logger_opt_ptr m_logger; }; ILogicalDevice::ILogicalDevice(core::smart_refctd_ptr&& api, const IPhysicalDevice* const physicalDevice, const SCreationParams& params, const bool runningInRenderdoc) : m_api(api), m_physicalDevice(physicalDevice), m_enabledFeatures(params.featuresToEnable), m_compilerSet(params.compilerSet), m_logger(m_physicalDevice->getDebugCallback() ? 
m_physicalDevice->getDebugCallback()->getLogger() : nullptr), - m_spirvDebloater(core::make_smart_refctd_ptr()) + m_spirvTrimmer(core::make_smart_refctd_ptr()) { { uint32_t qcnt = 0u; @@ -805,18 +805,18 @@ bool ILogicalDevice::createComputePipelines(IGPUPipelineCache* const pipelineCac core::vector newParams(params.begin(), params.end()); const auto shaderCount = params.size(); - core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling - debloatedShaders.reserve(shaderCount); + core::vector> trimmedShaders; // vector to hold all the trimmed shaders, so the pointer from the new ShaderSpecInfo is not dangling + trimmedShaders.reserve(shaderCount); for (auto ix = 0u; ix < params.size(); ix++) { const auto& ci = params[ix]; - const core::set entryPoints = { asset::ISPIRVDebloater::EntryPoint{.name = ci.shader.entryPoint, .stage = hlsl::ShaderStage::ESS_COMPUTE} }; - debloatedShaders.push_back(m_spirvDebloater->debloat(ci.shader.shader, entryPoints, m_logger)); - auto debloatedShaderSpec = ci.shader; - debloatedShaderSpec.shader = debloatedShaders.back().get(); - newParams[ix].shader = debloatedShaderSpec; + const core::set entryPoints = { asset::ISPIRVEntryPointTrimmer::EntryPoint{.name = ci.shader.entryPoint, .stage = hlsl::ShaderStage::ESS_COMPUTE} }; + trimmedShaders.push_back(m_spirvTrimmer->trim(ci.shader.shader, entryPoints, m_logger)); + auto trimmedShaderSpec = ci.shader; + trimmedShaderSpec.shader = trimmedShaders.back().get(); + newParams[ix].shader = trimmedShaderSpec; } createComputePipelines_impl(pipelineCache,newParams,output,specConstantValidation); @@ -856,8 +856,8 @@ bool ILogicalDevice::createGraphicsPipelines( { return sum + param.getShaderCount(); }); - core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling - debloatedShaders.reserve(shaderCount); + core::vector> trimmedShaders; // vector to 
hold all the trimmed shaders, so the pointer from the new ShaderSpecInfo is not dangling + trimmedShaders.reserve(shaderCount); for (auto ix = 0u; ix < params.size(); ix++) { @@ -973,18 +973,18 @@ bool ILogicalDevice::createGraphicsPipelines( } } - SpirvDebloatTask debloatTask(m_spirvDebloater.get(), m_logger); - debloatTask.insertEntryPoint(ci.vertexShader, hlsl::ShaderStage::ESS_VERTEX); - debloatTask.insertEntryPoint(ci.tesselationControlShader, hlsl::ShaderStage::ESS_TESSELLATION_CONTROL); - debloatTask.insertEntryPoint(ci.tesselationEvaluationShader, hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION); - debloatTask.insertEntryPoint(ci.geometryShader, hlsl::ShaderStage::ESS_GEOMETRY); - debloatTask.insertEntryPoint(ci.fragmentShader, hlsl::ShaderStage::ESS_FRAGMENT); + SpirvTrimTask trimTask(m_spirvTrimmer.get(), m_logger); + trimTask.insertEntryPoint(ci.vertexShader, hlsl::ShaderStage::ESS_VERTEX); + trimTask.insertEntryPoint(ci.tesselationControlShader, hlsl::ShaderStage::ESS_TESSELLATION_CONTROL); + trimTask.insertEntryPoint(ci.tesselationEvaluationShader, hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION); + trimTask.insertEntryPoint(ci.geometryShader, hlsl::ShaderStage::ESS_GEOMETRY); + trimTask.insertEntryPoint(ci.fragmentShader, hlsl::ShaderStage::ESS_FRAGMENT); - newParams[ix].vertexShader = debloatTask.debloat(ci.vertexShader, debloatedShaders); - newParams[ix].tesselationControlShader = debloatTask.debloat(ci.tesselationControlShader, debloatedShaders); - newParams[ix].tesselationEvaluationShader = debloatTask.debloat(ci.tesselationEvaluationShader, debloatedShaders); - newParams[ix].geometryShader = debloatTask.debloat(ci.geometryShader, debloatedShaders); - newParams[ix].fragmentShader = debloatTask.debloat(ci.fragmentShader, debloatedShaders); + newParams[ix].vertexShader = trimTask.trim(ci.vertexShader, trimmedShaders); + newParams[ix].tesselationControlShader = trimTask.trim(ci.tesselationControlShader, trimmedShaders); + 
newParams[ix].tesselationEvaluationShader = trimTask.trim(ci.tesselationEvaluationShader, trimmedShaders); + newParams[ix].geometryShader = trimTask.trim(ci.geometryShader, trimmedShaders); + newParams[ix].fragmentShader = trimTask.trim(ci.fragmentShader, trimmedShaders); } createGraphicsPipelines_impl(pipelineCache, newParams, output, specConstantValidation); @@ -1074,8 +1074,8 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline return sum + param.shaderGroups.getCallableShaderCount(); }); const auto shaderCount = raygenCount + missShaderCount + hitShaderCount + callableShaderCount; - core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling - debloatedShaders.reserve(shaderCount); + core::vector> trimmedShaders; // vector to hold all the trimmed shaders, so the pointer from the new ShaderSpecInfo is not dangling + trimmedShaders.reserve(shaderCount); const auto missGroupCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) { @@ -1091,12 +1091,12 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline }); - core::vector debloatedMissSpecs(missGroupCount); - auto debloatedMissSpecData = debloatedMissSpecs.data(); - core::vector debloatedHitSpecs(hitGroupCount); - auto debloatedHitSpecData = debloatedHitSpecs.data(); - core::vector debloatedCallableSpecs(callableGroupCount); - auto debloatedCallableSpecData = debloatedCallableSpecs.data(); + core::vector trimmedMissSpecs(missGroupCount); + auto trimmedMissSpecData = trimmedMissSpecs.data(); + core::vector trimmedHitSpecs(hitGroupCount); + auto trimmedHitSpecData = trimmedHitSpecs.data(); + core::vector trimmedCallableSpecs(callableGroupCount); + auto trimmedCallableSpecData = trimmedCallableSpecs.data(); const auto& limits = getPhysicalDeviceLimits(); for (auto ix = 0u; ix < params.size(); ix++) @@ -1111,45 +1111,45 @@ bool 
ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline return false; } - SpirvDebloatTask debloatTask(m_spirvDebloater.get(), m_logger); - debloatTask.insertEntryPoint(param.shaderGroups.raygen, hlsl::ShaderStage::ESS_RAYGEN); + SpirvTrimTask trimTask(m_spirvTrimmer.get(), m_logger); + trimTask.insertEntryPoint(param.shaderGroups.raygen, hlsl::ShaderStage::ESS_RAYGEN); for (const auto& miss : param.shaderGroups.misses) - debloatTask.insertEntryPoint(miss, hlsl::ShaderStage::ESS_MISS); + trimTask.insertEntryPoint(miss, hlsl::ShaderStage::ESS_MISS); for (const auto& hit : param.shaderGroups.hits) { - debloatTask.insertEntryPoint(hit.closestHit, hlsl::ShaderStage::ESS_CLOSEST_HIT); - debloatTask.insertEntryPoint(hit.anyHit, hlsl::ShaderStage::ESS_ANY_HIT); - debloatTask.insertEntryPoint(hit.intersection, hlsl::ShaderStage::ESS_INTERSECTION); + trimTask.insertEntryPoint(hit.closestHit, hlsl::ShaderStage::ESS_CLOSEST_HIT); + trimTask.insertEntryPoint(hit.anyHit, hlsl::ShaderStage::ESS_ANY_HIT); + trimTask.insertEntryPoint(hit.intersection, hlsl::ShaderStage::ESS_INTERSECTION); } for (const auto& callable : param.shaderGroups.callables) - debloatTask.insertEntryPoint(callable, hlsl::ShaderStage::ESS_CALLABLE); + trimTask.insertEntryPoint(callable, hlsl::ShaderStage::ESS_CALLABLE); newParams[ix] = param; - newParams[ix].shaderGroups.raygen = debloatTask.debloat(param.shaderGroups.raygen, debloatedShaders); + newParams[ix].shaderGroups.raygen = trimTask.trim(param.shaderGroups.raygen, trimmedShaders); - newParams[ix].shaderGroups.misses = debloatedMissSpecs; + newParams[ix].shaderGroups.misses = trimmedMissSpecs; for (const auto& miss: param.shaderGroups.misses) { - *debloatedMissSpecData = debloatTask.debloat(miss, debloatedShaders); - debloatedMissSpecData++; + *trimmedMissSpecData = trimTask.trim(miss, trimmedShaders); + trimmedMissSpecData++; } - newParams[ix].shaderGroups.hits = debloatedHitSpecs; + newParams[ix].shaderGroups.hits = 
trimmedHitSpecs; for (const auto& hit: param.shaderGroups.hits) { - *debloatedHitSpecData = { - .closestHit = debloatTask.debloat(hit.closestHit, debloatedShaders), - .anyHit = debloatTask.debloat(hit.anyHit, debloatedShaders), - .intersection = debloatTask.debloat(hit.intersection, debloatedShaders), + *trimmedHitSpecData = { + .closestHit = trimTask.trim(hit.closestHit, trimmedShaders), + .anyHit = trimTask.trim(hit.anyHit, trimmedShaders), + .intersection = trimTask.trim(hit.intersection, trimmedShaders), }; - debloatedHitSpecData++; + trimmedHitSpecData++; } - newParams[ix].shaderGroups.callables = debloatedCallableSpecs; + newParams[ix].shaderGroups.callables = trimmedCallableSpecs; for (const auto& callable: param.shaderGroups.callables) { - *debloatedCallableSpecData = debloatTask.debloat(callable, debloatedShaders); - debloatedCallableSpecData++; + *trimmedCallableSpecData = trimTask.trim(callable, trimmedShaders); + trimmedCallableSpecData++; } } From 6505cde350e4ea9a36b0ee37a64e4846157f2d68 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 14:00:15 +0700 Subject: [PATCH 334/346] Fix indentation to use tabs --- include/nbl/asset/ICPUDescriptorSet.h | 56 +++++++++++++-------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index 29cfe4cb1d..53151068ae 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -90,34 +90,34 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet m_descriptorInfos[static_cast(IDescriptor::E_TYPE::ET_COUNT)]; - inline void visitDependents_impl(std::function visit) const override - { - for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) - { - if (!m_descriptorInfos[i]) continue; - const auto size = m_descriptorInfos[i]->size(); - for (auto desc_i = 0u; desc_i < size; desc_i++) - { - auto* desc = m_descriptorInfos[i]->operator[](desc_i).desc.get(); - if 
(!desc) continue; - switch (IDescriptor::GetTypeCategory(static_cast(i))) - { - case IDescriptor::EC_BUFFER: - if (!visit(static_cast(desc))) return; - case IDescriptor::EC_SAMPLER: - if (!visit(static_cast(desc))) return; - case IDescriptor::EC_IMAGE: - if (!visit(static_cast(desc))) return; - case IDescriptor::EC_BUFFER_VIEW: - if (!visit(static_cast(desc))) return; - case IDescriptor::EC_ACCELERATION_STRUCTURE: - if (!visit(static_cast(desc))) return; - default: - break; - } - } - } - } + inline void visitDependents_impl(std::function visit) const override + { + for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) + { + if (!m_descriptorInfos[i]) continue; + const auto size = m_descriptorInfos[i]->size(); + for (auto desc_i = 0u; desc_i < size; desc_i++) + { + auto* desc = m_descriptorInfos[i]->operator[](desc_i).desc.get(); + if (!desc) continue; + switch (IDescriptor::GetTypeCategory(static_cast(i))) + { + case IDescriptor::EC_BUFFER: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_SAMPLER: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_IMAGE: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_BUFFER_VIEW: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_ACCELERATION_STRUCTURE: + if (!visit(static_cast(desc))) return; + default: + break; + } + } + } + } }; } From 11df7a6b89ae6d79c2b6b42e059daa4d069c96ce Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 14:00:48 +0700 Subject: [PATCH 335/346] Initial refinement for IDescriptorSet::valid() --- include/nbl/asset/ICPUDescriptorSet.h | 28 +++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index 53151068ae..857a437567 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -79,6 +79,34 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSetvalid()) return false; + for (auto 
type_i = 0u; type_i < static_cast(IDescriptor::E_TYPE::ET_COUNT); type_i++) + { + const auto descriptorType = static_cast(type_i); + const auto descriptorCategory = IDescriptor::GetTypeCategory(descriptorType); + const auto& descriptorRedirect = m_layout->getDescriptorRedirect(descriptorType); + const auto& descriptorInfoArr = m_descriptorInfos[type_i]; + + if (descriptorInfoArr->size() != descriptorRedirect.getTotalCount()) return false; + + auto offset = 0; + for (auto binding_i = 0; binding_i < descriptorRedirect.getBindingCount(); binding_i++) + { + const auto storageIndex = IDescriptorSetLayoutBase::CBindingRedirect::storage_range_index_t(binding_i); + const auto descriptorCount = descriptorRedirect.getCount(storageIndex); + const auto createFlags = descriptorRedirect.getCreateFlags(storageIndex); + const auto isPartiallyBound = !createFlags.hasFlags(IDescriptorSetLayoutBase::SBindingBase::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT); + for (auto descriptor_i = 0; descriptor_i < descriptorCount; descriptor_i++) + { + const auto storageOffset = IDescriptorSetLayoutBase::CBindingRedirect::storage_offset_t(offset); + const auto& descriptorInfo = descriptorInfoArr->operator[](offset); + + // partiallyBound layout can have null descriptor, otherwise not + if (!isPartiallyBound && !descriptorInfo.desc) return false; + if (descriptorInfo.desc && descriptorInfo.desc->getTypeCategory() != descriptorCategory) return false; + } + } + } + return true; } From 033c7cfbc061c8e2075b3e3fc5e5ae0ac54b39a0 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 15:17:16 +0700 Subject: [PATCH 336/346] Remove unnecessary final specifier --- include/nbl/asset/ICPUGraphicsPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index a95a82633c..f4583f2a37 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -70,7 +70,7 @@ class 
ICPUGraphicsPipeline final : public ICPUPipelinevalid())return false; From 3cf455406a03605eba19ebb907ed9da86ef8ed11 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 15:18:29 +0700 Subject: [PATCH 337/346] Add const to hlsl::ShaderStage --- include/nbl/asset/ICPUComputePipeline.h | 4 ++-- include/nbl/asset/ICPUGraphicsPipeline.h | 8 ++++---- include/nbl/asset/ICPUPipeline.h | 4 ++-- include/nbl/asset/ICPURayTracingPipeline.h | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index 9b867e3a06..ffcf78e908 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -29,14 +29,14 @@ class ICPUComputePipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) const override + inline std::span getSpecInfos(const hlsl::ShaderStage stage) const override { if (stage==hlsl::ShaderStage::ESS_COMPUTE) return {&m_specInfo,1}; return {}; } - inline std::span getSpecInfos(hlsl::ShaderStage stage) + inline std::span getSpecInfos(const hlsl::ShaderStage stage) { return base_t::getSpecInfos(stage); } diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index f4583f2a37..acc990f18c 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -40,7 +40,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) const override final + inline std::span getSpecInfos(const hlsl::ShaderStage stage) const override final { const auto stageIndex = stageToIndex(stage); if (stageIndex != -1) @@ -48,12 +48,12 @@ class ICPUGraphicsPipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) + inline std::span getSpecInfos(const hlsl::ShaderStage stage) { return base_t::getSpecInfos(stage); } - SShaderSpecInfo* getSpecInfo(hlsl::ShaderStage stage) + SShaderSpecInfo* getSpecInfo(const 
hlsl::ShaderStage stage) { if (!isMutable()) return nullptr; const auto stageIndex = stageToIndex(stage); @@ -62,7 +62,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) const = 0; + virtual std::span getSpecInfos(const hlsl::ShaderStage stage) const = 0; }; @@ -132,7 +132,7 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipe } // Note(kevinyu): For some reason overload resolution cannot find this function when I name id getSpecInfos. It always use the const variant. Will check on it later. - inline std::span getSpecInfos(hlsl::ShaderStage stage) + inline std::span getSpecInfos(const hlsl::ShaderStage stage) { if (!isMutable()) return {}; const this_t* constPipeline = const_cast(this); diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h index 2c157f91e9..f56a5f6b46 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -36,7 +36,7 @@ class ICPURayTracingPipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) const override final + inline std::span getSpecInfos(const hlsl::ShaderStage stage) const override final { switch (stage) { @@ -57,12 +57,12 @@ class ICPURayTracingPipeline final : public ICPUPipeline getSpecInfos(hlsl::ShaderStage stage) + inline std::span getSpecInfos(const hlsl::ShaderStage stage) { return base_t::getSpecInfos(stage); } - inline core::vector* getSpecInfoVector(hlsl::ShaderStage stage) + inline core::vector* getSpecInfoVector(const hlsl::ShaderStage stage) { if (!isMutable()) return nullptr; switch (stage) From 9eab2f862aa41392b154d25ddee4cf9942438d45 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 15:18:57 +0700 Subject: [PATCH 338/346] Remove unnecessary final specifier --- include/nbl/asset/ICPURayTracingPipeline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/asset/ICPURayTracingPipeline.h 
b/include/nbl/asset/ICPURayTracingPipeline.h index f56a5f6b46..17c53557e1 100644 --- a/include/nbl/asset/ICPURayTracingPipeline.h +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -36,7 +36,7 @@ class ICPURayTracingPipeline final : public ICPUPipeline getSpecInfos(const hlsl::ShaderStage stage) const override final + inline std::span getSpecInfos(const hlsl::ShaderStage stage) const override { switch (stage) { From 74241f3406ee17f93caf9dd860b5df2f8392d084 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 15:19:35 +0700 Subject: [PATCH 339/346] Add comment on why we need multiple dead branch elimination and multiple dead function pass --- src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp index 981133536d..361f5d3cfa 100644 --- a/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp +++ b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp @@ -12,6 +12,7 @@ static constexpr spv_target_env SPIRV_VERSION = spv_target_env::SPV_ENV_UNIVERSA ISPIRVEntryPointTrimmer::ISPIRVEntryPointTrimmer() { + // Multiple dead branch and dead function elimination because the first entry point removal might result to dead branch. Then the dead branch might result to dead function. Then, the dead function might result to dead branch and so on. 
constexpr auto optimizationPasses = std::array{ ISPIRVOptimizer::EOP_DEAD_BRANCH_ELIM, ISPIRVOptimizer::EOP_ELIM_DEAD_FUNCTIONS, From 04bcf0d86cfcaee56ee1df9b9aa71405b5ef9f86 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 15:19:56 +0700 Subject: [PATCH 340/346] Remove unused variable --- include/nbl/asset/ICPUDescriptorSet.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index 857a437567..4247283c0e 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -97,7 +97,6 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSetoperator[](offset); // partiallyBound layout can have null descriptor, otherwise not From fc1983f3a1d2a15424a0fcce860aab27f23e4548 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 16:50:14 +0700 Subject: [PATCH 341/346] Small fixes on SpirvTrimTask --- include/nbl/video/IGPURayTracingPipeline.h | 33 ---------------------- src/nbl/video/ILogicalDevice.cpp | 23 ++------------- 2 files changed, 3 insertions(+), 53 deletions(-) diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index 56c7b38c29..690e6685d3 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -39,39 +39,6 @@ class IGPURayTracingPipeline : public IGPUPipelinesecond.entryPoints; - auto& trimmedShader = findResult->second.trimmedShaders; + auto& trimmedShader = findResult->second.trimmedShader; auto trimmedShaderSpec = shaderSpec; if (shader != nullptr) { if (trimmedShader == nullptr) { - const auto outShadersData = outShaders.data(); outShaders.push_back(m_trimmer->trim(shader, entryPoints, m_logger)); - assert(outShadersData == outShaders.data()); trimmedShader = outShaders.back().get(); } trimmedShaderSpec.shader = trimmedShader; @@ -1060,22 +1058,7 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const 
pipeline } core::vector newParams(params.begin(), params.end()); - const auto raygenCount = params.size(); // assume every param have raygen - const auto missShaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) - { - return sum + param.shaderGroups.getMissShaderCount(); - }); - const auto hitShaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) - { - return sum + param.shaderGroups.getHitShaderCount(); - }); - const auto callableShaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) - { - return sum + param.shaderGroups.getCallableShaderCount(); - }); - const auto shaderCount = raygenCount + missShaderCount + hitShaderCount + callableShaderCount; core::vector> trimmedShaders; // vector to hold all the trimmed shaders, so the pointer from the new ShaderSpecInfo is not dangling - trimmedShaders.reserve(shaderCount); const auto missGroupCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) { From 3767ede47b3841a6e8982ba1674ae5555c924f0f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 16:52:44 +0700 Subject: [PATCH 342/346] Fix shader indexing logic in ray tracing pipeline creation --- src/nbl/video/CVulkanLogicalDevice.cpp | 37 +++++++++++++++++++------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 9757182bcc..54cc6afdf1 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -1516,10 +1516,28 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( for (const auto& info : createInfos) { - core::unordered_map shaderIndexes; - auto getVkShaderIndex = [&](const asset::IShader* shader) + struct VkShaderStageKey { - const auto index = shader == nullptr ? 
VK_SHADER_UNUSED_KHR : shaderIndexes[shader]; + const asset::IShader* shader; + std::string_view entryPoint; + bool operator==(const VkShaderStageKey& other) const = default; + + struct HashFunction + { + size_t operator()(const VkShaderStageKey& key) const + { + size_t rowHash = std::hash()(key.shader); + size_t colHash = std::hash()(key.entryPoint) << 1; + return rowHash ^ colHash; + } + }; + }; + + core::unordered_map shaderIndexes; + auto getVkShaderIndex = [&](const IGPUPipelineBase::SShaderSpecInfo& spec) + { + const auto key = VkShaderStageKey{ spec.shader, spec.entryPoint }; + const auto index = key.shader == nullptr ? VK_SHADER_UNUSED_KHR : shaderIndexes[key]; return index; }; @@ -1529,7 +1547,7 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR, .pNext = nullptr, .type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR, - .generalShader = getVkShaderIndex(spec.shader), + .generalShader = getVkShaderIndex({spec.shader, spec.entryPoint}), .closestHitShader = VK_SHADER_UNUSED_KHR, .anyHitShader = VK_SHADER_UNUSED_KHR, .intersectionShader = VK_SHADER_UNUSED_KHR, @@ -1543,9 +1561,9 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( .type = group.intersection.shader == nullptr ? 
VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR : VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR, .generalShader = VK_SHADER_UNUSED_KHR, - .closestHitShader = getVkShaderIndex(group.closestHit.shader), - .anyHitShader = getVkShaderIndex(group.anyHit.shader), - .intersectionShader = getVkShaderIndex(group.intersection.shader), + .closestHitShader = getVkShaderIndex(group.closestHit), + .anyHitShader = getVkShaderIndex(group.anyHit), + .intersectionShader = getVkShaderIndex(group.intersection), }; }; @@ -1554,9 +1572,10 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( auto processSpecInfo = [&](const IGPUPipelineBase::SShaderSpecInfo& spec, hlsl::ShaderStage shaderStage) { if (!spec.shader) return; - if (shaderIndexes.find(spec.shader) == shaderIndexes.end()) + const auto key = VkShaderStageKey{ spec.shader, spec.entryPoint }; + if (shaderIndexes.find(key) == shaderIndexes.end()) { - shaderIndexes.insert({ spec.shader, std::distancepStages)>(outCreateInfo->pStages, outShaderStage)}); + shaderIndexes.insert({ key , std::distancepStages)>(outCreateInfo->pStages, outShaderStage)}); *(outShaderStage) = getVkShaderStageCreateInfoFrom(spec, shaderStage, false, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo,outSpecMapEntry,outSpecData); outShaderStage++; } From 061d49cccd93ba371307fa7f9f2045fcdde21219 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 18:09:38 +0700 Subject: [PATCH 343/346] Fix maxShaderStages calculation when creating ray tracing pipeline --- src/nbl/video/CVulkanLogicalDevice.cpp | 59 +++++++++++++++++--------- 1 file changed, 39 insertions(+), 20 deletions(-) diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 54cc6afdf1..9494efc2f2 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -1486,9 +1486,44 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( const VkPipelineCache 
vk_pipelineCache = pipelineCache ? static_cast(pipelineCache)->getInternalObject():VK_NULL_HANDLE; + struct ShaderModuleKey + { + const asset::IShader* shader; + std::string_view entryPoint; + bool operator==(const ShaderModuleKey& other) const = default; + + struct HashFunction + { + size_t operator()(const ShaderModuleKey& key) const + { + size_t rowHash = std::hash()(key.shader); + size_t colHash = std::hash()(key.entryPoint) << 1; + return rowHash ^ colHash; + } + }; + }; size_t maxShaderStages = 0; for (const auto& info : createInfos) - maxShaderStages += info.shaderGroups.getShaderCount(); + { + core::unordered_set shaderModules; + shaderModules.insert({ info.shaderGroups.raygen.shader, info.shaderGroups.raygen.entryPoint }); + for (const auto& miss : info.shaderGroups.misses) + { + shaderModules.insert({ miss.shader, miss.entryPoint }); + } + for (const auto& hit : info.shaderGroups.hits) + { + shaderModules.insert({ hit.closestHit.shader, hit.closestHit.entryPoint }); + shaderModules.insert({ hit.anyHit.shader, hit.anyHit.entryPoint }); + shaderModules.insert({ hit.intersection.shader, hit.intersection.entryPoint }); + } + for (const auto& callable : info.shaderGroups.callables) + { + shaderModules.insert({ callable.shader, callable.entryPoint }); + } + + maxShaderStages += shaderModules.size(); + } size_t maxShaderGroups = 0; for (const auto& info : createInfos) maxShaderGroups += info.shaderGroups.getShaderGroupCount(); @@ -1516,27 +1551,11 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( for (const auto& info : createInfos) { - struct VkShaderStageKey - { - const asset::IShader* shader; - std::string_view entryPoint; - bool operator==(const VkShaderStageKey& other) const = default; - - struct HashFunction - { - size_t operator()(const VkShaderStageKey& key) const - { - size_t rowHash = std::hash()(key.shader); - size_t colHash = std::hash()(key.entryPoint) << 1; - return rowHash ^ colHash; - } - }; - }; - core::unordered_map shaderIndexes; + 
core::unordered_map shaderIndexes; auto getVkShaderIndex = [&](const IGPUPipelineBase::SShaderSpecInfo& spec) { - const auto key = VkShaderStageKey{ spec.shader, spec.entryPoint }; + const auto key = ShaderModuleKey{ spec.shader, spec.entryPoint }; const auto index = key.shader == nullptr ? VK_SHADER_UNUSED_KHR : shaderIndexes[key]; return index; }; @@ -1572,7 +1591,7 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( auto processSpecInfo = [&](const IGPUPipelineBase::SShaderSpecInfo& spec, hlsl::ShaderStage shaderStage) { if (!spec.shader) return; - const auto key = VkShaderStageKey{ spec.shader, spec.entryPoint }; + const auto key = ShaderModuleKey{ spec.shader, spec.entryPoint }; if (shaderIndexes.find(key) == shaderIndexes.end()) { shaderIndexes.insert({ key , std::distancepStages)>(outCreateInfo->pStages, outShaderStage)}); From e6d8727b904f1f10a0aadb82b01ce35487f34953 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 17 Jun 2025 15:59:06 +0200 Subject: [PATCH 344/346] update submodule pointer --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index e30938c261..95d8f78465 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit e30938c2615dd5d3ab69cadca3ba11d1e03f8233 +Subproject commit 95d8f78465e100bb3a926cea412c21891c800b9d From 6fea3e5ca08d69303ba873166cbb60c7268ba18f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 22:07:52 +0700 Subject: [PATCH 345/346] Add agrressive dce pass to remove type and remove multiple round of branch elim --- src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp index 361f5d3cfa..36d76eaf93 100644 --- a/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp +++ b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp @@ -14,12 +14,11 @@ 
ISPIRVEntryPointTrimmer::ISPIRVEntryPointTrimmer() { // Multiple dead branch and dead function elimination because the first entry point removal might result to dead branch. Then the dead branch might result to dead function. Then, the dead function might result to dead branch and so on. constexpr auto optimizationPasses = std::array{ - ISPIRVOptimizer::EOP_DEAD_BRANCH_ELIM, - ISPIRVOptimizer::EOP_ELIM_DEAD_FUNCTIONS, ISPIRVOptimizer::EOP_DEAD_BRANCH_ELIM, ISPIRVOptimizer::EOP_ELIM_DEAD_FUNCTIONS, ISPIRVOptimizer::EOP_ELIM_DEAD_VARIABLES, ISPIRVOptimizer::EOP_ELIM_DEAD_CONSTANTS, + ISPIRVOptimizer::EOP_AGGRESSIVE_DCE, ISPIRVOptimizer::EOP_ELIM_DEAD_MEMBERS, ISPIRVOptimizer::EOP_TRIM_CAPABILITIES, }; From 0aa03c70861118bdefc9eae9c647a58212e68340 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Jun 2025 22:08:37 +0700 Subject: [PATCH 346/346] Remove comment --- src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp index 36d76eaf93..de78d2b162 100644 --- a/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp +++ b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp @@ -12,7 +12,6 @@ static constexpr spv_target_env SPIRV_VERSION = spv_target_env::SPV_ENV_UNIVERSA ISPIRVEntryPointTrimmer::ISPIRVEntryPointTrimmer() { - // Multiple dead branch and dead function elimination because the first entry point removal might result to dead branch. Then the dead branch might result to dead function. Then, the dead function might result to dead branch and so on. constexpr auto optimizationPasses = std::array{ ISPIRVOptimizer::EOP_DEAD_BRANCH_ELIM, ISPIRVOptimizer::EOP_ELIM_DEAD_FUNCTIONS,