From 77413e675a7b26e4f7570386d66eaa92face848a Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Mon, 4 Mar 2024 10:50:40 +0100 Subject: [PATCH 001/358] Downgrade Python to more supported version by PM, bring back my kazoo shutdown implementation which works --- docker/compose/Dockerfile | 4 ++-- docker/scripts/nbl/ci/dev/lib/kazoo.py | 18 +++++------------- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/docker/compose/Dockerfile b/docker/compose/Dockerfile index 60637594b9..a3f166eca0 100644 --- a/docker/compose/Dockerfile +++ b/docker/compose/Dockerfile @@ -68,9 +68,9 @@ RUN ` choco install -y strawberryperl --version 5.28.2.1 RUN ` - # Download & install Python 3.11.7 + # Download & install Python 3.11.6 ` - choco install -y python --version 3.11.7 + choco install -y python --version 3.11.6 RUN ` # Donwload debugpy Python module diff --git a/docker/scripts/nbl/ci/dev/lib/kazoo.py b/docker/scripts/nbl/ci/dev/lib/kazoo.py index 47d5b54205..2e67b4edf4 100644 --- a/docker/scripts/nbl/ci/dev/lib/kazoo.py +++ b/docker/scripts/nbl/ci/dev/lib/kazoo.py @@ -62,16 +62,6 @@ def appendKazooAtomic(self, zNodePath, data): pass -def shutdownOs(): - if os.name == 'nt' or os.name == 'java': # For windows and java (in the rare case of running jython) - return os.system('shutdown /s /f 0') - elif os.name == 'posix': # For Unix, Linux, Mac - return os.system('shutdown -h now') - else: - print('Unknown operating system') # Docs for os.name listed only the above three cases - return 1 - - def healthyCheck(host): try: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: @@ -92,8 +82,10 @@ def healthyCheck(host): if shutdown: print("Requested shutdown...") - if shutdownOs() != 0: - print(f"Could not shutdown container") + try: + subprocess.run(f"shutdown /s /f", check=True) + except subprocess.CalledProcessError as e: + print(f"Could not shutdown container because of: {e.stderr}") return True except (socket.error, socket.timeout): @@ -110,4 +102,4 @@ def healthyCheck(host): if healthyCheck(args.host): sys.exit(0) # healthy else: - sys.exit(1) # not healthy + sys.exit(1) # not healthy \ No newline at end of file From d6ce5d2376f1eb7b08b45074c5272c9cdc4d5934 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Wed, 6 Mar 2024 20:18:27 +0100 Subject: [PATCH 002/358] Create cmake/cpack/find/nabla.cmake & cmake/cpack/find/compoment/template.cmake, script component install rules has been added creating interface install variables in separate files which can be used with find_package. 
TODO: global configuration Nabla file including the component files if found in package, makes flexible search requests for package parts --- 3rdparty/CMakeLists.txt | 2 +- cmake/cpack/find/compoment/template.cmake | 1 + cmake/cpack/find/nabla.cmake | 61 +++++++++++++++++++++++ cmake/cpack/package.cmake | 3 +- 4 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 cmake/cpack/find/compoment/template.cmake create mode 100644 cmake/cpack/find/nabla.cmake diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 6a4c33c0a9..7c2bd276ae 100755 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -63,8 +63,8 @@ set(_OLD_SKIP_INSTALL_ALL ${SKIP_INSTALL_ALL}) set(BUILD_SHARED_LIBS OFF) set(SKIP_INSTALL_ALL ON) -add_subdirectory(zlib zlib EXCLUDE_FROM_ALL) file(LOCK "${CMAKE_CURRENT_SOURCE_DIR}/zlib" DIRECTORY GUARD PROCESS RESULT_VARIABLE NBL_LOCK TIMEOUT 60) +add_subdirectory(zlib zlib EXCLUDE_FROM_ALL) if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/zlib/zconf.h.included") execute_process(COMMAND "${CMAKE_COMMAND}" -E rename zconf.h.included zconf.h WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/zlib" diff --git a/cmake/cpack/find/compoment/template.cmake b/cmake/cpack/find/compoment/template.cmake new file mode 100644 index 0000000000..ce0c6e9108 --- /dev/null +++ b/cmake/cpack/find/compoment/template.cmake @@ -0,0 +1 @@ +list(TRANSFORM @_NBL_PROXY_@ PREPEND "${CMAKE_CURRENT_LIST_DIR}/") \ No newline at end of file diff --git a/cmake/cpack/find/nabla.cmake b/cmake/cpack/find/nabla.cmake new file mode 100644 index 0000000000..2258263cc5 --- /dev/null +++ b/cmake/cpack/find/nabla.cmake @@ -0,0 +1,61 @@ +function(NBL_GEN_FIND_NABLA_CODE_IMPL _COMPOMENT_ _SPATH_) +string(APPEND NBL_FIND_NABLA_IMPL "set(_COMPOMENT_ ${_COMPOMENT_})\nset(_SPATH_ ${_SPATH_})\n\nset(NBL_ROOT_PATH ${NBL_ROOT_PATH})\n\n") +string(APPEND NBL_FIND_NABLA_IMPL +[=[ +if(CMAKE_INSTALL_CONFIG_NAME MATCHES "^([Dd][Ee][Bb][Uu][Gg])$") + set(NBL_CONFIG_PREFIX_PATH debug) +elseif(CMAKE_INSTALL_CONFIG_NAME MATCHES "^([Rr][Ee][Ll][Ww][Ii][Tt][Hh][Dd][Ee][Bb][Ii][Nn][Ff][Oo])$") + set(NBL_CONFIG_PREFIX_PATH relwithdebinfo) +elseif(CMAKE_INSTALL_CONFIG_NAME MATCHES "^([Rr][Ee][Ll][Ee][Aa][Ss][Ee])$") + unset(NBL_CONFIG_PREFIX_PATH) +else() + message(FATAL_ERROR "Internal error, requested \"${CMAKE_INSTALL_CONFIG_NAME}\" configuration is invalid!") +endif() + +string(REPLACE "${CMAKE_INSTALL_PREFIX}" "" NBL_CMAKE_INSTALL_MANIFEST_CONTENT "${CMAKE_INSTALL_MANIFEST_FILES}") +list(REMOVE_DUPLICATES NBL_CMAKE_INSTALL_MANIFEST_CONTENT) + +set(_NBL_PREFIX_ "${CMAKE_INSTALL_PREFIX}/${NBL_CONFIG_PREFIX_PATH}") +string(TOUPPER "${CMAKE_INSTALL_CONFIG_NAME}" _NBL_CONFIG_) +string(TOUPPER "${_COMPOMENT_}" _Cu_) +string(TOLOWER "${_COMPOMENT_}" _Cl_) + +set(NBL_CMAKE_COMPOMENT_OUTPUT_DIRECTORY "${CMAKE_INSTALL_PREFIX}/${NBL_CONFIG_PREFIX_PATH}/cmake/compoment") +set(NBL_CMAKE_COMPOMENT_OUTPUT_FILE "${NBL_CMAKE_COMPOMENT_OUTPUT_DIRECTORY}/NablaConfig${_COMPOMENT_}.cmake") + +cmake_path(RELATIVE_PATH CMAKE_INSTALL_PREFIX BASE_DIRECTORY "${NBL_CMAKE_COMPOMENT_OUTPUT_DIRECTORY}" OUTPUT_VARIABLE _NBL_REL_TO_PREFIX_) + +foreach(_MANIFEST_INSTALL_REL_FILE_ IN LISTS NBL_CMAKE_INSTALL_MANIFEST_CONTENT) + string(FIND "${_MANIFEST_INSTALL_REL_FILE_}" "/${_SPATH_}/" _NBL_FOUND_) + + if(NOT "${_NBL_FOUND_}" STREQUAL "-1") + set(_X_ "${_NBL_REL_TO_PREFIX_}/${_MANIFEST_INSTALL_REL_FILE_}") + cmake_path(NORMAL_PATH _X_ OUTPUT_VARIABLE _X_) + + list(APPEND NBL_INSTALL_${_Cu_}_${_NBL_CONFIG_} "${_X_}") + endif() +endforeach() + 
+set(_NBL_PROXY_ NBL_INSTALL_${_Cu_}_${_NBL_CONFIG_}) + +string(APPEND NBL_MANIFEST_IMPL "set(${_NBL_PROXY_}\n\t${${_NBL_PROXY_}}\n)") +string(REPLACE ";" "\n\t" NBL_MANIFEST_IMPL "${NBL_MANIFEST_IMPL}") +string(CONFIGURE "${NBL_MANIFEST_IMPL}" NBL_MANIFEST_IMPL_CONF) +file(WRITE "${NBL_CMAKE_COMPOMENT_OUTPUT_FILE}" "${NBL_MANIFEST_IMPL_CONF}") + +# the reason behind this weird looking thing is you cannot nest bracket arguments https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-argument +# some variables need evaluation but some not and must be literals, to make this code read-able & work we do a small workaround +configure_file("${NBL_ROOT_PATH}/cmake/cpack/find/compoment/template.cmake" "${NBL_CMAKE_COMPOMENT_OUTPUT_FILE}.tmp" @ONLY) +file(READ "${NBL_CMAKE_COMPOMENT_OUTPUT_FILE}.tmp" _NBL_COMPOMENT_INCLUDE_LIST_TRANFORM_) +file(REMOVE "${NBL_CMAKE_COMPOMENT_OUTPUT_FILE}.tmp") +file(APPEND "${NBL_CMAKE_COMPOMENT_OUTPUT_FILE}" "\n${_NBL_COMPOMENT_INCLUDE_LIST_TRANFORM_}") +]=] +) + +install(CODE "${NBL_FIND_NABLA_IMPL}" COMPONENT ${_COMPOMENT_}) +endfunction() + +# Generate compoment configurations +NBL_GEN_FIND_NABLA_CODE_IMPL(Headers include) +NBL_GEN_FIND_NABLA_CODE_IMPL(Libraries lib) +NBL_GEN_FIND_NABLA_CODE_IMPL(Runtimes runtime) \ No newline at end of file diff --git a/cmake/cpack/package.cmake b/cmake/cpack/package.cmake index a72c7aadfe..3991bfeddf 100644 --- a/cmake/cpack/package.cmake +++ b/cmake/cpack/package.cmake @@ -87,4 +87,5 @@ set(CPACK_COMPONENT_HEADERS_DEPENDS Libraries Runtimes) set(CPACK_THREADS 0) # try to use all threads for compression -include(CPack) \ No newline at end of file +include(CPack) +include("${CMAKE_CURRENT_LIST_DIR}/find/nabla.cmake") \ No newline at end of file From 15381bac3a7365231d75b6a000d4064a3ef820cd Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Mon, 18 Mar 2024 09:12:25 +0100 Subject: [PATCH 003/358] begin implementing FindPackageHandleStandardArgs module, save work --- cmake/cpack/find/compoment/template.cmake | 5 ++- cmake/cpack/find/nabla.cmake | 39 +++++++++++++++++------ 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/cmake/cpack/find/compoment/template.cmake b/cmake/cpack/find/compoment/template.cmake index ce0c6e9108..595053d410 100644 --- a/cmake/cpack/find/compoment/template.cmake +++ b/cmake/cpack/find/compoment/template.cmake @@ -1 +1,4 @@ -list(TRANSFORM @_NBL_PROXY_@ PREPEND "${CMAKE_CURRENT_LIST_DIR}/") \ No newline at end of file +list(TRANSFORM @_NBL_PROXY_@ PREPEND "${CMAKE_CURRENT_LIST_DIR}/") + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(@_NBL_PACKAGE_@ DEFAULT_MSG @_NBL_PROXY_@) \ No newline at end of file diff --git a/cmake/cpack/find/nabla.cmake b/cmake/cpack/find/nabla.cmake index 2258263cc5..8fa2938d12 100644 --- a/cmake/cpack/find/nabla.cmake +++ b/cmake/cpack/find/nabla.cmake @@ -1,4 +1,4 @@ -function(NBL_GEN_FIND_NABLA_CODE_IMPL _COMPOMENT_ _SPATH_) +function(NBL_GEN_FIND_NABLA_COMPONENT_CODE_IMPL _COMPOMENT_ _SPATH_) string(APPEND NBL_FIND_NABLA_IMPL "set(_COMPOMENT_ ${_COMPOMENT_})\nset(_SPATH_ ${_SPATH_})\n\nset(NBL_ROOT_PATH ${NBL_ROOT_PATH})\n\n") string(APPEND NBL_FIND_NABLA_IMPL [=[ @@ -12,16 +12,20 @@ else() message(FATAL_ERROR "Internal error, requested \"${CMAKE_INSTALL_CONFIG_NAME}\" configuration is invalid!") endif() +string(TOUPPER "${CMAKE_INSTALL_CONFIG_NAME}" _NBL_CONFIG_) +set(_NBL_PACKAGE_ Nabla${_COMPOMENT_}) + +set(NBL_CMAKE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_PREFIX}/${NBL_CONFIG_PREFIX_PATH}/cmake") 
+set(NBL_CMAKE_COMPOMENT_OUTPUT_DIRECTORY "${NBL_CMAKE_OUTPUT_DIRECTORY}/component") + string(REPLACE "${CMAKE_INSTALL_PREFIX}" "" NBL_CMAKE_INSTALL_MANIFEST_CONTENT "${CMAKE_INSTALL_MANIFEST_FILES}") list(REMOVE_DUPLICATES NBL_CMAKE_INSTALL_MANIFEST_CONTENT) set(_NBL_PREFIX_ "${CMAKE_INSTALL_PREFIX}/${NBL_CONFIG_PREFIX_PATH}") -string(TOUPPER "${CMAKE_INSTALL_CONFIG_NAME}" _NBL_CONFIG_) string(TOUPPER "${_COMPOMENT_}" _Cu_) string(TOLOWER "${_COMPOMENT_}" _Cl_) -set(NBL_CMAKE_COMPOMENT_OUTPUT_DIRECTORY "${CMAKE_INSTALL_PREFIX}/${NBL_CONFIG_PREFIX_PATH}/cmake/compoment") -set(NBL_CMAKE_COMPOMENT_OUTPUT_FILE "${NBL_CMAKE_COMPOMENT_OUTPUT_DIRECTORY}/NablaConfig${_COMPOMENT_}.cmake") +set(NBL_CMAKE_COMPOMENT_OUTPUT_FILE "${NBL_CMAKE_COMPOMENT_OUTPUT_DIRECTORY}/${_NBL_PACKAGE_}Config.cmake") cmake_path(RELATIVE_PATH CMAKE_INSTALL_PREFIX BASE_DIRECTORY "${NBL_CMAKE_COMPOMENT_OUTPUT_DIRECTORY}" OUTPUT_VARIABLE _NBL_REL_TO_PREFIX_) @@ -32,11 +36,11 @@ foreach(_MANIFEST_INSTALL_REL_FILE_ IN LISTS NBL_CMAKE_INSTALL_MANIFEST_CONTENT) set(_X_ "${_NBL_REL_TO_PREFIX_}/${_MANIFEST_INSTALL_REL_FILE_}") cmake_path(NORMAL_PATH _X_ OUTPUT_VARIABLE _X_) - list(APPEND NBL_INSTALL_${_Cu_}_${_NBL_CONFIG_} "${_X_}") + list(APPEND NABLA_INSTALL_${_Cu_}_${_NBL_CONFIG_} "${_X_}") endif() endforeach() -set(_NBL_PROXY_ NBL_INSTALL_${_Cu_}_${_NBL_CONFIG_}) +set(_NBL_PROXY_ NABLA_INSTALL_${_Cu_}_${_NBL_CONFIG_}) string(APPEND NBL_MANIFEST_IMPL "set(${_NBL_PROXY_}\n\t${${_NBL_PROXY_}}\n)") string(REPLACE ";" "\n\t" NBL_MANIFEST_IMPL "${NBL_MANIFEST_IMPL}") @@ -49,13 +53,28 @@ configure_file("${NBL_ROOT_PATH}/cmake/cpack/find/compoment/template.cmake" "${N file(READ "${NBL_CMAKE_COMPOMENT_OUTPUT_FILE}.tmp" _NBL_COMPOMENT_INCLUDE_LIST_TRANFORM_) file(REMOVE "${NBL_CMAKE_COMPOMENT_OUTPUT_FILE}.tmp") file(APPEND "${NBL_CMAKE_COMPOMENT_OUTPUT_FILE}" "\n${_NBL_COMPOMENT_INCLUDE_LIST_TRANFORM_}") + +# Config + ]=] ) install(CODE "${NBL_FIND_NABLA_IMPL}" COMPONENT ${_COMPOMENT_}) endfunction() -# Generate compoment configurations -NBL_GEN_FIND_NABLA_CODE_IMPL(Headers include) -NBL_GEN_FIND_NABLA_CODE_IMPL(Libraries lib) -NBL_GEN_FIND_NABLA_CODE_IMPL(Runtimes runtime) \ No newline at end of file +function(NBL_GEN_FIND_NABLA_CONFIG_CODE_IMPL) +string(APPEND NBL_FIND_NABLA_IMPL +[=[ + +]=] + +install(CODE "${NBL_FIND_NABLA_IMPL}" ALL_COMPONENTS) +endfunction() + +# Generate component configurations +NBL_GEN_FIND_NABLA_COMPONENT_CODE_IMPL(Headers include) +NBL_GEN_FIND_NABLA_COMPONENT_CODE_IMPL(Libraries lib) +NBL_GEN_FIND_NABLA_COMPONENT_CODE_IMPL(Runtimes runtime) + +# Generate config file +NBL_GEN_FIND_NABLA_CONFIG_CODE_IMPL() \ No newline at end of file From 36696ed6af2a8250538c99b64b90c028a31fc53f Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Mon, 18 Mar 2024 14:17:56 +0100 Subject: [PATCH 004/358] Create dynamic & relocatable package generator, add config/template.cmake & licence/template.cmake, update apache zookeeper version --- cmake/cpack/find/config/template.cmake | 15 +++++++ cmake/cpack/find/licence/template.cmake | 3 ++ cmake/cpack/find/nabla.cmake | 54 +++++++++++++++---------- docker/compose/Dockerfile | 4 +- 4 files changed, 53 insertions(+), 23 deletions(-) create mode 100644 cmake/cpack/find/config/template.cmake create mode 100644 cmake/cpack/find/licence/template.cmake diff --git a/cmake/cpack/find/config/template.cmake b/cmake/cpack/find/config/template.cmake new file mode 100644 index 0000000000..a9ba30e354 --- /dev/null +++ b/cmake/cpack/find/config/template.cmake @@ -0,0 +1,15 @@ 
+find_package(@_NBL_PACKAGE_@ + REQUIRED + CONFIG + GLOBAL + PATHS "${CMAKE_CURRENT_LIST_DIR}/compoment" + NO_DEFAULT_PATH + NO_PACKAGE_ROOT_PATH + NO_CMAKE_PATH + NO_CMAKE_ENVIRONMENT_PATH + NO_SYSTEM_ENVIRONMENT_PATH + NO_CMAKE_PACKAGE_REGISTRY + NO_CMAKE_SYSTEM_PATH + NO_CMAKE_INSTALL_PREFIX + NO_CMAKE_SYSTEM_PACKAGE_REGISTRY +) \ No newline at end of file diff --git a/cmake/cpack/find/licence/template.cmake b/cmake/cpack/find/licence/template.cmake new file mode 100644 index 0000000000..f1bd9360ca --- /dev/null +++ b/cmake/cpack/find/licence/template.cmake @@ -0,0 +1,3 @@ +# Copyright (C) 2018-2040 - DevSH Graphics Programming Sp. z O.O. +# This file is part of the "Nabla Engine". +# For conditions of distribution and use, see copyright notice in nabla.h \ No newline at end of file diff --git a/cmake/cpack/find/nabla.cmake b/cmake/cpack/find/nabla.cmake index 8fa2938d12..3c6fcd0b1d 100644 --- a/cmake/cpack/find/nabla.cmake +++ b/cmake/cpack/find/nabla.cmake @@ -1,5 +1,5 @@ function(NBL_GEN_FIND_NABLA_COMPONENT_CODE_IMPL _COMPOMENT_ _SPATH_) -string(APPEND NBL_FIND_NABLA_IMPL "set(_COMPOMENT_ ${_COMPOMENT_})\nset(_SPATH_ ${_SPATH_})\n\nset(NBL_ROOT_PATH ${NBL_ROOT_PATH})\n\n") +string(APPEND NBL_FIND_NABLA_IMPL "set(_COMPOMENT_ ${_COMPOMENT_})\nset(_SPATH_ ${_SPATH_})\nset(NBL_ROOT_PATH ${NBL_ROOT_PATH})\nset(NBL_STATIC_BUILD ${NBL_STATIC_BUILD})\n\n") string(APPEND NBL_FIND_NABLA_IMPL [=[ if(CMAKE_INSTALL_CONFIG_NAME MATCHES "^([Dd][Ee][Bb][Uu][Gg])$") @@ -12,20 +12,34 @@ else() message(FATAL_ERROR "Internal error, requested \"${CMAKE_INSTALL_CONFIG_NAME}\" configuration is invalid!") endif() +string(TOUPPER "${_COMPOMENT_}" _Cu_) +string(TOLOWER "${_COMPOMENT_}" _Cl_) + string(TOUPPER "${CMAKE_INSTALL_CONFIG_NAME}" _NBL_CONFIG_) -set(_NBL_PACKAGE_ Nabla${_COMPOMENT_}) +string(TOLOWER "${CMAKE_INSTALL_CONFIG_NAME}" _NBL_CONFIG_L_) + +if(NBL_STATIC_BUILD) + set(NBL_LIBRARY_TYPE STATIC) +else() + set(NBL_LIBRARY_TYPE DYNAMIC) +endif() + +string(TOUPPER "${NBL_LIBRARY_TYPE}" _LTu_) +string(TOLOWER "${NBL_LIBRARY_TYPE}" _LTl_) + +set(_NBL_PACKAGE_ nabla-${_Cl_}-${_LTl_}-${_NBL_CONFIG_L_}) +set(_NBL_COMPLETE_P_CONFIG_ nabla-${_LTl_}-${_NBL_CONFIG_L_}) set(NBL_CMAKE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_PREFIX}/${NBL_CONFIG_PREFIX_PATH}/cmake") -set(NBL_CMAKE_COMPOMENT_OUTPUT_DIRECTORY "${NBL_CMAKE_OUTPUT_DIRECTORY}/component") +set(NBL_CMAKE_COMPOMENT_OUTPUT_DIRECTORY "${NBL_CMAKE_OUTPUT_DIRECTORY}/compoment") string(REPLACE "${CMAKE_INSTALL_PREFIX}" "" NBL_CMAKE_INSTALL_MANIFEST_CONTENT "${CMAKE_INSTALL_MANIFEST_FILES}") list(REMOVE_DUPLICATES NBL_CMAKE_INSTALL_MANIFEST_CONTENT) set(_NBL_PREFIX_ "${CMAKE_INSTALL_PREFIX}/${NBL_CONFIG_PREFIX_PATH}") -string(TOUPPER "${_COMPOMENT_}" _Cu_) -string(TOLOWER "${_COMPOMENT_}" _Cl_) set(NBL_CMAKE_COMPOMENT_OUTPUT_FILE "${NBL_CMAKE_COMPOMENT_OUTPUT_DIRECTORY}/${_NBL_PACKAGE_}Config.cmake") +set(NBL_CMAKE_CONFIG_OUTPUT_FILE "${NBL_CMAKE_OUTPUT_DIRECTORY}/${_NBL_COMPLETE_P_CONFIG_}Config.cmake") cmake_path(RELATIVE_PATH CMAKE_INSTALL_PREFIX BASE_DIRECTORY "${NBL_CMAKE_COMPOMENT_OUTPUT_DIRECTORY}" OUTPUT_VARIABLE _NBL_REL_TO_PREFIX_) @@ -36,11 +50,11 @@ foreach(_MANIFEST_INSTALL_REL_FILE_ IN LISTS NBL_CMAKE_INSTALL_MANIFEST_CONTENT) set(_X_ "${_NBL_REL_TO_PREFIX_}/${_MANIFEST_INSTALL_REL_FILE_}") cmake_path(NORMAL_PATH _X_ OUTPUT_VARIABLE _X_) - list(APPEND NABLA_INSTALL_${_Cu_}_${_NBL_CONFIG_} "${_X_}") + list(APPEND NABLA_INSTALL_${_Cu_}_${_LTu_}_${_NBL_CONFIG_} "${_X_}") endif() endforeach() -set(_NBL_PROXY_ NABLA_INSTALL_${_Cu_}_${_NBL_CONFIG_}) 
+set(_NBL_PROXY_ NABLA_INSTALL_${_Cu_}_${_LTu_}_${_NBL_CONFIG_}) string(APPEND NBL_MANIFEST_IMPL "set(${_NBL_PROXY_}\n\t${${_NBL_PROXY_}}\n)") string(REPLACE ";" "\n\t" NBL_MANIFEST_IMPL "${NBL_MANIFEST_IMPL}") @@ -49,32 +63,30 @@ file(WRITE "${NBL_CMAKE_COMPOMENT_OUTPUT_FILE}" "${NBL_MANIFEST_IMPL_CONF}") # the reason behind this weird looking thing is you cannot nest bracket arguments https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-argument # some variables need evaluation but some not and must be literals, to make this code read-able & work we do a small workaround + +# Compoment configure_file("${NBL_ROOT_PATH}/cmake/cpack/find/compoment/template.cmake" "${NBL_CMAKE_COMPOMENT_OUTPUT_FILE}.tmp" @ONLY) file(READ "${NBL_CMAKE_COMPOMENT_OUTPUT_FILE}.tmp" _NBL_COMPOMENT_INCLUDE_LIST_TRANFORM_) file(REMOVE "${NBL_CMAKE_COMPOMENT_OUTPUT_FILE}.tmp") file(APPEND "${NBL_CMAKE_COMPOMENT_OUTPUT_FILE}" "\n${_NBL_COMPOMENT_INCLUDE_LIST_TRANFORM_}") # Config +if(NOT EXISTS "${NBL_CMAKE_CONFIG_OUTPUT_FILE}") + file(READ "${NBL_ROOT_PATH}/cmake/cpack/find/licence/template.cmake" _NBL_LICENCE_) + file(APPEND "${NBL_CMAKE_CONFIG_OUTPUT_FILE}" "${_NBL_LICENCE_}") +endif() +configure_file("${NBL_ROOT_PATH}/cmake/cpack/find/config/template.cmake" "${NBL_CMAKE_CONFIG_OUTPUT_FILE}.tmp" @ONLY) +file(READ "${NBL_CMAKE_CONFIG_OUTPUT_FILE}.tmp" _NBL_CONFIG_FILE_CONTENT_) +file(REMOVE "${NBL_CMAKE_CONFIG_OUTPUT_FILE}.tmp") +file(APPEND "${NBL_CMAKE_CONFIG_OUTPUT_FILE}" "\n\n${_NBL_CONFIG_FILE_CONTENT_}") ]=] ) install(CODE "${NBL_FIND_NABLA_IMPL}" COMPONENT ${_COMPOMENT_}) endfunction() -function(NBL_GEN_FIND_NABLA_CONFIG_CODE_IMPL) -string(APPEND NBL_FIND_NABLA_IMPL -[=[ - -]=] - -install(CODE "${NBL_FIND_NABLA_IMPL}" ALL_COMPONENTS) -endfunction() - -# Generate component configurations +# Generate compoment configurations NBL_GEN_FIND_NABLA_COMPONENT_CODE_IMPL(Headers include) NBL_GEN_FIND_NABLA_COMPONENT_CODE_IMPL(Libraries lib) -NBL_GEN_FIND_NABLA_COMPONENT_CODE_IMPL(Runtimes runtime) - -# Generate config file -NBL_GEN_FIND_NABLA_CONFIG_CODE_IMPL() \ No newline at end of file +NBL_GEN_FIND_NABLA_COMPONENT_CODE_IMPL(Runtimes runtime) \ No newline at end of file diff --git a/docker/compose/Dockerfile b/docker/compose/Dockerfile index a3f166eca0..959c5de7b3 100644 --- a/docker/compose/Dockerfile +++ b/docker/compose/Dockerfile @@ -87,7 +87,7 @@ ARG APACHE_ZOOKEEPER_INSTALL_DIRECTORY RUN ` # Download Apache ZooKeeper ` - curl -SL --output zookeeper.zip https://dlcdn.apache.org/zookeeper/zookeeper-3.8.3/apache-zookeeper-3.8.3-bin.tar.gz ` + curl -SL --output zookeeper.zip https://dlcdn.apache.org/zookeeper/stable/apache-zookeeper-3.8.4-bin.tar.gz ` ` # Create install directory ` @@ -101,7 +101,7 @@ RUN ` ` && del /q zookeeper.zip ` ` - && setx PATH "%PATH%;%APACHE_ZOOKEEPER_INSTALL_DIRECTORY%\apache-zookeeper-3.8.3-bin\bin" /M + && setx PATH "%PATH%;%APACHE_ZOOKEEPER_INSTALL_DIRECTORY%\apache-zookeeper-3.8.4-bin\bin" /M RUN ` # Download kazoo 2.8.0 Python (more recent versions doesn't work well with Windows) module From 7b17a40e1aa6a9e27f077308283d560d2022b1cc Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Mon, 18 Mar 2024 20:05:41 +0100 Subject: [PATCH 005/358] Implement FindNabla.cmake, add NABLA_INSTALL__DIRECTORY__ install package variable, perform tests --- CMakeLists.txt | 3 ++ cmake/FindNabla.cmake | 39 +++++++++++++++++++++++ cmake/cpack/find/compoment/template.cmake | 4 ++- cmake/cpack/find/nabla.cmake | 11 +++++-- 4 files changed, 53 insertions(+), 4 deletions(-) create mode 
100644 cmake/FindNabla.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 7973541830..83f53ac1bf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -362,3 +362,6 @@ option(NBL_CPACK_INCLUDE_EXAMPLES "CPack with examples and media" ON) include(cpack/package) export(TARGETS ${_NBL_3RDPARTY_TARGETS_} Nabla NAMESPACE Nabla:: APPEND FILE ${NBL_ROOT_PATH_BINARY}/NablaExport.cmake) + +#set(NBL_CONFIG_ROOT_DIRECTORY "C:/Users/arekl/Desktop/ci-nabla-amd64-mt-s-15381bac3a7365231d75b6a000d4064a3ef820cd-win64") +#find_package(Nabla REQUIRED) \ No newline at end of file diff --git a/cmake/FindNabla.cmake b/cmake/FindNabla.cmake new file mode 100644 index 0000000000..152933d02d --- /dev/null +++ b/cmake/FindNabla.cmake @@ -0,0 +1,39 @@ +# Define NBL_CONFIG_ROOT_DIRECTORY +# variable to help the module find +# Nabla package + +if(NOT DEFINED CMAKE_CONFIGURATION_TYPES) + set(CMAKE_CONFIGURATION_TYPES Release;RelWithDebInfo;Debug) +endif() + +if(NOT DEFINED NBL_PACKAGE_STATIC) # turn ON NBL_PACKAGE_STATIC to look for package with STATIC library type, turn off to look for DYNAMIC + if(${NBL_STATIC_BUILD}) # internal, if called with Nabla's build system it will get detected autoamtically + set(NBL_PACKAGE_STATIC ON) + else() + message(FATAL_ERROR "NBL_PACKAGE_STATIC must be defined!") + endif() +endif() + +if(NBL_PACKAGE_STATIC) + set(NBL_LIBRARY_TYPE static) +else() + set(NBL_LIBRARY_TYPE dynamic) +endif() + +foreach(X IN LISTS CMAKE_CONFIGURATION_TYPES) + if(NOT "${X}" STREQUAL "") + string(TOLOWER "nabla-${NBL_LIBRARY_TYPE}-${X}" _NBL_TARGET_PACKAGE_) + + if(DEFINED NBL_CONFIG_ROOT_DIRECTORY) + file(GLOB_RECURSE _NBL_G_CONFIG_ROOT_DIRECTORY_ "${NBL_CONFIG_ROOT_DIRECTORY}/*/${_NBL_TARGET_PACKAGE_}Config.cmake") + cmake_path(GET _NBL_G_CONFIG_ROOT_DIRECTORY_ PARENT_PATH _NBL_G_CONFIG_ROOT_DIRECTORY_) + else() + unset(_NBL_G_CONFIG_ROOT_DIRECTORY_) + endif() + + find_package(${_NBL_TARGET_PACKAGE_} QUIET + GLOBAL + PATHS ${_NBL_G_CONFIG_ROOT_DIRECTORY_} + ) + endif() +endforeach() \ No newline at end of file diff --git a/cmake/cpack/find/compoment/template.cmake b/cmake/cpack/find/compoment/template.cmake index 595053d410..f65c1a5233 100644 --- a/cmake/cpack/find/compoment/template.cmake +++ b/cmake/cpack/find/compoment/template.cmake @@ -1,4 +1,6 @@ list(TRANSFORM @_NBL_PROXY_@ PREPEND "${CMAKE_CURRENT_LIST_DIR}/") +set(@_NBL_COMPOMENT_D_@ "${CMAKE_CURRENT_LIST_DIR}/@_NBL_COMPOMENT_D_V_@") + include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(@_NBL_PACKAGE_@ DEFAULT_MSG @_NBL_PROXY_@) \ No newline at end of file +find_package_handle_standard_args(@_NBL_PACKAGE_@ DEFAULT_MSG @_NBL_PROXY_@ @_NBL_COMPOMENT_D_@) \ No newline at end of file diff --git a/cmake/cpack/find/nabla.cmake b/cmake/cpack/find/nabla.cmake index 3c6fcd0b1d..698157a6b7 100644 --- a/cmake/cpack/find/nabla.cmake +++ b/cmake/cpack/find/nabla.cmake @@ -15,6 +15,9 @@ endif() string(TOUPPER "${_COMPOMENT_}" _Cu_) string(TOLOWER "${_COMPOMENT_}" _Cl_) +string(TOUPPER "${_SPATH_}" _Su_) +string(TOLOWER "${_SPATH_}" _Sl_) + string(TOUPPER "${CMAKE_INSTALL_CONFIG_NAME}" _NBL_CONFIG_) string(TOLOWER "${CMAKE_INSTALL_CONFIG_NAME}" _NBL_CONFIG_L_) @@ -43,6 +46,10 @@ set(NBL_CMAKE_CONFIG_OUTPUT_FILE "${NBL_CMAKE_OUTPUT_DIRECTORY}/${_NBL_COMPLETE_ cmake_path(RELATIVE_PATH CMAKE_INSTALL_PREFIX BASE_DIRECTORY "${NBL_CMAKE_COMPOMENT_OUTPUT_DIRECTORY}" OUTPUT_VARIABLE _NBL_REL_TO_PREFIX_) +set(_NBL_PROXY_ NABLA_INSTALL_${_Cu_}_${_LTu_}_${_NBL_CONFIG_}) +set(_NBL_COMPOMENT_D_ 
"NABLA_INSTALL_${_Cu_}_DIRECTORY_${_LTu_}_${_NBL_CONFIG_}") +set(_NBL_COMPOMENT_D_V_ "${_NBL_REL_TO_PREFIX_}") + foreach(_MANIFEST_INSTALL_REL_FILE_ IN LISTS NBL_CMAKE_INSTALL_MANIFEST_CONTENT) string(FIND "${_MANIFEST_INSTALL_REL_FILE_}" "/${_SPATH_}/" _NBL_FOUND_) @@ -50,12 +57,10 @@ foreach(_MANIFEST_INSTALL_REL_FILE_ IN LISTS NBL_CMAKE_INSTALL_MANIFEST_CONTENT) set(_X_ "${_NBL_REL_TO_PREFIX_}/${_MANIFEST_INSTALL_REL_FILE_}") cmake_path(NORMAL_PATH _X_ OUTPUT_VARIABLE _X_) - list(APPEND NABLA_INSTALL_${_Cu_}_${_LTu_}_${_NBL_CONFIG_} "${_X_}") + list(APPEND ${_NBL_PROXY_} "${_X_}") endif() endforeach() -set(_NBL_PROXY_ NABLA_INSTALL_${_Cu_}_${_LTu_}_${_NBL_CONFIG_}) - string(APPEND NBL_MANIFEST_IMPL "set(${_NBL_PROXY_}\n\t${${_NBL_PROXY_}}\n)") string(REPLACE ";" "\n\t" NBL_MANIFEST_IMPL "${NBL_MANIFEST_IMPL}") string(CONFIGURE "${NBL_MANIFEST_IMPL}" NBL_MANIFEST_IMPL_CONF) From 4a1c47a47c1d9c4eaaa78d74a0ae633cb400e333 Mon Sep 17 00:00:00 2001 From: Przemek Date: Mon, 13 May 2024 15:26:55 +0200 Subject: [PATCH 006/358] Implemented emulated float64 type --- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 129 ++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 include/nbl/builtin/hlsl/emulated_float64_t.hlsl diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl new file mode 100644 index 0000000000..8ebfaf0a48 --- /dev/null +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -0,0 +1,129 @@ +template +T _static_cast(const U); + +using float32_t = float; +//using emulated_float64_t = double; + +namespace emulated +{ + struct emulated_float64_t + { + // TODO: change to `uint64_t` when on the emulation stage + using storage_t = float32_t; + + storage_t data; + + // constructors + // TODO: specializations? 
+ template + static emulated_float64_t create(T val) + { + emulated_float64_t output; + output.data = val; + return output; + } + + // arithmetic operators + emulated_float64_t operator+(const emulated_float64_t rhs) + { + emulated_float64_t retval; + retval.data = data + rhs.data; + return retval; + } + + emulated_float64_t operator-(const emulated_float64_t rhs) + { + emulated_float64_t retval; + retval.data = data - rhs.data; + return retval; + } + + emulated_float64_t operator*(const emulated_float64_t rhs) + { + emulated_float64_t retval; + retval.data = data * rhs.data; + return retval; + } + + emulated_float64_t operator/(const emulated_float64_t rhs) + { + emulated_float64_t retval; + retval.data = data / rhs.data; + return retval; + } + + // relational operators + bool operator==(const emulated_float64_t rhs) { return !(uint64_t(data) ^ uint64_t(rhs.data)); } + bool operator!=(const emulated_float64_t rhs) { return uint64_t(data) ^ uint64_t(rhs.data); } + bool operator<(const emulated_float64_t rhs) { return data < rhs.data; } + bool operator>(const emulated_float64_t rhs) { return data > rhs.data; } + bool operator<=(const emulated_float64_t rhs) { return !operator>(rhs); } + bool operator>=(const emulated_float64_t rhs) { return !operator<(rhs); } + + //logical operators + bool operator&&(const emulated_float64_t rhs) { return bool(data) && bool(rhs.data); } + bool operator||(const emulated_float64_t rhs) { return bool(data) || bool(rhs.data); } + bool operator!() { return !bool(data); } + + // conversion operators + operator bool() { return bool(data); } + operator int() { return int(data); } + operator uint32_t() { return uint32_t(data); } + operator uint64_t() { return uint64_t(data); } + operator float() { return float(data); } +#ifdef __HLSL_VERSION + operator min16int() { return min16int(data);} + operator float64_t() { return float64_t(data); } + operator half() { return half(data); } +#else + operator uint16_t() { return uint16_t(data);} + operator double() { return double(data); } +#endif + + //explicit operator int() const { return int(data); } + + // HERE OMITED OPERATORS + // - not implementing bitwise and modulo operators since floating point types doesn't support them + // - compound operator overload not supported in HLSL + // - access operators (dereference and addressof) not supported in HLSL +#ifndef __HLSL_VERSION + // compound assignment operators + emulated_float64_t operator+=(emulated_float64_t rhs) + { + data = data + rhs.data; + return create(data); + } + + emulated_float64_t operator-=(emulated_float64_t rhs) + { + data = data - rhs.data; + return create(data); + } + + emulated_float64_t operator*=(emulated_float64_t rhs) + { + data = data * rhs.data; + return create(data); + } + + emulated_float64_t operator/=(emulated_float64_t rhs) + { + data = data / rhs.data; + return create(data); + } + + // access operators + emulated_float64_t operator*() { return *this; } + emulated_float64_t* operator&() { return this; } +#endif + }; +} + +// upgrades float to a double +template<> +emulated::emulated_float64_t _static_cast(const float val) +{ + emulated::emulated_float64_t retval; + retval.data = val; // TODO: manually upgrade the `val` IEEE754 32bit pattern to 64bit + return retval; +} From 76ec495a7d5abf304c9d232f91c4df91330f8071 Mon Sep 17 00:00:00 2001 From: Przemek Date: Wed, 15 May 2024 14:24:29 +0200 Subject: [PATCH 007/358] Updated examples --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests 
b/examples_tests index 38b5b08ffe..03077793c2 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 38b5b08ffe470ed7845062f532f470a8fa389bb7 +Subproject commit 03077793c29043f2705d18c2b95e1552780e5c86 From e3974024ae0f867846f6be0033c16e417bfe00e7 Mon Sep 17 00:00:00 2001 From: Przemek Date: Wed, 15 May 2024 15:36:57 +0200 Subject: [PATCH 008/358] Updated examples --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 03077793c2..ca34bfe81d 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 03077793c29043f2705d18c2b95e1552780e5c86 +Subproject commit ca34bfe81dcebdffa2a9da818839319cfff1eaf7 From 1f0a8eb51c9727907fa414fd05ed9b6b43461a5d Mon Sep 17 00:00:00 2001 From: Przemek Date: Wed, 15 May 2024 23:40:51 +0200 Subject: [PATCH 009/358] Modified emulated float file --- examples_tests | 2 +- include/nbl/builtin/hlsl/emulated_float64_t.hlsl | 11 ----------- src/nbl/builtin/CMakeLists.txt | 2 ++ 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/examples_tests b/examples_tests index ca34bfe81d..2e00f2043d 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit ca34bfe81dcebdffa2a9da818839319cfff1eaf7 +Subproject commit 2e00f2043d19ff31c1af3d2852b43ecc915d6a17 diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index 8ebfaf0a48..5cc061d3b9 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -1,5 +1,3 @@ -template -T _static_cast(const U); using float32_t = float; //using emulated_float64_t = double; @@ -118,12 +116,3 @@ namespace emulated #endif }; } - -// upgrades float to a double -template<> -emulated::emulated_float64_t _static_cast(const float val) -{ - emulated::emulated_float64_t retval; - retval.data = val; // TODO: manually upgrade the `val` IEEE754 32bit pattern to 64bit - return retval; -} diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index aad4a589e2..18ee1383ba 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -227,6 +227,8 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/normalization/shared_nor # HLSL LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/macros.h") +#emulated +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated_float64_t.hlsl") #spirv intrinsics LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/core.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/fragment_shader_pixel_interlock.hlsl") From 338213c9e9a7b4991b19d5ffea7e7e5d67184309 Mon Sep 17 00:00:00 2001 From: Przemek Date: Thu, 16 May 2024 18:37:17 +0200 Subject: [PATCH 010/358] Fixed GPU values test --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 39 ++----------------- 2 files changed, 4 insertions(+), 37 deletions(-) diff --git a/examples_tests b/examples_tests index 2e00f2043d..2ad6a6ce47 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 2e00f2043d19ff31c1af3d2852b43ecc915d6a17 +Subproject commit 2ad6a6ce47597495eb54a2b1910b192a9057c220 diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index 5cc061d3b9..e4b20e74b3 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -55,8 +55,8 @@ namespace emulated bool operator!=(const 
emulated_float64_t rhs) { return uint64_t(data) ^ uint64_t(rhs.data); } bool operator<(const emulated_float64_t rhs) { return data < rhs.data; } bool operator>(const emulated_float64_t rhs) { return data > rhs.data; } - bool operator<=(const emulated_float64_t rhs) { return !operator>(rhs); } - bool operator>=(const emulated_float64_t rhs) { return !operator<(rhs); } + bool operator<=(const emulated_float64_t rhs) { return data <= rhs.data; } + bool operator>=(const emulated_float64_t rhs) { return data >= rhs.data; } //logical operators bool operator&&(const emulated_float64_t rhs) { return bool(data) && bool(rhs.data); } @@ -73,46 +73,13 @@ namespace emulated operator min16int() { return min16int(data);} operator float64_t() { return float64_t(data); } operator half() { return half(data); } -#else - operator uint16_t() { return uint16_t(data);} - operator double() { return double(data); } #endif //explicit operator int() const { return int(data); } - // HERE OMITED OPERATORS + // OMITED OPERATORS // - not implementing bitwise and modulo operators since floating point types doesn't support them // - compound operator overload not supported in HLSL // - access operators (dereference and addressof) not supported in HLSL -#ifndef __HLSL_VERSION - // compound assignment operators - emulated_float64_t operator+=(emulated_float64_t rhs) - { - data = data + rhs.data; - return create(data); - } - - emulated_float64_t operator-=(emulated_float64_t rhs) - { - data = data - rhs.data; - return create(data); - } - - emulated_float64_t operator*=(emulated_float64_t rhs) - { - data = data * rhs.data; - return create(data); - } - - emulated_float64_t operator/=(emulated_float64_t rhs) - { - data = data / rhs.data; - return create(data); - } - - // access operators - emulated_float64_t operator*() { return *this; } - emulated_float64_t* operator&() { return this; } -#endif }; } From 9c2dc77c1b8f154c325854c8aa0ce6cb603be490 Mon Sep 17 00:00:00 2001 From: Przemek Date: Thu, 23 May 2024 15:19:07 +0200 Subject: [PATCH 011/358] SavingWork --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 118 ++++++++++++++++-- 2 files changed, 109 insertions(+), 11 deletions(-) diff --git a/examples_tests b/examples_tests index 2ad6a6ce47..ea4fbbf556 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 2ad6a6ce47597495eb54a2b1910b192a9057c220 +Subproject commit ea4fbbf55604c654d6d118173df08e34624e1249 diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index e4b20e74b3..025d427d79 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -1,18 +1,89 @@ +#include using float32_t = float; //using emulated_float64_t = double; namespace emulated { + namespace impl + { + nbl::hlsl::uint32_t2 umulExtended(uint32_t lhs, uint32_t rhs) + { + uint64_t product = uint64_t(lhs) * uint64_t(rhs); + nbl::hlsl::uint32_t2 output; + output.x = (product & 0xFFFFFFFF00000000) >> 32; + output.y = product & 0x00000000FFFFFFFFull; + return output; + } + + nbl::hlsl::uint32_t2 add64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) + { + nbl::hlsl::uint32_t2 output; + output.x = a1 + b1; + output.y = a0 + b0 + uint32_t(output.x < a1); + + return output; + } + + nbl::hlsl::uint32_t2 shortShift64Left(uint32_t a0, uint32_t a1, int count) + { + nbl::hlsl::uint32_t2 output; + output.x = a1 << count; + output.y = nbl::hlsl::lerp((a0 << count | (a1 >> ((-count) & 31))), a0, count == 0); 
+ }; + + nbl::hlsl::uint32_t4 mul64to128(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1, uint32_t z0Ptr) + { + uint32_t z0 = 0u; + uint32_t z1 = 0u; + uint32_t z2 = 0u; + uint32_t z3 = 0u; + uint32_t more1 = 0u; + uint32_t more2 = 0u; + + nbl::hlsl::uint32_t2 z2z3 = umulExtended(a0, b1); + z2 = z2z3.x; + z3 = z2z3.y; + nbl::hlsl::uint32_t2 z1more2 = umulExtended(a1, b0); + z1 = z1more2.x; + more2 = z1more2.y; + nbl::hlsl::uint32_t2 z1z2 = add64(z1, more2, 0u, z2); + z1 = z1z2.x; + z2 = z1z2.y; + nbl::hlsl::uint32_t2 z0more1 = umulExtended(a0, b0); + z0 = z0more1.x; + more1 = z0more1.y; + nbl::hlsl::uint32_t2 z0z1 = add64(z0, more1, 0u, z1); + z0 = z0z1.x; + z1 = z0z1.y; + nbl::hlsl::uint32_t2 more1more2 = umulExtended(a0, b1); + more1 = more1more2.x; + more2 = more1more2.y; + nbl::hlsl::uint32_t2 more1z2 = add64(more1, more2, 0u, z2); + more1 = more1z2.x; + z2 = more1z2.y; + nbl::hlsl::uint32_t2 z0z12 = add64(z0, z1, 0u, more1); + z0 = z0z12.x; + z1 = z0z12.y; + + + nbl::hlsl::uint32_t4 output; + output.x = z0; + output.y = z1; + output.z = z2; + output.w = z3; + return output; + } + } + struct emulated_float64_t { - // TODO: change to `uint64_t` when on the emulation stage - using storage_t = float32_t; + using storage_t = uint64_t; storage_t data; // constructors - // TODO: specializations? + // TODO: specializations template static emulated_float64_t create(T val) { @@ -39,8 +110,37 @@ namespace emulated emulated_float64_t operator*(const emulated_float64_t rhs) { emulated_float64_t retval; - retval.data = data * rhs.data; - return retval; + + uint32_t lhsLow = uint32_t(data); + uint32_t rhsLow = uint32_t(rhs.data); + uint32_t lhsHigh = uint32_t((data & 0x000FFFFF00000000ull) >> 32); + uint32_t rhsHigh = uint32_t((rhs.data & 0x000FFFFF00000000ull) >> 32); + uint32_t lhsExp = uint32_t((data >> 52) & 0x7FFull); + uint32_t rhsExp = uint32_t((rhs.data >> 52) & 0x7FFull); + + int32_t exp = lhsExp + rhsExp - 0x400ull; + uint64_t sign = (data ^ rhs.data) & 0x8000000000000000ull; + + + lhsHigh |= 0x00100000u; + nbl::hlsl::uint32_t2 shifted = emulated::impl::shortShift64Left(rhsHigh, rhsLow, 12); + rhsHigh = shifted.x; + rhsLow = shifted.y; + + nbl::hlsl::uint64_t4 product = emulated::impl::mul64To128(lhsHigh, lhsLow, rhsHigh, rhsLow); + product.xy = emulated::impl::add64(product.x, product.y, lhsHigh, aLow); + product.z |= uint32_t(product.w != 0u); + if (0x00200000u <= product.x) + { + //__shift64ExtraRightJamming( + // zFrac0, zFrac1, zFrac2, 1, zFrac0, zFrac1, zFrac2); + ++zExp; + } + //return __roundAndPackFloat64(zSign, zExp, zFrac0, zFrac1, zFrac2); + + //uint32_t frac = 1; + + return emulated_float64_t::create(sign | ((uint64_t(exp) + 1023ull) << 52) | (uint64_t(frac) & 0x000FFFFFFFFFFFFull)); } emulated_float64_t operator/(const emulated_float64_t rhs) @@ -69,11 +169,9 @@ namespace emulated operator uint32_t() { return uint32_t(data); } operator uint64_t() { return uint64_t(data); } operator float() { return float(data); } -#ifdef __HLSL_VERSION - operator min16int() { return min16int(data);} - operator float64_t() { return float64_t(data); } - operator half() { return half(data); } -#endif + //operator min16int() { return min16int(data);} + //operator float64_t() { return float64_t(data); } + //operator half() { return half(data); } //explicit operator int() const { return int(data); } From 9043b292196a43f73a1894c388b8b26923d89b21 Mon Sep 17 00:00:00 2001 From: Przemek Date: Mon, 27 May 2024 23:00:32 +0200 Subject: [PATCH 012/358] Impliminted primitive multiplication for 
emulated_float64_t --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 178 +++++++++++++++--- 2 files changed, 157 insertions(+), 23 deletions(-) diff --git a/examples_tests b/examples_tests index ea4fbbf556..2a878a000c 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit ea4fbbf55604c654d6d118173df08e34624e1249 +Subproject commit 2a878a000c9898fe98d707ae965b5ba2122b0049 diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index 025d427d79..2cb5d303b8 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -19,8 +19,8 @@ namespace emulated nbl::hlsl::uint32_t2 add64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) { nbl::hlsl::uint32_t2 output; - output.x = a1 + b1; - output.y = a0 + b0 + uint32_t(output.x < a1); + output.y = a1 + b1; + output.x = a0 + b0 + uint32_t(output.y < a1); return output; } @@ -28,11 +28,13 @@ namespace emulated nbl::hlsl::uint32_t2 shortShift64Left(uint32_t a0, uint32_t a1, int count) { nbl::hlsl::uint32_t2 output; - output.x = a1 << count; - output.y = nbl::hlsl::lerp((a0 << count | (a1 >> ((-count) & 31))), a0, count == 0); + output.y = a1 << count; + output.x = nbl::hlsl::lerp((a0 << count | (a1 >> ((-count) & 31))), a0, count == 0); + + return output; }; - nbl::hlsl::uint32_t4 mul64to128(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1, uint32_t z0Ptr) + nbl::hlsl::uint32_t4 mul64to128(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) { uint32_t z0 = 0u; uint32_t z1 = 0u; @@ -74,6 +76,133 @@ namespace emulated output.w = z3; return output; } + + nbl::hlsl::uint32_t3 shift64ExtraRightJamming(uint32_t a0, uint32_t a1, uint32_t a2, int count) + { + nbl::hlsl::uint32_t3 output; + output.x = 0u; + + int negCount = (-count) & 31; + + output.z = nbl::hlsl::lerp(uint32_t(a0 != 0u), a0, count == 64); + output.z = nbl::hlsl::lerp(output.z, a0 << negCount, count < 64); + output.z = nbl::hlsl::lerp(output.z, a1 << negCount, count < 32); + + output.y = nbl::hlsl::lerp(0u, (a0 >> (count & 31)), count < 64); + output.y = nbl::hlsl::lerp(output.y, (a0<>count), count < 32); + + a2 = nbl::hlsl::lerp(a2 | a1, a2, count < 32); + output.x = nbl::hlsl::lerp(output.x, a0 >> count, count < 32); + output.z |= uint32_t(a2 != 0u); + + output.x = nbl::hlsl::lerp(output.x, 0u, (count == 32)); + output.y = nbl::hlsl::lerp(output.y, a0, (count == 32)); + output.z = nbl::hlsl::lerp(output.z, a1, (count == 32)); + output.x = nbl::hlsl::lerp(output.x, a0, (count == 0)); + output.y = nbl::hlsl::lerp(output.y, a1, (count == 0)); + output.z = nbl::hlsl::lerp(output.z, a2, (count == 0)); + + return output; + } + + + uint64_t packFloat64(uint32_t zSign, int zExp, uint32_t zFrac0, uint32_t zFrac1) + { + nbl::hlsl::uint32_t2 z; + + z.x = zSign + (uint32_t(zExp) << 20) + zFrac0; + z.y = zFrac1; + + uint64_t output = 0u; + output |= (uint64_t(z.x) << 32) & 0xFFFFFFFF00000000ull; + output |= uint64_t(z.y); + return output; + } + + + uint64_t roundAndPackFloat64(uint32_t zSign, int zExp, uint32_t zFrac0, uint32_t zFrac1, uint32_t zFrac2) + { + bool roundNearestEven; + bool increment; + + roundNearestEven = true; + increment = int(zFrac2) < 0; + if (!roundNearestEven) + { + if (false) //(FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) + { + increment = false; + } + else + { + if (false) //(zSign != 0u) + { + //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && + // (zFrac2 != 0u); + } + else + { + //increment = 
(FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && + // (zFrac2 != 0u); + } + } + } + if (0x7FD <= zExp) + { + if ((0x7FD < zExp) || ((zExp == 0x7FD) && (0x001FFFFFu == zFrac0 && 0xFFFFFFFFu == zFrac1) && increment)) + { + if (false) // ((FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) || + // ((zSign != 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP)) || + // ((zSign == 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN))) + { + return packFloat64(zSign, 0x7FE, 0x000FFFFFu, 0xFFFFFFFFu); + } + + return packFloat64(zSign, 0x7FF, 0u, 0u); + } + } + + if (zExp < 0) + { + nbl::hlsl::uint32_t3 shifted = shift64ExtraRightJamming(zFrac0, zFrac1, zFrac2, -zExp); + zFrac0 = shifted.x; + zFrac1 = shifted.y; + zFrac2 = shifted.z; + zExp = 0; + + if (roundNearestEven) + { + increment = zFrac2 < 0u; + } + else + { + if (zSign != 0u) + { + //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && (zFrac2 != 0u); + } + else + { + //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && (zFrac2 != 0u); + } + } + } + + if (increment) + { + nbl::hlsl::uint32_t2 added = add64(zFrac0, zFrac1, 0u, 1u); + zFrac0 = added.x; + zFrac1 = added.y; + zFrac1 &= ~((zFrac2 + uint32_t(zFrac2 == 0u)) & uint32_t(roundNearestEven)); + } + else + { + zExp = nbl::hlsl::lerp(zExp, 0, (zFrac0 | zFrac1) == 0u); + } + + return packFloat64(zSign, zExp, zFrac0, zFrac1); + } + + } struct emulated_float64_t @@ -91,6 +220,14 @@ namespace emulated output.data = val; return output; } + + template<> + static emulated_float64_t create(double val) + { + emulated_float64_t output; + output.data = reinterpret_cast(val); + return output; + } // arithmetic operators emulated_float64_t operator+(const emulated_float64_t rhs) @@ -109,16 +246,17 @@ namespace emulated emulated_float64_t operator*(const emulated_float64_t rhs) { - emulated_float64_t retval; - - uint32_t lhsLow = uint32_t(data); - uint32_t rhsLow = uint32_t(rhs.data); + emulated_float64_t retval = emulated_float64_t::create(0u); + + + uint32_t lhsLow = uint32_t(data & 0x00000000FFFFFFFFull); + uint32_t rhsLow = uint32_t(rhs.data & 0x00000000FFFFFFFFull); uint32_t lhsHigh = uint32_t((data & 0x000FFFFF00000000ull) >> 32); uint32_t rhsHigh = uint32_t((rhs.data & 0x000FFFFF00000000ull) >> 32); uint32_t lhsExp = uint32_t((data >> 52) & 0x7FFull); uint32_t rhsExp = uint32_t((rhs.data >> 52) & 0x7FFull); - int32_t exp = lhsExp + rhsExp - 0x400ull; + int32_t exp = int32_t(lhsExp + rhsExp) - 0x400u; uint64_t sign = (data ^ rhs.data) & 0x8000000000000000ull; @@ -127,20 +265,16 @@ namespace emulated rhsHigh = shifted.x; rhsLow = shifted.y; - nbl::hlsl::uint64_t4 product = emulated::impl::mul64To128(lhsHigh, lhsLow, rhsHigh, rhsLow); - product.xy = emulated::impl::add64(product.x, product.y, lhsHigh, aLow); - product.z |= uint32_t(product.w != 0u); - if (0x00200000u <= product.x) + nbl::hlsl::uint32_t4 fracUnpacked = impl::mul64to128(lhsHigh, lhsLow, rhsHigh, rhsLow); + fracUnpacked.xy = emulated::impl::add64(fracUnpacked.x, fracUnpacked.y, lhsHigh, lhsLow); + fracUnpacked.z |= uint32_t(fracUnpacked.w != 0u); + if (0x00200000u <= fracUnpacked.x) { - //__shift64ExtraRightJamming( - // zFrac0, zFrac1, zFrac2, 1, zFrac0, zFrac1, zFrac2); - ++zExp; + fracUnpacked = nbl::hlsl::uint32_t4(impl::shift64ExtraRightJamming(fracUnpacked.x, fracUnpacked.y, fracUnpacked.z, 1), 0u); + ++exp; } - //return __roundAndPackFloat64(zSign, zExp, zFrac0, zFrac1, zFrac2); - - //uint32_t frac = 1; - - return emulated_float64_t::create(sign | ((uint64_t(exp) + 1023ull) << 52) | (uint64_t(frac) & 0x000FFFFFFFFFFFFull)); + + return 
emulated_float64_t::create(impl::roundAndPackFloat64(sign, exp, fracUnpacked.x, fracUnpacked.y, fracUnpacked.z)); } emulated_float64_t operator/(const emulated_float64_t rhs) From b20de536558e4cddacd02b176eca0ced55a78908 Mon Sep 17 00:00:00 2001 From: Przemek Date: Fri, 31 May 2024 18:55:50 +0200 Subject: [PATCH 013/358] Implemented add sub i mul operators --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 336 ++++++++++++++++-- 2 files changed, 304 insertions(+), 34 deletions(-) diff --git a/examples_tests b/examples_tests index 2a878a000c..1169a1301f 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 2a878a000c9898fe98d707ae965b5ba2122b0049 +Subproject commit 1169a1301f8e21957ea962a2f6abf502b58d6591 diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index 2cb5d303b8..65210835d6 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -1,7 +1,30 @@ #include -using float32_t = float; -//using emulated_float64_t = double; +#ifdef __HLSL_VERSION +#define LERP lerp +#else +#define LERP nbl::hlsl::lerp +#endif + +#ifdef __HLSL_VERSION +#define ABS abs +#else +#define ABS std::abs +#endif + +// TODO: inline function +#define EXCHANGE(a, b) \ + do { \ + a ^= b; \ + b ^= a; \ + a ^= b; \ + } while (false) + +#define FLOAT_ROUND_NEAREST_EVEN 0 +#define FLOAT_ROUND_TO_ZERO 1 +#define FLOAT_ROUND_DOWN 2 +#define FLOAT_ROUND_UP 3 +#define FLOAT_ROUNDING_MODE FLOAT_ROUND_NEAREST_EVEN namespace emulated { @@ -11,8 +34,8 @@ namespace emulated { uint64_t product = uint64_t(lhs) * uint64_t(rhs); nbl::hlsl::uint32_t2 output; - output.x = (product & 0xFFFFFFFF00000000) >> 32; - output.y = product & 0x00000000FFFFFFFFull; + output.x = uint32_t((product & 0xFFFFFFFF00000000) >> 32); + output.y = uint32_t(product & 0x00000000FFFFFFFFull); return output; } @@ -24,16 +47,56 @@ namespace emulated return output; } + + nbl::hlsl::uint32_t2 sub64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) + { + nbl::hlsl::uint32_t2 output; + output.y = a1 - b1; + output.x = a0 - b0 - uint32_t(a1 < b1); + + return output; + } + + // TODO: test + int countLeadingZeros32(uint32_t val) + { +#ifndef __HLSL_VERSION + return 31 - nbl::hlsl::findMSB(val); +#else + return 31 - firstbithigh(val); +#endif + } + nbl::hlsl::uint32_t2 shortShift64Left(uint32_t a0, uint32_t a1, int count) { nbl::hlsl::uint32_t2 output; output.y = a1 << count; - output.x = nbl::hlsl::lerp((a0 << count | (a1 >> ((-count) & 31))), a0, count == 0); + output.x = LERP((a0 << count | (a1 >> ((-count) & 31))), a0, count == 0); return output; }; + nbl::hlsl::uint32_t2 shift64RightJamming(uint32_t a0, uint32_t a1, int count) + { + nbl::hlsl::uint32_t2 output; + const int negCount = (-count) & 31; + + output.x = LERP(0u, a0, count == 0); + output.x = LERP(output.x, (a0 >> count), count < 32); + + output.y = uint32_t((a0 | a1) != 0u); /* count >= 64 */ + uint32_t z1_lt64 = (a0>>(count & 31)) | uint32_t(((a0<>count) | uint32_t ((a1<> (count & 31)), count < 64); - output.y = nbl::hlsl::lerp(output.y, (a0<>count), count < 32); + output.y = LERP(0u, (a0 >> (count & 31)), count < 64); + output.y = LERP(output.y, (a0<>count), count < 32); - a2 = nbl::hlsl::lerp(a2 | a1, a2, count < 32); - output.x = nbl::hlsl::lerp(output.x, a0 >> count, count < 32); + a2 = LERP(a2 | a1, a2, count < 32); + output.x = LERP(output.x, a0 >> count, count < 32); output.z |= uint32_t(a2 != 0u); - output.x = 
nbl::hlsl::lerp(output.x, 0u, (count == 32)); - output.y = nbl::hlsl::lerp(output.y, a0, (count == 32)); - output.z = nbl::hlsl::lerp(output.z, a1, (count == 32)); - output.x = nbl::hlsl::lerp(output.x, a0, (count == 0)); - output.y = nbl::hlsl::lerp(output.y, a1, (count == 0)); - output.z = nbl::hlsl::lerp(output.z, a2, (count == 0)); + output.x = LERP(output.x, 0u, (count == 32)); + output.y = LERP(output.y, a0, (count == 32)); + output.z = LERP(output.z, a1, (count == 32)); + output.x = LERP(output.x, a0, (count == 0)); + output.y = LERP(output.y, a1, (count == 0)); + output.z = LERP(output.z, a2, (count == 0)); return output; } @@ -178,11 +241,11 @@ namespace emulated { if (zSign != 0u) { - //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && (zFrac2 != 0u); + increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && (zFrac2 != 0u); } else { - //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && (zFrac2 != 0u); + increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && (zFrac2 != 0u); } } } @@ -196,12 +259,36 @@ namespace emulated } else { - zExp = nbl::hlsl::lerp(zExp, 0, (zFrac0 | zFrac1) == 0u); + zExp = LERP(zExp, 0, (zFrac0 | zFrac1) == 0u); } return packFloat64(zSign, zExp, zFrac0, zFrac1); } - + + uint64_t normalizeRoundAndPackFloat64(uint32_t sign, int exp, uint32_t frac0, uint32_t frac1) + { + int shiftCount; + nbl::hlsl::uint32_t3 frac = nbl::hlsl::uint32_t3(frac0, frac1, 0u); + + if (frac.x == 0u) + { + exp -= 32; + frac.x = frac.y; + frac.y = 0u; + } + + shiftCount = countLeadingZeros32(frac.x) - 11; + if (0 <= shiftCount) + { + frac.xy = shortShift64Left(frac.x, frac.y, shiftCount); + } + else + { + frac.xyz = shift64ExtraRightJamming(frac.x, frac.y, 0u, -shiftCount); + } + exp -= shiftCount; + return roundAndPackFloat64(sign, exp, frac.x, frac.y, frac.z); + } } @@ -215,12 +302,37 @@ namespace emulated // TODO: specializations template static emulated_float64_t create(T val) - { + { + emulated_float64_t output; + output.data = val; + return output; + } + + static emulated_float64_t createEmulatedFloat64PreserveBitPattern(uint64_t val) + { emulated_float64_t output; output.data = val; return output; } + // TODO: won't not work for uints with msb of index > 52 + template<> + static emulated_float64_t create(uint64_t val) + { +#ifndef __HLSL_VERSION + const uint64_t msbIndex = nbl::hlsl::findMSB(val); +#else + const uint64_t msbIndex = firstbithigh(val); +#endif + uint64_t exp = ((msbIndex + 1023) << 52) & 0x7FF0000000000000; + uint64_t mantissa = (val << (52 - msbIndex)) & 0x000FFFFFFFFFFFFFull; + emulated_float64_t output; + output.data = exp | mantissa; + return output; + } + + // TODO: temporary, remove +#ifndef __HLSL_VERSION template<> static emulated_float64_t create(double val) { @@ -228,36 +340,184 @@ namespace emulated output.data = reinterpret_cast(val); return output; } +#endif // arithmetic operators emulated_float64_t operator+(const emulated_float64_t rhs) { - emulated_float64_t retval; - retval.data = data + rhs.data; - return retval; + emulated_float64_t retval = createEmulatedFloat64PreserveBitPattern(0u); + + uint32_t lhsSign = uint32_t((data & 0x8000000000000000ull) >> 32); + uint32_t rhsSign = uint32_t((rhs.data & 0x8000000000000000ull) >> 32); + + uint32_t lhsLow = uint32_t(data & 0x00000000FFFFFFFFull); + uint32_t rhsLow = uint32_t(rhs.data & 0x00000000FFFFFFFFull); + uint32_t lhsHigh = uint32_t((data & 0x000FFFFF00000000ull) >> 32); + uint32_t rhsHigh = uint32_t((rhs.data & 0x000FFFFF00000000ull) >> 32); + + int lhsExp = int((data >> 52) & 
0x7FFull); + int rhsExp = int((rhs.data >> 52) & 0x7FFull); + + int expDiff = lhsExp - rhsExp; + + if (lhsSign == rhsSign) + { + nbl::hlsl::uint32_t3 frac; + int exp; + + if (expDiff == 0) + { + //if (lhsExp == 0x7FF) + //{ + // bool propagate = ((lhsHigh | rhsHigh) | (lhsLow| rhsLow)) != 0u; + // return nbl::hlsl::lerp(a, propagateFloat64NaN(a, b), propagate); + //} + frac.xy = impl::add64(lhsHigh, lhsLow, rhsHigh, rhsLow); + if (lhsExp == 0) + return createEmulatedFloat64PreserveBitPattern(impl::packFloat64(lhsSign, 0, frac.x, frac.y)); + frac.z = 0u; + frac.x |= 0x00200000u; + exp = lhsExp; + frac = impl::shift64ExtraRightJamming(frac.x, frac.y, frac.z, 1); + } + else + { + if (expDiff < 0) + { + EXCHANGE(lhsHigh, rhsHigh); + EXCHANGE(lhsLow, rhsLow); + EXCHANGE(lhsExp, rhsExp); + } + + //if (lhsExp == 0x7FF) + //{ + // bool propagate = (lhsHigh | lhsLow) != 0u; + // return nbl::hlsl::lerp(__packFloat64(lhsSign, 0x7ff, 0u, 0u), __propagateFloat64NaN(a, b), propagate); + //} + + expDiff = LERP(ABS(expDiff), ABS(expDiff) - 1, rhsExp == 0); + rhsHigh = LERP(rhsHigh | 0x00100000u, rhsHigh, rhsExp == 0); + nbl::hlsl::float32_t3 shifted = impl::shift64ExtraRightJamming(rhsHigh, rhsLow, 0u, expDiff); + rhsHigh = shifted.x; + rhsLow = shifted.y; + frac.z = shifted.z; + exp = lhsExp; + + lhsHigh |= 0x00100000u; + frac.xy = impl::add64(lhsHigh, lhsLow, rhsHigh, rhsLow); + --exp; + if (!(frac.x < 0x00200000u)) + { + frac = impl::shift64ExtraRightJamming(frac.x, frac.y, frac.z, 1); + ++exp; + } + + return createEmulatedFloat64PreserveBitPattern(impl::roundAndPackFloat64(lhsSign, exp, frac.x, frac.y, frac.z)); + } + + // cannot happen but compiler cries about not every path returning value + return createEmulatedFloat64PreserveBitPattern(0xdeadbeefbadcaffeull); + } + else + { + int exp; + + nbl::hlsl::uint32_t2 lhsShifted = impl::shortShift64Left(lhsHigh, lhsLow, 10); + lhsHigh = lhsShifted.x; + lhsLow = lhsShifted.y; + nbl::hlsl::uint32_t2 rhsShifted = impl::shortShift64Left(rhsHigh, rhsLow, 10); + rhsHigh = rhsShifted.x; + rhsLow = rhsShifted.y; + + if (expDiff != 0) + { + nbl::hlsl::uint32_t2 frac; + + if (expDiff < 0) + { + EXCHANGE(lhsHigh, rhsHigh); + EXCHANGE(lhsLow, rhsLow); + EXCHANGE(lhsExp, rhsExp); + lhsSign ^= 0x80000000u; // smth not right about that + } + + //if (lhsExp == 0x7FF) + //{ + // bool propagate = (lhsHigh | lhsLow) != 0u; + // return nbl::hlsl::lerp(__packFloat64(lhsSign, 0x7ff, 0u, 0u), __propagateFloat64NaN(a, b), propagate); + //} + + expDiff = LERP(ABS(expDiff), ABS(expDiff) - 1, rhsExp == 0); + rhsHigh = LERP(rhsHigh | 0x40000000u, rhsHigh, rhsExp == 0); + nbl::hlsl::uint32_t2 shifted = impl::shift64RightJamming(rhsHigh, rhsLow, expDiff); + rhsHigh = shifted.x; + rhsLow = shifted.y; + lhsHigh |= 0x40000000u; + frac.xy = impl::sub64(lhsHigh, lhsLow, rhsHigh, rhsLow); + exp = lhsExp; + --exp; + return createEmulatedFloat64PreserveBitPattern(impl::normalizeRoundAndPackFloat64(lhsSign, exp - 10, frac.x, frac.y)); + } + //if (lhsExp == 0x7FF) + //{ + // bool propagate = ((lhsHigh | rhsHigh) | (lhsLow | rhsLow)) != 0u; + // return nbl::hlsl::lerp(0xFFFFFFFFFFFFFFFFUL, __propagateFloat64NaN(a, b), propagate); + //} + rhsExp = LERP(rhsExp, 1, lhsExp == 0); + lhsExp = LERP(lhsExp, 1, lhsExp == 0); + + nbl::hlsl::uint32_t2 frac; + uint32_t signOfDifference = 0; + if (rhsHigh < lhsHigh) + { + frac.xy = impl::sub64(lhsHigh, lhsLow, rhsHigh, rhsLow); + } + else if (lhsHigh < rhsHigh) + { + frac.xy = impl::sub64(rhsHigh, rhsLow, lhsHigh, lhsLow); + signOfDifference = 0x80000000; + } 
+ else if (rhsLow <= lhsLow) + { + /* It is possible that frac.x and frac.y may be zero after this. */ + frac.xy = impl::sub64(lhsHigh, lhsLow, rhsHigh, rhsLow); + } + else + { + frac.xy = impl::sub64(rhsHigh, rhsLow, lhsHigh, lhsLow); + signOfDifference = 0x80000000; + } + + exp = LERP(rhsExp, lhsExp, signOfDifference == 0u); + lhsSign ^= signOfDifference; + uint64_t retval_0 = impl::packFloat64(uint32_t(FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) << 31, 0, 0u, 0u); + uint64_t retval_1 = impl::normalizeRoundAndPackFloat64(lhsSign, exp - 11, frac.x, frac.y); + return createEmulatedFloat64PreserveBitPattern(LERP(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); + } } - emulated_float64_t operator-(const emulated_float64_t rhs) + emulated_float64_t operator-(emulated_float64_t rhs) { - emulated_float64_t retval; - retval.data = data - rhs.data; - return retval; + emulated_float64_t lhs = createEmulatedFloat64PreserveBitPattern(data); + emulated_float64_t rhsFlipped = rhs.flipSign(); + + return lhs + rhsFlipped; } emulated_float64_t operator*(const emulated_float64_t rhs) { emulated_float64_t retval = emulated_float64_t::create(0u); - uint32_t lhsLow = uint32_t(data & 0x00000000FFFFFFFFull); uint32_t rhsLow = uint32_t(rhs.data & 0x00000000FFFFFFFFull); uint32_t lhsHigh = uint32_t((data & 0x000FFFFF00000000ull) >> 32); uint32_t rhsHigh = uint32_t((rhs.data & 0x000FFFFF00000000ull) >> 32); + uint32_t lhsExp = uint32_t((data >> 52) & 0x7FFull); uint32_t rhsExp = uint32_t((rhs.data >> 52) & 0x7FFull); int32_t exp = int32_t(lhsExp + rhsExp) - 0x400u; - uint64_t sign = (data ^ rhs.data) & 0x8000000000000000ull; + uint64_t sign = uint32_t(((data ^ rhs.data) & 0x8000000000000000ull) >> 32); lhsHigh |= 0x00100000u; @@ -274,9 +534,10 @@ namespace emulated ++exp; } - return emulated_float64_t::create(impl::roundAndPackFloat64(sign, exp, fracUnpacked.x, fracUnpacked.y, fracUnpacked.z)); + return createEmulatedFloat64PreserveBitPattern(impl::roundAndPackFloat64(sign, exp, fracUnpacked.x, fracUnpacked.y, fracUnpacked.z)); } + // TODO emulated_float64_t operator/(const emulated_float64_t rhs) { emulated_float64_t retval; @@ -313,5 +574,14 @@ namespace emulated // - not implementing bitwise and modulo operators since floating point types doesn't support them // - compound operator overload not supported in HLSL // - access operators (dereference and addressof) not supported in HLSL + + // TODO: should modify self? 
+ emulated_float64_t flipSign() + { + const uint64_t flippedSign = ((~data) & 0x8000000000000000ull); + return createEmulatedFloat64PreserveBitPattern(flippedSign | (data & 0x7FFFFFFFFFFFFFFFull)); + } }; + + //_NBL_STATIC_INLINE_CONSTEXPR emulated_float64_t EMULATED_FLOAT64_NAN = emulated_float64_t::create(0.0 / 0.0); } From c5a635267e6ee21195b1a9fa9098fbba3187ff07 Mon Sep 17 00:00:00 2001 From: Przemek Date: Wed, 5 Jun 2024 15:16:06 +0200 Subject: [PATCH 014/358] All operators (except div) works on both CPU and GPU --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 25 +++++++++++++------ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/examples_tests b/examples_tests index 1169a1301f..2d5d301138 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 1169a1301f8e21957ea962a2f6abf502b58d6591 +Subproject commit 2d5d3011384474aec6e9527abafc707214f427bd diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index 65210835d6..217ac2b14b 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -303,9 +303,19 @@ namespace emulated template static emulated_float64_t create(T val) { +#ifndef __HLSL_VERSION emulated_float64_t output; - output.data = val; + output.data = reinterpret_cast(val); return output; +#else + uint32_t lowBits; + uint32_t highBits; + asuint(val, lowBits, highBits); + + emulated_float64_t output; + output.data = (uint64_t(highBits) << 32) | uint64_t(lowBits); + return output; +#endif } static emulated_float64_t createEmulatedFloat64PreserveBitPattern(uint64_t val) @@ -316,6 +326,7 @@ namespace emulated } // TODO: won't not work for uints with msb of index > 52 +#ifndef __HLSL_VERSION template<> static emulated_float64_t create(uint64_t val) { @@ -330,7 +341,7 @@ namespace emulated output.data = exp | mantissa; return output; } - +#endif // TODO: temporary, remove #ifndef __HLSL_VERSION template<> @@ -438,7 +449,7 @@ namespace emulated EXCHANGE(lhsHigh, rhsHigh); EXCHANGE(lhsLow, rhsLow); EXCHANGE(lhsExp, rhsExp); - lhsSign ^= 0x80000000u; // smth not right about that + lhsSign ^= 0x80000000u; // TODO: smth not right about that } //if (lhsExp == 0x7FF) @@ -540,14 +551,12 @@ namespace emulated // TODO emulated_float64_t operator/(const emulated_float64_t rhs) { - emulated_float64_t retval; - retval.data = data / rhs.data; - return retval; + return createEmulatedFloat64PreserveBitPattern(0xdeadbeefbadcaffeull); } // relational operators - bool operator==(const emulated_float64_t rhs) { return !(uint64_t(data) ^ uint64_t(rhs.data)); } - bool operator!=(const emulated_float64_t rhs) { return uint64_t(data) ^ uint64_t(rhs.data); } + bool operator==(const emulated_float64_t rhs) { return !(data ^ rhs.data); } + bool operator!=(const emulated_float64_t rhs) { return data ^ rhs.data; } bool operator<(const emulated_float64_t rhs) { return data < rhs.data; } bool operator>(const emulated_float64_t rhs) { return data > rhs.data; } bool operator<=(const emulated_float64_t rhs) { return data <= rhs.data; } From 9de77db3e4b052a535812f252d312f838b4d35aa Mon Sep 17 00:00:00 2001 From: Przemek Date: Mon, 10 Jun 2024 15:02:49 +0200 Subject: [PATCH 015/358] Ported ieee754.glsl --- include/nbl/builtin/hlsl/bit.hlsl | 8 +- include/nbl/builtin/hlsl/cpp_compat.hlsl | 2 +- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 56 +++++++++--- include/nbl/builtin/hlsl/ieee754.hlsl | 88 +++++++++++++++++++ 4 files changed, 138 
insertions(+), 16 deletions(-) create mode 100644 include/nbl/builtin/hlsl/ieee754.hlsl diff --git a/include/nbl/builtin/hlsl/bit.hlsl b/include/nbl/builtin/hlsl/bit.hlsl index fd6ff0c167..e1f1117af6 100644 --- a/include/nbl/builtin/hlsl/bit.hlsl +++ b/include/nbl/builtin/hlsl/bit.hlsl @@ -33,11 +33,11 @@ namespace nbl namespace hlsl { -template -T bit_cast(U val) +template +To bit_cast(From val) { - static_assert(sizeof(T) <= sizeof(U)); - return spirv::bitcast(val); + static_assert(sizeof(To) <= sizeof(From)); + return spirv::bitcast(val); } template diff --git a/include/nbl/builtin/hlsl/cpp_compat.hlsl b/include/nbl/builtin/hlsl/cpp_compat.hlsl index ee615ba5b7..a22e8cc0c5 100644 --- a/include/nbl/builtin/hlsl/cpp_compat.hlsl +++ b/include/nbl/builtin/hlsl/cpp_compat.hlsl @@ -41,7 +41,7 @@ using add_pointer = std::add_pointer; #define ARROW .arrow(). #define NBL_CONSTEXPR const static #define NBL_CONSTEXPR_STATIC_INLINE const static -#define NBL_CONST_MEMBER_FUNC +#define NBL_CONST_MEMBER_FUNC namespace nbl { diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index 217ac2b14b..175e610914 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -38,6 +38,11 @@ namespace emulated output.y = uint32_t(product & 0x00000000FFFFFFFFull); return output; } + + bool isNaN64(uint64_t val) + { + return bool((0x7FF0000000000000ull & val) && (0x000FFFFFFFFFFFFFull & val)); + } nbl::hlsl::uint32_t2 add64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) { @@ -47,7 +52,8 @@ namespace emulated return output; } - + + nbl::hlsl::uint32_t2 sub64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) { nbl::hlsl::uint32_t2 output; @@ -66,6 +72,24 @@ namespace emulated return 31 - firstbithigh(val); #endif } + + uint64_t propagateFloat64NaN(uint64_t a, uint64_t b) + { + #if defined RELAXED_NAN_PROPAGATION + return a | b; + #else + + bool aIsNaN = isNaN64(a); + bool bIsNaN = isNaN64(b); + a |= 0x0008000000000000ull; + b |= 0x0008000000000000ull; + + // TODO: + //return LERP(b, LERP(a, b, nbl::hlsl::float32_t2(bIsNaN, bIsNaN)), nbl::hlsl::float32_t2(aIsNaN, aIsNaN)); + return 0xdeadbeefbadcaffeull; + #endif + } + nbl::hlsl::uint32_t2 shortShift64Left(uint32_t a0, uint32_t a1, int count) @@ -289,7 +313,10 @@ namespace emulated exp -= shiftCount; return roundAndPackFloat64(sign, exp, frac.x, frac.y, frac.z); } - + + static const uint64_t SIGN_MASK = 0x8000000000000000ull; + static const uint64_t EXP_MASK = 0x7FF0000000000000ull; + static const uint64_t MANTISA_MASK = 0x000FFFFFFFFFFFFFull; } struct emulated_float64_t @@ -357,7 +384,7 @@ namespace emulated emulated_float64_t operator+(const emulated_float64_t rhs) { emulated_float64_t retval = createEmulatedFloat64PreserveBitPattern(0u); - + uint32_t lhsSign = uint32_t((data & 0x8000000000000000ull) >> 32); uint32_t rhsSign = uint32_t((rhs.data & 0x8000000000000000ull) >> 32); @@ -380,9 +407,10 @@ namespace emulated { //if (lhsExp == 0x7FF) //{ - // bool propagate = ((lhsHigh | rhsHigh) | (lhsLow| rhsLow)) != 0u; - // return nbl::hlsl::lerp(a, propagateFloat64NaN(a, b), propagate); + // bool propagate = (lhsMantissa | rhsMantissa) != 0u; + // return createEmulatedFloat64PreserveBitPattern(LERP(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); //} + frac.xy = impl::add64(lhsHigh, lhsLow, rhsHigh, rhsLow); if (lhsExp == 0) return createEmulatedFloat64PreserveBitPattern(impl::packFloat64(lhsSign, 0, frac.x, frac.y)); @@ -398,13 
+426,14 @@ namespace emulated EXCHANGE(lhsHigh, rhsHigh); EXCHANGE(lhsLow, rhsLow); EXCHANGE(lhsExp, rhsExp); + EXCHANGE(lhsExp, rhsExp); } - //if (lhsExp == 0x7FF) - //{ - // bool propagate = (lhsHigh | lhsLow) != 0u; - // return nbl::hlsl::lerp(__packFloat64(lhsSign, 0x7ff, 0u, 0u), __propagateFloat64NaN(a, b), propagate); - //} + if (lhsExp == 0x7FF) + { + bool propagate = (lhsHigh | lhsLow) != 0u; + return createEmulatedFloat64PreserveBitPattern(LERP(0x7FF0000000000000ull | (uint64_t(lhsSign) << 32), impl::propagateFloat64NaN(data, rhs.data), propagate)); + } expDiff = LERP(ABS(expDiff), ABS(expDiff) - 1, rhsExp == 0); rhsHigh = LERP(rhsHigh | 0x00100000u, rhsHigh, rhsExp == 0); @@ -449,7 +478,7 @@ namespace emulated EXCHANGE(lhsHigh, rhsHigh); EXCHANGE(lhsLow, rhsLow); EXCHANGE(lhsExp, rhsExp); - lhsSign ^= 0x80000000u; // TODO: smth not right about that + lhsSign ^= 0x80000000u; } //if (lhsExp == 0x7FF) @@ -590,6 +619,11 @@ namespace emulated const uint64_t flippedSign = ((~data) & 0x8000000000000000ull); return createEmulatedFloat64PreserveBitPattern(flippedSign | (data & 0x7FFFFFFFFFFFFFFFull)); } + + bool isNaN() + { + return impl::isNaN64(data); + } }; //_NBL_STATIC_INLINE_CONSTEXPR emulated_float64_t EMULATED_FLOAT64_NAN = emulated_float64_t::create(0.0 / 0.0); diff --git a/include/nbl/builtin/hlsl/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754.hlsl new file mode 100644 index 0000000000..481d7678f7 --- /dev/null +++ b/include/nbl/builtin/hlsl/ieee754.hlsl @@ -0,0 +1,88 @@ +#ifndef _NBL_BUILTIN_HLSL_IEE754_H_INCLUDED_ +#define _NBL_BUILTIN_HLSL_IEE754_H_INCLUDED_ + +#include + +namespace nbl::hlsl::ieee754 +{ + template + int getExponentBitCnt() { return 0xdeadbeefu; } + template<> int getExponentBitCnt() { return 5; } + template<> int getExponentBitCnt() { return 5; } + template<> int getExponentBitCnt() { return 8; } + template<> int getExponentBitCnt() { return 8; } + template<> int getExponentBitCnt() { return 11; } + template<> int getExponentBitCnt() { return 11; } + + template + int getMantissaBitCnt() { return 0xdeadbeefu; } + template<> int getMantissaBitCnt() { return 10; } + template<> int getMantissaBitCnt() { return 10; } + template<> int getMantissaBitCnt() { return 23; } + template<> int getMantissaBitCnt() { return 23; } + template<> int getMantissaBitCnt() { return 52; } + template<> int getMantissaBitCnt() { return 52; } + + template + int getExponentBias() { return 0xdeadbeefu; } + template<> int getExponentBias() { return 15; } + template<> int getExponentBias() { return 15; } + template<> int getExponentBias() { return 127; } + template<> int getExponentBias() { return 127; } + template<> int getExponentBias() { return 1023; } + template<> int getExponentBias() { return 1023; } + + template + unsigned_integer_of_size::type getExponentMask() { return 0xdeadbeefu; } + template<> unsigned_integer_of_size<2>::type getExponentMask() { return 0x7C00; } + template<> unsigned_integer_of_size<2>::type getExponentMask() { return 0x7C00; } + template<> unsigned_integer_of_size<4>::type getExponentMask() { return 0x7F800000u; } + template<> unsigned_integer_of_size<4>::type getExponentMask() { return 0x7F800000u; } + template<> unsigned_integer_of_size<8>::type getExponentMask() { return 0x7FF0000000000000ull; } + template<> unsigned_integer_of_size<8>::type getExponentMask() { return 0x7FF0000000000000ull; } + + template + unsigned_integer_of_size::type getMantissaMask() { return 0xdeadbeefu; } + template<> unsigned_integer_of_size<2>::type getMantissaMask() { return 
0x03FF; } + template<> unsigned_integer_of_size<2>::type getMantissaMask() { return 0x03FF; } + template<> unsigned_integer_of_size<4>::type getMantissaMask() { return 0x007FFFFFu; } + template<> unsigned_integer_of_size<4>::type getMantissaMask() { return 0x007FFFFFu; } + template<> unsigned_integer_of_size<8>::type getMantissaMask() { return 0x000FFFFFFFFFFFFFull; } + template<> unsigned_integer_of_size<8>::type getMantissaMask() { return 0x000FFFFFFFFFFFFFull; } + + template + uint32_t extractBiasedExponent(T x) + { + using AsUint = typename unsigned_integer_of_size::type; + return bitfieldExtract(bit_cast(x), getMantissaBitCnt(), getExponentBitCnt()); + } + + template + int extractExponent(T x) + { + return int(extractBiasedExponent(x) - getExponentBias()); + } + + template + T replaceBiasedExponent(T x, uint32_t biasedExp) + { + using AsUint = typename unsigned_integer_of_size::type; + return bitCast(uintBitsToFloat(bitfieldInsert(bit_cast(x), biasedExp, getMantissaBitCnt(), getExponentBitCnt()))); + } + + // performs no overflow tests, returns x*exp2(n) + template + T fastMulExp2(T x, int n) + { + return replaceBiasedExponent(x, extractBiasedExponent(x) + uint32_t(n)); + } + + template + unsigned_integer_of_size::type extractMantissa(T x) + { + using AsUint = typename unsigned_integer_of_size::type; + return (bit_cast(x) & getMantissaMask()); + } +} + +#endif \ No newline at end of file From aba21cef477aa123ac08bae332ddf2193e38b886 Mon Sep 17 00:00:00 2001 From: Przemek Date: Thu, 13 Jun 2024 23:25:59 +0200 Subject: [PATCH 016/358] Implemented ieee754.hlsl --- include/nbl/builtin/hlsl/bit.hlsl | 14 +++++ include/nbl/builtin/hlsl/ieee754.hlsl | 64 ++++++++++++++++++----- include/nbl/builtin/hlsl/type_traits.hlsl | 24 +++++++++ 3 files changed, 90 insertions(+), 12 deletions(-) diff --git a/include/nbl/builtin/hlsl/bit.hlsl b/include/nbl/builtin/hlsl/bit.hlsl index e1f1117af6..d9bf8b0f42 100644 --- a/include/nbl/builtin/hlsl/bit.hlsl +++ b/include/nbl/builtin/hlsl/bit.hlsl @@ -19,6 +19,20 @@ constexpr To bit_cast(const From& from) return std::bit_cast(from); } +template +NBL_FORCE_INLINE constexpr T bitfield_insert(T base, T insert, int32_t offset, int32_t bits) +{ + NBL_CONSTEXPR T one = static_cast(1); + const T mask = (one << bits) - one; + const T shifted_mask = mask << offset; + + insert &= mask; + base &= (~shifted_mask); + base |= (insert << offset); + + return base; +} + NBL_ALIAS_TEMPLATE_FUNCTION(std::rotl, rotl); NBL_ALIAS_TEMPLATE_FUNCTION(std::rotr, rotr); NBL_ALIAS_TEMPLATE_FUNCTION(std::countl_zero, countl_zero); diff --git a/include/nbl/builtin/hlsl/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754.hlsl index 481d7678f7..55cfb5214f 100644 --- a/include/nbl/builtin/hlsl/ieee754.hlsl +++ b/include/nbl/builtin/hlsl/ieee754.hlsl @@ -5,22 +5,63 @@ namespace nbl::hlsl::ieee754 { + namespace impl + { + template + unsigned_integer_of_size::type castToUintType(T x) + { + using AsUint = typename unsigned_integer_of_size::type; + return bit_cast(x); + } + + // to avoid bit cast from uintN_t to uintN_t + template <> unsigned_integer_of_size<2>::type castToUintType(uint16_t x) { return x; } + template <> unsigned_integer_of_size<4>::type castToUintType(uint32_t x) { return x; } + template <> unsigned_integer_of_size<8>::type castToUintType(uint64_t x) { return x; } + + template + T replaceBiasedExponent(T x, uint32_t biasedExp) + { + static_assert(is_same::value || is_same::value || is_same::value, + "Invalid type! 
Only floating point or unsigned integer types are allowed."); + return bitfield_insert(x, T(biasedExp), getMantissaBitCnt(), getExponentBitCnt()); + } + + template<> + float16_t replaceBiasedExponent(float16_t x, uint32_t biasedExp) + { + return bit_cast(replaceBiasedExponent(bit_cast(x), biasedExp)); + } + + template<> + float32_t replaceBiasedExponent(float32_t x, uint32_t biasedExp) + { + return bit_cast(replaceBiasedExponent(bit_cast(x), biasedExp)); + } + + template<> + float64_t replaceBiasedExponent(float64_t x, uint32_t biasedExp) + { + return bit_cast(replaceBiasedExponent(bit_cast(x), biasedExp)); + } + } + template int getExponentBitCnt() { return 0xdeadbeefu; } template<> int getExponentBitCnt() { return 5; } template<> int getExponentBitCnt() { return 5; } - template<> int getExponentBitCnt() { return 8; } + template<> int getExponentBitCnt() { return 8; } template<> int getExponentBitCnt() { return 8; } - template<> int getExponentBitCnt() { return 11; } + template<> int getExponentBitCnt() { return 11; } template<> int getExponentBitCnt() { return 11; } template int getMantissaBitCnt() { return 0xdeadbeefu; } template<> int getMantissaBitCnt() { return 10; } template<> int getMantissaBitCnt() { return 10; } - template<> int getMantissaBitCnt() { return 23; } + template<> int getMantissaBitCnt() { return 23; } template<> int getMantissaBitCnt() { return 23; } - template<> int getMantissaBitCnt() { return 52; } + template<> int getMantissaBitCnt() { return 52; } template<> int getMantissaBitCnt() { return 52; } template @@ -36,25 +77,25 @@ namespace nbl::hlsl::ieee754 unsigned_integer_of_size::type getExponentMask() { return 0xdeadbeefu; } template<> unsigned_integer_of_size<2>::type getExponentMask() { return 0x7C00; } template<> unsigned_integer_of_size<2>::type getExponentMask() { return 0x7C00; } - template<> unsigned_integer_of_size<4>::type getExponentMask() { return 0x7F800000u; } + template<> unsigned_integer_of_size<4>::type getExponentMask() { return 0x7F800000u; } template<> unsigned_integer_of_size<4>::type getExponentMask() { return 0x7F800000u; } - template<> unsigned_integer_of_size<8>::type getExponentMask() { return 0x7FF0000000000000ull; } + template<> unsigned_integer_of_size<8>::type getExponentMask() { return 0x7FF0000000000000ull; } template<> unsigned_integer_of_size<8>::type getExponentMask() { return 0x7FF0000000000000ull; } template unsigned_integer_of_size::type getMantissaMask() { return 0xdeadbeefu; } template<> unsigned_integer_of_size<2>::type getMantissaMask() { return 0x03FF; } template<> unsigned_integer_of_size<2>::type getMantissaMask() { return 0x03FF; } - template<> unsigned_integer_of_size<4>::type getMantissaMask() { return 0x007FFFFFu; } + template<> unsigned_integer_of_size<4>::type getMantissaMask() { return 0x007FFFFFu; } template<> unsigned_integer_of_size<4>::type getMantissaMask() { return 0x007FFFFFu; } - template<> unsigned_integer_of_size<8>::type getMantissaMask() { return 0x000FFFFFFFFFFFFFull; } + template<> unsigned_integer_of_size<8>::type getMantissaMask() { return 0x000FFFFFFFFFFFFFull; } template<> unsigned_integer_of_size<8>::type getMantissaMask() { return 0x000FFFFFFFFFFFFFull; } template uint32_t extractBiasedExponent(T x) { using AsUint = typename unsigned_integer_of_size::type; - return bitfieldExtract(bit_cast(x), getMantissaBitCnt(), getExponentBitCnt()); + return bitfieldExtract(impl::castToUintType(x), getMantissaBitCnt(), getExponentBitCnt()); } template @@ -66,8 +107,7 @@ namespace nbl::hlsl::ieee754 
template T replaceBiasedExponent(T x, uint32_t biasedExp) { - using AsUint = typename unsigned_integer_of_size::type; - return bitCast(uintBitsToFloat(bitfieldInsert(bit_cast(x), biasedExp, getMantissaBitCnt(), getExponentBitCnt()))); + return impl::replaceBiasedExponent(x, biasedExp); } // performs no overflow tests, returns x*exp2(n) @@ -81,7 +121,7 @@ namespace nbl::hlsl::ieee754 unsigned_integer_of_size::type extractMantissa(T x) { using AsUint = typename unsigned_integer_of_size::type; - return (bit_cast(x) & getMantissaMask()); + return (impl::castToUintType(x) & getMantissaMask()); } } diff --git a/include/nbl/builtin/hlsl/type_traits.hlsl b/include/nbl/builtin/hlsl/type_traits.hlsl index 6a58c157a1..f1b383f4bc 100644 --- a/include/nbl/builtin/hlsl/type_traits.hlsl +++ b/include/nbl/builtin/hlsl/type_traits.hlsl @@ -661,6 +661,30 @@ struct unsigned_integer_of_size<8> using type = uint64_t; }; +template +struct float_of_size +{ + using type = void; +}; + +template<> +struct float_of_size<2> +{ + using type = float16_t; +}; + +template<> +struct float_of_size<4> +{ + using type = float32_t; +}; + +template<> +struct float_of_size<8> +{ + using type = uint64_t; +}; + } } From d54ac4d3f13c6972f2731ed57ea4c7199a3476d4 Mon Sep 17 00:00:00 2001 From: Przemek Date: Tue, 18 Jun 2024 22:10:41 +0200 Subject: [PATCH 017/358] Saving work --- examples_tests | 2 +- include/nbl/builtin/hlsl/bit.hlsl | 14 - .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 702 ++++++++++-------- .../nbl/builtin/hlsl/glsl_compat/core.hlsl | 117 +-- include/nbl/builtin/hlsl/ieee754.hlsl | 127 ++-- src/nbl/builtin/CMakeLists.txt | 2 + 6 files changed, 520 insertions(+), 444 deletions(-) diff --git a/examples_tests b/examples_tests index 2d5d301138..9874800e41 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 2d5d3011384474aec6e9527abafc707214f427bd +Subproject commit 9874800e41d75327724b5ed6cf4b11cabc7eedd7 diff --git a/include/nbl/builtin/hlsl/bit.hlsl b/include/nbl/builtin/hlsl/bit.hlsl index d9bf8b0f42..e1f1117af6 100644 --- a/include/nbl/builtin/hlsl/bit.hlsl +++ b/include/nbl/builtin/hlsl/bit.hlsl @@ -19,20 +19,6 @@ constexpr To bit_cast(const From& from) return std::bit_cast(from); } -template -NBL_FORCE_INLINE constexpr T bitfield_insert(T base, T insert, int32_t offset, int32_t bits) -{ - NBL_CONSTEXPR T one = static_cast(1); - const T mask = (one << bits) - one; - const T shifted_mask = mask << offset; - - insert &= mask; - base &= (~shifted_mask); - base |= (insert << offset); - - return base; -} - NBL_ALIAS_TEMPLATE_FUNCTION(std::rotl, rotl); NBL_ALIAS_TEMPLATE_FUNCTION(std::rotr, rotr); NBL_ALIAS_TEMPLATE_FUNCTION(std::countl_zero, countl_zero); diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index 175e610914..fe37f001be 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -1,4 +1,8 @@ +#ifndef _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_INCLUDED_ +#define _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_INCLUDED_ + #include +#include #ifdef __HLSL_VERSION #define LERP lerp @@ -25,298 +29,312 @@ #define FLOAT_ROUND_DOWN 2 #define FLOAT_ROUND_UP 3 #define FLOAT_ROUNDING_MODE FLOAT_ROUND_NEAREST_EVEN - -namespace emulated +namespace nbl +{ +namespace hlsl { - namespace impl +namespace impl +{ + template + uint64_t promoteToUint64(T val) { - nbl::hlsl::uint32_t2 umulExtended(uint32_t lhs, uint32_t rhs) - { - uint64_t product = uint64_t(lhs) * uint64_t(rhs); - 
nbl::hlsl::uint32_t2 output; - output.x = uint32_t((product & 0xFFFFFFFF00000000) >> 32); - output.y = uint32_t(product & 0x00000000FFFFFFFFull); - return output; - } - - bool isNaN64(uint64_t val) - { - return bool((0x7FF0000000000000ull & val) && (0x000FFFFFFFFFFFFFull & val)); - } + using AsFloat = unsigned_integer_of_size; + uint64_t asUint = ieee754::impl::castToUintType(val); - nbl::hlsl::uint32_t2 add64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) - { - nbl::hlsl::uint32_t2 output; - output.y = a1 + b1; - output.x = a0 + b0 + uint32_t(output.y < a1); + const uint64_t sign = (uint64_t(ieee754::getSignMask()) | asUint) << (sizeof(uint64_t) - sizeof(T) - 2); + const uint64_t exp = (uint64_t(ieee754::getExponentMask()) | asUint) << (ieee754::getMantissaBitCnt() - ieee754::getMantissaBitCnt() + ieee754::getExponentBitCnt()); + const uint64_t mantissa = (uint64_t(ieee754::getMantissaMask()) | asUint) << (ieee754::getMantissaBitCnt() - ieee754::getMantissaBitCnt()); - return output; - } + return sign | exp | mantissa; + }; + template<> uint64_t promoteToUint64(float64_t val) { return bit_cast(val); } - nbl::hlsl::uint32_t2 sub64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) - { - nbl::hlsl::uint32_t2 output; - output.y = a1 - b1; - output.x = a0 - b0 - uint32_t(a1 < b1); - - return output; - } + nbl::hlsl::uint32_t2 umulExtended(uint32_t lhs, uint32_t rhs) + { + uint64_t product = uint64_t(lhs) * uint64_t(rhs); + nbl::hlsl::uint32_t2 output; + output.x = uint32_t((product & 0xFFFFFFFF00000000) >> 32); + output.y = uint32_t(product & 0x00000000FFFFFFFFull); + return output; + } + + bool isNaN64(uint64_t val) + { + return bool((0x7FF0000000000000ull & val) && (0x000FFFFFFFFFFFFFull & val)); + } - // TODO: test - int countLeadingZeros32(uint32_t val) - { + nbl::hlsl::uint32_t2 add64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) + { + nbl::hlsl::uint32_t2 output; + output.y = a1 + b1; + output.x = a0 + b0 + uint32_t(output.y < a1); + + return output; + } + + + nbl::hlsl::uint32_t2 sub64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) + { + nbl::hlsl::uint32_t2 output; + output.y = a1 - b1; + output.x = a0 - b0 - uint32_t(a1 < b1); + + return output; + } + + // TODO: test + int countLeadingZeros32(uint32_t val) + { #ifndef __HLSL_VERSION - return 31 - nbl::hlsl::findMSB(val); + return 31 - nbl::hlsl::findMSB(val); #else - return 31 - firstbithigh(val); + return 31 - firstbithigh(val); #endif - } - - uint64_t propagateFloat64NaN(uint64_t a, uint64_t b) - { - #if defined RELAXED_NAN_PROPAGATION - return a | b; - #else + } + + uint64_t propagateFloat64NaN(uint64_t a, uint64_t b) + { + #if defined RELAXED_NAN_PROPAGATION + return a | b; + #else + + bool aIsNaN = isNaN64(a); + bool bIsNaN = isNaN64(b); + a |= 0x0008000000000000ull; + b |= 0x0008000000000000ull; + + // TODO: + //return LERP(b, LERP(a, b, nbl::hlsl::float32_t2(bIsNaN, bIsNaN)), nbl::hlsl::float32_t2(aIsNaN, aIsNaN)); + return 0xdeadbeefbadcaffeull; + #endif + } + + nbl::hlsl::uint32_t2 shortShift64Left(uint32_t a0, uint32_t a1, int count) + { + nbl::hlsl::uint32_t2 output; + output.y = a1 << count; + output.x = LERP((a0 << count | (a1 >> ((-count) & 31))), a0, count == 0); - bool aIsNaN = isNaN64(a); - bool bIsNaN = isNaN64(b); - a |= 0x0008000000000000ull; - b |= 0x0008000000000000ull; + return output; + }; + + nbl::hlsl::uint32_t2 shift64RightJamming(uint32_t a0, uint32_t a1, int count) + { + nbl::hlsl::uint32_t2 output; + const int negCount = (-count) & 31; + + output.x = LERP(0u, a0, count == 0); + output.x 
= LERP(output.x, (a0 >> count), count < 32); + + output.y = uint32_t((a0 | a1) != 0u); /* count >= 64 */ + uint32_t z1_lt64 = (a0>>(count & 31)) | uint32_t(((a0<>count) | uint32_t ((a1<> (count & 31)), count < 64); + output.y = LERP(output.y, (a0<>count), count < 32); + + a2 = LERP(a2 | a1, a2, count < 32); + output.x = LERP(output.x, a0 >> count, count < 32); + output.z |= uint32_t(a2 != 0u); + + output.x = LERP(output.x, 0u, (count == 32)); + output.y = LERP(output.y, a0, (count == 32)); + output.z = LERP(output.z, a1, (count == 32)); + output.x = LERP(output.x, a0, (count == 0)); + output.y = LERP(output.y, a1, (count == 0)); + output.z = LERP(output.z, a2, (count == 0)); + + return output; + } + + + uint64_t packFloat64(uint32_t zSign, int zExp, uint32_t zFrac0, uint32_t zFrac1) + { + nbl::hlsl::uint32_t2 z; + + z.x = zSign + (uint32_t(zExp) << 20) + zFrac0; + z.y = zFrac1; + + uint64_t output = 0u; + output |= (uint64_t(z.x) << 32) & 0xFFFFFFFF00000000ull; + output |= uint64_t(z.y); + return output; + } + + + uint64_t roundAndPackFloat64(uint32_t zSign, int zExp, uint32_t zFrac0, uint32_t zFrac1, uint32_t zFrac2) + { + bool roundNearestEven; + bool increment; + + roundNearestEven = true; + increment = int(zFrac2) < 0; + if (!roundNearestEven) + { + if (false) //(FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) + { + increment = false; + } + else + { + if (false) //(zSign != 0u) + { + //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && + // (zFrac2 != 0u); } - - - - nbl::hlsl::uint32_t2 shortShift64Left(uint32_t a0, uint32_t a1, int count) + else { - nbl::hlsl::uint32_t2 output; - output.y = a1 << count; - output.x = LERP((a0 << count | (a1 >> ((-count) & 31))), a0, count == 0); - - return output; - }; - - nbl::hlsl::uint32_t2 shift64RightJamming(uint32_t a0, uint32_t a1, int count) + //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && + // (zFrac2 != 0u); + } + } + } + if (0x7FD <= zExp) { - nbl::hlsl::uint32_t2 output; - const int negCount = (-count) & 31; - - output.x = LERP(0u, a0, count == 0); - output.x = LERP(output.x, (a0 >> count), count < 32); - - output.y = uint32_t((a0 | a1) != 0u); /* count >= 64 */ - uint32_t z1_lt64 = (a0>>(count & 31)) | uint32_t(((a0<>count) | uint32_t ((a1<> (count & 31)), count < 64); - output.y = LERP(output.y, (a0<>count), count < 32); - - a2 = LERP(a2 | a1, a2, count < 32); - output.x = LERP(output.x, a0 >> count, count < 32); - output.z |= uint32_t(a2 != 0u); - - output.x = LERP(output.x, 0u, (count == 32)); - output.y = LERP(output.y, a0, (count == 32)); - output.z = LERP(output.z, a1, (count == 32)); - output.x = LERP(output.x, a0, (count == 0)); - output.y = LERP(output.y, a1, (count == 0)); - output.z = LERP(output.z, a2, (count == 0)); - - return output; + if (zSign != 0u) + { + increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && (zFrac2 != 0u); } - - - uint64_t packFloat64(uint32_t zSign, int zExp, uint32_t zFrac0, uint32_t zFrac1) + else { - nbl::hlsl::uint32_t2 z; - - z.x = zSign + (uint32_t(zExp) << 20) + zFrac0; - z.y = zFrac1; - - uint64_t output = 0u; - output |= (uint64_t(z.x) << 32) & 0xFFFFFFFF00000000ull; - output |= uint64_t(z.y); - return output; + increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && (zFrac2 != 0u); } - - - uint64_t roundAndPackFloat64(uint32_t zSign, int zExp, uint32_t zFrac0, uint32_t zFrac1, uint32_t zFrac2) + } + } + + if (increment) { - bool roundNearestEven; - bool increment; - - roundNearestEven = true; - increment = int(zFrac2) < 0; - if (!roundNearestEven) - { - if (false) 
//(FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) - { - increment = false; - } - else - { - if (false) //(zSign != 0u) - { - //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && - // (zFrac2 != 0u); - } - else - { - //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && - // (zFrac2 != 0u); - } - } - } - if (0x7FD <= zExp) - { - if ((0x7FD < zExp) || ((zExp == 0x7FD) && (0x001FFFFFu == zFrac0 && 0xFFFFFFFFu == zFrac1) && increment)) - { - if (false) // ((FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) || - // ((zSign != 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP)) || - // ((zSign == 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN))) - { - return packFloat64(zSign, 0x7FE, 0x000FFFFFu, 0xFFFFFFFFu); - } - - return packFloat64(zSign, 0x7FF, 0u, 0u); - } - } - - if (zExp < 0) - { - nbl::hlsl::uint32_t3 shifted = shift64ExtraRightJamming(zFrac0, zFrac1, zFrac2, -zExp); - zFrac0 = shifted.x; - zFrac1 = shifted.y; - zFrac2 = shifted.z; - zExp = 0; - - if (roundNearestEven) - { - increment = zFrac2 < 0u; - } - else - { - if (zSign != 0u) - { - increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && (zFrac2 != 0u); - } - else - { - increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && (zFrac2 != 0u); - } - } - } - - if (increment) - { - nbl::hlsl::uint32_t2 added = add64(zFrac0, zFrac1, 0u, 1u); - zFrac0 = added.x; - zFrac1 = added.y; - zFrac1 &= ~((zFrac2 + uint32_t(zFrac2 == 0u)) & uint32_t(roundNearestEven)); - } - else - { - zExp = LERP(zExp, 0, (zFrac0 | zFrac1) == 0u); - } - - return packFloat64(zSign, zExp, zFrac0, zFrac1); + nbl::hlsl::uint32_t2 added = add64(zFrac0, zFrac1, 0u, 1u); + zFrac0 = added.x; + zFrac1 = added.y; + zFrac1 &= ~((zFrac2 + uint32_t(zFrac2 == 0u)) & uint32_t(roundNearestEven)); } - - uint64_t normalizeRoundAndPackFloat64(uint32_t sign, int exp, uint32_t frac0, uint32_t frac1) + else { - int shiftCount; - nbl::hlsl::uint32_t3 frac = nbl::hlsl::uint32_t3(frac0, frac1, 0u); - - if (frac.x == 0u) - { - exp -= 32; - frac.x = frac.y; - frac.y = 0u; - } - - shiftCount = countLeadingZeros32(frac.x) - 11; - if (0 <= shiftCount) - { - frac.xy = shortShift64Left(frac.x, frac.y, shiftCount); - } - else - { - frac.xyz = shift64ExtraRightJamming(frac.x, frac.y, 0u, -shiftCount); - } - exp -= shiftCount; - return roundAndPackFloat64(sign, exp, frac.x, frac.y, frac.z); + zExp = LERP(zExp, 0, (zFrac0 | zFrac1) == 0u); } - - static const uint64_t SIGN_MASK = 0x8000000000000000ull; - static const uint64_t EXP_MASK = 0x7FF0000000000000ull; - static const uint64_t MANTISA_MASK = 0x000FFFFFFFFFFFFFull; + + return packFloat64(zSign, zExp, zFrac0, zFrac1); + } + + uint64_t normalizeRoundAndPackFloat64(uint32_t sign, int exp, uint32_t frac0, uint32_t frac1) + { + int shiftCount; + nbl::hlsl::uint32_t3 frac = nbl::hlsl::uint32_t3(frac0, frac1, 0u); + + if (frac.x == 0u) + { + exp -= 32; + frac.x = frac.y; + frac.y = 0u; + } + + shiftCount = countLeadingZeros32(frac.x) - 11; + if (0 <= shiftCount) + { + frac.xy = shortShift64Left(frac.x, frac.y, shiftCount); + } + else + { + frac.xyz = shift64ExtraRightJamming(frac.x, frac.y, 0u, -shiftCount); + } + exp -= shiftCount; + return roundAndPackFloat64(sign, exp, frac.x, frac.y, frac.z); + } + + static const uint64_t SIGN_MASK = 0x8000000000000000ull; + static const uint64_t EXP_MASK = 0x7FF0000000000000ull; + static const uint64_t MANTISA_MASK = 0x000FFFFFFFFFFFFFull; } struct emulated_float64_t @@ -326,59 +344,61 @@ namespace emulated storage_t data; // constructors - // TODO: specializations - template - static emulated_float64_t create(T 
val) + static emulated_float64_t create(uint64_t val) { -#ifndef __HLSL_VERSION - emulated_float64_t output; - output.data = reinterpret_cast(val); - return output; -#else - uint32_t lowBits; - uint32_t highBits; - asuint(val, lowBits, highBits); - - emulated_float64_t output; - output.data = (uint64_t(highBits) << 32) | uint64_t(lowBits); - return output; -#endif + return emulated_float64_t(val); } - - static emulated_float64_t createEmulatedFloat64PreserveBitPattern(uint64_t val) + + static emulated_float64_t create(uint32_t val) { - emulated_float64_t output; - output.data = val; - return output; + return emulated_float64_t(impl::promoteToUint64(val)); + } + + static emulated_float64_t create(uint16_t val) + { + return emulated_float64_t(impl::promoteToUint64(val)); + } + + static emulated_float64_t create(float64_t val) + { + return emulated_float64_t(bit_cast(val)); } - // TODO: won't not work for uints with msb of index > 52 -#ifndef __HLSL_VERSION - template<> - static emulated_float64_t create(uint64_t val) + static emulated_float64_t create(float16_t val) { -#ifndef __HLSL_VERSION - const uint64_t msbIndex = nbl::hlsl::findMSB(val); -#else - const uint64_t msbIndex = firstbithigh(val); -#endif - uint64_t exp = ((msbIndex + 1023) << 52) & 0x7FF0000000000000; - uint64_t mantissa = (val << (52 - msbIndex)) & 0x000FFFFFFFFFFFFFull; - emulated_float64_t output; - output.data = exp | mantissa; - return output; + return emulated_float64_t(impl::promoteToUint64(val)); } -#endif - // TODO: temporary, remove -#ifndef __HLSL_VERSION - template<> - static emulated_float64_t create(double val) + + static emulated_float64_t create(float32_t val) + { + return emulated_float64_t(impl::promoteToUint64(val)); + } + + //TODO do i need that? + static emulated_float64_t createEmulatedFloat64PreserveBitPattern(uint64_t val) { emulated_float64_t output; - output.data = reinterpret_cast(val); + output.data = val; return output; } -#endif + + // TODO: won't not work for uints with msb of index > 52 +//#ifndef __HLSL_VERSION +// template<> +// static emulated_float64_t create(uint64_t val) +// { +//#ifndef __HLSL_VERSION +// const uint64_t msbIndex = nbl::hlsl::findMSB(val); +//#else +// const uint64_t msbIndex = firstbithigh(val); +//#endif +// uint64_t exp = ((msbIndex + 1023) << 52) & 0x7FF0000000000000; +// uint64_t mantissa = (val << (52 - msbIndex)) & 0x000FFFFFFFFFFFFFull; +// emulated_float64_t output; +// output.data = exp | mantissa; +// return output; +// } +//#endif // arithmetic operators emulated_float64_t operator+(const emulated_float64_t rhs) @@ -561,12 +581,12 @@ namespace emulated lhsHigh |= 0x00100000u; - nbl::hlsl::uint32_t2 shifted = emulated::impl::shortShift64Left(rhsHigh, rhsLow, 12); + nbl::hlsl::uint32_t2 shifted = impl::shortShift64Left(rhsHigh, rhsLow, 12); rhsHigh = shifted.x; rhsLow = shifted.y; nbl::hlsl::uint32_t4 fracUnpacked = impl::mul64to128(lhsHigh, lhsLow, rhsHigh, rhsLow); - fracUnpacked.xy = emulated::impl::add64(fracUnpacked.x, fracUnpacked.y, lhsHigh, lhsLow); + fracUnpacked.xy = impl::add64(fracUnpacked.x, fracUnpacked.y, lhsHigh, lhsLow); fracUnpacked.z |= uint32_t(fracUnpacked.w != 0u); if (0x00200000u <= fracUnpacked.x) { @@ -596,18 +616,6 @@ namespace emulated bool operator||(const emulated_float64_t rhs) { return bool(data) || bool(rhs.data); } bool operator!() { return !bool(data); } - // conversion operators - operator bool() { return bool(data); } - operator int() { return int(data); } - operator uint32_t() { return uint32_t(data); } - operator uint64_t() 
{ return uint64_t(data); } - operator float() { return float(data); } - //operator min16int() { return min16int(data);} - //operator float64_t() { return float64_t(data); } - //operator half() { return half(data); } - - //explicit operator int() const { return int(data); } - // OMITED OPERATORS // - not implementing bitwise and modulo operators since floating point types doesn't support them // - compound operator overload not supported in HLSL @@ -627,4 +635,54 @@ namespace emulated }; //_NBL_STATIC_INLINE_CONSTEXPR emulated_float64_t EMULATED_FLOAT64_NAN = emulated_float64_t::create(0.0 / 0.0); + +namespace ieee754 +{ + template<> int getExponentBitCnt() { return getExponentBitCnt(); } + template<> int getMantissaBitCnt() { return getMantissaBitCnt(); } + template<> int getExponentBias() { return getExponentBias(); } + template<> unsigned_integer_of_size<8>::type getExponentMask() { return getExponentMask(); } + template<> unsigned_integer_of_size<8>::type getMantissaMask() { return getMantissaMask(); } + template<> + unsigned_integer_of_size<8>::type getSignMask() + { + using AsUint = typename unsigned_integer_of_size::type; + return AsUint(0x1) << (sizeof(float64_t) * 4 - 1); + } + + template <> + uint32_t extractBiasedExponent(emulated_float64_t x) + { + return extractBiasedExponent(x.data); + } + + template <> + int extractExponent(emulated_float64_t x) + { + return extractExponent(x.data); + } + + template <> + emulated_float64_t replaceBiasedExponent(emulated_float64_t x, typename unsigned_integer_of_size::type biasedExp) + { + return emulated_float64_t(replaceBiasedExponent(x.data, biasedExp)); + } + + //// performs no overflow tests, returns x*exp2(n) + template <> + emulated_float64_t fastMulExp2(emulated_float64_t x, int n) + { + return emulated_float64_t(replaceBiasedExponent(x.data, extractBiasedExponent(x) + uint32_t(n))); + } + + template <> + unsigned_integer_of_size::type extractMantissa(emulated_float64_t x) + { + return extractMantissa(x.data); + } } + +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl index 1fbeb337dd..2f540916e5 100644 --- a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl +++ b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl @@ -15,6 +15,81 @@ namespace hlsl namespace glsl { +template +T bitfieldInsert(T base, T insert, int32_t offset, int32_t bits) +{ + NBL_CONSTEXPR T one = typename unsigned_integer_of_size::type(1); + const T mask = (one << bits) - one; + const T shifted_mask = mask << offset; + + insert &= mask; + base &= (~shifted_mask); + base |= (insert << offset); + + return base; +} + +namespace impl +{ + + template + struct bitfieldExtract {}; + + template + struct bitfieldExtract + { + static T __call(T val, uint32_t offsetBits, uint32_t numBits) + { + static_assert(is_integral::value, "T is not an integral type!"); + return val; + } + }; + +#ifndef __HLSL_VERSION + template + T _bitfieldExtract(T val, uint32_t offsetBits, uint32_t numBits) + { + const T mask = (T(0x1) << numBits) - 1; + const T shiftedMask = mask << offsetBits; + return (val & shiftedMask) >> offsetBits; + } +#endif + + template + struct bitfieldExtract + { + static T __call(T val, uint32_t offsetBits, uint32_t numBits) + { +#ifdef __HLSL_VERSION + return spirv::bitFieldSExtract(val, offsetBits, numBits); +#else + return _bitfieldExtract(val, offsetBits, numBits); +#endif + } + }; + + template + struct bitfieldExtract + { + static T __call(T val, uint32_t offsetBits, uint32_t numBits) + { + +#ifdef 
__HLSL_VERSION + return spirv::bitFieldUExtract(val, offsetBits, numBits); +#else + return _bitfieldExtract(val, offsetBits, numBits); +#endif + } + }; + +} + +template +T bitfieldExtract(T val, uint32_t offsetBits, uint32_t numBits) +{ + return impl::bitfieldExtract::value, is_integral::value>::template __call(val, offsetBits, numBits); +} + #ifdef __HLSL_VERSION /** * Generic SPIR-V @@ -112,48 +187,6 @@ void memoryBarrierShared() { spirv::memoryBarrier(spv::ScopeDevice, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsWorkgroupMemoryMask); } -namespace impl -{ - -template -struct bitfieldExtract {}; - -template -struct bitfieldExtract -{ - static T __call( T val, uint32_t offsetBits, uint32_t numBits ) - { - static_assert( is_integral::value, "T is not an integral type!" ); - return val; - } -}; - -template -struct bitfieldExtract -{ - static T __call( T val, uint32_t offsetBits, uint32_t numBits ) - { - return spirv::bitFieldSExtract( val, offsetBits, numBits ); - } -}; - -template -struct bitfieldExtract -{ - static T __call( T val, uint32_t offsetBits, uint32_t numBits ) - { - return spirv::bitFieldUExtract( val, offsetBits, numBits ); - } -}; - -} - -template -T bitfieldExtract( T val, uint32_t offsetBits, uint32_t numBits ) -{ - return impl::bitfieldExtract::value, is_integral::value>::template __call(val,offsetBits,numBits); -} - #endif } diff --git a/include/nbl/builtin/hlsl/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754.hlsl index 55cfb5214f..1951fb33a9 100644 --- a/include/nbl/builtin/hlsl/ieee754.hlsl +++ b/include/nbl/builtin/hlsl/ieee754.hlsl @@ -2,100 +2,94 @@ #define _NBL_BUILTIN_HLSL_IEE754_H_INCLUDED_ #include +#include +#include -namespace nbl::hlsl::ieee754 +namespace nbl { - namespace impl +namespace hlsl +{ +namespace ieee754 +{ +namespace impl +{ + template + NBL_CONSTEXPR_STATIC_INLINE bool isTypeAllowed() + { + return is_same::value || + is_same::value || + is_same::value || + is_same::value || + is_same::value || + is_same::value; + } + + template + typename unsigned_integer_of_size::type castToUintType(T x) + { + using AsUint = typename unsigned_integer_of_size::type; + return bit_cast(x); + } + // to avoid bit cast from uintN_t to uintN_t + template <> unsigned_integer_of_size<2>::type castToUintType(uint16_t x) { return x; } + template <> unsigned_integer_of_size<4>::type castToUintType(uint32_t x) { return x; } + template <> unsigned_integer_of_size<8>::type castToUintType(uint64_t x) { return x; } + + template + T castBackToFloatType(T x) { - template - unsigned_integer_of_size::type castToUintType(T x) - { - using AsUint = typename unsigned_integer_of_size::type; - return bit_cast(x); - } - - // to avoid bit cast from uintN_t to uintN_t - template <> unsigned_integer_of_size<2>::type castToUintType(uint16_t x) { return x; } - template <> unsigned_integer_of_size<4>::type castToUintType(uint32_t x) { return x; } - template <> unsigned_integer_of_size<8>::type castToUintType(uint64_t x) { return x; } - - template - T replaceBiasedExponent(T x, uint32_t biasedExp) - { - static_assert(is_same::value || is_same::value || is_same::value, - "Invalid type! 
Only floating point or unsigned integer types are allowed."); - return bitfield_insert(x, T(biasedExp), getMantissaBitCnt(), getExponentBitCnt()); - } - - template<> - float16_t replaceBiasedExponent(float16_t x, uint32_t biasedExp) - { - return bit_cast(replaceBiasedExponent(bit_cast(x), biasedExp)); - } - - template<> - float32_t replaceBiasedExponent(float32_t x, uint32_t biasedExp) - { - return bit_cast(replaceBiasedExponent(bit_cast(x), biasedExp)); - } - - template<> - float64_t replaceBiasedExponent(float64_t x, uint32_t biasedExp) - { - return bit_cast(replaceBiasedExponent(bit_cast(x), biasedExp)); - } + using AsFloat = typename float_of_size::type; + return bit_cast(x); } + template<> uint16_t castBackToFloatType(uint16_t x) { return x; } + template<> uint32_t castBackToFloatType(uint32_t x) { return x; } + template<> uint64_t castBackToFloatType(uint64_t x) { return x; } +} template int getExponentBitCnt() { return 0xdeadbeefu; } template<> int getExponentBitCnt() { return 5; } - template<> int getExponentBitCnt() { return 5; } template<> int getExponentBitCnt() { return 8; } - template<> int getExponentBitCnt() { return 8; } template<> int getExponentBitCnt() { return 11; } - template<> int getExponentBitCnt() { return 11; } template int getMantissaBitCnt() { return 0xdeadbeefu; } template<> int getMantissaBitCnt() { return 10; } - template<> int getMantissaBitCnt() { return 10; } template<> int getMantissaBitCnt() { return 23; } - template<> int getMantissaBitCnt() { return 23; } template<> int getMantissaBitCnt() { return 52; } - template<> int getMantissaBitCnt() { return 52; } template - int getExponentBias() { return 0xdeadbeefu; } - template<> int getExponentBias() { return 15; } - template<> int getExponentBias() { return 15; } - template<> int getExponentBias() { return 127; } - template<> int getExponentBias() { return 127; } - template<> int getExponentBias() { return 1023; } - template<> int getExponentBias() { return 1023; } + int getExponentBias() + { + //static_assert(impl::isTypeAllowed(), "Invalid type! Only floating point or unsigned integer types are allowed."); + return (0x1 << (getExponentBitCnt::type>() - 1)) - 1; + } + + template + typename unsigned_integer_of_size::type getSignMask() + { + //static_assert(impl::isTypeAllowed(), "Invalid type! 
Only floating point or unsigned integer types are allowed."); + using AsUint = typename unsigned_integer_of_size::type; + return AsUint(0x1) << (sizeof(T) * 4 - 1); + } template - unsigned_integer_of_size::type getExponentMask() { return 0xdeadbeefu; } + typename unsigned_integer_of_size::type getExponentMask() { return 0xdeadbeefu; } template<> unsigned_integer_of_size<2>::type getExponentMask() { return 0x7C00; } - template<> unsigned_integer_of_size<2>::type getExponentMask() { return 0x7C00; } template<> unsigned_integer_of_size<4>::type getExponentMask() { return 0x7F800000u; } - template<> unsigned_integer_of_size<4>::type getExponentMask() { return 0x7F800000u; } template<> unsigned_integer_of_size<8>::type getExponentMask() { return 0x7FF0000000000000ull; } - template<> unsigned_integer_of_size<8>::type getExponentMask() { return 0x7FF0000000000000ull; } template - unsigned_integer_of_size::type getMantissaMask() { return 0xdeadbeefu; } + typename unsigned_integer_of_size::type getMantissaMask() { return 0xdeadbeefu; } template<> unsigned_integer_of_size<2>::type getMantissaMask() { return 0x03FF; } - template<> unsigned_integer_of_size<2>::type getMantissaMask() { return 0x03FF; } template<> unsigned_integer_of_size<4>::type getMantissaMask() { return 0x007FFFFFu; } - template<> unsigned_integer_of_size<4>::type getMantissaMask() { return 0x007FFFFFu; } template<> unsigned_integer_of_size<8>::type getMantissaMask() { return 0x000FFFFFFFFFFFFFull; } - template<> unsigned_integer_of_size<8>::type getMantissaMask() { return 0x000FFFFFFFFFFFFFull; } template uint32_t extractBiasedExponent(T x) { using AsUint = typename unsigned_integer_of_size::type; - return bitfieldExtract(impl::castToUintType(x), getMantissaBitCnt(), getExponentBitCnt()); + return glsl::bitfieldExtract(impl::castToUintType(x), getMantissaBitCnt::type>(), getExponentBitCnt::type>()); } template @@ -105,9 +99,10 @@ namespace nbl::hlsl::ieee754 } template - T replaceBiasedExponent(T x, uint32_t biasedExp) + T replaceBiasedExponent(T x, typename unsigned_integer_of_size::type biasedExp) { - return impl::replaceBiasedExponent(x, biasedExp); + //static_assert(impl::isTypeAllowed(), "Invalid type! 
Only floating point or unsigned integer types are allowed."); + return impl::castBackToFloatType(glsl::bitfieldInsert(impl::castToUintType(x), biasedExp, getMantissaBitCnt(), getExponentBitCnt())); } // performs no overflow tests, returns x*exp2(n) @@ -118,11 +113,13 @@ namespace nbl::hlsl::ieee754 } template - unsigned_integer_of_size::type extractMantissa(T x) + typename unsigned_integer_of_size::type extractMantissa(T x) { using AsUint = typename unsigned_integer_of_size::type; - return (impl::castToUintType(x) & getMantissaMask()); + return impl::castToUintType(x) & getMantissaMask::type>(); } } +} +} #endif \ No newline at end of file diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index 18ee1383ba..fe9c8d4abf 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -229,6 +229,8 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/normalization/shared_nor LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/macros.h") #emulated LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated_float64_t.hlsl") +#utility +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ieee754.hlsl") #spirv intrinsics LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/core.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/fragment_shader_pixel_interlock.hlsl") From 4a937b958d16ee7609bc23aa2b8512ca921362b1 Mon Sep 17 00:00:00 2001 From: Przemek Date: Fri, 21 Jun 2024 20:03:51 +0200 Subject: [PATCH 018/358] Saving work --- examples_tests | 2 +- include/nbl/builtin/hlsl/bit.hlsl | 3 +- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 46 +++++++++---------- .../nbl/builtin/hlsl/glsl_compat/core.hlsl | 3 +- include/nbl/builtin/hlsl/ieee754.hlsl | 28 +++++++---- include/nbl/builtin/hlsl/type_traits.hlsl | 2 +- 6 files changed, 47 insertions(+), 37 deletions(-) diff --git a/examples_tests b/examples_tests index 9874800e41..f17208a703 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 9874800e41d75327724b5ed6cf4b11cabc7eedd7 +Subproject commit f17208a7039e30af359d0e99cf896f2d013670ff diff --git a/include/nbl/builtin/hlsl/bit.hlsl b/include/nbl/builtin/hlsl/bit.hlsl index e1f1117af6..a94103e515 100644 --- a/include/nbl/builtin/hlsl/bit.hlsl +++ b/include/nbl/builtin/hlsl/bit.hlsl @@ -36,7 +36,8 @@ namespace hlsl template To bit_cast(From val) { - static_assert(sizeof(To) <= sizeof(From)); + // TODO: fix + //static_assert(sizeof(To) <= sizeof(From)); return spirv::bitcast(val); } diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index fe37f001be..f8f3b202a2 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -38,12 +38,14 @@ namespace impl template uint64_t promoteToUint64(T val) { - using AsFloat = unsigned_integer_of_size; + using AsFloat = typename float_of_size::type; uint64_t asUint = ieee754::impl::castToUintType(val); - const uint64_t sign = (uint64_t(ieee754::getSignMask()) | asUint) << (sizeof(uint64_t) - sizeof(T) - 2); - const uint64_t exp = (uint64_t(ieee754::getExponentMask()) | asUint) << (ieee754::getMantissaBitCnt() - ieee754::getMantissaBitCnt() + ieee754::getExponentBitCnt()); - const uint64_t mantissa = (uint64_t(ieee754::getMantissaMask()) | asUint) << (ieee754::getMantissaBitCnt() - ieee754::getMantissaBitCnt()); + const uint64_t sign = (uint64_t(ieee754::getSignMask()) & asUint) << (sizeof(float64_t) - sizeof(T)); + const int64_t 
newExponent = ieee754::extractExponent(val) + ieee754::getExponentBias(); + + const uint64_t exp = (uint64_t(ieee754::extractExponent(val)) + ieee754::getExponentBias()) << (ieee754::getMantissaBitCnt()); + const uint64_t mantissa = (uint64_t(ieee754::getMantissaMask()) & asUint) << (ieee754::getMantissaBitCnt() - ieee754::getMantissaBitCnt()); return sign | exp | mantissa; }; @@ -351,6 +353,9 @@ namespace impl static emulated_float64_t create(uint32_t val) { +#ifndef __HLSL_VERSION + std::cout << val; +#endif return emulated_float64_t(impl::promoteToUint64(val)); } @@ -373,14 +378,6 @@ namespace impl { return emulated_float64_t(impl::promoteToUint64(val)); } - - //TODO do i need that? - static emulated_float64_t createEmulatedFloat64PreserveBitPattern(uint64_t val) - { - emulated_float64_t output; - output.data = val; - return output; - } // TODO: won't not work for uints with msb of index > 52 //#ifndef __HLSL_VERSION @@ -403,7 +400,7 @@ namespace impl // arithmetic operators emulated_float64_t operator+(const emulated_float64_t rhs) { - emulated_float64_t retval = createEmulatedFloat64PreserveBitPattern(0u); + emulated_float64_t retval = create(0u); uint32_t lhsSign = uint32_t((data & 0x8000000000000000ull) >> 32); uint32_t rhsSign = uint32_t((rhs.data & 0x8000000000000000ull) >> 32); @@ -428,12 +425,12 @@ namespace impl //if (lhsExp == 0x7FF) //{ // bool propagate = (lhsMantissa | rhsMantissa) != 0u; - // return createEmulatedFloat64PreserveBitPattern(LERP(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); + // return create(LERP(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); //} frac.xy = impl::add64(lhsHigh, lhsLow, rhsHigh, rhsLow); if (lhsExp == 0) - return createEmulatedFloat64PreserveBitPattern(impl::packFloat64(lhsSign, 0, frac.x, frac.y)); + return create(impl::packFloat64(lhsSign, 0, frac.x, frac.y)); frac.z = 0u; frac.x |= 0x00200000u; exp = lhsExp; @@ -446,13 +443,12 @@ namespace impl EXCHANGE(lhsHigh, rhsHigh); EXCHANGE(lhsLow, rhsLow); EXCHANGE(lhsExp, rhsExp); - EXCHANGE(lhsExp, rhsExp); } if (lhsExp == 0x7FF) { bool propagate = (lhsHigh | lhsLow) != 0u; - return createEmulatedFloat64PreserveBitPattern(LERP(0x7FF0000000000000ull | (uint64_t(lhsSign) << 32), impl::propagateFloat64NaN(data, rhs.data), propagate)); + return create(LERP(0x7FF0000000000000ull | (uint64_t(lhsSign) << 32), impl::propagateFloat64NaN(data, rhs.data), propagate)); } expDiff = LERP(ABS(expDiff), ABS(expDiff) - 1, rhsExp == 0); @@ -472,11 +468,11 @@ namespace impl ++exp; } - return createEmulatedFloat64PreserveBitPattern(impl::roundAndPackFloat64(lhsSign, exp, frac.x, frac.y, frac.z)); + return create(impl::roundAndPackFloat64(lhsSign, exp, frac.x, frac.y, frac.z)); } // cannot happen but compiler cries about not every path returning value - return createEmulatedFloat64PreserveBitPattern(0xdeadbeefbadcaffeull); + return create(0xdeadbeefbadcaffeull); } else { @@ -516,7 +512,7 @@ namespace impl frac.xy = impl::sub64(lhsHigh, lhsLow, rhsHigh, rhsLow); exp = lhsExp; --exp; - return createEmulatedFloat64PreserveBitPattern(impl::normalizeRoundAndPackFloat64(lhsSign, exp - 10, frac.x, frac.y)); + return create(impl::normalizeRoundAndPackFloat64(lhsSign, exp - 10, frac.x, frac.y)); } //if (lhsExp == 0x7FF) //{ @@ -552,13 +548,13 @@ namespace impl lhsSign ^= signOfDifference; uint64_t retval_0 = impl::packFloat64(uint32_t(FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) << 31, 0, 0u, 0u); uint64_t retval_1 = impl::normalizeRoundAndPackFloat64(lhsSign, exp - 11, frac.x, frac.y); - return 
createEmulatedFloat64PreserveBitPattern(LERP(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); + return create(LERP(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); } } emulated_float64_t operator-(emulated_float64_t rhs) { - emulated_float64_t lhs = createEmulatedFloat64PreserveBitPattern(data); + emulated_float64_t lhs = create(data); emulated_float64_t rhsFlipped = rhs.flipSign(); return lhs + rhsFlipped; @@ -594,13 +590,13 @@ namespace impl ++exp; } - return createEmulatedFloat64PreserveBitPattern(impl::roundAndPackFloat64(sign, exp, fracUnpacked.x, fracUnpacked.y, fracUnpacked.z)); + return create(impl::roundAndPackFloat64(sign, exp, fracUnpacked.x, fracUnpacked.y, fracUnpacked.z)); } // TODO emulated_float64_t operator/(const emulated_float64_t rhs) { - return createEmulatedFloat64PreserveBitPattern(0xdeadbeefbadcaffeull); + return create(0xdeadbeefbadcaffeull); } // relational operators @@ -625,7 +621,7 @@ namespace impl emulated_float64_t flipSign() { const uint64_t flippedSign = ((~data) & 0x8000000000000000ull); - return createEmulatedFloat64PreserveBitPattern(flippedSign | (data & 0x7FFFFFFFFFFFFFFFull)); + return create(flippedSign | (data & 0x7FFFFFFFFFFFFFFFull)); } bool isNaN() diff --git a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl index 2337618df1..b9f24e65be 100644 --- a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl +++ b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl @@ -87,7 +87,8 @@ namespace impl template T bitfieldExtract(T val, uint32_t offsetBits, uint32_t numBits) { - return impl::bitfieldExtract::value, is_integral::value>::template __call(val, offsetBits, numBits); + impl::bitfieldExtract::value, is_integral::value> extractStruct; + return extractStruct.__call(val, offsetBits, numBits); } #ifdef __HLSL_VERSION diff --git a/include/nbl/builtin/hlsl/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754.hlsl index 1951fb33a9..2cd74caa54 100644 --- a/include/nbl/builtin/hlsl/ieee754.hlsl +++ b/include/nbl/builtin/hlsl/ieee754.hlsl @@ -5,6 +5,18 @@ #include #include +// TODO: delete +#ifdef __HLSL_VERSION +#define staticAssertTmp(...) ; +#else +void dbgBreakIf(bool condition) +{ + if (!condition) + __debugbreak(); +} +#define staticAssertTmp(x, ...) dbgBreakIf(x); +#endif + namespace nbl { namespace hlsl @@ -47,13 +59,13 @@ namespace impl } template - int getExponentBitCnt() { return 0xdeadbeefu; } + int getExponentBitCnt() { staticAssertTmp(false); return 0xdeadbeefu; } template<> int getExponentBitCnt() { return 5; } template<> int getExponentBitCnt() { return 8; } template<> int getExponentBitCnt() { return 11; } template - int getMantissaBitCnt() { return 0xdeadbeefu; } + int getMantissaBitCnt() { staticAssertTmp(false); return 0xdeadbeefu; /*TODO: add static assert fail to every 0xdeadbeef, fix all static asserts*/ } template<> int getMantissaBitCnt() { return 10; } template<> int getMantissaBitCnt() { return 23; } template<> int getMantissaBitCnt() { return 52; } @@ -61,26 +73,26 @@ namespace impl template int getExponentBias() { - //static_assert(impl::isTypeAllowed(), "Invalid type! Only floating point or unsigned integer types are allowed."); + staticAssertTmp(impl::isTypeAllowed(), "Invalid type! Only floating point or unsigned integer types are allowed."); return (0x1 << (getExponentBitCnt::type>() - 1)) - 1; } template typename unsigned_integer_of_size::type getSignMask() { - //static_assert(impl::isTypeAllowed(), "Invalid type! 
Only floating point or unsigned integer types are allowed."); + staticAssertTmp(impl::isTypeAllowed(), "Invalid type! Only floating point or unsigned integer types are allowed."); using AsUint = typename unsigned_integer_of_size::type; return AsUint(0x1) << (sizeof(T) * 4 - 1); } template - typename unsigned_integer_of_size::type getExponentMask() { return 0xdeadbeefu; } + typename unsigned_integer_of_size::type getExponentMask() { staticAssertTmp(false); return 0xdeadbeefu; } template<> unsigned_integer_of_size<2>::type getExponentMask() { return 0x7C00; } template<> unsigned_integer_of_size<4>::type getExponentMask() { return 0x7F800000u; } template<> unsigned_integer_of_size<8>::type getExponentMask() { return 0x7FF0000000000000ull; } template - typename unsigned_integer_of_size::type getMantissaMask() { return 0xdeadbeefu; } + typename unsigned_integer_of_size::type getMantissaMask() { staticAssertTmp(false); return 0xdeadbeefu; } template<> unsigned_integer_of_size<2>::type getMantissaMask() { return 0x03FF; } template<> unsigned_integer_of_size<4>::type getMantissaMask() { return 0x007FFFFFu; } template<> unsigned_integer_of_size<8>::type getMantissaMask() { return 0x000FFFFFFFFFFFFFull; } @@ -95,13 +107,13 @@ namespace impl template int extractExponent(T x) { - return int(extractBiasedExponent(x) - getExponentBias()); + return int(extractBiasedExponent(x)) - int(getExponentBias()); } template T replaceBiasedExponent(T x, typename unsigned_integer_of_size::type biasedExp) { - //static_assert(impl::isTypeAllowed(), "Invalid type! Only floating point or unsigned integer types are allowed."); + staticAssertTmp(impl::isTypeAllowed(), "Invalid type! Only floating point or unsigned integer types are allowed."); return impl::castBackToFloatType(glsl::bitfieldInsert(impl::castToUintType(x), biasedExp, getMantissaBitCnt(), getExponentBitCnt())); } diff --git a/include/nbl/builtin/hlsl/type_traits.hlsl b/include/nbl/builtin/hlsl/type_traits.hlsl index 286f86d3a0..8dab4bcb1c 100644 --- a/include/nbl/builtin/hlsl/type_traits.hlsl +++ b/include/nbl/builtin/hlsl/type_traits.hlsl @@ -698,7 +698,7 @@ struct float_of_size<4> template<> struct float_of_size<8> { - using type = uint64_t; + using type = float64_t; }; } From 669983fddc8cf281f434a80ea21e4d9a60f5cc38 Mon Sep 17 00:00:00 2001 From: Przemek Date: Mon, 8 Jul 2024 16:58:32 +0200 Subject: [PATCH 019/358] Improvements --- include/nbl/builtin/hlsl/algorithm.hlsl | 79 ++ .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 827 ++++++++++-------- include/nbl/builtin/hlsl/ieee754.hlsl | 152 ++-- include/nbl/builtin/hlsl/tgmath.hlsl | 25 + src/nbl/builtin/CMakeLists.txt | 1 + 5 files changed, 661 insertions(+), 423 deletions(-) create mode 100644 include/nbl/builtin/hlsl/tgmath.hlsl diff --git a/include/nbl/builtin/hlsl/algorithm.hlsl b/include/nbl/builtin/hlsl/algorithm.hlsl index 757105081b..564ef59f20 100644 --- a/include/nbl/builtin/hlsl/algorithm.hlsl +++ b/include/nbl/builtin/hlsl/algorithm.hlsl @@ -11,6 +11,84 @@ namespace nbl namespace hlsl { +namespace impl +{ +#ifdef __HLSL_VERSION + + // TODO: use structs + + template + NBL_CONSTEXPR_STATIC_INLINE void swap(NBL_REF_ARG(T) lhs, NBL_REF_ARG(T) rhs) + { + T tmp = lhs; + lhs = rhs; + rhs = tmp; + } + + template<> + NBL_CONSTEXPR_STATIC_INLINE void swap(NBL_REF_ARG(uint16_t) lhs, NBL_REF_ARG(uint16_t) rhs) + { + lhs ^= rhs; + rhs ^= lhs; + lhs ^= rhs; + } + + template<> + NBL_CONSTEXPR_STATIC_INLINE void swap(NBL_REF_ARG(uint32_t) lhs, NBL_REF_ARG(uint32_t) rhs) + { + lhs ^= rhs; + rhs ^= lhs; + lhs 
^= rhs; + } + + template<> + NBL_CONSTEXPR_STATIC_INLINE void swap(NBL_REF_ARG(uint64_t) lhs, NBL_REF_ARG(uint64_t) rhs) + { + lhs ^= rhs; + rhs ^= lhs; + lhs ^= rhs; + } + + template<> + NBL_CONSTEXPR_STATIC_INLINE void swap(NBL_REF_ARG(int16_t) lhs, NBL_REF_ARG(int16_t) rhs) + { + lhs ^= rhs; + rhs ^= lhs; + lhs ^= rhs; + } + + template<> + NBL_CONSTEXPR_STATIC_INLINE void swap(NBL_REF_ARG(int32_t) lhs, NBL_REF_ARG(int32_t) rhs) + { + lhs ^= rhs; + rhs ^= lhs; + lhs ^= rhs; + } + + template<> + NBL_CONSTEXPR_STATIC_INLINE void swap(NBL_REF_ARG(int64_t) lhs, NBL_REF_ARG(int64_t) rhs) + { + lhs ^= rhs; + rhs ^= lhs; + lhs ^= rhs; + } +#else + template + NBL_CONSTEXPR_STATIC_INLINE void swap(NBL_REF_ARG(T) lhs, NBL_REF_ARG(T) rhs) + { + std::swap(lhs, rhs); + } +#endif +} + +template +NBL_CONSTEXPR_STATIC_INLINE void swap(NBL_REF_ARG(T) lhs, NBL_REF_ARG(T) rhs) +{ + impl::swap(lhs, rhs); +} + + +#ifdef __HLSL_VERSION namespace impl { @@ -146,6 +224,7 @@ uint upper_bound(inout Accessor accessor, const uint begin, const uint end, cons return impl::upper_bound(accessor,begin,end,value); } +#endif } } diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index f8f3b202a2..a631b0ff03 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -3,32 +3,44 @@ #include #include - -#ifdef __HLSL_VERSION -#define LERP lerp -#else -#define LERP nbl::hlsl::lerp -#endif - -#ifdef __HLSL_VERSION -#define ABS abs -#else -#define ABS std::abs -#endif - -// TODO: inline function -#define EXCHANGE(a, b) \ - do { \ - a ^= b; \ - b ^= a; \ - a ^= b; \ - } while (false) +#include +#include + +// TODO: when it will be possible, use this unions wherever they fit: +/* +* union Mantissa +* { +* struct +* { +* uint32_t highBits; +* uint64_t lowBits; +* }; +* +* uint32_t2 packed; +* }; +* +*/ + +/* +* union Mantissa +* { +* struct +* { +* uint64_t lhs; +* uint64_t rhs; +* }; +* +* uint32_t4 packed; +* }; +* +*/ #define FLOAT_ROUND_NEAREST_EVEN 0 #define FLOAT_ROUND_TO_ZERO 1 #define FLOAT_ROUND_DOWN 2 #define FLOAT_ROUND_UP 3 #define FLOAT_ROUNDING_MODE FLOAT_ROUND_NEAREST_EVEN + namespace nbl { namespace hlsl @@ -41,11 +53,11 @@ namespace impl using AsFloat = typename float_of_size::type; uint64_t asUint = ieee754::impl::castToUintType(val); - const uint64_t sign = (uint64_t(ieee754::getSignMask()) & asUint) << (sizeof(float64_t) - sizeof(T)); - const int64_t newExponent = ieee754::extractExponent(val) + ieee754::getExponentBias(); + const uint64_t sign = (uint64_t(ieee754::traits::signMask) & asUint) << (sizeof(float64_t) - sizeof(T)); + const int64_t newExponent = ieee754::extractExponent(val) + ieee754::traits::exponentBias; - const uint64_t exp = (uint64_t(ieee754::extractExponent(val)) + ieee754::getExponentBias()) << (ieee754::getMantissaBitCnt()); - const uint64_t mantissa = (uint64_t(ieee754::getMantissaMask()) & asUint) << (ieee754::getMantissaBitCnt() - ieee754::getMantissaBitCnt()); + const uint64_t exp = (uint64_t(ieee754::extractExponent(val)) + ieee754::traits::exponentBias) << (ieee754::traits::mantissaBitCnt); + const uint64_t mantissa = (uint64_t(ieee754::traits::mantissaMask) & asUint) << (ieee754::traits::exponentBias - ieee754::traits::mantissaBitCnt); return sign | exp | mantissa; }; @@ -60,10 +72,42 @@ namespace impl output.y = uint32_t(product & 0x00000000FFFFFFFFull); return output; } - - bool isNaN64(uint64_t val) + + uint64_t propagateFloat64NaN(uint64_t lhs, uint64_t rhs) 
{ - return bool((0x7FF0000000000000ull & val) && (0x000FFFFFFFFFFFFFull & val)); +#if defined RELAXED_NAN_PROPAGATION + return lhs | rhs; +#else + const bool lhsIsNaN = isnan(bit_cast(lhs)); + const bool rhsIsNaN = isnan(bit_cast(rhs)); + lhs |= 0x0000000000080000ull; + rhs |= 0x0000000000080000ull; + + return lerp(rhs, lerp(lhs, rhs, rhsIsNaN), lhsIsNaN); +#endif + } + + uint64_t packFloat64(uint32_t zSign, int zExp, uint32_t zFrac0, uint32_t zFrac1) + { + nbl::hlsl::uint32_t2 z; + + z.x = zSign + (uint32_t(zExp) << 20) + zFrac0; + z.y = zFrac1; + + uint64_t output = 0u; + output |= (uint64_t(z.x) << 32) & 0xFFFFFFFF00000000ull; + output |= uint64_t(z.y); + return output; + } + + uint32_t2 packUint64(uint64_t val) + { + return uint32_t2((val & 0xFFFFFFFF00000000ull) >> 32, val & 0x00000000FFFFFFFFull); + } + + uint64_t unpackUint64(uint32_t2 val) + { + return ((uint64_t(val.x) & 0x00000000FFFFFFFFull) << 32) | uint64_t(val.y); } nbl::hlsl::uint32_t2 add64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) @@ -95,248 +139,233 @@ namespace impl #endif } - uint64_t propagateFloat64NaN(uint64_t a, uint64_t b) - { - #if defined RELAXED_NAN_PROPAGATION - return a | b; - #else - - bool aIsNaN = isNaN64(a); - bool bIsNaN = isNaN64(b); - a |= 0x0008000000000000ull; - b |= 0x0008000000000000ull; - - // TODO: - //return LERP(b, LERP(a, b, nbl::hlsl::float32_t2(bIsNaN, bIsNaN)), nbl::hlsl::float32_t2(aIsNaN, aIsNaN)); - return 0xdeadbeefbadcaffeull; - #endif - } - - nbl::hlsl::uint32_t2 shortShift64Left(uint32_t a0, uint32_t a1, int count) - { - nbl::hlsl::uint32_t2 output; - output.y = a1 << count; - output.x = LERP((a0 << count | (a1 >> ((-count) & 31))), a0, count == 0); - - return output; - }; - - nbl::hlsl::uint32_t2 shift64RightJamming(uint32_t a0, uint32_t a1, int count) + uint32_t2 shift64RightJamming(uint32_t2 val, int count) { - nbl::hlsl::uint32_t2 output; + uint32_t2 output; const int negCount = (-count) & 31; - output.x = LERP(0u, a0, count == 0); - output.x = LERP(output.x, (a0 >> count), count < 32); + output.x = lerp(0u, val.x, count == 0); + output.x = lerp(output.x, (val.x >> count), count < 32); - output.y = uint32_t((a0 | a1) != 0u); /* count >= 64 */ - uint32_t z1_lt64 = (a0>>(count & 31)) | uint32_t(((a0<>count) | uint32_t ((a1<= 64 */ + uint32_t z1_lt64 = (val.x>>(count & 31)) | uint32_t(((val.x<>count) | uint32_t((val.y<> (count & 31)), count < 64); - output.y = LERP(output.y, (a0<>count), count < 32); + output.y = lerp(0u, (val.x >> (count & 31)), count < 64); + output.y = lerp(output.y, (val.x << negCount) | (val.y >> count), count < 32); - a2 = LERP(a2 | a1, a2, count < 32); - output.x = LERP(output.x, a0 >> count, count < 32); - output.z |= uint32_t(a2 != 0u); + val.z = lerp(val.z | val.y, val.z, count < 32); + output.x = lerp(output.x, val.x >> count, count < 32); + output.z |= uint32_t(val.z != 0u); - output.x = LERP(output.x, 0u, (count == 32)); - output.y = LERP(output.y, a0, (count == 32)); - output.z = LERP(output.z, a1, (count == 32)); - output.x = LERP(output.x, a0, (count == 0)); - output.y = LERP(output.y, a1, (count == 0)); - output.z = LERP(output.z, a2, (count == 0)); + output.x = lerp(output.x, 0u, (count == 32)); + output.y = lerp(output.y, val.x, (count == 32)); + output.z = lerp(output.z, val.y, (count == 32)); + output.x = lerp(output.x, val.x, (count == 0)); + output.y = lerp(output.y, val.y, (count == 0)); + output.z = lerp(output.z, val.z, (count == 0)); return output; } - - - uint64_t packFloat64(uint32_t zSign, int zExp, uint32_t zFrac0, uint32_t 
zFrac1) + + uint64_t shortShift64Left(uint64_t val, int count) { - nbl::hlsl::uint32_t2 z; - - z.x = zSign + (uint32_t(zExp) << 20) + zFrac0; - z.y = zFrac1; - - uint64_t output = 0u; - output |= (uint64_t(z.x) << 32) & 0xFFFFFFFF00000000ull; - output |= uint64_t(z.y); - return output; + const uint32_t2 packed = packUint64(val); + + nbl::hlsl::uint32_t2 output; + output.y = packed.y << count; + // TODO: fix + output.x = lerp((packed.x << count | (packed.y >> ((-count) & 31))), packed.x, count == 0); + + // y = 3092377600 + // x = 2119009566 + return unpackUint64(output); + }; + + uint64_t assembleFloat64(uint64_t signShifted, uint64_t expShifted, uint64_t mantissa) + { + return signShifted + expShifted + mantissa; } - - uint64_t roundAndPackFloat64(uint32_t zSign, int zExp, uint32_t zFrac0, uint32_t zFrac1, uint32_t zFrac2) + uint64_t roundAndPackFloat64(uint64_t zSign, int zExp, uint32_t3 mantissaExtended) { bool roundNearestEven; bool increment; roundNearestEven = true; - increment = int(zFrac2) < 0; + increment = int(mantissaExtended.z) < 0; if (!roundNearestEven) { - if (false) //(FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) - { - increment = false; - } - else - { - if (false) //(zSign != 0u) - { - //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && - // (zFrac2 != 0u); - } - else - { - //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && - // (zFrac2 != 0u); - } - } + if (false) //(FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) + { + increment = false; + } + else + { + if (false) //(zSign != 0u) + { + //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && + // (zFrac2 != 0u); + } + else + { + //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && + // (zFrac2 != 0u); + } + } } if (0x7FD <= zExp) { - if ((0x7FD < zExp) || ((zExp == 0x7FD) && (0x001FFFFFu == zFrac0 && 0xFFFFFFFFu == zFrac1) && increment)) - { - if (false) // ((FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) || - // ((zSign != 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP)) || - // ((zSign == 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN))) - { - return packFloat64(zSign, 0x7FE, 0x000FFFFFu, 0xFFFFFFFFu); - } - - return packFloat64(zSign, 0x7FF, 0u, 0u); + if ((0x7FD < zExp) || ((zExp == 0x7FD) && (0x001FFFFFu == mantissaExtended.x && 0xFFFFFFFFu == mantissaExtended.y) && increment)) + { + if (false) // ((FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) || + // ((zSign != 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP)) || + // ((zSign == 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN))) + { + return packFloat64(zSign, 0x7FE, 0x000FFFFFu, 0xFFFFFFFFu); + } + + return packFloat64(zSign, 0x7FF, 0u, 0u); } } if (zExp < 0) { - nbl::hlsl::uint32_t3 shifted = shift64ExtraRightJamming(zFrac0, zFrac1, zFrac2, -zExp); - zFrac0 = shifted.x; - zFrac1 = shifted.y; - zFrac2 = shifted.z; - zExp = 0; - - if (roundNearestEven) - { - increment = zFrac2 < 0u; - } - else - { - if (zSign != 0u) - { - increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && (zFrac2 != 0u); - } - else - { - increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && (zFrac2 != 0u); - } - } + mantissaExtended = shift64ExtraRightJamming(mantissaExtended, -zExp); + zExp = 0; + + if (roundNearestEven) + { + increment = mantissaExtended.z < 0u; + } + else + { + if (zSign != 0u) + { + increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && (mantissaExtended.z != 0u); + } + else + { + increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && (mantissaExtended.z != 0u); + } + } } if (increment) { - nbl::hlsl::uint32_t2 added = add64(zFrac0, zFrac1, 0u, 1u); - zFrac0 = 
added.x; - zFrac1 = added.y; - zFrac1 &= ~((zFrac2 + uint32_t(zFrac2 == 0u)) & uint32_t(roundNearestEven)); + const uint64_t added = impl::unpackUint64(uint32_t2(mantissaExtended.xy)) + 1ull; + mantissaExtended.xy = packUint64(added); + mantissaExtended.y &= ~((mantissaExtended.z + uint32_t(mantissaExtended.z == 0u)) & uint32_t(roundNearestEven)); } else { - zExp = LERP(zExp, 0, (zFrac0 | zFrac1) == 0u); + zExp = lerp(zExp, 0, (mantissaExtended.x | mantissaExtended.y) == 0u); } - return packFloat64(zSign, zExp, zFrac0, zFrac1); + return assembleFloat64(zSign, uint64_t(zExp) << ieee754::traits::mantissaBitCnt, unpackUint64(mantissaExtended.xy)); } - uint64_t normalizeRoundAndPackFloat64(uint32_t sign, int exp, uint32_t frac0, uint32_t frac1) + uint64_t normalizeRoundAndPackFloat64(uint64_t sign, int exp, uint32_t frac0, uint32_t frac1) { int shiftCount; nbl::hlsl::uint32_t3 frac = nbl::hlsl::uint32_t3(frac0, frac1, 0u); if (frac.x == 0u) { - exp -= 32; - frac.x = frac.y; - frac.y = 0u; + exp -= 32; + frac.x = frac.y; + frac.y = 0u; } shiftCount = countLeadingZeros32(frac.x) - 11; if (0 <= shiftCount) { - frac.xy = shortShift64Left(frac.x, frac.y, shiftCount); + frac.xy = shortShift64Left(unpackUint64(frac.xy), shiftCount); } else { - frac.xyz = shift64ExtraRightJamming(frac.x, frac.y, 0u, -shiftCount); + frac.xyz = shift64ExtraRightJamming(uint32_t3(frac.xy, 0), -shiftCount); } exp -= shiftCount; - return roundAndPackFloat64(sign, exp, frac.x, frac.y, frac.z); + return roundAndPackFloat64(sign, exp, frac); } - - static const uint64_t SIGN_MASK = 0x8000000000000000ull; - static const uint64_t EXP_MASK = 0x7FF0000000000000ull; - static const uint64_t MANTISA_MASK = 0x000FFFFFFFFFFFFFull; + + void normalizeFloat64Subnormal(uint64_t mantissa, + NBL_REF_ARG(int) outExp, + NBL_REF_ARG(uint64_t) outMantissa) + { + uint32_t2 mantissaPacked = packUint64(mantissa); + int shiftCount; + uint32_t2 temp; + shiftCount = countLeadingZeros32(lerp(mantissaPacked.x, mantissaPacked.y, mantissaPacked.x == 0u)) - 11; + outExp = lerp(1 - shiftCount, -shiftCount - 31, mantissaPacked.x == 0u); + + temp.x = lerp(mantissaPacked.y << shiftCount, mantissaPacked.y >> (-shiftCount), shiftCount < 0); + temp.y = lerp(0u, mantissaPacked.y << (shiftCount & 31), shiftCount < 0); + + shortShift64Left(impl::unpackUint64(mantissaPacked), shiftCount); + + outMantissa = lerp(outMantissa, unpackUint64(temp), mantissaPacked.x == 0); + } + } struct emulated_float64_t @@ -346,144 +375,130 @@ namespace impl storage_t data; // constructors - static emulated_float64_t create(uint64_t val) + /*static emulated_float64_t create(uint16_t val) { - return emulated_float64_t(val); + return emulated_float64_t(bit_cast(float64_t(val))); + }*/ + + static emulated_float64_t create(int32_t val) + { + return emulated_float64_t(bit_cast(float64_t(val))); + } + + static emulated_float64_t create(int64_t val) + { + return emulated_float64_t(bit_cast(float64_t(val))); } static emulated_float64_t create(uint32_t val) { -#ifndef __HLSL_VERSION - std::cout << val; -#endif - return emulated_float64_t(impl::promoteToUint64(val)); + return emulated_float64_t(bit_cast(float64_t(val))); } - static emulated_float64_t create(uint16_t val) + static emulated_float64_t create(uint64_t val) { - return emulated_float64_t(impl::promoteToUint64(val)); + return emulated_float64_t(bit_cast(float64_t(val))); } static emulated_float64_t create(float64_t val) { - return emulated_float64_t(bit_cast(val)); + return emulated_float64_t(bit_cast(val)); } - static 
emulated_float64_t create(float16_t val) + // TODO: unresolved external symbol imath_half_to_float_table + /*static emulated_float64_t create(float16_t val) { - return emulated_float64_t(impl::promoteToUint64(val)); - } + return emulated_float64_t(bit_cast(float64_t(val))); + }*/ static emulated_float64_t create(float32_t val) { - return emulated_float64_t(impl::promoteToUint64(val)); + return emulated_float64_t(bit_cast(float64_t(val))); + } + + static emulated_float64_t createPreserveBitPattern(uint64_t val) + { + return emulated_float64_t(val); } - - // TODO: won't not work for uints with msb of index > 52 -//#ifndef __HLSL_VERSION -// template<> -// static emulated_float64_t create(uint64_t val) -// { -//#ifndef __HLSL_VERSION -// const uint64_t msbIndex = nbl::hlsl::findMSB(val); -//#else -// const uint64_t msbIndex = firstbithigh(val); -//#endif -// uint64_t exp = ((msbIndex + 1023) << 52) & 0x7FF0000000000000; -// uint64_t mantissa = (val << (52 - msbIndex)) & 0x000FFFFFFFFFFFFFull; -// emulated_float64_t output; -// output.data = exp | mantissa; -// return output; -// } -//#endif // arithmetic operators emulated_float64_t operator+(const emulated_float64_t rhs) { - emulated_float64_t retval = create(0u); - - uint32_t lhsSign = uint32_t((data & 0x8000000000000000ull) >> 32); - uint32_t rhsSign = uint32_t((rhs.data & 0x8000000000000000ull) >> 32); - - uint32_t lhsLow = uint32_t(data & 0x00000000FFFFFFFFull); - uint32_t rhsLow = uint32_t(rhs.data & 0x00000000FFFFFFFFull); - uint32_t lhsHigh = uint32_t((data & 0x000FFFFF00000000ull) >> 32); - uint32_t rhsHigh = uint32_t((rhs.data & 0x000FFFFF00000000ull) >> 32); - - int lhsExp = int((data >> 52) & 0x7FFull); - int rhsExp = int((rhs.data >> 52) & 0x7FFull); + emulated_float64_t retval = createPreserveBitPattern(0u); + + uint64_t mantissa; + uint32_t3 mantissaExtended; + int biasedExp; + + uint64_t lhsSign = data & ieee754::traits::signMask; + uint64_t rhsSign = rhs.data & ieee754::traits::signMask; + uint64_t lhsMantissa = ieee754::extractMantissa(data); + uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); + int lhsBiasedExp = ieee754::extractBiasedExponent(data); + int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); - int expDiff = lhsExp - rhsExp; + int expDiff = lhsBiasedExp - rhsBiasedExp; if (lhsSign == rhsSign) { - nbl::hlsl::uint32_t3 frac; - int exp; - if (expDiff == 0) { - //if (lhsExp == 0x7FF) - //{ - // bool propagate = (lhsMantissa | rhsMantissa) != 0u; - // return create(LERP(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); - //} + //if (lhsExp == 0x7FF) + //{ + // bool propagate = (lhsMantissa | rhsMantissa) != 0u; + // return createPreserveBitPattern(lerp(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); + //} - frac.xy = impl::add64(lhsHigh, lhsLow, rhsHigh, rhsLow); - if (lhsExp == 0) - return create(impl::packFloat64(lhsSign, 0, frac.x, frac.y)); - frac.z = 0u; - frac.x |= 0x00200000u; - exp = lhsExp; - frac = impl::shift64ExtraRightJamming(frac.x, frac.y, frac.z, 1); + mantissa = lhsMantissa + rhsMantissa; + if (lhsBiasedExp == 0) + return createPreserveBitPattern(impl::assembleFloat64(lhsSign, 0, mantissa)); + mantissaExtended.xy = impl::packUint64(mantissa); + mantissaExtended.x |= 0x00200000u; + mantissaExtended.z = 0u; + biasedExp = lhsBiasedExp; + + mantissaExtended = impl::shift64ExtraRightJamming(mantissaExtended, 1); } else { if (expDiff < 0) { - EXCHANGE(lhsHigh, rhsHigh); - EXCHANGE(lhsLow, rhsLow); - EXCHANGE(lhsExp, rhsExp); + swap(lhsMantissa, rhsMantissa); + 
swap(lhsBiasedExp, rhsBiasedExp); } - if (lhsExp == 0x7FF) + if (lhsBiasedExp == 0x7FF) { - bool propagate = (lhsHigh | lhsLow) != 0u; - return create(LERP(0x7FF0000000000000ull | (uint64_t(lhsSign) << 32), impl::propagateFloat64NaN(data, rhs.data), propagate)); + const bool propagate = (lhsMantissa) != 0u; + return createPreserveBitPattern(lerp(ieee754::traits::exponentMask | lhsSign, impl::propagateFloat64NaN(data, rhs.data), propagate)); } - expDiff = LERP(ABS(expDiff), ABS(expDiff) - 1, rhsExp == 0); - rhsHigh = LERP(rhsHigh | 0x00100000u, rhsHigh, rhsExp == 0); - nbl::hlsl::float32_t3 shifted = impl::shift64ExtraRightJamming(rhsHigh, rhsLow, 0u, expDiff); - rhsHigh = shifted.x; - rhsLow = shifted.y; - frac.z = shifted.z; - exp = lhsExp; - - lhsHigh |= 0x00100000u; - frac.xy = impl::add64(lhsHigh, lhsLow, rhsHigh, rhsLow); - --exp; - if (!(frac.x < 0x00200000u)) + expDiff = lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); + rhsMantissa = lerp(rhsMantissa | 0x0010000000000000ull, rhsMantissa, rhsBiasedExp == 0); + const uint32_t3 shifted = impl::shift64ExtraRightJamming(uint32_t3(impl::packUint64(rhsMantissa), 0u), expDiff); + rhsMantissa = impl::unpackUint64(shifted.xy); + mantissaExtended.z = shifted.z; + biasedExp = lhsBiasedExp; + + lhsMantissa |= 0x0010000000000000ull; + mantissaExtended.xy = impl::packUint64(lhsMantissa + rhsMantissa); + --biasedExp; + if (!(mantissaExtended.x < 0x00200000u)) { - frac = impl::shift64ExtraRightJamming(frac.x, frac.y, frac.z, 1); - ++exp; + mantissaExtended = impl::shift64ExtraRightJamming(mantissaExtended, 1); + ++biasedExp; } - return create(impl::roundAndPackFloat64(lhsSign, exp, frac.x, frac.y, frac.z)); + return createPreserveBitPattern(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended.xyz)); } // cannot happen but compiler cries about not every path returning value - return create(0xdeadbeefbadcaffeull); + return createPreserveBitPattern(0xdeadbeefbadcaffeull); } else { - int exp; - - nbl::hlsl::uint32_t2 lhsShifted = impl::shortShift64Left(lhsHigh, lhsLow, 10); - lhsHigh = lhsShifted.x; - lhsLow = lhsShifted.y; - nbl::hlsl::uint32_t2 rhsShifted = impl::shortShift64Left(rhsHigh, rhsLow, 10); - rhsHigh = rhsShifted.x; - rhsLow = rhsShifted.y; + lhsMantissa = impl::shortShift64Left(lhsMantissa, 10); + rhsMantissa = impl::shortShift64Left(rhsMantissa, 10); if (expDiff != 0) { @@ -491,10 +506,9 @@ namespace impl if (expDiff < 0) { - EXCHANGE(lhsHigh, rhsHigh); - EXCHANGE(lhsLow, rhsLow); - EXCHANGE(lhsExp, rhsExp); - lhsSign ^= 0x80000000u; + swap(lhsMantissa, rhsMantissa); + swap(lhsBiasedExp, rhsBiasedExp); + lhsSign ^= ieee754::traits::signMask; } //if (lhsExp == 0x7FF) @@ -503,113 +517,206 @@ namespace impl // return nbl::hlsl::lerp(__packFloat64(lhsSign, 0x7ff, 0u, 0u), __propagateFloat64NaN(a, b), propagate); //} - expDiff = LERP(ABS(expDiff), ABS(expDiff) - 1, rhsExp == 0); - rhsHigh = LERP(rhsHigh | 0x40000000u, rhsHigh, rhsExp == 0); - nbl::hlsl::uint32_t2 shifted = impl::shift64RightJamming(rhsHigh, rhsLow, expDiff); - rhsHigh = shifted.x; - rhsLow = shifted.y; - lhsHigh |= 0x40000000u; - frac.xy = impl::sub64(lhsHigh, lhsLow, rhsHigh, rhsLow); - exp = lhsExp; - --exp; - return create(impl::normalizeRoundAndPackFloat64(lhsSign, exp - 10, frac.x, frac.y)); + expDiff = lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); + rhsMantissa = lerp(rhsMantissa | 0x4000000000000000ull, rhsMantissa, rhsBiasedExp == 0); + rhsMantissa = impl::unpackUint64(impl::shift64RightJamming(impl::packUint64(rhsMantissa), expDiff)); + 
lhsMantissa |= 0x4000000000000000ull; + frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); + biasedExp = lhsBiasedExp; + --biasedExp; + return createPreserveBitPattern(impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 10, frac.x, frac.y)); } //if (lhsExp == 0x7FF) //{ // bool propagate = ((lhsHigh | rhsHigh) | (lhsLow | rhsLow)) != 0u; // return nbl::hlsl::lerp(0xFFFFFFFFFFFFFFFFUL, __propagateFloat64NaN(a, b), propagate); //} - rhsExp = LERP(rhsExp, 1, lhsExp == 0); - lhsExp = LERP(lhsExp, 1, lhsExp == 0); + rhsBiasedExp = lerp(rhsBiasedExp, 1, lhsBiasedExp == 0); + lhsBiasedExp = lerp(lhsBiasedExp, 1, lhsBiasedExp == 0); - nbl::hlsl::uint32_t2 frac; - uint32_t signOfDifference = 0; - if (rhsHigh < lhsHigh) + + const uint32_t2 lhsMantissaPacked = impl::packUint64(lhsMantissa); + const uint32_t2 rhsMantissaPacked = impl::packUint64(rhsMantissa); + + uint32_t2 frac; + uint64_t signOfDifference = 0; + if (rhsMantissaPacked.x < lhsMantissaPacked.x) { - frac.xy = impl::sub64(lhsHigh, lhsLow, rhsHigh, rhsLow); + frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); } - else if (lhsHigh < rhsHigh) + else if (lhsMantissaPacked.x < rhsMantissaPacked.x) { - frac.xy = impl::sub64(rhsHigh, rhsLow, lhsHigh, lhsLow); - signOfDifference = 0x80000000; + frac.xy = impl::packUint64(rhsMantissa - lhsMantissa); + signOfDifference = ieee754::traits::signMask; } - else if (rhsLow <= lhsLow) + else if (rhsMantissaPacked.y <= lhsMantissaPacked.y) { - /* It is possible that frac.x and frac.y may be zero after this. */ - frac.xy = impl::sub64(lhsHigh, lhsLow, rhsHigh, rhsLow); + /* It is possible that frac.x and frac.y may be zero after this. */ + frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); } else { - frac.xy = impl::sub64(rhsHigh, rhsLow, lhsHigh, lhsLow); - signOfDifference = 0x80000000; + frac.xy = impl::packUint64(rhsMantissa - lhsMantissa); + signOfDifference = ieee754::traits::signMask; } - exp = LERP(rhsExp, lhsExp, signOfDifference == 0u); + biasedExp = lerp(rhsBiasedExp, lhsBiasedExp, signOfDifference == 0u); lhsSign ^= signOfDifference; uint64_t retval_0 = impl::packFloat64(uint32_t(FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) << 31, 0, 0u, 0u); - uint64_t retval_1 = impl::normalizeRoundAndPackFloat64(lhsSign, exp - 11, frac.x, frac.y); - return create(LERP(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); + uint64_t retval_1 = impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 11, frac.x, frac.y); + return createPreserveBitPattern(lerp(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); } } emulated_float64_t operator-(emulated_float64_t rhs) { - emulated_float64_t lhs = create(data); + emulated_float64_t lhs = createPreserveBitPattern(data); emulated_float64_t rhsFlipped = rhs.flipSign(); return lhs + rhsFlipped; } - emulated_float64_t operator*(const emulated_float64_t rhs) + emulated_float64_t operator*(emulated_float64_t rhs) { - emulated_float64_t retval = emulated_float64_t::create(0u); - - uint32_t lhsLow = uint32_t(data & 0x00000000FFFFFFFFull); - uint32_t rhsLow = uint32_t(rhs.data & 0x00000000FFFFFFFFull); - uint32_t lhsHigh = uint32_t((data & 0x000FFFFF00000000ull) >> 32); - uint32_t rhsHigh = uint32_t((rhs.data & 0x000FFFFF00000000ull) >> 32); + emulated_float64_t retval = emulated_float64_t::createPreserveBitPattern(0u); - uint32_t lhsExp = uint32_t((data >> 52) & 0x7FFull); - uint32_t rhsExp = uint32_t((rhs.data >> 52) & 0x7FFull); + uint64_t lhsSign = data & ieee754::traits::signMask; + uint64_t rhsSign = rhs.data & ieee754::traits::signMask; + uint64_t 
lhsMantissa = ieee754::extractMantissa(data); + uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); + int lhsBiasedExp = ieee754::extractBiasedExponent(data); + int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); - int32_t exp = int32_t(lhsExp + rhsExp) - 0x400u; - uint64_t sign = uint32_t(((data ^ rhs.data) & 0x8000000000000000ull) >> 32); + int exp = int(lhsBiasedExp + rhsBiasedExp) - 0x400; + uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; + if (lhsBiasedExp == 0x7FF) + { + if ((lhsMantissa != 0u) || + ((rhsBiasedExp == 0x7FF) && (rhsMantissa != 0u))) { + return createPreserveBitPattern(impl::propagateFloat64NaN(data, rhs.data)); + } + if ((uint64_t(rhsBiasedExp) | rhsMantissa) == 0u) + return createPreserveBitPattern(0xFFFFFFFFFFFFFFFFull); - lhsHigh |= 0x00100000u; - nbl::hlsl::uint32_t2 shifted = impl::shortShift64Left(rhsHigh, rhsLow, 12); - rhsHigh = shifted.x; - rhsLow = shifted.y; + return createPreserveBitPattern(impl::assembleFloat64(sign, ieee754::traits::exponentMask, 0ull)); + } + if (rhsBiasedExp == 0x7FF) + { + /* a cannot be NaN, but is b NaN? */ + if (rhsMantissa != 0u) +#ifdef RELAXED_NAN_PROPAGATION + return rhs.data; +#else + return createPreserveBitPattern(impl::propagateFloat64NaN(data, rhs.data)); +#endif + if ((uint64_t(lhsBiasedExp) | lhsMantissa) == 0u) + return createPreserveBitPattern(0xFFFFFFFFFFFFFFFFull); - nbl::hlsl::uint32_t4 fracUnpacked = impl::mul64to128(lhsHigh, lhsLow, rhsHigh, rhsLow); - fracUnpacked.xy = impl::add64(fracUnpacked.x, fracUnpacked.y, lhsHigh, lhsLow); - fracUnpacked.z |= uint32_t(fracUnpacked.w != 0u); - if (0x00200000u <= fracUnpacked.x) + return createPreserveBitPattern(sign | ieee754::traits::exponentMask); + } + if (lhsBiasedExp == 0) { - fracUnpacked = nbl::hlsl::uint32_t4(impl::shift64ExtraRightJamming(fracUnpacked.x, fracUnpacked.y, fracUnpacked.z, 1), 0u); - ++exp; + if (lhsMantissa == 0u) + return createPreserveBitPattern(sign); + impl::normalizeFloat64Subnormal(lhsMantissa, lhsBiasedExp, lhsMantissa); } - - return create(impl::roundAndPackFloat64(sign, exp, fracUnpacked.x, fracUnpacked.y, fracUnpacked.z)); + if (rhsBiasedExp == 0) + { + if (rhsMantissa == 0u) + return createPreserveBitPattern(sign); + impl::normalizeFloat64Subnormal(rhsMantissa, rhsBiasedExp, rhsMantissa); + } + + lhsMantissa |= 0x0010000000000000ull; + rhsMantissa = impl::shortShift64Left(rhsMantissa, 12); + + uint32_t4 mantissasPacked; + mantissasPacked.xy = impl::packUint64(lhsMantissa); + mantissasPacked.zw = impl::packUint64(rhsMantissa); + + mantissasPacked = impl::mul64to128(mantissasPacked); + + mantissasPacked.xy = impl::packUint64(impl::unpackUint64(mantissasPacked.xy) + lhsMantissa); + mantissasPacked.z |= uint32_t(mantissasPacked.w != 0u); + if (0x00200000u <= mantissasPacked.x) + { + mantissasPacked = uint32_t4(impl::shift64ExtraRightJamming(mantissasPacked.xyz, 1), 0u); + ++exp; + } + + return createPreserveBitPattern(impl::roundAndPackFloat64(sign, exp, mantissasPacked.xyz)); } // TODO emulated_float64_t operator/(const emulated_float64_t rhs) { - return create(0xdeadbeefbadcaffeull); + return createPreserveBitPattern(0xdeadbeefbadcaffeull); } // relational operators - bool operator==(const emulated_float64_t rhs) { return !(data ^ rhs.data); } - bool operator!=(const emulated_float64_t rhs) { return data ^ rhs.data; } - bool operator<(const emulated_float64_t rhs) { return data < rhs.data; } - bool operator>(const emulated_float64_t rhs) { return data > rhs.data; } - bool operator<=(const emulated_float64_t rhs) 
{ return data <= rhs.data; } - bool operator>=(const emulated_float64_t rhs) { return data >= rhs.data; } + bool operator==(emulated_float64_t rhs) + { + if (isnan(data) || isnan(rhs.data)) + return false; + + const emulated_float64_t xored = emulated_float64_t::createPreserveBitPattern(data ^ rhs.data); + // TODO: check what fast math returns for -0 == 0 + if ((xored.data & 0x7FFFFFFFFFFFFFFFull) == 0ull) + return true; + + return !(xored.data); + } + bool operator!=(emulated_float64_t rhs) + { + if (isnan(data) || isnan(rhs.data)) + return true; + + const emulated_float64_t xored = emulated_float64_t::createPreserveBitPattern(data ^ rhs.data); + + // TODO: check what fast math returns for -0 == 0 + if ((xored.data & 0x7FFFFFFFFFFFFFFFull) == 0ull) + return false; + + return xored.data; + } + bool operator<(emulated_float64_t rhs) + { + const uint64_t lhsSign = ieee754::extractSign(data); + const uint64_t rhsSign = ieee754::extractSign(rhs.data); + + // flip bits of negative numbers and flip signs of all numbers + uint64_t lhsFlipped = data ^ ((0x7FFFFFFFFFFFFFFFull * lhsSign) | ieee754::traits::signMask); + uint64_t rhsFlipped = rhs.data ^ ((0x7FFFFFFFFFFFFFFFull * rhsSign) | ieee754::traits::signMask); + + uint64_t diffBits = lhsFlipped ^ rhsFlipped; + + return (lhsFlipped & diffBits) < (rhsFlipped & diffBits); + } + bool operator>(emulated_float64_t rhs) + { +#ifndef __HLSL_VERSION + std::cout << reinterpret_cast(data) << std::endl; + std::cout << reinterpret_cast(rhs.data) << std::endl; +#endif + + const uint64_t lhsSign = ieee754::extractSign(data); + const uint64_t rhsSign = ieee754::extractSign(rhs.data); + + // flip bits of negative numbers and flip signs of all numbers + uint64_t lhsFlipped = data ^ ((0x7FFFFFFFFFFFFFFFull * lhsSign) | ieee754::traits::signMask); + uint64_t rhsFlipped = rhs.data ^ ((0x7FFFFFFFFFFFFFFFull * rhsSign) | ieee754::traits::signMask); + + uint64_t diffBits = lhsFlipped ^ rhsFlipped; + + return (lhsFlipped & diffBits) > (rhsFlipped & diffBits); + } + bool operator<=(emulated_float64_t rhs) { return !(emulated_float64_t::createPreserveBitPattern(data) > emulated_float64_t::createPreserveBitPattern(rhs.data)); } + bool operator>=(emulated_float64_t rhs) { return !(emulated_float64_t::createPreserveBitPattern(data) < emulated_float64_t::createPreserveBitPattern(rhs.data)); } //logical operators - bool operator&&(const emulated_float64_t rhs) { return bool(data) && bool(rhs.data); } - bool operator||(const emulated_float64_t rhs) { return bool(data) || bool(rhs.data); } + bool operator&&(emulated_float64_t rhs) { return bool(data) && bool(rhs.data); } + bool operator||(emulated_float64_t rhs) { return bool(data) || bool(rhs.data); } bool operator!() { return !bool(data); } // OMITED OPERATORS @@ -620,45 +727,37 @@ namespace impl // TODO: should modify self? 
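        // The rewritten flipSign() below is equivalent to the removed two-step masking: XOR-ing the stored
        // bit pattern with the IEEE-754 sign mask toggles only the sign bit and leaves the exponent and
        // mantissa bits untouched.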
emulated_float64_t flipSign() { - const uint64_t flippedSign = ((~data) & 0x8000000000000000ull); - return create(flippedSign | (data & 0x7FFFFFFFFFFFFFFFull)); + return createPreserveBitPattern(data ^ ieee754::traits::signMask); } bool isNaN() { - return impl::isNaN64(data); + return isnan(bit_cast(data)); } }; - - //_NBL_STATIC_INLINE_CONSTEXPR emulated_float64_t EMULATED_FLOAT64_NAN = emulated_float64_t::create(0.0 / 0.0); namespace ieee754 { - template<> int getExponentBitCnt() { return getExponentBitCnt(); } - template<> int getMantissaBitCnt() { return getMantissaBitCnt(); } - template<> int getExponentBias() { return getExponentBias(); } - template<> unsigned_integer_of_size<8>::type getExponentMask() { return getExponentMask(); } - template<> unsigned_integer_of_size<8>::type getMantissaMask() { return getMantissaMask(); } template<> - unsigned_integer_of_size<8>::type getSignMask() + struct traits_base { - using AsUint = typename unsigned_integer_of_size::type; - return AsUint(0x1) << (sizeof(float64_t) * 4 - 1); - } + NBL_CONSTEXPR_STATIC_INLINE int16_t exponentBitCnt = 11; + NBL_CONSTEXPR_STATIC_INLINE int16_t mantissaBitCnt = 52; + }; - template <> + template<> uint32_t extractBiasedExponent(emulated_float64_t x) { return extractBiasedExponent(x.data); } - template <> + template<> int extractExponent(emulated_float64_t x) { return extractExponent(x.data); } - template <> + template<> emulated_float64_t replaceBiasedExponent(emulated_float64_t x, typename unsigned_integer_of_size::type biasedExp) { return emulated_float64_t(replaceBiasedExponent(x.data, biasedExp)); @@ -681,4 +780,10 @@ namespace ieee754 } } +#undef FLOAT_ROUND_NEAREST_EVEN +#undef FLOAT_ROUND_TO_ZERO +#undef FLOAT_ROUND_DOWN +#undef FLOAT_ROUND_UP +#undef FLOAT_ROUNDING_MODE + #endif diff --git a/include/nbl/builtin/hlsl/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754.hlsl index 2cd74caa54..0a4ff1aa7b 100644 --- a/include/nbl/builtin/hlsl/ieee754.hlsl +++ b/include/nbl/builtin/hlsl/ieee754.hlsl @@ -58,78 +58,106 @@ namespace impl template<> uint64_t castBackToFloatType(uint64_t x) { return x; } } - template - int getExponentBitCnt() { staticAssertTmp(false); return 0xdeadbeefu; } - template<> int getExponentBitCnt() { return 5; } - template<> int getExponentBitCnt() { return 8; } - template<> int getExponentBitCnt() { return 11; } +template +struct traits_base +{ + NBL_CONSTEXPR_STATIC_INLINE int16_t exponentBitCnt = 0xbeef; + NBL_CONSTEXPR_STATIC_INLINE int16_t mantissaBitCnt = 0xbeef; +}; - template - int getMantissaBitCnt() { staticAssertTmp(false); return 0xdeadbeefu; /*TODO: add static assert fail to every 0xdeadbeef, fix all static asserts*/ } - template<> int getMantissaBitCnt() { return 10; } - template<> int getMantissaBitCnt() { return 23; } - template<> int getMantissaBitCnt() { return 52; } +template<> +struct traits_base +{ + NBL_CONSTEXPR_STATIC_INLINE int16_t exponentBitCnt = 5; + NBL_CONSTEXPR_STATIC_INLINE int16_t mantissaBitCnt = 10; +}; - template - int getExponentBias() - { - staticAssertTmp(impl::isTypeAllowed(), "Invalid type! Only floating point or unsigned integer types are allowed."); - return (0x1 << (getExponentBitCnt::type>() - 1)) - 1; - } +template<> +struct traits_base +{ + NBL_CONSTEXPR_STATIC_INLINE int16_t exponentBitCnt = 8; + NBL_CONSTEXPR_STATIC_INLINE int16_t mantissaBitCnt = 23; +}; - template - typename unsigned_integer_of_size::type getSignMask() - { - staticAssertTmp(impl::isTypeAllowed(), "Invalid type! 
Only floating point or unsigned integer types are allowed."); - using AsUint = typename unsigned_integer_of_size::type; - return AsUint(0x1) << (sizeof(T) * 4 - 1); - } +template<> +struct traits_base +{ + NBL_CONSTEXPR_STATIC_INLINE int16_t exponentBitCnt = 11; + NBL_CONSTEXPR_STATIC_INLINE int16_t mantissaBitCnt = 52; +}; - template - typename unsigned_integer_of_size::type getExponentMask() { staticAssertTmp(false); return 0xdeadbeefu; } - template<> unsigned_integer_of_size<2>::type getExponentMask() { return 0x7C00; } - template<> unsigned_integer_of_size<4>::type getExponentMask() { return 0x7F800000u; } - template<> unsigned_integer_of_size<8>::type getExponentMask() { return 0x7FF0000000000000ull; } +template +struct traits : traits_base +{ + using bit_rep_t = typename unsigned_integer_of_size::type; + using base_t = traits_base; - template - typename unsigned_integer_of_size::type getMantissaMask() { staticAssertTmp(false); return 0xdeadbeefu; } - template<> unsigned_integer_of_size<2>::type getMantissaMask() { return 0x03FF; } - template<> unsigned_integer_of_size<4>::type getMantissaMask() { return 0x007FFFFFu; } - template<> unsigned_integer_of_size<8>::type getMantissaMask() { return 0x000FFFFFFFFFFFFFull; } + NBL_CONSTEXPR_STATIC_INLINE bit_rep_t signMask = bit_rep_t(0x1u) << (sizeof(Float) * 8 - 1); + NBL_CONSTEXPR_STATIC_INLINE bit_rep_t exponentMask = ((~bit_rep_t(0)) << base_t::mantissaBitCnt) ^ signMask; + NBL_CONSTEXPR_STATIC_INLINE bit_rep_t mantissaMask = (bit_rep_t(0x1u) << base_t::mantissaBitCnt) - 1; + NBL_CONSTEXPR_STATIC_INLINE bit_rep_t exponentBias = (int(0x1) << (base_t::exponentBitCnt - 1)) - 1; +}; - template - uint32_t extractBiasedExponent(T x) - { - using AsUint = typename unsigned_integer_of_size::type; - return glsl::bitfieldExtract(impl::castToUintType(x), getMantissaBitCnt::type>(), getExponentBitCnt::type>()); - } +template +uint32_t extractBiasedExponent(T x) +{ + using AsUint = typename unsigned_integer_of_size::type; + return glsl::bitfieldExtract(impl::castToUintType(x), traits::type>::mantissaBitCnt, traits::type>::exponentBitCnt); +} - template - int extractExponent(T x) - { - return int(extractBiasedExponent(x)) - int(getExponentBias()); - } +template<> +uint32_t extractBiasedExponent(uint64_t x) +{ + const uint32_t highBits = uint32_t(x >> 32); + return glsl::bitfieldExtract(highBits, traits::mantissaBitCnt - 32, traits::exponentBitCnt); +} - template - T replaceBiasedExponent(T x, typename unsigned_integer_of_size::type biasedExp) - { - staticAssertTmp(impl::isTypeAllowed(), "Invalid type! Only floating point or unsigned integer types are allowed."); - return impl::castBackToFloatType(glsl::bitfieldInsert(impl::castToUintType(x), biasedExp, getMantissaBitCnt(), getExponentBitCnt())); - } +template<> +uint32_t extractBiasedExponent(float64_t x) +{ + return extractBiasedExponent(impl::castToUintType(x)); +} - // performs no overflow tests, returns x*exp2(n) - template - T fastMulExp2(T x, int n) - { - return replaceBiasedExponent(x, extractBiasedExponent(x) + uint32_t(n)); - } +template +int extractExponent(T x) +{ + return int(extractBiasedExponent(x)) - int(traits::exponentBias); +} + +template +T replaceBiasedExponent(T x, typename unsigned_integer_of_size::type biasedExp) +{ + staticAssertTmp(impl::isTypeAllowed(), "Invalid type! 
Only floating point or unsigned integer types are allowed."); + using AsFloat = typename float_of_size::type; + return impl::castBackToFloatType(glsl::bitfieldInsert(impl::castToUintType(x), biasedExp, traits::mantissaBitCnt, traits::exponentBitCnt)); +} + +// performs no overflow tests, returns x*exp2(n) +template +T fastMulExp2(T x, int n) +{ + return replaceBiasedExponent(x, extractBiasedExponent(x) + uint32_t(n)); +} + +template +typename unsigned_integer_of_size::type extractMantissa(T x) +{ + using AsUint = typename unsigned_integer_of_size::type; + return impl::castToUintType(x) & traits::type>::mantissaMask; +} + +template +typename unsigned_integer_of_size::type extractSign(T x) +{ + return (impl::castToUintType(x) & traits::signMask) >> ((sizeof(T) * 8) - 1); +} + +template +typename unsigned_integer_of_size::type extractSignPreserveBitPattern(T x) +{ + return impl::castToUintType(x) & traits::signMask; +} - template - typename unsigned_integer_of_size::type extractMantissa(T x) - { - using AsUint = typename unsigned_integer_of_size::type; - return impl::castToUintType(x) & getMantissaMask::type>(); - } } } } diff --git a/include/nbl/builtin/hlsl/tgmath.hlsl b/include/nbl/builtin/hlsl/tgmath.hlsl new file mode 100644 index 0000000000..464b4ed9ac --- /dev/null +++ b/include/nbl/builtin/hlsl/tgmath.hlsl @@ -0,0 +1,25 @@ +// Copyright (C) 2022 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_TGMATH_INCLUDED_ +#define _NBL_BUILTIN_HLSL_TGMATH_INCLUDED_ + +#include +#include + +namespace nbl +{ +namespace hlsl +{ +template +bool isnan(Float val) +{ + using AsUint = typename unsigned_integer_of_size::type; + AsUint asUint = bit_cast(val); + return bool((asUint & ieee754::traits::exponentMask) && (asUint & ieee754::traits::mantissaMask)); +} + +} +} + +#endif \ No newline at end of file diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index 3edccd2aa9..e1e71c542c 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -233,6 +233,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/normalization/shared_nor LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/macros.h") #emulated LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated_float64_t.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath.hlsl") #utility LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ieee754.hlsl") #spirv intrinsics From afef3ec716461639ab7a71ce2da8e00a9847b8c1 Mon Sep 17 00:00:00 2001 From: Przemek Date: Tue, 9 Jul 2024 11:09:28 +0200 Subject: [PATCH 020/358] Updated example --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index f17208a703..5aaabfd75b 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit f17208a7039e30af359d0e99cf896f2d013670ff +Subproject commit 5aaabfd75b59b95bd88cf649f8dd6b69bfc54a7e From 10b8a305318d154b169e9beb4db2eea06f9ba611 Mon Sep 17 00:00:00 2001 From: Przemek Date: Wed, 24 Jul 2024 23:47:23 +0200 Subject: [PATCH 021/358] Saving work --- examples_tests | 2 +- include/nbl/builtin/hlsl/algorithm.hlsl | 18 +- include/nbl/builtin/hlsl/cpp_compat.hlsl | 4 + .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 514 ++++-------------- include/nbl/builtin/hlsl/ieee754.hlsl | 7 +- .../hlsl/impl/emulated_float64_t_impl.hlsl | 469 ++++++++++++++++ src/nbl/builtin/CMakeLists.txt | 2 + 7 files 
changed, 596 insertions(+), 420 deletions(-) create mode 100644 include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl diff --git a/examples_tests b/examples_tests index 5aaabfd75b..25dfb67454 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 5aaabfd75b59b95bd88cf649f8dd6b69bfc54a7e +Subproject commit 25dfb6745482473bb63c893f8ae73770d2698983 diff --git a/include/nbl/builtin/hlsl/algorithm.hlsl b/include/nbl/builtin/hlsl/algorithm.hlsl index 564ef59f20..276282cedc 100644 --- a/include/nbl/builtin/hlsl/algorithm.hlsl +++ b/include/nbl/builtin/hlsl/algorithm.hlsl @@ -18,7 +18,7 @@ namespace impl // TODO: use structs template - NBL_CONSTEXPR_STATIC_INLINE void swap(NBL_REF_ARG(T) lhs, NBL_REF_ARG(T) rhs) + NBL_CONSTEXPR_INLINE_FUNC void swap(NBL_REF_ARG(T) lhs, NBL_REF_ARG(T) rhs) { T tmp = lhs; lhs = rhs; @@ -26,7 +26,7 @@ namespace impl } template<> - NBL_CONSTEXPR_STATIC_INLINE void swap(NBL_REF_ARG(uint16_t) lhs, NBL_REF_ARG(uint16_t) rhs) + NBL_CONSTEXPR_INLINE_FUNC void swap(NBL_REF_ARG(uint16_t) lhs, NBL_REF_ARG(uint16_t) rhs) { lhs ^= rhs; rhs ^= lhs; @@ -34,7 +34,7 @@ namespace impl } template<> - NBL_CONSTEXPR_STATIC_INLINE void swap(NBL_REF_ARG(uint32_t) lhs, NBL_REF_ARG(uint32_t) rhs) + NBL_CONSTEXPR_INLINE_FUNC void swap(NBL_REF_ARG(uint32_t) lhs, NBL_REF_ARG(uint32_t) rhs) { lhs ^= rhs; rhs ^= lhs; @@ -42,7 +42,7 @@ namespace impl } template<> - NBL_CONSTEXPR_STATIC_INLINE void swap(NBL_REF_ARG(uint64_t) lhs, NBL_REF_ARG(uint64_t) rhs) + NBL_CONSTEXPR_INLINE_FUNC void swap(NBL_REF_ARG(uint64_t) lhs, NBL_REF_ARG(uint64_t) rhs) { lhs ^= rhs; rhs ^= lhs; @@ -50,7 +50,7 @@ namespace impl } template<> - NBL_CONSTEXPR_STATIC_INLINE void swap(NBL_REF_ARG(int16_t) lhs, NBL_REF_ARG(int16_t) rhs) + NBL_CONSTEXPR_INLINE_FUNC void swap(NBL_REF_ARG(int16_t) lhs, NBL_REF_ARG(int16_t) rhs) { lhs ^= rhs; rhs ^= lhs; @@ -58,7 +58,7 @@ namespace impl } template<> - NBL_CONSTEXPR_STATIC_INLINE void swap(NBL_REF_ARG(int32_t) lhs, NBL_REF_ARG(int32_t) rhs) + NBL_CONSTEXPR_INLINE_FUNC void swap(NBL_REF_ARG(int32_t) lhs, NBL_REF_ARG(int32_t) rhs) { lhs ^= rhs; rhs ^= lhs; @@ -66,7 +66,7 @@ namespace impl } template<> - NBL_CONSTEXPR_STATIC_INLINE void swap(NBL_REF_ARG(int64_t) lhs, NBL_REF_ARG(int64_t) rhs) + NBL_CONSTEXPR_INLINE_FUNC void swap(NBL_REF_ARG(int64_t) lhs, NBL_REF_ARG(int64_t) rhs) { lhs ^= rhs; rhs ^= lhs; @@ -74,7 +74,7 @@ namespace impl } #else template - NBL_CONSTEXPR_STATIC_INLINE void swap(NBL_REF_ARG(T) lhs, NBL_REF_ARG(T) rhs) + NBL_CONSTEXPR_INLINE_FUNC void swap(NBL_REF_ARG(T) lhs, NBL_REF_ARG(T) rhs) { std::swap(lhs, rhs); } @@ -82,7 +82,7 @@ namespace impl } template -NBL_CONSTEXPR_STATIC_INLINE void swap(NBL_REF_ARG(T) lhs, NBL_REF_ARG(T) rhs) +NBL_CONSTEXPR_INLINE_FUNC void swap(NBL_REF_ARG(T) lhs, NBL_REF_ARG(T) rhs) { impl::swap(lhs, rhs); } diff --git a/include/nbl/builtin/hlsl/cpp_compat.hlsl b/include/nbl/builtin/hlsl/cpp_compat.hlsl index a22e8cc0c5..f3cf538e28 100644 --- a/include/nbl/builtin/hlsl/cpp_compat.hlsl +++ b/include/nbl/builtin/hlsl/cpp_compat.hlsl @@ -11,6 +11,8 @@ #define NBL_CONSTEXPR constexpr #define NBL_CONSTEXPR_STATIC constexpr static #define NBL_CONSTEXPR_STATIC_INLINE constexpr static inline +#define NBL_CONSTEXPR_FUNC constexpr +#define NBL_CONSTEXPR_INLINE_FUNC constexpr inline #define NBL_CONST_MEMBER_FUNC const #define NBL_ALIAS_TEMPLATE_FUNCTION(origFunctionName, functionAlias) \ @@ -41,6 +43,8 @@ using add_pointer = std::add_pointer; #define ARROW .arrow(). 
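// NBL_CONSTEXPR_FUNC and NBL_CONSTEXPR_INLINE_FUNC map to constexpr / constexpr inline on the C++ side of
// this header; the two defines added just below leave them empty under __HLSL_VERSION, so functions
// annotated with them still compile in shader code where the constexpr keyword is not available.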
#define NBL_CONSTEXPR const static #define NBL_CONSTEXPR_STATIC_INLINE const static +#define NBL_CONSTEXPR_FUNC +#define NBL_CONSTEXPR_INLINE_FUNC #define NBL_CONST_MEMBER_FUNC namespace nbl diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index 455c6f67a5..8e8a26d6b3 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -5,35 +5,7 @@ #include #include #include - -// TODO: when it will be possible, use this unions wherever they fit: -/* -* union Mantissa -* { -* struct -* { -* uint32_t highBits; -* uint64_t lowBits; -* }; -* -* uint32_t2 packed; -* }; -* -*/ - -/* -* union Mantissa -* { -* struct -* { -* uint64_t lhs; -* uint64_t rhs; -* }; -* -* uint32_t4 packed; -* }; -* -*/ +#include #define FLOAT_ROUND_NEAREST_EVEN 0 #define FLOAT_ROUND_TO_ZERO 1 @@ -45,386 +17,64 @@ namespace nbl { namespace hlsl { -namespace impl -{ - template - uint64_t promoteToUint64(T val) - { - using AsFloat = typename float_of_size::type; - uint64_t asUint = ieee754::impl::castToUintType(val); - - const uint64_t sign = (uint64_t(ieee754::traits::signMask) & asUint) << (sizeof(float64_t) - sizeof(T)); - const int64_t newExponent = ieee754::extractExponent(val) + ieee754::traits::exponentBias; - - const uint64_t exp = (uint64_t(ieee754::extractExponent(val)) + ieee754::traits::exponentBias) << (ieee754::traits::mantissaBitCnt); - const uint64_t mantissa = (uint64_t(ieee754::traits::mantissaMask) & asUint) << (ieee754::traits::exponentBias - ieee754::traits::mantissaBitCnt); - - return sign | exp | mantissa; - }; - - template<> uint64_t promoteToUint64(float64_t val) { return bit_cast(val); } - - nbl::hlsl::uint32_t2 umulExtended(uint32_t lhs, uint32_t rhs) - { - uint64_t product = uint64_t(lhs) * uint64_t(rhs); - nbl::hlsl::uint32_t2 output; - output.x = uint32_t((product & 0xFFFFFFFF00000000) >> 32); - output.y = uint32_t(product & 0x00000000FFFFFFFFull); - return output; - } - - uint64_t propagateFloat64NaN(uint64_t lhs, uint64_t rhs) - { -#if defined RELAXED_NAN_PROPAGATION - return lhs | rhs; -#else - const bool lhsIsNaN = isnan(bit_cast(lhs)); - const bool rhsIsNaN = isnan(bit_cast(rhs)); - lhs |= 0x0000000000080000ull; - rhs |= 0x0000000000080000ull; - - return lerp(rhs, lerp(lhs, rhs, rhsIsNaN), lhsIsNaN); -#endif - } - - uint64_t packFloat64(uint32_t zSign, int zExp, uint32_t zFrac0, uint32_t zFrac1) - { - nbl::hlsl::uint32_t2 z; - - z.x = zSign + (uint32_t(zExp) << 20) + zFrac0; - z.y = zFrac1; - - uint64_t output = 0u; - output |= (uint64_t(z.x) << 32) & 0xFFFFFFFF00000000ull; - output |= uint64_t(z.y); - return output; - } - - uint32_t2 packUint64(uint64_t val) - { - return uint32_t2((val & 0xFFFFFFFF00000000ull) >> 32, val & 0x00000000FFFFFFFFull); - } - - uint64_t unpackUint64(uint32_t2 val) - { - return ((uint64_t(val.x) & 0x00000000FFFFFFFFull) << 32) | uint64_t(val.y); - } - - nbl::hlsl::uint32_t2 add64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) - { - nbl::hlsl::uint32_t2 output; - output.y = a1 + b1; - output.x = a0 + b0 + uint32_t(output.y < a1); - - return output; - } - - - nbl::hlsl::uint32_t2 sub64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) - { - nbl::hlsl::uint32_t2 output; - output.y = a1 - b1; - output.x = a0 - b0 - uint32_t(a1 < b1); - - return output; - } - - // TODO: test - int countLeadingZeros32(uint32_t val) - { -#ifndef __HLSL_VERSION - return 31 - nbl::hlsl::findMSB(val); -#else - return 31 - firstbithigh(val); -#endif - } - - uint32_t2 
shift64RightJamming(uint32_t2 val, int count) - { - uint32_t2 output; - const int negCount = (-count) & 31; - - output.x = lerp(0u, val.x, count == 0); - output.x = lerp(output.x, (val.x >> count), count < 32); - - output.y = uint32_t((val.x | val.y) != 0u); /* count >= 64 */ - uint32_t z1_lt64 = (val.x>>(count & 31)) | uint32_t(((val.x<>count) | uint32_t((val.y<> (count & 31)), count < 64); - output.y = lerp(output.y, (val.x << negCount) | (val.y >> count), count < 32); - - val.z = lerp(val.z | val.y, val.z, count < 32); - output.x = lerp(output.x, val.x >> count, count < 32); - output.z |= uint32_t(val.z != 0u); - - output.x = lerp(output.x, 0u, (count == 32)); - output.y = lerp(output.y, val.x, (count == 32)); - output.z = lerp(output.z, val.y, (count == 32)); - output.x = lerp(output.x, val.x, (count == 0)); - output.y = lerp(output.y, val.y, (count == 0)); - output.z = lerp(output.z, val.z, (count == 0)); - - return output; - } - - uint64_t shortShift64Left(uint64_t val, int count) - { - const uint32_t2 packed = packUint64(val); - - nbl::hlsl::uint32_t2 output; - output.y = packed.y << count; - // TODO: fix - output.x = lerp((packed.x << count | (packed.y >> ((-count) & 31))), packed.x, count == 0); - - // y = 3092377600 - // x = 2119009566 - return unpackUint64(output); - }; - - uint64_t assembleFloat64(uint64_t signShifted, uint64_t expShifted, uint64_t mantissa) - { - return signShifted + expShifted + mantissa; - } - - uint64_t roundAndPackFloat64(uint64_t zSign, int zExp, uint32_t3 mantissaExtended) - { - bool roundNearestEven; - bool increment; - - roundNearestEven = true; - increment = int(mantissaExtended.z) < 0; - if (!roundNearestEven) - { - if (false) //(FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) - { - increment = false; - } - else - { - if (false) //(zSign != 0u) - { - //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && - // (zFrac2 != 0u); - } - else - { - //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && - // (zFrac2 != 0u); - } - } - } - if (0x7FD <= zExp) - { - if ((0x7FD < zExp) || ((zExp == 0x7FD) && (0x001FFFFFu == mantissaExtended.x && 0xFFFFFFFFu == mantissaExtended.y) && increment)) - { - if (false) // ((FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) || - // ((zSign != 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP)) || - // ((zSign == 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN))) - { - return packFloat64(zSign, 0x7FE, 0x000FFFFFu, 0xFFFFFFFFu); - } - - return packFloat64(zSign, 0x7FF, 0u, 0u); - } - } - - if (zExp < 0) - { - mantissaExtended = shift64ExtraRightJamming(mantissaExtended, -zExp); - zExp = 0; - - if (roundNearestEven) - { - increment = mantissaExtended.z < 0u; - } - else - { - if (zSign != 0u) - { - increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && (mantissaExtended.z != 0u); - } - else - { - increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && (mantissaExtended.z != 0u); - } - } - } - - if (increment) - { - const uint64_t added = impl::unpackUint64(uint32_t2(mantissaExtended.xy)) + 1ull; - mantissaExtended.xy = packUint64(added); - mantissaExtended.y &= ~((mantissaExtended.z + uint32_t(mantissaExtended.z == 0u)) & uint32_t(roundNearestEven)); - } - else - { - zExp = lerp(zExp, 0, (mantissaExtended.x | mantissaExtended.y) == 0u); - } - - return assembleFloat64(zSign, uint64_t(zExp) << ieee754::traits::mantissaBitCnt, unpackUint64(mantissaExtended.xy)); - } - - uint64_t normalizeRoundAndPackFloat64(uint64_t sign, int exp, uint32_t frac0, uint32_t frac1) - { - int shiftCount; - nbl::hlsl::uint32_t3 frac = 
nbl::hlsl::uint32_t3(frac0, frac1, 0u); - - if (frac.x == 0u) - { - exp -= 32; - frac.x = frac.y; - frac.y = 0u; - } - - shiftCount = countLeadingZeros32(frac.x) - 11; - if (0 <= shiftCount) - { - frac.xy = shortShift64Left(unpackUint64(frac.xy), shiftCount); - } - else - { - frac.xyz = shift64ExtraRightJamming(uint32_t3(frac.xy, 0), -shiftCount); - } - exp -= shiftCount; - return roundAndPackFloat64(sign, exp, frac); - } - - void normalizeFloat64Subnormal(uint64_t mantissa, - NBL_REF_ARG(int) outExp, - NBL_REF_ARG(uint64_t) outMantissa) - { - uint32_t2 mantissaPacked = packUint64(mantissa); - int shiftCount; - uint32_t2 temp; - shiftCount = countLeadingZeros32(lerp(mantissaPacked.x, mantissaPacked.y, mantissaPacked.x == 0u)) - 11; - outExp = lerp(1 - shiftCount, -shiftCount - 31, mantissaPacked.x == 0u); - - temp.x = lerp(mantissaPacked.y << shiftCount, mantissaPacked.y >> (-shiftCount), shiftCount < 0); - temp.y = lerp(0u, mantissaPacked.y << (shiftCount & 31), shiftCount < 0); - - shortShift64Left(impl::unpackUint64(mantissaPacked), shiftCount); - - outMantissa = lerp(outMantissa, unpackUint64(temp), mantissaPacked.x == 0); - } - - } - - struct emulated_float64_t + template + struct emulated_float64_t_impl { using storage_t = uint64_t; storage_t data; // constructors - /*static emulated_float64_t create(uint16_t val) + /*static emulated_float64_t_impl create(uint16_t val) { - return emulated_float64_t(bit_cast(float64_t(val))); + return emulated_float64_t_impl(bit_cast(float64_t(val))); }*/ - static emulated_float64_t create(int32_t val) + static emulated_float64_t_impl create(int32_t val) { - return emulated_float64_t(bit_cast(float64_t(val))); + return emulated_float64_t_impl(bit_cast(float64_t(val))); } - static emulated_float64_t create(int64_t val) + static emulated_float64_t_impl create(int64_t val) { - return emulated_float64_t(bit_cast(float64_t(val))); + return emulated_float64_t_impl(bit_cast(float64_t(val))); } - static emulated_float64_t create(uint32_t val) + static emulated_float64_t_impl create(uint32_t val) { - return emulated_float64_t(bit_cast(float64_t(val))); + return emulated_float64_t_impl(bit_cast(float64_t(val))); } - static emulated_float64_t create(uint64_t val) + static emulated_float64_t_impl create(uint64_t val) { - return emulated_float64_t(bit_cast(float64_t(val))); + return emulated_float64_t_impl(bit_cast(float64_t(val))); } - static emulated_float64_t create(float64_t val) + static emulated_float64_t_impl create(float64_t val) { - return emulated_float64_t(bit_cast(val)); + return emulated_float64_t_impl(bit_cast(val)); } // TODO: unresolved external symbol imath_half_to_float_table - /*static emulated_float64_t create(float16_t val) + /*static emulated_float64_t_impl create(float16_t val) { - return emulated_float64_t(bit_cast(float64_t(val))); + return emulated_float64_t_impl(bit_cast(float64_t(val))); }*/ - static emulated_float64_t create(float32_t val) + static emulated_float64_t_impl create(float32_t val) { - return emulated_float64_t(bit_cast(float64_t(val))); + return emulated_float64_t_impl(bit_cast(float64_t(val))); } - static emulated_float64_t createPreserveBitPattern(uint64_t val) + static emulated_float64_t_impl createPreserveBitPattern(uint64_t val) { - return emulated_float64_t(val); + return emulated_float64_t_impl(val); } // arithmetic operators - emulated_float64_t operator+(const emulated_float64_t rhs) + emulated_float64_t_impl operator+(const emulated_float64_t_impl rhs) { - emulated_float64_t retval = createPreserveBitPattern(0u); 
+ emulated_float64_t_impl retval = createPreserveBitPattern(0u); uint64_t mantissa; uint32_t3 mantissaExtended; @@ -502,7 +152,7 @@ namespace impl if (expDiff != 0) { - nbl::hlsl::uint32_t2 frac; + uint32_t2 frac; if (expDiff < 0) { @@ -568,17 +218,17 @@ namespace impl } } - emulated_float64_t operator-(emulated_float64_t rhs) + emulated_float64_t_impl operator-(emulated_float64_t_impl rhs) { - emulated_float64_t lhs = createPreserveBitPattern(data); - emulated_float64_t rhsFlipped = rhs.flipSign(); + emulated_float64_t_impl lhs = createPreserveBitPattern(data); + emulated_float64_t_impl rhsFlipped = rhs.flipSign(); return lhs + rhsFlipped; } - emulated_float64_t operator*(emulated_float64_t rhs) + emulated_float64_t_impl operator*(emulated_float64_t_impl rhs) { - emulated_float64_t retval = emulated_float64_t::createPreserveBitPattern(0u); + emulated_float64_t_impl retval = emulated_float64_t_impl::createPreserveBitPattern(0u); uint64_t lhsSign = data & ieee754::traits::signMask; uint64_t rhsSign = rhs.data & ieee754::traits::signMask; @@ -587,7 +237,7 @@ namespace impl int lhsBiasedExp = ieee754::extractBiasedExponent(data); int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); - int exp = int(lhsBiasedExp + rhsBiasedExp) - 0x400; + int exp = int(lhsBiasedExp + rhsBiasedExp) - ieee754::traits::exponentBias; uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; if (lhsBiasedExp == 0x7FF) @@ -628,51 +278,97 @@ namespace impl impl::normalizeFloat64Subnormal(rhsMantissa, rhsBiasedExp, rhsMantissa); } - lhsMantissa |= 0x0010000000000000ull; - rhsMantissa = impl::shortShift64Left(rhsMantissa, 12); + if (false) + { + lhsMantissa |= 1ull << 52; + rhsMantissa = impl::shortShift64Left(rhsMantissa, 12); + + uint32_t4 mantissasPacked; + mantissasPacked.xy = impl::packUint64(lhsMantissa); + mantissasPacked.zw = impl::packUint64(rhsMantissa); - uint32_t4 mantissasPacked; - mantissasPacked.xy = impl::packUint64(lhsMantissa); - mantissasPacked.zw = impl::packUint64(rhsMantissa); + mantissasPacked = impl::mul64to128(mantissasPacked); - mantissasPacked = impl::mul64to128(mantissasPacked); + mantissasPacked.xy = impl::packUint64(impl::unpackUint64(mantissasPacked.xy) + lhsMantissa); + mantissasPacked.z |= uint32_t(mantissasPacked.w != 0u); + if (0x00200000u <= mantissasPacked.x) + { + mantissasPacked = uint32_t4(impl::shift64ExtraRightJamming(mantissasPacked.xyz, 1), 0u); + ++exp; + } - mantissasPacked.xy = impl::packUint64(impl::unpackUint64(mantissasPacked.xy) + lhsMantissa); - mantissasPacked.z |= uint32_t(mantissasPacked.w != 0u); - if (0x00200000u <= mantissasPacked.x) + return createPreserveBitPattern(impl::roundAndPackFloat64(sign, exp, mantissasPacked.xyz)); + } + else { - mantissasPacked = uint32_t4(impl::shift64ExtraRightJamming(mantissasPacked.xyz, 1), 0u); - ++exp; + lhsMantissa |= 1ull << 52; + rhsMantissa |= 1ull << 52; + + uint32_t2 lhsPacked = impl::packUint64(lhsMantissa); + uint32_t2 rhsPacked = impl::packUint64(rhsMantissa); + uint64_t lhsHigh = lhsPacked.x; + uint64_t lhsLow = lhsPacked.y; + uint64_t rhsHigh = rhsPacked.x; + uint64_t rhsLow = rhsPacked.y; + + //((hi_lhs * hi_rhs) << 11) + ((hi_lhs * lo_rhs + lo_lhs * hi_rhs) >> 37) + + uint64_t newPseudoMantissa = ((lhsHigh * rhsHigh) << 11) + ((lhsHigh * rhsLow + lhsLow * rhsHigh) >> 37); + newPseudoMantissa <<= 1; + //newPseudoMantissa >>= 52; + /*if (newPseudoMantissa >= (1ull << 52)) + { + newPseudoMantissa >>= 1; + ++exp; + }*/ + + return createPreserveBitPattern(impl::assembleFloat64(sign, uint64_t(exp) << 
ieee754::traits::mantissaBitCnt, newPseudoMantissa & ieee754::traits::mantissaMask)); } - return createPreserveBitPattern(impl::roundAndPackFloat64(sign, exp, mantissasPacked.xyz)); + } - // TODO - emulated_float64_t operator/(const emulated_float64_t rhs) + emulated_float64_t_impl operator/(const emulated_float64_t_impl rhs) { - return createPreserveBitPattern(0xdeadbeefbadcaffeull); + + // TODO: maybe add function to extract real mantissa + const uint64_t lhsRealMantissa = (ieee754::extractMantissa(data) | (1ull << ieee754::traits::mantissaBitCnt)); + const uint64_t rhsRealMantissa = ieee754::extractMantissa(rhs.data) | (1ull << ieee754::traits::mantissaBitCnt); + + + const uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; + int exp = ieee754::extractExponent(data) - ieee754::extractExponent(rhs.data) + ieee754::traits::exponentBias; + uint64_t mantissa = impl::divMantissas(lhsRealMantissa, rhsRealMantissa); + + if (mantissa & (1ULL << (ieee754::traits::mantissaBitCnt + 1))) + { + mantissa >>= 1; + ++exp; + } + + return createPreserveBitPattern(impl::assembleFloat64(sign, exp, mantissa & ieee754::traits::mantissaMask)); } + // relational operators - bool operator==(emulated_float64_t rhs) + bool operator==(emulated_float64_t_impl rhs) { - if (isnan(data) || isnan(rhs.data)) + if (FastMath && (isnan(data) || isnan(rhs.data))) return false; - const emulated_float64_t xored = emulated_float64_t::createPreserveBitPattern(data ^ rhs.data); + const emulated_float64_t_impl xored = emulated_float64_t_impl::createPreserveBitPattern(data ^ rhs.data); // TODO: check what fast math returns for -0 == 0 if ((xored.data & 0x7FFFFFFFFFFFFFFFull) == 0ull) return true; return !(xored.data); } - bool operator!=(emulated_float64_t rhs) + bool operator!=(emulated_float64_t_impl rhs) { - if (isnan(data) || isnan(rhs.data)) + if (FastMath && (isnan(data) || isnan(rhs.data))) return true; - const emulated_float64_t xored = emulated_float64_t::createPreserveBitPattern(data ^ rhs.data); + const emulated_float64_t_impl xored = emulated_float64_t_impl::createPreserveBitPattern(data ^ rhs.data); // TODO: check what fast math returns for -0 == 0 if ((xored.data & 0x7FFFFFFFFFFFFFFFull) == 0ull) @@ -680,7 +376,7 @@ namespace impl return xored.data; } - bool operator<(emulated_float64_t rhs) + bool operator<(emulated_float64_t_impl rhs) { const uint64_t lhsSign = ieee754::extractSign(data); const uint64_t rhsSign = ieee754::extractSign(rhs.data); @@ -693,7 +389,7 @@ namespace impl return (lhsFlipped & diffBits) < (rhsFlipped & diffBits); } - bool operator>(emulated_float64_t rhs) + bool operator>(emulated_float64_t_impl rhs) { const uint64_t lhsSign = ieee754::extractSign(data); const uint64_t rhsSign = ieee754::extractSign(rhs.data); @@ -706,12 +402,12 @@ namespace impl return (lhsFlipped & diffBits) > (rhsFlipped & diffBits); } - bool operator<=(emulated_float64_t rhs) { return !(emulated_float64_t::createPreserveBitPattern(data) > emulated_float64_t::createPreserveBitPattern(rhs.data)); } - bool operator>=(emulated_float64_t rhs) { return !(emulated_float64_t::createPreserveBitPattern(data) < emulated_float64_t::createPreserveBitPattern(rhs.data)); } + bool operator<=(emulated_float64_t_impl rhs) { return !(emulated_float64_t_impl::createPreserveBitPattern(data) > emulated_float64_t_impl::createPreserveBitPattern(rhs.data)); } + bool operator>=(emulated_float64_t_impl rhs) { return !(emulated_float64_t_impl::createPreserveBitPattern(data) < emulated_float64_t_impl::createPreserveBitPattern(rhs.data)); } 
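// note: <= and >= above are implemented as negations of > and <; this matches
// IEEE754 ordering only when neither operand is NaN, since every ordered comparison
// involving a NaN should return false while !(NaN > x) evaluates to true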
//logical operators - bool operator&&(emulated_float64_t rhs) { return bool(data) && bool(rhs.data); } - bool operator||(emulated_float64_t rhs) { return bool(data) || bool(rhs.data); } + bool operator&&(emulated_float64_t_impl rhs) { return bool(data) && bool(rhs.data); } + bool operator||(emulated_float64_t_impl rhs) { return bool(data) || bool(rhs.data); } bool operator!() { return !bool(data); } // OMITED OPERATORS @@ -720,7 +416,7 @@ namespace impl // - access operators (dereference and addressof) not supported in HLSL // TODO: should modify self? - emulated_float64_t flipSign() + emulated_float64_t_impl flipSign() { return createPreserveBitPattern(data ^ ieee754::traits::signMask); } @@ -731,6 +427,8 @@ namespace impl } }; + using emulated_float64_t = emulated_float64_t_impl; + namespace ieee754 { template<> diff --git a/include/nbl/builtin/hlsl/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754.hlsl index 0a4ff1aa7b..b4a65d785d 100644 --- a/include/nbl/builtin/hlsl/ieee754.hlsl +++ b/include/nbl/builtin/hlsl/ieee754.hlsl @@ -61,8 +61,8 @@ namespace impl template struct traits_base { - NBL_CONSTEXPR_STATIC_INLINE int16_t exponentBitCnt = 0xbeef; - NBL_CONSTEXPR_STATIC_INLINE int16_t mantissaBitCnt = 0xbeef; + NBL_CONSTEXPR_STATIC_INLINE int16_t exponentBitCnt = int16_t(0xbeef); + NBL_CONSTEXPR_STATIC_INLINE int16_t mantissaBitCnt = int16_t(0xbeef); }; template<> @@ -89,6 +89,8 @@ struct traits_base template struct traits : traits_base { + //static_assert(is_same_v || is_same_v || is_same_v); + using bit_rep_t = typename unsigned_integer_of_size::type; using base_t = traits_base; @@ -96,6 +98,7 @@ struct traits : traits_base NBL_CONSTEXPR_STATIC_INLINE bit_rep_t exponentMask = ((~bit_rep_t(0)) << base_t::mantissaBitCnt) ^ signMask; NBL_CONSTEXPR_STATIC_INLINE bit_rep_t mantissaMask = (bit_rep_t(0x1u) << base_t::mantissaBitCnt) - 1; NBL_CONSTEXPR_STATIC_INLINE bit_rep_t exponentBias = (int(0x1) << (base_t::exponentBitCnt - 1)) - 1; + NBL_CONSTEXPR_STATIC_INLINE bit_rep_t inf = exponentMask; }; template diff --git a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl new file mode 100644 index 0000000000..33ea303968 --- /dev/null +++ b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl @@ -0,0 +1,469 @@ +#ifndef _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_IMPL_INCLUDED_ +#define _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_IMPL_INCLUDED_ + +#include +#include +#include +#include + +// TODO: when it will be possible, use this unions wherever they fit: +/* +* union Mantissa +* { +* struct +* { +* uint32_t highBits; +* uint64_t lowBits; +* }; +* +* uint32_t2 packed; +* }; +* +*/ + +/* +* union Mantissa +* { +* struct +* { +* uint64_t lhs; +* uint64_t rhs; +* }; +* +* uint32_t4 packed; +* }; +* +*/ + +namespace nbl +{ +namespace hlsl +{ +namespace impl +{ + struct uint128Mantissa + { + uint64_t highBits; + uint64_t lowBits; + + static uint128Mantissa create(uint64_t mantissa64) + { + uint128Mantissa output; + output.highBits = 0u; + output.lowBits = mantissa64; + + return output; + } + + void shiftLeftByOne() + { + highBits = (highBits << 1) | (lowBits >> 63); + lowBits <<= 1; + } + + void shiftRightByOne() + { + highBits >>= 1; + lowBits = (highBits >> 63) | (lowBits >> 1); + } + + // TODO: more efficient comparisions + bool operator>=(uint128Mantissa rhs) + { + return (highBits > rhs.highBits) || (highBits == rhs.highBits && lowBits >= rhs.lowBits); + } + + bool operator<(uint128Mantissa rhs) + { + return (highBits < rhs.highBits) 
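// lexicographic 128-bit comparison: the high words decide the order and the low
// words break the tie; the shift-and-subtract division loop in divByFloat64 below
// relies on these comparisons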
|| (highBits == rhs.highBits && lowBits < rhs.lowBits); + } + + uint128Mantissa operator-(uint128Mantissa rhs) + { + uint128Mantissa result; + result.lowBits = lowBits - rhs.lowBits; + result.highBits = highBits - rhs.highBits - (lowBits < rhs.lowBits); + return result; + } + + static uint128Mantissa createAsShiftedByMantissaBitCnt(uint64_t mantissa64) + { + uint128Mantissa output; + output.highBits = mantissa64 >> (64 - nbl::hlsl::ieee754::traits::mantissaBitCnt); + output.lowBits = mantissa64 << nbl::hlsl::ieee754::traits::mantissaBitCnt; + + return output; + } + + uint64_t divByFloat64(uint64_t floatRep) + { + uint128Mantissa output = create(0); + + uint128Mantissa divisor = create(floatRep); + uint128Mantissa remainder; + remainder.highBits = highBits; + remainder.lowBits = lowBits; + uint128Mantissa one = create(1); + + + while ((divisor.highBits < (1ULL << 63)) && (divisor < remainder)) + { + divisor.shiftLeftByOne(); + one.shiftLeftByOne(); + } + + while (one.highBits != 0 || one.lowBits != 0) + { + if (remainder >= divisor) + { + remainder = remainder - divisor; + output.highBits |= one.highBits; + output.lowBits |= one.lowBits; + } + output.shiftRightByOne(); + one.shiftRightByOne(); + } + + return output.lowBits; + } + }; + + uint64_t divMantissas(uint64_t lhs, uint64_t rhs) + { + uint128Mantissa lhs128 = uint128Mantissa::createAsShiftedByMantissaBitCnt(lhs); + return lhs128.divByFloat64(rhs); + } + + template + uint64_t promoteToUint64(T val) + { + using AsFloat = typename float_of_size::type; + uint64_t asUint = ieee754::impl::castToUintType(val); + + const uint64_t sign = (uint64_t(ieee754::traits::signMask) & asUint) << (sizeof(float64_t) - sizeof(T)); + const int64_t newExponent = ieee754::extractExponent(val) + ieee754::traits::exponentBias; + + const uint64_t exp = (uint64_t(ieee754::extractExponent(val)) + ieee754::traits::exponentBias) << (ieee754::traits::mantissaBitCnt); + const uint64_t mantissa = (uint64_t(ieee754::traits::mantissaMask) & asUint) << (ieee754::traits::exponentBias - ieee754::traits::mantissaBitCnt); + + return sign | exp | mantissa; + }; + + template<> uint64_t promoteToUint64(float64_t val) { return bit_cast(val); } + + uint32_t2 umulExtended(uint32_t lhs, uint32_t rhs) + { + uint64_t product = uint64_t(lhs) * uint64_t(rhs); + nbl::hlsl::uint32_t2 output; + output.x = uint32_t((product & 0xFFFFFFFF00000000) >> 32); + output.y = uint32_t(product & 0x00000000FFFFFFFFull); + return output; + } + + uint64_t propagateFloat64NaN(uint64_t lhs, uint64_t rhs) + { +#if defined RELAXED_NAN_PROPAGATION + return lhs | rhs; +#else + const bool lhsIsNaN = isnan(bit_cast(lhs)); + const bool rhsIsNaN = isnan(bit_cast(rhs)); + lhs |= 0x0000000000080000ull; + rhs |= 0x0000000000080000ull; + + return lerp(rhs, lerp(lhs, rhs, rhsIsNaN), lhsIsNaN); +#endif + } + + uint64_t packFloat64(uint32_t zSign, int zExp, uint32_t zFrac0, uint32_t zFrac1) + { + nbl::hlsl::uint32_t2 z; + + z.x = zSign + (uint32_t(zExp) << 20) + zFrac0; + z.y = zFrac1; + + uint64_t output = 0u; + output |= (uint64_t(z.x) << 32) & 0xFFFFFFFF00000000ull; + output |= uint64_t(z.y); + return output; + } + + uint32_t2 packUint64(uint64_t val) + { + return uint32_t2((val & 0xFFFFFFFF00000000ull) >> 32, val & 0x00000000FFFFFFFFull); + } + + uint64_t unpackUint64(uint32_t2 val) + { + return ((uint64_t(val.x) & 0x00000000FFFFFFFFull) << 32) | uint64_t(val.y); + } + + nbl::hlsl::uint32_t2 add64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) + { + nbl::hlsl::uint32_t2 output; + output.y = a1 + b1; + output.x 
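// carry out of the low 32-bit addition: unsigned wrap-around makes output.y < a1
// exactly when a1 + b1 overflowed 32 bits, so the high-word sum absorbs the carry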
= a0 + b0 + uint32_t(output.y < a1); + + return output; + } + + + nbl::hlsl::uint32_t2 sub64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) + { + nbl::hlsl::uint32_t2 output; + output.y = a1 - b1; + output.x = a0 - b0 - uint32_t(a1 < b1); + + return output; + } + + // TODO: test + int countLeadingZeros32(uint32_t val) + { +#ifndef __HLSL_VERSION + return 31 - nbl::hlsl::findMSB(val); +#else + return 31 - firstbithigh(val); +#endif + } + + uint32_t2 shift64RightJamming(uint32_t2 val, int count) + { + uint32_t2 output; + const int negCount = (-count) & 31; + + output.x = lerp(0u, val.x, count == 0); + output.x = lerp(output.x, (val.x >> count), count < 32); + + output.y = uint32_t((val.x | val.y) != 0u); /* count >= 64 */ + uint32_t z1_lt64 = (val.x>>(count & 31)) | uint32_t(((val.x<>count) | uint32_t((val.y<> (count & 31)), count < 64); + output.y = lerp(output.y, (val.x << negCount) | (val.y >> count), count < 32); + + val.z = lerp(val.z | val.y, val.z, count < 32); + output.x = lerp(output.x, val.x >> count, count < 32); + output.z |= uint32_t(val.z != 0u); + + output.x = lerp(output.x, 0u, (count == 32)); + output.y = lerp(output.y, val.x, (count == 32)); + output.z = lerp(output.z, val.y, (count == 32)); + output.x = lerp(output.x, val.x, (count == 0)); + output.y = lerp(output.y, val.y, (count == 0)); + output.z = lerp(output.z, val.z, (count == 0)); + + return output; + } + + uint64_t shortShift64Left(uint64_t val, int count) + { + const uint32_t2 packed = packUint64(val); + + nbl::hlsl::uint32_t2 output; + output.y = packed.y << count; + // TODO: fix + output.x = lerp((packed.x << count | (packed.y >> ((-count) & 31))), packed.x, count == 0); + + // y = 3092377600 + // x = 2119009566 + return unpackUint64(output); + }; + + uint64_t assembleFloat64(uint64_t signShifted, uint64_t expShifted, uint64_t mantissa) + { + return signShifted + expShifted + mantissa; + } + + uint64_t roundAndPackFloat64(uint64_t zSign, int zExp, uint32_t3 mantissaExtended) + { + bool roundNearestEven; + bool increment; + + roundNearestEven = true; + increment = int(mantissaExtended.z) < 0; + if (!roundNearestEven) + { + if (false) //(FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) + { + increment = false; + } + else + { + if (false) //(zSign != 0u) + { + //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && + // (zFrac2 != 0u); + } + else + { + //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && + // (zFrac2 != 0u); + } + } + } + if (0x7FD <= zExp) + { + if ((0x7FD < zExp) || ((zExp == 0x7FD) && (0x001FFFFFu == mantissaExtended.x && 0xFFFFFFFFu == mantissaExtended.y) && increment)) + { + if (false) // ((FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) || + // ((zSign != 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP)) || + // ((zSign == 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN))) + { + return packFloat64(zSign, 0x7FE, 0x000FFFFFu, 0xFFFFFFFFu); + } + + return packFloat64(zSign, 0x7FF, 0u, 0u); + } + } + + if (zExp < 0) + { + mantissaExtended = shift64ExtraRightJamming(mantissaExtended, -zExp); + zExp = 0; + + if (roundNearestEven) + { + increment = mantissaExtended.z < 0u; + } + else + { + if (zSign != 0u) + { + increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && (mantissaExtended.z != 0u); + } + else + { + increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && (mantissaExtended.z != 0u); + } + } + } + + if (increment) + { + const uint64_t added = impl::unpackUint64(uint32_t2(mantissaExtended.xy)) + 1ull; + mantissaExtended.xy = packUint64(added); + mantissaExtended.y &= ~((mantissaExtended.z 
+ uint32_t(mantissaExtended.z == 0u)) & uint32_t(roundNearestEven)); + } + else + { + zExp = lerp(zExp, 0, (mantissaExtended.x | mantissaExtended.y) == 0u); + } + + return assembleFloat64(zSign, uint64_t(zExp) << nbl::hlsl::ieee754::traits::mantissaBitCnt, unpackUint64(mantissaExtended.xy)); + } + + uint64_t normalizeRoundAndPackFloat64(uint64_t sign, int exp, uint32_t frac0, uint32_t frac1) + { + int shiftCount; + nbl::hlsl::uint32_t3 frac = nbl::hlsl::uint32_t3(frac0, frac1, 0u); + + if (frac.x == 0u) + { + exp -= 32; + frac.x = frac.y; + frac.y = 0u; + } + + shiftCount = countLeadingZeros32(frac.x) - 11; + if (0 <= shiftCount) + { + frac.xy = shortShift64Left(unpackUint64(frac.xy), shiftCount); + } + else + { + frac.xyz = shift64ExtraRightJamming(uint32_t3(frac.xy, 0), -shiftCount); + } + exp -= shiftCount; + return roundAndPackFloat64(sign, exp, frac); + } + + void normalizeFloat64Subnormal(uint64_t mantissa, + NBL_REF_ARG(int) outExp, + NBL_REF_ARG(uint64_t) outMantissa) + { + uint32_t2 mantissaPacked = packUint64(mantissa); + int shiftCount; + uint32_t2 temp; + shiftCount = countLeadingZeros32(lerp(mantissaPacked.x, mantissaPacked.y, mantissaPacked.x == 0u)) - 11; + outExp = lerp(1 - shiftCount, -shiftCount - 31, mantissaPacked.x == 0u); + + temp.x = lerp(mantissaPacked.y << shiftCount, mantissaPacked.y >> (-shiftCount), shiftCount < 0); + temp.y = lerp(0u, mantissaPacked.y << (shiftCount & 31), shiftCount < 0); + + shortShift64Left(impl::unpackUint64(mantissaPacked), shiftCount); + + outMantissa = lerp(outMantissa, unpackUint64(temp), mantissaPacked.x == 0); + } + + bool areBothSameSignInfinity(uint64_t lhs, uint64_t rhs) + { + lhs ^= ieee754::traits::signMask; + rhs ^= ieee754::traits::signMask; + + bool output = lhs == rhs && ieee754::traits::inf; + bool output = output && ((lhs & (~ieee754::traits::signMask)) == ieee754::traits::inf); + + return output; + } + +} + +#endif \ No newline at end of file diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index e1e71c542c..9b8676160f 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -231,6 +231,8 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/normalization/shared_nor # HLSL LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/macros.h") +#impl +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/impl/emulated_float64_t_impl.hlsl") #emulated LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated_float64_t.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath.hlsl") From a0dfdb22fae3624fdd114fd01ab5f40d2dbcb612 Mon Sep 17 00:00:00 2001 From: Przemek Date: Tue, 30 Jul 2024 18:27:05 +0200 Subject: [PATCH 022/358] Fixes --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 396 +++++++++--------- include/nbl/builtin/hlsl/ieee754.hlsl | 4 +- .../hlsl/impl/emulated_float64_t_impl.hlsl | 235 ++++++----- include/nbl/builtin/hlsl/tgmath.hlsl | 2 +- 5 files changed, 338 insertions(+), 301 deletions(-) diff --git a/examples_tests b/examples_tests index 25dfb67454..c53f5c8186 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 25dfb6745482473bb63c893f8ae73770d2698983 +Subproject commit c53f5c8186780516833b2ffadba277c131bbc9fb diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index 8e8a26d6b3..fd156af407 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -1,80 +1,70 @@ #ifndef 
_NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_INCLUDED_ #define _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_INCLUDED_ -#include -#include -#include -#include #include - -#define FLOAT_ROUND_NEAREST_EVEN 0 -#define FLOAT_ROUND_TO_ZERO 1 -#define FLOAT_ROUND_DOWN 2 -#define FLOAT_ROUND_UP 3 -#define FLOAT_ROUNDING_MODE FLOAT_ROUND_NEAREST_EVEN namespace nbl { namespace hlsl { - template - struct emulated_float64_t_impl + template + struct emulated_float64_t { using storage_t = uint64_t; storage_t data; // constructors - /*static emulated_float64_t_impl create(uint16_t val) + /*static emulated_float64_t create(uint16_t val) { - return emulated_float64_t_impl(bit_cast(float64_t(val))); + return emulated_float64_t(bit_cast(float64_t(val))); }*/ - static emulated_float64_t_impl create(int32_t val) + static emulated_float64_t create(int32_t val) { - return emulated_float64_t_impl(bit_cast(float64_t(val))); + return emulated_float64_t(bit_cast(float64_t(val))); } - static emulated_float64_t_impl create(int64_t val) + static emulated_float64_t create(int64_t val) { - return emulated_float64_t_impl(bit_cast(float64_t(val))); + return emulated_float64_t(bit_cast(float64_t(val))); } - static emulated_float64_t_impl create(uint32_t val) + static emulated_float64_t create(uint32_t val) { - return emulated_float64_t_impl(bit_cast(float64_t(val))); + return emulated_float64_t(bit_cast(float64_t(val))); } - static emulated_float64_t_impl create(uint64_t val) + static emulated_float64_t create(uint64_t val) { - return emulated_float64_t_impl(bit_cast(float64_t(val))); + return emulated_float64_t(bit_cast(float64_t(val))); } - static emulated_float64_t_impl create(float64_t val) + static emulated_float64_t create(float64_t val) { - return emulated_float64_t_impl(bit_cast(val)); + return emulated_float64_t(bit_cast(val)); } // TODO: unresolved external symbol imath_half_to_float_table - /*static emulated_float64_t_impl create(float16_t val) + /*static emulated_float64_t create(float16_t val) { - return emulated_float64_t_impl(bit_cast(float64_t(val))); + return emulated_float64_t(bit_cast(float64_t(val))); }*/ - static emulated_float64_t_impl create(float32_t val) + static emulated_float64_t create(float32_t val) { - return emulated_float64_t_impl(bit_cast(float64_t(val))); + return emulated_float64_t(bit_cast(float64_t(val))); } - static emulated_float64_t_impl createPreserveBitPattern(uint64_t val) + static emulated_float64_t createPreserveBitPattern(uint64_t val) { - return emulated_float64_t_impl(val); + return emulated_float64_t(val); } // arithmetic operators - emulated_float64_t_impl operator+(const emulated_float64_t_impl rhs) + emulated_float64_t operator+(const emulated_float64_t rhs) { - emulated_float64_t_impl retval = createPreserveBitPattern(0u); + emulated_float64_t retval = createPreserveBitPattern(0u); uint64_t mantissa; uint32_t3 mantissaExtended; @@ -93,11 +83,11 @@ namespace hlsl { if (expDiff == 0) { - //if (lhsExp == 0x7FF) - //{ - // bool propagate = (lhsMantissa | rhsMantissa) != 0u; - // return createPreserveBitPattern(lerp(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); - //} + if (lhsBiasedExp == ieee754::traits::specialValueExp) + { + bool propagate = (lhsMantissa | rhsMantissa) != 0u; + return createPreserveBitPattern(lerp(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); + } mantissa = lhsMantissa + rhsMantissa; if (lhsBiasedExp == 0) @@ -143,7 +133,7 @@ namespace hlsl } // cannot happen but compiler cries about not every path returning value - return 
createPreserveBitPattern(0xdeadbeefbadcaffeull); + return createPreserveBitPattern(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended)); } else { @@ -161,11 +151,11 @@ namespace hlsl lhsSign ^= ieee754::traits::signMask; } - //if (lhsExp == 0x7FF) - //{ - // bool propagate = (lhsHigh | lhsLow) != 0u; - // return nbl::hlsl::lerp(__packFloat64(lhsSign, 0x7ff, 0u, 0u), __propagateFloat64NaN(a, b), propagate); - //} + if (lhsBiasedExp == 0x7FF) + { + bool propagate = lhsMantissa != 0u; + return createPreserveBitPattern(lerp(impl::assembleFloat64(lhsSign, ieee754::traits::exponentMask, 0ull), impl::propagateFloat64NaN(data, rhs.data), propagate)); + } expDiff = lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); rhsMantissa = lerp(rhsMantissa | 0x4000000000000000ull, rhsMantissa, rhsBiasedExp == 0); @@ -176,11 +166,11 @@ namespace hlsl --biasedExp; return createPreserveBitPattern(impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 10, frac.x, frac.y)); } - //if (lhsExp == 0x7FF) - //{ - // bool propagate = ((lhsHigh | rhsHigh) | (lhsLow | rhsLow)) != 0u; - // return nbl::hlsl::lerp(0xFFFFFFFFFFFFFFFFUL, __propagateFloat64NaN(a, b), propagate); - //} + if (lhsBiasedExp == ieee754::traits::specialValueExp) + { + bool propagate = ((lhsMantissa) | (rhsMantissa)) != 0u; + return createPreserveBitPattern(lerp(ieee754::traits::quietNaN, impl::propagateFloat64NaN(data, rhs.data), propagate)); + } rhsBiasedExp = lerp(rhsBiasedExp, 1, lhsBiasedExp == 0); lhsBiasedExp = lerp(lhsBiasedExp, 1, lhsBiasedExp == 0); @@ -218,17 +208,17 @@ namespace hlsl } } - emulated_float64_t_impl operator-(emulated_float64_t_impl rhs) + emulated_float64_t operator-(emulated_float64_t rhs) { - emulated_float64_t_impl lhs = createPreserveBitPattern(data); - emulated_float64_t_impl rhsFlipped = rhs.flipSign(); + emulated_float64_t lhs = createPreserveBitPattern(data); + emulated_float64_t rhsFlipped = rhs.flipSign(); return lhs + rhsFlipped; } - emulated_float64_t_impl operator*(emulated_float64_t_impl rhs) + emulated_float64_t operator*(emulated_float64_t rhs) { - emulated_float64_t_impl retval = emulated_float64_t_impl::createPreserveBitPattern(0u); + emulated_float64_t retval = emulated_float64_t::createPreserveBitPattern(0u); uint64_t lhsSign = data & ieee754::traits::signMask; uint64_t rhsSign = rhs.data & ieee754::traits::signMask; @@ -239,145 +229,129 @@ namespace hlsl int exp = int(lhsBiasedExp + rhsBiasedExp) - ieee754::traits::exponentBias; uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; - - if (lhsBiasedExp == 0x7FF) + if (!FastMath) { - if ((lhsMantissa != 0u) || - ((rhsBiasedExp == 0x7FF) && (rhsMantissa != 0u))) { - return createPreserveBitPattern(impl::propagateFloat64NaN(data, rhs.data)); - } - if ((uint64_t(rhsBiasedExp) | rhsMantissa) == 0u) - return createPreserveBitPattern(0xFFFFFFFFFFFFFFFFull); + if (lhsBiasedExp == ieee754::traits::specialValueExp) + { + if ((lhsMantissa != 0u) || ((rhsBiasedExp == 0x7FF) && (rhsMantissa != 0u))) + return createPreserveBitPattern(impl::propagateFloat64NaN(data, rhs.data)); + if ((uint64_t(rhsBiasedExp) | rhsMantissa) == 0u) + return createPreserveBitPattern(ieee754::traits::quietNaN); - return createPreserveBitPattern(impl::assembleFloat64(sign, ieee754::traits::exponentMask, 0ull)); - } - if (rhsBiasedExp == 0x7FF) - { - /* a cannot be NaN, but is b NaN? 
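(if b is NaN it is propagated just below; otherwise b is +/- infinity, so a zero lhs
makes the product 0 * inf and yields the canonical quiet NaN, while any other finite
lhs produces a correctly signed infinity)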
*/ - if (rhsMantissa != 0u) + return createPreserveBitPattern(impl::assembleFloat64(sign, ieee754::traits::exponentMask, 0ull)); + } + if (rhsBiasedExp == ieee754::traits::specialValueExp) + { + /* a cannot be NaN, but is b NaN? */ + if (rhsMantissa != 0u) #ifdef RELAXED_NAN_PROPAGATION - return rhs.data; + return rhs.data; #else - return createPreserveBitPattern(impl::propagateFloat64NaN(data, rhs.data)); + return createPreserveBitPattern(impl::propagateFloat64NaN(data, rhs.data)); #endif - if ((uint64_t(lhsBiasedExp) | lhsMantissa) == 0u) - return createPreserveBitPattern(0xFFFFFFFFFFFFFFFFull); + if ((uint64_t(lhsBiasedExp) | lhsMantissa) == 0u) + return createPreserveBitPattern(ieee754::traits::quietNaN); - return createPreserveBitPattern(sign | ieee754::traits::exponentMask); - } - if (lhsBiasedExp == 0) - { - if (lhsMantissa == 0u) - return createPreserveBitPattern(sign); - impl::normalizeFloat64Subnormal(lhsMantissa, lhsBiasedExp, lhsMantissa); - } - if (rhsBiasedExp == 0) - { - if (rhsMantissa == 0u) - return createPreserveBitPattern(sign); - impl::normalizeFloat64Subnormal(rhsMantissa, rhsBiasedExp, rhsMantissa); - } - - if (false) - { - lhsMantissa |= 1ull << 52; - rhsMantissa = impl::shortShift64Left(rhsMantissa, 12); - - uint32_t4 mantissasPacked; - mantissasPacked.xy = impl::packUint64(lhsMantissa); - mantissasPacked.zw = impl::packUint64(rhsMantissa); - - mantissasPacked = impl::mul64to128(mantissasPacked); - - mantissasPacked.xy = impl::packUint64(impl::unpackUint64(mantissasPacked.xy) + lhsMantissa); - mantissasPacked.z |= uint32_t(mantissasPacked.w != 0u); - if (0x00200000u <= mantissasPacked.x) + return createPreserveBitPattern(sign | ieee754::traits::exponentMask); + } + if (lhsBiasedExp == 0) { - mantissasPacked = uint32_t4(impl::shift64ExtraRightJamming(mantissasPacked.xyz, 1), 0u); - ++exp; + if (lhsMantissa == 0u) + return createPreserveBitPattern(sign); + impl::normalizeFloat64Subnormal(lhsMantissa, lhsBiasedExp, lhsMantissa); } - - return createPreserveBitPattern(impl::roundAndPackFloat64(sign, exp, mantissasPacked.xyz)); - } - else - { - lhsMantissa |= 1ull << 52; - rhsMantissa |= 1ull << 52; - - uint32_t2 lhsPacked = impl::packUint64(lhsMantissa); - uint32_t2 rhsPacked = impl::packUint64(rhsMantissa); - uint64_t lhsHigh = lhsPacked.x; - uint64_t lhsLow = lhsPacked.y; - uint64_t rhsHigh = rhsPacked.x; - uint64_t rhsLow = rhsPacked.y; - - //((hi_lhs * hi_rhs) << 11) + ((hi_lhs * lo_rhs + lo_lhs * hi_rhs) >> 37) - - uint64_t newPseudoMantissa = ((lhsHigh * rhsHigh) << 11) + ((lhsHigh * rhsLow + lhsLow * rhsHigh) >> 37); - newPseudoMantissa <<= 1; - //newPseudoMantissa >>= 52; - /*if (newPseudoMantissa >= (1ull << 52)) + if (rhsBiasedExp == 0) { - newPseudoMantissa >>= 1; - ++exp; - }*/ + if (rhsMantissa == 0u) + return createPreserveBitPattern(sign); + impl::normalizeFloat64Subnormal(rhsMantissa, rhsBiasedExp, rhsMantissa); + } + } + + const uint64_t hi_l = (lhsMantissa >> 21) | (1ull << 31); + const uint64_t lo_l = lhsMantissa & ((1ull << 21) - 1); + const uint64_t hi_r = (rhsMantissa >> 21) | (1ull << 31); + const uint64_t lo_r = rhsMantissa & ((1ull << 21) - 1); - return createPreserveBitPattern(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, newPseudoMantissa & ieee754::traits::mantissaMask)); + //const uint64_t RoundToNearest = (1ull << 31) - 1; + uint64_t newPseudoMantissa = ((hi_l * hi_r) >> 10) + ((hi_l * lo_r + lo_l * hi_r/* + RoundToNearest*/) >> 31); + + if (newPseudoMantissa & (0x1ull << 53)) + { + newPseudoMantissa >>= 1; + 
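// with m = hi*2^21 + lo (hi carries the implicit leading 1 at bit 31, lo < 2^21):
//   m_l*m_r = hi_l*hi_r*2^42 + (hi_l*lo_r + lo_l*hi_r)*2^21 + lo_l*lo_r
// so (m_l*m_r) >> 52 is approximated by (hi_l*hi_r >> 10) + ((hi_l*lo_r + lo_l*hi_r) >> 31),
// with the lo_l*lo_r term (< 2^42) truncated away; since both significands lie in
// [2^52, 2^53) the truncated product lies in [2^52, 2^54), hence the one-bit
// renormalization above must be paired with the exponent increment below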
++exp; } + newPseudoMantissa &= (ieee754::traits::mantissaMask); - + return createPreserveBitPattern(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, newPseudoMantissa)); } - emulated_float64_t_impl operator/(const emulated_float64_t_impl rhs) + emulated_float64_t operator/(const emulated_float64_t rhs) { - // TODO: maybe add function to extract real mantissa const uint64_t lhsRealMantissa = (ieee754::extractMantissa(data) | (1ull << ieee754::traits::mantissaBitCnt)); const uint64_t rhsRealMantissa = ieee754::extractMantissa(rhs.data) | (1ull << ieee754::traits::mantissaBitCnt); - const uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; int exp = ieee754::extractExponent(data) - ieee754::extractExponent(rhs.data) + ieee754::traits::exponentBias; - uint64_t mantissa = impl::divMantissas(lhsRealMantissa, rhsRealMantissa); - if (mantissa & (1ULL << (ieee754::traits::mantissaBitCnt + 1))) + uint64_t2 lhsMantissaShifted = impl::shiftMantissaLeftBy52(lhsRealMantissa); + uint64_t mantissa = impl::divmod128by64(lhsMantissaShifted.x, lhsMantissaShifted.y, rhsRealMantissa).x; + + if (mantissa & (0x1ull << 53)) { - mantissa >>= 1; ++exp; } + else + { + mantissa >>= 1; + } - return createPreserveBitPattern(impl::assembleFloat64(sign, exp, mantissa & ieee754::traits::mantissaMask)); - } + mantissa &= ieee754::traits::mantissaMask; + return createPreserveBitPattern(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, mantissa)); + } // relational operators - bool operator==(emulated_float64_t_impl rhs) + bool operator==(emulated_float64_t rhs) { - if (FastMath && (isnan(data) || isnan(rhs.data))) + if (!FastMath && (isnan(data) || isnan(rhs.data))) return false; + // TODO: i'm not sure about this one + if (!FastMath && impl::areBothZero(data, rhs.data)) + return true; - const emulated_float64_t_impl xored = emulated_float64_t_impl::createPreserveBitPattern(data ^ rhs.data); + const emulated_float64_t xored = emulated_float64_t::createPreserveBitPattern(data ^ rhs.data); // TODO: check what fast math returns for -0 == 0 if ((xored.data & 0x7FFFFFFFFFFFFFFFull) == 0ull) return true; return !(xored.data); } - bool operator!=(emulated_float64_t_impl rhs) + bool operator!=(emulated_float64_t rhs) { - if (FastMath && (isnan(data) || isnan(rhs.data))) + if (!FastMath && (isnan(data) || isnan(rhs.data))) return true; + // TODO: i'm not sure about this one + if (!FastMath && impl::areBothSameSignZero(data, rhs.data)) + return false; - const emulated_float64_t_impl xored = emulated_float64_t_impl::createPreserveBitPattern(data ^ rhs.data); - + const emulated_float64_t xored = emulated_float64_t::createPreserveBitPattern(data ^ rhs.data); // TODO: check what fast math returns for -0 == 0 if ((xored.data & 0x7FFFFFFFFFFFFFFFull) == 0ull) return false; return xored.data; } - bool operator<(emulated_float64_t_impl rhs) + bool operator<(emulated_float64_t rhs) { + if (!FastMath && (isnan(data) || isnan(rhs.data))) + return false; + if (!FastMath && impl::areBothSameSignInfinity(data, rhs.data)) + return false; + if (!FastMath && impl::areBothZero(data, rhs.data)) + return false; + const uint64_t lhsSign = ieee754::extractSign(data); const uint64_t rhsSign = ieee754::extractSign(rhs.data); @@ -389,8 +363,15 @@ namespace hlsl return (lhsFlipped & diffBits) < (rhsFlipped & diffBits); } - bool operator>(emulated_float64_t_impl rhs) + bool operator>(emulated_float64_t rhs) { + if (!FastMath && (isnan(data) || isnan(rhs.data))) + return true; + if (!FastMath 
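// the FastMath template flag: when it is true these NaN / same-sign-infinity /
// signed-zero special cases are skipped and only the cheap bit-pattern comparison
// remains; FastMath is a compile-time parameter, so the dead branches can be
// optimized away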
&& impl::areBothSameSignInfinity(data, rhs.data)) + return false; + if (!FastMath && impl::areBothZero(data, rhs.data)) + return false; + const uint64_t lhsSign = ieee754::extractSign(data); const uint64_t rhsSign = ieee754::extractSign(rhs.data); @@ -402,12 +383,12 @@ namespace hlsl return (lhsFlipped & diffBits) > (rhsFlipped & diffBits); } - bool operator<=(emulated_float64_t_impl rhs) { return !(emulated_float64_t_impl::createPreserveBitPattern(data) > emulated_float64_t_impl::createPreserveBitPattern(rhs.data)); } - bool operator>=(emulated_float64_t_impl rhs) { return !(emulated_float64_t_impl::createPreserveBitPattern(data) < emulated_float64_t_impl::createPreserveBitPattern(rhs.data)); } + bool operator<=(emulated_float64_t rhs) { return !(emulated_float64_t::createPreserveBitPattern(data) > emulated_float64_t::createPreserveBitPattern(rhs.data)); } + bool operator>=(emulated_float64_t rhs) { return !(emulated_float64_t::createPreserveBitPattern(data) < emulated_float64_t::createPreserveBitPattern(rhs.data)); } //logical operators - bool operator&&(emulated_float64_t_impl rhs) { return bool(data) && bool(rhs.data); } - bool operator||(emulated_float64_t_impl rhs) { return bool(data) || bool(rhs.data); } + bool operator&&(emulated_float64_t rhs) { return bool(data) && bool(rhs.data); } + bool operator||(emulated_float64_t rhs) { return bool(data) || bool(rhs.data); } bool operator!() { return !bool(data); } // OMITED OPERATORS @@ -416,7 +397,7 @@ namespace hlsl // - access operators (dereference and addressof) not supported in HLSL // TODO: should modify self? - emulated_float64_t_impl flipSign() + emulated_float64_t flipSign() { return createPreserveBitPattern(data ^ ieee754::traits::signMask); } @@ -427,47 +408,83 @@ namespace hlsl } }; - using emulated_float64_t = emulated_float64_t_impl; +#define COMMA , +#define IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(Type) \ +template<>\ +struct traits_base\ +{\ + NBL_CONSTEXPR_STATIC_INLINE int16_t exponentBitCnt = 11;\ + NBL_CONSTEXPR_STATIC_INLINE int16_t mantissaBitCnt = 52;\ +};\ +template<>\ +uint32_t extractBiasedExponent(Type x)\ +{\ + return extractBiasedExponent(x.data);\ +}\ +\ +template<>\ +int extractExponent(Type x)\ +{\ + return extractExponent(x.data);\ +}\ +\ +template<>\ +Type replaceBiasedExponent(Type x, typename unsigned_integer_of_size::type biasedExp)\ +{\ + return Type(replaceBiasedExponent(x.data, biasedExp));\ +}\ +\ +template <>\ +Type fastMulExp2(Type x, int n)\ +{\ + return Type(replaceBiasedExponent(x.data, extractBiasedExponent(x) + uint32_t(n)));\ +}\ +\ +template <>\ +unsigned_integer_of_size::type extractMantissa(Type x)\ +{\ + return extractMantissa(x.data);\ +}\ + + namespace ieee754 { - template<> - struct traits_base - { - NBL_CONSTEXPR_STATIC_INLINE int16_t exponentBitCnt = 11; - NBL_CONSTEXPR_STATIC_INLINE int16_t mantissaBitCnt = 52; - }; - - template<> - uint32_t extractBiasedExponent(emulated_float64_t x) - { - return extractBiasedExponent(x.data); - } - - template<> - int extractExponent(emulated_float64_t x) - { - return extractExponent(x.data); - } - - template<> - emulated_float64_t replaceBiasedExponent(emulated_float64_t x, typename unsigned_integer_of_size::type biasedExp) - { - return emulated_float64_t(replaceBiasedExponent(x.data, biasedExp)); - } - - //// performs no overflow tests, returns x*exp2(n) - template <> - emulated_float64_t fastMulExp2(emulated_float64_t x, int n) - { - return emulated_float64_t(replaceBiasedExponent(x.data, extractBiasedExponent(x) + uint32_t(n))); - } 
- - template <> - unsigned_integer_of_size::type extractMantissa(emulated_float64_t x) - { - return extractMantissa(x.data); - } + IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); + IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); + IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); + IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); + + //template<> + //uint32_t extractBiasedExponent(emulated_float64_t x) + //{ + // return extractBiasedExponent(x.data); + //} + + //template<> + //int extractExponent(emulated_float64_t x) + //{ + // return extractExponent(x.data); + //} + + //template<> + // emulated_float64_t replaceBiasedExponent(emulated_float64_t x, typename unsigned_integer_of_size::type biasedExp) + //{ + // return emulated_float64_t(replaceBiasedExponent(x.data, biasedExp)); + //} + + ////// performs no overflow tests, returns x*exp2(n) + //template <> + // emulated_float64_t fastMulExp2(emulated_float64_t x, int n) + //{ + // return emulated_float64_t(replaceBiasedExponent(x.data, extractBiasedExponent(x) + uint32_t(n))); + //} + + //template <> + //unsigned_integer_of_size::type extractMantissa(emulated_float64_t x) + //{ + // return extractMantissa(x.data); + //} } } @@ -479,4 +496,7 @@ namespace ieee754 #undef FLOAT_ROUND_UP #undef FLOAT_ROUNDING_MODE +#undef COMMA +#undef IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE + #endif diff --git a/include/nbl/builtin/hlsl/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754.hlsl index b4a65d785d..606c415fdb 100644 --- a/include/nbl/builtin/hlsl/ieee754.hlsl +++ b/include/nbl/builtin/hlsl/ieee754.hlsl @@ -97,8 +97,10 @@ struct traits : traits_base NBL_CONSTEXPR_STATIC_INLINE bit_rep_t signMask = bit_rep_t(0x1u) << (sizeof(Float) * 8 - 1); NBL_CONSTEXPR_STATIC_INLINE bit_rep_t exponentMask = ((~bit_rep_t(0)) << base_t::mantissaBitCnt) ^ signMask; NBL_CONSTEXPR_STATIC_INLINE bit_rep_t mantissaMask = (bit_rep_t(0x1u) << base_t::mantissaBitCnt) - 1; - NBL_CONSTEXPR_STATIC_INLINE bit_rep_t exponentBias = (int(0x1) << (base_t::exponentBitCnt - 1)) - 1; + NBL_CONSTEXPR_STATIC_INLINE int exponentBias = (int(0x1) << (base_t::exponentBitCnt - 1)) - 1; NBL_CONSTEXPR_STATIC_INLINE bit_rep_t inf = exponentMask; + NBL_CONSTEXPR_STATIC_INLINE bit_rep_t specialValueExp = (1ull << base_t::exponentBitCnt) - 1; + NBL_CONSTEXPR_STATIC_INLINE bit_rep_t quietNaN = exponentMask | (1ull << (base_t::mantissaBitCnt - 1)); }; template diff --git a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl index 33ea303968..63c99d4321 100644 --- a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl @@ -6,6 +6,12 @@ #include #include +#define FLOAT_ROUND_NEAREST_EVEN 0 +#define FLOAT_ROUND_TO_ZERO 1 +#define FLOAT_ROUND_DOWN 2 +#define FLOAT_ROUND_UP 3 +#define FLOAT_ROUNDING_MODE FLOAT_ROUND_NEAREST_EVEN + // TODO: when it will be possible, use this unions wherever they fit: /* * union Mantissa @@ -41,97 +47,13 @@ namespace hlsl { namespace impl { - struct uint128Mantissa + uint64_t2 shiftMantissaLeftBy52(uint64_t mantissa64) { - uint64_t highBits; - uint64_t lowBits; - - static uint128Mantissa create(uint64_t mantissa64) - { - uint128Mantissa output; - output.highBits = 0u; - output.lowBits = mantissa64; - - return output; - } - - void shiftLeftByOne() - { - highBits = (highBits << 1) | (lowBits >> 63); - lowBits <<= 1; - } - - void shiftRightByOne() 
- { - highBits >>= 1; - lowBits = (highBits >> 63) | (lowBits >> 1); - } - - // TODO: more efficient comparisions - bool operator>=(uint128Mantissa rhs) - { - return (highBits > rhs.highBits) || (highBits == rhs.highBits && lowBits >= rhs.lowBits); - } - - bool operator<(uint128Mantissa rhs) - { - return (highBits < rhs.highBits) || (highBits == rhs.highBits && lowBits < rhs.lowBits); - } - - uint128Mantissa operator-(uint128Mantissa rhs) - { - uint128Mantissa result; - result.lowBits = lowBits - rhs.lowBits; - result.highBits = highBits - rhs.highBits - (lowBits < rhs.lowBits); - return result; - } - - static uint128Mantissa createAsShiftedByMantissaBitCnt(uint64_t mantissa64) - { - uint128Mantissa output; - output.highBits = mantissa64 >> (64 - nbl::hlsl::ieee754::traits::mantissaBitCnt); - output.lowBits = mantissa64 << nbl::hlsl::ieee754::traits::mantissaBitCnt; + uint64_t2 output; + output.x = mantissa64 >> (64 - ieee754::traits::mantissaBitCnt); + output.y = mantissa64 << ieee754::traits::mantissaBitCnt; - return output; - } - - uint64_t divByFloat64(uint64_t floatRep) - { - uint128Mantissa output = create(0); - - uint128Mantissa divisor = create(floatRep); - uint128Mantissa remainder; - remainder.highBits = highBits; - remainder.lowBits = lowBits; - uint128Mantissa one = create(1); - - - while ((divisor.highBits < (1ULL << 63)) && (divisor < remainder)) - { - divisor.shiftLeftByOne(); - one.shiftLeftByOne(); - } - - while (one.highBits != 0 || one.lowBits != 0) - { - if (remainder >= divisor) - { - remainder = remainder - divisor; - output.highBits |= one.highBits; - output.lowBits |= one.lowBits; - } - output.shiftRightByOne(); - one.shiftRightByOne(); - } - - return output.lowBits; - } - }; - - uint64_t divMantissas(uint64_t lhs, uint64_t rhs) - { - uint128Mantissa lhs128 = uint128Mantissa::createAsShiftedByMantissaBitCnt(lhs); - return lhs128.divByFloat64(rhs); + return output; } template @@ -154,7 +76,7 @@ namespace impl uint32_t2 umulExtended(uint32_t lhs, uint32_t rhs) { uint64_t product = uint64_t(lhs) * uint64_t(rhs); - nbl::hlsl::uint32_t2 output; + uint32_t2 output; output.x = uint32_t((product & 0xFFFFFFFF00000000) >> 32); output.y = uint32_t(product & 0x00000000FFFFFFFFull); return output; @@ -165,18 +87,18 @@ namespace impl #if defined RELAXED_NAN_PROPAGATION return lhs | rhs; #else - const bool lhsIsNaN = isnan(bit_cast(lhs)); - const bool rhsIsNaN = isnan(bit_cast(rhs)); - lhs |= 0x0000000000080000ull; - rhs |= 0x0000000000080000ull; - return lerp(rhs, lerp(lhs, rhs, rhsIsNaN), lhsIsNaN); + lhs |= 0x0008000000000000ull; + rhs |= 0x0008000000000000ull; + return lerp(rhs, lerp(lhs, rhs, isnan(rhs)), isnan(lhs)); + return 0; #endif } + uint64_t packFloat64(uint32_t zSign, int zExp, uint32_t zFrac0, uint32_t zFrac1) { - nbl::hlsl::uint32_t2 z; + uint32_t2 z; z.x = zSign + (uint32_t(zExp) << 20) + zFrac0; z.y = zFrac1; @@ -197,9 +119,9 @@ namespace impl return ((uint64_t(val.x) & 0x00000000FFFFFFFFull) << 32) | uint64_t(val.y); } - nbl::hlsl::uint32_t2 add64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) + uint32_t2 add64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) { - nbl::hlsl::uint32_t2 output; + uint32_t2 output; output.y = a1 + b1; output.x = a0 + b0 + uint32_t(output.y < a1); @@ -207,9 +129,9 @@ namespace impl } - nbl::hlsl::uint32_t2 sub64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) + uint32_t2 sub64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) { - nbl::hlsl::uint32_t2 output; + uint32_t2 output; output.y = a1 - b1; output.x = a0 - b0 - 
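// borrow from the low word: a1 < b1 detects the 32-bit underflow of a1 - b1, and
// one is subtracted from the high-word difference to compensate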
uint32_t(a1 < b1); @@ -220,7 +142,7 @@ namespace impl int countLeadingZeros32(uint32_t val) { #ifndef __HLSL_VERSION - return 31 - nbl::hlsl::findMSB(val); + return 31 - findMSB(val); #else return 31 - firstbithigh(val); #endif @@ -285,7 +207,7 @@ namespace impl return output; } - nbl::hlsl::uint32_t3 shift64ExtraRightJamming(uint32_t3 val, int count) + uint32_t3 shift64ExtraRightJamming(uint32_t3 val, int count) { uint32_t3 output; output.x = 0u; @@ -317,13 +239,11 @@ namespace impl { const uint32_t2 packed = packUint64(val); - nbl::hlsl::uint32_t2 output; + uint32_t2 output; output.y = packed.y << count; // TODO: fix output.x = lerp((packed.x << count | (packed.y >> ((-count) & 31))), packed.x, count == 0); - // y = 3092377600 - // x = 2119009566 return unpackUint64(output); }; @@ -407,13 +327,13 @@ namespace impl zExp = lerp(zExp, 0, (mantissaExtended.x | mantissaExtended.y) == 0u); } - return assembleFloat64(zSign, uint64_t(zExp) << nbl::hlsl::ieee754::traits::mantissaBitCnt, unpackUint64(mantissaExtended.xy)); + return assembleFloat64(zSign, uint64_t(zExp) << ieee754::traits::mantissaBitCnt, unpackUint64(mantissaExtended.xy)); } uint64_t normalizeRoundAndPackFloat64(uint64_t sign, int exp, uint32_t frac0, uint32_t frac1) { int shiftCount; - nbl::hlsl::uint32_t3 frac = nbl::hlsl::uint32_t3(frac0, frac1, 0u); + uint32_t3 frac = uint32_t3(frac0, frac1, 0u); if (frac.x == 0u) { @@ -425,7 +345,8 @@ namespace impl shiftCount = countLeadingZeros32(frac.x) - 11; if (0 <= shiftCount) { - frac.xy = shortShift64Left(unpackUint64(frac.xy), shiftCount); + // TODO: this is packing and unpacking madness, fix it + frac.xy = packUint64(shortShift64Left(unpackUint64(frac.xy), shiftCount)); } else { @@ -459,11 +380,105 @@ namespace impl rhs ^= ieee754::traits::signMask; bool output = lhs == rhs && ieee754::traits::inf; - bool output = output && ((lhs & (~ieee754::traits::signMask)) == ieee754::traits::inf); + output = output && ((lhs & (~ieee754::traits::signMask)) == ieee754::traits::inf); return output; } -} + bool areBothZero(uint64_t lhs, uint64_t rhs) + { + return ((lhs << 1) == 0ull) && ((rhs << 1) == 0ull); + } + + bool areBothSameSignZero(uint64_t lhs, uint64_t rhs) + { + return ((lhs << 1) == 0ull) && (lhs == rhs); + } + + // TODO: find more efficient algorithm + uint64_t nlz64(uint64_t x) + { + static const uint64_t MASK = 1ull << 63; + + uint64_t counter = 0; + + while ((x & MASK) == 0) + { + x <<= 1; + ++counter; + } + return counter; + } + + uint64_t2 divmod128by64(const uint64_t u1, const uint64_t u0, uint64_t v) + { + const uint64_t b = 1ull << 32; + uint64_t un1, un0, vn1, vn0, q1, q0, un32, un21, un10, rhat, left, right; + uint64_t s; + + s = nlz64(v); + v <<= s; + vn1 = v >> 32; + vn0 = v & 0xffffffff; + if (s > 0) + { + un32 = (u1 << s) | (u0 >> (64 - s)); + un10 = u0 << s; + } + else + { + un32 = u1; + un10 = u0; + } + + un1 = un10 >> 32; + un0 = un10 & 0xffffffff; + + q1 = un32 / vn1; + rhat = un32 % vn1; + + left = q1 * vn0; + right = (rhat << 32) + un1; + while ((q1 >= b) || (left > right)) + { + --q1; + rhat += vn1; + if (rhat < b) + { + left -= vn0; + right = (rhat << 32) | un1; + } + break; + } + + un21 = (un32 << 32) + (un1 - (q1 * v)); + + q0 = un21 / vn1; + rhat = un21 % vn1; + + left = q0 * vn0; + right = (rhat << 32) | un0; + while ((q0 >= b) || (left > right)) + { + --q0; + rhat += vn1; + if (rhat < b) + { + left -= vn0; + right = (rhat << 32) | un0; + continue; + } + break; + } + + uint64_t2 output; + output.x = (q1 << 32) | q0; // quotient + output.y = ((un21 << 32) + 
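// base-2^32 long division: the divisor was normalized by a left shift of s bits so
// its top bit is set, two 32-bit quotient digits q1 and q0 were estimated from the
// high divisor word and corrected above, the quotient is assembled as (q1 << 32) | q0,
// and the remainder, still in the normalized domain, is shifted back right by s here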
(un0 - (q0 * v))) >> s; // remainder + + return output; + } +} +} +} #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/tgmath.hlsl b/include/nbl/builtin/hlsl/tgmath.hlsl index 464b4ed9ac..6738ac2d86 100644 --- a/include/nbl/builtin/hlsl/tgmath.hlsl +++ b/include/nbl/builtin/hlsl/tgmath.hlsl @@ -16,7 +16,7 @@ bool isnan(Float val) { using AsUint = typename unsigned_integer_of_size::type; AsUint asUint = bit_cast(val); - return bool((asUint & ieee754::traits::exponentMask) && (asUint & ieee754::traits::mantissaMask)); + return bool((ieee754::extractBiasedExponent(val) == ieee754::traits::specialValueExp) && (asUint & ieee754::traits::mantissaMask)); } } From a382ee51fd94db6d721aa09df4635aa79dba5e09 Mon Sep 17 00:00:00 2001 From: Przemek Date: Wed, 31 Jul 2024 15:52:05 +0200 Subject: [PATCH 023/358] Refactorization --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 14 ++++---- .../hlsl/impl/emulated_float64_t_impl.hlsl | 35 +++++++++---------- include/nbl/builtin/hlsl/limits.hlsl | 5 +-- include/nbl/builtin/hlsl/tgmath.hlsl | 1 + 5 files changed, 28 insertions(+), 29 deletions(-) diff --git a/examples_tests b/examples_tests index c53f5c8186..822446f98b 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit c53f5c8186780516833b2ffadba277c131bbc9fb +Subproject commit 822446f98b718c4f719c9f8a9ab70fb0cea74e1a diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index fd156af407..ffe6b1f471 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -107,20 +107,20 @@ namespace hlsl swap(lhsBiasedExp, rhsBiasedExp); } - if (lhsBiasedExp == 0x7FF) + if (lhsBiasedExp == ieee754::traits::specialValueExp) { const bool propagate = (lhsMantissa) != 0u; return createPreserveBitPattern(lerp(ieee754::traits::exponentMask | lhsSign, impl::propagateFloat64NaN(data, rhs.data), propagate)); } expDiff = lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); - rhsMantissa = lerp(rhsMantissa | 0x0010000000000000ull, rhsMantissa, rhsBiasedExp == 0); + rhsMantissa = lerp(rhsMantissa | (1ull << 52), rhsMantissa, rhsBiasedExp == 0); const uint32_t3 shifted = impl::shift64ExtraRightJamming(uint32_t3(impl::packUint64(rhsMantissa), 0u), expDiff); rhsMantissa = impl::unpackUint64(shifted.xy); mantissaExtended.z = shifted.z; biasedExp = lhsBiasedExp; - lhsMantissa |= 0x0010000000000000ull; + lhsMantissa |= (1ull << 52); mantissaExtended.xy = impl::packUint64(lhsMantissa + rhsMantissa); --biasedExp; if (!(mantissaExtended.x < 0x00200000u)) @@ -151,7 +151,7 @@ namespace hlsl lhsSign ^= ieee754::traits::signMask; } - if (lhsBiasedExp == 0x7FF) + if (lhsBiasedExp == ieee754::traits::specialValueExp) { bool propagate = lhsMantissa != 0u; return createPreserveBitPattern(lerp(impl::assembleFloat64(lhsSign, ieee754::traits::exponentMask, 0ull), impl::propagateFloat64NaN(data, rhs.data), propagate)); @@ -233,7 +233,7 @@ namespace hlsl { if (lhsBiasedExp == ieee754::traits::specialValueExp) { - if ((lhsMantissa != 0u) || ((rhsBiasedExp == 0x7FF) && (rhsMantissa != 0u))) + if ((lhsMantissa != 0u) || ((rhsBiasedExp == ieee754::traits::specialValueExp) && (rhsMantissa != 0u))) return createPreserveBitPattern(impl::propagateFloat64NaN(data, rhs.data)); if ((uint64_t(rhsBiasedExp) | rhsMantissa) == 0u) return createPreserveBitPattern(ieee754::traits::quietNaN); @@ -296,7 +296,7 @@ namespace hlsl int exp = ieee754::extractExponent(data) - 
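// quotient exponent: the unbiased exponents subtract and exponentBias is added back
// to obtain the stored (biased) exponent; the mantissa-quotient normalization a few
// lines below may still adjust it by one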
ieee754::extractExponent(rhs.data) + ieee754::traits::exponentBias; uint64_t2 lhsMantissaShifted = impl::shiftMantissaLeftBy52(lhsRealMantissa); - uint64_t mantissa = impl::divmod128by64(lhsMantissaShifted.x, lhsMantissaShifted.y, rhsRealMantissa).x; + uint64_t mantissa = impl::divmod128by64(lhsMantissaShifted.x, lhsMantissaShifted.y, rhsRealMantissa); if (mantissa & (0x1ull << 53)) { @@ -446,8 +446,6 @@ unsigned_integer_of_size::type extractMantissa(Type x)\ return extractMantissa(x.data);\ }\ - - namespace ieee754 { IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); diff --git a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl index 63c99d4321..478ae44090 100644 --- a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl @@ -287,10 +287,10 @@ namespace impl // ((zSign != 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP)) || // ((zSign == 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN))) { - return packFloat64(zSign, 0x7FE, 0x000FFFFFu, 0xFFFFFFFFu); + return assembleFloat64(zSign, 0x7FE << ieee754::traits::mantissaBitCnt, 0x000FFFFFFFFFFFFFull); } - return packFloat64(zSign, 0x7FF, 0u, 0u); + return assembleFloat64(zSign, ieee754::traits::exponentMask, 0ull); } } @@ -410,30 +410,33 @@ namespace impl return counter; } - uint64_t2 divmod128by64(const uint64_t u1, const uint64_t u0, uint64_t v) + // returns pair of quotient and remainder + uint64_t divmod128by64(const uint64_t dividentHigh, const uint64_t dividentLow, uint64_t divisor) { const uint64_t b = 1ull << 32; uint64_t un1, un0, vn1, vn0, q1, q0, un32, un21, un10, rhat, left, right; uint64_t s; - s = nlz64(v); - v <<= s; - vn1 = v >> 32; - vn0 = v & 0xffffffff; + //TODO: countl_zero + s = countl_zero(divisor); + //s = nlz64(divisor); + divisor <<= s; + vn1 = divisor >> 32; + vn0 = divisor & 0xFFFFFFFF; if (s > 0) { - un32 = (u1 << s) | (u0 >> (64 - s)); - un10 = u0 << s; + un32 = (dividentHigh << s) | (dividentLow >> (64 - s)); + un10 = dividentLow << s; } else { - un32 = u1; - un10 = u0; + un32 = dividentHigh; + un10 = dividentLow; } un1 = un10 >> 32; - un0 = un10 & 0xffffffff; + un0 = un10 & 0xFFFFFFFF; q1 = un32 / vn1; rhat = un32 % vn1; @@ -452,7 +455,7 @@ namespace impl break; } - un21 = (un32 << 32) + (un1 - (q1 * v)); + un21 = (un32 << 32) + (un1 - (q1 * divisor)); q0 = un21 / vn1; rhat = un21 % vn1; @@ -472,11 +475,7 @@ namespace impl break; } - uint64_t2 output; - output.x = (q1 << 32) | q0; // quotient - output.y = ((un21 << 32) + (un0 - (q0 * v))) >> s; // remainder - - return output; + return (q1 << 32) | q0; } } } diff --git a/include/nbl/builtin/hlsl/limits.hlsl b/include/nbl/builtin/hlsl/limits.hlsl index 585cf3eff6..4b93af099c 100644 --- a/include/nbl/builtin/hlsl/limits.hlsl +++ b/include/nbl/builtin/hlsl/limits.hlsl @@ -4,6 +4,7 @@ #ifndef _NBL_BUILTIN_HLSL_LIMITS_INCLUDED_ #define _NBL_BUILTIN_HLSL_LIMITS_INCLUDED_ +#include #include #include @@ -127,7 +128,7 @@ struct num_base : type_identity NBL_CONSTEXPR_STATIC_INLINE int32_t float_max_decimal_exponent = 4*S16 + 30*S32 + 232*S64; NBL_CONSTEXPR_STATIC_INLINE int32_t float_exponent_bits = 8 * size - float_digits - 1; - NBL_CONSTEXPR_STATIC_INLINE int32_t float_max_exponent = 1l << float_exponent_bits; + NBL_CONSTEXPR_STATIC_INLINE int32_t float_max_exponent = 1 << float_exponent_bits; NBL_CONSTEXPR_STATIC_INLINE int32_t float_min_exponent = 3 - float_max_exponent; NBL_CONSTEXPR_STATIC_INLINE bool is_bool = 
is_same::value; @@ -236,7 +237,7 @@ template<> struct num_traits : num_base { NBL_CONSTEXPR_STATIC_INLINE float64_t max = 1.7976931348623158e+308; - NBL_CONSTEXPR_STATIC_INLINE float64_t min = 2.2250738585072014e-308; + NBL_CONSTEXPR_STATIC_INLINE NBL_FP64_LITERAL(float64_t) min = 2.2250738585072014e-308; NBL_CONSTEXPR_STATIC_INLINE float64_t denorm_min = 4.9406564584124654e-324; NBL_CONSTEXPR_STATIC_INLINE uint64_t quiet_NaN = 0x7FF8000000000000ull; NBL_CONSTEXPR_STATIC_INLINE uint64_t signaling_NaN = 0x7FF0000000000001ull; diff --git a/include/nbl/builtin/hlsl/tgmath.hlsl b/include/nbl/builtin/hlsl/tgmath.hlsl index 6738ac2d86..63e7802df7 100644 --- a/include/nbl/builtin/hlsl/tgmath.hlsl +++ b/include/nbl/builtin/hlsl/tgmath.hlsl @@ -15,6 +15,7 @@ template bool isnan(Float val) { using AsUint = typename unsigned_integer_of_size::type; + using AsFloat = typename float_of_size::type; AsUint asUint = bit_cast(val); return bool((ieee754::extractBiasedExponent(val) == ieee754::traits::specialValueExp) && (asUint & ieee754::traits::mantissaMask)); } From 1e419da30d3210bdb232ab902577adde465e57df Mon Sep 17 00:00:00 2001 From: Przemek Date: Thu, 1 Aug 2024 20:52:20 +0200 Subject: [PATCH 024/358] Fixes --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 39 ++++++----- .../hlsl/impl/emulated_float64_t_impl.hlsl | 68 +++++++++++-------- include/nbl/builtin/hlsl/tgmath.hlsl | 7 ++ 4 files changed, 65 insertions(+), 51 deletions(-) diff --git a/examples_tests b/examples_tests index deaa9f2c24..f99b039d79 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit deaa9f2c248b25d808f5c7c58553abe2ec9d21ee +Subproject commit f99b039d7970be6ea0ca590e257dc999cecd8ae9 diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index ffe6b1f471..4945617ab8 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -2,7 +2,6 @@ #define _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_INCLUDED_ #include - namespace nbl { namespace hlsl @@ -86,7 +85,7 @@ namespace hlsl if (lhsBiasedExp == ieee754::traits::specialValueExp) { bool propagate = (lhsMantissa | rhsMantissa) != 0u; - return createPreserveBitPattern(lerp(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); + return createPreserveBitPattern(nbl::hlsl::lerp(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); } mantissa = lhsMantissa + rhsMantissa; @@ -110,11 +109,11 @@ namespace hlsl if (lhsBiasedExp == ieee754::traits::specialValueExp) { const bool propagate = (lhsMantissa) != 0u; - return createPreserveBitPattern(lerp(ieee754::traits::exponentMask | lhsSign, impl::propagateFloat64NaN(data, rhs.data), propagate)); + return createPreserveBitPattern(nbl::hlsl::lerp(ieee754::traits::exponentMask | lhsSign, impl::propagateFloat64NaN(data, rhs.data), propagate)); } - expDiff = lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); - rhsMantissa = lerp(rhsMantissa | (1ull << 52), rhsMantissa, rhsBiasedExp == 0); + expDiff = nbl::hlsl::lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); + rhsMantissa = nbl::hlsl::lerp(rhsMantissa | (1ull << 52), rhsMantissa, rhsBiasedExp == 0); const uint32_t3 shifted = impl::shift64ExtraRightJamming(uint32_t3(impl::packUint64(rhsMantissa), 0u), expDiff); rhsMantissa = impl::unpackUint64(shifted.xy); mantissaExtended.z = shifted.z; @@ -154,11 +153,11 @@ namespace hlsl if (lhsBiasedExp == ieee754::traits::specialValueExp) { bool propagate = lhsMantissa != 0u; - 
return createPreserveBitPattern(lerp(impl::assembleFloat64(lhsSign, ieee754::traits::exponentMask, 0ull), impl::propagateFloat64NaN(data, rhs.data), propagate)); + return createPreserveBitPattern(nbl::hlsl::lerp(impl::assembleFloat64(lhsSign, ieee754::traits::exponentMask, 0ull), impl::propagateFloat64NaN(data, rhs.data), propagate)); } - expDiff = lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); - rhsMantissa = lerp(rhsMantissa | 0x4000000000000000ull, rhsMantissa, rhsBiasedExp == 0); + expDiff = nbl::hlsl::lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); + rhsMantissa = nbl::hlsl::lerp(rhsMantissa | 0x4000000000000000ull, rhsMantissa, rhsBiasedExp == 0); rhsMantissa = impl::unpackUint64(impl::shift64RightJamming(impl::packUint64(rhsMantissa), expDiff)); lhsMantissa |= 0x4000000000000000ull; frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); @@ -169,10 +168,10 @@ namespace hlsl if (lhsBiasedExp == ieee754::traits::specialValueExp) { bool propagate = ((lhsMantissa) | (rhsMantissa)) != 0u; - return createPreserveBitPattern(lerp(ieee754::traits::quietNaN, impl::propagateFloat64NaN(data, rhs.data), propagate)); + return createPreserveBitPattern(nbl::hlsl::lerp(ieee754::traits::quietNaN, impl::propagateFloat64NaN(data, rhs.data), propagate)); } - rhsBiasedExp = lerp(rhsBiasedExp, 1, lhsBiasedExp == 0); - lhsBiasedExp = lerp(lhsBiasedExp, 1, lhsBiasedExp == 0); + rhsBiasedExp = nbl::hlsl::lerp(rhsBiasedExp, 1, lhsBiasedExp == 0); + lhsBiasedExp = nbl::hlsl::lerp(lhsBiasedExp, 1, lhsBiasedExp == 0); const uint32_t2 lhsMantissaPacked = impl::packUint64(lhsMantissa); @@ -200,11 +199,11 @@ namespace hlsl signOfDifference = ieee754::traits::signMask; } - biasedExp = lerp(rhsBiasedExp, lhsBiasedExp, signOfDifference == 0u); + biasedExp = nbl::hlsl::lerp(rhsBiasedExp, lhsBiasedExp, signOfDifference == 0u); lhsSign ^= signOfDifference; uint64_t retval_0 = impl::packFloat64(uint32_t(FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) << 31, 0, 0u, 0u); uint64_t retval_1 = impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 11, frac.x, frac.y); - return createPreserveBitPattern(lerp(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); + return createPreserveBitPattern(nbl::hlsl::lerp(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); } } @@ -295,10 +294,10 @@ namespace hlsl const uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; int exp = ieee754::extractExponent(data) - ieee754::extractExponent(rhs.data) + ieee754::traits::exponentBias; - uint64_t2 lhsMantissaShifted = impl::shiftMantissaLeftBy52(lhsRealMantissa); + uint64_t2 lhsMantissaShifted = impl::shiftMantissaLeftBy53(lhsRealMantissa); uint64_t mantissa = impl::divmod128by64(lhsMantissaShifted.x, lhsMantissaShifted.y, rhsRealMantissa); - if (mantissa & (0x1ull << 53)) + if (mantissa & (0x1ull << 54)) { ++exp; } @@ -315,7 +314,7 @@ namespace hlsl // relational operators bool operator==(emulated_float64_t rhs) { - if (!FastMath && (isnan(data) || isnan(rhs.data))) + if (!FastMath && (nbl::hlsl::isnan(data) || nbl::hlsl::isnan(rhs.data))) return false; // TODO: i'm not sure about this one if (!FastMath && impl::areBothZero(data, rhs.data)) @@ -330,7 +329,7 @@ namespace hlsl } bool operator!=(emulated_float64_t rhs) { - if (!FastMath && (isnan(data) || isnan(rhs.data))) + if (!FastMath && (nbl::hlsl::isnan(data) || nbl::hlsl::isnan(rhs.data))) return true; // TODO: i'm not sure about this one if (!FastMath && impl::areBothSameSignZero(data, rhs.data)) @@ -345,7 +344,7 @@ namespace hlsl } bool operator<(emulated_float64_t rhs) 
{ - if (!FastMath && (isnan(data) || isnan(rhs.data))) + if (!FastMath && (nbl::hlsl::isnan(data) || nbl::hlsl::isnan(rhs.data))) return false; if (!FastMath && impl::areBothSameSignInfinity(data, rhs.data)) return false; @@ -365,7 +364,7 @@ namespace hlsl } bool operator>(emulated_float64_t rhs) { - if (!FastMath && (isnan(data) || isnan(rhs.data))) + if (!FastMath && (nbl::hlsl::isnan(data) || nbl::hlsl::isnan(rhs.data))) return true; if (!FastMath && impl::areBothSameSignInfinity(data, rhs.data)) return false; @@ -404,7 +403,7 @@ namespace hlsl bool isNaN() { - return isnan(bit_cast(data)); + return nbl::hlsl::isnan(bit_cast(data)); } }; diff --git a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl index 478ae44090..2c284fbd43 100644 --- a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl @@ -45,13 +45,21 @@ namespace nbl { namespace hlsl { + // TODO: better implementation, also this needs to be moved somewhere else + template + UINT lerp(UINT a, UINT b, bool c) + { + return c ? b : a; + } + + namespace impl { - uint64_t2 shiftMantissaLeftBy52(uint64_t mantissa64) + uint64_t2 shiftMantissaLeftBy53(uint64_t mantissa64) { uint64_t2 output; - output.x = mantissa64 >> (64 - ieee754::traits::mantissaBitCnt); - output.y = mantissa64 << ieee754::traits::mantissaBitCnt; + output.x = mantissa64 >> (63 - ieee754::traits::mantissaBitCnt); + output.y = mantissa64 << (ieee754::traits::mantissaBitCnt + 1); return output; } @@ -90,7 +98,7 @@ namespace impl lhs |= 0x0008000000000000ull; rhs |= 0x0008000000000000ull; - return lerp(rhs, lerp(lhs, rhs, isnan(rhs)), isnan(lhs)); + return nbl::hlsl::lerp(rhs, nbl::hlsl::lerp(lhs, rhs, nbl::hlsl::isnan(rhs)), nbl::hlsl::isnan(lhs)); return 0; #endif } @@ -153,16 +161,16 @@ namespace impl uint32_t2 output; const int negCount = (-count) & 31; - output.x = lerp(0u, val.x, count == 0); - output.x = lerp(output.x, (val.x >> count), count < 32); + output.x = nbl::hlsl::lerp(0u, val.x, count == 0); + output.x = nbl::hlsl::lerp(output.x, (val.x >> count), count < 32); output.y = uint32_t((val.x | val.y) != 0u); /* count >= 64 */ uint32_t z1_lt64 = (val.x>>(count & 31)) | uint32_t(((val.x<>count) | uint32_t((val.y<> (count & 31)), count < 64); - output.y = lerp(output.y, (val.x << negCount) | (val.y >> count), count < 32); + output.y = nbl::hlsl::lerp(0u, (val.x >> (count & 31)), count < 64); + output.y = nbl::hlsl::lerp(output.y, (val.x << negCount) | (val.y >> count), count < 32); - val.z = lerp(val.z | val.y, val.z, count < 32); - output.x = lerp(output.x, val.x >> count, count < 32); + val.z = nbl::hlsl::lerp(val.z | val.y, val.z, count < 32); + output.x = nbl::hlsl::lerp(output.x, val.x >> count, count < 32); output.z |= uint32_t(val.z != 0u); - output.x = lerp(output.x, 0u, (count == 32)); - output.y = lerp(output.y, val.x, (count == 32)); - output.z = lerp(output.z, val.y, (count == 32)); - output.x = lerp(output.x, val.x, (count == 0)); - output.y = lerp(output.y, val.y, (count == 0)); - output.z = lerp(output.z, val.z, (count == 0)); + output.x = nbl::hlsl::lerp(output.x, 0u, (count == 32)); + output.y = nbl::hlsl::lerp(output.y, val.x, (count == 32)); + output.z = nbl::hlsl::lerp(output.z, val.y, (count == 32)); + output.x = nbl::hlsl::lerp(output.x, val.x, (count == 0)); + output.y = nbl::hlsl::lerp(output.y, val.y, (count == 0)); + output.z = nbl::hlsl::lerp(output.z, val.z, (count == 0)); return output; } @@ 
-242,7 +250,7 @@ namespace impl uint32_t2 output; output.y = packed.y << count; // TODO: fix - output.x = lerp((packed.x << count | (packed.y >> ((-count) & 31))), packed.x, count == 0); + output.x = nbl::hlsl::lerp((packed.x << count | (packed.y >> ((-count) & 31))), packed.x, count == 0); return unpackUint64(output); }; @@ -324,7 +332,7 @@ namespace impl } else { - zExp = lerp(zExp, 0, (mantissaExtended.x | mantissaExtended.y) == 0u); + zExp = nbl::hlsl::lerp(zExp, 0, (mantissaExtended.x | mantissaExtended.y) == 0u); } return assembleFloat64(zSign, uint64_t(zExp) << ieee754::traits::mantissaBitCnt, unpackUint64(mantissaExtended.xy)); @@ -363,15 +371,15 @@ namespace impl uint32_t2 mantissaPacked = packUint64(mantissa); int shiftCount; uint32_t2 temp; - shiftCount = countLeadingZeros32(lerp(mantissaPacked.x, mantissaPacked.y, mantissaPacked.x == 0u)) - 11; - outExp = lerp(1 - shiftCount, -shiftCount - 31, mantissaPacked.x == 0u); + shiftCount = countLeadingZeros32(nbl::hlsl::lerp(mantissaPacked.x, mantissaPacked.y, mantissaPacked.x == 0u)) - 11; + outExp = nbl::hlsl::lerp(1 - shiftCount, -shiftCount - 31, mantissaPacked.x == 0u); - temp.x = lerp(mantissaPacked.y << shiftCount, mantissaPacked.y >> (-shiftCount), shiftCount < 0); - temp.y = lerp(0u, mantissaPacked.y << (shiftCount & 31), shiftCount < 0); + temp.x = nbl::hlsl::lerp(mantissaPacked.y << shiftCount, mantissaPacked.y >> (-shiftCount), shiftCount < 0); + temp.y = nbl::hlsl::lerp(0u, mantissaPacked.y << (shiftCount & 31), shiftCount < 0); shortShift64Left(impl::unpackUint64(mantissaPacked), shiftCount); - outMantissa = lerp(outMantissa, unpackUint64(temp), mantissaPacked.x == 0); + outMantissa = nbl::hlsl::lerp(outMantissa, unpackUint64(temp), mantissaPacked.x == 0); } bool areBothSameSignInfinity(uint64_t lhs, uint64_t rhs) diff --git a/include/nbl/builtin/hlsl/tgmath.hlsl b/include/nbl/builtin/hlsl/tgmath.hlsl index 63e7802df7..38a49171e5 100644 --- a/include/nbl/builtin/hlsl/tgmath.hlsl +++ b/include/nbl/builtin/hlsl/tgmath.hlsl @@ -20,6 +20,13 @@ bool isnan(Float val) return bool((ieee754::extractBiasedExponent(val) == ieee754::traits::specialValueExp) && (asUint & ieee754::traits::mantissaMask)); } +template <> +bool isnan(uint64_t val) +{ + float64_t asFloat = bit_cast(val); + return bool((ieee754::extractBiasedExponent(asFloat) == ieee754::traits::specialValueExp) && (val & ieee754::traits::mantissaMask)); +} + } } From 2b7ba9be32650b1c8352c69776dc165fc1c59415 Mon Sep 17 00:00:00 2001 From: Przemek Date: Thu, 1 Aug 2024 20:54:43 +0200 Subject: [PATCH 025/358] Correction --- include/nbl/builtin/hlsl/emulated_float64_t.hlsl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index 4945617ab8..a781145866 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -297,7 +297,7 @@ namespace hlsl uint64_t2 lhsMantissaShifted = impl::shiftMantissaLeftBy53(lhsRealMantissa); uint64_t mantissa = impl::divmod128by64(lhsMantissaShifted.x, lhsMantissaShifted.y, rhsRealMantissa); - if (mantissa & (0x1ull << 54)) + if (mantissa & (0x1ull << 53)) { ++exp; } From f6c6e59e563a49a237587d52c74c11699396579a Mon Sep 17 00:00:00 2001 From: Przemek Date: Tue, 6 Aug 2024 15:12:09 +0200 Subject: [PATCH 026/358] Saving work --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 142 ++++++++++++++---- include/nbl/builtin/hlsl/ieee754.hlsl | 3 +- 
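The uint64_t specialization of isnan added above applies the usual IEEE-754 rule directly to the bit pattern: a value is NaN exactly when its biased exponent field is all ones and its mantissa field is non-zero. A standalone host-side sketch of the same test, with the constants spelled out (names illustrative):

#include <cstdint>

static bool isnanBits(uint64_t bits)
{
    const uint64_t exponentMask = 0x7FF0000000000000ull; // 11 exponent bits
    const uint64_t mantissaMask = 0x000FFFFFFFFFFFFFull; // 52 mantissa bits
    return ((bits & exponentMask) == exponentMask) && ((bits & mantissaMask) != 0ull);
}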
.../hlsl/impl/emulated_float64_t_impl.hlsl | 62 ++++---- include/nbl/builtin/hlsl/tgmath.hlsl | 13 ++ 5 files changed, 154 insertions(+), 68 deletions(-) diff --git a/examples_tests b/examples_tests index f99b039d79..54cf5e9271 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit f99b039d7970be6ea0ca590e257dc999cecd8ae9 +Subproject commit 54cf5e9271636f6898a174e61aff1eec36bda51a diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index a781145866..cd7e3c43fe 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -60,9 +60,91 @@ namespace hlsl return emulated_float64_t(val); } + uint64_t shiftLeftAllowNegBitCnt(uint64_t val, int n) + { + if (n < 0) + return val >> -n; + else + return val << n; + } + // arithmetic operators emulated_float64_t operator+(const emulated_float64_t rhs) { +// { +// uint64_t lhsSign = data & ieee754::traits::signMask; +// uint64_t rhsSign = rhs.data & ieee754::traits::signMask; +// uint64_t lhsMantissa = ieee754::extractMantissa(data); +// uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); +// int lhsBiasedExp = ieee754::extractBiasedExponent(data); +// int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); +// +// if (tgmath::isnan(data) || tgmath::isnan(rhs.data)) +// return createPreserveBitPattern(ieee754::traits::quietNaN); +// /*if (std::isinf(lhs) || std::isinf(rhs)) +// { +// if (std::isinf(lhs) && !std::isinf(rhs)) +// return lhs; +// if (std::isinf(rhs) && !std::isinf(lhs)) +// return rhs; +// if (rhs == lhs) +// return rhs; +// +// return nan(); +// }*/ +// +// int rp = min(ieee754::extractExponent(data), ieee754::extractExponent(rhs.data)) - ieee754::traits::mantissaBitCnt; +// +// uint64_t lhsRealMantissa = lhsMantissa | (1ull << ieee754::traits::mantissaBitCnt); +// uint64_t rhsRealMantissa = rhsMantissa | (1ull << ieee754::traits::mantissaBitCnt); +// uint64_t lhsSignTmp = lhsSign >> (52 + 11); +// uint64_t rhsSignTmp = rhsSign >> (52 + 11); +// +// uint64_t sign = 0u; +// if (lhsSign != rhsSign) +// { +// uint64_t _min = max(data, rhs.data); +// uint64_t _max = min(data, rhs.data); +// uint64_t minAbs = _min ^ ieee754::traits::signMask; +// if (minAbs > _max) +// sign = ieee754::traits::signMask; +// +// } +// +// int64_t lhsMantissaTmp = (shiftLeftAllowNegBitCnt(lhsRealMantissa, lhsBiasedExp - rp - ieee754::traits::mantissaBitCnt - ieee754::traits::exponentBias) ^ (-lhsSignTmp)) + lhsSignTmp; +// int64_t rhsMantissaTmp = (shiftLeftAllowNegBitCnt(rhsRealMantissa, rhsBiasedExp - rp - ieee754::traits::mantissaBitCnt - ieee754::traits::exponentBias) ^ (-rhsSignTmp)) + rhsSignTmp; +// +// uint64_t addTmp = bit_cast(lhsMantissaTmp + rhsMantissaTmp); +// +// // renormalize +// if (!FastMath && false) // TODO: hande nan +// { +// +// } +// else +// { +//#ifndef __HLSL_VERSION +// int l2 = log2(double(addTmp)); +//#else +// int intl2 = 0; +//#endif +// +// if (!FastMath && (rp + l2 + 1 < nbl::hlsl::numeric_limits::min_exponent)) +// { +// return createPreserveBitPattern(impl::assembleFloat64(0, ieee754::traits::exponentMask, 0)); +// } +// else +// { +// rp = addTmp ? 
l2 + rp + ieee754::traits::exponentBias : 0; +// return createPreserveBitPattern(impl::assembleFloat64( +// sign, +// (uint64_t(rp) << ieee754::traits::mantissaBitCnt) & ieee754::traits::exponentMask, +// shiftLeftAllowNegBitCnt(addTmp, (ieee754::traits::mantissaBitCnt - l2)) & ieee754::traits::mantissaMask) +// ); +// } +// } +// } + emulated_float64_t retval = createPreserveBitPattern(0u); uint64_t mantissa; @@ -85,7 +167,7 @@ namespace hlsl if (lhsBiasedExp == ieee754::traits::specialValueExp) { bool propagate = (lhsMantissa | rhsMantissa) != 0u; - return createPreserveBitPattern(nbl::hlsl::lerp(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); + return createPreserveBitPattern(tgmath::lerp(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); } mantissa = lhsMantissa + rhsMantissa; @@ -109,11 +191,11 @@ namespace hlsl if (lhsBiasedExp == ieee754::traits::specialValueExp) { const bool propagate = (lhsMantissa) != 0u; - return createPreserveBitPattern(nbl::hlsl::lerp(ieee754::traits::exponentMask | lhsSign, impl::propagateFloat64NaN(data, rhs.data), propagate)); + return createPreserveBitPattern(tgmath::lerp(ieee754::traits::exponentMask | lhsSign, impl::propagateFloat64NaN(data, rhs.data), propagate)); } - expDiff = nbl::hlsl::lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); - rhsMantissa = nbl::hlsl::lerp(rhsMantissa | (1ull << 52), rhsMantissa, rhsBiasedExp == 0); + expDiff = tgmath::lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); + rhsMantissa = tgmath::lerp(rhsMantissa | (1ull << 52), rhsMantissa, rhsBiasedExp == 0); const uint32_t3 shifted = impl::shift64ExtraRightJamming(uint32_t3(impl::packUint64(rhsMantissa), 0u), expDiff); rhsMantissa = impl::unpackUint64(shifted.xy); mantissaExtended.z = shifted.z; @@ -153,11 +235,11 @@ namespace hlsl if (lhsBiasedExp == ieee754::traits::specialValueExp) { bool propagate = lhsMantissa != 0u; - return createPreserveBitPattern(nbl::hlsl::lerp(impl::assembleFloat64(lhsSign, ieee754::traits::exponentMask, 0ull), impl::propagateFloat64NaN(data, rhs.data), propagate)); + return createPreserveBitPattern(tgmath::lerp(impl::assembleFloat64(lhsSign, ieee754::traits::exponentMask, 0ull), impl::propagateFloat64NaN(data, rhs.data), propagate)); } - expDiff = nbl::hlsl::lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); - rhsMantissa = nbl::hlsl::lerp(rhsMantissa | 0x4000000000000000ull, rhsMantissa, rhsBiasedExp == 0); + expDiff = tgmath::lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); + rhsMantissa = tgmath::lerp(rhsMantissa | 0x4000000000000000ull, rhsMantissa, rhsBiasedExp == 0); rhsMantissa = impl::unpackUint64(impl::shift64RightJamming(impl::packUint64(rhsMantissa), expDiff)); lhsMantissa |= 0x4000000000000000ull; frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); @@ -168,10 +250,10 @@ namespace hlsl if (lhsBiasedExp == ieee754::traits::specialValueExp) { bool propagate = ((lhsMantissa) | (rhsMantissa)) != 0u; - return createPreserveBitPattern(nbl::hlsl::lerp(ieee754::traits::quietNaN, impl::propagateFloat64NaN(data, rhs.data), propagate)); + return createPreserveBitPattern(tgmath::lerp(ieee754::traits::quietNaN, impl::propagateFloat64NaN(data, rhs.data), propagate)); } - rhsBiasedExp = nbl::hlsl::lerp(rhsBiasedExp, 1, lhsBiasedExp == 0); - lhsBiasedExp = nbl::hlsl::lerp(lhsBiasedExp, 1, lhsBiasedExp == 0); + rhsBiasedExp = tgmath::lerp(rhsBiasedExp, 1, lhsBiasedExp == 0); + lhsBiasedExp = tgmath::lerp(lhsBiasedExp, 1, lhsBiasedExp == 0); const uint32_t2 lhsMantissaPacked = 
impl::packUint64(lhsMantissa); @@ -199,11 +281,11 @@ namespace hlsl signOfDifference = ieee754::traits::signMask; } - biasedExp = nbl::hlsl::lerp(rhsBiasedExp, lhsBiasedExp, signOfDifference == 0u); + biasedExp = tgmath::lerp(rhsBiasedExp, lhsBiasedExp, signOfDifference == 0u); lhsSign ^= signOfDifference; uint64_t retval_0 = impl::packFloat64(uint32_t(FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) << 31, 0, 0u, 0u); uint64_t retval_1 = impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 11, frac.x, frac.y); - return createPreserveBitPattern(nbl::hlsl::lerp(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); + return createPreserveBitPattern(tgmath::lerp(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); } } @@ -285,9 +367,18 @@ namespace hlsl return createPreserveBitPattern(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, newPseudoMantissa)); } + /*emulated_float64_t reciprocal(uint64_t x) + { + using ThisType = emulated_float64_t; + ThisType output = ThisType::createPreserveBitPattern((0xbfcdd6a18f6a6f52ULL - x) >> 1); + output = output * output; + return output; + }*/ + emulated_float64_t operator/(const emulated_float64_t rhs) { - // TODO: maybe add function to extract real mantissa + //return emulated_float64_t::createPreserveBitPattern(data) * reciprocal(rhs.data); + const uint64_t lhsRealMantissa = (ieee754::extractMantissa(data) | (1ull << ieee754::traits::mantissaBitCnt)); const uint64_t rhsRealMantissa = ieee754::extractMantissa(rhs.data) | (1ull << ieee754::traits::mantissaBitCnt); @@ -312,15 +403,15 @@ namespace hlsl } // relational operators - bool operator==(emulated_float64_t rhs) + bool operator==(emulated_float64_t rhs) { - if (!FastMath && (nbl::hlsl::isnan(data) || nbl::hlsl::isnan(rhs.data))) + if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return false; // TODO: i'm not sure about this one if (!FastMath && impl::areBothZero(data, rhs.data)) return true; - const emulated_float64_t xored = emulated_float64_t::createPreserveBitPattern(data ^ rhs.data); + const emulated_float64_t xored = createPreserveBitPattern(data ^ rhs.data); // TODO: check what fast math returns for -0 == 0 if ((xored.data & 0x7FFFFFFFFFFFFFFFull) == 0ull) return true; @@ -329,22 +420,11 @@ namespace hlsl } bool operator!=(emulated_float64_t rhs) { - if (!FastMath && (nbl::hlsl::isnan(data) || nbl::hlsl::isnan(rhs.data))) - return true; - // TODO: i'm not sure about this one - if (!FastMath && impl::areBothSameSignZero(data, rhs.data)) - return false; - - const emulated_float64_t xored = emulated_float64_t::createPreserveBitPattern(data ^ rhs.data); - // TODO: check what fast math returns for -0 == 0 - if ((xored.data & 0x7FFFFFFFFFFFFFFFull) == 0ull) - return false; - - return xored.data; + return !(createPreserveBitPattern(data) == rhs); } bool operator<(emulated_float64_t rhs) { - if (!FastMath && (nbl::hlsl::isnan(data) || nbl::hlsl::isnan(rhs.data))) + if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return false; if (!FastMath && impl::areBothSameSignInfinity(data, rhs.data)) return false; @@ -364,7 +444,7 @@ namespace hlsl } bool operator>(emulated_float64_t rhs) { - if (!FastMath && (nbl::hlsl::isnan(data) || nbl::hlsl::isnan(rhs.data))) + if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return true; if (!FastMath && impl::areBothSameSignInfinity(data, rhs.data)) return false; @@ -403,7 +483,7 @@ namespace hlsl bool isNaN() { - return nbl::hlsl::isnan(bit_cast(data)); + return tgmath::isnan(bit_cast(data)); } 
}; diff --git a/include/nbl/builtin/hlsl/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754.hlsl index 606c415fdb..712d7440b8 100644 --- a/include/nbl/builtin/hlsl/ieee754.hlsl +++ b/include/nbl/builtin/hlsl/ieee754.hlsl @@ -126,7 +126,8 @@ uint32_t extractBiasedExponent(float64_t x) template int extractExponent(T x) { - return int(extractBiasedExponent(x)) - int(traits::exponentBias); + using AsFloat = typename float_of_size::type; + return int(extractBiasedExponent(x)) - int(traits::exponentBias); } template diff --git a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl index 2c284fbd43..f2c2760994 100644 --- a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl @@ -45,14 +45,6 @@ namespace nbl { namespace hlsl { - // TODO: better implementation, also this needs to be moved somewhere else - template - UINT lerp(UINT a, UINT b, bool c) - { - return c ? b : a; - } - - namespace impl { uint64_t2 shiftMantissaLeftBy53(uint64_t mantissa64) @@ -98,7 +90,7 @@ namespace impl lhs |= 0x0008000000000000ull; rhs |= 0x0008000000000000ull; - return nbl::hlsl::lerp(rhs, nbl::hlsl::lerp(lhs, rhs, nbl::hlsl::isnan(rhs)), nbl::hlsl::isnan(lhs)); + return tgmath::lerp(rhs, tgmath::lerp(lhs, rhs, tgmath::isnan(rhs)), tgmath::isnan(lhs)); return 0; #endif } @@ -161,16 +153,16 @@ namespace impl uint32_t2 output; const int negCount = (-count) & 31; - output.x = nbl::hlsl::lerp(0u, val.x, count == 0); - output.x = nbl::hlsl::lerp(output.x, (val.x >> count), count < 32); + output.x = tgmath::lerp(0u, val.x, count == 0); + output.x = tgmath::lerp(output.x, (val.x >> count), count < 32); output.y = uint32_t((val.x | val.y) != 0u); /* count >= 64 */ uint32_t z1_lt64 = (val.x>>(count & 31)) | uint32_t(((val.x<>count) | uint32_t((val.y<> (count & 31)), count < 64); - output.y = nbl::hlsl::lerp(output.y, (val.x << negCount) | (val.y >> count), count < 32); + output.y = tgmath::lerp(0u, (val.x >> (count & 31)), count < 64); + output.y = tgmath::lerp(output.y, (val.x << negCount) | (val.y >> count), count < 32); - val.z = nbl::hlsl::lerp(val.z | val.y, val.z, count < 32); - output.x = nbl::hlsl::lerp(output.x, val.x >> count, count < 32); + val.z = tgmath::lerp(val.z | val.y, val.z, count < 32); + output.x = tgmath::lerp(output.x, val.x >> count, count < 32); output.z |= uint32_t(val.z != 0u); - output.x = nbl::hlsl::lerp(output.x, 0u, (count == 32)); - output.y = nbl::hlsl::lerp(output.y, val.x, (count == 32)); - output.z = nbl::hlsl::lerp(output.z, val.y, (count == 32)); - output.x = nbl::hlsl::lerp(output.x, val.x, (count == 0)); - output.y = nbl::hlsl::lerp(output.y, val.y, (count == 0)); - output.z = nbl::hlsl::lerp(output.z, val.z, (count == 0)); + output.x = tgmath::lerp(output.x, 0u, (count == 32)); + output.y = tgmath::lerp(output.y, val.x, (count == 32)); + output.z = tgmath::lerp(output.z, val.y, (count == 32)); + output.x = tgmath::lerp(output.x, val.x, (count == 0)); + output.y = tgmath::lerp(output.y, val.y, (count == 0)); + output.z = tgmath::lerp(output.z, val.z, (count == 0)); return output; } @@ -250,7 +242,7 @@ namespace impl uint32_t2 output; output.y = packed.y << count; // TODO: fix - output.x = nbl::hlsl::lerp((packed.x << count | (packed.y >> ((-count) & 31))), packed.x, count == 0); + output.x = tgmath::lerp((packed.x << count | (packed.y >> ((-count) & 31))), packed.x, count == 0); return unpackUint64(output); }; @@ -332,7 +324,7 @@ namespace impl } else { - 
zExp = nbl::hlsl::lerp(zExp, 0, (mantissaExtended.x | mantissaExtended.y) == 0u); + zExp = tgmath::lerp(zExp, 0, (mantissaExtended.x | mantissaExtended.y) == 0u); } return assembleFloat64(zSign, uint64_t(zExp) << ieee754::traits::mantissaBitCnt, unpackUint64(mantissaExtended.xy)); @@ -371,15 +363,15 @@ namespace impl uint32_t2 mantissaPacked = packUint64(mantissa); int shiftCount; uint32_t2 temp; - shiftCount = countLeadingZeros32(nbl::hlsl::lerp(mantissaPacked.x, mantissaPacked.y, mantissaPacked.x == 0u)) - 11; - outExp = nbl::hlsl::lerp(1 - shiftCount, -shiftCount - 31, mantissaPacked.x == 0u); + shiftCount = countLeadingZeros32(tgmath::lerp(mantissaPacked.x, mantissaPacked.y, mantissaPacked.x == 0u)) - 11; + outExp = tgmath::lerp(1 - shiftCount, -shiftCount - 31, mantissaPacked.x == 0u); - temp.x = nbl::hlsl::lerp(mantissaPacked.y << shiftCount, mantissaPacked.y >> (-shiftCount), shiftCount < 0); - temp.y = nbl::hlsl::lerp(0u, mantissaPacked.y << (shiftCount & 31), shiftCount < 0); + temp.x = tgmath::lerp(mantissaPacked.y << shiftCount, mantissaPacked.y >> (-shiftCount), shiftCount < 0); + temp.y = tgmath::lerp(0u, mantissaPacked.y << (shiftCount & 31), shiftCount < 0); shortShift64Left(impl::unpackUint64(mantissaPacked), shiftCount); - outMantissa = nbl::hlsl::lerp(outMantissa, unpackUint64(temp), mantissaPacked.x == 0); + outMantissa = tgmath::lerp(outMantissa, unpackUint64(temp), mantissaPacked.x == 0); } bool areBothSameSignInfinity(uint64_t lhs, uint64_t rhs) diff --git a/include/nbl/builtin/hlsl/tgmath.hlsl b/include/nbl/builtin/hlsl/tgmath.hlsl index 38a49171e5..490c3f9f70 100644 --- a/include/nbl/builtin/hlsl/tgmath.hlsl +++ b/include/nbl/builtin/hlsl/tgmath.hlsl @@ -11,6 +11,10 @@ namespace nbl { namespace hlsl { + +namespace tgmath +{ + template bool isnan(Float val) { @@ -27,6 +31,15 @@ bool isnan(uint64_t val) return bool((ieee754::extractBiasedExponent(asFloat) == ieee754::traits::specialValueExp) && (val & ieee754::traits::mantissaMask)); } +// TODO: better implementation, also i'm not sure this is the right place for this function +template +UINT lerp(UINT a, UINT b, bool c) +{ + return c ? 
b : a; +} + +} + } } From b816d68c7e72dd02341d4228723e66861b466700 Mon Sep 17 00:00:00 2001 From: Przemek Date: Tue, 6 Aug 2024 15:28:54 +0200 Subject: [PATCH 027/358] Updated examples --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 54cf5e9271..10605e9087 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 54cf5e9271636f6898a174e61aff1eec36bda51a +Subproject commit 10605e9087cf56b802b41bad92ddf273156d1020 From 9cc10c3a0cde6be2d8b303e4afc10d5a55535da5 Mon Sep 17 00:00:00 2001 From: Przemek Date: Wed, 7 Aug 2024 17:30:36 +0200 Subject: [PATCH 028/358] Saving work --- examples_tests | 2 +- src/nbl/video/CVulkanPhysicalDevice.cpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples_tests b/examples_tests index 10605e9087..aefcde9939 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 10605e9087cf56b802b41bad92ddf273156d1020 +Subproject commit aefcde99399b6ff77675fefe5bc16b5d3bb99c6d diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index 541a600b03..f13429efc1 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -1615,7 +1615,8 @@ core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevic vk_deviceFeatures2.features.shaderStorageImageArrayDynamicIndexing = limits.shaderStorageImageArrayDynamicIndexing; vk_deviceFeatures2.features.shaderClipDistance = true; // good device support vk_deviceFeatures2.features.shaderCullDistance = enabledFeatures.shaderCullDistance; - vk_deviceFeatures2.features.shaderFloat64 = limits.shaderFloat64; + //vk_deviceFeatures2.features.shaderFloat64 = limits.shaderFloat64; // TODO: enable back + vk_deviceFeatures2.features.shaderFloat64 = VK_FALSE; vk_deviceFeatures2.features.shaderInt64 = true; // always enable vk_deviceFeatures2.features.shaderInt16 = true; // always enable vk_deviceFeatures2.features.shaderResourceResidency = enabledFeatures.shaderResourceResidency; From 48b133c364cbd2e449c7cfc9e31dffbb024dedf0 Mon Sep 17 00:00:00 2001 From: Przemek Date: Tue, 13 Aug 2024 12:35:39 +0200 Subject: [PATCH 029/358] Saving work --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 964 ++++++++++++------ include/nbl/builtin/hlsl/ieee754.hlsl | 49 +- .../hlsl/impl/emulated_float64_t_impl.hlsl | 704 ++++++------- include/nbl/builtin/hlsl/shapes/beziers.hlsl | 39 +- include/nbl/builtin/hlsl/tgmath.hlsl | 16 +- 6 files changed, 1061 insertions(+), 713 deletions(-) diff --git a/examples_tests b/examples_tests index aefcde9939..9b1edf6a48 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit aefcde99399b6ff77675fefe5bc16b5d3bb99c6d +Subproject commit 9b1edf6a48513f28a7a9312260c3c790a8392232 diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index cd7e3c43fe..dd517c2f8c 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -2,6 +2,14 @@ #define _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_INCLUDED_ #include + +// weird dxc compiler errors +#ifndef __HLSL_VERSION +#define CONST const +#else +#define CONST +#endif + namespace nbl { namespace hlsl @@ -19,27 +27,32 @@ namespace hlsl return emulated_float64_t(bit_cast(float64_t(val))); }*/ - static emulated_float64_t create(int32_t val) + NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(emulated_float64_t val) + { 
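Note that the lerp helper moved into the tgmath namespace above is not an arithmetic interpolation; for the integer bit patterns used throughout the emulated type it is a plain select that returns b when the condition holds and a otherwise. A hedged host-side equivalent of how the surrounding code uses it:

#include <cstdint>

template <typename UInt>
static UInt selectBits(UInt a, UInt b, bool takeB)
{
    return takeB ? b : a; // e.g. selectBits<uint64_t>(defaultPattern, nanPattern, propagateNaN)
}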
+ return createPreserveBitPattern(val.data); + } + + NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(int32_t val) { return emulated_float64_t(bit_cast(float64_t(val))); } - static emulated_float64_t create(int64_t val) + NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(int64_t val) { return emulated_float64_t(bit_cast(float64_t(val))); } - static emulated_float64_t create(uint32_t val) + NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(uint32_t val) { return emulated_float64_t(bit_cast(float64_t(val))); } - static emulated_float64_t create(uint64_t val) + NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(uint64_t val) { return emulated_float64_t(bit_cast(float64_t(val))); } - static emulated_float64_t create(float64_t val) + NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(float64_t val) { return emulated_float64_t(bit_cast(val)); } @@ -50,16 +63,24 @@ namespace hlsl return emulated_float64_t(bit_cast(float64_t(val))); }*/ - static emulated_float64_t create(float32_t val) + NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(float32_t val) { return emulated_float64_t(bit_cast(float64_t(val))); } - static emulated_float64_t createPreserveBitPattern(uint64_t val) + NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t createPreserveBitPattern(uint64_t val) { return emulated_float64_t(val); } + inline float getAsFloat32() + { + // TODO: don't use double + return float(bit_cast(data)); + + } + +#if 0 uint64_t shiftLeftAllowNegBitCnt(uint64_t val, int n) { if (n < 0) @@ -67,229 +88,240 @@ namespace hlsl else return val << n; } +#endif // arithmetic operators - emulated_float64_t operator+(const emulated_float64_t rhs) + emulated_float64_t operator+(const emulated_float64_t rhs) CONST { -// { -// uint64_t lhsSign = data & ieee754::traits::signMask; -// uint64_t rhsSign = rhs.data & ieee754::traits::signMask; -// uint64_t lhsMantissa = ieee754::extractMantissa(data); -// uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); -// int lhsBiasedExp = ieee754::extractBiasedExponent(data); -// int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); -// -// if (tgmath::isnan(data) || tgmath::isnan(rhs.data)) -// return createPreserveBitPattern(ieee754::traits::quietNaN); -// /*if (std::isinf(lhs) || std::isinf(rhs)) -// { -// if (std::isinf(lhs) && !std::isinf(rhs)) -// return lhs; -// if (std::isinf(rhs) && !std::isinf(lhs)) -// return rhs; -// if (rhs == lhs) -// return rhs; -// -// return nan(); -// }*/ -// -// int rp = min(ieee754::extractExponent(data), ieee754::extractExponent(rhs.data)) - ieee754::traits::mantissaBitCnt; -// -// uint64_t lhsRealMantissa = lhsMantissa | (1ull << ieee754::traits::mantissaBitCnt); -// uint64_t rhsRealMantissa = rhsMantissa | (1ull << ieee754::traits::mantissaBitCnt); -// uint64_t lhsSignTmp = lhsSign >> (52 + 11); -// uint64_t rhsSignTmp = rhsSign >> (52 + 11); -// -// uint64_t sign = 0u; -// if (lhsSign != rhsSign) -// { -// uint64_t _min = max(data, rhs.data); -// uint64_t _max = min(data, rhs.data); -// uint64_t minAbs = _min ^ ieee754::traits::signMask; -// if (minAbs > _max) -// sign = ieee754::traits::signMask; -// -// } -// -// int64_t lhsMantissaTmp = (shiftLeftAllowNegBitCnt(lhsRealMantissa, lhsBiasedExp - rp - ieee754::traits::mantissaBitCnt - ieee754::traits::exponentBias) ^ (-lhsSignTmp)) + lhsSignTmp; -// int64_t rhsMantissaTmp = (shiftLeftAllowNegBitCnt(rhsRealMantissa, rhsBiasedExp - rp - ieee754::traits::mantissaBitCnt - ieee754::traits::exponentBias) ^ (-rhsSignTmp)) + rhsSignTmp; -// -// uint64_t addTmp = 
bit_cast(lhsMantissaTmp + rhsMantissaTmp); -// -// // renormalize -// if (!FastMath && false) // TODO: hande nan -// { -// -// } -// else -// { -//#ifndef __HLSL_VERSION -// int l2 = log2(double(addTmp)); -//#else -// int intl2 = 0; -//#endif -// -// if (!FastMath && (rp + l2 + 1 < nbl::hlsl::numeric_limits::min_exponent)) -// { -// return createPreserveBitPattern(impl::assembleFloat64(0, ieee754::traits::exponentMask, 0)); -// } -// else -// { -// rp = addTmp ? l2 + rp + ieee754::traits::exponentBias : 0; -// return createPreserveBitPattern(impl::assembleFloat64( -// sign, -// (uint64_t(rp) << ieee754::traits::mantissaBitCnt) & ieee754::traits::exponentMask, -// shiftLeftAllowNegBitCnt(addTmp, (ieee754::traits::mantissaBitCnt - l2)) & ieee754::traits::mantissaMask) -// ); -// } -// } -// } - - emulated_float64_t retval = createPreserveBitPattern(0u); - - uint64_t mantissa; - uint32_t3 mantissaExtended; - int biasedExp; - - uint64_t lhsSign = data & ieee754::traits::signMask; - uint64_t rhsSign = rhs.data & ieee754::traits::signMask; - uint64_t lhsMantissa = ieee754::extractMantissa(data); - uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); - int lhsBiasedExp = ieee754::extractBiasedExponent(data); - int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); - - int expDiff = lhsBiasedExp - rhsBiasedExp; - - if (lhsSign == rhsSign) +#if 0 { - if (expDiff == 0) + uint64_t lhsSign = data & ieee754::traits::signMask; + uint64_t rhsSign = rhs.data & ieee754::traits::signMask; + uint64_t lhsMantissa = ieee754::extractMantissa(data); + uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); + int lhsBiasedExp = ieee754::extractBiasedExponent(data); + int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); + + if (tgmath::isnan(data) || tgmath::isnan(rhs.data)) + return createPreserveBitPattern(ieee754::traits::quietNaN); + /*if (std::isinf(lhs) || std::isinf(rhs)) { - if (lhsBiasedExp == ieee754::traits::specialValueExp) - { - bool propagate = (lhsMantissa | rhsMantissa) != 0u; - return createPreserveBitPattern(tgmath::lerp(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); - } - - mantissa = lhsMantissa + rhsMantissa; - if (lhsBiasedExp == 0) - return createPreserveBitPattern(impl::assembleFloat64(lhsSign, 0, mantissa)); - mantissaExtended.xy = impl::packUint64(mantissa); - mantissaExtended.x |= 0x00200000u; - mantissaExtended.z = 0u; - biasedExp = lhsBiasedExp; + if (std::isinf(lhs) && !std::isinf(rhs)) + return lhs; + if (std::isinf(rhs) && !std::isinf(lhs)) + return rhs; + if (rhs == lhs) + return rhs; + + return nan(); + }*/ + + int rp = min(ieee754::extractExponent(data), ieee754::extractExponent(rhs.data)) - ieee754::traits::mantissaBitCnt; + + uint64_t lhsRealMantissa = lhsMantissa | (1ull << ieee754::traits::mantissaBitCnt); + uint64_t rhsRealMantissa = rhsMantissa | (1ull << ieee754::traits::mantissaBitCnt); + uint64_t lhsSignTmp = lhsSign >> (52 + 11); + uint64_t rhsSignTmp = rhsSign >> (52 + 11); + + uint64_t sign = 0u; + if (lhsSign != rhsSign) + { + uint64_t _min = max(data, rhs.data); + uint64_t _max = min(data, rhs.data); + uint64_t minAbs = _min ^ ieee754::traits::signMask; + if (minAbs > _max) + sign = ieee754::traits::signMask; - mantissaExtended = impl::shift64ExtraRightJamming(mantissaExtended, 1); } - else + + int64_t lhsMantissaTmp = (shiftLeftAllowNegBitCnt(lhsRealMantissa, lhsBiasedExp - rp - ieee754::traits::mantissaBitCnt - ieee754::traits::exponentBias) ^ (-lhsSignTmp)) + lhsSignTmp; + int64_t rhsMantissaTmp = 
(shiftLeftAllowNegBitCnt(rhsRealMantissa, rhsBiasedExp - rp - ieee754::traits::mantissaBitCnt - ieee754::traits::exponentBias) ^ (-rhsSignTmp)) + rhsSignTmp; + + uint64_t addTmp = bit_cast(lhsMantissaTmp + rhsMantissaTmp); + + // renormalize + if (!FastMath && false) // TODO: hande nan { - if (expDiff < 0) - { - swap(lhsMantissa, rhsMantissa); - swap(lhsBiasedExp, rhsBiasedExp); - } - - if (lhsBiasedExp == ieee754::traits::specialValueExp) - { - const bool propagate = (lhsMantissa) != 0u; - return createPreserveBitPattern(tgmath::lerp(ieee754::traits::exponentMask | lhsSign, impl::propagateFloat64NaN(data, rhs.data), propagate)); - } - - expDiff = tgmath::lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); - rhsMantissa = tgmath::lerp(rhsMantissa | (1ull << 52), rhsMantissa, rhsBiasedExp == 0); - const uint32_t3 shifted = impl::shift64ExtraRightJamming(uint32_t3(impl::packUint64(rhsMantissa), 0u), expDiff); - rhsMantissa = impl::unpackUint64(shifted.xy); - mantissaExtended.z = shifted.z; - biasedExp = lhsBiasedExp; - - lhsMantissa |= (1ull << 52); - mantissaExtended.xy = impl::packUint64(lhsMantissa + rhsMantissa); - --biasedExp; - if (!(mantissaExtended.x < 0x00200000u)) - { - mantissaExtended = impl::shift64ExtraRightJamming(mantissaExtended, 1); - ++biasedExp; - } - - return createPreserveBitPattern(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended.xyz)); + } - - // cannot happen but compiler cries about not every path returning value - return createPreserveBitPattern(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended)); - } - else - { - lhsMantissa = impl::shortShift64Left(lhsMantissa, 10); - rhsMantissa = impl::shortShift64Left(rhsMantissa, 10); - - if (expDiff != 0) + else { - uint32_t2 frac; - - if (expDiff < 0) +#ifndef __HLSL_VERSION + int l2 = log2(double(addTmp)); +#else + int intl2 = 0; +#endif + + if (!FastMath && (rp + l2 + 1 < nbl::hlsl::numeric_limits::min_exponent)) { - swap(lhsMantissa, rhsMantissa); - swap(lhsBiasedExp, rhsBiasedExp); - lhsSign ^= ieee754::traits::signMask; + return createPreserveBitPattern(impl::assembleFloat64(0, ieee754::traits::exponentMask, 0)); } - - if (lhsBiasedExp == ieee754::traits::specialValueExp) + else { - bool propagate = lhsMantissa != 0u; - return createPreserveBitPattern(tgmath::lerp(impl::assembleFloat64(lhsSign, ieee754::traits::exponentMask, 0ull), impl::propagateFloat64NaN(data, rhs.data), propagate)); + rp = addTmp ? 
l2 + rp + ieee754::traits::exponentBias : 0; + return createPreserveBitPattern(impl::assembleFloat64( + sign, + (uint64_t(rp) << ieee754::traits::mantissaBitCnt) & ieee754::traits::exponentMask, + shiftLeftAllowNegBitCnt(addTmp, (ieee754::traits::mantissaBitCnt - l2)) & ieee754::traits::mantissaMask) + ); } - - expDiff = tgmath::lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); - rhsMantissa = tgmath::lerp(rhsMantissa | 0x4000000000000000ull, rhsMantissa, rhsBiasedExp == 0); - rhsMantissa = impl::unpackUint64(impl::shift64RightJamming(impl::packUint64(rhsMantissa), expDiff)); - lhsMantissa |= 0x4000000000000000ull; - frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); - biasedExp = lhsBiasedExp; - --biasedExp; - return createPreserveBitPattern(impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 10, frac.x, frac.y)); } - if (lhsBiasedExp == ieee754::traits::specialValueExp) - { - bool propagate = ((lhsMantissa) | (rhsMantissa)) != 0u; - return createPreserveBitPattern(tgmath::lerp(ieee754::traits::quietNaN, impl::propagateFloat64NaN(data, rhs.data), propagate)); - } - rhsBiasedExp = tgmath::lerp(rhsBiasedExp, 1, lhsBiasedExp == 0); - lhsBiasedExp = tgmath::lerp(lhsBiasedExp, 1, lhsBiasedExp == 0); - + } +#endif - const uint32_t2 lhsMantissaPacked = impl::packUint64(lhsMantissa); - const uint32_t2 rhsMantissaPacked = impl::packUint64(rhsMantissa); + if (FlushDenormToZero) + { + emulated_float64_t retval = createPreserveBitPattern(0u); - uint32_t2 frac; - uint64_t signOfDifference = 0; - if (rhsMantissaPacked.x < lhsMantissaPacked.x) - { - frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); - } - else if (lhsMantissaPacked.x < rhsMantissaPacked.x) - { - frac.xy = impl::packUint64(rhsMantissa - lhsMantissa); - signOfDifference = ieee754::traits::signMask; - } - else if (rhsMantissaPacked.y <= lhsMantissaPacked.y) + uint64_t mantissa; + uint32_t3 mantissaExtended; + int biasedExp; + + uint64_t lhsSign = data & ieee754::traits::signMask; + uint64_t rhsSign = rhs.data & ieee754::traits::signMask; + uint64_t lhsMantissa = ieee754::extractMantissa(data); + uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); + int lhsBiasedExp = ieee754::extractBiasedExponent(data); + int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); + + int expDiff = lhsBiasedExp - rhsBiasedExp; + + if (lhsSign == rhsSign) { - /* It is possible that frac.x and frac.y may be zero after this. 
*/ - frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); + if (expDiff == 0) + { + if (lhsBiasedExp == ieee754::traits::specialValueExp) + { + bool propagate = (lhsMantissa | rhsMantissa) != 0u; + return createPreserveBitPattern(tgmath::lerp(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); + } + + mantissa = lhsMantissa + rhsMantissa; + if (lhsBiasedExp == 0) + return createPreserveBitPattern(impl::assembleFloat64(lhsSign, 0, mantissa)); + mantissaExtended.xy = impl::packUint64(mantissa); + mantissaExtended.x |= 0x00200000u; + mantissaExtended.z = 0u; + biasedExp = lhsBiasedExp; + + mantissaExtended = impl::shift64ExtraRightJamming(mantissaExtended, 1); + } + else + { + if (expDiff < 0) + { + swap(lhsMantissa, rhsMantissa); + swap(lhsBiasedExp, rhsBiasedExp); + } + + if (lhsBiasedExp == ieee754::traits::specialValueExp) + { + const bool propagate = (lhsMantissa) != 0u; + return createPreserveBitPattern(tgmath::lerp(ieee754::traits::exponentMask | lhsSign, impl::propagateFloat64NaN(data, rhs.data), propagate)); + } + + expDiff = tgmath::lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); + rhsMantissa = tgmath::lerp(rhsMantissa | (1ull << 52), rhsMantissa, rhsBiasedExp == 0); + const uint32_t3 shifted = impl::shift64ExtraRightJamming(uint32_t3(impl::packUint64(rhsMantissa), 0u), expDiff); + rhsMantissa = impl::unpackUint64(shifted.xy); + mantissaExtended.z = shifted.z; + biasedExp = lhsBiasedExp; + + lhsMantissa |= (1ull << 52); + mantissaExtended.xy = impl::packUint64(lhsMantissa + rhsMantissa); + --biasedExp; + if (!(mantissaExtended.x < 0x00200000u)) + { + mantissaExtended = impl::shift64ExtraRightJamming(mantissaExtended, 1); + ++biasedExp; + } + + return createPreserveBitPattern(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended.xyz)); + } + + // cannot happen but compiler cries about not every path returning value + return createPreserveBitPattern(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended)); } else { - frac.xy = impl::packUint64(rhsMantissa - lhsMantissa); - signOfDifference = ieee754::traits::signMask; + lhsMantissa = impl::shortShift64Left(lhsMantissa, 10); + rhsMantissa = impl::shortShift64Left(rhsMantissa, 10); + + if (expDiff != 0) + { + uint32_t2 frac; + + if (expDiff < 0) + { + swap(lhsMantissa, rhsMantissa); + swap(lhsBiasedExp, rhsBiasedExp); + lhsSign ^= ieee754::traits::signMask; + } + + if (lhsBiasedExp == ieee754::traits::specialValueExp) + { + bool propagate = lhsMantissa != 0u; + return createPreserveBitPattern(tgmath::lerp(impl::assembleFloat64(lhsSign, ieee754::traits::exponentMask, 0ull), impl::propagateFloat64NaN(data, rhs.data), propagate)); + } + + expDiff = tgmath::lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); + rhsMantissa = tgmath::lerp(rhsMantissa | 0x4000000000000000ull, rhsMantissa, rhsBiasedExp == 0); + rhsMantissa = impl::unpackUint64(impl::shift64RightJamming(impl::packUint64(rhsMantissa), expDiff)); + lhsMantissa |= 0x4000000000000000ull; + frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); + biasedExp = lhsBiasedExp; + --biasedExp; + return createPreserveBitPattern(impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 10, frac.x, frac.y)); + } + if (lhsBiasedExp == ieee754::traits::specialValueExp) + { + bool propagate = ((lhsMantissa) | (rhsMantissa)) != 0u; + return createPreserveBitPattern(tgmath::lerp(ieee754::traits::quietNaN, impl::propagateFloat64NaN(data, rhs.data), propagate)); + } + rhsBiasedExp = tgmath::lerp(rhsBiasedExp, 1, lhsBiasedExp == 0); + lhsBiasedExp = 
tgmath::lerp(lhsBiasedExp, 1, lhsBiasedExp == 0); + + + const uint32_t2 lhsMantissaPacked = impl::packUint64(lhsMantissa); + const uint32_t2 rhsMantissaPacked = impl::packUint64(rhsMantissa); + + uint32_t2 frac; + uint64_t signOfDifference = 0; + if (rhsMantissaPacked.x < lhsMantissaPacked.x) + { + frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); + } + else if (lhsMantissaPacked.x < rhsMantissaPacked.x) + { + frac.xy = impl::packUint64(rhsMantissa - lhsMantissa); + signOfDifference = ieee754::traits::signMask; + } + else if (rhsMantissaPacked.y <= lhsMantissaPacked.y) + { + /* It is possible that frac.x and frac.y may be zero after this. */ + frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); + } + else + { + frac.xy = impl::packUint64(rhsMantissa - lhsMantissa); + signOfDifference = ieee754::traits::signMask; + } + + biasedExp = tgmath::lerp(rhsBiasedExp, lhsBiasedExp, signOfDifference == 0u); + lhsSign ^= signOfDifference; + uint64_t retval_0 = impl::packFloat64(uint32_t(FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) << 31, 0, 0u, 0u); + uint64_t retval_1 = impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 11, frac.x, frac.y); + return createPreserveBitPattern(tgmath::lerp(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); } - - biasedExp = tgmath::lerp(rhsBiasedExp, lhsBiasedExp, signOfDifference == 0u); - lhsSign ^= signOfDifference; - uint64_t retval_0 = impl::packFloat64(uint32_t(FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) << 31, 0, 0u, 0u); - uint64_t retval_1 = impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 11, frac.x, frac.y); - return createPreserveBitPattern(tgmath::lerp(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); + } + else + { + //static_assert(false, "not implemented yet"); + return createPreserveBitPattern(0xdeadbeefbadcaffeull); } } - emulated_float64_t operator-(emulated_float64_t rhs) + emulated_float64_t operator-(emulated_float64_t rhs) CONST { emulated_float64_t lhs = createPreserveBitPattern(data); emulated_float64_t rhsFlipped = rhs.flipSign(); @@ -297,74 +329,82 @@ namespace hlsl return lhs + rhsFlipped; } - emulated_float64_t operator*(emulated_float64_t rhs) + emulated_float64_t operator*(emulated_float64_t rhs) CONST { - emulated_float64_t retval = emulated_float64_t::createPreserveBitPattern(0u); - - uint64_t lhsSign = data & ieee754::traits::signMask; - uint64_t rhsSign = rhs.data & ieee754::traits::signMask; - uint64_t lhsMantissa = ieee754::extractMantissa(data); - uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); - int lhsBiasedExp = ieee754::extractBiasedExponent(data); - int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); - - int exp = int(lhsBiasedExp + rhsBiasedExp) - ieee754::traits::exponentBias; - uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; - if (!FastMath) + if(FlushDenormToZero) { - if (lhsBiasedExp == ieee754::traits::specialValueExp) + emulated_float64_t retval = emulated_float64_t::createPreserveBitPattern(0u); + + uint64_t lhsSign = data & ieee754::traits::signMask; + uint64_t rhsSign = rhs.data & ieee754::traits::signMask; + uint64_t lhsMantissa = ieee754::extractMantissa(data); + uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); + int lhsBiasedExp = ieee754::extractBiasedExponent(data); + int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); + + int exp = int(lhsBiasedExp + rhsBiasedExp) - ieee754::traits::exponentBias; + uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; + if (!FastMath) { - if ((lhsMantissa != 0u) || ((rhsBiasedExp == 
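As the code above shows, subtraction is not implemented separately: operator- flips the sign bit of the right-hand operand and falls through to operator+. A one-line host-side sketch of that sign flip (illustrative only):

#include <cstdint>

static uint64_t flipSignBit(uint64_t bits) { return bits ^ 0x8000000000000000ull; } // a - b == a + flip(b)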
ieee754::traits::specialValueExp) && (rhsMantissa != 0u))) - return createPreserveBitPattern(impl::propagateFloat64NaN(data, rhs.data)); - if ((uint64_t(rhsBiasedExp) | rhsMantissa) == 0u) - return createPreserveBitPattern(ieee754::traits::quietNaN); + if (lhsBiasedExp == ieee754::traits::specialValueExp) + { + if ((lhsMantissa != 0u) || ((rhsBiasedExp == ieee754::traits::specialValueExp) && (rhsMantissa != 0u))) + return createPreserveBitPattern(impl::propagateFloat64NaN(data, rhs.data)); + if ((uint64_t(rhsBiasedExp) | rhsMantissa) == 0u) + return createPreserveBitPattern(ieee754::traits::quietNaN); - return createPreserveBitPattern(impl::assembleFloat64(sign, ieee754::traits::exponentMask, 0ull)); - } - if (rhsBiasedExp == ieee754::traits::specialValueExp) - { - /* a cannot be NaN, but is b NaN? */ - if (rhsMantissa != 0u) + return createPreserveBitPattern(impl::assembleFloat64(sign, ieee754::traits::exponentMask, 0ull)); + } + if (rhsBiasedExp == ieee754::traits::specialValueExp) + { + /* a cannot be NaN, but is b NaN? */ + if (rhsMantissa != 0u) #ifdef RELAXED_NAN_PROPAGATION - return rhs.data; + return rhs.data; #else - return createPreserveBitPattern(impl::propagateFloat64NaN(data, rhs.data)); + return createPreserveBitPattern(impl::propagateFloat64NaN(data, rhs.data)); #endif - if ((uint64_t(lhsBiasedExp) | lhsMantissa) == 0u) - return createPreserveBitPattern(ieee754::traits::quietNaN); + if ((uint64_t(lhsBiasedExp) | lhsMantissa) == 0u) + return createPreserveBitPattern(ieee754::traits::quietNaN); - return createPreserveBitPattern(sign | ieee754::traits::exponentMask); - } - if (lhsBiasedExp == 0) - { - if (lhsMantissa == 0u) - return createPreserveBitPattern(sign); - impl::normalizeFloat64Subnormal(lhsMantissa, lhsBiasedExp, lhsMantissa); + return createPreserveBitPattern(sign | ieee754::traits::exponentMask); + } + if (lhsBiasedExp == 0) + { + if (lhsMantissa == 0u) + return createPreserveBitPattern(sign); + impl::normalizeFloat64Subnormal(lhsMantissa, lhsBiasedExp, lhsMantissa); + } + if (rhsBiasedExp == 0) + { + if (rhsMantissa == 0u) + return createPreserveBitPattern(sign); + impl::normalizeFloat64Subnormal(rhsMantissa, rhsBiasedExp, rhsMantissa); + } } - if (rhsBiasedExp == 0) + + const uint64_t hi_l = (lhsMantissa >> 21) | (1ull << 31); + const uint64_t lo_l = lhsMantissa & ((1ull << 21) - 1); + const uint64_t hi_r = (rhsMantissa >> 21) | (1ull << 31); + const uint64_t lo_r = rhsMantissa & ((1ull << 21) - 1); + + //const uint64_t RoundToNearest = (1ull << 31) - 1; + uint64_t newPseudoMantissa = ((hi_l * hi_r) >> 10) + ((hi_l * lo_r + lo_l * hi_r/* + RoundToNearest*/) >> 31); + + if (newPseudoMantissa & (0x1ull << 53)) { - if (rhsMantissa == 0u) - return createPreserveBitPattern(sign); - impl::normalizeFloat64Subnormal(rhsMantissa, rhsBiasedExp, rhsMantissa); + newPseudoMantissa >>= 1; + ++exp; } - } - - const uint64_t hi_l = (lhsMantissa >> 21) | (1ull << 31); - const uint64_t lo_l = lhsMantissa & ((1ull << 21) - 1); - const uint64_t hi_r = (rhsMantissa >> 21) | (1ull << 31); - const uint64_t lo_r = rhsMantissa & ((1ull << 21) - 1); + newPseudoMantissa &= (ieee754::traits::mantissaMask); - //const uint64_t RoundToNearest = (1ull << 31) - 1; - uint64_t newPseudoMantissa = ((hi_l * hi_r) >> 10) + ((hi_l * lo_r + lo_l * hi_r/* + RoundToNearest*/) >> 31); - - if (newPseudoMantissa & (0x1ull << 53)) + return createPreserveBitPattern(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, newPseudoMantissa)); + } + else { - newPseudoMantissa >>= 1; - ++exp; 
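The multiplication core above forms the product of two 53-bit significands (implicit leading 1 plus 52 stored bits) without a 128-bit multiply: each significand is split into a 32-bit high part and a 21-bit low part, the small low*low term is dropped, and the shifts line the partial products up so the sum is the full product scaled down by 2^52. A host-side sketch of that decomposition (illustrative, truncating rather than rounding):

#include <cstdint>

// lhsMantissa/rhsMantissa are the 52 stored mantissa bits; the implicit 1 is OR'ed in below
static uint64_t approxSignificandProduct(uint64_t lhsMantissa, uint64_t rhsMantissa)
{
    const uint64_t hi_l = (lhsMantissa >> 21) | (1ull << 31); // top 31 stored bits + implicit 1
    const uint64_t lo_l = lhsMantissa & ((1ull << 21) - 1);   // bottom 21 stored bits
    const uint64_t hi_r = (rhsMantissa >> 21) | (1ull << 31);
    const uint64_t lo_r = rhsMantissa & ((1ull << 21) - 1);
    // hi*hi carries weight 2^42 and the cross terms weight 2^21 relative to the 106-bit
    // product, so >>10 and >>31 both rescale to product/2^52, a 53..54 bit value
    return ((hi_l * hi_r) >> 10) + ((hi_l * lo_r + lo_l * hi_r) >> 31);
}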
+ //static_assert(false, "not implemented yet"); + return createPreserveBitPattern(0xdeadbeefbadcaffeull); } - newPseudoMantissa &= (ieee754::traits::mantissaMask); - - return createPreserveBitPattern(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, newPseudoMantissa)); } /*emulated_float64_t reciprocal(uint64_t x) @@ -375,35 +415,58 @@ namespace hlsl return output; }*/ - emulated_float64_t operator/(const emulated_float64_t rhs) + emulated_float64_t operator/(const emulated_float64_t rhs) CONST { - //return emulated_float64_t::createPreserveBitPattern(data) * reciprocal(rhs.data); + if (FlushDenormToZero) + { + //return emulated_float64_t::createPreserveBitPattern(data) * reciprocal(rhs.data); - const uint64_t lhsRealMantissa = (ieee754::extractMantissa(data) | (1ull << ieee754::traits::mantissaBitCnt)); - const uint64_t rhsRealMantissa = ieee754::extractMantissa(rhs.data) | (1ull << ieee754::traits::mantissaBitCnt); - - const uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; - int exp = ieee754::extractExponent(data) - ieee754::extractExponent(rhs.data) + ieee754::traits::exponentBias; + if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) + return createPreserveBitPattern(ieee754::traits::quietNaN); + if (!FastMath && ((rhs.data << 1) == 0)) + return createPreserveBitPattern(ieee754::traits::quietNaN); - uint64_t2 lhsMantissaShifted = impl::shiftMantissaLeftBy53(lhsRealMantissa); - uint64_t mantissa = impl::divmod128by64(lhsMantissaShifted.x, lhsMantissaShifted.y, rhsRealMantissa); + const uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; - if (mantissa & (0x1ull << 53)) - { - ++exp; + if (!FastMath && impl::areBothInfinity(data, rhs.data)) + return createPreserveBitPattern(ieee754::traits::quietNaN | sign); + + if (!FastMath && tgmath::isInf(data)) + return createPreserveBitPattern((data & ~ieee754::traits::signMask) | sign); + + if (!FastMath && tgmath::isInf(rhs.data)) + return createPreserveBitPattern(0ull | sign); + + + + const uint64_t lhsRealMantissa = (ieee754::extractMantissa(data) | (1ull << ieee754::traits::mantissaBitCnt)); + const uint64_t rhsRealMantissa = ieee754::extractMantissa(rhs.data) | (1ull << ieee754::traits::mantissaBitCnt); + + int exp = ieee754::extractExponent(data) - ieee754::extractExponent(rhs.data) + ieee754::traits::exponentBias; + + uint64_t2 lhsMantissaShifted = impl::shiftMantissaLeftBy53(lhsRealMantissa); + uint64_t mantissa = impl::divmod128by64(lhsMantissaShifted.x, lhsMantissaShifted.y, rhsRealMantissa); + + while (mantissa < (1ull << 52)) + { + mantissa <<= 1; + exp--; + } + + mantissa &= ieee754::traits::mantissaMask; + + return createPreserveBitPattern(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, mantissa)); } else { - mantissa >>= 1; + //static_assert(false, "not implemented yet"); + return createPreserveBitPattern(0xdeadbeefbadcaffeull); } - - mantissa &= ieee754::traits::mantissaMask; - - return createPreserveBitPattern(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, mantissa)); } // relational operators - bool operator==(emulated_float64_t rhs) + // TODO: should `FlushDenormToZero` affect relational operators? 
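The division above follows the same significand-level pattern: the dividend significand is widened to a 128-bit value shifted left by 53, divided by the divisor significand (impl::divmod128by64), and the quotient is renormalized so its leading 1 sits at bit 52, with sign and exponent bookkeeping handled around it. A self-contained sketch of that step, assuming a compiler with unsigned __int128 and simplifying the exponent handling to a power-of-two scale:

#include <cstdint>

// returns q in [2^52, 2^53) with lhsSig/rhsSig ~= q * 2^scale (truncated); inputs are 53-bit significands
static uint64_t quotientSignificand(uint64_t lhsSig, uint64_t rhsSig, int &scale)
{
    unsigned __int128 dividend = (unsigned __int128)lhsSig << 53; // what shiftMantissaLeftBy53 builds as two 64-bit halves
    uint64_t q = (uint64_t)(dividend / rhsSig);                   // the 128-by-64 divide
    scale = -53;
    while (q >= (1ull << 53)) { q >>= 1; ++scale; } // renormalize into [2^52, 2^53)
    while (q <  (1ull << 52)) { q <<= 1; --scale; }
    return q;
}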
+ bool operator==(emulated_float64_t rhs) CONST { if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return false; @@ -418,11 +481,11 @@ namespace hlsl return !(xored.data); } - bool operator!=(emulated_float64_t rhs) + bool operator!=(emulated_float64_t rhs) CONST { return !(createPreserveBitPattern(data) == rhs); } - bool operator<(emulated_float64_t rhs) + bool operator<(emulated_float64_t rhs) CONST { if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return false; @@ -442,7 +505,7 @@ namespace hlsl return (lhsFlipped & diffBits) < (rhsFlipped & diffBits); } - bool operator>(emulated_float64_t rhs) + bool operator>(emulated_float64_t rhs) CONST { if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return true; @@ -462,18 +525,13 @@ namespace hlsl return (lhsFlipped & diffBits) > (rhsFlipped & diffBits); } - bool operator<=(emulated_float64_t rhs) { return !(emulated_float64_t::createPreserveBitPattern(data) > emulated_float64_t::createPreserveBitPattern(rhs.data)); } + bool operator<=(emulated_float64_t rhs) CONST { return !(emulated_float64_t::createPreserveBitPattern(data) > emulated_float64_t::createPreserveBitPattern(rhs.data)); } bool operator>=(emulated_float64_t rhs) { return !(emulated_float64_t::createPreserveBitPattern(data) < emulated_float64_t::createPreserveBitPattern(rhs.data)); } //logical operators - bool operator&&(emulated_float64_t rhs) { return bool(data) && bool(rhs.data); } - bool operator||(emulated_float64_t rhs) { return bool(data) || bool(rhs.data); } - bool operator!() { return !bool(data); } - - // OMITED OPERATORS - // - not implementing bitwise and modulo operators since floating point types doesn't support them - // - compound operator overload not supported in HLSL - // - access operators (dereference and addressof) not supported in HLSL + bool operator&&(emulated_float64_t rhs) CONST { return bool(data) && bool(rhs.data); } + bool operator||(emulated_float64_t rhs) CONST { return bool(data) || bool(rhs.data); } + bool operator!() CONST { return !bool(data); } // TODO: should modify self? 
emulated_float64_t flipSign() @@ -496,31 +554,31 @@ struct traits_base\ NBL_CONSTEXPR_STATIC_INLINE int16_t mantissaBitCnt = 52;\ };\ template<>\ -uint32_t extractBiasedExponent(Type x)\ +static inline uint32_t extractBiasedExponent(Type x)\ {\ return extractBiasedExponent(x.data);\ }\ \ template<>\ -int extractExponent(Type x)\ +static inline int extractExponent(Type x)\ {\ return extractExponent(x.data);\ }\ \ template<>\ -Type replaceBiasedExponent(Type x, typename unsigned_integer_of_size::type biasedExp)\ +NBL_CONSTEXPR_STATIC_INLINE Type replaceBiasedExponent(Type x, typename unsigned_integer_of_size::type biasedExp)\ {\ return Type(replaceBiasedExponent(x.data, biasedExp));\ }\ \ template <>\ -Type fastMulExp2(Type x, int n)\ +NBL_CONSTEXPR_STATIC_INLINE Type fastMulExp2(Type x, int n)\ {\ return Type(replaceBiasedExponent(x.data, extractBiasedExponent(x) + uint32_t(n)));\ }\ \ template <>\ -unsigned_integer_of_size::type extractMantissa(Type x)\ +NBL_CONSTEXPR_STATIC_INLINE unsigned_integer_of_size::type extractMantissa(Type x)\ {\ return extractMantissa(x.data);\ }\ @@ -531,42 +589,298 @@ namespace ieee754 IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); - - //template<> - //uint32_t extractBiasedExponent(emulated_float64_t x) - //{ - // return extractBiasedExponent(x.data); - //} - - //template<> - //int extractExponent(emulated_float64_t x) - //{ - // return extractExponent(x.data); - //} - - //template<> - // emulated_float64_t replaceBiasedExponent(emulated_float64_t x, typename unsigned_integer_of_size::type biasedExp) - //{ - // return emulated_float64_t(replaceBiasedExponent(x.data, biasedExp)); - //} - - ////// performs no overflow tests, returns x*exp2(n) - //template <> - // emulated_float64_t fastMulExp2(emulated_float64_t x, int n) - //{ - // return emulated_float64_t(replaceBiasedExponent(x.data, extractBiasedExponent(x) + uint32_t(n))); - //} - - //template <> - //unsigned_integer_of_size::type extractMantissa(emulated_float64_t x) - //{ - // return extractMantissa(x.data); - //} } +// TODO: finish it + +// TODO: this is mess, refactorize it +#ifndef __HLSL_VERSION +using ef64_t2 = vector; +using ef64_t3 = vector; +using ef64_t4 = vector; +using ef64_t3x3 = matrix; +using ef64_t2x2 = matrix; +#else +struct ef64_t2 +{ + emulated_float64_t x; + emulated_float64_t y; + + emulated_float64_t calcComponentSum() CONST + { + return x + y; + } + + NBL_CONSTEXPR_STATIC_INLINE ef64_t2 create(emulated_float64_t x, emulated_float64_t y) + { + ef64_t2 output; + output.x = x; + output.y = y; + + return output; + } + + NBL_CONSTEXPR_STATIC_INLINE ef64_t2 create(float val) + { + ef64_t2 output; + output.x = emulated_float64_t::create(val); + output.y = emulated_float64_t::create(val); + + return output; + } + + NBL_CONSTEXPR_STATIC_INLINE ef64_t2 create(float32_t2 val) + { + ef64_t2 output; + output.x = emulated_float64_t::create(val.x); + output.y = emulated_float64_t::create(val.y); + + return output; + } + + ef64_t2 operator+(float rhs) + { + ef64_t2 output; + emulated_float64_t rhsAsEF64 = emulated_float64_t::create(rhs); + output.x = x + rhsAsEF64; + output.y = y + rhsAsEF64; + + return output; + } + + ef64_t2 operator+(emulated_float64_t rhs) + { + ef64_t2 output; + output.x = x + rhs; + output.y = y + rhs; + + return output; + } + + ef64_t2 operator+(ef64_t2 rhs) + { + ef64_t2 output; + output.x = x + rhs.x; + 
output.y = y + rhs.y; + + return output; + } + + ef64_t2 operator-(float rhs) + { + return create(x, y) + (-rhs); + } + + ef64_t2 operator-(emulated_float64_t rhs) + { + return create(x, y) + (rhs.flipSign()); + } + + ef64_t2 operator-(ef64_t2 rhs) + { + rhs.x = rhs.x.flipSign(); + rhs.y = rhs.y.flipSign(); + return create(x, y) + rhs; + } + + ef64_t2 operator*(float rhs) + { + ef64_t2 output; + emulated_float64_t rhsAsEF64 = emulated_float64_t::create(rhs); + output.x = x * rhsAsEF64; + output.y = y * rhsAsEF64; + + return output; + } + + ef64_t2 operator*(emulated_float64_t rhs) + { + ef64_t2 output; + output.x = x * rhs; + output.y = y * rhs; + + return output; + } + + ef64_t2 operator*(ef64_t2 rhs) + { + ef64_t2 output; + output.x = x * rhs.x; + output.y = y * rhs.y; + + return output; + } + + float2 getAsFloat2() + { + return float2(x.getAsFloat32(), y.getAsFloat32()); + } +}; + +struct ef64_t3 +{ + emulated_float64_t x; + emulated_float64_t y; + emulated_float64_t z; + + static ef64_t3 create(NBL_REF_ARG(ef64_t3) other) + { + ef64_t3 output; + + output.x = other.x; + output.y = other.y; + output.z = other.z; + + return output; + } + + static ef64_t3 create(NBL_REF_ARG(ef64_t2) other, emulated_float64_t z) + { + ef64_t3 output; + + output.x = other.x; + output.y = other.y; + output.z = z; + + return output; + } + + static ef64_t3 create(NBL_REF_ARG(ef64_t2) other, int z) + { + ef64_t3 output; + + output.x = other.x; + output.y = other.y; + output.z = emulated_float64_t::create(z); + + return output; + } + + emulated_float64_t calcComponentSum() CONST + { + return x + y + z; + } + + ef64_t3 operator*(NBL_CONST_REF_ARG(ef64_t3) rhs) CONST + { + ef64_t3 output; + output.x = x * rhs.x; + output.y = x * rhs.y; + output.z = x * rhs.z; + + return output; + } +}; + +struct ef64_t4 +{ + emulated_float64_t x; + emulated_float64_t y; + emulated_float64_t z; + emulated_float64_t w; +}; + +struct ef64_t3x3 +{ + ef64_t3 columns[3]; + + ef64_t3x3 getTransposed() CONST + { + ef64_t3x3 output; + + output.columns[1].x = columns[0].y; + output.columns[2].x = columns[0].z; + + output.columns[0].y = columns[1].x; + output.columns[2].y = columns[1].z; + + output.columns[0].z = columns[3].x; + output.columns[1].z = columns[3].y; + + return output; + } + + ef64_t3x3 operator*(NBL_CONST_REF_ARG(ef64_t3x3) rhs) CONST + { + ef64_t3x3 output; + ef64_t3x3 lhsTransposed = getTransposed(); + + output.columns[0].x = (lhsTransposed.columns[0] * rhs.columns[0]).calcComponentSum(); + output.columns[0].y = (lhsTransposed.columns[0] * rhs.columns[1]).calcComponentSum(); + output.columns[0].z = (lhsTransposed.columns[0] * rhs.columns[2]).calcComponentSum(); + + output.columns[1].x = (lhsTransposed.columns[1] * rhs.columns[0]).calcComponentSum(); + output.columns[1].y = (lhsTransposed.columns[1] * rhs.columns[1]).calcComponentSum(); + output.columns[1].z = (lhsTransposed.columns[1] * rhs.columns[2]).calcComponentSum(); + + output.columns[2].x = (lhsTransposed.columns[2] * rhs.columns[0]).calcComponentSum(); + output.columns[2].y = (lhsTransposed.columns[2] * rhs.columns[1]).calcComponentSum(); + output.columns[2].z = (lhsTransposed.columns[2] * rhs.columns[2]).calcComponentSum(); + + return output; + } + + ef64_t3 operator*(NBL_CONST_REF_ARG(ef64_t3) rhs) + { + ef64_t3 output; + ef64_t3x3 lhsTransposed = getTransposed(); + + output.x = (columns[0] * rhs).calcComponentSum(); + output.y = (columns[1] * rhs).calcComponentSum(); + output.z = (columns[2] * rhs).calcComponentSum(); + + return output; + } +}; + +struct ef64_t2x2 
+{ + ef64_t2 columns[2]; + + ef64_t2x2 getTransposed() CONST + { + ef64_t2x2 output; + + output.columns[1].x = columns[0].y; + output.columns[0].y = columns[1].x; + + return output; + } + + ef64_t2x2 operator*(NBL_CONST_REF_ARG(ef64_t2x2) rhs) CONST + { + ef64_t2x2 output; + ef64_t2x2 lhsTransposed = getTransposed(); + + output.columns[0].x = (lhsTransposed.columns[0] * rhs.columns[0]).calcComponentSum(); + output.columns[0].y = (lhsTransposed.columns[0] * rhs.columns[1]).calcComponentSum(); + + output.columns[1].x = (lhsTransposed.columns[1] * rhs.columns[0]).calcComponentSum(); + output.columns[1].y = (lhsTransposed.columns[1] * rhs.columns[1]).calcComponentSum(); + + return output; + } + + ef64_t2 operator*(NBL_CONST_REF_ARG(ef64_t2) rhs) + { + ef64_t2 output; + ef64_t2x2 lhsTransposed = getTransposed(); + + output.x = (columns[0] * rhs).calcComponentSum(); + output.y = (columns[1] * rhs).calcComponentSum(); + + return output; + } +}; + +#endif + } + } +#undef CONST + #undef FLOAT_ROUND_NEAREST_EVEN #undef FLOAT_ROUND_TO_ZERO #undef FLOAT_ROUND_DOWN diff --git a/include/nbl/builtin/hlsl/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754.hlsl index 712d7440b8..e92b45713f 100644 --- a/include/nbl/builtin/hlsl/ieee754.hlsl +++ b/include/nbl/builtin/hlsl/ieee754.hlsl @@ -5,18 +5,6 @@ #include #include -// TODO: delete -#ifdef __HLSL_VERSION -#define staticAssertTmp(...) ; -#else -void dbgBreakIf(bool condition) -{ - if (!condition) - __debugbreak(); -} -#define staticAssertTmp(x, ...) dbgBreakIf(x); -#endif - namespace nbl { namespace hlsl @@ -37,25 +25,25 @@ namespace impl } template - typename unsigned_integer_of_size::type castToUintType(T x) + NBL_CONSTEXPR_STATIC_INLINE typename unsigned_integer_of_size::type castToUintType(T x) { using AsUint = typename unsigned_integer_of_size::type; return bit_cast(x); } // to avoid bit cast from uintN_t to uintN_t - template <> unsigned_integer_of_size<2>::type castToUintType(uint16_t x) { return x; } - template <> unsigned_integer_of_size<4>::type castToUintType(uint32_t x) { return x; } - template <> unsigned_integer_of_size<8>::type castToUintType(uint64_t x) { return x; } + template <> NBL_CONSTEXPR_STATIC_INLINE unsigned_integer_of_size<2>::type castToUintType(uint16_t x) { return x; } + template <> NBL_CONSTEXPR_STATIC_INLINE unsigned_integer_of_size<4>::type castToUintType(uint32_t x) { return x; } + template <> NBL_CONSTEXPR_STATIC_INLINE unsigned_integer_of_size<8>::type castToUintType(uint64_t x) { return x; } template - T castBackToFloatType(T x) + NBL_CONSTEXPR_STATIC_INLINE T castBackToFloatType(T x) { using AsFloat = typename float_of_size::type; return bit_cast(x); } - template<> uint16_t castBackToFloatType(uint16_t x) { return x; } - template<> uint32_t castBackToFloatType(uint32_t x) { return x; } - template<> uint64_t castBackToFloatType(uint64_t x) { return x; } + template<> NBL_CONSTEXPR_STATIC_INLINE uint16_t castBackToFloatType(uint16_t x) { return x; } + template<> NBL_CONSTEXPR_STATIC_INLINE uint32_t castBackToFloatType(uint32_t x) { return x; } + template<> NBL_CONSTEXPR_STATIC_INLINE uint64_t castBackToFloatType(uint64_t x) { return x; } } template @@ -104,62 +92,63 @@ struct traits : traits_base }; template -uint32_t extractBiasedExponent(T x) +static inline uint32_t extractBiasedExponent(T x) { using AsUint = typename unsigned_integer_of_size::type; return glsl::bitfieldExtract(impl::castToUintType(x), traits::type>::mantissaBitCnt, traits::type>::exponentBitCnt); } template<> -uint32_t extractBiasedExponent(uint64_t x) 
+static inline uint32_t extractBiasedExponent(uint64_t x) { const uint32_t highBits = uint32_t(x >> 32); return glsl::bitfieldExtract(highBits, traits::mantissaBitCnt - 32, traits::exponentBitCnt); } template<> -uint32_t extractBiasedExponent(float64_t x) +static inline uint32_t extractBiasedExponent(float64_t x) { return extractBiasedExponent(impl::castToUintType(x)); } template -int extractExponent(T x) +static inline int extractExponent(T x) { using AsFloat = typename float_of_size::type; return int(extractBiasedExponent(x)) - int(traits::exponentBias); } template -T replaceBiasedExponent(T x, typename unsigned_integer_of_size::type biasedExp) +NBL_CONSTEXPR_STATIC_INLINE T replaceBiasedExponent(T x, typename unsigned_integer_of_size::type biasedExp) { - staticAssertTmp(impl::isTypeAllowed(), "Invalid type! Only floating point or unsigned integer types are allowed."); + // TODO: + //staticAssertTmp(impl::isTypeAllowed(), "Invalid type! Only floating point or unsigned integer types are allowed."); using AsFloat = typename float_of_size::type; return impl::castBackToFloatType(glsl::bitfieldInsert(impl::castToUintType(x), biasedExp, traits::mantissaBitCnt, traits::exponentBitCnt)); } // performs no overflow tests, returns x*exp2(n) template -T fastMulExp2(T x, int n) +NBL_CONSTEXPR_STATIC_INLINE T fastMulExp2(T x, int n) { return replaceBiasedExponent(x, extractBiasedExponent(x) + uint32_t(n)); } template -typename unsigned_integer_of_size::type extractMantissa(T x) +NBL_CONSTEXPR_STATIC_INLINE typename unsigned_integer_of_size::type extractMantissa(T x) { using AsUint = typename unsigned_integer_of_size::type; return impl::castToUintType(x) & traits::type>::mantissaMask; } template -typename unsigned_integer_of_size::type extractSign(T x) +NBL_CONSTEXPR_STATIC_INLINE typename unsigned_integer_of_size::type extractSign(T x) { return (impl::castToUintType(x) & traits::signMask) >> ((sizeof(T) * 8) - 1); } template -typename unsigned_integer_of_size::type extractSignPreserveBitPattern(T x) +NBL_CONSTEXPR_STATIC_INLINE typename unsigned_integer_of_size::type extractSignPreserveBitPattern(T x) { return impl::castToUintType(x) & traits::signMask; } diff --git a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl index f2c2760994..da9586207f 100644 --- a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl @@ -47,436 +47,436 @@ namespace hlsl { namespace impl { - uint64_t2 shiftMantissaLeftBy53(uint64_t mantissa64) - { - uint64_t2 output; - output.x = mantissa64 >> (63 - ieee754::traits::mantissaBitCnt); - output.y = mantissa64 << (ieee754::traits::mantissaBitCnt + 1); +NBL_CONSTEXPR_STATIC_INLINE uint64_t2 shiftMantissaLeftBy53(uint64_t mantissa64) +{ + uint64_t2 output; + output.x = mantissa64 >> (64 - ieee754::traits::mantissaBitCnt); + output.y = mantissa64 << (ieee754::traits::mantissaBitCnt); - return output; - } + return output; +} - template - uint64_t promoteToUint64(T val) - { - using AsFloat = typename float_of_size::type; - uint64_t asUint = ieee754::impl::castToUintType(val); +template +NBL_CONSTEXPR_STATIC_INLINE uint64_t promoteToUint64(T val) +{ + using AsFloat = typename float_of_size::type; + uint64_t asUint = ieee754::impl::castToUintType(val); - const uint64_t sign = (uint64_t(ieee754::traits::signMask) & asUint) << (sizeof(float64_t) - sizeof(T)); - const int64_t newExponent = ieee754::extractExponent(val) + ieee754::traits::exponentBias; 
+ const uint64_t sign = (uint64_t(ieee754::traits::signMask) & asUint) << (sizeof(float64_t) - sizeof(T)); + const int64_t newExponent = ieee754::extractExponent(val) + ieee754::traits::exponentBias; - const uint64_t exp = (uint64_t(ieee754::extractExponent(val)) + ieee754::traits::exponentBias) << (ieee754::traits::mantissaBitCnt); - const uint64_t mantissa = (uint64_t(ieee754::traits::mantissaMask) & asUint) << (ieee754::traits::exponentBias - ieee754::traits::mantissaBitCnt); + const uint64_t exp = (uint64_t(ieee754::extractExponent(val)) + ieee754::traits::exponentBias) << (ieee754::traits::mantissaBitCnt); + const uint64_t mantissa = (uint64_t(ieee754::traits::mantissaMask) & asUint) << (ieee754::traits::exponentBias - ieee754::traits::mantissaBitCnt); - return sign | exp | mantissa; - }; + return sign | exp | mantissa; +}; - template<> uint64_t promoteToUint64(float64_t val) { return bit_cast(val); } +template<> NBL_CONSTEXPR_STATIC_INLINE uint64_t promoteToUint64(float64_t val) { return bit_cast(val); } - uint32_t2 umulExtended(uint32_t lhs, uint32_t rhs) - { - uint64_t product = uint64_t(lhs) * uint64_t(rhs); - uint32_t2 output; - output.x = uint32_t((product & 0xFFFFFFFF00000000) >> 32); - output.y = uint32_t(product & 0x00000000FFFFFFFFull); - return output; - } +NBL_CONSTEXPR_STATIC_INLINE uint32_t2 umulExtended(uint32_t lhs, uint32_t rhs) +{ + uint64_t product = uint64_t(lhs) * uint64_t(rhs); + uint32_t2 output; + output.x = uint32_t((product & 0xFFFFFFFF00000000) >> 32); + output.y = uint32_t(product & 0x00000000FFFFFFFFull); + return output; +} - uint64_t propagateFloat64NaN(uint64_t lhs, uint64_t rhs) - { +NBL_CONSTEXPR_STATIC_INLINE uint64_t propagateFloat64NaN(uint64_t lhs, uint64_t rhs) +{ #if defined RELAXED_NAN_PROPAGATION - return lhs | rhs; + return lhs | rhs; #else - lhs |= 0x0008000000000000ull; - rhs |= 0x0008000000000000ull; - return tgmath::lerp(rhs, tgmath::lerp(lhs, rhs, tgmath::isnan(rhs)), tgmath::isnan(lhs)); - return 0; + lhs |= 0x0008000000000000ull; + rhs |= 0x0008000000000000ull; + return tgmath::lerp(rhs, tgmath::lerp(lhs, rhs, tgmath::isnan(rhs)), tgmath::isnan(lhs)); + return 0; #endif - } - - - uint64_t packFloat64(uint32_t zSign, int zExp, uint32_t zFrac0, uint32_t zFrac1) - { - uint32_t2 z; +} - z.x = zSign + (uint32_t(zExp) << 20) + zFrac0; - z.y = zFrac1; +NBL_CONSTEXPR_STATIC_INLINE uint64_t packFloat64(uint32_t zSign, int zExp, uint32_t zFrac0, uint32_t zFrac1) +{ + uint32_t2 z; - uint64_t output = 0u; - output |= (uint64_t(z.x) << 32) & 0xFFFFFFFF00000000ull; - output |= uint64_t(z.y); - return output; - } + z.x = zSign + (uint32_t(zExp) << 20) + zFrac0; + z.y = zFrac1; - uint32_t2 packUint64(uint64_t val) - { - return uint32_t2((val & 0xFFFFFFFF00000000ull) >> 32, val & 0x00000000FFFFFFFFull); - } + uint64_t output = 0u; + output |= (uint64_t(z.x) << 32) & 0xFFFFFFFF00000000ull; + output |= uint64_t(z.y); + return output; +} - uint64_t unpackUint64(uint32_t2 val) - { - return ((uint64_t(val.x) & 0x00000000FFFFFFFFull) << 32) | uint64_t(val.y); - } +NBL_CONSTEXPR_STATIC_INLINE uint32_t2 packUint64(uint64_t val) +{ + return uint32_t2((val & 0xFFFFFFFF00000000ull) >> 32, val & 0x00000000FFFFFFFFull); +} - uint32_t2 add64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) - { - uint32_t2 output; - output.y = a1 + b1; - output.x = a0 + b0 + uint32_t(output.y < a1); +NBL_CONSTEXPR_STATIC_INLINE uint64_t unpackUint64(uint32_t2 val) +{ + return ((uint64_t(val.x) & 0x00000000FFFFFFFFull) << 32) | uint64_t(val.y); +} - return output; - } 
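add64 above assembles a 64-bit sum from two pairs of 32-bit halves and detects the carry by checking whether the low-word sum wrapped around: with unsigned arithmetic, wraparound happened exactly when the result is smaller than one of the addends. A small standalone check of that idiom against native 64-bit addition (plain C++, for illustration only):

// Illustration only: the add64 carry idiom verified against native uint64_t math.
#include <cassert>
#include <cstdint>

// a0/b0 are the high 32-bit halves, a1/b1 the low halves, matching the hunk above.
static void add64(std::uint32_t a0, std::uint32_t a1, std::uint32_t b0, std::uint32_t b1,
                  std::uint32_t& outHi, std::uint32_t& outLo)
{
    outLo = a1 + b1;                              // may wrap modulo 2^32
    outHi = a0 + b0 + std::uint32_t(outLo < a1);  // wrap happened iff the sum shrank
}

int main()
{
    const std::uint64_t a = 0x00000001FFFFFFFFull;
    const std::uint64_t b = 0x0000000000000001ull;
    std::uint32_t hi, lo;
    add64(std::uint32_t(a >> 32), std::uint32_t(a), std::uint32_t(b >> 32), std::uint32_t(b), hi, lo);
    assert(((std::uint64_t(hi) << 32) | lo) == a + b); // 0x0000000200000000
}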
+NBL_CONSTEXPR_STATIC_INLINE uint32_t2 add64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) +{ + uint32_t2 output; + output.y = a1 + b1; + output.x = a0 + b0 + uint32_t(output.y < a1); + return output; +} - uint32_t2 sub64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) - { - uint32_t2 output; - output.y = a1 - b1; - output.x = a0 - b0 - uint32_t(a1 < b1); - - return output; - } +NBL_CONSTEXPR_STATIC_INLINE uint32_t2 sub64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) +{ + uint32_t2 output; + output.y = a1 - b1; + output.x = a0 - b0 - uint32_t(a1 < b1); + + return output; +} // TODO: test - int countLeadingZeros32(uint32_t val) - { +static inline int countLeadingZeros32(uint32_t val) +{ #ifndef __HLSL_VERSION - return 31 - findMSB(val); + return 31 - findMSB(val); #else - return 31 - firstbithigh(val); + return 31 - firstbithigh(val); #endif - } - - uint32_t2 shift64RightJamming(uint32_t2 val, int count) - { - uint32_t2 output; - const int negCount = (-count) & 31; - - output.x = tgmath::lerp(0u, val.x, count == 0); - output.x = tgmath::lerp(output.x, (val.x >> count), count < 32); - - output.y = uint32_t((val.x | val.y) != 0u); /* count >= 64 */ - uint32_t z1_lt64 = (val.x>>(count & 31)) | uint32_t(((val.x<>count) | uint32_t((val.y<> (count & 31)), count < 64); - output.y = tgmath::lerp(output.y, (val.x << negCount) | (val.y >> count), count < 32); +} - val.z = tgmath::lerp(val.z | val.y, val.z, count < 32); - output.x = tgmath::lerp(output.x, val.x >> count, count < 32); - output.z |= uint32_t(val.z != 0u); +NBL_CONSTEXPR_STATIC_INLINE uint32_t2 shift64RightJamming(uint32_t2 val, int count) +{ + uint32_t2 output; + const int negCount = (-count) & 31; + + output.x = tgmath::lerp(0u, val.x, count == 0); + output.x = tgmath::lerp(output.x, (val.x >> count), count < 32); + + output.y = uint32_t((val.x | val.y) != 0u); /* count >= 64 */ + uint32_t z1_lt64 = (val.x>>(count & 31)) | uint32_t(((val.x<>count) | uint32_t((val.y<> ((-count) & 31))), packed.x, count == 0); - return unpackUint64(output); - }; +NBL_CONSTEXPR_STATIC_INLINE uint32_t4 mul64to128(uint32_t4 mantissasPacked) +{ + uint32_t4 output; + uint32_t more1 = 0u; + uint32_t more2 = 0u; + + // a0 = x + // a1 = y + // b0 = z + // b1 = w + + uint32_t2 z2z3 = umulExtended(mantissasPacked.y, mantissasPacked.w); + output.z = z2z3.x; + output.w = z2z3.y; + uint32_t2 z1more2 = umulExtended(mantissasPacked.y, mantissasPacked.z); + output.y = z1more2.x; + more2 = z1more2.y; + uint32_t2 z1z2 = add64(output.y, more2, 0u, output.z); + output.y = z1z2.x; + output.z = z1z2.y; + uint32_t2 z0more1 = umulExtended(mantissasPacked.x, mantissasPacked.z); + output.x = z0more1.x; + more1 = z0more1.y; + uint32_t2 z0z1 = add64(output.x, more1, 0u, output.y); + output.x = z0z1.x; + output.y = z0z1.y; + uint32_t2 more1more2 = umulExtended(mantissasPacked.x, mantissasPacked.w); + more1 = more1more2.x; + more2 = more1more2.y; + uint32_t2 more1z2 = add64(more1, more2, 0u, output.z); + more1 = more1z2.x; + output.z = more1z2.y; + uint32_t2 z0z12 = add64(output.x, output.y, 0u, more1); + output.x = z0z12.x; + output.y = z0z12.y; + + return output; +} + +NBL_CONSTEXPR_STATIC_INLINE uint32_t3 shift64ExtraRightJamming(uint32_t3 val, int count) +{ + uint32_t3 output; + output.x = 0u; + + int negCount = (-count) & 31; + + output.z = tgmath::lerp(uint32_t(val.x != 0u), val.x, count == 64); + output.z = tgmath::lerp(output.z, val.x << negCount, count < 64); + output.z = tgmath::lerp(output.z, val.y << negCount, count < 32); + + output.y = tgmath::lerp(0u, 
(val.x >> (count & 31)), count < 64); + output.y = tgmath::lerp(output.y, (val.x << negCount) | (val.y >> count), count < 32); + + val.z = tgmath::lerp(val.z | val.y, val.z, count < 32); + output.x = tgmath::lerp(output.x, val.x >> count, count < 32); + output.z |= uint32_t(val.z != 0u); + + output.x = tgmath::lerp(output.x, 0u, (count == 32)); + output.y = tgmath::lerp(output.y, val.x, (count == 32)); + output.z = tgmath::lerp(output.z, val.y, (count == 32)); + output.x = tgmath::lerp(output.x, val.x, (count == 0)); + output.y = tgmath::lerp(output.y, val.y, (count == 0)); + output.z = tgmath::lerp(output.z, val.z, (count == 0)); + + return output; +} - uint64_t assembleFloat64(uint64_t signShifted, uint64_t expShifted, uint64_t mantissa) - { - return signShifted + expShifted + mantissa; - } +NBL_CONSTEXPR_STATIC_INLINE uint64_t shortShift64Left(uint64_t val, int count) +{ + const uint32_t2 packed = packUint64(val); - uint64_t roundAndPackFloat64(uint64_t zSign, int zExp, uint32_t3 mantissaExtended) + uint32_t2 output; + output.y = packed.y << count; + // TODO: fix + output.x = tgmath::lerp((packed.x << count | (packed.y >> ((-count) & 31))), packed.x, count == 0); + + return unpackUint64(output); +}; + +NBL_CONSTEXPR_STATIC_INLINE uint64_t assembleFloat64(uint64_t signShifted, uint64_t expShifted, uint64_t mantissa) +{ + return signShifted + expShifted + mantissa; +} + +NBL_CONSTEXPR_STATIC_INLINE uint64_t roundAndPackFloat64(uint64_t zSign, int zExp, uint32_t3 mantissaExtended) +{ + bool roundNearestEven; + bool increment; + + roundNearestEven = true; + increment = int(mantissaExtended.z) < 0; + if (!roundNearestEven) { - bool roundNearestEven; - bool increment; - - roundNearestEven = true; - increment = int(mantissaExtended.z) < 0; - if (!roundNearestEven) + if (false) //(FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) { - if (false) //(FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) + increment = false; + } + else + { + if (false) //(zSign != 0u) { - increment = false; - } + //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && + // (zFrac2 != 0u); + } else { - if (false) //(zSign != 0u) - { - //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && - // (zFrac2 != 0u); - } - else - { - //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && - // (zFrac2 != 0u); - } + //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && + // (zFrac2 != 0u); } } - if (0x7FD <= zExp) + } + if (0x7FD <= zExp) + { + if ((0x7FD < zExp) || ((zExp == 0x7FD) && (0x001FFFFFu == mantissaExtended.x && 0xFFFFFFFFu == mantissaExtended.y) && increment)) { - if ((0x7FD < zExp) || ((zExp == 0x7FD) && (0x001FFFFFu == mantissaExtended.x && 0xFFFFFFFFu == mantissaExtended.y) && increment)) + if (false) // ((FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) || + // ((zSign != 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP)) || + // ((zSign == 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN))) { - if (false) // ((FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) || - // ((zSign != 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP)) || - // ((zSign == 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN))) - { - return assembleFloat64(zSign, 0x7FE << ieee754::traits::mantissaBitCnt, 0x000FFFFFFFFFFFFFull); - } - - return assembleFloat64(zSign, ieee754::traits::exponentMask, 0ull); - } + return assembleFloat64(zSign, 0x7FE << ieee754::traits::mantissaBitCnt, 0x000FFFFFFFFFFFFFull); + } + + return assembleFloat64(zSign, ieee754::traits::exponentMask, 0ull); + } + } + + if (zExp < 0) + { + mantissaExtended = 
shift64ExtraRightJamming(mantissaExtended, -zExp); + zExp = 0; + + if (roundNearestEven) + { + increment = mantissaExtended.z < 0u; } - - if (zExp < 0) + else { - mantissaExtended = shift64ExtraRightJamming(mantissaExtended, -zExp); - zExp = 0; - - if (roundNearestEven) + if (zSign != 0u) { - increment = mantissaExtended.z < 0u; + increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && (mantissaExtended.z != 0u); } else { - if (zSign != 0u) - { - increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && (mantissaExtended.z != 0u); - } - else - { - increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && (mantissaExtended.z != 0u); - } + increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && (mantissaExtended.z != 0u); } } - - if (increment) - { - const uint64_t added = impl::unpackUint64(uint32_t2(mantissaExtended.xy)) + 1ull; - mantissaExtended.xy = packUint64(added); - mantissaExtended.y &= ~((mantissaExtended.z + uint32_t(mantissaExtended.z == 0u)) & uint32_t(roundNearestEven)); - } - else - { - zExp = tgmath::lerp(zExp, 0, (mantissaExtended.x | mantissaExtended.y) == 0u); - } - - return assembleFloat64(zSign, uint64_t(zExp) << ieee754::traits::mantissaBitCnt, unpackUint64(mantissaExtended.xy)); } - - uint64_t normalizeRoundAndPackFloat64(uint64_t sign, int exp, uint32_t frac0, uint32_t frac1) + + if (increment) { - int shiftCount; - uint32_t3 frac = uint32_t3(frac0, frac1, 0u); - - if (frac.x == 0u) - { - exp -= 32; - frac.x = frac.y; - frac.y = 0u; - } - - shiftCount = countLeadingZeros32(frac.x) - 11; - if (0 <= shiftCount) - { - // TODO: this is packing and unpacking madness, fix it - frac.xy = packUint64(shortShift64Left(unpackUint64(frac.xy), shiftCount)); - } - else - { - frac.xyz = shift64ExtraRightJamming(uint32_t3(frac.xy, 0), -shiftCount); - } - exp -= shiftCount; - return roundAndPackFloat64(sign, exp, frac); + const uint64_t added = impl::unpackUint64(uint32_t2(mantissaExtended.xy)) + 1ull; + mantissaExtended.xy = packUint64(added); + mantissaExtended.y &= ~((mantissaExtended.z + uint32_t(mantissaExtended.z == 0u)) & uint32_t(roundNearestEven)); } - - void normalizeFloat64Subnormal(uint64_t mantissa, - NBL_REF_ARG(int) outExp, - NBL_REF_ARG(uint64_t) outMantissa) + else { - uint32_t2 mantissaPacked = packUint64(mantissa); - int shiftCount; - uint32_t2 temp; - shiftCount = countLeadingZeros32(tgmath::lerp(mantissaPacked.x, mantissaPacked.y, mantissaPacked.x == 0u)) - 11; - outExp = tgmath::lerp(1 - shiftCount, -shiftCount - 31, mantissaPacked.x == 0u); - - temp.x = tgmath::lerp(mantissaPacked.y << shiftCount, mantissaPacked.y >> (-shiftCount), shiftCount < 0); - temp.y = tgmath::lerp(0u, mantissaPacked.y << (shiftCount & 31), shiftCount < 0); + zExp = tgmath::lerp(zExp, 0, (mantissaExtended.x | mantissaExtended.y) == 0u); + } + + return assembleFloat64(zSign, uint64_t(zExp) << ieee754::traits::mantissaBitCnt, unpackUint64(mantissaExtended.xy)); +} - shortShift64Left(impl::unpackUint64(mantissaPacked), shiftCount); +static inline uint64_t normalizeRoundAndPackFloat64(uint64_t sign, int exp, uint32_t frac0, uint32_t frac1) +{ + int shiftCount; + uint32_t3 frac = uint32_t3(frac0, frac1, 0u); - outMantissa = tgmath::lerp(outMantissa, unpackUint64(temp), mantissaPacked.x == 0); - } - - bool areBothSameSignInfinity(uint64_t lhs, uint64_t rhs) + if (frac.x == 0u) { - lhs ^= ieee754::traits::signMask; - rhs ^= ieee754::traits::signMask; - - bool output = lhs == rhs && ieee754::traits::inf; - output = output && ((lhs & (~ieee754::traits::signMask)) == ieee754::traits::inf); - - return 
output; + exp -= 32; + frac.x = frac.y; + frac.y = 0u; } - bool areBothZero(uint64_t lhs, uint64_t rhs) + shiftCount = countLeadingZeros32(frac.x) - 11; + if (0 <= shiftCount) { - return ((lhs << 1) == 0ull) && ((rhs << 1) == 0ull); + // TODO: this is packing and unpacking madness, fix it + frac.xy = packUint64(shortShift64Left(unpackUint64(frac.xy), shiftCount)); } - - bool areBothSameSignZero(uint64_t lhs, uint64_t rhs) + else { - return ((lhs << 1) == 0ull) && (lhs == rhs); + frac.xyz = shift64ExtraRightJamming(uint32_t3(frac.xy, 0), -shiftCount); } + exp -= shiftCount; + return roundAndPackFloat64(sign, exp, frac); +} - // TODO: find more efficient algorithm - uint64_t nlz64(uint64_t x) - { - static const uint64_t MASK = 1ull << 63; +static inline void normalizeFloat64Subnormal(uint64_t mantissa, + NBL_REF_ARG(int) outExp, + NBL_REF_ARG(uint64_t) outMantissa) +{ + uint32_t2 mantissaPacked = packUint64(mantissa); + int shiftCount; + uint32_t2 temp; + shiftCount = countLeadingZeros32(tgmath::lerp(mantissaPacked.x, mantissaPacked.y, mantissaPacked.x == 0u)) - 11; + outExp = tgmath::lerp(1 - shiftCount, -shiftCount - 31, mantissaPacked.x == 0u); - uint64_t counter = 0; + temp.x = tgmath::lerp(mantissaPacked.y << shiftCount, mantissaPacked.y >> (-shiftCount), shiftCount < 0); + temp.y = tgmath::lerp(0u, mantissaPacked.y << (shiftCount & 31), shiftCount < 0); - while ((x & MASK) == 0) - { - x <<= 1; - ++counter; - } - return counter; + shortShift64Left(impl::unpackUint64(mantissaPacked), shiftCount); + + outMantissa = tgmath::lerp(outMantissa, unpackUint64(temp), mantissaPacked.x == 0); +} + +NBL_CONSTEXPR_STATIC_INLINE bool areBothInfinity(uint64_t lhs, uint64_t rhs) +{ + lhs &= ~ieee754::traits::signMask; + rhs &= ~ieee754::traits::signMask; + + return lhs == rhs && lhs == ieee754::traits::inf; +} + +NBL_CONSTEXPR_STATIC_INLINE bool areBothSameSignInfinity(uint64_t lhs, uint64_t rhs) +{ + return lhs == rhs && (lhs & ~ieee754::traits::signMask) == ieee754::traits::inf; +} + +NBL_CONSTEXPR_STATIC_INLINE bool areBothZero(uint64_t lhs, uint64_t rhs) +{ + return ((lhs << 1) == 0ull) && ((rhs << 1) == 0ull); +} + +NBL_CONSTEXPR_STATIC_INLINE bool areBothSameSignZero(uint64_t lhs, uint64_t rhs) +{ + return ((lhs << 1) == 0ull) && (lhs == rhs); +} + +// TODO: find more efficient algorithm +static inline uint64_t nlz64(uint64_t x) +{ + static const uint64_t MASK = 1ull << 63; + + uint64_t counter = 0; + + while ((x & MASK) == 0) + { + x <<= 1; + ++counter; } + return counter; +} - // returns pair of quotient and remainder - uint64_t divmod128by64(const uint64_t dividentHigh, const uint64_t dividentLow, uint64_t divisor) +// returns pair of quotient and remainder +static inline uint64_t divmod128by64(const uint64_t dividentHigh, const uint64_t dividentLow, uint64_t divisor) +{ + const uint64_t b = 1ull << 32; + uint64_t un1, un0, vn1, vn0, q1, q0, un32, un21, un10, rhat, left, right; + uint64_t s; + + //TODO: countl_zero + s = countl_zero(divisor); + //s = nlz64(divisor); + divisor <<= s; + vn1 = divisor >> 32; + vn0 = divisor & 0xFFFFFFFF; + + if (s > 0) { - const uint64_t b = 1ull << 32; - uint64_t un1, un0, vn1, vn0, q1, q0, un32, un21, un10, rhat, left, right; - uint64_t s; - - //TODO: countl_zero - s = countl_zero(divisor); - //s = nlz64(divisor); - divisor <<= s; - vn1 = divisor >> 32; - vn0 = divisor & 0xFFFFFFFF; - - if (s > 0) - { - un32 = (dividentHigh << s) | (dividentLow >> (64 - s)); - un10 = dividentLow << s; - } - else - { - un32 = dividentHigh; - un10 = dividentLow; - } + un32 = 
(dividentHigh << s) | (dividentLow >> (64 - s)); + un10 = dividentLow << s; + } + else + { + un32 = dividentHigh; + un10 = dividentLow; + } - un1 = un10 >> 32; - un0 = un10 & 0xFFFFFFFF; + un1 = un10 >> 32; + un0 = un10 & 0xFFFFFFFF; - q1 = un32 / vn1; - rhat = un32 % vn1; + q1 = un32 / vn1; + rhat = un32 % vn1; - left = q1 * vn0; - right = (rhat << 32) + un1; - while ((q1 >= b) || (left > right)) + left = q1 * vn0; + right = (rhat << 32) + un1; + while ((q1 >= b) || (left > right)) + { + --q1; + rhat += vn1; + if (rhat < b) { - --q1; - rhat += vn1; - if (rhat < b) - { - left -= vn0; - right = (rhat << 32) | un1; - } - break; + left -= vn0; + right = (rhat << 32) | un1; } + break; + } - un21 = (un32 << 32) + (un1 - (q1 * divisor)); + un21 = (un32 << 32) + (un1 - (q1 * divisor)); - q0 = un21 / vn1; - rhat = un21 % vn1; + q0 = un21 / vn1; + rhat = un21 % vn1; - left = q0 * vn0; - right = (rhat << 32) | un0; - while ((q0 >= b) || (left > right)) + left = q0 * vn0; + right = (rhat << 32) | un0; + while ((q0 >= b) || (left > right)) + { + --q0; + rhat += vn1; + if (rhat < b) { - --q0; - rhat += vn1; - if (rhat < b) - { - left -= vn0; - right = (rhat << 32) | un0; - continue; - } - break; + left -= vn0; + right = (rhat << 32) | un0; + continue; } - - return (q1 << 32) | q0; + break; } + + return (q1 << 32) | q0; +} } } } diff --git a/include/nbl/builtin/hlsl/shapes/beziers.hlsl b/include/nbl/builtin/hlsl/shapes/beziers.hlsl index adea0556c2..e454c4c8ef 100644 --- a/include/nbl/builtin/hlsl/shapes/beziers.hlsl +++ b/include/nbl/builtin/hlsl/shapes/beziers.hlsl @@ -10,6 +10,7 @@ #include #include #include +#include // TODO: Later include from correct hlsl header (numeric_limits.hlsl) #ifndef nbl_hlsl_FLT_EPSILON @@ -26,16 +27,52 @@ namespace nbl { namespace hlsl { + +// TODO(emulated_float64_t): this shouldn't be in the nbl::hlsl space +// struct VecT is solution to +// error: 'nbl::hlsl::emulated_float64_t' cannot be used as a type parameter where a scalar is required +// using float_t2 = typename conditional >::value, ef64_t2, vector >::type; +#ifdef __HLSL_VERSION +template +struct VecT { using type = void; }; +template<> +struct VecT { using type = vector; }; +template<> +struct VecT { using type = vector; }; +template<> +struct VecT { using type = vector; }; +template<> +struct VecT, 2> { using type = ef64_t2; }; +template<> +struct VecT, 3> { using type = ef64_t3; }; +template<> +struct VecT, 4> { using type = float64_t4; }; + +template +struct Mat2x2T { using type = float64_t2x2; }; +template<> +struct Mat2x2T { using type = float64_t2x2; }; +template<> +struct Mat2x2T > { using type = ef64_t2x2; }; + +#endif + namespace shapes { template struct QuadraticBezier { +#ifndef __HLSL_VERSION using float_t2 = vector; using float_t3 = vector; using float_t4 = vector; using float_t2x2 = matrix; - +#else + using float_t2 = typename VecT::type; + using float_t3 = typename VecT::type; + using float_t4 = typename VecT::type; + using float_t2x2 = typename Mat2x2T::type; +#endif float_t2 P0; float_t2 P1; float_t2 P2; diff --git a/include/nbl/builtin/hlsl/tgmath.hlsl b/include/nbl/builtin/hlsl/tgmath.hlsl index 490c3f9f70..d3afb09bb7 100644 --- a/include/nbl/builtin/hlsl/tgmath.hlsl +++ b/include/nbl/builtin/hlsl/tgmath.hlsl @@ -4,6 +4,7 @@ #ifndef _NBL_BUILTIN_HLSL_TGMATH_INCLUDED_ #define _NBL_BUILTIN_HLSL_TGMATH_INCLUDED_ +#include #include #include @@ -16,7 +17,7 @@ namespace tgmath { template -bool isnan(Float val) +static inline bool isnan(Float val) { using AsUint = typename 
unsigned_integer_of_size::type; using AsFloat = typename float_of_size::type; @@ -25,19 +26,26 @@ bool isnan(Float val) } template <> -bool isnan(uint64_t val) +static inline bool isnan(uint64_t val) { float64_t asFloat = bit_cast(val); return bool((ieee754::extractBiasedExponent(asFloat) == ieee754::traits::specialValueExp) && (val & ieee754::traits::mantissaMask)); } // TODO: better implementation, also i'm not sure this is the right place for this function -template -UINT lerp(UINT a, UINT b, bool c) +template +NBL_CONSTEXPR_STATIC_INLINE Uint lerp(Uint a, Uint b, bool c) { return c ? b : a; } +template +NBL_CONSTEXPR_STATIC_INLINE bool isInf(Uint val) +{ + using AsFloat = typename float_of_size::type; + return (val & ~ieee754::traits::signMask) == ieee754::traits::inf; +} + } } From 5ce988f505bb041d7758d7b328bf766d7248af36 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Tue, 13 Aug 2024 17:28:52 +0200 Subject: [PATCH 030/358] update DXC submodule --- 3rdparty/dxc/dxc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index a08b6cbeb1..bcedaf749f 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit a08b6cbeb1038d14d0586d10a8cfa507b2fda8eb +Subproject commit bcedaf749fb6325dc41f9b436f1f2ea0a660de5e From 6b0bb287a5de4362c92987b72c961bbfa3882744 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Tue, 20 Aug 2024 12:27:00 +0100 Subject: [PATCH 031/358] No Float64 cap --- examples_tests | 2 +- include/nbl/builtin/hlsl/cpp_compat.hlsl | 2 +- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 390 +++--------------- .../hlsl/emulated_float64_t_utils.hlsl | 365 ++++++++++++++++ include/nbl/builtin/hlsl/ieee754.hlsl | 40 +- .../hlsl/impl/emulated_float64_t_impl.hlsl | 151 ++++--- .../hlsl/math/equations/quadratic.hlsl | 76 ++-- .../builtin/hlsl/math/equations/quartic.hlsl | 6 +- include/nbl/builtin/hlsl/shapes/beziers.hlsl | 87 ++-- include/nbl/builtin/hlsl/tgmath.hlsl | 22 +- src/nbl/builtin/CMakeLists.txt | 3 +- 11 files changed, 648 insertions(+), 496 deletions(-) create mode 100644 include/nbl/builtin/hlsl/emulated_float64_t_utils.hlsl diff --git a/examples_tests b/examples_tests index 9b1edf6a48..8e853f5518 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 9b1edf6a48513f28a7a9312260c3c790a8392232 +Subproject commit 8e853f551805370c64ed72313c1195bf6ededb70 diff --git a/include/nbl/builtin/hlsl/cpp_compat.hlsl b/include/nbl/builtin/hlsl/cpp_compat.hlsl index f3cf538e28..11839d9e0d 100644 --- a/include/nbl/builtin/hlsl/cpp_compat.hlsl +++ b/include/nbl/builtin/hlsl/cpp_compat.hlsl @@ -44,7 +44,7 @@ using add_pointer = std::add_pointer; #define NBL_CONSTEXPR const static #define NBL_CONSTEXPR_STATIC_INLINE const static #define NBL_CONSTEXPR_FUNC -#define NBL_CONSTEXPR_INLINE_FUNC +#define NBL_CONSTEXPR_INLINE_FUNC inline #define NBL_CONST_MEMBER_FUNC namespace nbl diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index dd517c2f8c..d6666c018b 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -3,13 +3,6 @@ #include -// weird dxc compiler errors -#ifndef __HLSL_VERSION -#define CONST const -#else -#define CONST -#endif - namespace nbl { namespace hlsl @@ -29,32 +22,54 @@ namespace hlsl NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(emulated_float64_t val) { + return createPreserveBitPattern(bit_cast(float64_t(val))); return createPreserveBitPattern(val.data); 
} NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(int32_t val) { - return emulated_float64_t(bit_cast(float64_t(val))); + return createPreserveBitPattern(bit_cast(float64_t(val))); + return emulated_float64_t::createPreserveBitPattern(impl::castToUint64WithFloat64BitPattern(int64_t(val))); } NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(int64_t val) { - return emulated_float64_t(bit_cast(float64_t(val))); + return createPreserveBitPattern(bit_cast(float64_t(val))); + return emulated_float64_t::createPreserveBitPattern(impl::castToUint64WithFloat64BitPattern(val)); } NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(uint32_t val) { - return emulated_float64_t(bit_cast(float64_t(val))); + return createPreserveBitPattern(bit_cast(float64_t(val))); + return emulated_float64_t::createPreserveBitPattern(impl::castToUint64WithFloat64BitPattern(uint64_t(val))); } NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(uint64_t val) { - return emulated_float64_t(bit_cast(float64_t(val))); + return createPreserveBitPattern(bit_cast(float64_t(val))); + return emulated_float64_t::createPreserveBitPattern(impl::castToUint64WithFloat64BitPattern(val)); + } + + NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(float32_t val) + { + return createPreserveBitPattern(bit_cast(float64_t(val))); + emulated_float64_t output; + output.data = impl::castToUint64WithFloat64BitPattern(val); + return output; } NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(float64_t val) { - return emulated_float64_t(bit_cast(val)); + return createPreserveBitPattern(bit_cast(float64_t(val))); +#ifdef __HLSL_VERSION + emulated_float64_t retval; + uint32_t lo, hi; + asuint(val, lo, hi); + retval.data = (uint64_t(hi) << 32) | lo; + return retval; +#else + return createPreserveBitPattern(reinterpret_cast(val)); +#endif } // TODO: unresolved external symbol imath_half_to_float_table @@ -63,11 +78,6 @@ namespace hlsl return emulated_float64_t(bit_cast(float64_t(val))); }*/ - NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(float32_t val) - { - return emulated_float64_t(bit_cast(float64_t(val))); - } - NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t createPreserveBitPattern(uint64_t val) { return emulated_float64_t(val); @@ -75,9 +85,11 @@ namespace hlsl inline float getAsFloat32() { - // TODO: don't use double - return float(bit_cast(data)); - + // TODO: fix + uint32_t sign = uint32_t((data & ieee754::traits::signMask) >> 32); + uint32_t exponent = (uint32_t(ieee754::extractExponent(data)) + ieee754::traits::exponentBias) + ieee754::traits::mantissaBitCnt; + uint32_t mantissa = uint32_t(data >> (ieee754::traits::mantissaBitCnt - ieee754::traits::mantissaBitCnt)) & ieee754::traits::mantissaMask; + return sign | exponent | mantissa; } #if 0 @@ -91,7 +103,7 @@ namespace hlsl #endif // arithmetic operators - emulated_float64_t operator+(const emulated_float64_t rhs) CONST + emulated_float64_t operator+(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { #if 0 { @@ -321,7 +333,12 @@ namespace hlsl } } - emulated_float64_t operator-(emulated_float64_t rhs) CONST + emulated_float64_t operator+(float rhs) + { + return createPreserveBitPattern(data) + create(rhs); + } + + emulated_float64_t operator-(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { emulated_float64_t lhs = createPreserveBitPattern(data); emulated_float64_t rhsFlipped = rhs.flipSign(); @@ -329,7 +346,12 @@ namespace hlsl return lhs + rhsFlipped; } - emulated_float64_t operator*(emulated_float64_t rhs) CONST + emulated_float64_t operator-(float rhs) 
NBL_CONST_MEMBER_FUNC + { + return createPreserveBitPattern(data) - create(rhs); + } + + emulated_float64_t operator*(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { if(FlushDenormToZero) { @@ -407,6 +429,11 @@ namespace hlsl } } + emulated_float64_t operator*(float rhs) + { + return createPreserveBitPattern(data) * create(rhs); + } + /*emulated_float64_t reciprocal(uint64_t x) { using ThisType = emulated_float64_t; @@ -415,7 +442,7 @@ namespace hlsl return output; }*/ - emulated_float64_t operator/(const emulated_float64_t rhs) CONST + emulated_float64_t operator/(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { if (FlushDenormToZero) { @@ -466,7 +493,7 @@ namespace hlsl // relational operators // TODO: should `FlushDenormToZero` affect relational operators? - bool operator==(emulated_float64_t rhs) CONST + bool operator==(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return false; @@ -481,11 +508,11 @@ namespace hlsl return !(xored.data); } - bool operator!=(emulated_float64_t rhs) CONST + bool operator!=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { return !(createPreserveBitPattern(data) == rhs); } - bool operator<(emulated_float64_t rhs) CONST + bool operator<(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return false; @@ -505,7 +532,7 @@ namespace hlsl return (lhsFlipped & diffBits) < (rhsFlipped & diffBits); } - bool operator>(emulated_float64_t rhs) CONST + bool operator>(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return true; @@ -525,13 +552,13 @@ namespace hlsl return (lhsFlipped & diffBits) > (rhsFlipped & diffBits); } - bool operator<=(emulated_float64_t rhs) CONST { return !(emulated_float64_t::createPreserveBitPattern(data) > emulated_float64_t::createPreserveBitPattern(rhs.data)); } + bool operator<=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { return !(emulated_float64_t::createPreserveBitPattern(data) > emulated_float64_t::createPreserveBitPattern(rhs.data)); } bool operator>=(emulated_float64_t rhs) { return !(emulated_float64_t::createPreserveBitPattern(data) < emulated_float64_t::createPreserveBitPattern(rhs.data)); } //logical operators - bool operator&&(emulated_float64_t rhs) CONST { return bool(data) && bool(rhs.data); } - bool operator||(emulated_float64_t rhs) CONST { return bool(data) || bool(rhs.data); } - bool operator!() CONST { return !bool(data); } + bool operator&&(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { return bool(data) && bool(rhs.data); } + bool operator||(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { return bool(data) || bool(rhs.data); } + bool operator!() NBL_CONST_MEMBER_FUNC { return !bool(data); } // TODO: should modify self? 
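operator== above short-circuits on NaN before falling back to a bitwise comparison, and IEEE-754 is the reason plain bit equality is not enough: NaN must compare unequal to itself, while +0 and -0 must compare equal despite having different bit patterns. A standalone demonstration with native doubles (standard C++20, illustration only):

// Illustration only: why raw bit equality needs NaN and signed-zero carve-outs.
#include <bit>
#include <cassert>
#include <cstdint>
#include <limits>

int main()
{
    const double nan = std::numeric_limits<double>::quiet_NaN();
    const double pz = 0.0, nz = -0.0;

    // Semantics the emulated operator== has to reproduce:
    assert(!(nan == nan)); // NaN is unequal even to itself
    assert(pz == nz);      // +0 and -0 compare equal
    // ...while the raw bit patterns say the opposite:
    assert(std::bit_cast<std::uint64_t>(nan) == std::bit_cast<std::uint64_t>(nan));
    assert(std::bit_cast<std::uint64_t>(pz) != std::bit_cast<std::uint64_t>(nz));
}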
emulated_float64_t flipSign() @@ -541,7 +568,7 @@ namespace hlsl bool isNaN() { - return tgmath::isnan(bit_cast(data)); + return tgmath::isnan(data); } }; @@ -554,31 +581,31 @@ struct traits_base\ NBL_CONSTEXPR_STATIC_INLINE int16_t mantissaBitCnt = 52;\ };\ template<>\ -static inline uint32_t extractBiasedExponent(Type x)\ +inline uint32_t extractBiasedExponent(Type x)\ {\ return extractBiasedExponent(x.data);\ }\ \ template<>\ -static inline int extractExponent(Type x)\ +inline int extractExponent(Type x)\ {\ return extractExponent(x.data);\ }\ \ template<>\ -NBL_CONSTEXPR_STATIC_INLINE Type replaceBiasedExponent(Type x, typename unsigned_integer_of_size::type biasedExp)\ +NBL_CONSTEXPR_INLINE_FUNC Type replaceBiasedExponent(Type x, typename unsigned_integer_of_size::type biasedExp)\ {\ return Type(replaceBiasedExponent(x.data, biasedExp));\ }\ \ template <>\ -NBL_CONSTEXPR_STATIC_INLINE Type fastMulExp2(Type x, int n)\ +NBL_CONSTEXPR_INLINE_FUNC Type fastMulExp2(Type x, int n)\ {\ return Type(replaceBiasedExponent(x.data, extractBiasedExponent(x) + uint32_t(n)));\ }\ \ template <>\ -NBL_CONSTEXPR_STATIC_INLINE unsigned_integer_of_size::type extractMantissa(Type x)\ +NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size::type extractMantissa(Type x)\ {\ return extractMantissa(x.data);\ }\ @@ -591,296 +618,9 @@ namespace ieee754 IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); } -// TODO: finish it - -// TODO: this is mess, refactorize it -#ifndef __HLSL_VERSION -using ef64_t2 = vector; -using ef64_t3 = vector; -using ef64_t4 = vector; -using ef64_t3x3 = matrix; -using ef64_t2x2 = matrix; -#else -struct ef64_t2 -{ - emulated_float64_t x; - emulated_float64_t y; - - emulated_float64_t calcComponentSum() CONST - { - return x + y; - } - - NBL_CONSTEXPR_STATIC_INLINE ef64_t2 create(emulated_float64_t x, emulated_float64_t y) - { - ef64_t2 output; - output.x = x; - output.y = y; - - return output; - } - - NBL_CONSTEXPR_STATIC_INLINE ef64_t2 create(float val) - { - ef64_t2 output; - output.x = emulated_float64_t::create(val); - output.y = emulated_float64_t::create(val); - - return output; - } - - NBL_CONSTEXPR_STATIC_INLINE ef64_t2 create(float32_t2 val) - { - ef64_t2 output; - output.x = emulated_float64_t::create(val.x); - output.y = emulated_float64_t::create(val.y); - - return output; - } - - ef64_t2 operator+(float rhs) - { - ef64_t2 output; - emulated_float64_t rhsAsEF64 = emulated_float64_t::create(rhs); - output.x = x + rhsAsEF64; - output.y = y + rhsAsEF64; - - return output; - } - - ef64_t2 operator+(emulated_float64_t rhs) - { - ef64_t2 output; - output.x = x + rhs; - output.y = y + rhs; - - return output; - } - - ef64_t2 operator+(ef64_t2 rhs) - { - ef64_t2 output; - output.x = x + rhs.x; - output.y = y + rhs.y; - - return output; - } - - ef64_t2 operator-(float rhs) - { - return create(x, y) + (-rhs); - } - - ef64_t2 operator-(emulated_float64_t rhs) - { - return create(x, y) + (rhs.flipSign()); - } - - ef64_t2 operator-(ef64_t2 rhs) - { - rhs.x = rhs.x.flipSign(); - rhs.y = rhs.y.flipSign(); - return create(x, y) + rhs; - } - - ef64_t2 operator*(float rhs) - { - ef64_t2 output; - emulated_float64_t rhsAsEF64 = emulated_float64_t::create(rhs); - output.x = x * rhsAsEF64; - output.y = y * rhsAsEF64; - - return output; - } - - ef64_t2 operator*(emulated_float64_t rhs) - { - ef64_t2 output; - output.x = x * rhs; - output.y = y * rhs; - - return output; - } - - ef64_t2 operator*(ef64_t2 rhs) - { - ef64_t2 output; - output.x = x * rhs.x; - output.y = y * rhs.y; - - 
return output; - } - - float2 getAsFloat2() - { - return float2(x.getAsFloat32(), y.getAsFloat32()); - } -}; - -struct ef64_t3 -{ - emulated_float64_t x; - emulated_float64_t y; - emulated_float64_t z; - - static ef64_t3 create(NBL_REF_ARG(ef64_t3) other) - { - ef64_t3 output; - - output.x = other.x; - output.y = other.y; - output.z = other.z; - - return output; - } - - static ef64_t3 create(NBL_REF_ARG(ef64_t2) other, emulated_float64_t z) - { - ef64_t3 output; - - output.x = other.x; - output.y = other.y; - output.z = z; - - return output; - } - - static ef64_t3 create(NBL_REF_ARG(ef64_t2) other, int z) - { - ef64_t3 output; - - output.x = other.x; - output.y = other.y; - output.z = emulated_float64_t::create(z); - - return output; - } - - emulated_float64_t calcComponentSum() CONST - { - return x + y + z; - } - - ef64_t3 operator*(NBL_CONST_REF_ARG(ef64_t3) rhs) CONST - { - ef64_t3 output; - output.x = x * rhs.x; - output.y = x * rhs.y; - output.z = x * rhs.z; - - return output; - } -}; - -struct ef64_t4 -{ - emulated_float64_t x; - emulated_float64_t y; - emulated_float64_t z; - emulated_float64_t w; -}; - -struct ef64_t3x3 -{ - ef64_t3 columns[3]; - - ef64_t3x3 getTransposed() CONST - { - ef64_t3x3 output; - - output.columns[1].x = columns[0].y; - output.columns[2].x = columns[0].z; - - output.columns[0].y = columns[1].x; - output.columns[2].y = columns[1].z; - - output.columns[0].z = columns[3].x; - output.columns[1].z = columns[3].y; - - return output; - } - - ef64_t3x3 operator*(NBL_CONST_REF_ARG(ef64_t3x3) rhs) CONST - { - ef64_t3x3 output; - ef64_t3x3 lhsTransposed = getTransposed(); - - output.columns[0].x = (lhsTransposed.columns[0] * rhs.columns[0]).calcComponentSum(); - output.columns[0].y = (lhsTransposed.columns[0] * rhs.columns[1]).calcComponentSum(); - output.columns[0].z = (lhsTransposed.columns[0] * rhs.columns[2]).calcComponentSum(); - - output.columns[1].x = (lhsTransposed.columns[1] * rhs.columns[0]).calcComponentSum(); - output.columns[1].y = (lhsTransposed.columns[1] * rhs.columns[1]).calcComponentSum(); - output.columns[1].z = (lhsTransposed.columns[1] * rhs.columns[2]).calcComponentSum(); - - output.columns[2].x = (lhsTransposed.columns[2] * rhs.columns[0]).calcComponentSum(); - output.columns[2].y = (lhsTransposed.columns[2] * rhs.columns[1]).calcComponentSum(); - output.columns[2].z = (lhsTransposed.columns[2] * rhs.columns[2]).calcComponentSum(); - - return output; - } - - ef64_t3 operator*(NBL_CONST_REF_ARG(ef64_t3) rhs) - { - ef64_t3 output; - ef64_t3x3 lhsTransposed = getTransposed(); - - output.x = (columns[0] * rhs).calcComponentSum(); - output.y = (columns[1] * rhs).calcComponentSum(); - output.z = (columns[2] * rhs).calcComponentSum(); - - return output; - } -}; - -struct ef64_t2x2 -{ - ef64_t2 columns[2]; - - ef64_t2x2 getTransposed() CONST - { - ef64_t2x2 output; - - output.columns[1].x = columns[0].y; - output.columns[0].y = columns[1].x; - - return output; - } - - ef64_t2x2 operator*(NBL_CONST_REF_ARG(ef64_t2x2) rhs) CONST - { - ef64_t2x2 output; - ef64_t2x2 lhsTransposed = getTransposed(); - - output.columns[0].x = (lhsTransposed.columns[0] * rhs.columns[0]).calcComponentSum(); - output.columns[0].y = (lhsTransposed.columns[0] * rhs.columns[1]).calcComponentSum(); - - output.columns[1].x = (lhsTransposed.columns[1] * rhs.columns[0]).calcComponentSum(); - output.columns[1].y = (lhsTransposed.columns[1] * rhs.columns[1]).calcComponentSum(); - - return output; - } - - ef64_t2 operator*(NBL_CONST_REF_ARG(ef64_t2) rhs) - { - ef64_t2 output; - ef64_t2x2 
lhsTransposed = getTransposed(); - - output.x = (columns[0] * rhs).calcComponentSum(); - output.y = (columns[1] * rhs).calcComponentSum(); - - return output; - } -}; - -#endif - } - } -#undef CONST - #undef FLOAT_ROUND_NEAREST_EVEN #undef FLOAT_ROUND_TO_ZERO #undef FLOAT_ROUND_DOWN diff --git a/include/nbl/builtin/hlsl/emulated_float64_t_utils.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t_utils.hlsl new file mode 100644 index 0000000000..4fe18cc5e8 --- /dev/null +++ b/include/nbl/builtin/hlsl/emulated_float64_t_utils.hlsl @@ -0,0 +1,365 @@ +#ifndef _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_UTILS_INCLUDED_ +#define _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_UTILS_INCLUDED_ + +#include + +namespace nbl +{ +namespace hlsl +{ +// should i use this namespace? +//namespace ef64_util +//{ +// TODO: this is mess, refactorize it +#ifndef __HLSL_VERSION +using ef64_t2 = float64_t2; +using ef64_t3 = float64_t3; +using ef64_t4 = float64_t4; +using ef64_t3x3 = float64_t3x3; +using ef64_t2x2 = float64_t4x4; +#else +struct ef64_t2 +{ + emulated_float64_t x; + emulated_float64_t y; + + emulated_float64_t calcComponentSum() NBL_CONST_MEMBER_FUNC + { + return x + y; + } + + NBL_CONSTEXPR_STATIC_INLINE ef64_t2 create(emulated_float64_t x, emulated_float64_t y) + { + ef64_t2 output; + output.x = x; + output.y = y; + + return output; + } + + NBL_CONSTEXPR_STATIC_INLINE ef64_t2 create(float val) + { + ef64_t2 output; + output.x = emulated_float64_t::create(val); + output.y = emulated_float64_t::create(val); + + return output; + } + + NBL_CONSTEXPR_STATIC_INLINE ef64_t2 create(float32_t2 val) + { + ef64_t2 output; + output.x = emulated_float64_t::create(val.x); + output.y = emulated_float64_t::create(val.y); + + return output; + } + + NBL_CONSTEXPR_STATIC_INLINE ef64_t2 create(uint32_t2 val) + { + ef64_t2 output; + output.x = emulated_float64_t::create(val.x); + output.y = emulated_float64_t::create(val.y); + + return output; + } + + ef64_t2 operator+(float rhs) + { + ef64_t2 output; + emulated_float64_t rhsAsEF64 = emulated_float64_t::create(rhs); + output.x = x + rhsAsEF64; + output.y = y + rhsAsEF64; + + return output; + } + + ef64_t2 operator+(emulated_float64_t rhs) + { + ef64_t2 output; + output.x = x + rhs; + output.y = y + rhs; + + return output; + } + + ef64_t2 operator+(ef64_t2 rhs) + { + ef64_t2 output; + output.x = x + rhs.x; + output.y = y + rhs.y; + + return output; + } + + ef64_t2 operator-(float rhs) + { + return create(x, y) + (-rhs); + } + + ef64_t2 operator-(emulated_float64_t rhs) + { + return create(x, y) + (rhs.flipSign()); + } + + ef64_t2 operator-(ef64_t2 rhs) + { + rhs.x = rhs.x.flipSign(); + rhs.y = rhs.y.flipSign(); + return create(x, y) + rhs; + } + + ef64_t2 operator*(float rhs) + { + ef64_t2 output; + emulated_float64_t rhsAsEF64 = emulated_float64_t::create(rhs); + output.x = x * rhsAsEF64; + output.y = y * rhsAsEF64; + + return output; + } + + ef64_t2 operator*(emulated_float64_t rhs) + { + ef64_t2 output; + output.x = x * rhs; + output.y = y * rhs; + + return output; + } + + ef64_t2 operator*(ef64_t2 rhs) + { + ef64_t2 output; + output.x = x * rhs.x; + output.y = y * rhs.y; + + return output; + } + +#ifdef __HLSL_VERSION + float2 getAsFloat2() + { + return float2(x.getAsFloat32(), y.getAsFloat32()); + } +#endif +}; + +struct ef64_t3 +{ + emulated_float64_t x; + emulated_float64_t y; + emulated_float64_t z; + + NBL_CONSTEXPR_STATIC_INLINE ef64_t3 create(NBL_REF_ARG(ef64_t3) other) + { + ef64_t3 output; + + output.x = other.x; + output.y = other.y; + output.z = other.z; + + return 
output; + } + + NBL_CONSTEXPR_STATIC_INLINE ef64_t3 create(NBL_REF_ARG(ef64_t2) other, emulated_float64_t z) + { + ef64_t3 output; + + output.x = other.x; + output.y = other.y; + output.z = z; + + return output; + } + + NBL_CONSTEXPR_STATIC_INLINE ef64_t3 create(NBL_REF_ARG(ef64_t2) other, int z) + { + ef64_t3 output; + + output.x = other.x; + output.y = other.y; + output.z = emulated_float64_t::create(z); + + return output; + } + + emulated_float64_t calcComponentSum() NBL_CONST_MEMBER_FUNC + { + return x + y + z; + } + + ef64_t3 operator*(NBL_CONST_REF_ARG(ef64_t3) rhs) NBL_CONST_MEMBER_FUNC + { + ef64_t3 output; + output.x = x * rhs.x; + output.y = y * rhs.y; + output.z = z * rhs.z; + + return output; + } +}; + +struct ef64_t4 +{ + emulated_float64_t x; + emulated_float64_t y; + emulated_float64_t z; + emulated_float64_t w; +}; + +struct ef64_t3x3 +{ + ef64_t3 columns[3]; + + ef64_t3x3 getTransposed() NBL_CONST_MEMBER_FUNC + { + ef64_t3x3 output; + + output.columns[0].x = columns[0].x; + output.columns[1].x = columns[0].y; + output.columns[2].x = columns[0].z; + + output.columns[0].y = columns[1].x; + output.columns[1].y = columns[1].y; + output.columns[2].y = columns[1].z; + + output.columns[0].z = columns[2].x; + output.columns[1].z = columns[2].y; + output.columns[2].z = columns[2].z; + + return output; + } + + ef64_t3x3 operator*(NBL_CONST_REF_ARG(ef64_t3x3) rhs) NBL_CONST_MEMBER_FUNC + { + ef64_t3x3 output; + ef64_t3x3 lhsTransposed = getTransposed(); + + output.columns[0].x = (lhsTransposed.columns[0] * rhs.columns[0]).calcComponentSum(); + output.columns[0].y = (lhsTransposed.columns[0] * rhs.columns[1]).calcComponentSum(); + output.columns[0].z = (lhsTransposed.columns[0] * rhs.columns[2]).calcComponentSum(); + + output.columns[1].x = (lhsTransposed.columns[1] * rhs.columns[0]).calcComponentSum(); + output.columns[1].y = (lhsTransposed.columns[1] * rhs.columns[1]).calcComponentSum(); + output.columns[1].z = (lhsTransposed.columns[1] * rhs.columns[2]).calcComponentSum(); + + output.columns[2].x = (lhsTransposed.columns[2] * rhs.columns[0]).calcComponentSum(); + output.columns[2].y = (lhsTransposed.columns[2] * rhs.columns[1]).calcComponentSum(); + output.columns[2].z = (lhsTransposed.columns[2] * rhs.columns[2]).calcComponentSum(); + + // TODO: avoid transpose + return output.getTransposed(); + } + + ef64_t3 operator*(NBL_CONST_REF_ARG(ef64_t3) rhs) + { + ef64_t3 output; + ef64_t3x3 lhsTransposed = getTransposed(); + + output.x = (columns[0] * rhs).calcComponentSum(); + output.y = (columns[1] * rhs).calcComponentSum(); + output.z = (columns[2] * rhs).calcComponentSum(); + + return output; + } +}; + +struct ef64_t2x2 +{ + ef64_t2 columns[2]; + + ef64_t2x2 getTransposed() NBL_CONST_MEMBER_FUNC + { + ef64_t2x2 output; + + output.columns[0].x = columns[0].x; + output.columns[1].x = columns[0].y; + + output.columns[0].y = columns[1].x; + output.columns[1].y = columns[1].y; + + return output; + } + + ef64_t2x2 operator*(NBL_CONST_REF_ARG(ef64_t2x2) rhs) NBL_CONST_MEMBER_FUNC + { + ef64_t2x2 output; + ef64_t2x2 lhsTransposed = getTransposed(); + + output.columns[0].x = (lhsTransposed.columns[0] * rhs.columns[0]).calcComponentSum(); + output.columns[0].y = (lhsTransposed.columns[0] * rhs.columns[1]).calcComponentSum(); + + output.columns[1].x = (lhsTransposed.columns[1] * rhs.columns[0]).calcComponentSum(); + output.columns[1].y = (lhsTransposed.columns[1] * rhs.columns[1]).calcComponentSum(); + + return output.getTransposed(); + } + + ef64_t2 operator*(NBL_CONST_REF_ARG(ef64_t2) rhs) + 
{ + ef64_t2 output; + ef64_t2x2 lhsTransposed = getTransposed(); + + output.x = (columns[0] * rhs).calcComponentSum(); + output.y = (columns[1] * rhs).calcComponentSum(); + + return output; + } +}; + +#endif + +// struct VecT is solution to +// error: 'nbl::hlsl::emulated_float64_t' cannot be used as a type parameter where a scalar is required +// using float_t2 = typename conditional >::value, ef64_t2, vector >::type; + +// TODO: better solution + +#ifndef __HLSL_VERSION +using F64_t = double; +#else +using F64_t = emulated_float64_t; +#endif + +template +struct VecT { using type = void; }; + +template<> +struct VecT { using type = vector; }; +template<> +struct VecT { using type = vector; }; +template<> +struct VecT { using type = vector; }; + +#ifndef __HLSL_VERSION +template<> +struct VecT { using type = float64_t2; }; +template<> +struct VecT { using type = float64_t3; }; +template<> +struct VecT { using type = float64_t4; }; +#endif + +template<> +struct VecT, 2> { using type = ef64_t2; }; +template<> +struct VecT, 3> { using type = ef64_t3; }; +template<> +struct VecT, 4> { using type = ef64_t4; }; + +template +struct Mat2x2T { using type = void; }; +template<> +struct Mat2x2T { using type = float32_t2x2; }; +#ifndef __HLSL_VERSION +template<> +struct Mat2x2T { using type = float64_t2x2; }; +#endif +template<> +struct Mat2x2T > { using type = ef64_t2x2; }; + +//} +} +} +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754.hlsl index e92b45713f..13308bb2dd 100644 --- a/include/nbl/builtin/hlsl/ieee754.hlsl +++ b/include/nbl/builtin/hlsl/ieee754.hlsl @@ -11,10 +11,12 @@ namespace hlsl { namespace ieee754 { + +// TODO: move to builtin/hlsl/impl/ieee754_impl.hlsl? namespace impl { template - NBL_CONSTEXPR_STATIC_INLINE bool isTypeAllowed() + NBL_CONSTEXPR_INLINE_FUNC bool isTypeAllowed() { return is_same::value || is_same::value || @@ -25,25 +27,25 @@ namespace impl } template - NBL_CONSTEXPR_STATIC_INLINE typename unsigned_integer_of_size::type castToUintType(T x) + NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type castToUintType(T x) { using AsUint = typename unsigned_integer_of_size::type; return bit_cast(x); } // to avoid bit cast from uintN_t to uintN_t - template <> NBL_CONSTEXPR_STATIC_INLINE unsigned_integer_of_size<2>::type castToUintType(uint16_t x) { return x; } - template <> NBL_CONSTEXPR_STATIC_INLINE unsigned_integer_of_size<4>::type castToUintType(uint32_t x) { return x; } - template <> NBL_CONSTEXPR_STATIC_INLINE unsigned_integer_of_size<8>::type castToUintType(uint64_t x) { return x; } + template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size<2>::type castToUintType(uint16_t x) { return x; } + template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size<4>::type castToUintType(uint32_t x) { return x; } + template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size<8>::type castToUintType(uint64_t x) { return x; } template - NBL_CONSTEXPR_STATIC_INLINE T castBackToFloatType(T x) + NBL_CONSTEXPR_INLINE_FUNC T castBackToFloatType(T x) { using AsFloat = typename float_of_size::type; return bit_cast(x); } - template<> NBL_CONSTEXPR_STATIC_INLINE uint16_t castBackToFloatType(uint16_t x) { return x; } - template<> NBL_CONSTEXPR_STATIC_INLINE uint32_t castBackToFloatType(uint32_t x) { return x; } - template<> NBL_CONSTEXPR_STATIC_INLINE uint64_t castBackToFloatType(uint64_t x) { return x; } + template<> NBL_CONSTEXPR_INLINE_FUNC uint16_t castBackToFloatType(uint16_t x) { return x; } + 
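// same motivation as the castToUintType specializations above: avoid a bit_cast from
+	// uintN_t to uintN_t, the unsigned value already is the raw bit pattern so it is returned as-is
+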
template<> NBL_CONSTEXPR_INLINE_FUNC uint32_t castBackToFloatType(uint32_t x) { return x; } + template<> NBL_CONSTEXPR_INLINE_FUNC uint64_t castBackToFloatType(uint64_t x) { return x; } } template @@ -89,37 +91,39 @@ struct traits : traits_base NBL_CONSTEXPR_STATIC_INLINE bit_rep_t inf = exponentMask; NBL_CONSTEXPR_STATIC_INLINE bit_rep_t specialValueExp = (1ull << base_t::exponentBitCnt) - 1; NBL_CONSTEXPR_STATIC_INLINE bit_rep_t quietNaN = exponentMask | (1ull << (base_t::mantissaBitCnt - 1)); + NBL_CONSTEXPR_STATIC_INLINE bit_rep_t max = ((1ull << (sizeof(Float) * 8 - 1)) - 1) & (~(1ull << base_t::mantissaBitCnt)); + NBL_CONSTEXPR_STATIC_INLINE bit_rep_t min = 1ull << base_t::mantissaBitCnt; }; template -static inline uint32_t extractBiasedExponent(T x) +inline uint32_t extractBiasedExponent(T x) { using AsUint = typename unsigned_integer_of_size::type; return glsl::bitfieldExtract(impl::castToUintType(x), traits::type>::mantissaBitCnt, traits::type>::exponentBitCnt); } template<> -static inline uint32_t extractBiasedExponent(uint64_t x) +inline uint32_t extractBiasedExponent(uint64_t x) { const uint32_t highBits = uint32_t(x >> 32); return glsl::bitfieldExtract(highBits, traits::mantissaBitCnt - 32, traits::exponentBitCnt); } template<> -static inline uint32_t extractBiasedExponent(float64_t x) +inline uint32_t extractBiasedExponent(float64_t x) { return extractBiasedExponent(impl::castToUintType(x)); } template -static inline int extractExponent(T x) +inline int extractExponent(T x) { using AsFloat = typename float_of_size::type; return int(extractBiasedExponent(x)) - int(traits::exponentBias); } template -NBL_CONSTEXPR_STATIC_INLINE T replaceBiasedExponent(T x, typename unsigned_integer_of_size::type biasedExp) +NBL_CONSTEXPR_INLINE_FUNC T replaceBiasedExponent(T x, typename unsigned_integer_of_size::type biasedExp) { // TODO: //staticAssertTmp(impl::isTypeAllowed(), "Invalid type! 
Only floating point or unsigned integer types are allowed."); @@ -129,26 +133,26 @@ NBL_CONSTEXPR_STATIC_INLINE T replaceBiasedExponent(T x, typename unsigned_integ // performs no overflow tests, returns x*exp2(n) template -NBL_CONSTEXPR_STATIC_INLINE T fastMulExp2(T x, int n) +NBL_CONSTEXPR_INLINE_FUNC T fastMulExp2(T x, int n) { return replaceBiasedExponent(x, extractBiasedExponent(x) + uint32_t(n)); } template -NBL_CONSTEXPR_STATIC_INLINE typename unsigned_integer_of_size::type extractMantissa(T x) +NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type extractMantissa(T x) { using AsUint = typename unsigned_integer_of_size::type; return impl::castToUintType(x) & traits::type>::mantissaMask; } template -NBL_CONSTEXPR_STATIC_INLINE typename unsigned_integer_of_size::type extractSign(T x) +NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type extractSign(T x) { return (impl::castToUintType(x) & traits::signMask) >> ((sizeof(T) * 8) - 1); } template -NBL_CONSTEXPR_STATIC_INLINE typename unsigned_integer_of_size::type extractSignPreserveBitPattern(T x) +NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type extractSignPreserveBitPattern(T x) { return impl::castToUintType(x) & traits::signMask; } diff --git a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl index da9586207f..bdaa9202a9 100644 --- a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl @@ -47,7 +47,7 @@ namespace hlsl { namespace impl { -NBL_CONSTEXPR_STATIC_INLINE uint64_t2 shiftMantissaLeftBy53(uint64_t mantissa64) +NBL_CONSTEXPR_INLINE_FUNC uint64_t2 shiftMantissaLeftBy53(uint64_t mantissa64) { uint64_t2 output; output.x = mantissa64 >> (64 - ieee754::traits::mantissaBitCnt); @@ -56,24 +56,102 @@ NBL_CONSTEXPR_STATIC_INLINE uint64_t2 shiftMantissaLeftBy53(uint64_t mantissa64) return output; } -template -NBL_CONSTEXPR_STATIC_INLINE uint64_t promoteToUint64(T val) +NBL_CONSTEXPR_INLINE_FUNC uint64_t packFloat64(uint32_t zSign, int zExp, uint32_t zFrac0, uint32_t zFrac1) { - using AsFloat = typename float_of_size::type; - uint64_t asUint = ieee754::impl::castToUintType(val); + uint32_t2 z; - const uint64_t sign = (uint64_t(ieee754::traits::signMask) & asUint) << (sizeof(float64_t) - sizeof(T)); - const int64_t newExponent = ieee754::extractExponent(val) + ieee754::traits::exponentBias; + z.x = zSign + (uint32_t(zExp) << 20) + zFrac0; + z.y = zFrac1; - const uint64_t exp = (uint64_t(ieee754::extractExponent(val)) + ieee754::traits::exponentBias) << (ieee754::traits::mantissaBitCnt); - const uint64_t mantissa = (uint64_t(ieee754::traits::mantissaMask) & asUint) << (ieee754::traits::exponentBias - ieee754::traits::mantissaBitCnt); + uint64_t output = 0u; + output |= (uint64_t(z.x) << 32) & 0xFFFFFFFF00000000ull; + output |= uint64_t(z.y); + return output; +} - return sign | exp | mantissa; +NBL_CONSTEXPR_INLINE_FUNC uint32_t2 packUint64(uint64_t val) +{ + return uint32_t2((val & 0xFFFFFFFF00000000ull) >> 32, val & 0x00000000FFFFFFFFull); +} + +NBL_CONSTEXPR_INLINE_FUNC uint64_t unpackUint64(uint32_t2 val) +{ + return ((uint64_t(val.x) & 0x00000000FFFFFFFFull) << 32) | uint64_t(val.y); +} + +inline uint64_t castToUint64WithFloat64BitPattern(float32_t val) +{ + uint32_t asUint = ieee754::impl::castToUintType(val); + + const uint64_t sign = (uint64_t(ieee754::traits::signMask) & asUint) << (sizeof(float32_t) * 8); + + const uint64_t biasedExp = 
(uint64_t(ieee754::extractExponent(val)) + ieee754::traits::exponentBias) << (ieee754::traits::mantissaBitCnt); + const uint64_t mantissa = (uint64_t(ieee754::traits::mantissaMask) & asUint) << (ieee754::traits::mantissaBitCnt - ieee754::traits::mantissaBitCnt); + + return sign | biasedExp | mantissa; }; -template<> NBL_CONSTEXPR_STATIC_INLINE uint64_t promoteToUint64(float64_t val) { return bit_cast(val); } +inline uint64_t castToUint64WithFloat64BitPattern(uint64_t val) +{ + if (val == 0) + return val; + +#ifndef __HLSL_VERSION + int exp = findMSB(val); +#else + uint32_t2 valPacked = packUint64(val); + int exp = valPacked.x ? firstbithigh(valPacked.x) + 32 : firstbithigh(valPacked.y); +#endif + uint64_t mantissa; + + int shiftCnt = 52 - exp; + if (shiftCnt >= 0) + { + mantissa = val << shiftCnt; + } + else + { + const int shiftCntAbs = -shiftCnt; + uint64_t roundingBit = 1ull << (shiftCnt - 1); + uint64_t stickyBitMask = roundingBit - 1; + uint64_t stickyBit = val & stickyBitMask; + + mantissa = val >> shiftCntAbs; -NBL_CONSTEXPR_STATIC_INLINE uint32_t2 umulExtended(uint32_t lhs, uint32_t rhs) + if ((val & roundingBit) && (!stickyBit)) + { + bool isEven = mantissa & 1; + if (!isEven) + mantissa++; + } + else if ((val & roundingBit) && (stickyBit || (mantissa & 1))) + val += roundingBit; + + + + //val += (1ull << (shiftCnt)) - 1; + //mantissa = val >> shiftCntAbs; + + if (mantissa & 1ull << 53) + { + mantissa >>= 1; + exp++; + } + } + mantissa &= ieee754::traits::mantissaMask; + const uint64_t biasedExp = uint64_t(ieee754::traits::exponentBias + exp) << ieee754::traits::mantissaBitCnt; + + return biasedExp | mantissa; +}; + +inline uint64_t castToUint64WithFloat64BitPattern(int64_t val) +{ + const uint64_t sign = val & ieee754::traits::signMask; + const uint64_t absVal = abs(val); + return sign | castToUint64WithFloat64BitPattern(absVal); +}; + +NBL_CONSTEXPR_INLINE_FUNC uint32_t2 umulExtended(uint32_t lhs, uint32_t rhs) { uint64_t product = uint64_t(lhs) * uint64_t(rhs); uint32_t2 output; @@ -82,7 +160,7 @@ NBL_CONSTEXPR_STATIC_INLINE uint32_t2 umulExtended(uint32_t lhs, uint32_t rhs) return output; } -NBL_CONSTEXPR_STATIC_INLINE uint64_t propagateFloat64NaN(uint64_t lhs, uint64_t rhs) +NBL_CONSTEXPR_INLINE_FUNC uint64_t propagateFloat64NaN(uint64_t lhs, uint64_t rhs) { #if defined RELAXED_NAN_PROPAGATION return lhs | rhs; @@ -95,30 +173,7 @@ NBL_CONSTEXPR_STATIC_INLINE uint64_t propagateFloat64NaN(uint64_t lhs, uint64_t #endif } -NBL_CONSTEXPR_STATIC_INLINE uint64_t packFloat64(uint32_t zSign, int zExp, uint32_t zFrac0, uint32_t zFrac1) -{ - uint32_t2 z; - - z.x = zSign + (uint32_t(zExp) << 20) + zFrac0; - z.y = zFrac1; - - uint64_t output = 0u; - output |= (uint64_t(z.x) << 32) & 0xFFFFFFFF00000000ull; - output |= uint64_t(z.y); - return output; -} - -NBL_CONSTEXPR_STATIC_INLINE uint32_t2 packUint64(uint64_t val) -{ - return uint32_t2((val & 0xFFFFFFFF00000000ull) >> 32, val & 0x00000000FFFFFFFFull); -} - -NBL_CONSTEXPR_STATIC_INLINE uint64_t unpackUint64(uint32_t2 val) -{ - return ((uint64_t(val.x) & 0x00000000FFFFFFFFull) << 32) | uint64_t(val.y); -} - -NBL_CONSTEXPR_STATIC_INLINE uint32_t2 add64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) +NBL_CONSTEXPR_INLINE_FUNC uint32_t2 add64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) { uint32_t2 output; output.y = a1 + b1; @@ -127,7 +182,7 @@ NBL_CONSTEXPR_STATIC_INLINE uint32_t2 add64(uint32_t a0, uint32_t a1, uint32_t b return output; } -NBL_CONSTEXPR_STATIC_INLINE uint32_t2 sub64(uint32_t a0, uint32_t a1, uint32_t b0, 
uint32_t b1) +NBL_CONSTEXPR_INLINE_FUNC uint32_t2 sub64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) { uint32_t2 output; output.y = a1 - b1; @@ -146,7 +201,7 @@ static inline int countLeadingZeros32(uint32_t val) #endif } -NBL_CONSTEXPR_STATIC_INLINE uint32_t2 shift64RightJamming(uint32_t2 val, int count) +NBL_CONSTEXPR_INLINE_FUNC uint32_t2 shift64RightJamming(uint32_t2 val, int count) { uint32_t2 output; const int negCount = (-count) & 31; @@ -166,7 +221,7 @@ NBL_CONSTEXPR_STATIC_INLINE uint32_t2 shift64RightJamming(uint32_t2 val, int cou } -NBL_CONSTEXPR_STATIC_INLINE uint32_t4 mul64to128(uint32_t4 mantissasPacked) +NBL_CONSTEXPR_INLINE_FUNC uint32_t4 mul64to128(uint32_t4 mantissasPacked) { uint32_t4 output; uint32_t more1 = 0u; @@ -205,7 +260,7 @@ NBL_CONSTEXPR_STATIC_INLINE uint32_t4 mul64to128(uint32_t4 mantissasPacked) return output; } -NBL_CONSTEXPR_STATIC_INLINE uint32_t3 shift64ExtraRightJamming(uint32_t3 val, int count) +NBL_CONSTEXPR_INLINE_FUNC uint32_t3 shift64ExtraRightJamming(uint32_t3 val, int count) { uint32_t3 output; output.x = 0u; @@ -233,7 +288,7 @@ NBL_CONSTEXPR_STATIC_INLINE uint32_t3 shift64ExtraRightJamming(uint32_t3 val, in return output; } -NBL_CONSTEXPR_STATIC_INLINE uint64_t shortShift64Left(uint64_t val, int count) +NBL_CONSTEXPR_INLINE_FUNC uint64_t shortShift64Left(uint64_t val, int count) { const uint32_t2 packed = packUint64(val); @@ -245,12 +300,12 @@ NBL_CONSTEXPR_STATIC_INLINE uint64_t shortShift64Left(uint64_t val, int count) return unpackUint64(output); }; -NBL_CONSTEXPR_STATIC_INLINE uint64_t assembleFloat64(uint64_t signShifted, uint64_t expShifted, uint64_t mantissa) +NBL_CONSTEXPR_INLINE_FUNC uint64_t assembleFloat64(uint64_t signShifted, uint64_t expShifted, uint64_t mantissa) { return signShifted + expShifted + mantissa; } -NBL_CONSTEXPR_STATIC_INLINE uint64_t roundAndPackFloat64(uint64_t zSign, int zExp, uint32_t3 mantissaExtended) +NBL_CONSTEXPR_INLINE_FUNC uint64_t roundAndPackFloat64(uint64_t zSign, int zExp, uint32_t3 mantissaExtended) { bool roundNearestEven; bool increment; @@ -372,7 +427,7 @@ static inline void normalizeFloat64Subnormal(uint64_t mantissa, outMantissa = tgmath::lerp(outMantissa, unpackUint64(temp), mantissaPacked.x == 0); } -NBL_CONSTEXPR_STATIC_INLINE bool areBothInfinity(uint64_t lhs, uint64_t rhs) +NBL_CONSTEXPR_INLINE_FUNC bool areBothInfinity(uint64_t lhs, uint64_t rhs) { lhs &= ~ieee754::traits::signMask; rhs &= ~ieee754::traits::signMask; @@ -380,17 +435,17 @@ NBL_CONSTEXPR_STATIC_INLINE bool areBothInfinity(uint64_t lhs, uint64_t rhs) return lhs == rhs && lhs == ieee754::traits::inf; } -NBL_CONSTEXPR_STATIC_INLINE bool areBothSameSignInfinity(uint64_t lhs, uint64_t rhs) +NBL_CONSTEXPR_INLINE_FUNC bool areBothSameSignInfinity(uint64_t lhs, uint64_t rhs) { return lhs == rhs && (lhs & ~ieee754::traits::signMask) == ieee754::traits::inf; } -NBL_CONSTEXPR_STATIC_INLINE bool areBothZero(uint64_t lhs, uint64_t rhs) +NBL_CONSTEXPR_INLINE_FUNC bool areBothZero(uint64_t lhs, uint64_t rhs) { return ((lhs << 1) == 0ull) && ((rhs << 1) == 0ull); } -NBL_CONSTEXPR_STATIC_INLINE bool areBothSameSignZero(uint64_t lhs, uint64_t rhs) +NBL_CONSTEXPR_INLINE_FUNC bool areBothSameSignZero(uint64_t lhs, uint64_t rhs) { return ((lhs << 1) == 0ull) && (lhs == rhs); } diff --git a/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl b/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl index 9ec0156bc4..d8dacdc7bb 100644 --- a/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl +++ 
b/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl @@ -5,6 +5,8 @@ #ifndef _NBL_BUILTIN_HLSL_MATH_EQUATIONS_QUADRATIC_INCLUDED_ #define _NBL_BUILTIN_HLSL_MATH_EQUATIONS_QUADRATIC_INCLUDED_ +#include + // TODO: Later include from correct hlsl header #ifndef nbl_hlsl_FLT_EPSILON #define nbl_hlsl_FLT_EPSILON 5.96046447754e-08 @@ -24,53 +26,53 @@ namespace math { namespace equations { - template - struct Quadratic +template +struct Quadratic +{ + using float_t22 = typename VecT::type; + using float_t33 = typename VecT::type; + + float_t a; + float_t b; + float_t c; + + static Quadratic construct(float_t a, float_t b, float_t c) { - using float_t2 = vector; - using float_t3 = vector; + Quadratic ret = { a, b, c }; + return ret; + } - float_t a; - float_t b; - float_t c; + float_t evaluate(float_t t) + { + return t * (a * t + b) + c; + } - static Quadratic construct(float_t a, float_t b, float_t c) - { - Quadratic ret = { a, b, c }; - return ret; - } + float_t22 computeRoots() + { + float_t22 ret; + + const float_t det = b * b - 4.0 * a * c; + const float_t detSqrt = sqrt(det); + const float_t rcp = 0.5 / a; + const float_t bOver2A = b * rcp; - float_t evaluate(float_t t) + float_t t0 = 0.0, t1 = 0.0; + if (b >= 0) { - return t * (a * t + b) + c; + ret[0] = -detSqrt * rcp - bOver2A; + ret[1] = 2 * c / (-b - detSqrt); } - - float_t2 computeRoots() + else { - float_t2 ret; - - const float_t det = b * b - 4.0 * a * c; - const float_t detSqrt = sqrt(det); - const float_t rcp = 0.5 / a; - const float_t bOver2A = b * rcp; - - float_t t0 = 0.0, t1 = 0.0; - if (b >= 0) - { - ret[0] = -detSqrt * rcp - bOver2A; - ret[1] = 2 * c / (-b - detSqrt); - } - else - { - ret[0] = 2 * c / (-b + detSqrt); - ret[1] = +detSqrt * rcp - bOver2A; - } - - return ret; + ret[0] = 2 * c / (-b + detSqrt); + ret[1] = +detSqrt * rcp - bOver2A; } + return ret; + } + - }; +}; } } } diff --git a/include/nbl/builtin/hlsl/math/equations/quartic.hlsl b/include/nbl/builtin/hlsl/math/equations/quartic.hlsl index b94de0afb5..882f7a33ba 100644 --- a/include/nbl/builtin/hlsl/math/equations/quartic.hlsl +++ b/include/nbl/builtin/hlsl/math/equations/quartic.hlsl @@ -24,9 +24,9 @@ namespace equations template struct Quartic { - using float_t2 = vector; - using float_t3 = vector; - using float_t4 = vector; + using float_t2 = typename VecT::type; + using float_t3 = typename VecT::type; + using float_t4 = typename VecT::type; // form: ax^4 + bx^3 + cx^2 + dx + e float_t a; diff --git a/include/nbl/builtin/hlsl/shapes/beziers.hlsl b/include/nbl/builtin/hlsl/shapes/beziers.hlsl index e454c4c8ef..f966fc138c 100644 --- a/include/nbl/builtin/hlsl/shapes/beziers.hlsl +++ b/include/nbl/builtin/hlsl/shapes/beziers.hlsl @@ -11,6 +11,7 @@ #include #include #include +#include // TODO: Later include from correct hlsl header (numeric_limits.hlsl) #ifndef nbl_hlsl_FLT_EPSILON @@ -28,51 +29,16 @@ namespace nbl namespace hlsl { -// TODO(emulated_float64_t): this shouldn't be in the nbl::hlsl space -// struct VecT is solution to -// error: 'nbl::hlsl::emulated_float64_t' cannot be used as a type parameter where a scalar is required -// using float_t2 = typename conditional >::value, ef64_t2, vector >::type; -#ifdef __HLSL_VERSION -template -struct VecT { using type = void; }; -template<> -struct VecT { using type = vector; }; -template<> -struct VecT { using type = vector; }; -template<> -struct VecT { using type = vector; }; -template<> -struct VecT, 2> { using type = ef64_t2; }; -template<> -struct VecT, 3> { using type = ef64_t3; }; -template<> -struct 
VecT, 4> { using type = float64_t4; }; - -template -struct Mat2x2T { using type = float64_t2x2; }; -template<> -struct Mat2x2T { using type = float64_t2x2; }; -template<> -struct Mat2x2T > { using type = ef64_t2x2; }; - -#endif - namespace shapes { template struct QuadraticBezier { -#ifndef __HLSL_VERSION - using float_t2 = vector; - using float_t3 = vector; - using float_t4 = vector; - using float_t2x2 = matrix; -#else using float_t2 = typename VecT::type; using float_t3 = typename VecT::type; using float_t4 = typename VecT::type; using float_t2x2 = typename Mat2x2T::type; -#endif + float_t2 P0; float_t2 P1; float_t2 P2; @@ -245,18 +211,29 @@ struct QuadraticBezier template struct Quadratic { +#ifndef __HLSL_VERSION using scalar_t = float_t; using float_t2 = vector; using float_t3 = vector; using float_t2x2 = matrix; - +#else + using scalar_t = float_t; + using float_t2 = typename VecT::type; + using float_t3 = typename VecT::type; + using float_t2x2 = typename Mat2x2T::type; +#endif + float_t2 A; float_t2 B; float_t2 C; struct AnalyticArcLengthCalculator { +#ifndef __HLSL_VERSION using float_t2 = vector; +#else + using float_t2 = typename VecT::type; +#endif static AnalyticArcLengthCalculator construct(float_t lenA2, float_t AdotB, float_t a, float_t b, float_t c, float_t b_over_4a) { @@ -547,6 +524,13 @@ struct Quadratic template static math::equations::Quartic getBezierBezierIntersectionEquation(NBL_CONST_REF_ARG(QuadraticBezier) lhs, NBL_CONST_REF_ARG(QuadraticBezier) rhs) { +#ifndef __HLSL_VERSION + using scalar_t = double; +#else + using scalar_t = emulated_float64_t; +#endif + using float_t2 = typename VecT::type; + // Algorithm based on Computer Aided Geometric Design: // https://scholarsarchive.byu.edu/cgi/viewcontent.cgi?article=1000&context=facpub#page99 // Chapter 17.6 describes the implicitization of a curve, which transforms it into the following format: @@ -581,30 +565,37 @@ static math::equations::Quartic getBezierBezierIntersectionEquation(NBL Quadratic quadratic = Quadratic::constructFromBezier(lhs); // for convenience - const float64_t2 A = quadratic.A; - const float64_t2 B = quadratic.B; - const float64_t2 C = quadratic.C; + const float_t2 A = quadratic.A; + const float_t2 B = quadratic.B; + const float_t2 C = quadratic.C; // substitute parametric into implicit equation: // Getting the quartic params - double a = ((A.x * A.x) * k0) + (A.x * A.y * k1) + (A.y * A.y * k2); - double b = (2 * A.x * B.x * k0) + (A.x * B.y * k1) + (B.x * A.y * k1) + (2 * A.y * B.y * k2); - double c = (2 * A.x * C.x * k0) + (A.x * C.y * k1) + (A.x * k3) + ((B.x * B.x) * k0) + (B.x * B.y * k1) + (C.x * A.y * k1) + (2 * A.y * C.y * k2) + (A.y * k4) + ((B.y * B.y) * k2); - double d = (2 * B.x * C.x * k0) + (B.x * C.y * k1) + (B.x * k3) + (C.x * B.y * k1) + (2 * B.y * C.y * k2) + (B.y * k4); - double e = ((C.x * C.x) * k0) + (C.x * C.y * k1) + (C.x * k3) + ((C.y * C.y) * k2) + (C.y * k4) + (k5); + scalar_t a = ((A.x * A.x) * k0) + (A.x * A.y * k1) + (A.y * A.y * k2); + scalar_t b = (A.x * B.x * k0 * 2.0f) + (A.x * B.y * k1) + (B.x * A.y * k1) + (A.y * B.y * k2 * 2.0f); + scalar_t c = (A.x * C.x * k0 * 2.0f) + (A.x * C.y * k1) + (A.x * k3) + ((B.x * B.x) * k0) + (B.x * B.y * k1) + (C.x * A.y * k1) + (A.y * C.y * k2 * 2.0f) + (A.y * k4) + ((B.y * B.y) * k2); + scalar_t d = (B.x * C.x * k0 * 2.0f) + (B.x * C.y * k1) + (B.x * k3) + (C.x * B.y * k1) + (B.y * C.y * k2 * 2.0f) + (B.y * k4); + scalar_t e = ((C.x * C.x) * k0) + (C.x * C.y * k1) + (C.x * k3) + ((C.y * C.y) * k2) + (C.y * k4) + (k5); - 
return math::equations::Quartic::construct(a, b, c, d, e); + return math::equations::Quartic::construct(a, b, c, d, e); } // This function returns the analytic quadratic equation to solve for bezier's t value for intersection with another bezier curve template -static math::equations::Quadratic getBezierLineIntersectionEquation(QuadraticBezier bezier, NBL_CONST_REF_ARG(vector) lineStart, NBL_CONST_REF_ARG(vector) lineVector) +static math::equations::Quadratic getBezierLineIntersectionEquation(QuadraticBezier bezier, NBL_CONST_REF_ARG(typename VecT::type) lineStart, NBL_CONST_REF_ARG(typename VecT::type) lineVector) { +#ifndef __HLSL_VERSION using float_t2 = vector; using float_t3 = vector; using float_t4 = vector; using float_t2x2 = matrix; +#else + using float_t2 = typename VecT::type; + using float_t3 = typename VecT::type; + using float_t4 = typename VecT::type; + using float_t2x2 = typename Mat2x2T::type; +#endif float_t2 lineDir = normalize(lineVector); float_t2x2 rotate = float_t2x2(float_t2(lineDir.x, lineDir.y), float_t2(-lineDir.y, lineDir.x)); @@ -612,7 +603,7 @@ static math::equations::Quadratic getBezierLineIntersectionEquation(Qua bezier.P1 = mul(rotate, bezier.P1 - lineStart); bezier.P2 = mul(rotate, bezier.P2 - lineStart); Quadratic quadratic = Quadratic::constructFromBezier(bezier); - return math::equations::Quadratic::construct(quadratic.A.y, quadratic.B.y, quadratic.C.y); + return math::equations::Quadratic::construct(quadratic.A.y, quadratic.B.y, quadratic.C.y); } } // namespace shapes diff --git a/include/nbl/builtin/hlsl/tgmath.hlsl b/include/nbl/builtin/hlsl/tgmath.hlsl index d3afb09bb7..3be5230f7f 100644 --- a/include/nbl/builtin/hlsl/tgmath.hlsl +++ b/include/nbl/builtin/hlsl/tgmath.hlsl @@ -16,31 +16,25 @@ namespace hlsl namespace tgmath { -template -static inline bool isnan(Float val) +template +inline bool isnan(T val) { - using AsUint = typename unsigned_integer_of_size::type; - using AsFloat = typename float_of_size::type; - AsUint asUint = bit_cast(val); - return bool((ieee754::extractBiasedExponent(val) == ieee754::traits::specialValueExp) && (asUint & ieee754::traits::mantissaMask)); -} + using AsUint = typename unsigned_integer_of_size::type; + using AsFloat = typename float_of_size::type; -template <> -static inline bool isnan(uint64_t val) -{ - float64_t asFloat = bit_cast(val); - return bool((ieee754::extractBiasedExponent(asFloat) == ieee754::traits::specialValueExp) && (val & ieee754::traits::mantissaMask)); + AsUint asUint = bit_cast(val); + return bool((ieee754::extractBiasedExponent(val) == ieee754::traits::specialValueExp) && (asUint & ieee754::traits::mantissaMask)); } // TODO: better implementation, also i'm not sure this is the right place for this function template -NBL_CONSTEXPR_STATIC_INLINE Uint lerp(Uint a, Uint b, bool c) +NBL_CONSTEXPR_INLINE_FUNC Uint lerp(Uint a, Uint b, bool c) { return c ? 
b : a; } template -NBL_CONSTEXPR_STATIC_INLINE bool isInf(Uint val) +NBL_CONSTEXPR_INLINE_FUNC bool isInf(Uint val) { using AsFloat = typename float_of_size::type; return (val & ~ieee754::traits::signMask) == ieee754::traits::inf; diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index e2040ce5fe..1e584b0a97 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -235,8 +235,9 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/macros.h") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/impl/emulated_float64_t_impl.hlsl") #emulated LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated_float64_t.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated_float64_t_utils.hlsl") #utility +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ieee754.hlsl") #spirv intrinsics LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/core.hlsl") From 7af8d63d023ebecb3ed16277049d263ccef61c20 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Tue, 20 Aug 2024 17:07:47 +0100 Subject: [PATCH 032/358] Fixes --- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index d6666c018b..d77a22ae2b 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -22,37 +22,31 @@ namespace hlsl NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(emulated_float64_t val) { - return createPreserveBitPattern(bit_cast(float64_t(val))); return createPreserveBitPattern(val.data); } NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(int32_t val) { - return createPreserveBitPattern(bit_cast(float64_t(val))); return emulated_float64_t::createPreserveBitPattern(impl::castToUint64WithFloat64BitPattern(int64_t(val))); } NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(int64_t val) { - return createPreserveBitPattern(bit_cast(float64_t(val))); return emulated_float64_t::createPreserveBitPattern(impl::castToUint64WithFloat64BitPattern(val)); } NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(uint32_t val) { - return createPreserveBitPattern(bit_cast(float64_t(val))); return emulated_float64_t::createPreserveBitPattern(impl::castToUint64WithFloat64BitPattern(uint64_t(val))); } NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(uint64_t val) { - return createPreserveBitPattern(bit_cast(float64_t(val))); return emulated_float64_t::createPreserveBitPattern(impl::castToUint64WithFloat64BitPattern(val)); } NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(float32_t val) { - return createPreserveBitPattern(bit_cast(float64_t(val))); emulated_float64_t output; output.data = impl::castToUint64WithFloat64BitPattern(val); return output; @@ -85,11 +79,25 @@ namespace hlsl inline float getAsFloat32() { + int exponent = ieee754::extractExponent(data); + if (!FastMath) + { + if (exponent > 127) + return bit_cast(ieee754::traits::inf); + if (exponent < -126) + return -bit_cast(ieee754::traits::inf); + if (tgmath::isnan(data)) + return bit_cast(ieee754::traits::quietNaN); + } + + //return float(bit_cast(data)); // TODO: fix - uint32_t sign = uint32_t((data & ieee754::traits::signMask) >> 32); - uint32_t exponent = (uint32_t(ieee754::extractExponent(data)) + 
ieee754::traits::exponentBias) + ieee754::traits::mantissaBitCnt; + uint32_t sign = uint32_t((data & ieee754::traits::signMask) >> 32); + uint32_t biasedExponent = uint32_t(exponent + ieee754::traits::exponentBias) << ieee754::traits::mantissaBitCnt; uint32_t mantissa = uint32_t(data >> (ieee754::traits::mantissaBitCnt - ieee754::traits::mantissaBitCnt)) & ieee754::traits::mantissaMask; - return sign | exponent | mantissa; + + return bit_cast(sign | biasedExponent | mantissa); + } #if 0 From bc379a7002f253cc1e9157affb99e73e457b1662 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Thu, 22 Aug 2024 18:47:46 +0100 Subject: [PATCH 033/358] Refactor --- examples_tests | 2 +- .../hlsl/emulated_float64_t_utils.hlsl | 454 +++++++++++------- .../hlsl/math/equations/quadratic.hlsl | 7 +- .../builtin/hlsl/math/equations/quartic.hlsl | 6 +- include/nbl/builtin/hlsl/shapes/beziers.hlsl | 72 +-- 5 files changed, 308 insertions(+), 233 deletions(-) diff --git a/examples_tests b/examples_tests index 8e853f5518..0bfc208a6e 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 8e853f551805370c64ed72313c1195bf6ededb70 +Subproject commit 0bfc208a6e658f5b41d85d5afc74d4073a0d2f91 diff --git a/include/nbl/builtin/hlsl/emulated_float64_t_utils.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t_utils.hlsl index 4fe18cc5e8..8b8ee0966c 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t_utils.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t_utils.hlsl @@ -1,220 +1,240 @@ #ifndef _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_UTILS_INCLUDED_ #define _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_UTILS_INCLUDED_ +#include #include namespace nbl { namespace hlsl { -// should i use this namespace? -//namespace ef64_util -//{ -// TODO: this is mess, refactorize it +// TODO: enable +//template +//using portable_float64_t = conditional_t::shaderFloat64, float64_t, typename emulated_float64_t >; + #ifndef __HLSL_VERSION -using ef64_t2 = float64_t2; -using ef64_t3 = float64_t3; -using ef64_t4 = float64_t4; -using ef64_t3x3 = float64_t3x3; -using ef64_t2x2 = float64_t4x4; +template +using portable_float64_t = typename conditional >::type; #else -struct ef64_t2 +template +using portable_float64_t = typename conditional >::type; +#endif + +template +struct emulated_vector {}; + +template +struct emulated_vector { - emulated_float64_t x; - emulated_float64_t y; + using type = emulated_vector; - emulated_float64_t calcComponentSum() NBL_CONST_MEMBER_FUNC + EmulatedType x; + EmulatedType y; + + EmulatedType calcComponentSum() NBL_CONST_MEMBER_FUNC { return x + y; } - NBL_CONSTEXPR_STATIC_INLINE ef64_t2 create(emulated_float64_t x, emulated_float64_t y) + NBL_CONSTEXPR_STATIC_INLINE type create(EmulatedType x, EmulatedType y) { - ef64_t2 output; + type output; output.x = x; output.y = y; return output; } - NBL_CONSTEXPR_STATIC_INLINE ef64_t2 create(float val) - { - ef64_t2 output; - output.x = emulated_float64_t::create(val); - output.y = emulated_float64_t::create(val); - - return output; - } - - NBL_CONSTEXPR_STATIC_INLINE ef64_t2 create(float32_t2 val) - { - ef64_t2 output; - output.x = emulated_float64_t::create(val.x); - output.y = emulated_float64_t::create(val.y); - - return output; - } - - NBL_CONSTEXPR_STATIC_INLINE ef64_t2 create(uint32_t2 val) + type operator+(float rhs) { - ef64_t2 output; - output.x = emulated_float64_t::create(val.x); - output.y = emulated_float64_t::create(val.y); - - return output; - } - - ef64_t2 operator+(float rhs) - { - ef64_t2 output; - emulated_float64_t rhsAsEF64 = 
emulated_float64_t::create(rhs); + type output; + EmulatedType rhsAsEF64 = EmulatedType::create(rhs); output.x = x + rhsAsEF64; output.y = y + rhsAsEF64; return output; } - ef64_t2 operator+(emulated_float64_t rhs) + type operator+(EmulatedType rhs) { - ef64_t2 output; + type output; output.x = x + rhs; output.y = y + rhs; return output; } - ef64_t2 operator+(ef64_t2 rhs) + type operator+(type rhs) { - ef64_t2 output; + type output; output.x = x + rhs.x; output.y = y + rhs.y; return output; } - ef64_t2 operator-(float rhs) + type operator-(float rhs) { return create(x, y) + (-rhs); } - ef64_t2 operator-(emulated_float64_t rhs) + type operator-(EmulatedType rhs) { return create(x, y) + (rhs.flipSign()); } - ef64_t2 operator-(ef64_t2 rhs) + type operator-(type rhs) { rhs.x = rhs.x.flipSign(); rhs.y = rhs.y.flipSign(); return create(x, y) + rhs; } - ef64_t2 operator*(float rhs) + type operator*(float rhs) { - ef64_t2 output; - emulated_float64_t rhsAsEF64 = emulated_float64_t::create(rhs); + type output; + EmulatedType rhsAsEF64 = EmulatedType::create(rhs); output.x = x * rhsAsEF64; output.y = y * rhsAsEF64; return output; } - ef64_t2 operator*(emulated_float64_t rhs) + type operator*(EmulatedType rhs) { - ef64_t2 output; + type output; output.x = x * rhs; output.y = y * rhs; return output; } - ef64_t2 operator*(ef64_t2 rhs) + type operator*(type rhs) { - ef64_t2 output; + type output; output.x = x * rhs.x; output.y = y * rhs.y; return output; } -#ifdef __HLSL_VERSION - float2 getAsFloat2() + float32_t2 getAsFloat2() { - return float2(x.getAsFloat32(), y.getAsFloat32()); + return float32_t2(x.getAsFloat32(), y.getAsFloat32()); } -#endif }; -struct ef64_t3 +template +struct emulated_vector { - emulated_float64_t x; - emulated_float64_t y; - emulated_float64_t z; + using type = emulated_vector; - NBL_CONSTEXPR_STATIC_INLINE ef64_t3 create(NBL_REF_ARG(ef64_t3) other) + EmulatedType x; + EmulatedType y; + EmulatedType z; + + EmulatedType calcComponentSum() NBL_CONST_MEMBER_FUNC { - ef64_t3 output; + return x + y + z; + } - output.x = other.x; - output.y = other.y; - output.z = other.z; + type operator*(NBL_CONST_REF_ARG(type) rhs) NBL_CONST_MEMBER_FUNC + { + type output; + output.x = x * rhs.x; + output.y = y * rhs.y; + output.z = z * rhs.z; return output; } +}; - NBL_CONSTEXPR_STATIC_INLINE ef64_t3 create(NBL_REF_ARG(ef64_t2) other, emulated_float64_t z) - { - ef64_t3 output; +template +struct emulated_vector +{ + using type = emulated_vector; - output.x = other.x; - output.y = other.y; - output.z = z; + EmulatedType x; + EmulatedType y; + EmulatedType z; + EmulatedType w; +}; - return output; - } +template +using emulated_vector_t2 = emulated_vector; +template +using emulated_vector_t3 = emulated_vector; +template +using emulated_vector_t4 = emulated_vector; + +//template +//struct emulated_matrix_base +//{ +// using vec_t = emulated_vector; +// vec_t columns[M]; +//}; + +template +struct emulated_matrix {}; // : emulated_matrix_base {}; - NBL_CONSTEXPR_STATIC_INLINE ef64_t3 create(NBL_REF_ARG(ef64_t2) other, int z) +template +struct emulated_matrix// : emulated_matrix_base +{ + using vec_t = emulated_vector_t2; + using type = emulated_matrix; + + vec_t columns[2]; + + type getTransposed() NBL_CONST_MEMBER_FUNC { - ef64_t3 output; + type output; + + output.columns[0].x = columns[0].x; + output.columns[1].x = columns[0].y; - output.x = other.x; - output.y = other.y; - output.z = emulated_float64_t::create(z); + output.columns[0].y = columns[1].x; + output.columns[1].y = columns[1].y; return 
output; } - emulated_float64_t calcComponentSum() NBL_CONST_MEMBER_FUNC + type operator*(NBL_CONST_REF_ARG(type) rhs) NBL_CONST_MEMBER_FUNC { - return x + y + z; + type output; + type lhsTransposed = getTransposed(); + + output.columns[0].x = (lhsTransposed.columns[0] * rhs.columns[0]).calcComponentSum(); + output.columns[0].y = (lhsTransposed.columns[0] * rhs.columns[1]).calcComponentSum(); + + output.columns[1].x = (lhsTransposed.columns[1] * rhs.columns[0]).calcComponentSum(); + output.columns[1].y = (lhsTransposed.columns[1] * rhs.columns[1]).calcComponentSum(); + + return output.getTransposed(); } - ef64_t3 operator*(NBL_CONST_REF_ARG(ef64_t3) rhs) NBL_CONST_MEMBER_FUNC + vec_t operator*(NBL_CONST_REF_ARG(vec_t) rhs) { - ef64_t3 output; - output.x = x * rhs.x; - output.y = y * rhs.y; - output.z = z * rhs.z; + vec_t output; + type lhsTransposed = getTransposed(); + + output.x = (columns[0] * rhs).calcComponentSum(); + output.y = (columns[1] * rhs).calcComponentSum(); return output; } }; -struct ef64_t4 +template +struct emulated_matrix // : emulated_matrix_base { - emulated_float64_t x; - emulated_float64_t y; - emulated_float64_t z; - emulated_float64_t w; -}; + using vec_t = emulated_vector_t3; + using type = emulated_matrix; -struct ef64_t3x3 -{ - ef64_t3 columns[3]; + vec_t columns[3]; - ef64_t3x3 getTransposed() NBL_CONST_MEMBER_FUNC + type getTransposed() NBL_CONST_MEMBER_FUNC { - ef64_t3x3 output; + type output; output.columns[0].x = columns[0].x; output.columns[1].x = columns[0].y; @@ -231,10 +251,10 @@ struct ef64_t3x3 return output; } - ef64_t3x3 operator*(NBL_CONST_REF_ARG(ef64_t3x3) rhs) NBL_CONST_MEMBER_FUNC + type operator*(NBL_CONST_REF_ARG(type) rhs) NBL_CONST_MEMBER_FUNC { - ef64_t3x3 output; - ef64_t3x3 lhsTransposed = getTransposed(); + type output; + type lhsTransposed = getTransposed(); output.columns[0].x = (lhsTransposed.columns[0] * rhs.columns[0]).calcComponentSum(); output.columns[0].y = (lhsTransposed.columns[0] * rhs.columns[1]).calcComponentSum(); @@ -252,10 +272,10 @@ struct ef64_t3x3 return output.getTransposed(); } - ef64_t3 operator*(NBL_CONST_REF_ARG(ef64_t3) rhs) + vec_t operator*(NBL_CONST_REF_ARG(vec_t) rhs) { - ef64_t3 output; - ef64_t3x3 lhsTransposed = getTransposed(); + vec_t output; + type lhsTransposed = getTransposed(); output.x = (columns[0] * rhs).calcComponentSum(); output.y = (columns[1] * rhs).calcComponentSum(); @@ -265,101 +285,179 @@ struct ef64_t3x3 } }; -struct ef64_t2x2 +template +using emulated_matrix_t2x2 = emulated_matrix; +template +using emulated_matrix_t3x3 = emulated_matrix; + +namespace impl +{ +template +struct is_emulated { - ef64_t2 columns[2]; + NBL_CONSTEXPR_STATIC_INLINE bool value = is_same >::value || + is_same >::value || + is_same >::value || + is_same >::value; +}; - ef64_t2x2 getTransposed() NBL_CONST_MEMBER_FUNC - { - ef64_t2x2 output; +template::value > +struct portable_vector +{ + using type = emulated_vector; +}; +// specialization for builtins +template +struct portable_vector +{ + using type = vector; +}; - output.columns[0].x = columns[0].x; - output.columns[1].x = columns[0].y; +template::value > +struct portable_matrix +{ + using type = emulated_matrix; +}; - output.columns[0].y = columns[1].x; - output.columns[1].y = columns[1].y; +template +struct portable_matrix +{ + using type = matrix; +}; - return output; - } +} - ef64_t2x2 operator*(NBL_CONST_REF_ARG(ef64_t2x2) rhs) NBL_CONST_MEMBER_FUNC - { - ef64_t2x2 output; - ef64_t2x2 lhsTransposed = getTransposed(); +template +using portable_vector_t = 
typename impl::portable_vector::type; - output.columns[0].x = (lhsTransposed.columns[0] * rhs.columns[0]).calcComponentSum(); - output.columns[0].y = (lhsTransposed.columns[0] * rhs.columns[1]).calcComponentSum(); +template +using portable_vector_t2 = portable_vector_t; +template +using portable_vector_t3 = portable_vector_t; +template +using portable_vector_t4 = portable_vector_t; - output.columns[1].x = (lhsTransposed.columns[1] * rhs.columns[0]).calcComponentSum(); - output.columns[1].y = (lhsTransposed.columns[1] * rhs.columns[1]).calcComponentSum(); +using portable_vector64_t2 = portable_vector_t2 >; +using portable_vector64_t3 = portable_vector_t3 >; +using portable_vector64_t4 = portable_vector_t4 >; - return output.getTransposed(); - } +template +using portable_matrix_t = typename impl::portable_matrix::type; - ef64_t2 operator*(NBL_CONST_REF_ARG(ef64_t2) rhs) - { - ef64_t2 output; - ef64_t2x2 lhsTransposed = getTransposed(); +template +using portable_matrix_t2x2 = portable_matrix_t; +template +using portable_matrix_t3x3 = portable_matrix_t; - output.x = (columns[0] * rhs).calcComponentSum(); - output.y = (columns[1] * rhs).calcComponentSum(); +using portable_matrix64_t2x2 = portable_matrix_t2x2 >; +using portable_matrix64_t3x3 = portable_matrix_t3x3 >; - return output; - } -}; +template +NBL_CONSTEXPR_INLINE_FUNC portable_float64_t<> create_portable_float64_t(T val) +{ + //return impl::portable_float64_t_creator::create(val); + if (impl::is_emulated >::value) + return portable_float64_t<>::create(val); + else + return portable_float64_t<>(val); +} -#endif +template +NBL_CONSTEXPR_INLINE_FUNC portable_vector64_t2 create_portable_vector64_t2(T val) +{ + portable_vector64_t2 output; + output.x = create_portable_float64_t(val); + output.y = create_portable_float64_t(val); -// struct VecT is solution to -// error: 'nbl::hlsl::emulated_float64_t' cannot be used as a type parameter where a scalar is required -// using float_t2 = typename conditional >::value, ef64_t2, vector >::type; + return output; +} -// TODO: better solution +template +NBL_CONSTEXPR_INLINE_FUNC portable_vector64_t2 create_portable_vector64_t2(X x, Y y) +{ + portable_vector64_t2 output; + output.x = create_portable_float64_t(x); + output.y = create_portable_float64_t(y); -#ifndef __HLSL_VERSION -using F64_t = double; -#else -using F64_t = emulated_float64_t; -#endif + return output; +} -template -struct VecT { using type = void; }; +template +NBL_CONSTEXPR_INLINE_FUNC portable_vector64_t2 create_portable_vector64_t2_from_2d_vec(VecType vec) +{ + portable_vector64_t2 output; + output.x = create_portable_float64_t(vec.x); + output.y = create_portable_float64_t(vec.y); -template<> -struct VecT { using type = vector; }; -template<> -struct VecT { using type = vector; }; -template<> -struct VecT { using type = vector; }; + return output; +} + +template +NBL_CONSTEXPR_INLINE_FUNC portable_vector64_t3 create_portable_vector64_t3(T val) +{ + portable_vector64_t3 output; + output.x = create_portable_float64_t(val); + output.y = create_portable_float64_t(val); + output.z = create_portable_float64_t(val); + + return output; +} + +template +NBL_CONSTEXPR_INLINE_FUNC portable_vector64_t3 create_portable_vector64_t3(X x, Y y, Z z) +{ + portable_vector64_t3 output; + output.x = create_portable_float64_t(x); + output.y = create_portable_float64_t(y); + output.z = create_portable_float64_t(z); + + return output; +} + +template +NBL_CONSTEXPR_INLINE_FUNC portable_vector64_t3 create_portable_vector64_t2_from_3d_vec(VecType vec) +{ + 
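// builds a portable_vector64_t3 by converting each component of the 3d input vector with
+    // create_portable_float64_t (the function name still says _t2_ although a 3-component vector is returned)
+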
portable_vector64_t3 output; + output.x = create_portable_float64_t(vec.x); + output.y = create_portable_float64_t(vec.y); + output.z = create_portable_float64_t(vec.z); + + return output; +} + +template >::value> +inline float32_t2 convert_portable_vector64_t2_to_float32_t2(portable_vector64_t2 vec) +{ + return float32_t2(vec.x, vec.y); +} -#ifndef __HLSL_VERSION -template<> -struct VecT { using type = float64_t2; }; -template<> -struct VecT { using type = float64_t3; }; template<> -struct VecT { using type = float64_t4; }; +inline float32_t2 convert_portable_vector64_t2_to_float32_t2(portable_vector64_t2 vec) +{ +#ifdef __HLSL_VERSION + return emulated_vector, 2>::create(vec.x, vec.y).getAsFloat2(); +#else + return float32_t2(bit_cast(0xdeadbeefu), bit_cast(0xbadcaffeu)); #endif +} -template<> -struct VecT, 2> { using type = ef64_t2; }; -template<> -struct VecT, 3> { using type = ef64_t3; }; -template<> -struct VecT, 4> { using type = ef64_t4; }; +template >::value> +inline float32_t convert_portable_float64_t_to_float(portable_float64_t<> val) +{ + + return float32_t(val); +} -template -struct Mat2x2T { using type = void; }; template<> -struct Mat2x2T { using type = float32_t2x2; }; -#ifndef __HLSL_VERSION -template<> -struct Mat2x2T { using type = float64_t2x2; }; +inline float32_t convert_portable_float64_t_to_float(portable_float64_t<> val) +{ +#ifdef __HLSL_VERSION + return val.getAsFloat32(); +#else + return float32_t(bit_cast(0xdeadbeefu)); #endif -template<> -struct Mat2x2T > { using type = ef64_t2x2; }; +} -//} } } #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl b/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl index d8dacdc7bb..ba0f70ba67 100644 --- a/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl +++ b/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl @@ -29,8 +29,7 @@ namespace equations template struct Quadratic { - using float_t22 = typename VecT::type; - using float_t33 = typename VecT::type; + using float_t2 = portable_vector_t2; float_t a; float_t b; @@ -47,9 +46,9 @@ struct Quadratic return t * (a * t + b) + c; } - float_t22 computeRoots() + float_t2 computeRoots() { - float_t22 ret; + float_t2 ret; const float_t det = b * b - 4.0 * a * c; const float_t detSqrt = sqrt(det); diff --git a/include/nbl/builtin/hlsl/math/equations/quartic.hlsl b/include/nbl/builtin/hlsl/math/equations/quartic.hlsl index 882f7a33ba..c34c25602f 100644 --- a/include/nbl/builtin/hlsl/math/equations/quartic.hlsl +++ b/include/nbl/builtin/hlsl/math/equations/quartic.hlsl @@ -24,9 +24,9 @@ namespace equations template struct Quartic { - using float_t2 = typename VecT::type; - using float_t3 = typename VecT::type; - using float_t4 = typename VecT::type; + using float_t2 = portable_vector_t2; + using float_t3 = portable_vector_t3; + using float_t4 = portable_vector_t4; // form: ax^4 + bx^3 + cx^2 + dx + e float_t a; diff --git a/include/nbl/builtin/hlsl/shapes/beziers.hlsl b/include/nbl/builtin/hlsl/shapes/beziers.hlsl index f966fc138c..44d79800ef 100644 --- a/include/nbl/builtin/hlsl/shapes/beziers.hlsl +++ b/include/nbl/builtin/hlsl/shapes/beziers.hlsl @@ -34,10 +34,10 @@ namespace shapes template struct QuadraticBezier { - using float_t2 = typename VecT::type; - using float_t3 = typename VecT::type; - using float_t4 = typename VecT::type; - using float_t2x2 = typename Mat2x2T::type; + using float_t2 = portable_vector_t2; + using float_t3 = portable_vector_t3; + using float_t4 = portable_vector_t4; + using float_t2x2 = 
portable_matrix_t2x2; float_t2 P0; float_t2 P1; @@ -211,17 +211,12 @@ struct QuadraticBezier template struct Quadratic { -#ifndef __HLSL_VERSION using scalar_t = float_t; - using float_t2 = vector; - using float_t3 = vector; - using float_t2x2 = matrix; -#else - using scalar_t = float_t; - using float_t2 = typename VecT::type; - using float_t3 = typename VecT::type; - using float_t2x2 = typename Mat2x2T::type; -#endif + using float_t2 = portable_vector_t2; + using float_t3 = portable_vector_t3; + using float_t4 = portable_vector_t4; + using float_t2x2 = portable_matrix_t2x2; + //using float_t3x3 = portable_matrix_t2x2; float_t2 A; float_t2 B; @@ -229,11 +224,6 @@ struct Quadratic struct AnalyticArcLengthCalculator { -#ifndef __HLSL_VERSION - using float_t2 = vector; -#else - using float_t2 = typename VecT::type; -#endif static AnalyticArcLengthCalculator construct(float_t lenA2, float_t AdotB, float_t a, float_t b, float_t c, float_t b_over_4a) { @@ -524,12 +514,7 @@ struct Quadratic template static math::equations::Quartic getBezierBezierIntersectionEquation(NBL_CONST_REF_ARG(QuadraticBezier) lhs, NBL_CONST_REF_ARG(QuadraticBezier) rhs) { -#ifndef __HLSL_VERSION - using scalar_t = double; -#else - using scalar_t = emulated_float64_t; -#endif - using float_t2 = typename VecT::type; + using float_t2 = portable_vector_t2; // Algorithm based on Computer Aided Geometric Design: // https://scholarsarchive.byu.edu/cgi/viewcontent.cgi?article=1000&context=facpub#page99 @@ -565,37 +550,30 @@ static math::equations::Quartic getBezierBezierIntersectionEquation(NBL Quadratic quadratic = Quadratic::constructFromBezier(lhs); // for convenience - const float_t2 A = quadratic.A; - const float_t2 B = quadratic.B; - const float_t2 C = quadratic.C; + const portable_vector64_t2 A = quadratic.A; + const portable_vector64_t2 B = quadratic.B; + const portable_vector64_t2 C = quadratic.C; // substitute parametric into implicit equation: // Getting the quartic params - scalar_t a = ((A.x * A.x) * k0) + (A.x * A.y * k1) + (A.y * A.y * k2); - scalar_t b = (A.x * B.x * k0 * 2.0f) + (A.x * B.y * k1) + (B.x * A.y * k1) + (A.y * B.y * k2 * 2.0f); - scalar_t c = (A.x * C.x * k0 * 2.0f) + (A.x * C.y * k1) + (A.x * k3) + ((B.x * B.x) * k0) + (B.x * B.y * k1) + (C.x * A.y * k1) + (A.y * C.y * k2 * 2.0f) + (A.y * k4) + ((B.y * B.y) * k2); - scalar_t d = (B.x * C.x * k0 * 2.0f) + (B.x * C.y * k1) + (B.x * k3) + (C.x * B.y * k1) + (B.y * C.y * k2 * 2.0f) + (B.y * k4); - scalar_t e = ((C.x * C.x) * k0) + (C.x * C.y * k1) + (C.x * k3) + ((C.y * C.y) * k2) + (C.y * k4) + (k5); + portable_float64_t<> a = ((A.x * A.x) * k0) + (A.x * A.y * k1) + (A.y * A.y * k2); + portable_float64_t<> b = (A.x * B.x * k0 * 2.0f) + (A.x * B.y * k1) + (B.x * A.y * k1) + (A.y * B.y * k2 * 2.0f); + portable_float64_t<> c = (A.x * C.x * k0 * 2.0f) + (A.x * C.y * k1) + (A.x * k3) + ((B.x * B.x) * k0) + (B.x * B.y * k1) + (C.x * A.y * k1) + (A.y * C.y * k2 * 2.0f) + (A.y * k4) + ((B.y * B.y) * k2); + portable_float64_t<> d = (B.x * C.x * k0 * 2.0f) + (B.x * C.y * k1) + (B.x * k3) + (C.x * B.y * k1) + (B.y * C.y * k2 * 2.0f) + (B.y * k4); + portable_float64_t<> e = ((C.x * C.x) * k0) + (C.x * C.y * k1) + (C.x * k3) + ((C.y * C.y) * k2) + (C.y * k4) + (k5); - return math::equations::Quartic::construct(a, b, c, d, e); + return math::equations::Quartic >::construct(a, b, c, d, e); } // This function returns the analytic quadratic equation to solve for bezier's t value for intersection with another bezier curve template -static math::equations::Quadratic 
getBezierLineIntersectionEquation(QuadraticBezier bezier, NBL_CONST_REF_ARG(typename VecT::type) lineStart, NBL_CONST_REF_ARG(typename VecT::type) lineVector) +static math::equations::Quadratic getBezierLineIntersectionEquation(QuadraticBezier bezier, NBL_CONST_REF_ARG(portable_vector_t2) lineStart, NBL_CONST_REF_ARG(portable_vector_t2) lineVector) { -#ifndef __HLSL_VERSION - using float_t2 = vector; - using float_t3 = vector; - using float_t4 = vector; - using float_t2x2 = matrix; -#else - using float_t2 = typename VecT::type; - using float_t3 = typename VecT::type; - using float_t4 = typename VecT::type; - using float_t2x2 = typename Mat2x2T::type; -#endif + using float_t2 = portable_vector_t2; + using float_t3 = portable_vector_t3; + using float_t4 = portable_vector_t4; + using float_t2x2 = portable_matrix_t2x2; float_t2 lineDir = normalize(lineVector); float_t2x2 rotate = float_t2x2(float_t2(lineDir.x, lineDir.y), float_t2(-lineDir.y, lineDir.x)); @@ -603,7 +581,7 @@ static math::equations::Quadratic getBezierLineIntersectionEquation(Qua bezier.P1 = mul(rotate, bezier.P1 - lineStart); bezier.P2 = mul(rotate, bezier.P2 - lineStart); Quadratic quadratic = Quadratic::constructFromBezier(bezier); - return math::equations::Quadratic::construct(quadratic.A.y, quadratic.B.y, quadratic.C.y); + return math::equations::Quadratic >::construct(quadratic.A.y, quadratic.B.y, quadratic.C.y); } } // namespace shapes From 424d3a89480eaeb80655fe73f3fb3ee336609555 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Fri, 23 Aug 2024 13:49:42 +0100 Subject: [PATCH 034/358] Saving work --- .../hlsl/impl/emulated_float64_t_impl.hlsl | 59 ------------------- include/nbl/builtin/hlsl/tgmath.hlsl | 4 +- 2 files changed, 2 insertions(+), 61 deletions(-) diff --git a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl index bdaa9202a9..a828449d13 100644 --- a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl @@ -173,25 +173,6 @@ NBL_CONSTEXPR_INLINE_FUNC uint64_t propagateFloat64NaN(uint64_t lhs, uint64_t rh #endif } -NBL_CONSTEXPR_INLINE_FUNC uint32_t2 add64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) -{ - uint32_t2 output; - output.y = a1 + b1; - output.x = a0 + b0 + uint32_t(output.y < a1); - - return output; -} - -NBL_CONSTEXPR_INLINE_FUNC uint32_t2 sub64(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1) -{ - uint32_t2 output; - output.y = a1 - b1; - output.x = a0 - b0 - uint32_t(a1 < b1); - - return output; -} - - // TODO: test static inline int countLeadingZeros32(uint32_t val) { #ifndef __HLSL_VERSION @@ -220,46 +201,6 @@ NBL_CONSTEXPR_INLINE_FUNC uint32_t2 shift64RightJamming(uint32_t2 val, int count return output; } - -NBL_CONSTEXPR_INLINE_FUNC uint32_t4 mul64to128(uint32_t4 mantissasPacked) -{ - uint32_t4 output; - uint32_t more1 = 0u; - uint32_t more2 = 0u; - - // a0 = x - // a1 = y - // b0 = z - // b1 = w - - uint32_t2 z2z3 = umulExtended(mantissasPacked.y, mantissasPacked.w); - output.z = z2z3.x; - output.w = z2z3.y; - uint32_t2 z1more2 = umulExtended(mantissasPacked.y, mantissasPacked.z); - output.y = z1more2.x; - more2 = z1more2.y; - uint32_t2 z1z2 = add64(output.y, more2, 0u, output.z); - output.y = z1z2.x; - output.z = z1z2.y; - uint32_t2 z0more1 = umulExtended(mantissasPacked.x, mantissasPacked.z); - output.x = z0more1.x; - more1 = z0more1.y; - uint32_t2 z0z1 = add64(output.x, more1, 0u, output.y); - output.x = z0z1.x; - output.y = z0z1.y; - uint32_t2 
more1more2 = umulExtended(mantissasPacked.x, mantissasPacked.w); - more1 = more1more2.x; - more2 = more1more2.y; - uint32_t2 more1z2 = add64(more1, more2, 0u, output.z); - more1 = more1z2.x; - output.z = more1z2.y; - uint32_t2 z0z12 = add64(output.x, output.y, 0u, more1); - output.x = z0z12.x; - output.y = z0z12.y; - - return output; -} - NBL_CONSTEXPR_INLINE_FUNC uint32_t3 shift64ExtraRightJamming(uint32_t3 val, int count) { uint32_t3 output; diff --git a/include/nbl/builtin/hlsl/tgmath.hlsl b/include/nbl/builtin/hlsl/tgmath.hlsl index 3be5230f7f..7190723602 100644 --- a/include/nbl/builtin/hlsl/tgmath.hlsl +++ b/include/nbl/builtin/hlsl/tgmath.hlsl @@ -27,8 +27,8 @@ inline bool isnan(T val) } // TODO: better implementation, also i'm not sure this is the right place for this function -template -NBL_CONSTEXPR_INLINE_FUNC Uint lerp(Uint a, Uint b, bool c) +template +NBL_CONSTEXPR_INLINE_FUNC enable_if::type, T>::type lerp(T a, T b, bool c) { return c ? b : a; } From 2e9a25560807274e49cfcbc59403190d93edc636 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Fri, 23 Aug 2024 20:01:44 +0100 Subject: [PATCH 035/358] Saving work --- examples_tests | 2 +- include/nbl/builtin/hlsl/cpp_compat/basic.h | 19 ++- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 137 ++++++++++-------- .../hlsl/emulated_float64_t_utils.hlsl | 16 ++ 4 files changed, 109 insertions(+), 65 deletions(-) diff --git a/examples_tests b/examples_tests index 0bfc208a6e..293ec73d47 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 0bfc208a6e658f5b41d85d5afc74d4073a0d2f91 +Subproject commit 293ec73d47f2ecdedd2e4b00b31b663e03aa000a diff --git a/include/nbl/builtin/hlsl/cpp_compat/basic.h b/include/nbl/builtin/hlsl/cpp_compat/basic.h index 688834f730..dfa12aa33d 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/basic.h +++ b/include/nbl/builtin/hlsl/cpp_compat/basic.h @@ -49,10 +49,23 @@ namespace nbl { namespace hlsl { - template - T _static_cast(U v) + namespace impl { - return (T)v; + template + struct static_cast_helper + { + static inline To cast(From u) + { + return To(u); + } + }; + } + + template + To _static_cast(From v) + { + return impl::static_cast_helper(v); + //return (T)v; } #if 0 // TODO: for later diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index d77a22ae2b..cba77b7ddd 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -22,27 +22,29 @@ namespace hlsl NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(emulated_float64_t val) { - return createPreserveBitPattern(val.data); + //return bit_cast >(val.data); + return val; + //TODO: return val? 
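+            // the argument already holds the encoded float64 bit pattern, so it is forwarded unchanged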
} NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(int32_t val) { - return emulated_float64_t::createPreserveBitPattern(impl::castToUint64WithFloat64BitPattern(int64_t(val))); + return bit_cast >(impl::castToUint64WithFloat64BitPattern(int64_t(val))); } NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(int64_t val) { - return emulated_float64_t::createPreserveBitPattern(impl::castToUint64WithFloat64BitPattern(val)); + return bit_cast >(impl::castToUint64WithFloat64BitPattern(val)); } NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(uint32_t val) { - return emulated_float64_t::createPreserveBitPattern(impl::castToUint64WithFloat64BitPattern(uint64_t(val))); + return bit_cast >(impl::castToUint64WithFloat64BitPattern(uint64_t(val))); } NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(uint64_t val) { - return emulated_float64_t::createPreserveBitPattern(impl::castToUint64WithFloat64BitPattern(val)); + return bit_cast >(impl::castToUint64WithFloat64BitPattern(val)); } NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(float32_t val) @@ -54,7 +56,7 @@ namespace hlsl NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(float64_t val) { - return createPreserveBitPattern(bit_cast(float64_t(val))); + return bit_cast >(bit_cast(float64_t(val))); #ifdef __HLSL_VERSION emulated_float64_t retval; uint32_t lo, hi; @@ -62,7 +64,7 @@ namespace hlsl retval.data = (uint64_t(hi) << 32) | lo; return retval; #else - return createPreserveBitPattern(reinterpret_cast(val)); + return bit_cast >(reinterpret_cast(val)); #endif } @@ -72,11 +74,6 @@ namespace hlsl return emulated_float64_t(bit_cast(float64_t(val))); }*/ - NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t createPreserveBitPattern(uint64_t val) - { - return emulated_float64_t(val); - } - inline float getAsFloat32() { int exponent = ieee754::extractExponent(data); @@ -123,7 +120,7 @@ namespace hlsl int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); if (tgmath::isnan(data) || tgmath::isnan(rhs.data)) - return createPreserveBitPattern(ieee754::traits::quietNaN); + return bit_cast >(ieee754::traits::quietNaN); /*if (std::isinf(lhs) || std::isinf(rhs)) { if (std::isinf(lhs) && !std::isinf(rhs)) @@ -174,12 +171,12 @@ namespace hlsl if (!FastMath && (rp + l2 + 1 < nbl::hlsl::numeric_limits::min_exponent)) { - return createPreserveBitPattern(impl::assembleFloat64(0, ieee754::traits::exponentMask, 0)); + return bit_cast >(impl::assembleFloat64(0, ieee754::traits::exponentMask, 0)); } else { rp = addTmp ? 
l2 + rp + ieee754::traits::exponentBias : 0; - return createPreserveBitPattern(impl::assembleFloat64( + return bit_cast >(impl::assembleFloat64( sign, (uint64_t(rp) << ieee754::traits::mantissaBitCnt) & ieee754::traits::exponentMask, shiftLeftAllowNegBitCnt(addTmp, (ieee754::traits::mantissaBitCnt - l2)) & ieee754::traits::mantissaMask) @@ -191,7 +188,7 @@ namespace hlsl if (FlushDenormToZero) { - emulated_float64_t retval = createPreserveBitPattern(0u); + emulated_float64_t retval = emulated_float64_t::create(0ull); uint64_t mantissa; uint32_t3 mantissaExtended; @@ -213,12 +210,12 @@ namespace hlsl if (lhsBiasedExp == ieee754::traits::specialValueExp) { bool propagate = (lhsMantissa | rhsMantissa) != 0u; - return createPreserveBitPattern(tgmath::lerp(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); + return bit_cast >(tgmath::lerp(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); } mantissa = lhsMantissa + rhsMantissa; if (lhsBiasedExp == 0) - return createPreserveBitPattern(impl::assembleFloat64(lhsSign, 0, mantissa)); + return bit_cast >(impl::assembleFloat64(lhsSign, 0, mantissa)); mantissaExtended.xy = impl::packUint64(mantissa); mantissaExtended.x |= 0x00200000u; mantissaExtended.z = 0u; @@ -237,7 +234,7 @@ namespace hlsl if (lhsBiasedExp == ieee754::traits::specialValueExp) { const bool propagate = (lhsMantissa) != 0u; - return createPreserveBitPattern(tgmath::lerp(ieee754::traits::exponentMask | lhsSign, impl::propagateFloat64NaN(data, rhs.data), propagate)); + return bit_cast >(tgmath::lerp(ieee754::traits::exponentMask | lhsSign, impl::propagateFloat64NaN(data, rhs.data), propagate)); } expDiff = tgmath::lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); @@ -256,11 +253,11 @@ namespace hlsl ++biasedExp; } - return createPreserveBitPattern(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended.xyz)); + return bit_cast >(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended.xyz)); } // cannot happen but compiler cries about not every path returning value - return createPreserveBitPattern(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended)); + return bit_cast >(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended)); } else { @@ -281,7 +278,7 @@ namespace hlsl if (lhsBiasedExp == ieee754::traits::specialValueExp) { bool propagate = lhsMantissa != 0u; - return createPreserveBitPattern(tgmath::lerp(impl::assembleFloat64(lhsSign, ieee754::traits::exponentMask, 0ull), impl::propagateFloat64NaN(data, rhs.data), propagate)); + return bit_cast >(tgmath::lerp(impl::assembleFloat64(lhsSign, ieee754::traits::exponentMask, 0ull), impl::propagateFloat64NaN(data, rhs.data), propagate)); } expDiff = tgmath::lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); @@ -291,12 +288,12 @@ namespace hlsl frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); biasedExp = lhsBiasedExp; --biasedExp; - return createPreserveBitPattern(impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 10, frac.x, frac.y)); + return bit_cast >(impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 10, frac.x, frac.y)); } if (lhsBiasedExp == ieee754::traits::specialValueExp) { bool propagate = ((lhsMantissa) | (rhsMantissa)) != 0u; - return createPreserveBitPattern(tgmath::lerp(ieee754::traits::quietNaN, impl::propagateFloat64NaN(data, rhs.data), propagate)); + return bit_cast >(tgmath::lerp(ieee754::traits::quietNaN, impl::propagateFloat64NaN(data, rhs.data), propagate)); } rhsBiasedExp = tgmath::lerp(rhsBiasedExp, 1, lhsBiasedExp == 0); lhsBiasedExp 
= tgmath::lerp(lhsBiasedExp, 1, lhsBiasedExp == 0); @@ -331,24 +328,24 @@ namespace hlsl lhsSign ^= signOfDifference; uint64_t retval_0 = impl::packFloat64(uint32_t(FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) << 31, 0, 0u, 0u); uint64_t retval_1 = impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 11, frac.x, frac.y); - return createPreserveBitPattern(tgmath::lerp(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); + return bit_cast >(tgmath::lerp(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); } } else { //static_assert(false, "not implemented yet"); - return createPreserveBitPattern(0xdeadbeefbadcaffeull); + return bit_cast >(0xdeadbeefbadcaffeull); } } emulated_float64_t operator+(float rhs) { - return createPreserveBitPattern(data) + create(rhs); + return bit_cast >(data) + create(rhs); } emulated_float64_t operator-(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { - emulated_float64_t lhs = createPreserveBitPattern(data); + emulated_float64_t lhs = bit_cast >(data); emulated_float64_t rhsFlipped = rhs.flipSign(); return lhs + rhsFlipped; @@ -356,14 +353,14 @@ namespace hlsl emulated_float64_t operator-(float rhs) NBL_CONST_MEMBER_FUNC { - return createPreserveBitPattern(data) - create(rhs); + return bit_cast >(data) - create(rhs); } emulated_float64_t operator*(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { if(FlushDenormToZero) { - emulated_float64_t retval = emulated_float64_t::createPreserveBitPattern(0u); + emulated_float64_t retval = emulated_float64_t::create(0ull); uint64_t lhsSign = data & ieee754::traits::signMask; uint64_t rhsSign = rhs.data & ieee754::traits::signMask; @@ -379,11 +376,11 @@ namespace hlsl if (lhsBiasedExp == ieee754::traits::specialValueExp) { if ((lhsMantissa != 0u) || ((rhsBiasedExp == ieee754::traits::specialValueExp) && (rhsMantissa != 0u))) - return createPreserveBitPattern(impl::propagateFloat64NaN(data, rhs.data)); + return bit_cast >(impl::propagateFloat64NaN(data, rhs.data)); if ((uint64_t(rhsBiasedExp) | rhsMantissa) == 0u) - return createPreserveBitPattern(ieee754::traits::quietNaN); + return bit_cast >(ieee754::traits::quietNaN); - return createPreserveBitPattern(impl::assembleFloat64(sign, ieee754::traits::exponentMask, 0ull)); + return bit_cast >(impl::assembleFloat64(sign, ieee754::traits::exponentMask, 0ull)); } if (rhsBiasedExp == ieee754::traits::specialValueExp) { @@ -392,23 +389,23 @@ namespace hlsl #ifdef RELAXED_NAN_PROPAGATION return rhs.data; #else - return createPreserveBitPattern(impl::propagateFloat64NaN(data, rhs.data)); + return bit_cast >(impl::propagateFloat64NaN(data, rhs.data)); #endif if ((uint64_t(lhsBiasedExp) | lhsMantissa) == 0u) - return createPreserveBitPattern(ieee754::traits::quietNaN); + return bit_cast >(ieee754::traits::quietNaN); - return createPreserveBitPattern(sign | ieee754::traits::exponentMask); + return bit_cast >(sign | ieee754::traits::exponentMask); } if (lhsBiasedExp == 0) { if (lhsMantissa == 0u) - return createPreserveBitPattern(sign); + return bit_cast >(sign); impl::normalizeFloat64Subnormal(lhsMantissa, lhsBiasedExp, lhsMantissa); } if (rhsBiasedExp == 0) { if (rhsMantissa == 0u) - return createPreserveBitPattern(sign); + return bit_cast >(sign); impl::normalizeFloat64Subnormal(rhsMantissa, rhsBiasedExp, rhsMantissa); } } @@ -428,24 +425,24 @@ namespace hlsl } newPseudoMantissa &= (ieee754::traits::mantissaMask); - return createPreserveBitPattern(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, newPseudoMantissa)); + return bit_cast >(impl::assembleFloat64(sign, 
uint64_t(exp) << ieee754::traits::mantissaBitCnt, newPseudoMantissa)); } else { //static_assert(false, "not implemented yet"); - return createPreserveBitPattern(0xdeadbeefbadcaffeull); + return bit_cast >(0xdeadbeefbadcaffeull); } } emulated_float64_t operator*(float rhs) { - return createPreserveBitPattern(data) * create(rhs); + return bit_cast >(data) * create(rhs); } /*emulated_float64_t reciprocal(uint64_t x) { using ThisType = emulated_float64_t; - ThisType output = ThisType::createPreserveBitPattern((0xbfcdd6a18f6a6f52ULL - x) >> 1); + ThisType output = ThisType::bit_cast >((0xbfcdd6a18f6a6f52ULL - x) >> 1); output = output * output; return output; }*/ @@ -454,23 +451,23 @@ namespace hlsl { if (FlushDenormToZero) { - //return emulated_float64_t::createPreserveBitPattern(data) * reciprocal(rhs.data); + //return emulated_float64_t::bit_cast >(data) * reciprocal(rhs.data); if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) - return createPreserveBitPattern(ieee754::traits::quietNaN); + return bit_cast >(ieee754::traits::quietNaN); if (!FastMath && ((rhs.data << 1) == 0)) - return createPreserveBitPattern(ieee754::traits::quietNaN); + return bit_cast >(ieee754::traits::quietNaN); const uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; if (!FastMath && impl::areBothInfinity(data, rhs.data)) - return createPreserveBitPattern(ieee754::traits::quietNaN | sign); + return bit_cast >(ieee754::traits::quietNaN | sign); if (!FastMath && tgmath::isInf(data)) - return createPreserveBitPattern((data & ~ieee754::traits::signMask) | sign); + return bit_cast >((data & ~ieee754::traits::signMask) | sign); if (!FastMath && tgmath::isInf(rhs.data)) - return createPreserveBitPattern(0ull | sign); + return bit_cast >(0ull | sign); @@ -490,12 +487,12 @@ namespace hlsl mantissa &= ieee754::traits::mantissaMask; - return createPreserveBitPattern(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, mantissa)); + return bit_cast >(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, mantissa)); } else { //static_assert(false, "not implemented yet"); - return createPreserveBitPattern(0xdeadbeefbadcaffeull); + return bit_cast >(0xdeadbeefbadcaffeull); } } @@ -509,7 +506,7 @@ namespace hlsl if (!FastMath && impl::areBothZero(data, rhs.data)) return true; - const emulated_float64_t xored = createPreserveBitPattern(data ^ rhs.data); + const emulated_float64_t xored = bit_cast >(data ^ rhs.data); // TODO: check what fast math returns for -0 == 0 if ((xored.data & 0x7FFFFFFFFFFFFFFFull) == 0ull) return true; @@ -518,7 +515,7 @@ namespace hlsl } bool operator!=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { - return !(createPreserveBitPattern(data) == rhs); + return !(bit_cast >(data) == rhs); } bool operator<(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { @@ -560,18 +557,17 @@ namespace hlsl return (lhsFlipped & diffBits) > (rhsFlipped & diffBits); } - bool operator<=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { return !(emulated_float64_t::createPreserveBitPattern(data) > emulated_float64_t::createPreserveBitPattern(rhs.data)); } - bool operator>=(emulated_float64_t rhs) { return !(emulated_float64_t::createPreserveBitPattern(data) < emulated_float64_t::createPreserveBitPattern(rhs.data)); } + bool operator<=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { return !(bit_cast >(data) > bit_cast >(rhs.data)); } + bool operator>=(emulated_float64_t rhs) { return !(bit_cast >(data) < bit_cast >(rhs.data)); } //logical operators bool 
operator&&(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { return bool(data) && bool(rhs.data); } bool operator||(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { return bool(data) || bool(rhs.data); } bool operator!() NBL_CONST_MEMBER_FUNC { return !bool(data); } - - // TODO: should modify self? + emulated_float64_t flipSign() { - return createPreserveBitPattern(data ^ ieee754::traits::signMask); + return bit_cast >(data ^ ieee754::traits::signMask); } bool isNaN() @@ -618,12 +614,31 @@ NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size::type extractMa return extractMantissa(x.data);\ }\ + +// TODO: this is wrong! fix it +#define DEFINE_BIT_CAST_SPEC(Type)\ +template<>\ +NBL_CONSTEXPR_FUNC Type bit_cast(NBL_CONST_REF_ARG(uint64_t) val)\ +{\ +Type output; \ +output.data = val; \ +\ +return output; \ +}\ +\ + + +DEFINE_BIT_CAST_SPEC(emulated_float64_t); +DEFINE_BIT_CAST_SPEC(emulated_float64_t); +DEFINE_BIT_CAST_SPEC(emulated_float64_t); +DEFINE_BIT_CAST_SPEC(emulated_float64_t); + namespace ieee754 { - IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); - IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); - IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); - IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); +IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); +IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); +IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); +IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); } } diff --git a/include/nbl/builtin/hlsl/emulated_float64_t_utils.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t_utils.hlsl index 8b8ee0966c..19f94b3c11 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t_utils.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t_utils.hlsl @@ -1,6 +1,7 @@ #ifndef _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_UTILS_INCLUDED_ #define _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_UTILS_INCLUDED_ +#include #include #include @@ -20,6 +21,21 @@ template >::type; #endif +namespace impl +{ + +template +struct static_cast_helper > +{ + static inline portable_float64_t<> cast(From u) + { + return int(u) - 1; + } +}; + +} + + template struct emulated_vector {}; From fa3636a57baa97dc1b59fa63d5fbf55ef70269ea Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Sat, 24 Aug 2024 18:33:30 +0200 Subject: [PATCH 036/358] add missing if statement for device gen builtins (top level cmake builtins include got removed causing it to not see some functions) --- src/nbl/device/CMakeLists.txt | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/nbl/device/CMakeLists.txt b/src/nbl/device/CMakeLists.txt index e850954dfe..62daf7398f 100644 --- a/src/nbl/device/CMakeLists.txt +++ b/src/nbl/device/CMakeLists.txt @@ -61,17 +61,19 @@ add_custom_command(OUTPUT ${NBL_OUTPUT_HEADERS} add_custom_target(DeviceHeaders DEPENDS ${NBL_OUTPUT_HEADERS}) -LIST_BUILTIN_RESOURCE(NBL_DEVICE_GEN_RESOURCES_TO_EMBED "video/device_capabilities_traits_members.hlsl") -LIST_BUILTIN_RESOURCE(NBL_DEVICE_GEN_RESOURCES_TO_EMBED "video/device_capabilities_traits_testers.hlsl") -LIST_BUILTIN_RESOURCE(NBL_DEVICE_GEN_RESOURCES_TO_EMBED "video/device_capabilities_traits_defaults.hlsl") -LIST_BUILTIN_RESOURCE(NBL_DEVICE_GEN_RESOURCES_TO_EMBED "video/device_capabilities_traits_floats.hlsl") -LIST_BUILTIN_RESOURCE(NBL_DEVICE_GEN_RESOURCES_TO_EMBED "video/device_capabilities_traits_enums.hlsl") +if(NBL_EMBED_BUILTIN_RESOURCES) + 
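	# The LIST_BUILTIN_RESOURCE / ADD_CUSTOM_BUILTIN_RESOURCES helpers are only
	# defined when the builtin-resource CMake module is pulled in, i.e. when
	# NBL_EMBED_BUILTIN_RESOURCES is ON, so every call below has to stay inside
	# this if() block; otherwise CMake would error on the undefined commands.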
LIST_BUILTIN_RESOURCE(NBL_DEVICE_GEN_RESOURCES_TO_EMBED "video/device_capabilities_traits_members.hlsl") + LIST_BUILTIN_RESOURCE(NBL_DEVICE_GEN_RESOURCES_TO_EMBED "video/device_capabilities_traits_testers.hlsl") + LIST_BUILTIN_RESOURCE(NBL_DEVICE_GEN_RESOURCES_TO_EMBED "video/device_capabilities_traits_defaults.hlsl") + LIST_BUILTIN_RESOURCE(NBL_DEVICE_GEN_RESOURCES_TO_EMBED "video/device_capabilities_traits_floats.hlsl") + LIST_BUILTIN_RESOURCE(NBL_DEVICE_GEN_RESOURCES_TO_EMBED "video/device_capabilities_traits_enums.hlsl") -get_filename_component(_DEVICE_GEN_BR_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/builtin/include" ABSOLUTE) -get_filename_component(_DEVICE_GEN_BR_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/builtin/src" ABSOLUTE) + get_filename_component(_DEVICE_GEN_BR_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/builtin/include" ABSOLUTE) + get_filename_component(_DEVICE_GEN_BR_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/builtin/src" ABSOLUTE) -ADD_CUSTOM_BUILTIN_RESOURCES(deviceGenBuiltinResourceData NBL_DEVICE_GEN_RESOURCES_TO_EMBED "${NBL_DEVICE_GEN_INCLUDE_DIR}" "nbl" "nbl::devicegen::builtin" "${_DEVICE_GEN_BR_OUTPUT_DIRECTORY_HEADER_}" "${_DEVICE_GEN_BR_OUTPUT_DIRECTORY_SOURCE_}" "STATIC" "INTERNAL") -add_dependencies(deviceGenBuiltinResourceData DeviceHeaders) + ADD_CUSTOM_BUILTIN_RESOURCES(deviceGenBuiltinResourceData NBL_DEVICE_GEN_RESOURCES_TO_EMBED "${NBL_DEVICE_GEN_INCLUDE_DIR}" "nbl" "nbl::devicegen::builtin" "${_DEVICE_GEN_BR_OUTPUT_DIRECTORY_HEADER_}" "${_DEVICE_GEN_BR_OUTPUT_DIRECTORY_SOURCE_}" "STATIC" "INTERNAL") + add_dependencies(deviceGenBuiltinResourceData DeviceHeaders) +endif() set(NBL_DEVICE_GEN_INCLUDE_DIR "${NBL_DEVICE_GEN_INCLUDE_DIR}" From a265dbadd2c4d51bf08e0120ebb5efb01c60ca04 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Sat, 24 Aug 2024 17:38:38 +0100 Subject: [PATCH 037/358] Saving work --- examples_tests | 2 +- include/nbl/builtin/hlsl/cpp_compat/basic.h | 76 ++++++------ .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 82 +++++++++++++ .../hlsl/emulated_float64_t_utils.hlsl | 110 +++++++++++------- 4 files changed, 193 insertions(+), 77 deletions(-) diff --git a/examples_tests b/examples_tests index 293ec73d47..b6b11ae462 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 293ec73d47f2ecdedd2e4b00b31b663e03aa000a +Subproject commit b6b11ae462a551ead3380f1b87c29a342e125365 diff --git a/include/nbl/builtin/hlsl/cpp_compat/basic.h b/include/nbl/builtin/hlsl/cpp_compat/basic.h index dfa12aa33d..9bf57fb29e 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/basic.h +++ b/include/nbl/builtin/hlsl/cpp_compat/basic.h @@ -3,6 +3,28 @@ #include +namespace nbl +{ +namespace hlsl +{ +namespace impl +{ + template + struct static_cast_helper + { + static inline To cast(From u) + { +#ifndef __HLSL_VERSION + return static_cast(u); +#else + return To(u); +#endif + } + }; +} +} +} + #ifndef __HLSL_VERSION #include @@ -16,10 +38,10 @@ namespace nbl::hlsl { - template - T _static_cast(U v) + template + To _static_cast(From v) { - return static_cast(v); + return impl::static_cast_helper::cast(v); } template @@ -47,41 +69,29 @@ namespace nbl::hlsl namespace nbl { - namespace hlsl - { - namespace impl - { - template - struct static_cast_helper - { - static inline To cast(From u) - { - return To(u); - } - }; - } +namespace hlsl +{ - template - To _static_cast(From v) - { - return impl::static_cast_helper(v); - //return (T)v; - } +template +To _static_cast(From v) +{ + return 
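	// The struct indirection is deliberate: function templates cannot be partially
	// specialized, so routing every cast through static_cast_helper lets a type pair
	// opt into its own conversion. A hypothetical example (names invented here):
	//     template<> struct static_cast_helper<float, my_fixed16>
	//     { static float cast(my_fixed16 v) { return float(v.raw) / 65536.0f; } };
	// after which _static_cast<float>(someFixed16) picks that path up automatically.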
impl::static_cast_helper::cast(v); +} #if 0 // TODO: for later - template - struct add_reference - { - using type = ref; - }; - template - struct add_pointer - { - using type = ptr; - }; +template +struct add_reference +{ + using type = ref; +}; +template +struct add_pointer +{ + using type = ptr; +}; #endif - } +} } #define NBL_REF_ARG(...) inout __VA_ARGS__ diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index cba77b7ddd..9fbf8792da 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -574,6 +574,11 @@ namespace hlsl { return tgmath::isnan(data); } + + NBL_CONSTEXPR_STATIC_INLINE bool supportsFastMath() + { + return FastMath; + } }; #define COMMA , @@ -627,6 +632,81 @@ return output; \ }\ \ +namespace impl +{ + +#if 0 +template +struct static_cast_helper,void> +{ + using From = emulated_float64_t; + + static inline Scalar cast(From v) + { + if (is_floating_point::value) // DOUBLE ALSO REPORTS THIS AS TRUE! (so does float16_t) + { + int exponent = ieee754::extractExponent(v.data); + if (!From::supportsFastMath()) + { + if (exponent > 127) + return bit_cast(ieee754::traits::inf); + if (exponent < -126) + return -bit_cast(ieee754::traits::inf); + if (tgmath::isnan(v.data)) + return bit_cast(ieee754::traits::quietNaN); + } + + uint32_t sign = uint32_t((v.data & ieee754::traits::signMask) >> 32); + uint32_t biasedExponent = uint32_t(exponent + ieee754::traits::exponentBias) << ieee754::traits::mantissaBitCnt; + uint32_t mantissa = uint32_t(v.data >> (ieee754::traits::mantissaBitCnt - ieee754::traits::mantissaBitCnt)) & ieee754::traits::mantissaMask; + + return bit_cast(sign | biasedExponent | mantissa); + } + + return bit_cast(ieee754::traits::quietNaN); + } +}; +#endif + +// TODO: fix cast to float +#define DEFINE_EMULATED_FLOAT64_STATIC_CAST(Type)\ +template\ +struct static_cast_helper\ +{\ + static inline To cast(Type v)\ + {\ + if (is_floating_point::value)\ + {\ + int exponent = ieee754::extractExponent(v.data);\ + if (!Type::supportsFastMath())\ + {\ + if (exponent > 127)\ + return bit_cast(ieee754::traits::inf);\ + if (exponent < -126)\ + return -bit_cast(ieee754::traits::inf);\ + if (tgmath::isnan(v.data))\ + return bit_cast(ieee754::traits::quietNaN);\ + }\ +\ + uint32_t sign = uint32_t((v.data & ieee754::traits::signMask) >> 32);\ + uint32_t biasedExponent = uint32_t(exponent + ieee754::traits::exponentBias) << ieee754::traits::mantissaBitCnt;\ + uint32_t mantissa = uint32_t(v.data >> (ieee754::traits::mantissaBitCnt - ieee754::traits::mantissaBitCnt)) & ieee754::traits::mantissaMask;\ +\ + return bit_cast(sign | biasedExponent | mantissa);\ + }\ +\ + return bit_cast(ieee754::traits::quietNaN);\ + }\ +};\ +\ + +DEFINE_EMULATED_FLOAT64_STATIC_CAST(emulated_float64_t); +DEFINE_EMULATED_FLOAT64_STATIC_CAST(emulated_float64_t); +DEFINE_EMULATED_FLOAT64_STATIC_CAST(emulated_float64_t); +DEFINE_EMULATED_FLOAT64_STATIC_CAST(emulated_float64_t); + +} + DEFINE_BIT_CAST_SPEC(emulated_float64_t); DEFINE_BIT_CAST_SPEC(emulated_float64_t); @@ -652,5 +732,7 @@ IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t >::type; #endif -namespace impl -{ - -template -struct static_cast_helper > -{ - static inline portable_float64_t<> cast(From u) - { - return int(u) - 1; - } -}; - -} - - template struct emulated_vector {}; @@ -133,11 +118,6 @@ struct emulated_vector return output; } - - float32_t2 getAsFloat2() - { - return float32_t2(x.getAsFloat32(), 
y.getAsFloat32()); - } }; template @@ -183,6 +163,64 @@ using emulated_vector_t3 = emulated_vector; template using emulated_vector_t4 = emulated_vector; +// TODO: works only for float, fix +namespace impl +{ + +#if 1 +template +struct static_cast_helper,emulated_vector,void> +{ + static inline vector cast(emulated_vector vec) + { + return vector(_static_cast(vec.x), _static_cast(vec.y)); + } +}; +#endif + +#if 0 +#define DEFINE_EMULATED_VECTOR_STATIC_CAST(COND,...)\ +template\ +struct static_cast_helper >\ +{\ + static inline To cast(emulated_vector_t2<__VA_ARGS__ > vec)\ + {\ + return To(_static_cast(vec.x), _static_cast(vec.y));\ + }\ +};\ +\ +template\ +struct static_cast_helper >\ +{\ + static inline To cast(emulated_vector_t3<__VA_ARGS__ > vec)\ + {\ + return To(_static_cast(vec.x), _static_cast(vec.y), _static_cast(vec.z));\ + }\ +};\ +\ +template\ +struct static_cast_helper >\ +{\ + static inline To cast(emulated_vector_t4<__VA_ARGS__ > vec)\ + {\ + return To(_static_cast(vec.x), _static_cast(vec.y), _static_cast(vec.z), _static_cast(vec.w));\ + }\ +};\ +\ + +#define COND +DEFINE_EMULATED_VECTOR_STATIC_CAST(COND,emulated_float64_t); +DEFINE_EMULATED_VECTOR_STATIC_CAST(COND,emulated_float64_t); +DEFINE_EMULATED_VECTOR_STATIC_CAST(COND,emulated_float64_t); +DEFINE_EMULATED_VECTOR_STATIC_CAST(COND,emulated_float64_t); + +#undef DEFINE_EMULATED_VECTOR_STATIC_CAST +#undef COND +#endif + + +} + //template //struct emulated_matrix_base //{ @@ -311,10 +349,10 @@ namespace impl template struct is_emulated { - NBL_CONSTEXPR_STATIC_INLINE bool value = is_same >::value || - is_same >::value || - is_same >::value || - is_same >::value; + NBL_CONSTEXPR_STATIC_INLINE bool value = is_same_v > || + is_same_v > || + is_same_v > || + is_same_v >; }; template::value > @@ -447,15 +485,11 @@ inline float32_t2 convert_portable_vector64_t2_to_float32_t2(portable_vector64_t return float32_t2(vec.x, vec.y); } -template<> -inline float32_t2 convert_portable_vector64_t2_to_float32_t2(portable_vector64_t2 vec) -{ -#ifdef __HLSL_VERSION - return emulated_vector, 2>::create(vec.x, vec.y).getAsFloat2(); -#else - return float32_t2(bit_cast(0xdeadbeefu), bit_cast(0xbadcaffeu)); -#endif -} +//template<> +//inline float32_t2 convert_portable_vector64_t2_to_float32_t2(portable_vector64_t2 vec) +//{ +// return _static_cast(vec); +//} template >::value> inline float32_t convert_portable_float64_t_to_float(portable_float64_t<> val) @@ -464,16 +498,6 @@ inline float32_t convert_portable_float64_t_to_float(portable_float64_t<> val) return float32_t(val); } -template<> -inline float32_t convert_portable_float64_t_to_float(portable_float64_t<> val) -{ -#ifdef __HLSL_VERSION - return val.getAsFloat32(); -#else - return float32_t(bit_cast(0xdeadbeefu)); -#endif -} - } } #endif \ No newline at end of file From 96d3281eed469f70cdfee5af7357430fc289a731 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Mon, 26 Aug 2024 19:09:12 +0100 Subject: [PATCH 038/358] Implemented casting functions for emulated_float64_t --- examples_tests | 2 +- include/nbl/builtin/hlsl/cpp_compat/basic.h | 19 +- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 180 +++++++++--------- .../hlsl/emulated_float64_t_utils.hlsl | 55 ------ .../nbl/builtin/hlsl/glsl_compat/core.hlsl | 6 + include/nbl/builtin/hlsl/ieee754.hlsl | 2 + .../hlsl/impl/emulated_float64_t_impl.hlsl | 55 +++--- include/nbl/builtin/hlsl/tgmath.hlsl | 15 +- 8 files changed, 136 insertions(+), 198 deletions(-) diff --git a/examples_tests b/examples_tests index b6b11ae462..dc1010363e 160000 --- 
a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit b6b11ae462a551ead3380f1b87c29a342e125365 +Subproject commit dc1010363efd212a943c03f40e1a2dc404b36bfc diff --git a/include/nbl/builtin/hlsl/cpp_compat/basic.h b/include/nbl/builtin/hlsl/cpp_compat/basic.h index 9bf57fb29e..87c96d6fe3 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/basic.h +++ b/include/nbl/builtin/hlsl/cpp_compat/basic.h @@ -22,6 +22,13 @@ namespace impl } }; } + +template +To _static_cast(From v) +{ + return impl::static_cast_helper::cast(v); +} + } } @@ -38,12 +45,6 @@ namespace impl namespace nbl::hlsl { - template - To _static_cast(From v) - { - return impl::static_cast_helper::cast(v); - } - template using add_reference = std::add_lvalue_reference; @@ -72,12 +73,6 @@ namespace nbl namespace hlsl { -template -To _static_cast(From v) -{ - return impl::static_cast_helper::cast(v); -} - #if 0 // TODO: for later template struct add_reference diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index 9fbf8792da..fcda118b33 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -22,9 +22,7 @@ namespace hlsl NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(emulated_float64_t val) { - //return bit_cast >(val.data); return val; - //TODO: return val? } NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(int32_t val) @@ -210,7 +208,7 @@ namespace hlsl if (lhsBiasedExp == ieee754::traits::specialValueExp) { bool propagate = (lhsMantissa | rhsMantissa) != 0u; - return bit_cast >(tgmath::lerp(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); + return bit_cast >(glsl::mix(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); } mantissa = lhsMantissa + rhsMantissa; @@ -234,11 +232,11 @@ namespace hlsl if (lhsBiasedExp == ieee754::traits::specialValueExp) { const bool propagate = (lhsMantissa) != 0u; - return bit_cast >(tgmath::lerp(ieee754::traits::exponentMask | lhsSign, impl::propagateFloat64NaN(data, rhs.data), propagate)); + return bit_cast >(glsl::mix(ieee754::traits::exponentMask | lhsSign, impl::propagateFloat64NaN(data, rhs.data), propagate)); } - expDiff = tgmath::lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); - rhsMantissa = tgmath::lerp(rhsMantissa | (1ull << 52), rhsMantissa, rhsBiasedExp == 0); + expDiff = glsl::mix(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); + rhsMantissa = glsl::mix(rhsMantissa | (1ull << 52), rhsMantissa, rhsBiasedExp == 0); const uint32_t3 shifted = impl::shift64ExtraRightJamming(uint32_t3(impl::packUint64(rhsMantissa), 0u), expDiff); rhsMantissa = impl::unpackUint64(shifted.xy); mantissaExtended.z = shifted.z; @@ -278,11 +276,11 @@ namespace hlsl if (lhsBiasedExp == ieee754::traits::specialValueExp) { bool propagate = lhsMantissa != 0u; - return bit_cast >(tgmath::lerp(impl::assembleFloat64(lhsSign, ieee754::traits::exponentMask, 0ull), impl::propagateFloat64NaN(data, rhs.data), propagate)); + return bit_cast >(glsl::mix(impl::assembleFloat64(lhsSign, ieee754::traits::exponentMask, 0ull), impl::propagateFloat64NaN(data, rhs.data), propagate)); } - expDiff = tgmath::lerp(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); - rhsMantissa = tgmath::lerp(rhsMantissa | 0x4000000000000000ull, rhsMantissa, rhsBiasedExp == 0); + expDiff = glsl::mix(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); + rhsMantissa = glsl::mix(rhsMantissa | 0x4000000000000000ull, rhsMantissa, rhsBiasedExp == 0); rhsMantissa = 
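			// On the lerp -> glsl::mix swaps in this hunk: with a bool selector both
			// are a plain value select,
			//     glsl::mix(a, b, c) == (c ? b : a)
			// mirroring GLSL's mix(genType, genType, bool); no floating-point
			// interpolation happens here.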
impl::unpackUint64(impl::shift64RightJamming(impl::packUint64(rhsMantissa), expDiff)); lhsMantissa |= 0x4000000000000000ull; frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); @@ -293,10 +291,10 @@ namespace hlsl if (lhsBiasedExp == ieee754::traits::specialValueExp) { bool propagate = ((lhsMantissa) | (rhsMantissa)) != 0u; - return bit_cast >(tgmath::lerp(ieee754::traits::quietNaN, impl::propagateFloat64NaN(data, rhs.data), propagate)); + return bit_cast >(glsl::mix(ieee754::traits::quietNaN, impl::propagateFloat64NaN(data, rhs.data), propagate)); } - rhsBiasedExp = tgmath::lerp(rhsBiasedExp, 1, lhsBiasedExp == 0); - lhsBiasedExp = tgmath::lerp(lhsBiasedExp, 1, lhsBiasedExp == 0); + rhsBiasedExp = glsl::mix(rhsBiasedExp, 1, lhsBiasedExp == 0); + lhsBiasedExp = glsl::mix(lhsBiasedExp, 1, lhsBiasedExp == 0); const uint32_t2 lhsMantissaPacked = impl::packUint64(lhsMantissa); @@ -324,11 +322,11 @@ namespace hlsl signOfDifference = ieee754::traits::signMask; } - biasedExp = tgmath::lerp(rhsBiasedExp, lhsBiasedExp, signOfDifference == 0u); + biasedExp = glsl::mix(rhsBiasedExp, lhsBiasedExp, signOfDifference == 0u); lhsSign ^= signOfDifference; uint64_t retval_0 = impl::packFloat64(uint32_t(FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) << 31, 0, 0u, 0u); uint64_t retval_1 = impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 11, frac.x, frac.y); - return bit_cast >(tgmath::lerp(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); + return bit_cast >(glsl::mix(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); } } else @@ -581,51 +579,48 @@ namespace hlsl } }; -#define COMMA , -#define IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(Type) \ +#define IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(...) \ template<>\ -struct traits_base\ +struct traits_base<__VA_ARGS__ >\ {\ NBL_CONSTEXPR_STATIC_INLINE int16_t exponentBitCnt = 11;\ NBL_CONSTEXPR_STATIC_INLINE int16_t mantissaBitCnt = 52;\ };\ template<>\ -inline uint32_t extractBiasedExponent(Type x)\ +inline uint32_t extractBiasedExponent(__VA_ARGS__ x)\ {\ return extractBiasedExponent(x.data);\ }\ \ template<>\ -inline int extractExponent(Type x)\ +inline int extractExponent(__VA_ARGS__ x)\ {\ return extractExponent(x.data);\ }\ \ template<>\ -NBL_CONSTEXPR_INLINE_FUNC Type replaceBiasedExponent(Type x, typename unsigned_integer_of_size::type biasedExp)\ +NBL_CONSTEXPR_INLINE_FUNC __VA_ARGS__ replaceBiasedExponent(__VA_ARGS__ x, typename unsigned_integer_of_size::type biasedExp)\ {\ - return Type(replaceBiasedExponent(x.data, biasedExp));\ + return __VA_ARGS__(replaceBiasedExponent(x.data, biasedExp));\ }\ \ template <>\ -NBL_CONSTEXPR_INLINE_FUNC Type fastMulExp2(Type x, int n)\ +NBL_CONSTEXPR_INLINE_FUNC __VA_ARGS__ fastMulExp2(__VA_ARGS__ x, int n)\ {\ - return Type(replaceBiasedExponent(x.data, extractBiasedExponent(x) + uint32_t(n)));\ + return __VA_ARGS__(replaceBiasedExponent(x.data, extractBiasedExponent(x) + uint32_t(n)));\ }\ \ template <>\ -NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size::type extractMantissa(Type x)\ +NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size::type extractMantissa(__VA_ARGS__ x)\ {\ return extractMantissa(x.data);\ }\ - -// TODO: this is wrong! 
fix it -#define DEFINE_BIT_CAST_SPEC(Type)\ +#define DEFINE_BIT_CAST_SPEC(...)\ template<>\ -NBL_CONSTEXPR_FUNC Type bit_cast(NBL_CONST_REF_ARG(uint64_t) val)\ +NBL_CONSTEXPR_FUNC __VA_ARGS__ bit_cast<__VA_ARGS__, uint64_t>(NBL_CONST_REF_ARG(uint64_t) val)\ {\ -Type output; \ +__VA_ARGS__ output; \ output.data = val; \ \ return output; \ @@ -635,90 +630,92 @@ return output; \ namespace impl { -#if 0 -template -struct static_cast_helper,void> +template +struct static_cast_helper,void> { + // TODO: + // static_assert(is_arithmetic::value); + using From = emulated_float64_t; - static inline Scalar cast(From v) + // TODO: test + static inline To cast(From v) { - if (is_floating_point::value) // DOUBLE ALSO REPORTS THIS AS TRUE! (so does float16_t) + if (is_same_v) + return To(bit_cast(v.data)); + + if (is_floating_point::value) { - int exponent = ieee754::extractExponent(v.data); + const int exponent = ieee754::extractExponent(v.data); if (!From::supportsFastMath()) { - if (exponent > 127) - return bit_cast(ieee754::traits::inf); - if (exponent < -126) - return -bit_cast(ieee754::traits::inf); + if (exponent > ieee754::traits::exponentMax) + return bit_cast(ieee754::traits::inf); + if (exponent < ieee754::traits::exponentMin) + return -bit_cast(ieee754::traits::inf); if (tgmath::isnan(v.data)) - return bit_cast(ieee754::traits::quietNaN); + return bit_cast(ieee754::traits::quietNaN); } - uint32_t sign = uint32_t((v.data & ieee754::traits::signMask) >> 32); - uint32_t biasedExponent = uint32_t(exponent + ieee754::traits::exponentBias) << ieee754::traits::mantissaBitCnt; - uint32_t mantissa = uint32_t(v.data >> (ieee754::traits::mantissaBitCnt - ieee754::traits::mantissaBitCnt)) & ieee754::traits::mantissaMask; + using AsUint = typename unsigned_integer_of_size::type; - return bit_cast(sign | biasedExponent | mantissa); + const uint32_t toBitSize = sizeof(To) * 8; + const AsUint sign = AsUint(ieee754::extractSign(v.data) << (toBitSize - 1)); + const AsUint biasedExponent = AsUint(exponent + ieee754::traits::exponentBias) << ieee754::traits::mantissaBitCnt; + const AsUint mantissa = AsUint(v.data >> (ieee754::traits::mantissaBitCnt - ieee754::traits::mantissaBitCnt)) & ieee754::traits::mantissaMask; + + return bit_cast(sign | biasedExponent | mantissa); } - return bit_cast(ieee754::traits::quietNaN); - } -}; -#endif + // NOTE: casting from negative float to unsigned int is an UB, function will return abs value in this case + if (is_integral::value) + { + const int exponent = ieee754::extractExponent(v.data); + if (exponent < 0) + return 0; -// TODO: fix cast to float -#define DEFINE_EMULATED_FLOAT64_STATIC_CAST(Type)\ -template\ -struct static_cast_helper\ -{\ - static inline To cast(Type v)\ - {\ - if (is_floating_point::value)\ - {\ - int exponent = ieee754::extractExponent(v.data);\ - if (!Type::supportsFastMath())\ - {\ - if (exponent > 127)\ - return bit_cast(ieee754::traits::inf);\ - if (exponent < -126)\ - return -bit_cast(ieee754::traits::inf);\ - if (tgmath::isnan(v.data))\ - return bit_cast(ieee754::traits::quietNaN);\ - }\ -\ - uint32_t sign = uint32_t((v.data & ieee754::traits::signMask) >> 32);\ - uint32_t biasedExponent = uint32_t(exponent + ieee754::traits::exponentBias) << ieee754::traits::mantissaBitCnt;\ - uint32_t mantissa = uint32_t(v.data >> (ieee754::traits::mantissaBitCnt - ieee754::traits::mantissaBitCnt)) & ieee754::traits::mantissaMask;\ -\ - return bit_cast(sign | biasedExponent | mantissa);\ - }\ -\ - return bit_cast(ieee754::traits::quietNaN);\ - }\ -};\ -\ + uint64_t 
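			// For reference, the usual truncating float->int path for exponents in
			// [0, 52] is
			//     int64_t((mantissa | (1ull << 52)) >> (52 - exponent))
			// e.g. 8.0 has exponent 3 and a zero stored mantissa, so (1ull << 52) >> 49 == 8.
			// The '&' and the two shift directions below appear swapped relative to that.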
unsignedOutput = ieee754::extractMantissa(v.data) & 1ull << ieee754::traits::mantissaBitCnt; + const int shiftAmount = exponent - int(ieee754::traits::mantissaBitCnt); + + if (shiftAmount < 0) + unsignedOutput <<= -shiftAmount; + else + unsignedOutput >>= shiftAmount; -DEFINE_EMULATED_FLOAT64_STATIC_CAST(emulated_float64_t); -DEFINE_EMULATED_FLOAT64_STATIC_CAST(emulated_float64_t); -DEFINE_EMULATED_FLOAT64_STATIC_CAST(emulated_float64_t); -DEFINE_EMULATED_FLOAT64_STATIC_CAST(emulated_float64_t); + if (is_signed::value) + { + int64_t signedOutput64 = unsignedOutput & ((1ull << 63) - 1); + To signedOutput = To(signedOutput64); + if (ieee754::extractSignPreserveBitPattern(v.data) != 0) + signedOutput = -signedOutput; + + return signedOutput; + } + + return To(unsignedOutput); + } + + // assert(false); + return 0xdeadbeefbadcaffeull; + } +}; } +DEFINE_BIT_CAST_SPEC(emulated_float64_t); +DEFINE_BIT_CAST_SPEC(emulated_float64_t); +DEFINE_BIT_CAST_SPEC(emulated_float64_t); +DEFINE_BIT_CAST_SPEC(emulated_float64_t); -DEFINE_BIT_CAST_SPEC(emulated_float64_t); -DEFINE_BIT_CAST_SPEC(emulated_float64_t); -DEFINE_BIT_CAST_SPEC(emulated_float64_t); -DEFINE_BIT_CAST_SPEC(emulated_float64_t); +template +struct is_floating_point > : bool_constant {}; namespace ieee754 { -IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); -IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); -IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); -IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); +IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); +IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); +IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); +IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); } } @@ -730,7 +727,6 @@ IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t; namespace impl { -#if 1 template struct static_cast_helper,emulated_vector,void> { @@ -176,48 +175,6 @@ struct static_cast_helper,emulated_vector,void> return vector(_static_cast(vec.x), _static_cast(vec.y)); } }; -#endif - -#if 0 -#define DEFINE_EMULATED_VECTOR_STATIC_CAST(COND,...)\ -template\ -struct static_cast_helper >\ -{\ - static inline To cast(emulated_vector_t2<__VA_ARGS__ > vec)\ - {\ - return To(_static_cast(vec.x), _static_cast(vec.y));\ - }\ -};\ -\ -template\ -struct static_cast_helper >\ -{\ - static inline To cast(emulated_vector_t3<__VA_ARGS__ > vec)\ - {\ - return To(_static_cast(vec.x), _static_cast(vec.y), _static_cast(vec.z));\ - }\ -};\ -\ -template\ -struct static_cast_helper >\ -{\ - static inline To cast(emulated_vector_t4<__VA_ARGS__ > vec)\ - {\ - return To(_static_cast(vec.x), _static_cast(vec.y), _static_cast(vec.z), _static_cast(vec.w));\ - }\ -};\ -\ - -#define COND -DEFINE_EMULATED_VECTOR_STATIC_CAST(COND,emulated_float64_t); -DEFINE_EMULATED_VECTOR_STATIC_CAST(COND,emulated_float64_t); -DEFINE_EMULATED_VECTOR_STATIC_CAST(COND,emulated_float64_t); -DEFINE_EMULATED_VECTOR_STATIC_CAST(COND,emulated_float64_t); - -#undef DEFINE_EMULATED_VECTOR_STATIC_CAST -#undef COND -#endif - } @@ -479,18 +436,6 @@ NBL_CONSTEXPR_INLINE_FUNC portable_vector64_t3 create_portable_vector64_t2_from_ return output; } -template >::value> -inline float32_t2 convert_portable_vector64_t2_to_float32_t2(portable_vector64_t2 vec) -{ - return float32_t2(vec.x, vec.y); -} - -//template<> -//inline float32_t2 convert_portable_vector64_t2_to_float32_t2(portable_vector64_t2 vec) -//{ 
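// With the emulated_vector static_cast_helper defined above, the commented-out
// helper below is expected to collapse to a single _static_cast<float32_t2>(vec),
// i.e. a per-component cast of x and y.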
-// return _static_cast(vec); -//} - template >::value> inline float32_t convert_portable_float64_t_to_float(portable_float64_t<> val) { diff --git a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl index 61bc20e655..bcc0c76bdb 100644 --- a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl +++ b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl @@ -15,6 +15,12 @@ namespace hlsl namespace glsl { +template +NBL_CONSTEXPR_INLINE_FUNC typename enable_if::value, T>::type mix(T a, T b, bool c) +{ + return c ? b : a; +} + #ifndef __HLSL_VERSION // GLM Aliases diff --git a/include/nbl/builtin/hlsl/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754.hlsl index 13308bb2dd..e08831e2ce 100644 --- a/include/nbl/builtin/hlsl/ieee754.hlsl +++ b/include/nbl/builtin/hlsl/ieee754.hlsl @@ -93,6 +93,8 @@ struct traits : traits_base NBL_CONSTEXPR_STATIC_INLINE bit_rep_t quietNaN = exponentMask | (1ull << (base_t::mantissaBitCnt - 1)); NBL_CONSTEXPR_STATIC_INLINE bit_rep_t max = ((1ull << (sizeof(Float) * 8 - 1)) - 1) & (~(1ull << base_t::mantissaBitCnt)); NBL_CONSTEXPR_STATIC_INLINE bit_rep_t min = 1ull << base_t::mantissaBitCnt; + NBL_CONSTEXPR_STATIC_INLINE int exponentMax = exponentBias; + NBL_CONSTEXPR_STATIC_INLINE int exponentMin = -(exponentBias - 1); }; template diff --git a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl index a828449d13..92df407530 100644 --- a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl @@ -5,6 +5,7 @@ #include #include #include +#include #define FLOAT_ROUND_NEAREST_EVEN 0 #define FLOAT_ROUND_TO_ZERO 1 @@ -168,7 +169,7 @@ NBL_CONSTEXPR_INLINE_FUNC uint64_t propagateFloat64NaN(uint64_t lhs, uint64_t rh lhs |= 0x0008000000000000ull; rhs |= 0x0008000000000000ull; - return tgmath::lerp(rhs, tgmath::lerp(lhs, rhs, tgmath::isnan(rhs)), tgmath::isnan(lhs)); + return glsl::mix(rhs, glsl::mix(lhs, rhs, tgmath::isnan(rhs)), tgmath::isnan(lhs)); return 0; #endif } @@ -187,16 +188,16 @@ NBL_CONSTEXPR_INLINE_FUNC uint32_t2 shift64RightJamming(uint32_t2 val, int count uint32_t2 output; const int negCount = (-count) & 31; - output.x = tgmath::lerp(0u, val.x, count == 0); - output.x = tgmath::lerp(output.x, (val.x >> count), count < 32); + output.x = glsl::mix(0u, val.x, count == 0); + output.x = glsl::mix(output.x, (val.x >> count), count < 32); output.y = uint32_t((val.x | val.y) != 0u); /* count >= 64 */ uint32_t z1_lt64 = (val.x>>(count & 31)) | uint32_t(((val.x<>count) | uint32_t((val.y<> (count & 31)), count < 64); - output.y = tgmath::lerp(output.y, (val.x << negCount) | (val.y >> count), count < 32); + output.y = glsl::mix(0u, (val.x >> (count & 31)), count < 64); + output.y = glsl::mix(output.y, (val.x << negCount) | (val.y >> count), count < 32); - val.z = tgmath::lerp(val.z | val.y, val.z, count < 32); - output.x = tgmath::lerp(output.x, val.x >> count, count < 32); + val.z = glsl::mix(val.z | val.y, val.z, count < 32); + output.x = glsl::mix(output.x, val.x >> count, count < 32); output.z |= uint32_t(val.z != 0u); - output.x = tgmath::lerp(output.x, 0u, (count == 32)); - output.y = tgmath::lerp(output.y, val.x, (count == 32)); - output.z = tgmath::lerp(output.z, val.y, (count == 32)); - output.x = tgmath::lerp(output.x, val.x, (count == 0)); - output.y = tgmath::lerp(output.y, val.y, (count == 0)); - output.z = tgmath::lerp(output.z, val.z, (count == 0)); + output.x = glsl::mix(output.x, 0u, (count == 
32)); + output.y = glsl::mix(output.y, val.x, (count == 32)); + output.z = glsl::mix(output.z, val.y, (count == 32)); + output.x = glsl::mix(output.x, val.x, (count == 0)); + output.y = glsl::mix(output.y, val.y, (count == 0)); + output.z = glsl::mix(output.z, val.z, (count == 0)); return output; } @@ -236,7 +237,7 @@ NBL_CONSTEXPR_INLINE_FUNC uint64_t shortShift64Left(uint64_t val, int count) uint32_t2 output; output.y = packed.y << count; // TODO: fix - output.x = tgmath::lerp((packed.x << count | (packed.y >> ((-count) & 31))), packed.x, count == 0); + output.x = glsl::mix((packed.x << count | (packed.y >> ((-count) & 31))), packed.x, count == 0); return unpackUint64(output); }; @@ -318,7 +319,7 @@ NBL_CONSTEXPR_INLINE_FUNC uint64_t roundAndPackFloat64(uint64_t zSign, int zExp, } else { - zExp = tgmath::lerp(zExp, 0, (mantissaExtended.x | mantissaExtended.y) == 0u); + zExp = glsl::mix(zExp, 0, (mantissaExtended.x | mantissaExtended.y) == 0u); } return assembleFloat64(zSign, uint64_t(zExp) << ieee754::traits::mantissaBitCnt, unpackUint64(mantissaExtended.xy)); @@ -357,15 +358,15 @@ static inline void normalizeFloat64Subnormal(uint64_t mantissa, uint32_t2 mantissaPacked = packUint64(mantissa); int shiftCount; uint32_t2 temp; - shiftCount = countLeadingZeros32(tgmath::lerp(mantissaPacked.x, mantissaPacked.y, mantissaPacked.x == 0u)) - 11; - outExp = tgmath::lerp(1 - shiftCount, -shiftCount - 31, mantissaPacked.x == 0u); + shiftCount = countLeadingZeros32(glsl::mix(mantissaPacked.x, mantissaPacked.y, mantissaPacked.x == 0u)) - 11; + outExp = glsl::mix(1 - shiftCount, -shiftCount - 31, mantissaPacked.x == 0u); - temp.x = tgmath::lerp(mantissaPacked.y << shiftCount, mantissaPacked.y >> (-shiftCount), shiftCount < 0); - temp.y = tgmath::lerp(0u, mantissaPacked.y << (shiftCount & 31), shiftCount < 0); + temp.x = glsl::mix(mantissaPacked.y << shiftCount, mantissaPacked.y >> (-shiftCount), shiftCount < 0); + temp.y = glsl::mix(0u, mantissaPacked.y << (shiftCount & 31), shiftCount < 0); shortShift64Left(impl::unpackUint64(mantissaPacked), shiftCount); - outMantissa = tgmath::lerp(outMantissa, unpackUint64(temp), mantissaPacked.x == 0); + outMantissa = glsl::mix(outMantissa, unpackUint64(temp), mantissaPacked.x == 0); } NBL_CONSTEXPR_INLINE_FUNC bool areBothInfinity(uint64_t lhs, uint64_t rhs) diff --git a/include/nbl/builtin/hlsl/tgmath.hlsl b/include/nbl/builtin/hlsl/tgmath.hlsl index 2ff17e608e..4a35e10a07 100644 --- a/include/nbl/builtin/hlsl/tgmath.hlsl +++ b/include/nbl/builtin/hlsl/tgmath.hlsl @@ -26,19 +26,12 @@ inline bool isnan(T val) return bool((ieee754::extractBiasedExponent(val) == ieee754::traits::specialValueExp) && (asUint & ieee754::traits::mantissaMask)); } -// TODO: better implementation template -//NBL_CONSTEXPR_INLINE_FUNC enable_if::type, T>::type lerp(T a, T b, bool c) -NBL_CONSTEXPR_INLINE_FUNC T lerp(T a, T b, bool c) +NBL_CONSTEXPR_INLINE_FUNC bool isInf(T val) { - return c ? 
b : a; -} - -template -NBL_CONSTEXPR_INLINE_FUNC bool isInf(Uint val) -{ - using AsFloat = typename float_of_size::type; - return (val & ~ieee754::traits::signMask) == ieee754::traits::inf; + using AsUint = typename unsigned_integer_of_size::type; + AsUint tmp = bit_cast(val); + return (tmp & ~ieee754::traits::signMask) == ieee754::traits::inf; } } From 916ba166df7860fb51ab7ac72fd5277989b5f7be Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Tue, 27 Aug 2024 21:46:40 +0100 Subject: [PATCH 039/358] Removed unnecessary branches from add operator --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 362 +++++------------- include/nbl/builtin/hlsl/ieee754.hlsl | 8 + .../hlsl/impl/emulated_float64_t_impl.hlsl | 4 + 4 files changed, 115 insertions(+), 261 deletions(-) diff --git a/examples_tests b/examples_tests index dc1010363e..912f012bfb 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit dc1010363efd212a943c03f40e1a2dc404b36bfc +Subproject commit 912f012bfbaa244309a57d1707c9438db5cff56a diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index fcda118b33..8030e9d221 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -11,6 +11,7 @@ namespace hlsl struct emulated_float64_t { using storage_t = uint64_t; + using this_t = emulated_float64_t; storage_t data; @@ -20,41 +21,41 @@ namespace hlsl return emulated_float64_t(bit_cast(float64_t(val))); }*/ - NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(emulated_float64_t val) + NBL_CONSTEXPR_STATIC_INLINE this_t create(this_t val) { return val; } - NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(int32_t val) + NBL_CONSTEXPR_STATIC_INLINE this_t create(int32_t val) { - return bit_cast >(impl::castToUint64WithFloat64BitPattern(int64_t(val))); + return bit_cast(impl::castToUint64WithFloat64BitPattern(int64_t(val))); } - NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(int64_t val) + NBL_CONSTEXPR_STATIC_INLINE this_t create(int64_t val) { - return bit_cast >(impl::castToUint64WithFloat64BitPattern(val)); + return bit_cast(impl::castToUint64WithFloat64BitPattern(val)); } - NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(uint32_t val) + NBL_CONSTEXPR_STATIC_INLINE this_t create(uint32_t val) { - return bit_cast >(impl::castToUint64WithFloat64BitPattern(uint64_t(val))); + return bit_cast(impl::castToUint64WithFloat64BitPattern(uint64_t(val))); } - NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(uint64_t val) + NBL_CONSTEXPR_STATIC_INLINE this_t create(uint64_t val) { - return bit_cast >(impl::castToUint64WithFloat64BitPattern(val)); + return bit_cast(impl::castToUint64WithFloat64BitPattern(val)); } - NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(float32_t val) + NBL_CONSTEXPR_STATIC_INLINE this_t create(float32_t val) { - emulated_float64_t output; + this_t output; output.data = impl::castToUint64WithFloat64BitPattern(val); return output; } - NBL_CONSTEXPR_STATIC_INLINE emulated_float64_t create(float64_t val) + NBL_CONSTEXPR_STATIC_INLINE this_t create(float64_t val) { - return bit_cast >(bit_cast(float64_t(val))); + return bit_cast(bit_cast(float64_t(val))); #ifdef __HLSL_VERSION emulated_float64_t retval; uint32_t lo, hi; @@ -62,7 +63,7 @@ namespace hlsl retval.data = (uint64_t(hi) << 32) | lo; return retval; #else - return bit_cast >(reinterpret_cast(val)); + return bit_cast(reinterpret_cast(val)); #endif } @@ -95,255 +96,89 @@ namespace hlsl } 
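		// The rewritten operator+ below does the addition in signed fixed point on
		// the normalized 53-bit mantissas: negate each mantissa according to its
		// sign, shift the smaller-exponent one right by the exponent difference,
		// add as int64_t, then take the absolute value and renormalize. Worked
		// example (assuming extractNormalizeMantissa returns mantissa | (1ull << 52)):
		//   3.0 + (-2.0): mantissas +0x0018000000000000 and -0x0010000000000000,
		//   both with exponent 1; the sum is 1ull << 51, one left shift restores
		//   the implicit bit while the exponent drops to 0, giving exactly 1.0.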
-#if 0 - uint64_t shiftLeftAllowNegBitCnt(uint64_t val, int n) - { - if (n < 0) - return val >> -n; - else - return val << n; - } -#endif - // arithmetic operators emulated_float64_t operator+(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { -#if 0 + if (FlushDenormToZero) { - uint64_t lhsSign = data & ieee754::traits::signMask; - uint64_t rhsSign = rhs.data & ieee754::traits::signMask; - uint64_t lhsMantissa = ieee754::extractMantissa(data); - uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); - int lhsBiasedExp = ieee754::extractBiasedExponent(data); - int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); - - if (tgmath::isnan(data) || tgmath::isnan(rhs.data)) - return bit_cast >(ieee754::traits::quietNaN); - /*if (std::isinf(lhs) || std::isinf(rhs)) - { - if (std::isinf(lhs) && !std::isinf(rhs)) - return lhs; - if (std::isinf(rhs) && !std::isinf(lhs)) - return rhs; - if (rhs == lhs) - return rhs; - - return nan(); - }*/ - - int rp = min(ieee754::extractExponent(data), ieee754::extractExponent(rhs.data)) - ieee754::traits::mantissaBitCnt; + if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) + return bit_cast(ieee754::traits::quietNaN); - uint64_t lhsRealMantissa = lhsMantissa | (1ull << ieee754::traits::mantissaBitCnt); - uint64_t rhsRealMantissa = rhsMantissa | (1ull << ieee754::traits::mantissaBitCnt); - uint64_t lhsSignTmp = lhsSign >> (52 + 11); - uint64_t rhsSignTmp = rhsSign >> (52 + 11); + this_t retval = this_t::create(0ull); - uint64_t sign = 0u; - if (lhsSign != rhsSign) - { - uint64_t _min = max(data, rhs.data); - uint64_t _max = min(data, rhs.data); - uint64_t minAbs = _min ^ ieee754::traits::signMask; - if (minAbs > _max) - sign = ieee754::traits::signMask; - - } + uint64_t mantissa; + uint32_t3 mantissaExtended; + int biasedExp; - int64_t lhsMantissaTmp = (shiftLeftAllowNegBitCnt(lhsRealMantissa, lhsBiasedExp - rp - ieee754::traits::mantissaBitCnt - ieee754::traits::exponentBias) ^ (-lhsSignTmp)) + lhsSignTmp; - int64_t rhsMantissaTmp = (shiftLeftAllowNegBitCnt(rhsRealMantissa, rhsBiasedExp - rp - ieee754::traits::mantissaBitCnt - ieee754::traits::exponentBias) ^ (-rhsSignTmp)) + rhsSignTmp; + uint64_t lhsSign = ieee754::extractSign(data); + uint64_t rhsSign = ieee754::extractSign(rhs.data); - uint64_t addTmp = bit_cast(lhsMantissaTmp + rhsMantissaTmp); + // TODO: delete 2 below if not needed + uint64_t lhsMantissa = ieee754::extractMantissa(data); + uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); - // renormalize - if (!FastMath && false) // TODO: hande nan - { + int64_t lhsNormMantissa = int64_t(ieee754::extractNormalizeMantissa(data)); + int64_t rhsNormMantissa = int64_t(ieee754::extractNormalizeMantissa(rhs.data)); - } - else + // TODO: branchless? + if (lhsSign != rhsSign) { -#ifndef __HLSL_VERSION - int l2 = log2(double(addTmp)); -#else - int intl2 = 0; -#endif - - if (!FastMath && (rp + l2 + 1 < nbl::hlsl::numeric_limits::min_exponent)) - { - return bit_cast >(impl::assembleFloat64(0, ieee754::traits::exponentMask, 0)); - } - else - { - rp = addTmp ? 
l2 + rp + ieee754::traits::exponentBias : 0; - return bit_cast >(impl::assembleFloat64( - sign, - (uint64_t(rp) << ieee754::traits::mantissaBitCnt) & ieee754::traits::exponentMask, - shiftLeftAllowNegBitCnt(addTmp, (ieee754::traits::mantissaBitCnt - l2)) & ieee754::traits::mantissaMask) - ); - } + if (lhsSign) + lhsNormMantissa *= -1; + if (rhsSign) + rhsNormMantissa *= -1; } - } -#endif - if (FlushDenormToZero) - { - emulated_float64_t retval = emulated_float64_t::create(0ull); - - uint64_t mantissa; - uint32_t3 mantissaExtended; - int biasedExp; - - uint64_t lhsSign = data & ieee754::traits::signMask; - uint64_t rhsSign = rhs.data & ieee754::traits::signMask; - uint64_t lhsMantissa = ieee754::extractMantissa(data); - uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); int lhsBiasedExp = ieee754::extractBiasedExponent(data); int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); int expDiff = lhsBiasedExp - rhsBiasedExp; - if (lhsSign == rhsSign) - { - if (expDiff == 0) - { - if (lhsBiasedExp == ieee754::traits::specialValueExp) - { - bool propagate = (lhsMantissa | rhsMantissa) != 0u; - return bit_cast >(glsl::mix(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); - } - - mantissa = lhsMantissa + rhsMantissa; - if (lhsBiasedExp == 0) - return bit_cast >(impl::assembleFloat64(lhsSign, 0, mantissa)); - mantissaExtended.xy = impl::packUint64(mantissa); - mantissaExtended.x |= 0x00200000u; - mantissaExtended.z = 0u; - biasedExp = lhsBiasedExp; - - mantissaExtended = impl::shift64ExtraRightJamming(mantissaExtended, 1); - } - else - { - if (expDiff < 0) - { - swap(lhsMantissa, rhsMantissa); - swap(lhsBiasedExp, rhsBiasedExp); - } - - if (lhsBiasedExp == ieee754::traits::specialValueExp) - { - const bool propagate = (lhsMantissa) != 0u; - return bit_cast >(glsl::mix(ieee754::traits::exponentMask | lhsSign, impl::propagateFloat64NaN(data, rhs.data), propagate)); - } - - expDiff = glsl::mix(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); - rhsMantissa = glsl::mix(rhsMantissa | (1ull << 52), rhsMantissa, rhsBiasedExp == 0); - const uint32_t3 shifted = impl::shift64ExtraRightJamming(uint32_t3(impl::packUint64(rhsMantissa), 0u), expDiff); - rhsMantissa = impl::unpackUint64(shifted.xy); - mantissaExtended.z = shifted.z; - biasedExp = lhsBiasedExp; - - lhsMantissa |= (1ull << 52); - mantissaExtended.xy = impl::packUint64(lhsMantissa + rhsMantissa); - --biasedExp; - if (!(mantissaExtended.x < 0x00200000u)) - { - mantissaExtended = impl::shift64ExtraRightJamming(mantissaExtended, 1); - ++biasedExp; - } - - return bit_cast >(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended.xyz)); - } + int exp = max(lhsBiasedExp, rhsBiasedExp) - ieee754::traits::exponentBias; + uint32_t shiftAmount = abs(expDiff); - // cannot happen but compiler cries about not every path returning value - return bit_cast >(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended)); - } - else - { - lhsMantissa = impl::shortShift64Left(lhsMantissa, 10); - rhsMantissa = impl::shortShift64Left(rhsMantissa, 10); - - if (expDiff != 0) - { - uint32_t2 frac; - - if (expDiff < 0) - { - swap(lhsMantissa, rhsMantissa); - swap(lhsBiasedExp, rhsBiasedExp); - lhsSign ^= ieee754::traits::signMask; - } - - if (lhsBiasedExp == ieee754::traits::specialValueExp) - { - bool propagate = lhsMantissa != 0u; - return bit_cast >(glsl::mix(impl::assembleFloat64(lhsSign, ieee754::traits::exponentMask, 0ull), impl::propagateFloat64NaN(data, rhs.data), propagate)); - } - - expDiff = glsl::mix(abs(expDiff), 
abs(expDiff) - 1, rhsBiasedExp == 0); - rhsMantissa = glsl::mix(rhsMantissa | 0x4000000000000000ull, rhsMantissa, rhsBiasedExp == 0); - rhsMantissa = impl::unpackUint64(impl::shift64RightJamming(impl::packUint64(rhsMantissa), expDiff)); - lhsMantissa |= 0x4000000000000000ull; - frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); - biasedExp = lhsBiasedExp; - --biasedExp; - return bit_cast >(impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 10, frac.x, frac.y)); - } - if (lhsBiasedExp == ieee754::traits::specialValueExp) - { - bool propagate = ((lhsMantissa) | (rhsMantissa)) != 0u; - return bit_cast >(glsl::mix(ieee754::traits::quietNaN, impl::propagateFloat64NaN(data, rhs.data), propagate)); - } - rhsBiasedExp = glsl::mix(rhsBiasedExp, 1, lhsBiasedExp == 0); - lhsBiasedExp = glsl::mix(lhsBiasedExp, 1, lhsBiasedExp == 0); + // so lhsNormMantissa always holds mantissa of number with greater exponent + if (expDiff < 0) + swap(lhsNormMantissa, rhsNormMantissa); + rhsNormMantissa >>= shiftAmount; + int64_t resultMantissa = lhsNormMantissa + rhsNormMantissa; + const uint64_t resultSign = uint64_t((lhsSign && rhsSign) || (bit_cast(resultMantissa) & (lhsSign << 63))) << 63; + uint64_t resultBiasedExp = uint64_t(exp) + ieee754::traits::exponentBias; - const uint32_t2 lhsMantissaPacked = impl::packUint64(lhsMantissa); - const uint32_t2 rhsMantissaPacked = impl::packUint64(rhsMantissa); + resultMantissa = abs(resultMantissa); - uint32_t2 frac; - uint64_t signOfDifference = 0; - if (rhsMantissaPacked.x < lhsMantissaPacked.x) - { - frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); - } - else if (lhsMantissaPacked.x < rhsMantissaPacked.x) - { - frac.xy = impl::packUint64(rhsMantissa - lhsMantissa); - signOfDifference = ieee754::traits::signMask; - } - else if (rhsMantissaPacked.y <= lhsMantissaPacked.y) - { - /* It is possible that frac.x and frac.y may be zero after this. 
*/ - frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); - } - else - { - frac.xy = impl::packUint64(rhsMantissa - lhsMantissa); - signOfDifference = ieee754::traits::signMask; - } + if (resultMantissa & 1ull << 53) + { + ++resultBiasedExp; + resultMantissa >>= 1; + } - biasedExp = glsl::mix(rhsBiasedExp, lhsBiasedExp, signOfDifference == 0u); - lhsSign ^= signOfDifference; - uint64_t retval_0 = impl::packFloat64(uint32_t(FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) << 31, 0, 0u, 0u); - uint64_t retval_1 = impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 11, frac.x, frac.y); - return bit_cast >(glsl::mix(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); + // TODO: better implementation with no loop + while (resultMantissa < (1ull << 52)) + { + --resultBiasedExp; + resultMantissa <<= 1; } + + resultMantissa &= ieee754::traits::mantissaMask; + uint64_t output = impl::assembleFloat64(resultSign, uint64_t(resultBiasedExp) << ieee754::traits::mantissaBitCnt, abs(resultMantissa)); + return bit_cast(output); } - else - { - //static_assert(false, "not implemented yet"); - return bit_cast >(0xdeadbeefbadcaffeull); - } + + // not implemented + if (!FlushDenormToZero) + return bit_cast(0xdeadbeefbadcaffeull); } emulated_float64_t operator+(float rhs) { - return bit_cast >(data) + create(rhs); + return bit_cast(data) + create(rhs); } emulated_float64_t operator-(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { - emulated_float64_t lhs = bit_cast >(data); + emulated_float64_t lhs = bit_cast(data); emulated_float64_t rhsFlipped = rhs.flipSign(); return lhs + rhsFlipped; @@ -351,14 +186,14 @@ namespace hlsl emulated_float64_t operator-(float rhs) NBL_CONST_MEMBER_FUNC { - return bit_cast >(data) - create(rhs); + return bit_cast(data) - create(rhs); } emulated_float64_t operator*(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { if(FlushDenormToZero) { - emulated_float64_t retval = emulated_float64_t::create(0ull); + emulated_float64_t retval = this_t::create(0ull); uint64_t lhsSign = data & ieee754::traits::signMask; uint64_t rhsSign = rhs.data & ieee754::traits::signMask; @@ -374,11 +209,11 @@ namespace hlsl if (lhsBiasedExp == ieee754::traits::specialValueExp) { if ((lhsMantissa != 0u) || ((rhsBiasedExp == ieee754::traits::specialValueExp) && (rhsMantissa != 0u))) - return bit_cast >(impl::propagateFloat64NaN(data, rhs.data)); + return bit_cast(impl::propagateFloat64NaN(data, rhs.data)); if ((uint64_t(rhsBiasedExp) | rhsMantissa) == 0u) - return bit_cast >(ieee754::traits::quietNaN); + return bit_cast(ieee754::traits::quietNaN); - return bit_cast >(impl::assembleFloat64(sign, ieee754::traits::exponentMask, 0ull)); + return bit_cast(impl::assembleFloat64(sign, ieee754::traits::exponentMask, 0ull)); } if (rhsBiasedExp == ieee754::traits::specialValueExp) { @@ -387,23 +222,23 @@ namespace hlsl #ifdef RELAXED_NAN_PROPAGATION return rhs.data; #else - return bit_cast >(impl::propagateFloat64NaN(data, rhs.data)); + return bit_cast(impl::propagateFloat64NaN(data, rhs.data)); #endif if ((uint64_t(lhsBiasedExp) | lhsMantissa) == 0u) - return bit_cast >(ieee754::traits::quietNaN); + return bit_cast(ieee754::traits::quietNaN); - return bit_cast >(sign | ieee754::traits::exponentMask); + return bit_cast(sign | ieee754::traits::exponentMask); } if (lhsBiasedExp == 0) { if (lhsMantissa == 0u) - return bit_cast >(sign); + return bit_cast(sign); impl::normalizeFloat64Subnormal(lhsMantissa, lhsBiasedExp, lhsMantissa); } if (rhsBiasedExp == 0) { if (rhsMantissa == 0u) - return bit_cast >(sign); + return 
bit_cast(sign); impl::normalizeFloat64Subnormal(rhsMantissa, rhsBiasedExp, rhsMantissa); } } @@ -423,24 +258,24 @@ namespace hlsl } newPseudoMantissa &= (ieee754::traits::mantissaMask); - return bit_cast >(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, newPseudoMantissa)); + return bit_cast(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, newPseudoMantissa)); } else { //static_assert(false, "not implemented yet"); - return bit_cast >(0xdeadbeefbadcaffeull); + return bit_cast(0xdeadbeefbadcaffeull); } } emulated_float64_t operator*(float rhs) { - return bit_cast >(data) * create(rhs); + return bit_cast(data) * create(rhs); } - /*emulated_float64_t reciprocal(uint64_t x) + /*this_t reciprocal(uint64_t x) { - using ThisType = emulated_float64_t; - ThisType output = ThisType::bit_cast >((0xbfcdd6a18f6a6f52ULL - x) >> 1); + using ThisType = this_t; + ThisType output = ThisType::bit_cast((0xbfcdd6a18f6a6f52ULL - x) >> 1); output = output * output; return output; }*/ @@ -449,23 +284,23 @@ namespace hlsl { if (FlushDenormToZero) { - //return emulated_float64_t::bit_cast >(data) * reciprocal(rhs.data); + //return this_t::bit_cast(data) * reciprocal(rhs.data); if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) - return bit_cast >(ieee754::traits::quietNaN); + return bit_cast(ieee754::traits::quietNaN); if (!FastMath && ((rhs.data << 1) == 0)) - return bit_cast >(ieee754::traits::quietNaN); + return bit_cast(ieee754::traits::quietNaN); const uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; if (!FastMath && impl::areBothInfinity(data, rhs.data)) - return bit_cast >(ieee754::traits::quietNaN | sign); + return bit_cast(ieee754::traits::quietNaN | sign); if (!FastMath && tgmath::isInf(data)) - return bit_cast >((data & ~ieee754::traits::signMask) | sign); + return bit_cast((data & ~ieee754::traits::signMask) | sign); if (!FastMath && tgmath::isInf(rhs.data)) - return bit_cast >(0ull | sign); + return bit_cast(0ull | sign); @@ -485,18 +320,18 @@ namespace hlsl mantissa &= ieee754::traits::mantissaMask; - return bit_cast >(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, mantissa)); + return bit_cast(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, mantissa)); } else { //static_assert(false, "not implemented yet"); - return bit_cast >(0xdeadbeefbadcaffeull); + return bit_cast(0xdeadbeefbadcaffeull); } } // relational operators // TODO: should `FlushDenormToZero` affect relational operators? 
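// A note on the ordering trick used by operator< and operator> below: the raw bit
// patterns are remapped so that an unsigned comparison matches IEEE-754 ordering. Each
// negative operand has its non-sign bits inverted (XOR with 0x7FFFFFFFFFFFFFFF) and
// every operand has its sign bit flipped, after which comparing only the differing
// bits gives the correct result. This presumes NaNs, same-signed infinities and the
// +0/-0 pair were already filtered out by the early returns guarded by !FastMath.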
- bool operator==(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC + bool operator==(this_t rhs) NBL_CONST_MEMBER_FUNC { if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return false; @@ -504,7 +339,7 @@ namespace hlsl if (!FastMath && impl::areBothZero(data, rhs.data)) return true; - const emulated_float64_t xored = bit_cast >(data ^ rhs.data); + const emulated_float64_t xored = bit_cast(data ^ rhs.data); // TODO: check what fast math returns for -0 == 0 if ((xored.data & 0x7FFFFFFFFFFFFFFFull) == 0ull) return true; @@ -513,7 +348,7 @@ namespace hlsl } bool operator!=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { - return !(bit_cast >(data) == rhs); + return !(bit_cast(data) == rhs); } bool operator<(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { @@ -555,8 +390,8 @@ namespace hlsl return (lhsFlipped & diffBits) > (rhsFlipped & diffBits); } - bool operator<=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { return !(bit_cast >(data) > bit_cast >(rhs.data)); } - bool operator>=(emulated_float64_t rhs) { return !(bit_cast >(data) < bit_cast >(rhs.data)); } + bool operator<=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { return !(bit_cast(data) > bit_cast(rhs.data)); } + bool operator>=(emulated_float64_t rhs) { return !(bit_cast(data) < bit_cast(rhs.data)); } //logical operators bool operator&&(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { return bool(data) && bool(rhs.data); } @@ -565,7 +400,7 @@ namespace hlsl emulated_float64_t flipSign() { - return bit_cast >(data ^ ieee754::traits::signMask); + return bit_cast(data ^ ieee754::traits::signMask); } bool isNaN() @@ -615,6 +450,13 @@ NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size::type ex {\ return extractMantissa(x.data);\ }\ +\ +template <>\ +NBL_CONSTEXPR_INLINE_FUNC uint64_t extractNormalizeMantissa(__VA_ARGS__ x)\ +{\ + return extractNormalizeMantissa(x.data);\ +}\ +\ #define DEFINE_BIT_CAST_SPEC(...)\ template<>\ diff --git a/include/nbl/builtin/hlsl/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754.hlsl index e08831e2ce..d39526e04b 100644 --- a/include/nbl/builtin/hlsl/ieee754.hlsl +++ b/include/nbl/builtin/hlsl/ieee754.hlsl @@ -147,6 +147,14 @@ NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type ext return impl::castToUintType(x) & traits::type>::mantissaMask; } +template +NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type extractNormalizeMantissa(T x) +{ + using AsUint = typename unsigned_integer_of_size::type; + using AsFloat = typename float_of_size::type; + return extractMantissa(x) | (AsUint(1) << traits::mantissaBitCnt); +} + template NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type extractSign(T x) { diff --git a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl index 92df407530..d86d5c6469 100644 --- a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl @@ -274,6 +274,9 @@ NBL_CONSTEXPR_INLINE_FUNC uint64_t roundAndPackFloat64(uint64_t zSign, int zExp, } } } + + // overflow handling? + // if biased exp is lesser then 2045 if (0x7FD <= zExp) { if ((0x7FD < zExp) || ((zExp == 0x7FD) && (0x001FFFFFu == mantissaExtended.x && 0xFFFFFFFFu == mantissaExtended.y) && increment)) @@ -319,6 +322,7 @@ NBL_CONSTEXPR_INLINE_FUNC uint64_t roundAndPackFloat64(uint64_t zSign, int zExp, } else { + // ?? 
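// Presumably this forces the biased exponent to zero whenever the rounded mantissa
// came out as all zero bits, so the packed result is a signed zero rather than a
// stray denormal encoding.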
zExp = glsl::mix(zExp, 0, (mantissaExtended.x | mantissaExtended.y) == 0u); } From aff0ca44d8e22c46d0058170af2ebe5585d3dcb7 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Tue, 27 Aug 2024 23:15:07 +0100 Subject: [PATCH 040/358] Revert unintended changes --- include/nbl/builtin/hlsl/glsl_compat/core.hlsl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl index bcc0c76bdb..88eed8202a 100644 --- a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl +++ b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl @@ -44,6 +44,7 @@ genIUType bitfieldInsert(genIUType const& Base, genIUType const& Insert, int Off // Fun fact: ideally atomics should detect the address space of `ptr` and narrow down the sync-scope properly // https://github.com/microsoft/DirectXShaderCompiler/issues/6508 // Would need own meta-type/tagged-type to implement, without & and fancy operator overloads... not posssible +// TODO: we can template on `StorageClass` instead of Ptr_T then resolve the memory scope and semantics properly template T atomicAdd(NBL_REF_ARG(T) ptr, T value) { @@ -120,9 +121,9 @@ T atomicCompSwap(NBL_REF_ARG(T) ptr, T comparator, T value) return spirv::atomicCompareExchange(ptr, spv::ScopeDevice, spv::MemorySemanticsMaskNone, spv::MemorySemanticsMaskNone, value, comparator); } template // DXC Workaround -enable_if_t, T> atomicCompSwap(Ptr_T ptr, T value) +enable_if_t, T> atomicCompSwap(Ptr_T ptr, T comparator, T value) { - return spirv::atomicCompareExchange(ptr, spv::ScopeDevice, spv::MemorySemanticsMaskNone, value); + return spirv::atomicCompareExchange(ptr, spv::ScopeDevice, spv::MemorySemanticsMaskNone, spv::MemorySemanticsMaskNone, value, comparator); } /** @@ -205,7 +206,7 @@ struct bitfieldExtract } }; -} +} //namespace impl template T bitfieldExtract( T val, uint32_t offsetBits, uint32_t numBits ) From ede8b0cd415ce72c9faca37b6de81a39913be305 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Wed, 28 Aug 2024 21:44:28 +0100 Subject: [PATCH 041/358] Fixes --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 221 ++++++++++++++++-- .../hlsl/impl/emulated_float64_t_impl.hlsl | 53 +++-- include/nbl/builtin/hlsl/tgmath.hlsl | 4 +- 4 files changed, 242 insertions(+), 38 deletions(-) diff --git a/examples_tests b/examples_tests index 912f012bfb..da25126a01 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 912f012bfbaa244309a57d1707c9438db5cff56a +Subproject commit da25126a01b92ff244bc143852685947c98c0721 diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index 8030e9d221..de737f8a1c 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -49,7 +49,7 @@ namespace hlsl NBL_CONSTEXPR_STATIC_INLINE this_t create(float32_t val) { this_t output; - output.data = impl::castToUint64WithFloat64BitPattern(val); + output.data = impl::castFloat32ToStorageType(val); return output; } @@ -96,30 +96,191 @@ namespace hlsl } + // TODO: remove + emulated_float64_t addOld(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC + { + if (FlushDenormToZero) + { + emulated_float64_t retval = emulated_float64_t::create(0ull); + + uint64_t mantissa; + uint32_t3 mantissaExtended; + int biasedExp; + + uint64_t lhsSign = data & ieee754::traits::signMask; + uint64_t rhsSign = rhs.data & ieee754::traits::signMask; + uint64_t lhsMantissa = 
ieee754::extractMantissa(data); + uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); + int lhsBiasedExp = ieee754::extractBiasedExponent(data); + int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); + + int expDiff = lhsBiasedExp - rhsBiasedExp; + + if (lhsSign == rhsSign) + { + if (expDiff == 0) + { + if (lhsBiasedExp == ieee754::traits::specialValueExp) + { + bool propagate = (lhsMantissa | rhsMantissa) != 0u; + return bit_cast >(glsl::mix(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); + } + + mantissa = lhsMantissa + rhsMantissa; + if (lhsBiasedExp == 0) + return bit_cast >(impl::assembleFloat64(lhsSign, 0, mantissa)); + mantissaExtended.xy = impl::packUint64(mantissa); + mantissaExtended.x |= 0x00200000u; + mantissaExtended.z = 0u; + biasedExp = lhsBiasedExp; + + mantissaExtended = impl::shift64ExtraRightJamming(mantissaExtended, 1); + } + else + { + if (expDiff < 0) + { + swap(lhsMantissa, rhsMantissa); + swap(lhsBiasedExp, rhsBiasedExp); + } + + if (lhsBiasedExp == ieee754::traits::specialValueExp) + { + const bool propagate = (lhsMantissa) != 0u; + return bit_cast >(glsl::mix(ieee754::traits::exponentMask | lhsSign, impl::propagateFloat64NaN(data, rhs.data), propagate)); + } + + expDiff = glsl::mix(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); + rhsMantissa = glsl::mix(rhsMantissa | (1ull << 52), rhsMantissa, rhsBiasedExp == 0); + const uint32_t3 shifted = impl::shift64ExtraRightJamming(uint32_t3(impl::packUint64(rhsMantissa), 0u), expDiff); + rhsMantissa = impl::unpackUint64(shifted.xy); + mantissaExtended.z = shifted.z; + biasedExp = lhsBiasedExp; + + lhsMantissa |= (1ull << 52); + mantissaExtended.xy = impl::packUint64(lhsMantissa + rhsMantissa); + --biasedExp; + if (!(mantissaExtended.x < 0x00200000u)) + { + mantissaExtended = impl::shift64ExtraRightJamming(mantissaExtended, 1); + ++biasedExp; + } + + return bit_cast >(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended.xyz)); + } + + // cannot happen but compiler cries about not every path returning value + return bit_cast >(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended)); + } + else + { + lhsMantissa = impl::shortShift64Left(lhsMantissa, 10); + rhsMantissa = impl::shortShift64Left(rhsMantissa, 10); + + if (expDiff != 0) + { + uint32_t2 frac; + + if (expDiff < 0) + { + swap(lhsMantissa, rhsMantissa); + swap(lhsBiasedExp, rhsBiasedExp); + lhsSign ^= ieee754::traits::signMask; + } + + if (lhsBiasedExp == ieee754::traits::specialValueExp) + { + bool propagate = lhsMantissa != 0u; + return bit_cast >(glsl::mix(impl::assembleFloat64(lhsSign, ieee754::traits::exponentMask, 0ull), impl::propagateFloat64NaN(data, rhs.data), propagate)); + } + + expDiff = glsl::mix(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); + rhsMantissa = glsl::mix(rhsMantissa | 0x4000000000000000ull, rhsMantissa, rhsBiasedExp == 0); + rhsMantissa = impl::unpackUint64(impl::shift64RightJamming(impl::packUint64(rhsMantissa), expDiff)); + lhsMantissa |= 0x4000000000000000ull; + frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); + biasedExp = lhsBiasedExp; + --biasedExp; + return bit_cast >(impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 10, frac.x, frac.y)); + } + if (lhsBiasedExp == ieee754::traits::specialValueExp) + { + bool propagate = ((lhsMantissa) | (rhsMantissa)) != 0u; + return bit_cast >(glsl::mix(ieee754::traits::quietNaN, impl::propagateFloat64NaN(data, rhs.data), propagate)); + } + rhsBiasedExp = glsl::mix(rhsBiasedExp, 1, lhsBiasedExp == 0); + lhsBiasedExp = 
glsl::mix(lhsBiasedExp, 1, lhsBiasedExp == 0); + + + const uint32_t2 lhsMantissaPacked = impl::packUint64(lhsMantissa); + const uint32_t2 rhsMantissaPacked = impl::packUint64(rhsMantissa); + + uint32_t2 frac; + uint64_t signOfDifference = 0; + if (rhsMantissaPacked.x < lhsMantissaPacked.x) + { + frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); + } + else if (lhsMantissaPacked.x < rhsMantissaPacked.x) + { + frac.xy = impl::packUint64(rhsMantissa - lhsMantissa); + signOfDifference = ieee754::traits::signMask; + } + else if (rhsMantissaPacked.y <= lhsMantissaPacked.y) + { + /* It is possible that frac.x and frac.y may be zero after this. */ + frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); + } + else + { + frac.xy = impl::packUint64(rhsMantissa - lhsMantissa); + signOfDifference = ieee754::traits::signMask; + } + + biasedExp = glsl::mix(rhsBiasedExp, lhsBiasedExp, signOfDifference == 0u); + lhsSign ^= signOfDifference; + uint64_t retval_0 = impl::packFloat64(uint32_t(FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) << 31, 0, 0u, 0u); + uint64_t retval_1 = impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 11, frac.x, frac.y); + return bit_cast >(glsl::mix(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); + } + } + else + { + //static_assert(false, "not implemented yet"); + return bit_cast >(0xdeadbeefbadcaffeull); + } + } + // arithmetic operators - emulated_float64_t operator+(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC + this_t operator+(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { + return addOld(rhs); + if (FlushDenormToZero) { if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return bit_cast(ieee754::traits::quietNaN); - this_t retval = this_t::create(0ull); + int lhsBiasedExp = ieee754::extractBiasedExponent(data); + int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); - uint64_t mantissa; - uint32_t3 mantissaExtended; - int biasedExp; + if (lhsBiasedExp == 0ull) + return bit_cast(rhs.data); + if (rhsBiasedExp == 0ull) + return bit_cast(data); uint64_t lhsSign = ieee754::extractSign(data); uint64_t rhsSign = ieee754::extractSign(rhs.data); - // TODO: delete 2 below if not needed - uint64_t lhsMantissa = ieee754::extractMantissa(data); - uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); + if (lhsSign != rhsSign) + return addOld(rhs); int64_t lhsNormMantissa = int64_t(ieee754::extractNormalizeMantissa(data)); int64_t rhsNormMantissa = int64_t(ieee754::extractNormalizeMantissa(rhs.data)); + lhsNormMantissa <<= 9; + rhsNormMantissa <<= 9; + // TODO: branchless? 
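// The normalized mantissas were widened and shifted left by 9 above, apparently so a
// few guard bits survive the exponent-alignment shift further down; when the operand
// signs differ the negative side is negated just below, letting a single signed 64-bit
// add cover both the addition and the cancellation case before the result is shifted
// back right by 9.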
if (lhsSign != rhsSign) { @@ -129,9 +290,6 @@ namespace hlsl rhsNormMantissa *= -1; } - int lhsBiasedExp = ieee754::extractBiasedExponent(data); - int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); - int expDiff = lhsBiasedExp - rhsBiasedExp; int exp = max(lhsBiasedExp, rhsBiasedExp) - ieee754::traits::exponentBias; @@ -142,7 +300,11 @@ namespace hlsl swap(lhsNormMantissa, rhsNormMantissa); rhsNormMantissa >>= shiftAmount; + int64_t resultMantissa = lhsNormMantissa + rhsNormMantissa; + + resultMantissa >>= 9; + const uint64_t resultSign = uint64_t((lhsSign && rhsSign) || (bit_cast(resultMantissa) & (lhsSign << 63))) << 63; uint64_t resultBiasedExp = uint64_t(exp) + ieee754::traits::exponentBias; @@ -287,21 +449,21 @@ namespace hlsl //return this_t::bit_cast(data) * reciprocal(rhs.data); if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) - return bit_cast(ieee754::traits::quietNaN); - if (!FastMath && ((rhs.data << 1) == 0)) - return bit_cast(ieee754::traits::quietNaN); + return bit_cast(ieee754::traits::quietNaN); const uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; + if (!FastMath && impl::isZero(rhs.data)) + return bit_cast(ieee754::traits::inf | sign); + if (!FastMath && impl::areBothInfinity(data, rhs.data)) - return bit_cast(ieee754::traits::quietNaN | sign); + return bit_cast(ieee754::traits::quietNaN); if (!FastMath && tgmath::isInf(data)) - return bit_cast((data & ~ieee754::traits::signMask) | sign); + return bit_cast(ieee754::traits::inf | sign); if (!FastMath && tgmath::isInf(rhs.data)) - return bit_cast(0ull | sign); - + return bit_cast(0ull | sign); const uint64_t lhsRealMantissa = (ieee754::extractMantissa(data) | (1ull << ieee754::traits::mantissaBitCnt)); @@ -348,6 +510,9 @@ namespace hlsl } bool operator!=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { + if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) + return false; + return !(bit_cast(data) == rhs); } bool operator<(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC @@ -373,7 +538,7 @@ namespace hlsl bool operator>(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) - return true; + return false; if (!FastMath && impl::areBothSameSignInfinity(data, rhs.data)) return false; if (!FastMath && impl::areBothZero(data, rhs.data)) @@ -390,8 +555,20 @@ namespace hlsl return (lhsFlipped & diffBits) > (rhsFlipped & diffBits); } - bool operator<=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { return !(bit_cast(data) > bit_cast(rhs.data)); } - bool operator>=(emulated_float64_t rhs) { return !(bit_cast(data) < bit_cast(rhs.data)); } + bool operator<=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC + { + if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) + return false; + + return !(bit_cast(data) > bit_cast(rhs.data)); + } + bool operator>=(emulated_float64_t rhs) + { + if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) + return false; + + return !(bit_cast(data) < bit_cast(rhs.data)); + } //logical operators bool operator&&(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { return bool(data) && bool(rhs.data); } diff --git a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl index d86d5c6469..3045df49d4 100644 --- a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl @@ -80,16 +80,28 @@ NBL_CONSTEXPR_INLINE_FUNC uint64_t unpackUint64(uint32_t2 val) 
return ((uint64_t(val.x) & 0x00000000FFFFFFFFull) << 32) | uint64_t(val.y); } -inline uint64_t castToUint64WithFloat64BitPattern(float32_t val) +template +inline uint64_t castFloat32ToStorageType(float32_t val) { - uint32_t asUint = ieee754::impl::castToUintType(val); - - const uint64_t sign = (uint64_t(ieee754::traits::signMask) & asUint) << (sizeof(float32_t) * 8); - - const uint64_t biasedExp = (uint64_t(ieee754::extractExponent(val)) + ieee754::traits::exponentBias) << (ieee754::traits::mantissaBitCnt); - const uint64_t mantissa = (uint64_t(ieee754::traits::mantissaMask) & asUint) << (ieee754::traits::mantissaBitCnt - ieee754::traits::mantissaBitCnt); - - return sign | biasedExp | mantissa; + if (FlushDenormToZero) + { + const uint64_t sign = uint64_t(ieee754::extractSign(val)) << 63; + if (tgmath::isInf(val)) + return ieee754::traits::inf | sign; + uint32_t asUint = ieee754::impl::castToUintType(val); + const int f32BiasedExp = ieee754::extractBiasedExponent(val); + if (f32BiasedExp == 0) + return sign; + const uint64_t biasedExp = uint64_t(f32BiasedExp - ieee754::traits::exponentBias + ieee754::traits::exponentBias) << (ieee754::traits::mantissaBitCnt); + const uint64_t mantissa = (uint64_t(ieee754::traits::mantissaMask) & asUint) << (ieee754::traits::mantissaBitCnt - ieee754::traits::mantissaBitCnt); + + return sign | biasedExp | mantissa; + } + else + { + // static_assert(false); + return 0xdeadbeefbadcaffeull; + } }; inline uint64_t castToUint64WithFloat64BitPattern(uint64_t val) @@ -100,8 +112,18 @@ inline uint64_t castToUint64WithFloat64BitPattern(uint64_t val) #ifndef __HLSL_VERSION int exp = findMSB(val); #else - uint32_t2 valPacked = packUint64(val); - int exp = valPacked.x ? firstbithigh(valPacked.x) + 32 : firstbithigh(valPacked.y); + int exp = 63; + uint64_t mask = ieee754::traits::signMask; + while (!(val & mask)) + { + --exp; + mask >>= 1; + } + + + //uint32_t2 valPacked = packUint64(val); + //int exp = valPacked.x ? 
firstbithigh(valPacked.x) + 32 : firstbithigh(valPacked.y); + //exp = 63 - exp; #endif uint64_t mantissa; @@ -128,8 +150,6 @@ inline uint64_t castToUint64WithFloat64BitPattern(uint64_t val) else if ((val & roundingBit) && (stickyBit || (mantissa & 1))) val += roundingBit; - - //val += (1ull << (shiftCnt)) - 1; //mantissa = val >> shiftCntAbs; @@ -148,7 +168,7 @@ inline uint64_t castToUint64WithFloat64BitPattern(uint64_t val) inline uint64_t castToUint64WithFloat64BitPattern(int64_t val) { const uint64_t sign = val & ieee754::traits::signMask; - const uint64_t absVal = abs(val); + const uint64_t absVal = uint64_t(abs(val)); return sign | castToUint64WithFloat64BitPattern(absVal); }; @@ -386,6 +406,11 @@ NBL_CONSTEXPR_INLINE_FUNC bool areBothSameSignInfinity(uint64_t lhs, uint64_t rh return lhs == rhs && (lhs & ~ieee754::traits::signMask) == ieee754::traits::inf; } +NBL_CONSTEXPR_INLINE_FUNC bool isZero(uint64_t val) +{ + return (val << 1) == 0; +} + NBL_CONSTEXPR_INLINE_FUNC bool areBothZero(uint64_t lhs, uint64_t rhs) { return ((lhs << 1) == 0ull) && ((rhs << 1) == 0ull); diff --git a/include/nbl/builtin/hlsl/tgmath.hlsl b/include/nbl/builtin/hlsl/tgmath.hlsl index 4a35e10a07..bd1e1f9e48 100644 --- a/include/nbl/builtin/hlsl/tgmath.hlsl +++ b/include/nbl/builtin/hlsl/tgmath.hlsl @@ -30,8 +30,10 @@ template NBL_CONSTEXPR_INLINE_FUNC bool isInf(T val) { using AsUint = typename unsigned_integer_of_size::type; + using AsFloat = typename float_of_size::type; + AsUint tmp = bit_cast(val); - return (tmp & ~ieee754::traits::signMask) == ieee754::traits::inf; + return (tmp & ~ieee754::traits::signMask) == ieee754::traits::inf; } } From 002581aba619a7a07b9bdbfd17bde230dc710b89 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Wed, 28 Aug 2024 23:24:37 +0100 Subject: [PATCH 042/358] Fixed shaders so they compile even if portable_float64_t is not emulated --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 8 +--- .../hlsl/emulated_float64_t_utils.hlsl | 42 +++++++++++++++---- include/nbl/builtin/hlsl/ieee754.hlsl | 7 +++- include/nbl/builtin/hlsl/tgmath.hlsl | 2 +- 5 files changed, 42 insertions(+), 19 deletions(-) diff --git a/examples_tests b/examples_tests index da25126a01..70ee941a76 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit da25126a01b92ff244bc143852685947c98c0721 +Subproject commit 70ee941a7658f590c7ea4d3a00ce207eeec12b9d diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index de737f8a1c..914e5df43b 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -55,7 +55,6 @@ namespace hlsl NBL_CONSTEXPR_STATIC_INLINE this_t create(float64_t val) { - return bit_cast(bit_cast(float64_t(val))); #ifdef __HLSL_VERSION emulated_float64_t retval; uint32_t lo, hi; @@ -272,9 +271,6 @@ namespace hlsl uint64_t lhsSign = ieee754::extractSign(data); uint64_t rhsSign = ieee754::extractSign(rhs.data); - if (lhsSign != rhsSign) - return addOld(rhs); - int64_t lhsNormMantissa = int64_t(ieee754::extractNormalizeMantissa(data)); int64_t rhsNormMantissa = int64_t(ieee754::extractNormalizeMantissa(rhs.data)); @@ -726,8 +722,8 @@ DEFINE_BIT_CAST_SPEC(emulated_float64_t); DEFINE_BIT_CAST_SPEC(emulated_float64_t); DEFINE_BIT_CAST_SPEC(emulated_float64_t); -template -struct is_floating_point > : bool_constant {}; +//template +//struct is_floating_point > : bool_constant {}; namespace ieee754 { diff --git 
a/include/nbl/builtin/hlsl/emulated_float64_t_utils.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t_utils.hlsl index 7e16081b31..133c9842bd 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t_utils.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t_utils.hlsl @@ -294,6 +294,11 @@ struct emulated_matrix // : emulated_matrix_base @@ -363,14 +368,12 @@ using portable_matrix_t3x3 = portable_matrix_t; using portable_matrix64_t2x2 = portable_matrix_t2x2 >; using portable_matrix64_t3x3 = portable_matrix_t3x3 >; + +// TODO: fix template NBL_CONSTEXPR_INLINE_FUNC portable_float64_t<> create_portable_float64_t(T val) { - //return impl::portable_float64_t_creator::create(val); - if (impl::is_emulated >::value) - return portable_float64_t<>::create(val); - else - return portable_float64_t<>(val); + return _static_cast >(val); } template @@ -436,13 +439,34 @@ NBL_CONSTEXPR_INLINE_FUNC portable_vector64_t3 create_portable_vector64_t2_from_ return output; } -template >::value> -inline float32_t convert_portable_float64_t_to_float(portable_float64_t<> val) +namespace impl { - - return float32_t(val); + template + struct PortableMul64Helper + { + static inline V multiply(M mat, V vec) + { + return mat * vec; + } + }; + + template + struct PortableMul64Helper + { + static inline V multiply(M mat, V vec) + { + return mul(mat, vec); + } + }; } +template +V portableMul64(M mat, V vec) +{ + return PortableMul64Helper >::multiply(mat, vec); +} + + } } #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754.hlsl index d39526e04b..704e8b025c 100644 --- a/include/nbl/builtin/hlsl/ieee754.hlsl +++ b/include/nbl/builtin/hlsl/ieee754.hlsl @@ -51,6 +51,7 @@ namespace impl template struct traits_base { + static_assert(is_same::value || is_same::value); NBL_CONSTEXPR_STATIC_INLINE int16_t exponentBitCnt = int16_t(0xbeef); NBL_CONSTEXPR_STATIC_INLINE int16_t mantissaBitCnt = int16_t(0xbeef); }; @@ -158,13 +159,15 @@ NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type ext template NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type extractSign(T x) { - return (impl::castToUintType(x) & traits::signMask) >> ((sizeof(T) * 8) - 1); + using AsFloat = typename float_of_size::type; + return (impl::castToUintType(x) & traits::signMask) >> ((sizeof(T) * 8) - 1); } template NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type extractSignPreserveBitPattern(T x) { - return impl::castToUintType(x) & traits::signMask; + using AsFloat = typename float_of_size::type; + return impl::castToUintType(x) & traits::signMask; } } diff --git a/include/nbl/builtin/hlsl/tgmath.hlsl b/include/nbl/builtin/hlsl/tgmath.hlsl index bd1e1f9e48..75cddf27c0 100644 --- a/include/nbl/builtin/hlsl/tgmath.hlsl +++ b/include/nbl/builtin/hlsl/tgmath.hlsl @@ -33,7 +33,7 @@ NBL_CONSTEXPR_INLINE_FUNC bool isInf(T val) using AsFloat = typename float_of_size::type; AsUint tmp = bit_cast(val); - return (tmp & ~ieee754::traits::signMask) == ieee754::traits::inf; + return (tmp & ~ieee754::traits::signMask) == ieee754::traits::inf; } } From 9b8e61e2c356978a31eb6f5f3a31ade47518b646 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Thu, 29 Aug 2024 00:57:21 +0100 Subject: [PATCH 043/358] Saving work --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 103 ++++++++++++------ .../hlsl/emulated_float64_t_utils.hlsl | 40 +++---- 3 files changed, 85 insertions(+), 60 deletions(-) diff --git a/examples_tests b/examples_tests index 
70ee941a76..c08cd35f85 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 70ee941a7658f590c7ea4d3a00ce207eeec12b9d +Subproject commit c08cd35f85ee2bf99110905e2072e239ca44470c diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index 914e5df43b..b1bbe88280 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -72,29 +72,6 @@ namespace hlsl return emulated_float64_t(bit_cast(float64_t(val))); }*/ - inline float getAsFloat32() - { - int exponent = ieee754::extractExponent(data); - if (!FastMath) - { - if (exponent > 127) - return bit_cast(ieee754::traits::inf); - if (exponent < -126) - return -bit_cast(ieee754::traits::inf); - if (tgmath::isnan(data)) - return bit_cast(ieee754::traits::quietNaN); - } - - //return float(bit_cast(data)); - // TODO: fix - uint32_t sign = uint32_t((data & ieee754::traits::signMask) >> 32); - uint32_t biasedExponent = uint32_t(exponent + ieee754::traits::exponentBias) << ieee754::traits::mantissaBitCnt; - uint32_t mantissa = uint32_t(data >> (ieee754::traits::mantissaBitCnt - ieee754::traits::mantissaBitCnt)) & ieee754::traits::mantissaMask; - - return bit_cast(sign | biasedExponent | mantissa); - - } - // TODO: remove emulated_float64_t addOld(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { @@ -656,30 +633,34 @@ struct static_cast_helper,void // TODO: test static inline To cast(From v) { + using ToAsFloat = typename float_of_size::type; + using ToAsUint = typename unsigned_integer_of_size::type; + + if (is_same_v) return To(bit_cast(v.data)); if (is_floating_point::value) { + const int exponent = ieee754::extractExponent(v.data); if (!From::supportsFastMath()) { - if (exponent > ieee754::traits::exponentMax) - return bit_cast(ieee754::traits::inf); - if (exponent < ieee754::traits::exponentMin) - return -bit_cast(ieee754::traits::inf); + if (exponent > ieee754::traits::exponentMax) + return bit_cast(ieee754::traits::inf); + if (exponent < ieee754::traits::exponentMin) + return -bit_cast(ieee754::traits::inf); if (tgmath::isnan(v.data)) - return bit_cast(ieee754::traits::quietNaN); + return bit_cast(ieee754::traits::quietNaN); } - using AsUint = typename unsigned_integer_of_size::type; const uint32_t toBitSize = sizeof(To) * 8; - const AsUint sign = AsUint(ieee754::extractSign(v.data) << (toBitSize - 1)); - const AsUint biasedExponent = AsUint(exponent + ieee754::traits::exponentBias) << ieee754::traits::mantissaBitCnt; - const AsUint mantissa = AsUint(v.data >> (ieee754::traits::mantissaBitCnt - ieee754::traits::mantissaBitCnt)) & ieee754::traits::mantissaMask; + const ToAsUint sign = ToAsUint(ieee754::extractSign(v.data) << (toBitSize - 1)); + const ToAsUint biasedExponent = ToAsUint(exponent + ieee754::traits::exponentBias) << ieee754::traits::mantissaBitCnt; + const ToAsUint mantissa = ToAsUint(v.data >> (ieee754::traits::mantissaBitCnt - ieee754::traits::mantissaBitCnt)) & ieee754::traits::mantissaMask; - return bit_cast(sign | biasedExponent | mantissa); + return bit_cast(sign | biasedExponent | mantissa); } // NOTE: casting from negative float to unsigned int is an UB, function will return abs value in this case @@ -711,7 +692,60 @@ struct static_cast_helper,void } // assert(false); - return 0xdeadbeefbadcaffeull; + return To(0xdeadbeefbadcaffeull); + } +}; + +template +struct static_cast_helper, float32_t, void> +{ + using To = emulated_float64_t; + + static inline To cast(float32_t v) + { + 
return To::create(v); + } +}; + +template +struct static_cast_helper, float64_t, void> +{ + using To = emulated_float64_t; + + static inline To cast(float64_t v) + { + return To::create(v); + } +}; + +template +struct static_cast_helper, uint32_t, void> +{ + using To = emulated_float64_t; + + static inline To cast(uint32_t v) + { + return To::create(v); + } +}; + +template +struct static_cast_helper, uint64_t, void> +{ + using To = emulated_float64_t; + + static inline To cast(uint64_t v) + { + return To::create(v); + } +}; + +template +struct static_cast_helper, emulated_float64_t, void> +{ + static inline emulated_float64_t cast(emulated_float64_t v) + { + return v; } }; @@ -744,6 +778,5 @@ IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t; namespace impl { -template -struct is_emulated -{ - NBL_CONSTEXPR_STATIC_INLINE bool value = is_same_v > || - is_same_v > || - is_same_v > || - is_same_v >; -}; template::value > struct portable_vector @@ -441,29 +433,29 @@ NBL_CONSTEXPR_INLINE_FUNC portable_vector64_t3 create_portable_vector64_t2_from_ namespace impl { - template - struct PortableMul64Helper +template +struct PortableMul64Helper +{ + static inline V multiply(M mat, V vec) { - static inline V multiply(M mat, V vec) - { - return mat * vec; - } - }; - - template - struct PortableMul64Helper + return mat * vec; + } +}; + +template +struct PortableMul64Helper +{ + static inline V multiply(M mat, V vec) { - static inline V multiply(M mat, V vec) - { - return mul(mat, vec); - } - }; + return mul(mat, vec); + } +}; } template V portableMul64(M mat, V vec) { - return PortableMul64Helper >::multiply(mat, vec); + return impl::PortableMul64Helper >::multiply(mat, vec); } From a2cd3956b20b9b12ea3456a3b2e8b765241a595b Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Fri, 30 Aug 2024 15:34:59 +0100 Subject: [PATCH 044/358] Found the bug --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated_float64_t.hlsl | 17 +++++++++-------- include/nbl/builtin/hlsl/ieee754.hlsl | 2 +- include/nbl/builtin/hlsl/shapes/beziers.hlsl | 6 ++++-- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/examples_tests b/examples_tests index c08cd35f85..a6b14dacdd 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit c08cd35f85ee2bf99110905e2072e239ca44470c +Subproject commit a6b14dacddc043e73191ed1a431a4341bc8b5e77 diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl index b1bbe88280..f96752ab63 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated_float64_t.hlsl @@ -404,7 +404,7 @@ namespace hlsl emulated_float64_t operator*(float rhs) { - return bit_cast(data) * create(rhs); + return _static_cast(data) * create(rhs); } /*this_t reciprocal(uint64_t x) @@ -626,7 +626,7 @@ template struct static_cast_helper,void> { // TODO: - // static_assert(is_arithmetic::value); + static_assert(is_scalar::value); using From = emulated_float64_t; @@ -646,12 +646,13 @@ struct static_cast_helper,void const int exponent = ieee754::extractExponent(v.data); if (!From::supportsFastMath()) { - if (exponent > ieee754::traits::exponentMax) - return bit_cast(ieee754::traits::inf); - if (exponent < ieee754::traits::exponentMin) - return -bit_cast(ieee754::traits::inf); - if (tgmath::isnan(v.data)) - return bit_cast(ieee754::traits::quietNaN); + // TODO: i have no idea why it doesn't work, fix + //if (exponent > ieee754::traits::exponentMax) + // return bit_cast(ieee754::traits::inf); + //if 
(exponent < ieee754::traits::exponentMin) + // return -bit_cast(ieee754::traits::inf); + //if (tgmath::isnan(v.data)) + // return bit_cast(ieee754::traits::quietNaN); } diff --git a/include/nbl/builtin/hlsl/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754.hlsl index 704e8b025c..7e36501f0a 100644 --- a/include/nbl/builtin/hlsl/ieee754.hlsl +++ b/include/nbl/builtin/hlsl/ieee754.hlsl @@ -122,7 +122,7 @@ template inline int extractExponent(T x) { using AsFloat = typename float_of_size::type; - return int(extractBiasedExponent(x)) - int(traits::exponentBias); + return int(extractBiasedExponent(x)) - traits::exponentBias; } template diff --git a/include/nbl/builtin/hlsl/shapes/beziers.hlsl b/include/nbl/builtin/hlsl/shapes/beziers.hlsl index 44d79800ef..d3178a29f7 100644 --- a/include/nbl/builtin/hlsl/shapes/beziers.hlsl +++ b/include/nbl/builtin/hlsl/shapes/beziers.hlsl @@ -563,7 +563,8 @@ static math::equations::Quartic getBezierBezierIntersectionEquation(NBL portable_float64_t<> d = (B.x * C.x * k0 * 2.0f) + (B.x * C.y * k1) + (B.x * k3) + (C.x * B.y * k1) + (B.y * C.y * k2 * 2.0f) + (B.y * k4); portable_float64_t<> e = ((C.x * C.x) * k0) + (C.x * C.y * k1) + (C.x * k3) + ((C.y * C.y) * k2) + (C.y * k4) + (k5); - return math::equations::Quartic >::construct(a, b, c, d, e); + return math::equations::Quartic::construct( + _static_cast(a), _static_cast(b), _static_cast(c), _static_cast(d), _static_cast(e)); } // This function returns the analytic quadratic equation to solve for bezier's t value for intersection with another bezier curve @@ -581,7 +582,8 @@ static math::equations::Quadratic getBezierLineIntersectionEquation(Qua bezier.P1 = mul(rotate, bezier.P1 - lineStart); bezier.P2 = mul(rotate, bezier.P2 - lineStart); Quadratic quadratic = Quadratic::constructFromBezier(bezier); - return math::equations::Quadratic >::construct(quadratic.A.y, quadratic.B.y, quadratic.C.y); + return math::equations::Quadratic::construct( + _static_cast(quadratic.A.y), _static_cast(quadratic.B.y), _static_cast(quadratic.C.y)); } } // namespace shapes From 6203ea66a037729118f7215d5e88cd26b2119bb4 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Tue, 3 Sep 2024 18:08:26 +0100 Subject: [PATCH 045/358] Fixed a bug --- examples_tests | 2 +- .../{ => emulated}/emulated_float64_t.hlsl | 1571 +++++++++-------- .../emulated_float64_t_impl.hlsl | 956 +++++----- .../emulated_float64_t_utils.hlsl | 4 +- .../builtin/hlsl/{ => ieee754}/ieee754.hlsl | 20 +- .../hlsl/math/equations/quadratic.hlsl | 2 +- include/nbl/builtin/hlsl/shapes/beziers.hlsl | 4 +- include/nbl/builtin/hlsl/tgmath.hlsl | 4 +- src/nbl/builtin/CMakeLists.txt | 10 +- src/nbl/video/CVulkanPhysicalDevice.cpp | 3 +- 10 files changed, 1260 insertions(+), 1316 deletions(-) rename include/nbl/builtin/hlsl/{ => emulated}/emulated_float64_t.hlsl (83%) rename include/nbl/builtin/hlsl/{impl => emulated}/emulated_float64_t_impl.hlsl (82%) rename include/nbl/builtin/hlsl/{ => emulated}/emulated_float64_t_utils.hlsl (98%) rename include/nbl/builtin/hlsl/{ => ieee754}/ieee754.hlsl (87%) diff --git a/examples_tests b/examples_tests index a6b14dacdd..f8f2c23aab 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit a6b14dacddc043e73191ed1a431a4341bc8b5e77 +Subproject commit f8f2c23aab0092015b3bf31cdab9875c1435f5dc diff --git a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/emulated_float64_t.hlsl similarity index 83% rename from include/nbl/builtin/hlsl/emulated_float64_t.hlsl rename to 
include/nbl/builtin/hlsl/emulated/emulated_float64_t.hlsl index f96752ab63..088e535c5b 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/emulated_float64_t.hlsl @@ -1,783 +1,788 @@ -#ifndef _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_INCLUDED_ -#define _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_INCLUDED_ - -#include - -namespace nbl -{ -namespace hlsl -{ - template - struct emulated_float64_t - { - using storage_t = uint64_t; - using this_t = emulated_float64_t; - - storage_t data; - - // constructors - /*static emulated_float64_t create(uint16_t val) - { - return emulated_float64_t(bit_cast(float64_t(val))); - }*/ - - NBL_CONSTEXPR_STATIC_INLINE this_t create(this_t val) - { - return val; - } - - NBL_CONSTEXPR_STATIC_INLINE this_t create(int32_t val) - { - return bit_cast(impl::castToUint64WithFloat64BitPattern(int64_t(val))); - } - - NBL_CONSTEXPR_STATIC_INLINE this_t create(int64_t val) - { - return bit_cast(impl::castToUint64WithFloat64BitPattern(val)); - } - - NBL_CONSTEXPR_STATIC_INLINE this_t create(uint32_t val) - { - return bit_cast(impl::castToUint64WithFloat64BitPattern(uint64_t(val))); - } - - NBL_CONSTEXPR_STATIC_INLINE this_t create(uint64_t val) - { - return bit_cast(impl::castToUint64WithFloat64BitPattern(val)); - } - - NBL_CONSTEXPR_STATIC_INLINE this_t create(float32_t val) - { - this_t output; - output.data = impl::castFloat32ToStorageType(val); - return output; - } - - NBL_CONSTEXPR_STATIC_INLINE this_t create(float64_t val) - { -#ifdef __HLSL_VERSION - emulated_float64_t retval; - uint32_t lo, hi; - asuint(val, lo, hi); - retval.data = (uint64_t(hi) << 32) | lo; - return retval; -#else - return bit_cast(reinterpret_cast(val)); -#endif - } - - // TODO: unresolved external symbol imath_half_to_float_table - /*static emulated_float64_t create(float16_t val) - { - return emulated_float64_t(bit_cast(float64_t(val))); - }*/ - - // TODO: remove - emulated_float64_t addOld(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC - { - if (FlushDenormToZero) - { - emulated_float64_t retval = emulated_float64_t::create(0ull); - - uint64_t mantissa; - uint32_t3 mantissaExtended; - int biasedExp; - - uint64_t lhsSign = data & ieee754::traits::signMask; - uint64_t rhsSign = rhs.data & ieee754::traits::signMask; - uint64_t lhsMantissa = ieee754::extractMantissa(data); - uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); - int lhsBiasedExp = ieee754::extractBiasedExponent(data); - int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); - - int expDiff = lhsBiasedExp - rhsBiasedExp; - - if (lhsSign == rhsSign) - { - if (expDiff == 0) - { - if (lhsBiasedExp == ieee754::traits::specialValueExp) - { - bool propagate = (lhsMantissa | rhsMantissa) != 0u; - return bit_cast >(glsl::mix(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); - } - - mantissa = lhsMantissa + rhsMantissa; - if (lhsBiasedExp == 0) - return bit_cast >(impl::assembleFloat64(lhsSign, 0, mantissa)); - mantissaExtended.xy = impl::packUint64(mantissa); - mantissaExtended.x |= 0x00200000u; - mantissaExtended.z = 0u; - biasedExp = lhsBiasedExp; - - mantissaExtended = impl::shift64ExtraRightJamming(mantissaExtended, 1); - } - else - { - if (expDiff < 0) - { - swap(lhsMantissa, rhsMantissa); - swap(lhsBiasedExp, rhsBiasedExp); - } - - if (lhsBiasedExp == ieee754::traits::specialValueExp) - { - const bool propagate = (lhsMantissa) != 0u; - return bit_cast >(glsl::mix(ieee754::traits::exponentMask | lhsSign, impl::propagateFloat64NaN(data, rhs.data), propagate)); - } - - 
expDiff = glsl::mix(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); - rhsMantissa = glsl::mix(rhsMantissa | (1ull << 52), rhsMantissa, rhsBiasedExp == 0); - const uint32_t3 shifted = impl::shift64ExtraRightJamming(uint32_t3(impl::packUint64(rhsMantissa), 0u), expDiff); - rhsMantissa = impl::unpackUint64(shifted.xy); - mantissaExtended.z = shifted.z; - biasedExp = lhsBiasedExp; - - lhsMantissa |= (1ull << 52); - mantissaExtended.xy = impl::packUint64(lhsMantissa + rhsMantissa); - --biasedExp; - if (!(mantissaExtended.x < 0x00200000u)) - { - mantissaExtended = impl::shift64ExtraRightJamming(mantissaExtended, 1); - ++biasedExp; - } - - return bit_cast >(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended.xyz)); - } - - // cannot happen but compiler cries about not every path returning value - return bit_cast >(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended)); - } - else - { - lhsMantissa = impl::shortShift64Left(lhsMantissa, 10); - rhsMantissa = impl::shortShift64Left(rhsMantissa, 10); - - if (expDiff != 0) - { - uint32_t2 frac; - - if (expDiff < 0) - { - swap(lhsMantissa, rhsMantissa); - swap(lhsBiasedExp, rhsBiasedExp); - lhsSign ^= ieee754::traits::signMask; - } - - if (lhsBiasedExp == ieee754::traits::specialValueExp) - { - bool propagate = lhsMantissa != 0u; - return bit_cast >(glsl::mix(impl::assembleFloat64(lhsSign, ieee754::traits::exponentMask, 0ull), impl::propagateFloat64NaN(data, rhs.data), propagate)); - } - - expDiff = glsl::mix(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); - rhsMantissa = glsl::mix(rhsMantissa | 0x4000000000000000ull, rhsMantissa, rhsBiasedExp == 0); - rhsMantissa = impl::unpackUint64(impl::shift64RightJamming(impl::packUint64(rhsMantissa), expDiff)); - lhsMantissa |= 0x4000000000000000ull; - frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); - biasedExp = lhsBiasedExp; - --biasedExp; - return bit_cast >(impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 10, frac.x, frac.y)); - } - if (lhsBiasedExp == ieee754::traits::specialValueExp) - { - bool propagate = ((lhsMantissa) | (rhsMantissa)) != 0u; - return bit_cast >(glsl::mix(ieee754::traits::quietNaN, impl::propagateFloat64NaN(data, rhs.data), propagate)); - } - rhsBiasedExp = glsl::mix(rhsBiasedExp, 1, lhsBiasedExp == 0); - lhsBiasedExp = glsl::mix(lhsBiasedExp, 1, lhsBiasedExp == 0); - - - const uint32_t2 lhsMantissaPacked = impl::packUint64(lhsMantissa); - const uint32_t2 rhsMantissaPacked = impl::packUint64(rhsMantissa); - - uint32_t2 frac; - uint64_t signOfDifference = 0; - if (rhsMantissaPacked.x < lhsMantissaPacked.x) - { - frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); - } - else if (lhsMantissaPacked.x < rhsMantissaPacked.x) - { - frac.xy = impl::packUint64(rhsMantissa - lhsMantissa); - signOfDifference = ieee754::traits::signMask; - } - else if (rhsMantissaPacked.y <= lhsMantissaPacked.y) - { - /* It is possible that frac.x and frac.y may be zero after this. 
*/ - frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); - } - else - { - frac.xy = impl::packUint64(rhsMantissa - lhsMantissa); - signOfDifference = ieee754::traits::signMask; - } - - biasedExp = glsl::mix(rhsBiasedExp, lhsBiasedExp, signOfDifference == 0u); - lhsSign ^= signOfDifference; - uint64_t retval_0 = impl::packFloat64(uint32_t(FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) << 31, 0, 0u, 0u); - uint64_t retval_1 = impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 11, frac.x, frac.y); - return bit_cast >(glsl::mix(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); - } - } - else - { - //static_assert(false, "not implemented yet"); - return bit_cast >(0xdeadbeefbadcaffeull); - } - } - - // arithmetic operators - this_t operator+(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC - { - return addOld(rhs); - - if (FlushDenormToZero) - { - if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) - return bit_cast(ieee754::traits::quietNaN); - - int lhsBiasedExp = ieee754::extractBiasedExponent(data); - int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); - - if (lhsBiasedExp == 0ull) - return bit_cast(rhs.data); - if (rhsBiasedExp == 0ull) - return bit_cast(data); - - uint64_t lhsSign = ieee754::extractSign(data); - uint64_t rhsSign = ieee754::extractSign(rhs.data); - - int64_t lhsNormMantissa = int64_t(ieee754::extractNormalizeMantissa(data)); - int64_t rhsNormMantissa = int64_t(ieee754::extractNormalizeMantissa(rhs.data)); - - lhsNormMantissa <<= 9; - rhsNormMantissa <<= 9; - - // TODO: branchless? - if (lhsSign != rhsSign) - { - if (lhsSign) - lhsNormMantissa *= -1; - if (rhsSign) - rhsNormMantissa *= -1; - } - - int expDiff = lhsBiasedExp - rhsBiasedExp; - - int exp = max(lhsBiasedExp, rhsBiasedExp) - ieee754::traits::exponentBias; - uint32_t shiftAmount = abs(expDiff); - - // so lhsNormMantissa always holds mantissa of number with greater exponent - if (expDiff < 0) - swap(lhsNormMantissa, rhsNormMantissa); - - rhsNormMantissa >>= shiftAmount; - - int64_t resultMantissa = lhsNormMantissa + rhsNormMantissa; - - resultMantissa >>= 9; - - const uint64_t resultSign = uint64_t((lhsSign && rhsSign) || (bit_cast(resultMantissa) & (lhsSign << 63))) << 63; - uint64_t resultBiasedExp = uint64_t(exp) + ieee754::traits::exponentBias; - - resultMantissa = abs(resultMantissa); - - if (resultMantissa & 1ull << 53) - { - ++resultBiasedExp; - resultMantissa >>= 1; - } - - // TODO: better implementation with no loop - while (resultMantissa < (1ull << 52)) - { - --resultBiasedExp; - resultMantissa <<= 1; - } - - resultMantissa &= ieee754::traits::mantissaMask; - uint64_t output = impl::assembleFloat64(resultSign, uint64_t(resultBiasedExp) << ieee754::traits::mantissaBitCnt, abs(resultMantissa)); - return bit_cast(output); - } - - // not implemented - if (!FlushDenormToZero) - return bit_cast(0xdeadbeefbadcaffeull); - } - - emulated_float64_t operator+(float rhs) - { - return bit_cast(data) + create(rhs); - } - - emulated_float64_t operator-(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC - { - emulated_float64_t lhs = bit_cast(data); - emulated_float64_t rhsFlipped = rhs.flipSign(); - - return lhs + rhsFlipped; - } - - emulated_float64_t operator-(float rhs) NBL_CONST_MEMBER_FUNC - { - return bit_cast(data) - create(rhs); - } - - emulated_float64_t operator*(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC - { - if(FlushDenormToZero) - { - emulated_float64_t retval = this_t::create(0ull); - - uint64_t lhsSign = data & ieee754::traits::signMask; - uint64_t rhsSign = rhs.data & 
ieee754::traits::signMask; - uint64_t lhsMantissa = ieee754::extractMantissa(data); - uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); - int lhsBiasedExp = ieee754::extractBiasedExponent(data); - int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); - - int exp = int(lhsBiasedExp + rhsBiasedExp) - ieee754::traits::exponentBias; - uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; - if (!FastMath) - { - if (lhsBiasedExp == ieee754::traits::specialValueExp) - { - if ((lhsMantissa != 0u) || ((rhsBiasedExp == ieee754::traits::specialValueExp) && (rhsMantissa != 0u))) - return bit_cast(impl::propagateFloat64NaN(data, rhs.data)); - if ((uint64_t(rhsBiasedExp) | rhsMantissa) == 0u) - return bit_cast(ieee754::traits::quietNaN); - - return bit_cast(impl::assembleFloat64(sign, ieee754::traits::exponentMask, 0ull)); - } - if (rhsBiasedExp == ieee754::traits::specialValueExp) - { - /* a cannot be NaN, but is b NaN? */ - if (rhsMantissa != 0u) -#ifdef RELAXED_NAN_PROPAGATION - return rhs.data; -#else - return bit_cast(impl::propagateFloat64NaN(data, rhs.data)); -#endif - if ((uint64_t(lhsBiasedExp) | lhsMantissa) == 0u) - return bit_cast(ieee754::traits::quietNaN); - - return bit_cast(sign | ieee754::traits::exponentMask); - } - if (lhsBiasedExp == 0) - { - if (lhsMantissa == 0u) - return bit_cast(sign); - impl::normalizeFloat64Subnormal(lhsMantissa, lhsBiasedExp, lhsMantissa); - } - if (rhsBiasedExp == 0) - { - if (rhsMantissa == 0u) - return bit_cast(sign); - impl::normalizeFloat64Subnormal(rhsMantissa, rhsBiasedExp, rhsMantissa); - } - } - - const uint64_t hi_l = (lhsMantissa >> 21) | (1ull << 31); - const uint64_t lo_l = lhsMantissa & ((1ull << 21) - 1); - const uint64_t hi_r = (rhsMantissa >> 21) | (1ull << 31); - const uint64_t lo_r = rhsMantissa & ((1ull << 21) - 1); - - //const uint64_t RoundToNearest = (1ull << 31) - 1; - uint64_t newPseudoMantissa = ((hi_l * hi_r) >> 10) + ((hi_l * lo_r + lo_l * hi_r/* + RoundToNearest*/) >> 31); - - if (newPseudoMantissa & (0x1ull << 53)) - { - newPseudoMantissa >>= 1; - ++exp; - } - newPseudoMantissa &= (ieee754::traits::mantissaMask); - - return bit_cast(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, newPseudoMantissa)); - } - else - { - //static_assert(false, "not implemented yet"); - return bit_cast(0xdeadbeefbadcaffeull); - } - } - - emulated_float64_t operator*(float rhs) - { - return _static_cast(data) * create(rhs); - } - - /*this_t reciprocal(uint64_t x) - { - using ThisType = this_t; - ThisType output = ThisType::bit_cast((0xbfcdd6a18f6a6f52ULL - x) >> 1); - output = output * output; - return output; - }*/ - - emulated_float64_t operator/(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC - { - if (FlushDenormToZero) - { - //return this_t::bit_cast(data) * reciprocal(rhs.data); - - if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) - return bit_cast(ieee754::traits::quietNaN); - - const uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; - - if (!FastMath && impl::isZero(rhs.data)) - return bit_cast(ieee754::traits::inf | sign); - - if (!FastMath && impl::areBothInfinity(data, rhs.data)) - return bit_cast(ieee754::traits::quietNaN); - - if (!FastMath && tgmath::isInf(data)) - return bit_cast(ieee754::traits::inf | sign); - - if (!FastMath && tgmath::isInf(rhs.data)) - return bit_cast(0ull | sign); - - - const uint64_t lhsRealMantissa = (ieee754::extractMantissa(data) | (1ull << ieee754::traits::mantissaBitCnt)); - const uint64_t rhsRealMantissa = 
ieee754::extractMantissa(rhs.data) | (1ull << ieee754::traits::mantissaBitCnt); - - int exp = ieee754::extractExponent(data) - ieee754::extractExponent(rhs.data) + ieee754::traits::exponentBias; - - uint64_t2 lhsMantissaShifted = impl::shiftMantissaLeftBy53(lhsRealMantissa); - uint64_t mantissa = impl::divmod128by64(lhsMantissaShifted.x, lhsMantissaShifted.y, rhsRealMantissa); - - while (mantissa < (1ull << 52)) - { - mantissa <<= 1; - exp--; - } - - mantissa &= ieee754::traits::mantissaMask; - - return bit_cast(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, mantissa)); - } - else - { - //static_assert(false, "not implemented yet"); - return bit_cast(0xdeadbeefbadcaffeull); - } - } - - // relational operators - // TODO: should `FlushDenormToZero` affect relational operators? - bool operator==(this_t rhs) NBL_CONST_MEMBER_FUNC - { - if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) - return false; - // TODO: i'm not sure about this one - if (!FastMath && impl::areBothZero(data, rhs.data)) - return true; - - const emulated_float64_t xored = bit_cast(data ^ rhs.data); - // TODO: check what fast math returns for -0 == 0 - if ((xored.data & 0x7FFFFFFFFFFFFFFFull) == 0ull) - return true; - - return !(xored.data); - } - bool operator!=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC - { - if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) - return false; - - return !(bit_cast(data) == rhs); - } - bool operator<(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC - { - if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) - return false; - if (!FastMath && impl::areBothSameSignInfinity(data, rhs.data)) - return false; - if (!FastMath && impl::areBothZero(data, rhs.data)) - return false; - - const uint64_t lhsSign = ieee754::extractSign(data); - const uint64_t rhsSign = ieee754::extractSign(rhs.data); - - // flip bits of negative numbers and flip signs of all numbers - uint64_t lhsFlipped = data ^ ((0x7FFFFFFFFFFFFFFFull * lhsSign) | ieee754::traits::signMask); - uint64_t rhsFlipped = rhs.data ^ ((0x7FFFFFFFFFFFFFFFull * rhsSign) | ieee754::traits::signMask); - - uint64_t diffBits = lhsFlipped ^ rhsFlipped; - - return (lhsFlipped & diffBits) < (rhsFlipped & diffBits); - } - bool operator>(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC - { - if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) - return false; - if (!FastMath && impl::areBothSameSignInfinity(data, rhs.data)) - return false; - if (!FastMath && impl::areBothZero(data, rhs.data)) - return false; - - const uint64_t lhsSign = ieee754::extractSign(data); - const uint64_t rhsSign = ieee754::extractSign(rhs.data); - - // flip bits of negative numbers and flip signs of all numbers - uint64_t lhsFlipped = data ^ ((0x7FFFFFFFFFFFFFFFull * lhsSign) | ieee754::traits::signMask); - uint64_t rhsFlipped = rhs.data ^ ((0x7FFFFFFFFFFFFFFFull * rhsSign) | ieee754::traits::signMask); - - uint64_t diffBits = lhsFlipped ^ rhsFlipped; - - return (lhsFlipped & diffBits) > (rhsFlipped & diffBits); - } - bool operator<=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC - { - if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) - return false; - - return !(bit_cast(data) > bit_cast(rhs.data)); - } - bool operator>=(emulated_float64_t rhs) - { - if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) - return false; - - return !(bit_cast(data) < bit_cast(rhs.data)); - } - - //logical operators - bool operator&&(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { 
return bool(data) && bool(rhs.data); } - bool operator||(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { return bool(data) || bool(rhs.data); } - bool operator!() NBL_CONST_MEMBER_FUNC { return !bool(data); } - - emulated_float64_t flipSign() - { - return bit_cast(data ^ ieee754::traits::signMask); - } - - bool isNaN() - { - return tgmath::isnan(data); - } - - NBL_CONSTEXPR_STATIC_INLINE bool supportsFastMath() - { - return FastMath; - } - }; - -#define IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(...) \ -template<>\ -struct traits_base<__VA_ARGS__ >\ -{\ - NBL_CONSTEXPR_STATIC_INLINE int16_t exponentBitCnt = 11;\ - NBL_CONSTEXPR_STATIC_INLINE int16_t mantissaBitCnt = 52;\ -};\ -template<>\ -inline uint32_t extractBiasedExponent(__VA_ARGS__ x)\ -{\ - return extractBiasedExponent(x.data);\ -}\ -\ -template<>\ -inline int extractExponent(__VA_ARGS__ x)\ -{\ - return extractExponent(x.data);\ -}\ -\ -template<>\ -NBL_CONSTEXPR_INLINE_FUNC __VA_ARGS__ replaceBiasedExponent(__VA_ARGS__ x, typename unsigned_integer_of_size::type biasedExp)\ -{\ - return __VA_ARGS__(replaceBiasedExponent(x.data, biasedExp));\ -}\ -\ -template <>\ -NBL_CONSTEXPR_INLINE_FUNC __VA_ARGS__ fastMulExp2(__VA_ARGS__ x, int n)\ -{\ - return __VA_ARGS__(replaceBiasedExponent(x.data, extractBiasedExponent(x) + uint32_t(n)));\ -}\ -\ -template <>\ -NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size::type extractMantissa(__VA_ARGS__ x)\ -{\ - return extractMantissa(x.data);\ -}\ -\ -template <>\ -NBL_CONSTEXPR_INLINE_FUNC uint64_t extractNormalizeMantissa(__VA_ARGS__ x)\ -{\ - return extractNormalizeMantissa(x.data);\ -}\ -\ - -#define DEFINE_BIT_CAST_SPEC(...)\ -template<>\ -NBL_CONSTEXPR_FUNC __VA_ARGS__ bit_cast<__VA_ARGS__, uint64_t>(NBL_CONST_REF_ARG(uint64_t) val)\ -{\ -__VA_ARGS__ output; \ -output.data = val; \ -\ -return output; \ -}\ -\ - -namespace impl -{ - -template -struct static_cast_helper,void> -{ - // TODO: - static_assert(is_scalar::value); - - using From = emulated_float64_t; - - // TODO: test - static inline To cast(From v) - { - using ToAsFloat = typename float_of_size::type; - using ToAsUint = typename unsigned_integer_of_size::type; - - - if (is_same_v) - return To(bit_cast(v.data)); - - if (is_floating_point::value) - { - - const int exponent = ieee754::extractExponent(v.data); - if (!From::supportsFastMath()) - { - // TODO: i have no idea why it doesn't work, fix - //if (exponent > ieee754::traits::exponentMax) - // return bit_cast(ieee754::traits::inf); - //if (exponent < ieee754::traits::exponentMin) - // return -bit_cast(ieee754::traits::inf); - //if (tgmath::isnan(v.data)) - // return bit_cast(ieee754::traits::quietNaN); - } - - - const uint32_t toBitSize = sizeof(To) * 8; - const ToAsUint sign = ToAsUint(ieee754::extractSign(v.data) << (toBitSize - 1)); - const ToAsUint biasedExponent = ToAsUint(exponent + ieee754::traits::exponentBias) << ieee754::traits::mantissaBitCnt; - const ToAsUint mantissa = ToAsUint(v.data >> (ieee754::traits::mantissaBitCnt - ieee754::traits::mantissaBitCnt)) & ieee754::traits::mantissaMask; - - return bit_cast(sign | biasedExponent | mantissa); - } - - // NOTE: casting from negative float to unsigned int is an UB, function will return abs value in this case - if (is_integral::value) - { - const int exponent = ieee754::extractExponent(v.data); - if (exponent < 0) - return 0; - - uint64_t unsignedOutput = ieee754::extractMantissa(v.data) & 1ull << ieee754::traits::mantissaBitCnt; - const int shiftAmount = exponent - int(ieee754::traits::mantissaBitCnt); - - if 
(shiftAmount < 0) - unsignedOutput <<= -shiftAmount; - else - unsignedOutput >>= shiftAmount; - - if (is_signed::value) - { - int64_t signedOutput64 = unsignedOutput & ((1ull << 63) - 1); - To signedOutput = To(signedOutput64); - if (ieee754::extractSignPreserveBitPattern(v.data) != 0) - signedOutput = -signedOutput; - - return signedOutput; - } - - return To(unsignedOutput); - } - - // assert(false); - return To(0xdeadbeefbadcaffeull); - } -}; - -template -struct static_cast_helper, float32_t, void> -{ - using To = emulated_float64_t; - - static inline To cast(float32_t v) - { - return To::create(v); - } -}; - -template -struct static_cast_helper, float64_t, void> -{ - using To = emulated_float64_t; - - static inline To cast(float64_t v) - { - return To::create(v); - } -}; - -template -struct static_cast_helper, uint32_t, void> -{ - using To = emulated_float64_t; - - static inline To cast(uint32_t v) - { - return To::create(v); - } -}; - -template -struct static_cast_helper, uint64_t, void> -{ - using To = emulated_float64_t; - - static inline To cast(uint64_t v) - { - return To::create(v); - } -}; - -template -struct static_cast_helper, emulated_float64_t, void> -{ - static inline emulated_float64_t cast(emulated_float64_t v) - { - return v; - } -}; - -} - -DEFINE_BIT_CAST_SPEC(emulated_float64_t); -DEFINE_BIT_CAST_SPEC(emulated_float64_t); -DEFINE_BIT_CAST_SPEC(emulated_float64_t); -DEFINE_BIT_CAST_SPEC(emulated_float64_t); - -//template -//struct is_floating_point > : bool_constant {}; - -namespace ieee754 -{ -IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); -IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); -IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); -IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); -} - -} -} - -#undef FLOAT_ROUND_NEAREST_EVEN -#undef FLOAT_ROUND_TO_ZERO -#undef FLOAT_ROUND_DOWN -#undef FLOAT_ROUND_UP -#undef FLOAT_ROUNDING_MODE - -#undef IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE -#undef DEFINE_BIT_CAST_SPEC - -#endif +#ifndef _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_INCLUDED_ +#define _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_INCLUDED_ + +#include + +namespace nbl +{ +namespace hlsl +{ + template + struct emulated_float64_t + { + using storage_t = uint64_t; + using this_t = emulated_float64_t; + + storage_t data; + + // constructors + /*static emulated_float64_t create(uint16_t val) + { + return emulated_float64_t(bit_cast(float64_t(val))); + }*/ + + NBL_CONSTEXPR_STATIC_INLINE this_t create(this_t val) + { + return val; + } + + NBL_CONSTEXPR_STATIC_INLINE this_t create(int32_t val) + { + return bit_cast(impl::castToUint64WithFloat64BitPattern(int64_t(val))); + } + + NBL_CONSTEXPR_STATIC_INLINE this_t create(int64_t val) + { + return bit_cast(impl::castToUint64WithFloat64BitPattern(val)); + } + + NBL_CONSTEXPR_STATIC_INLINE this_t create(uint32_t val) + { + return bit_cast(impl::castToUint64WithFloat64BitPattern(uint64_t(val))); + } + + NBL_CONSTEXPR_STATIC_INLINE this_t create(uint64_t val) + { + return bit_cast(impl::castToUint64WithFloat64BitPattern(val)); + } + + NBL_CONSTEXPR_STATIC_INLINE this_t create(float32_t val) + { + this_t output; + output.data = impl::castFloat32ToStorageType(val); + return output; + } + + NBL_CONSTEXPR_STATIC_INLINE this_t create(float64_t val) + { +#ifdef __HLSL_VERSION + emulated_float64_t retval; + uint32_t lo, hi; + asuint(val, lo, hi); + retval.data = (uint64_t(hi) << 32) | lo; + return retval; +#else + return 
bit_cast(reinterpret_cast(val)); +#endif + } + + // TODO: unresolved external symbol imath_half_to_float_table + /*static emulated_float64_t create(float16_t val) + { + return emulated_float64_t(bit_cast(float64_t(val))); + }*/ + + // TODO: remove + emulated_float64_t addOld(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC + { + if (FlushDenormToZero) + { + emulated_float64_t retval = emulated_float64_t::create(0ull); + + uint64_t mantissa; + uint32_t3 mantissaExtended; + int biasedExp; + + uint64_t lhsSign = data & ieee754::traits::signMask; + uint64_t rhsSign = rhs.data & ieee754::traits::signMask; + uint64_t lhsMantissa = ieee754::extractMantissa(data); + uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); + int lhsBiasedExp = ieee754::extractBiasedExponent(data); + int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); + + int expDiff = lhsBiasedExp - rhsBiasedExp; + + if (lhsSign == rhsSign) + { + if (expDiff == 0) + { + if (lhsBiasedExp == ieee754::traits::specialValueExp) + { + bool propagate = (lhsMantissa | rhsMantissa) != 0u; + return bit_cast >(glsl::mix(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); + } + + mantissa = lhsMantissa + rhsMantissa; + if (lhsBiasedExp == 0) + return bit_cast >(impl::assembleFloat64(lhsSign, 0, mantissa)); + mantissaExtended.xy = impl::packUint64(mantissa); + mantissaExtended.x |= 0x00200000u; + mantissaExtended.z = 0u; + biasedExp = lhsBiasedExp; + + mantissaExtended = impl::shift64ExtraRightJamming(mantissaExtended, 1); + } + else + { + if (expDiff < 0) + { + swap(lhsMantissa, rhsMantissa); + swap(lhsBiasedExp, rhsBiasedExp); + } + + if (lhsBiasedExp == ieee754::traits::specialValueExp) + { + const bool propagate = (lhsMantissa) != 0u; + return bit_cast >(glsl::mix(ieee754::traits::exponentMask | lhsSign, impl::propagateFloat64NaN(data, rhs.data), propagate)); + } + + expDiff = glsl::mix(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); + rhsMantissa = glsl::mix(rhsMantissa | (1ull << 52), rhsMantissa, rhsBiasedExp == 0); + const uint32_t3 shifted = impl::shift64ExtraRightJamming(uint32_t3(impl::packUint64(rhsMantissa), 0u), expDiff); + rhsMantissa = impl::unpackUint64(shifted.xy); + mantissaExtended.z = shifted.z; + biasedExp = lhsBiasedExp; + + lhsMantissa |= (1ull << 52); + mantissaExtended.xy = impl::packUint64(lhsMantissa + rhsMantissa); + --biasedExp; + if (!(mantissaExtended.x < 0x00200000u)) + { + mantissaExtended = impl::shift64ExtraRightJamming(mantissaExtended, 1); + ++biasedExp; + } + + return bit_cast >(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended.xyz)); + } + + // cannot happen but compiler cries about not every path returning value + return bit_cast >(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended)); + } + else + { + lhsMantissa = impl::shortShift64Left(lhsMantissa, 10); + rhsMantissa = impl::shortShift64Left(rhsMantissa, 10); + + if (expDiff != 0) + { + uint32_t2 frac; + + if (expDiff < 0) + { + swap(lhsMantissa, rhsMantissa); + swap(lhsBiasedExp, rhsBiasedExp); + lhsSign ^= ieee754::traits::signMask; + } + + if (lhsBiasedExp == ieee754::traits::specialValueExp) + { + bool propagate = lhsMantissa != 0u; + return bit_cast >(glsl::mix(impl::assembleFloat64(lhsSign, ieee754::traits::exponentMask, 0ull), impl::propagateFloat64NaN(data, rhs.data), propagate)); + } + + expDiff = glsl::mix(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); + rhsMantissa = glsl::mix(rhsMantissa | 0x4000000000000000ull, rhsMantissa, rhsBiasedExp == 0); + rhsMantissa = 
impl::unpackUint64(impl::shift64RightJamming(impl::packUint64(rhsMantissa), expDiff)); + lhsMantissa |= 0x4000000000000000ull; + frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); + biasedExp = lhsBiasedExp; + --biasedExp; + return bit_cast >(impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 10, frac.x, frac.y)); + } + if (lhsBiasedExp == ieee754::traits::specialValueExp) + { + bool propagate = ((lhsMantissa) | (rhsMantissa)) != 0u; + return bit_cast >(glsl::mix(ieee754::traits::quietNaN, impl::propagateFloat64NaN(data, rhs.data), propagate)); + } + rhsBiasedExp = glsl::mix(rhsBiasedExp, 1, lhsBiasedExp == 0); + lhsBiasedExp = glsl::mix(lhsBiasedExp, 1, lhsBiasedExp == 0); + + + const uint32_t2 lhsMantissaPacked = impl::packUint64(lhsMantissa); + const uint32_t2 rhsMantissaPacked = impl::packUint64(rhsMantissa); + + uint32_t2 frac; + uint64_t signOfDifference = 0; + if (rhsMantissaPacked.x < lhsMantissaPacked.x) + { + frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); + } + else if (lhsMantissaPacked.x < rhsMantissaPacked.x) + { + frac.xy = impl::packUint64(rhsMantissa - lhsMantissa); + signOfDifference = ieee754::traits::signMask; + } + else if (rhsMantissaPacked.y <= lhsMantissaPacked.y) + { + /* It is possible that frac.x and frac.y may be zero after this. */ + frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); + } + else + { + frac.xy = impl::packUint64(rhsMantissa - lhsMantissa); + signOfDifference = ieee754::traits::signMask; + } + + biasedExp = glsl::mix(rhsBiasedExp, lhsBiasedExp, signOfDifference == 0u); + lhsSign ^= signOfDifference; + uint64_t retval_0 = impl::packFloat64(0, 0, 0u, 0u); + uint64_t retval_1 = impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 11, frac.x, frac.y); + return bit_cast >(glsl::mix(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); + } + } + else + { + //static_assert(false, "not implemented yet"); + return bit_cast >(0xdeadbeefbadcaffeull); + } + } + + // arithmetic operators + this_t operator+(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC + { + if (FlushDenormToZero) + { + if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) + return bit_cast(ieee754::traits::quietNaN); + + int lhsBiasedExp = ieee754::extractBiasedExponent(data); + int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); + + if (lhsBiasedExp == 0ull) + return bit_cast(rhs.data); + if (rhsBiasedExp == 0ull) + return bit_cast(data); + + const uint64_t lhsSign = ieee754::extractSign(data); + const uint64_t rhsSign = ieee754::extractSign(rhs.data); + + if (lhsSign != rhsSign) + return addOld(rhs); + + // assuming lhsSign == rhsSign + const uint64_t resultSign = lhsSign == 0 ? 0 : ieee754::traits::signMask; + + if (!FastMath && (tgmath::isinf(data) || tgmath::isinf(rhs.data))) + return bit_cast(ieee754::traits::inf | resultSign); + + uint64_t lhsNormMantissa = ieee754::extractNormalizeMantissa(data); + uint64_t rhsNormMantissa = ieee754::extractNormalizeMantissa(rhs.data); + + // TODO: branchless? 
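+ // One possible branchless form of the commented-out sign handling below (only a sketch,
+ // assuming extractSign() yields 0 or 1 as in ieee754.hlsl): derive an all-ones mask from
+ // the sign and negate the two's-complement value with an xor-and-add, e.g.
+ //   const uint64_t lhsMask = uint64_t(0) - lhsSign;
+ //   lhsNormMantissa = (lhsNormMantissa ^ lhsMask) + (lhsMask & 1ull);
+ // The mixed-sign case currently never reaches this point (it falls back to addOld above),
+ // so this only documents the TODO.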
+ /*if (lhsSign != rhsSign) + { + if (lhsSign) + lhsNormMantissa *= -1; + if (rhsSign) + rhsNormMantissa *= -1; + }*/ + + int expDiff = lhsBiasedExp - rhsBiasedExp; + + int exp = max(lhsBiasedExp, rhsBiasedExp) - ieee754::traits::exponentBias; + uint32_t shiftAmount = abs(expDiff); + + // so lhsNormMantissa always holds mantissa of number with greater exponent + if (expDiff < 0) + swap(lhsNormMantissa, rhsNormMantissa); + + rhsNormMantissa >>= shiftAmount; + + uint64_t resultMantissa = lhsNormMantissa + rhsNormMantissa; + + //const uint64_t resultSign = ((lhsSign && rhsSign) || (resultMantissa & (lhsSign << 63))) << 63; + uint64_t resultBiasedExp = uint64_t(exp) + ieee754::traits::exponentBias; + + resultMantissa = resultMantissa; + + if (resultMantissa & 1ull << 53) + { + ++resultBiasedExp; + resultMantissa >>= 1; + } + + // TODO: better implementation with no loop + while (resultMantissa < (1ull << 52)) + { + --resultBiasedExp; + resultMantissa <<= 1; + } + + resultMantissa &= ieee754::traits::mantissaMask; + uint64_t output = impl::assembleFloat64(resultSign, uint64_t(resultBiasedExp) << ieee754::traits::mantissaBitCnt, resultMantissa); + return bit_cast(output); + } + + // not implemented + if (!FlushDenormToZero) + return bit_cast(0xdeadbeefbadcaffeull); + } + + emulated_float64_t operator+(float rhs) + { + return bit_cast(data) + create(rhs); + } + + emulated_float64_t operator-(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC + { + emulated_float64_t lhs = bit_cast(data); + emulated_float64_t rhsFlipped = rhs.flipSign(); + + return lhs + rhsFlipped; + } + + emulated_float64_t operator-(float rhs) NBL_CONST_MEMBER_FUNC + { + return bit_cast(data) - create(rhs); + } + + emulated_float64_t operator*(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC + { + if(FlushDenormToZero) + { + emulated_float64_t retval = this_t::create(0ull); + + uint64_t lhsSign = data & ieee754::traits::signMask; + uint64_t rhsSign = rhs.data & ieee754::traits::signMask; + uint64_t lhsMantissa = ieee754::extractMantissa(data); + uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); + int lhsBiasedExp = ieee754::extractBiasedExponent(data); + int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); + + int exp = int(lhsBiasedExp + rhsBiasedExp) - ieee754::traits::exponentBias; + uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; + if (!FastMath) + { + if (lhsBiasedExp == ieee754::traits::specialValueExp) + { + if ((lhsMantissa != 0u) || ((rhsBiasedExp == ieee754::traits::specialValueExp) && (rhsMantissa != 0u))) + return bit_cast(impl::propagateFloat64NaN(data, rhs.data)); + if ((uint64_t(rhsBiasedExp) | rhsMantissa) == 0u) + return bit_cast(ieee754::traits::quietNaN); + + return bit_cast(impl::assembleFloat64(sign, ieee754::traits::exponentMask, 0ull)); + } + if (rhsBiasedExp == ieee754::traits::specialValueExp) + { + /* a cannot be NaN, but is b NaN? 
*/ + if (rhsMantissa != 0u) +#ifdef RELAXED_NAN_PROPAGATION + return rhs.data; +#else + return bit_cast(impl::propagateFloat64NaN(data, rhs.data)); +#endif + if ((uint64_t(lhsBiasedExp) | lhsMantissa) == 0u) + return bit_cast(ieee754::traits::quietNaN); + + return bit_cast(sign | ieee754::traits::exponentMask); + } + if (lhsBiasedExp == 0) + { + if (lhsMantissa == 0u) + return bit_cast(sign); + impl::normalizeFloat64Subnormal(lhsMantissa, lhsBiasedExp, lhsMantissa); + } + if (rhsBiasedExp == 0) + { + if (rhsMantissa == 0u) + return bit_cast(sign); + impl::normalizeFloat64Subnormal(rhsMantissa, rhsBiasedExp, rhsMantissa); + } + } + + const uint64_t hi_l = (lhsMantissa >> 21) | (1ull << 31); + const uint64_t lo_l = lhsMantissa & ((1ull << 21) - 1); + const uint64_t hi_r = (rhsMantissa >> 21) | (1ull << 31); + const uint64_t lo_r = rhsMantissa & ((1ull << 21) - 1); + + //const uint64_t RoundToNearest = (1ull << 31) - 1; + uint64_t newPseudoMantissa = ((hi_l * hi_r) >> 10) + ((hi_l * lo_r + lo_l * hi_r/* + RoundToNearest*/) >> 31); + + if (newPseudoMantissa & (0x1ull << 53)) + { + newPseudoMantissa >>= 1; + ++exp; + } + newPseudoMantissa &= (ieee754::traits::mantissaMask); + + return bit_cast(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, newPseudoMantissa)); + } + else + { + //static_assert(false, "not implemented yet"); + return bit_cast(0xdeadbeefbadcaffeull); + } + } + + emulated_float64_t operator*(float rhs) + { + return _static_cast(data) * create(rhs); + } + + /*this_t reciprocal(uint64_t x) + { + using ThisType = this_t; + ThisType output = ThisType::bit_cast((0xbfcdd6a18f6a6f52ULL - x) >> 1); + output = output * output; + return output; + }*/ + + emulated_float64_t operator/(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC + { + if (FlushDenormToZero) + { + //return this_t::bit_cast(data) * reciprocal(rhs.data); + + if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) + return bit_cast(ieee754::traits::quietNaN); + + const uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; + + if (!FastMath && impl::isZero(rhs.data)) + return bit_cast(ieee754::traits::inf | sign); + + if (!FastMath && impl::areBothInfinity(data, rhs.data)) + return bit_cast(ieee754::traits::quietNaN); + + if (!FastMath && tgmath::isinf(data)) + return bit_cast(ieee754::traits::inf | sign); + + if (!FastMath && tgmath::isinf(rhs.data)) + return bit_cast(0ull | sign); + + + const uint64_t lhsRealMantissa = (ieee754::extractMantissa(data) | (1ull << ieee754::traits::mantissaBitCnt)); + const uint64_t rhsRealMantissa = ieee754::extractMantissa(rhs.data) | (1ull << ieee754::traits::mantissaBitCnt); + + int exp = ieee754::extractExponent(data) - ieee754::extractExponent(rhs.data) + ieee754::traits::exponentBias; + + uint64_t2 lhsMantissaShifted = impl::shiftMantissaLeftBy53(lhsRealMantissa); + uint64_t mantissa = impl::divmod128by64(lhsMantissaShifted.x, lhsMantissaShifted.y, rhsRealMantissa); + + while (mantissa < (1ull << 52)) + { + mantissa <<= 1; + exp--; + } + + mantissa &= ieee754::traits::mantissaMask; + + return bit_cast(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, mantissa)); + } + else + { + //static_assert(false, "not implemented yet"); + return bit_cast(0xdeadbeefbadcaffeull); + } + } + + // relational operators + // TODO: should `FlushDenormToZero` affect relational operators? 
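+ // The ordering operators below reduce the comparison to one unsigned compare: since
+ // extractSign() yields 0 or 1, `(0x7FFFFFFFFFFFFFFFull * sign) | signMask` flips only the
+ // sign bit of non-negative values and all 64 bits of negative ones, producing keys that
+ // are monotonic in the represented value. Worked example: -1.0 (0xBFF0000000000000) maps
+ // to 0x400FFFFFFFFFFFFF, -0.5 (0xBFE0000000000000) to 0x401FFFFFFFFFFFFF and +0.5
+ // (0x3FE0000000000000) to 0xBFE0000000000000, so -1.0 < -0.5 < +0.5 also holds for the
+ // keys. NaN operands and the mixed-zero case are handled separately when !FastMath.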
+ bool operator==(this_t rhs) NBL_CONST_MEMBER_FUNC + { + if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) + return false; + // TODO: i'm not sure about this one + if (!FastMath && impl::areBothZero(data, rhs.data)) + return true; + + const emulated_float64_t xored = bit_cast(data ^ rhs.data); + // TODO: check what fast math returns for -0 == 0 + if ((xored.data & 0x7FFFFFFFFFFFFFFFull) == 0ull) + return true; + + return !(xored.data); + } + bool operator!=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC + { + if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) + return false; + + return !(bit_cast(data) == rhs); + } + bool operator<(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC + { + if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) + return false; + if (!FastMath && impl::areBothSameSignInfinity(data, rhs.data)) + return false; + if (!FastMath && impl::areBothZero(data, rhs.data)) + return false; + + const uint64_t lhsSign = ieee754::extractSign(data); + const uint64_t rhsSign = ieee754::extractSign(rhs.data); + + // flip bits of negative numbers and flip signs of all numbers + uint64_t lhsFlipped = data ^ ((0x7FFFFFFFFFFFFFFFull * lhsSign) | ieee754::traits::signMask); + uint64_t rhsFlipped = rhs.data ^ ((0x7FFFFFFFFFFFFFFFull * rhsSign) | ieee754::traits::signMask); + + return lhsFlipped < rhsFlipped; + } + bool operator>(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC + { + if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) + return false; + if (!FastMath && impl::areBothSameSignInfinity(data, rhs.data)) + return false; + if (!FastMath && impl::areBothZero(data, rhs.data)) + return false; + + const uint64_t lhsSign = ieee754::extractSign(data); + const uint64_t rhsSign = ieee754::extractSign(rhs.data); + + // flip bits of negative numbers and flip signs of all numbers + uint64_t lhsFlipped = data ^ ((0x7FFFFFFFFFFFFFFFull * lhsSign) | ieee754::traits::signMask); + uint64_t rhsFlipped = rhs.data ^ ((0x7FFFFFFFFFFFFFFFull * rhsSign) | ieee754::traits::signMask); + + return lhsFlipped > rhsFlipped; + } + bool operator<=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC + { + if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) + return false; + + return !(bit_cast(data) > bit_cast(rhs.data)); + } + bool operator>=(emulated_float64_t rhs) + { + if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) + return false; + + return !(bit_cast(data) < bit_cast(rhs.data)); + } + + //logical operators + bool operator&&(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { return bool(data) && bool(rhs.data); } + bool operator||(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { return bool(data) || bool(rhs.data); } + bool operator!() NBL_CONST_MEMBER_FUNC { return !bool(data); } + + emulated_float64_t flipSign() + { + return bit_cast(data ^ ieee754::traits::signMask); + } + + NBL_CONSTEXPR_STATIC_INLINE bool supportsFastMath() + { + return FastMath; + } + + enum E_ROUNDING_MODE + { + FLOAT_ROUND_NEAREST_EVEN, + FLOAT_ROUND_TO_ZERO, + FLOAT_ROUND_DOWN, + FLOAT_ROUND_UP + }; + + static const E_ROUNDING_MODE RoundingMode = E_ROUNDING_MODE::FLOAT_ROUND_TO_ZERO; + }; + +#define IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(...) 
\ +template<>\ +struct traits_base<__VA_ARGS__ >\ +{\ + NBL_CONSTEXPR_STATIC_INLINE int16_t exponentBitCnt = 11;\ + NBL_CONSTEXPR_STATIC_INLINE int16_t mantissaBitCnt = 52;\ +};\ +template<>\ +inline uint32_t extractBiasedExponent(__VA_ARGS__ x)\ +{\ + return extractBiasedExponent(x.data);\ +}\ +\ +template<>\ +inline int extractExponent(__VA_ARGS__ x)\ +{\ + return extractExponent(x.data);\ +}\ +\ +template<>\ +NBL_CONSTEXPR_INLINE_FUNC __VA_ARGS__ replaceBiasedExponent(__VA_ARGS__ x, typename unsigned_integer_of_size::type biasedExp)\ +{\ + return __VA_ARGS__(replaceBiasedExponent(x.data, biasedExp));\ +}\ +\ +template <>\ +NBL_CONSTEXPR_INLINE_FUNC __VA_ARGS__ fastMulExp2(__VA_ARGS__ x, int n)\ +{\ + return __VA_ARGS__(replaceBiasedExponent(x.data, extractBiasedExponent(x) + uint32_t(n)));\ +}\ +\ +template <>\ +NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size::type extractMantissa(__VA_ARGS__ x)\ +{\ + return extractMantissa(x.data);\ +}\ +\ +template <>\ +NBL_CONSTEXPR_INLINE_FUNC uint64_t extractNormalizeMantissa(__VA_ARGS__ x)\ +{\ + return extractNormalizeMantissa(x.data);\ +}\ +\ + +#define DEFINE_BIT_CAST_SPEC(...)\ +template<>\ +NBL_CONSTEXPR_FUNC __VA_ARGS__ bit_cast<__VA_ARGS__, uint64_t>(NBL_CONST_REF_ARG(uint64_t) val)\ +{\ +__VA_ARGS__ output;\ +output.data = val;\ +\ +return output;\ +}\ +\ +template<>\ +NBL_CONSTEXPR_FUNC __VA_ARGS__ bit_cast<__VA_ARGS__, float64_t>(NBL_CONST_REF_ARG(float64_t) val)\ +{\ +__VA_ARGS__ output;\ +output.data = bit_cast(val);\ +\ +return output;\ +}\ +\ + +namespace impl +{ + +template +struct static_cast_helper,void> +{ + static_assert(is_scalar::value); + + using From = emulated_float64_t; + + // TODO: test + static inline To cast(From v) + { + using ToAsFloat = typename float_of_size::type; + using ToAsUint = typename unsigned_integer_of_size::type; + + + if (is_same_v) + return To(bit_cast(v.data)); + + if (is_floating_point::value) + { + + const int exponent = ieee754::extractExponent(v.data); + if (!From::supportsFastMath()) + { + //TODO: i have no idea why it doesn't work, fix + /*if (exponent > ieee754::traits::exponentMax) + return bit_cast(ieee754::traits::inf); + if (exponent < ieee754::traits::exponentMin) + return -bit_cast(ieee754::traits::inf); + if (tgmath::isnan(v.data)) + return bit_cast(ieee754::traits::quietNaN);*/ + } + + + const uint32_t toBitSize = sizeof(To) * 8; + const ToAsUint sign = ToAsUint(ieee754::extractSign(v.data) << (toBitSize - 1)); + const ToAsUint biasedExponent = ToAsUint(exponent + ieee754::traits::exponentBias) << ieee754::traits::mantissaBitCnt; + const ToAsUint mantissa = ToAsUint(v.data >> (ieee754::traits::mantissaBitCnt - ieee754::traits::mantissaBitCnt)) & ieee754::traits::mantissaMask; + + return bit_cast(sign | biasedExponent | mantissa); + } + + // NOTE: casting from negative float to unsigned int is an UB, function will return abs value in this case + if (is_integral::value) + { + const int exponent = ieee754::extractExponent(v.data); + if (exponent < 0) + return 0; + + uint64_t unsignedOutput = ieee754::extractMantissa(v.data) & 1ull << ieee754::traits::mantissaBitCnt; + const int shiftAmount = exponent - int(ieee754::traits::mantissaBitCnt); + + if (shiftAmount < 0) + unsignedOutput <<= -shiftAmount; + else + unsignedOutput >>= shiftAmount; + + if (is_signed::value) + { + int64_t signedOutput64 = unsignedOutput & ((1ull << 63) - 1); + To signedOutput = To(signedOutput64); + if (ieee754::extractSignPreserveBitPattern(v.data) != 0) + signedOutput = -signedOutput; + + return signedOutput; + 
} + + return To(unsignedOutput); + } + + // assert(false); + return To(0xdeadbeefbadcaffeull); + } +}; + +template +struct static_cast_helper, float32_t, void> +{ + using To = emulated_float64_t; + + static inline To cast(float32_t v) + { + return To::create(v); + } +}; + +template +struct static_cast_helper, float64_t, void> +{ + using To = emulated_float64_t; + + static inline To cast(float64_t v) + { + return To::create(v); + } +}; + +template +struct static_cast_helper, uint32_t, void> +{ + using To = emulated_float64_t; + + static inline To cast(uint32_t v) + { + return To::create(v); + } +}; + +template +struct static_cast_helper, uint64_t, void> +{ + using To = emulated_float64_t; + + static inline To cast(uint64_t v) + { + return To::create(v); + } +}; + +template +struct static_cast_helper, emulated_float64_t, void> +{ + static inline emulated_float64_t cast(emulated_float64_t v) + { + return v; + } +}; + +} + +DEFINE_BIT_CAST_SPEC(emulated_float64_t); +DEFINE_BIT_CAST_SPEC(emulated_float64_t); +DEFINE_BIT_CAST_SPEC(emulated_float64_t); +DEFINE_BIT_CAST_SPEC(emulated_float64_t); + +//template +//struct is_floating_point > : bool_constant {}; + +namespace ieee754 +{ +IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); +IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); +IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); +IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t); +} + +} +} + +#undef IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE +#undef DEFINE_BIT_CAST_SPEC + +#endif diff --git a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl b/include/nbl/builtin/hlsl/emulated/emulated_float64_t_impl.hlsl similarity index 82% rename from include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl rename to include/nbl/builtin/hlsl/emulated/emulated_float64_t_impl.hlsl index 3045df49d4..d2b59e9607 100644 --- a/include/nbl/builtin/hlsl/impl/emulated_float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/emulated/emulated_float64_t_impl.hlsl @@ -1,509 +1,449 @@ -#ifndef _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_IMPL_INCLUDED_ -#define _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_IMPL_INCLUDED_ - -#include -#include -#include -#include -#include - -#define FLOAT_ROUND_NEAREST_EVEN 0 -#define FLOAT_ROUND_TO_ZERO 1 -#define FLOAT_ROUND_DOWN 2 -#define FLOAT_ROUND_UP 3 -#define FLOAT_ROUNDING_MODE FLOAT_ROUND_NEAREST_EVEN - -// TODO: when it will be possible, use this unions wherever they fit: -/* -* union Mantissa -* { -* struct -* { -* uint32_t highBits; -* uint64_t lowBits; -* }; -* -* uint32_t2 packed; -* }; -* -*/ - -/* -* union Mantissa -* { -* struct -* { -* uint64_t lhs; -* uint64_t rhs; -* }; -* -* uint32_t4 packed; -* }; -* -*/ - -namespace nbl -{ -namespace hlsl -{ -namespace impl -{ -NBL_CONSTEXPR_INLINE_FUNC uint64_t2 shiftMantissaLeftBy53(uint64_t mantissa64) -{ - uint64_t2 output; - output.x = mantissa64 >> (64 - ieee754::traits::mantissaBitCnt); - output.y = mantissa64 << (ieee754::traits::mantissaBitCnt); - - return output; -} - -NBL_CONSTEXPR_INLINE_FUNC uint64_t packFloat64(uint32_t zSign, int zExp, uint32_t zFrac0, uint32_t zFrac1) -{ - uint32_t2 z; - - z.x = zSign + (uint32_t(zExp) << 20) + zFrac0; - z.y = zFrac1; - - uint64_t output = 0u; - output |= (uint64_t(z.x) << 32) & 0xFFFFFFFF00000000ull; - output |= uint64_t(z.y); - return output; -} - -NBL_CONSTEXPR_INLINE_FUNC uint32_t2 packUint64(uint64_t val) -{ - return uint32_t2((val & 0xFFFFFFFF00000000ull) >> 32, val & 0x00000000FFFFFFFFull); -} - 
-NBL_CONSTEXPR_INLINE_FUNC uint64_t unpackUint64(uint32_t2 val) -{ - return ((uint64_t(val.x) & 0x00000000FFFFFFFFull) << 32) | uint64_t(val.y); -} - -template -inline uint64_t castFloat32ToStorageType(float32_t val) -{ - if (FlushDenormToZero) - { - const uint64_t sign = uint64_t(ieee754::extractSign(val)) << 63; - if (tgmath::isInf(val)) - return ieee754::traits::inf | sign; - uint32_t asUint = ieee754::impl::castToUintType(val); - const int f32BiasedExp = ieee754::extractBiasedExponent(val); - if (f32BiasedExp == 0) - return sign; - const uint64_t biasedExp = uint64_t(f32BiasedExp - ieee754::traits::exponentBias + ieee754::traits::exponentBias) << (ieee754::traits::mantissaBitCnt); - const uint64_t mantissa = (uint64_t(ieee754::traits::mantissaMask) & asUint) << (ieee754::traits::mantissaBitCnt - ieee754::traits::mantissaBitCnt); - - return sign | biasedExp | mantissa; - } - else - { - // static_assert(false); - return 0xdeadbeefbadcaffeull; - } -}; - -inline uint64_t castToUint64WithFloat64BitPattern(uint64_t val) -{ - if (val == 0) - return val; - -#ifndef __HLSL_VERSION - int exp = findMSB(val); -#else - int exp = 63; - uint64_t mask = ieee754::traits::signMask; - while (!(val & mask)) - { - --exp; - mask >>= 1; - } - - - //uint32_t2 valPacked = packUint64(val); - //int exp = valPacked.x ? firstbithigh(valPacked.x) + 32 : firstbithigh(valPacked.y); - //exp = 63 - exp; -#endif - uint64_t mantissa; - - int shiftCnt = 52 - exp; - if (shiftCnt >= 0) - { - mantissa = val << shiftCnt; - } - else - { - const int shiftCntAbs = -shiftCnt; - uint64_t roundingBit = 1ull << (shiftCnt - 1); - uint64_t stickyBitMask = roundingBit - 1; - uint64_t stickyBit = val & stickyBitMask; - - mantissa = val >> shiftCntAbs; - - if ((val & roundingBit) && (!stickyBit)) - { - bool isEven = mantissa & 1; - if (!isEven) - mantissa++; - } - else if ((val & roundingBit) && (stickyBit || (mantissa & 1))) - val += roundingBit; - - //val += (1ull << (shiftCnt)) - 1; - //mantissa = val >> shiftCntAbs; - - if (mantissa & 1ull << 53) - { - mantissa >>= 1; - exp++; - } - } - mantissa &= ieee754::traits::mantissaMask; - const uint64_t biasedExp = uint64_t(ieee754::traits::exponentBias + exp) << ieee754::traits::mantissaBitCnt; - - return biasedExp | mantissa; -}; - -inline uint64_t castToUint64WithFloat64BitPattern(int64_t val) -{ - const uint64_t sign = val & ieee754::traits::signMask; - const uint64_t absVal = uint64_t(abs(val)); - return sign | castToUint64WithFloat64BitPattern(absVal); -}; - -NBL_CONSTEXPR_INLINE_FUNC uint32_t2 umulExtended(uint32_t lhs, uint32_t rhs) -{ - uint64_t product = uint64_t(lhs) * uint64_t(rhs); - uint32_t2 output; - output.x = uint32_t((product & 0xFFFFFFFF00000000) >> 32); - output.y = uint32_t(product & 0x00000000FFFFFFFFull); - return output; -} - -NBL_CONSTEXPR_INLINE_FUNC uint64_t propagateFloat64NaN(uint64_t lhs, uint64_t rhs) -{ -#if defined RELAXED_NAN_PROPAGATION - return lhs | rhs; -#else - - lhs |= 0x0008000000000000ull; - rhs |= 0x0008000000000000ull; - return glsl::mix(rhs, glsl::mix(lhs, rhs, tgmath::isnan(rhs)), tgmath::isnan(lhs)); - return 0; -#endif -} - -static inline int countLeadingZeros32(uint32_t val) -{ -#ifndef __HLSL_VERSION - return 31 - findMSB(val); -#else - return 31 - firstbithigh(val); -#endif -} - -NBL_CONSTEXPR_INLINE_FUNC uint32_t2 shift64RightJamming(uint32_t2 val, int count) -{ - uint32_t2 output; - const int negCount = (-count) & 31; - - output.x = glsl::mix(0u, val.x, count == 0); - output.x = glsl::mix(output.x, (val.x >> count), count < 32); - - 
output.y = uint32_t((val.x | val.y) != 0u); /* count >= 64 */ - uint32_t z1_lt64 = (val.x>>(count & 31)) | uint32_t(((val.x<>count) | uint32_t((val.y<> (count & 31)), count < 64); - output.y = glsl::mix(output.y, (val.x << negCount) | (val.y >> count), count < 32); - - val.z = glsl::mix(val.z | val.y, val.z, count < 32); - output.x = glsl::mix(output.x, val.x >> count, count < 32); - output.z |= uint32_t(val.z != 0u); - - output.x = glsl::mix(output.x, 0u, (count == 32)); - output.y = glsl::mix(output.y, val.x, (count == 32)); - output.z = glsl::mix(output.z, val.y, (count == 32)); - output.x = glsl::mix(output.x, val.x, (count == 0)); - output.y = glsl::mix(output.y, val.y, (count == 0)); - output.z = glsl::mix(output.z, val.z, (count == 0)); - - return output; -} - -NBL_CONSTEXPR_INLINE_FUNC uint64_t shortShift64Left(uint64_t val, int count) -{ - const uint32_t2 packed = packUint64(val); - - uint32_t2 output; - output.y = packed.y << count; - // TODO: fix - output.x = glsl::mix((packed.x << count | (packed.y >> ((-count) & 31))), packed.x, count == 0); - - return unpackUint64(output); -}; - -NBL_CONSTEXPR_INLINE_FUNC uint64_t assembleFloat64(uint64_t signShifted, uint64_t expShifted, uint64_t mantissa) -{ - return signShifted + expShifted + mantissa; -} - -NBL_CONSTEXPR_INLINE_FUNC uint64_t roundAndPackFloat64(uint64_t zSign, int zExp, uint32_t3 mantissaExtended) -{ - bool roundNearestEven; - bool increment; - - roundNearestEven = true; - increment = int(mantissaExtended.z) < 0; - if (!roundNearestEven) - { - if (false) //(FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) - { - increment = false; - } - else - { - if (false) //(zSign != 0u) - { - //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && - // (zFrac2 != 0u); - } - else - { - //increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && - // (zFrac2 != 0u); - } - } - } - - // overflow handling? - // if biased exp is lesser then 2045 - if (0x7FD <= zExp) - { - if ((0x7FD < zExp) || ((zExp == 0x7FD) && (0x001FFFFFu == mantissaExtended.x && 0xFFFFFFFFu == mantissaExtended.y) && increment)) - { - if (false) // ((FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) || - // ((zSign != 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP)) || - // ((zSign == 0u) && (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN))) - { - return assembleFloat64(zSign, 0x7FE << ieee754::traits::mantissaBitCnt, 0x000FFFFFFFFFFFFFull); - } - - return assembleFloat64(zSign, ieee754::traits::exponentMask, 0ull); - } - } - - if (zExp < 0) - { - mantissaExtended = shift64ExtraRightJamming(mantissaExtended, -zExp); - zExp = 0; - - if (roundNearestEven) - { - increment = mantissaExtended.z < 0u; - } - else - { - if (zSign != 0u) - { - increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && (mantissaExtended.z != 0u); - } - else - { - increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && (mantissaExtended.z != 0u); - } - } - } - - if (increment) - { - const uint64_t added = impl::unpackUint64(uint32_t2(mantissaExtended.xy)) + 1ull; - mantissaExtended.xy = packUint64(added); - mantissaExtended.y &= ~((mantissaExtended.z + uint32_t(mantissaExtended.z == 0u)) & uint32_t(roundNearestEven)); - } - else - { - // ?? 
- zExp = glsl::mix(zExp, 0, (mantissaExtended.x | mantissaExtended.y) == 0u); - } - - return assembleFloat64(zSign, uint64_t(zExp) << ieee754::traits::mantissaBitCnt, unpackUint64(mantissaExtended.xy)); -} - -static inline uint64_t normalizeRoundAndPackFloat64(uint64_t sign, int exp, uint32_t frac0, uint32_t frac1) -{ - int shiftCount; - uint32_t3 frac = uint32_t3(frac0, frac1, 0u); - - if (frac.x == 0u) - { - exp -= 32; - frac.x = frac.y; - frac.y = 0u; - } - - shiftCount = countLeadingZeros32(frac.x) - 11; - if (0 <= shiftCount) - { - // TODO: this is packing and unpacking madness, fix it - frac.xy = packUint64(shortShift64Left(unpackUint64(frac.xy), shiftCount)); - } - else - { - frac.xyz = shift64ExtraRightJamming(uint32_t3(frac.xy, 0), -shiftCount); - } - exp -= shiftCount; - return roundAndPackFloat64(sign, exp, frac); -} - -static inline void normalizeFloat64Subnormal(uint64_t mantissa, - NBL_REF_ARG(int) outExp, - NBL_REF_ARG(uint64_t) outMantissa) -{ - uint32_t2 mantissaPacked = packUint64(mantissa); - int shiftCount; - uint32_t2 temp; - shiftCount = countLeadingZeros32(glsl::mix(mantissaPacked.x, mantissaPacked.y, mantissaPacked.x == 0u)) - 11; - outExp = glsl::mix(1 - shiftCount, -shiftCount - 31, mantissaPacked.x == 0u); - - temp.x = glsl::mix(mantissaPacked.y << shiftCount, mantissaPacked.y >> (-shiftCount), shiftCount < 0); - temp.y = glsl::mix(0u, mantissaPacked.y << (shiftCount & 31), shiftCount < 0); - - shortShift64Left(impl::unpackUint64(mantissaPacked), shiftCount); - - outMantissa = glsl::mix(outMantissa, unpackUint64(temp), mantissaPacked.x == 0); -} - -NBL_CONSTEXPR_INLINE_FUNC bool areBothInfinity(uint64_t lhs, uint64_t rhs) -{ - lhs &= ~ieee754::traits::signMask; - rhs &= ~ieee754::traits::signMask; - - return lhs == rhs && lhs == ieee754::traits::inf; -} - -NBL_CONSTEXPR_INLINE_FUNC bool areBothSameSignInfinity(uint64_t lhs, uint64_t rhs) -{ - return lhs == rhs && (lhs & ~ieee754::traits::signMask) == ieee754::traits::inf; -} - -NBL_CONSTEXPR_INLINE_FUNC bool isZero(uint64_t val) -{ - return (val << 1) == 0; -} - -NBL_CONSTEXPR_INLINE_FUNC bool areBothZero(uint64_t lhs, uint64_t rhs) -{ - return ((lhs << 1) == 0ull) && ((rhs << 1) == 0ull); -} - -NBL_CONSTEXPR_INLINE_FUNC bool areBothSameSignZero(uint64_t lhs, uint64_t rhs) -{ - return ((lhs << 1) == 0ull) && (lhs == rhs); -} - -// TODO: find more efficient algorithm -static inline uint64_t nlz64(uint64_t x) -{ - static const uint64_t MASK = 1ull << 63; - - uint64_t counter = 0; - - while ((x & MASK) == 0) - { - x <<= 1; - ++counter; - } - return counter; -} - -// returns pair of quotient and remainder -static inline uint64_t divmod128by64(const uint64_t dividentHigh, const uint64_t dividentLow, uint64_t divisor) -{ - const uint64_t b = 1ull << 32; - uint64_t un1, un0, vn1, vn0, q1, q0, un32, un21, un10, rhat, left, right; - uint64_t s; - - //TODO: countl_zero - s = countl_zero(divisor); - //s = nlz64(divisor); - divisor <<= s; - vn1 = divisor >> 32; - vn0 = divisor & 0xFFFFFFFF; - - if (s > 0) - { - un32 = (dividentHigh << s) | (dividentLow >> (64 - s)); - un10 = dividentLow << s; - } - else - { - un32 = dividentHigh; - un10 = dividentLow; - } - - un1 = un10 >> 32; - un0 = un10 & 0xFFFFFFFF; - - q1 = un32 / vn1; - rhat = un32 % vn1; - - left = q1 * vn0; - right = (rhat << 32) + un1; - while ((q1 >= b) || (left > right)) - { - --q1; - rhat += vn1; - if (rhat < b) - { - left -= vn0; - right = (rhat << 32) | un1; - } - break; - } - - un21 = (un32 << 32) + (un1 - (q1 * divisor)); - - q0 = un21 / vn1; - rhat = un21 % 
vn1; - - left = q0 * vn0; - right = (rhat << 32) | un0; - while ((q0 >= b) || (left > right)) - { - --q0; - rhat += vn1; - if (rhat < b) - { - left -= vn0; - right = (rhat << 32) | un0; - continue; - } - break; - } - - return (q1 << 32) | q0; -} -} -} -} +#ifndef _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_IMPL_INCLUDED_ +#define _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_IMPL_INCLUDED_ + +#include +#include +#include +#include +#include + +// TODO: when it will be possible, use this unions wherever they fit: +/* +* union Mantissa +* { +* struct +* { +* uint32_t highBits; +* uint64_t lowBits; +* }; +* +* uint32_t2 packed; +* }; +* +*/ +/* +* union Mantissa +* { +* struct +* { +* uint64_t lhs; +* uint64_t rhs; +* }; +* +* uint32_t4 packed; +* }; +* +*/ + +namespace nbl +{ +namespace hlsl +{ +namespace impl +{ +NBL_CONSTEXPR_INLINE_FUNC uint64_t2 shiftMantissaLeftBy53(uint64_t mantissa64) +{ + uint64_t2 output; + output.x = mantissa64 >> (64 - ieee754::traits::mantissaBitCnt); + output.y = mantissa64 << (ieee754::traits::mantissaBitCnt); + + return output; +} + +NBL_CONSTEXPR_INLINE_FUNC uint64_t packFloat64(uint32_t zSign, int zExp, uint32_t zFrac0, uint32_t zFrac1) +{ + uint32_t2 z; + + z.x = zSign + (uint32_t(zExp) << 20) + zFrac0; + z.y = zFrac1; + + uint64_t output = 0u; + output |= (uint64_t(z.x) << 32) & 0xFFFFFFFF00000000ull; + output |= uint64_t(z.y); + return output; +} + +NBL_CONSTEXPR_INLINE_FUNC uint32_t2 packUint64(uint64_t val) +{ + return uint32_t2((val & 0xFFFFFFFF00000000ull) >> 32, val & 0x00000000FFFFFFFFull); +} + +NBL_CONSTEXPR_INLINE_FUNC uint64_t unpackUint64(uint32_t2 val) +{ + return ((uint64_t(val.x) & 0x00000000FFFFFFFFull) << 32) | uint64_t(val.y); +} + +template +inline uint64_t castFloat32ToStorageType(float32_t val) +{ + if (FlushDenormToZero) + { + const uint64_t sign = uint64_t(ieee754::extractSign(val)) << 63; + if (tgmath::isinf(val)) + return ieee754::traits::inf | sign; + uint32_t asUint = ieee754::impl::bitCastToUintType(val); + const int f32BiasedExp = ieee754::extractBiasedExponent(val); + if (f32BiasedExp == 0) + return sign; + const uint64_t biasedExp = uint64_t(f32BiasedExp - ieee754::traits::exponentBias + ieee754::traits::exponentBias) << (ieee754::traits::mantissaBitCnt); + const uint64_t mantissa = (uint64_t(ieee754::traits::mantissaMask) & asUint) << (ieee754::traits::mantissaBitCnt - ieee754::traits::mantissaBitCnt); + + return sign | biasedExp | mantissa; + } + else + { + // static_assert(false); + return 0xdeadbeefbadcaffeull; + } +}; + +inline uint64_t castToUint64WithFloat64BitPattern(uint64_t val) +{ + if (val == 0) + return val; + +#ifndef __HLSL_VERSION + int exp = findMSB(val); +#else + int exp = 63; + uint64_t mask = ieee754::traits::signMask; + while (!(val & mask)) + { + --exp; + mask >>= 1; + } + + + //uint32_t2 valPacked = packUint64(val); + //int exp = valPacked.x ? 
firstbithigh(valPacked.x) + 32 : firstbithigh(valPacked.y); + //exp = 63 - exp; +#endif + uint64_t mantissa; + + int shiftCnt = 52 - exp; + if (shiftCnt >= 0) + { + mantissa = val << shiftCnt; + } + else + { + const int shiftCntAbs = -shiftCnt; + uint64_t roundingBit = 1ull << (shiftCnt - 1); + uint64_t stickyBitMask = roundingBit - 1; + uint64_t stickyBit = val & stickyBitMask; + + mantissa = val >> shiftCntAbs; + + if ((val & roundingBit) && (!stickyBit)) + { + bool isEven = mantissa & 1; + if (!isEven) + mantissa++; + } + else if ((val & roundingBit) && (stickyBit || (mantissa & 1))) + val += roundingBit; + + //val += (1ull << (shiftCnt)) - 1; + //mantissa = val >> shiftCntAbs; + + if (mantissa & 1ull << 53) + { + mantissa >>= 1; + exp++; + } + } + mantissa &= ieee754::traits::mantissaMask; + const uint64_t biasedExp = uint64_t(ieee754::traits::exponentBias + exp) << ieee754::traits::mantissaBitCnt; + + return biasedExp | mantissa; +}; + +inline uint64_t castToUint64WithFloat64BitPattern(int64_t val) +{ + const uint64_t sign = val & ieee754::traits::signMask; + const uint64_t absVal = uint64_t(abs(val)); + return sign | castToUint64WithFloat64BitPattern(absVal); +}; + +NBL_CONSTEXPR_INLINE_FUNC uint32_t2 umulExtended(uint32_t lhs, uint32_t rhs) +{ + uint64_t product = uint64_t(lhs) * uint64_t(rhs); + uint32_t2 output; + output.x = uint32_t((product & 0xFFFFFFFF00000000) >> 32); + output.y = uint32_t(product & 0x00000000FFFFFFFFull); + return output; +} + +NBL_CONSTEXPR_INLINE_FUNC uint64_t propagateFloat64NaN(uint64_t lhs, uint64_t rhs) +{ +#if defined RELAXED_NAN_PROPAGATION + return lhs | rhs; +#else + + lhs |= 0x0008000000000000ull; + rhs |= 0x0008000000000000ull; + return glsl::mix(rhs, glsl::mix(lhs, rhs, tgmath::isnan(rhs)), tgmath::isnan(lhs)); + return 0; +#endif +} + +static inline int countLeadingZeros32(uint32_t val) +{ +#ifndef __HLSL_VERSION + return 31 - findMSB(val); +#else + return 31 - firstbithigh(val); +#endif +} + +NBL_CONSTEXPR_INLINE_FUNC uint32_t2 shift64RightJamming(uint32_t2 val, int count) +{ + uint32_t2 output; + const int negCount = (-count) & 31; + + output.x = glsl::mix(0u, val.x, count == 0); + output.x = glsl::mix(output.x, (val.x >> count), count < 32); + + output.y = uint32_t((val.x | val.y) != 0u); /* count >= 64 */ + uint32_t z1_lt64 = (val.x>>(count & 31)) | uint32_t(((val.x<>count) | uint32_t((val.y<> (count & 31)), count < 64); + output.y = glsl::mix(output.y, (val.x << negCount) | (val.y >> count), count < 32); + + val.z = glsl::mix(val.z | val.y, val.z, count < 32); + output.x = glsl::mix(output.x, val.x >> count, count < 32); + output.z |= uint32_t(val.z != 0u); + + output.x = glsl::mix(output.x, 0u, (count == 32)); + output.y = glsl::mix(output.y, val.x, (count == 32)); + output.z = glsl::mix(output.z, val.y, (count == 32)); + output.x = glsl::mix(output.x, val.x, (count == 0)); + output.y = glsl::mix(output.y, val.y, (count == 0)); + output.z = glsl::mix(output.z, val.z, (count == 0)); + + return output; +} + +NBL_CONSTEXPR_INLINE_FUNC uint64_t shortShift64Left(uint64_t val, int count) +{ + const uint32_t2 packed = packUint64(val); + + uint32_t2 output; + output.y = packed.y << count; + // TODO: fix + output.x = glsl::mix((packed.x << count | (packed.y >> ((-count) & 31))), packed.x, count == 0); + + return unpackUint64(output); +}; + +NBL_CONSTEXPR_INLINE_FUNC uint64_t assembleFloat64(uint64_t signShifted, uint64_t expShifted, uint64_t mantissa) +{ + return signShifted + expShifted + mantissa; +} + +NBL_CONSTEXPR_INLINE_FUNC uint64_t 
roundAndPackFloat64(uint64_t zSign, int zExp, uint32_t3 mantissaExtended) +{ + bool roundNearestEven; + bool increment; + + roundNearestEven = true; + increment = int(mantissaExtended.z) < 0; + + // overflow handling? + // if biased exp is lesser then 2045 + if (0x7FD <= zExp) + { + if ((0x7FD < zExp) || ((zExp == 0x7FD) && (0x001FFFFFu == mantissaExtended.x && 0xFFFFFFFFu == mantissaExtended.y) && increment)) + return assembleFloat64(zSign, 0x7FE << ieee754::traits::mantissaBitCnt, 0x000FFFFFFFFFFFFFull); + + return assembleFloat64(zSign, ieee754::traits::exponentMask, 0ull); + } + + if (zExp < 0) + { + mantissaExtended = shift64ExtraRightJamming(mantissaExtended, -zExp); + zExp = 0; + } + + zExp = glsl::mix(zExp, 0, (mantissaExtended.x | mantissaExtended.y) == 0u); + + return assembleFloat64(zSign, uint64_t(zExp) << ieee754::traits::mantissaBitCnt, unpackUint64(mantissaExtended.xy)); +} + +static inline uint64_t normalizeRoundAndPackFloat64(uint64_t sign, int exp, uint32_t frac0, uint32_t frac1) +{ + int shiftCount; + uint32_t3 frac = uint32_t3(frac0, frac1, 0u); + + if (frac.x == 0u) + { + exp -= 32; + frac.x = frac.y; + frac.y = 0u; + } + + shiftCount = countLeadingZeros32(frac.x) - 11; + if (0 <= shiftCount) + { + // TODO: this is packing and unpacking madness, fix it + frac.xy = packUint64(shortShift64Left(unpackUint64(frac.xy), shiftCount)); + } + else + { + frac.xyz = shift64ExtraRightJamming(uint32_t3(frac.xy, 0), -shiftCount); + } + exp -= shiftCount; + return roundAndPackFloat64(sign, exp, frac); +} + +static inline void normalizeFloat64Subnormal(uint64_t mantissa, + NBL_REF_ARG(int) outExp, + NBL_REF_ARG(uint64_t) outMantissa) +{ + uint32_t2 mantissaPacked = packUint64(mantissa); + int shiftCount; + uint32_t2 temp; + shiftCount = countLeadingZeros32(glsl::mix(mantissaPacked.x, mantissaPacked.y, mantissaPacked.x == 0u)) - 11; + outExp = glsl::mix(1 - shiftCount, -shiftCount - 31, mantissaPacked.x == 0u); + + temp.x = glsl::mix(mantissaPacked.y << shiftCount, mantissaPacked.y >> (-shiftCount), shiftCount < 0); + temp.y = glsl::mix(0u, mantissaPacked.y << (shiftCount & 31), shiftCount < 0); + + shortShift64Left(impl::unpackUint64(mantissaPacked), shiftCount); + + outMantissa = glsl::mix(outMantissa, unpackUint64(temp), mantissaPacked.x == 0); +} + +NBL_CONSTEXPR_INLINE_FUNC bool areBothInfinity(uint64_t lhs, uint64_t rhs) +{ + lhs &= ~ieee754::traits::signMask; + rhs &= ~ieee754::traits::signMask; + + return lhs == rhs && lhs == ieee754::traits::inf; +} + +NBL_CONSTEXPR_INLINE_FUNC bool areBothSameSignInfinity(uint64_t lhs, uint64_t rhs) +{ + return lhs == rhs && (lhs & ~ieee754::traits::signMask) == ieee754::traits::inf; +} + +NBL_CONSTEXPR_INLINE_FUNC bool isZero(uint64_t val) +{ + return (val << 1) == 0; +} + +NBL_CONSTEXPR_INLINE_FUNC bool areBothZero(uint64_t lhs, uint64_t rhs) +{ + return ((lhs << 1) == 0ull) && ((rhs << 1) == 0ull); +} + +NBL_CONSTEXPR_INLINE_FUNC bool areBothSameSignZero(uint64_t lhs, uint64_t rhs) +{ + return ((lhs << 1) == 0ull) && (lhs == rhs); +} + +// TODO: find more efficient algorithm +static inline uint64_t nlz64(uint64_t x) +{ + static const uint64_t MASK = 1ull << 63; + + uint64_t counter = 0; + + while ((x & MASK) == 0) + { + x <<= 1; + ++counter; + } + return counter; +} + +// returns pair of quotient and remainder +static inline uint64_t divmod128by64(const uint64_t dividentHigh, const uint64_t dividentLow, uint64_t divisor) +{ + const uint64_t b = 1ull << 32; + uint64_t un1, un0, vn1, vn0, q1, q0, un32, un21, un10, rhat, left, right; + 
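+ // This is the textbook normalize-and-correct long division with 32-bit digits: the
+ // divisor is shifted left until its top bit is set, the 128-bit dividend is shifted by
+ // the same amount and split into 32-bit limbs, and the two quotient digits q1 and q0 are
+ // estimated with 64/32 divisions and then adjusted. Despite the "returns pair of quotient
+ // and remainder" note above, only the 64-bit quotient (q1 << 32) | q0 is returned.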
uint64_t s; + + //TODO: countl_zero + s = countl_zero(divisor); + //s = nlz64(divisor); + divisor <<= s; + vn1 = divisor >> 32; + vn0 = divisor & 0xFFFFFFFF; + + if (s > 0) + { + un32 = (dividentHigh << s) | (dividentLow >> (64 - s)); + un10 = dividentLow << s; + } + else + { + un32 = dividentHigh; + un10 = dividentLow; + } + + un1 = un10 >> 32; + un0 = un10 & 0xFFFFFFFF; + + q1 = un32 / vn1; + rhat = un32 % vn1; + + left = q1 * vn0; + right = (rhat << 32) + un1; + while ((q1 >= b) || (left > right)) + { + --q1; + rhat += vn1; + if (rhat < b) + { + left -= vn0; + right = (rhat << 32) | un1; + } + break; + } + + un21 = (un32 << 32) + (un1 - (q1 * divisor)); + + q0 = un21 / vn1; + rhat = un21 % vn1; + + left = q0 * vn0; + right = (rhat << 32) | un0; + while ((q0 >= b) || (left > right)) + { + --q0; + rhat += vn1; + if (rhat < b) + { + left -= vn0; + right = (rhat << 32) | un0; + continue; + } + break; + } + + return (q1 << 32) | q0; +} +} +} +} #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/emulated_float64_t_utils.hlsl b/include/nbl/builtin/hlsl/emulated/emulated_float64_t_utils.hlsl similarity index 98% rename from include/nbl/builtin/hlsl/emulated_float64_t_utils.hlsl rename to include/nbl/builtin/hlsl/emulated/emulated_float64_t_utils.hlsl index bf58f4e005..ac7b79e74b 100644 --- a/include/nbl/builtin/hlsl/emulated_float64_t_utils.hlsl +++ b/include/nbl/builtin/hlsl/emulated/emulated_float64_t_utils.hlsl @@ -3,7 +3,7 @@ #include #include -#include +#include namespace nbl { @@ -18,7 +18,7 @@ template >::type; #else template -using portable_float64_t = typename conditional >::type; +using portable_float64_t = typename conditional >::type; #endif template diff --git a/include/nbl/builtin/hlsl/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754/ieee754.hlsl similarity index 87% rename from include/nbl/builtin/hlsl/ieee754.hlsl rename to include/nbl/builtin/hlsl/ieee754/ieee754.hlsl index 7e36501f0a..f869f4ceba 100644 --- a/include/nbl/builtin/hlsl/ieee754.hlsl +++ b/include/nbl/builtin/hlsl/ieee754/ieee754.hlsl @@ -27,15 +27,15 @@ namespace impl } template - NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type castToUintType(T x) + NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type bitCastToUintType(T x) { using AsUint = typename unsigned_integer_of_size::type; return bit_cast(x); } // to avoid bit cast from uintN_t to uintN_t - template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size<2>::type castToUintType(uint16_t x) { return x; } - template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size<4>::type castToUintType(uint32_t x) { return x; } - template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size<8>::type castToUintType(uint64_t x) { return x; } + template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size<2>::type bitCastToUintType(uint16_t x) { return x; } + template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size<4>::type bitCastToUintType(uint32_t x) { return x; } + template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size<8>::type bitCastToUintType(uint64_t x) { return x; } template NBL_CONSTEXPR_INLINE_FUNC T castBackToFloatType(T x) @@ -102,7 +102,7 @@ template inline uint32_t extractBiasedExponent(T x) { using AsUint = typename unsigned_integer_of_size::type; - return glsl::bitfieldExtract(impl::castToUintType(x), traits::type>::mantissaBitCnt, traits::type>::exponentBitCnt); + return glsl::bitfieldExtract(impl::bitCastToUintType(x), traits::type>::mantissaBitCnt, traits::type>::exponentBitCnt); } 
template<> @@ -115,7 +115,7 @@ inline uint32_t extractBiasedExponent(uint64_t x) template<> inline uint32_t extractBiasedExponent(float64_t x) { - return extractBiasedExponent(impl::castToUintType(x)); + return extractBiasedExponent(impl::bitCastToUintType(x)); } template @@ -131,7 +131,7 @@ NBL_CONSTEXPR_INLINE_FUNC T replaceBiasedExponent(T x, typename unsigned_integer // TODO: //staticAssertTmp(impl::isTypeAllowed(), "Invalid type! Only floating point or unsigned integer types are allowed."); using AsFloat = typename float_of_size::type; - return impl::castBackToFloatType(glsl::bitfieldInsert(impl::castToUintType(x), biasedExp, traits::mantissaBitCnt, traits::exponentBitCnt)); + return impl::castBackToFloatType(glsl::bitfieldInsert(impl::bitCastToUintType(x), biasedExp, traits::mantissaBitCnt, traits::exponentBitCnt)); } // performs no overflow tests, returns x*exp2(n) @@ -145,7 +145,7 @@ template NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type extractMantissa(T x) { using AsUint = typename unsigned_integer_of_size::type; - return impl::castToUintType(x) & traits::type>::mantissaMask; + return impl::bitCastToUintType(x) & traits::type>::mantissaMask; } template @@ -160,14 +160,14 @@ template NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type extractSign(T x) { using AsFloat = typename float_of_size::type; - return (impl::castToUintType(x) & traits::signMask) >> ((sizeof(T) * 8) - 1); + return (impl::bitCastToUintType(x) & traits::signMask) >> ((sizeof(T) * 8) - 1); } template NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type extractSignPreserveBitPattern(T x) { using AsFloat = typename float_of_size::type; - return impl::castToUintType(x) & traits::signMask; + return impl::bitCastToUintType(x) & traits::signMask; } } diff --git a/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl b/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl index ba0f70ba67..4d40e6f327 100644 --- a/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl +++ b/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl @@ -5,7 +5,7 @@ #ifndef _NBL_BUILTIN_HLSL_MATH_EQUATIONS_QUADRATIC_INCLUDED_ #define _NBL_BUILTIN_HLSL_MATH_EQUATIONS_QUADRATIC_INCLUDED_ -#include +#include // TODO: Later include from correct hlsl header #ifndef nbl_hlsl_FLT_EPSILON diff --git a/include/nbl/builtin/hlsl/shapes/beziers.hlsl b/include/nbl/builtin/hlsl/shapes/beziers.hlsl index d3178a29f7..7fdab46942 100644 --- a/include/nbl/builtin/hlsl/shapes/beziers.hlsl +++ b/include/nbl/builtin/hlsl/shapes/beziers.hlsl @@ -10,8 +10,8 @@ #include #include #include -#include -#include +#include +#include // TODO: Later include from correct hlsl header (numeric_limits.hlsl) #ifndef nbl_hlsl_FLT_EPSILON diff --git a/include/nbl/builtin/hlsl/tgmath.hlsl b/include/nbl/builtin/hlsl/tgmath.hlsl index 75cddf27c0..9fac5dce96 100644 --- a/include/nbl/builtin/hlsl/tgmath.hlsl +++ b/include/nbl/builtin/hlsl/tgmath.hlsl @@ -5,7 +5,7 @@ #define _NBL_BUILTIN_HLSL_TGMATH_INCLUDED_ #include -#include +#include #include namespace nbl @@ -27,7 +27,7 @@ inline bool isnan(T val) } template -NBL_CONSTEXPR_INLINE_FUNC bool isInf(T val) +NBL_CONSTEXPR_INLINE_FUNC bool isinf(T val) { using AsUint = typename unsigned_integer_of_size::type; using AsFloat = typename float_of_size::type; diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index cad9995c2c..e06d9e2077 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -231,14 +231,14 @@ 
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/normalization/shared_nor # HLSL LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/macros.h") -#impl -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/impl/emulated_float64_t_impl.hlsl") #emulated -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated_float64_t.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated_float64_t_utils.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/emulated_float64_t.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/emulated_float64_t_utils.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/emulated_float64_t_impl.hlsl") +#ieee754 +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ieee754/ieee754.hlsl") #utility LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ieee754.hlsl") #spirv intrinsics LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/core.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/fragment_shader_pixel_interlock.hlsl") diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index f13429efc1..541a600b03 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -1615,8 +1615,7 @@ core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevic vk_deviceFeatures2.features.shaderStorageImageArrayDynamicIndexing = limits.shaderStorageImageArrayDynamicIndexing; vk_deviceFeatures2.features.shaderClipDistance = true; // good device support vk_deviceFeatures2.features.shaderCullDistance = enabledFeatures.shaderCullDistance; - //vk_deviceFeatures2.features.shaderFloat64 = limits.shaderFloat64; // TODO: enable back - vk_deviceFeatures2.features.shaderFloat64 = VK_FALSE; + vk_deviceFeatures2.features.shaderFloat64 = limits.shaderFloat64; vk_deviceFeatures2.features.shaderInt64 = true; // always enable vk_deviceFeatures2.features.shaderInt16 = true; // always enable vk_deviceFeatures2.features.shaderResourceResidency = enabledFeatures.shaderResourceResidency; From 1c029c10dc57aff734fccda728add61b2414c057 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Wed, 4 Sep 2024 00:13:39 +0100 Subject: [PATCH 046/358] Reduced branches in operator+ --- examples_tests | 2 +- .../hlsl/emulated/emulated_float64_t.hlsl | 212 +++--------------- include/nbl/builtin/hlsl/ieee754/ieee754.hlsl | 2 +- 3 files changed, 35 insertions(+), 181 deletions(-) diff --git a/examples_tests b/examples_tests index f8f2c23aab..a9e1007e59 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit f8f2c23aab0092015b3bf31cdab9875c1435f5dc +Subproject commit a9e1007e592786182c25ed1f6c25ad9e9306107c diff --git a/include/nbl/builtin/hlsl/emulated/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/emulated_float64_t.hlsl index 088e535c5b..a6eb9311d3 100644 --- a/include/nbl/builtin/hlsl/emulated/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/emulated_float64_t.hlsl @@ -72,161 +72,6 @@ namespace hlsl return emulated_float64_t(bit_cast(float64_t(val))); }*/ - // TODO: remove - emulated_float64_t addOld(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC - { - if (FlushDenormToZero) - { - emulated_float64_t retval = emulated_float64_t::create(0ull); - - uint64_t mantissa; - uint32_t3 mantissaExtended; - int biasedExp; - - uint64_t lhsSign = data & ieee754::traits::signMask; - uint64_t rhsSign = rhs.data & ieee754::traits::signMask; - 
uint64_t lhsMantissa = ieee754::extractMantissa(data); - uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); - int lhsBiasedExp = ieee754::extractBiasedExponent(data); - int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); - - int expDiff = lhsBiasedExp - rhsBiasedExp; - - if (lhsSign == rhsSign) - { - if (expDiff == 0) - { - if (lhsBiasedExp == ieee754::traits::specialValueExp) - { - bool propagate = (lhsMantissa | rhsMantissa) != 0u; - return bit_cast >(glsl::mix(data, impl::propagateFloat64NaN(data, rhs.data), propagate)); - } - - mantissa = lhsMantissa + rhsMantissa; - if (lhsBiasedExp == 0) - return bit_cast >(impl::assembleFloat64(lhsSign, 0, mantissa)); - mantissaExtended.xy = impl::packUint64(mantissa); - mantissaExtended.x |= 0x00200000u; - mantissaExtended.z = 0u; - biasedExp = lhsBiasedExp; - - mantissaExtended = impl::shift64ExtraRightJamming(mantissaExtended, 1); - } - else - { - if (expDiff < 0) - { - swap(lhsMantissa, rhsMantissa); - swap(lhsBiasedExp, rhsBiasedExp); - } - - if (lhsBiasedExp == ieee754::traits::specialValueExp) - { - const bool propagate = (lhsMantissa) != 0u; - return bit_cast >(glsl::mix(ieee754::traits::exponentMask | lhsSign, impl::propagateFloat64NaN(data, rhs.data), propagate)); - } - - expDiff = glsl::mix(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); - rhsMantissa = glsl::mix(rhsMantissa | (1ull << 52), rhsMantissa, rhsBiasedExp == 0); - const uint32_t3 shifted = impl::shift64ExtraRightJamming(uint32_t3(impl::packUint64(rhsMantissa), 0u), expDiff); - rhsMantissa = impl::unpackUint64(shifted.xy); - mantissaExtended.z = shifted.z; - biasedExp = lhsBiasedExp; - - lhsMantissa |= (1ull << 52); - mantissaExtended.xy = impl::packUint64(lhsMantissa + rhsMantissa); - --biasedExp; - if (!(mantissaExtended.x < 0x00200000u)) - { - mantissaExtended = impl::shift64ExtraRightJamming(mantissaExtended, 1); - ++biasedExp; - } - - return bit_cast >(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended.xyz)); - } - - // cannot happen but compiler cries about not every path returning value - return bit_cast >(impl::roundAndPackFloat64(lhsSign, biasedExp, mantissaExtended)); - } - else - { - lhsMantissa = impl::shortShift64Left(lhsMantissa, 10); - rhsMantissa = impl::shortShift64Left(rhsMantissa, 10); - - if (expDiff != 0) - { - uint32_t2 frac; - - if (expDiff < 0) - { - swap(lhsMantissa, rhsMantissa); - swap(lhsBiasedExp, rhsBiasedExp); - lhsSign ^= ieee754::traits::signMask; - } - - if (lhsBiasedExp == ieee754::traits::specialValueExp) - { - bool propagate = lhsMantissa != 0u; - return bit_cast >(glsl::mix(impl::assembleFloat64(lhsSign, ieee754::traits::exponentMask, 0ull), impl::propagateFloat64NaN(data, rhs.data), propagate)); - } - - expDiff = glsl::mix(abs(expDiff), abs(expDiff) - 1, rhsBiasedExp == 0); - rhsMantissa = glsl::mix(rhsMantissa | 0x4000000000000000ull, rhsMantissa, rhsBiasedExp == 0); - rhsMantissa = impl::unpackUint64(impl::shift64RightJamming(impl::packUint64(rhsMantissa), expDiff)); - lhsMantissa |= 0x4000000000000000ull; - frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); - biasedExp = lhsBiasedExp; - --biasedExp; - return bit_cast >(impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 10, frac.x, frac.y)); - } - if (lhsBiasedExp == ieee754::traits::specialValueExp) - { - bool propagate = ((lhsMantissa) | (rhsMantissa)) != 0u; - return bit_cast >(glsl::mix(ieee754::traits::quietNaN, impl::propagateFloat64NaN(data, rhs.data), propagate)); - } - rhsBiasedExp = glsl::mix(rhsBiasedExp, 1, lhsBiasedExp == 0); - 
lhsBiasedExp = glsl::mix(lhsBiasedExp, 1, lhsBiasedExp == 0); - - - const uint32_t2 lhsMantissaPacked = impl::packUint64(lhsMantissa); - const uint32_t2 rhsMantissaPacked = impl::packUint64(rhsMantissa); - - uint32_t2 frac; - uint64_t signOfDifference = 0; - if (rhsMantissaPacked.x < lhsMantissaPacked.x) - { - frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); - } - else if (lhsMantissaPacked.x < rhsMantissaPacked.x) - { - frac.xy = impl::packUint64(rhsMantissa - lhsMantissa); - signOfDifference = ieee754::traits::signMask; - } - else if (rhsMantissaPacked.y <= lhsMantissaPacked.y) - { - /* It is possible that frac.x and frac.y may be zero after this. */ - frac.xy = impl::packUint64(lhsMantissa - rhsMantissa); - } - else - { - frac.xy = impl::packUint64(rhsMantissa - lhsMantissa); - signOfDifference = ieee754::traits::signMask; - } - - biasedExp = glsl::mix(rhsBiasedExp, lhsBiasedExp, signOfDifference == 0u); - lhsSign ^= signOfDifference; - uint64_t retval_0 = impl::packFloat64(0, 0, 0u, 0u); - uint64_t retval_1 = impl::normalizeRoundAndPackFloat64(lhsSign, biasedExp - 11, frac.x, frac.y); - return bit_cast >(glsl::mix(retval_0, retval_1, frac.x != 0u || frac.y != 0u)); - } - } - else - { - //static_assert(false, "not implemented yet"); - return bit_cast >(0xdeadbeefbadcaffeull); - } - } - // arithmetic operators this_t operator+(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { @@ -235,8 +80,8 @@ namespace hlsl if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return bit_cast(ieee754::traits::quietNaN); - int lhsBiasedExp = ieee754::extractBiasedExponent(data); - int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); + const int lhsBiasedExp = ieee754::extractBiasedExponent(data); + const int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); if (lhsBiasedExp == 0ull) return bit_cast(rhs.data); @@ -246,39 +91,48 @@ namespace hlsl const uint64_t lhsSign = ieee754::extractSign(data); const uint64_t rhsSign = ieee754::extractSign(rhs.data); - if (lhsSign != rhsSign) - return addOld(rhs); - - // assuming lhsSign == rhsSign - const uint64_t resultSign = lhsSign == 0 ? 0 : ieee754::traits::signMask; - - if (!FastMath && (tgmath::isinf(data) || tgmath::isinf(rhs.data))) - return bit_cast(ieee754::traits::inf | resultSign); + if (!FastMath && tgmath::isinf(data)) + return bit_cast(ieee754::traits::inf | ieee754::extractSignPreserveBitPattern(max(data, rhs.data))); uint64_t lhsNormMantissa = ieee754::extractNormalizeMantissa(data); uint64_t rhsNormMantissa = ieee754::extractNormalizeMantissa(rhs.data); - // TODO: branchless? - /*if (lhsSign != rhsSign) - { - if (lhsSign) - lhsNormMantissa *= -1; - if (rhsSign) - rhsNormMantissa *= -1; - }*/ - - int expDiff = lhsBiasedExp - rhsBiasedExp; + const int expDiff = lhsBiasedExp - rhsBiasedExp; - int exp = max(lhsBiasedExp, rhsBiasedExp) - ieee754::traits::exponentBias; - uint32_t shiftAmount = abs(expDiff); + const int exp = max(lhsBiasedExp, rhsBiasedExp) - ieee754::traits::exponentBias; + const uint32_t shiftAmount = abs(expDiff); - // so lhsNormMantissa always holds mantissa of number with greater exponent + uint64_t resultSign; if (expDiff < 0) + { + // so lhsNormMantissa always holds mantissa of number with greater exponent swap(lhsNormMantissa, rhsNormMantissa); + resultSign = rhsSign > 0 ? ieee754::traits::signMask : 0ull; + } + else + { + resultSign = lhsSign > 0 ? 
ieee754::traits::signMask : 0ull; + } rhsNormMantissa >>= shiftAmount; - uint64_t resultMantissa = lhsNormMantissa + rhsNormMantissa; + uint64_t resultMantissa; + if (lhsSign != rhsSign) + { + int64_t mantissaDiff = lhsNormMantissa - rhsNormMantissa; + if (mantissaDiff < 0) + swap(lhsNormMantissa, rhsNormMantissa); + + lhsNormMantissa <<= 10; + rhsNormMantissa <<= 10; + resultMantissa = uint64_t(int64_t(lhsNormMantissa) - int64_t(rhsNormMantissa)); + resultMantissa >>= 10; + } + else + { + resultMantissa = lhsNormMantissa + rhsNormMantissa; + } + //const uint64_t resultSign = ((lhsSign && rhsSign) || (resultMantissa & (lhsSign << 63))) << 63; uint64_t resultBiasedExp = uint64_t(exp) + ieee754::traits::exponentBias; diff --git a/include/nbl/builtin/hlsl/ieee754/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754/ieee754.hlsl index f869f4ceba..e6e6477954 100644 --- a/include/nbl/builtin/hlsl/ieee754/ieee754.hlsl +++ b/include/nbl/builtin/hlsl/ieee754/ieee754.hlsl @@ -85,7 +85,7 @@ struct traits : traits_base using bit_rep_t = typename unsigned_integer_of_size::type; using base_t = traits_base; - NBL_CONSTEXPR_STATIC_INLINE bit_rep_t signMask = bit_rep_t(0x1u) << (sizeof(Float) * 8 - 1); + NBL_CONSTEXPR_STATIC_INLINE bit_rep_t signMask = bit_rep_t(0x1ull) << (sizeof(Float) * 8 - 1); NBL_CONSTEXPR_STATIC_INLINE bit_rep_t exponentMask = ((~bit_rep_t(0)) << base_t::mantissaBitCnt) ^ signMask; NBL_CONSTEXPR_STATIC_INLINE bit_rep_t mantissaMask = (bit_rep_t(0x1u) << base_t::mantissaBitCnt) - 1; NBL_CONSTEXPR_STATIC_INLINE int exponentBias = (int(0x1) << (base_t::exponentBitCnt - 1)) - 1; From c9f7e47e5e08cba58a270bb7e6ed4ddf0e61eb2c Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Fri, 6 Sep 2024 12:07:42 +0100 Subject: [PATCH 047/358] Enhanced more tests, more fixes --- examples_tests | 2 +- .../hlsl/emulated/emulated_float64_t.hlsl | 162 ++++++++---------- .../emulated/emulated_float64_t_impl.hlsl | 148 ++-------------- .../emulated/emulated_float64_t_utils.hlsl | 23 +-- include/nbl/builtin/hlsl/tgmath.hlsl | 2 +- 5 files changed, 95 insertions(+), 242 deletions(-) diff --git a/examples_tests b/examples_tests index a9e1007e59..1750aefb29 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit a9e1007e592786182c25ed1f6c25ad9e9306107c +Subproject commit 1750aefb2936657a4d5cd242399c7ba8895f43a0 diff --git a/include/nbl/builtin/hlsl/emulated/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/emulated_float64_t.hlsl index a6eb9311d3..ecf9aa0d22 100644 --- a/include/nbl/builtin/hlsl/emulated/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/emulated_float64_t.hlsl @@ -80,38 +80,56 @@ namespace hlsl if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return bit_cast(ieee754::traits::quietNaN); + if (!FastMath && impl::areBothInfinity(data, rhs.data)) + { + uint64_t lhsSign = data & ieee754::traits::signMask; + uint64_t rhsSign = rhs.data & ieee754::traits::signMask; + + if(lhsSign == rhsSign) + return bit_cast(ieee754::traits::inf | lhsSign); + else if(lhsSign || rhsSign) + return bit_cast(ieee754::traits::quietNaN | ieee754::traits::signMask); + } + + if (!FastMath && tgmath::isinf(data)) + return bit_cast(data); + + if (!FastMath && tgmath::isinf(rhs.data)) + return bit_cast(rhs.data); + const int lhsBiasedExp = ieee754::extractBiasedExponent(data); const int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); - if (lhsBiasedExp == 0ull) - return bit_cast(rhs.data); - if (rhsBiasedExp == 0ull) - return bit_cast(data); + uint64_t lhsData = 
impl::flushDenormToZero(lhsBiasedExp, data); + uint64_t rhsData = impl::flushDenormToZero(rhsBiasedExp, rhs.data); - const uint64_t lhsSign = ieee754::extractSign(data); - const uint64_t rhsSign = ieee754::extractSign(rhs.data); + uint64_t lhsSign = ieee754::extractSignPreserveBitPattern(lhsData); + uint64_t rhsSign = ieee754::extractSignPreserveBitPattern(rhsData); - if (!FastMath && tgmath::isinf(data)) - return bit_cast(ieee754::traits::inf | ieee754::extractSignPreserveBitPattern(max(data, rhs.data))); + if (!FastMath && impl::areBothZero(lhsData, rhsData)) + { + if (lhsSign == rhsSign) + return bit_cast(lhsSign); + else + return bit_cast(0ull); + } - uint64_t lhsNormMantissa = ieee754::extractNormalizeMantissa(data); - uint64_t rhsNormMantissa = ieee754::extractNormalizeMantissa(rhs.data); + if (!FastMath && tgmath::isinf(lhsData)) + return bit_cast(ieee754::traits::inf | ieee754::extractSignPreserveBitPattern(max(lhsData, rhsData))); + + uint64_t lhsNormMantissa = ieee754::extractNormalizeMantissa(lhsData); + uint64_t rhsNormMantissa = ieee754::extractNormalizeMantissa(rhsData); const int expDiff = lhsBiasedExp - rhsBiasedExp; const int exp = max(lhsBiasedExp, rhsBiasedExp) - ieee754::traits::exponentBias; const uint32_t shiftAmount = abs(expDiff); - uint64_t resultSign; if (expDiff < 0) { // so lhsNormMantissa always holds mantissa of number with greater exponent swap(lhsNormMantissa, rhsNormMantissa); - resultSign = rhsSign > 0 ? ieee754::traits::signMask : 0ull; - } - else - { - resultSign = lhsSign > 0 ? ieee754::traits::signMask : 0ull; + swap(lhsSign, rhsSign); } rhsNormMantissa >>= shiftAmount; @@ -121,7 +139,10 @@ namespace hlsl { int64_t mantissaDiff = lhsNormMantissa - rhsNormMantissa; if (mantissaDiff < 0) + { swap(lhsNormMantissa, rhsNormMantissa); + swap(lhsSign, rhsSign); + } lhsNormMantissa <<= 10; rhsNormMantissa <<= 10; @@ -132,20 +153,15 @@ namespace hlsl { resultMantissa = lhsNormMantissa + rhsNormMantissa; } - - //const uint64_t resultSign = ((lhsSign && rhsSign) || (resultMantissa & (lhsSign << 63))) << 63; uint64_t resultBiasedExp = uint64_t(exp) + ieee754::traits::exponentBias; - resultMantissa = resultMantissa; - if (resultMantissa & 1ull << 53) { ++resultBiasedExp; resultMantissa >>= 1; } - // TODO: better implementation with no loop while (resultMantissa < (1ull << 52)) { --resultBiasedExp; @@ -153,7 +169,7 @@ namespace hlsl } resultMantissa &= ieee754::traits::mantissaMask; - uint64_t output = impl::assembleFloat64(resultSign, uint64_t(resultBiasedExp) << ieee754::traits::mantissaBitCnt, resultMantissa); + uint64_t output = impl::assembleFloat64(lhsSign, resultBiasedExp << ieee754::traits::mantissaBitCnt, resultMantissa); return bit_cast(output); } @@ -186,53 +202,27 @@ namespace hlsl { emulated_float64_t retval = this_t::create(0ull); - uint64_t lhsSign = data & ieee754::traits::signMask; - uint64_t rhsSign = rhs.data & ieee754::traits::signMask; - uint64_t lhsMantissa = ieee754::extractMantissa(data); - uint64_t rhsMantissa = ieee754::extractMantissa(rhs.data); int lhsBiasedExp = ieee754::extractBiasedExponent(data); int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); - int exp = int(lhsBiasedExp + rhsBiasedExp) - ieee754::traits::exponentBias; - uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; - if (!FastMath) - { - if (lhsBiasedExp == ieee754::traits::specialValueExp) - { - if ((lhsMantissa != 0u) || ((rhsBiasedExp == ieee754::traits::specialValueExp) && (rhsMantissa != 0u))) - return bit_cast(impl::propagateFloat64NaN(data, 
rhs.data)); - if ((uint64_t(rhsBiasedExp) | rhsMantissa) == 0u) - return bit_cast(ieee754::traits::quietNaN); + uint64_t lhsData = impl::flushDenormToZero(lhsBiasedExp, data); + uint64_t rhsData = impl::flushDenormToZero(rhsBiasedExp, rhs.data); - return bit_cast(impl::assembleFloat64(sign, ieee754::traits::exponentMask, 0ull)); - } - if (rhsBiasedExp == ieee754::traits::specialValueExp) - { - /* a cannot be NaN, but is b NaN? */ - if (rhsMantissa != 0u) -#ifdef RELAXED_NAN_PROPAGATION - return rhs.data; -#else - return bit_cast(impl::propagateFloat64NaN(data, rhs.data)); -#endif - if ((uint64_t(lhsBiasedExp) | lhsMantissa) == 0u) - return bit_cast(ieee754::traits::quietNaN); + uint64_t lhsSign = lhsData & ieee754::traits::signMask; + uint64_t rhsSign = rhsData & ieee754::traits::signMask; - return bit_cast(sign | ieee754::traits::exponentMask); - } - if (lhsBiasedExp == 0) - { - if (lhsMantissa == 0u) - return bit_cast(sign); - impl::normalizeFloat64Subnormal(lhsMantissa, lhsBiasedExp, lhsMantissa); - } - if (rhsBiasedExp == 0) - { - if (rhsMantissa == 0u) - return bit_cast(sign); - impl::normalizeFloat64Subnormal(rhsMantissa, rhsBiasedExp, rhsMantissa); - } - } + uint64_t lhsMantissa = ieee754::extractMantissa(lhsData); + uint64_t rhsMantissa = ieee754::extractMantissa(rhsData); + + int exp = int(lhsBiasedExp + rhsBiasedExp) - ieee754::traits::exponentBias; + uint64_t sign = (lhsData ^ rhsData) & ieee754::traits::signMask; + + if (!FastMath && (tgmath::isnan(lhsData) || tgmath::isnan(rhsData))) + return bit_cast(ieee754::traits::quietNaN | sign); + if (!FastMath && (tgmath::isinf(lhsData) || tgmath::isinf(rhsData))) + return bit_cast(ieee754::traits::inf | sign); + if (!FastMath && impl::areBothZero(lhsData, rhsData)) + return bit_cast(sign); const uint64_t hi_l = (lhsMantissa >> 21) | (1ull << 31); const uint64_t lo_l = lhsMantissa & ((1ull << 21) - 1); @@ -263,42 +253,36 @@ namespace hlsl return _static_cast(data) * create(rhs); } - /*this_t reciprocal(uint64_t x) - { - using ThisType = this_t; - ThisType output = ThisType::bit_cast((0xbfcdd6a18f6a6f52ULL - x) >> 1); - output = output * output; - return output; - }*/ - emulated_float64_t operator/(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { if (FlushDenormToZero) { - //return this_t::bit_cast(data) * reciprocal(rhs.data); - if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return bit_cast(ieee754::traits::quietNaN); const uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; if (!FastMath && impl::isZero(rhs.data)) - return bit_cast(ieee754::traits::inf | sign); - + return bit_cast(ieee754::traits::quietNaN | sign); if (!FastMath && impl::areBothInfinity(data, rhs.data)) - return bit_cast(ieee754::traits::quietNaN); - + return bit_cast(ieee754::traits::quietNaN | ieee754::traits::signMask); if (!FastMath && tgmath::isinf(data)) return bit_cast(ieee754::traits::inf | sign); - if (!FastMath && tgmath::isinf(rhs.data)) return bit_cast(0ull | sign); + if (!FastMath && impl::isZero(rhs.data)) + return bit_cast(ieee754::traits::quietNaN | sign); + + int lhsBiasedExp = ieee754::extractBiasedExponent(data); + int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); + uint64_t lhsData = impl::flushDenormToZero(lhsBiasedExp, data); + uint64_t rhsData = impl::flushDenormToZero(rhsBiasedExp, rhs.data); - const uint64_t lhsRealMantissa = (ieee754::extractMantissa(data) | (1ull << ieee754::traits::mantissaBitCnt)); - const uint64_t rhsRealMantissa = ieee754::extractMantissa(rhs.data) | (1ull << 
ieee754::traits::mantissaBitCnt); + const uint64_t lhsRealMantissa = (ieee754::extractMantissa(lhsData) | (1ull << ieee754::traits::mantissaBitCnt)); + const uint64_t rhsRealMantissa = ieee754::extractMantissa(rhsData) | (1ull << ieee754::traits::mantissaBitCnt); - int exp = ieee754::extractExponent(data) - ieee754::extractExponent(rhs.data) + ieee754::traits::exponentBias; + int exp = lhsBiasedExp - rhsBiasedExp + int(ieee754::traits::exponentBias); uint64_t2 lhsMantissaShifted = impl::shiftMantissaLeftBy53(lhsRealMantissa); uint64_t mantissa = impl::divmod128by64(lhsMantissaShifted.x, lhsMantissaShifted.y, rhsRealMantissa); @@ -326,12 +310,10 @@ namespace hlsl { if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return false; - // TODO: i'm not sure about this one if (!FastMath && impl::areBothZero(data, rhs.data)) return true; const emulated_float64_t xored = bit_cast(data ^ rhs.data); - // TODO: check what fast math returns for -0 == 0 if ((xored.data & 0x7FFFFFFFFFFFFFFFull) == 0ull) return true; @@ -348,7 +330,7 @@ namespace hlsl { if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return false; - if (!FastMath && impl::areBothSameSignInfinity(data, rhs.data)) + if (!FastMath && impl::areBothInfinity(data, rhs.data)) return false; if (!FastMath && impl::areBothZero(data, rhs.data)) return false; @@ -366,7 +348,7 @@ namespace hlsl { if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return false; - if (!FastMath && impl::areBothSameSignInfinity(data, rhs.data)) + if (!FastMath && impl::areBothInfinity(data, rhs.data)) return false; if (!FastMath && impl::areBothZero(data, rhs.data)) return false; @@ -495,7 +477,6 @@ struct static_cast_helper,void using From = emulated_float64_t; - // TODO: test static inline To cast(From v) { using ToAsFloat = typename float_of_size::type; @@ -511,13 +492,12 @@ struct static_cast_helper,void const int exponent = ieee754::extractExponent(v.data); if (!From::supportsFastMath()) { - //TODO: i have no idea why it doesn't work, fix - /*if (exponent > ieee754::traits::exponentMax) + if (exponent > ieee754::traits::exponentMax) return bit_cast(ieee754::traits::inf); if (exponent < ieee754::traits::exponentMin) - return -bit_cast(ieee754::traits::inf); + return bit_cast(-ieee754::traits::inf); if (tgmath::isnan(v.data)) - return bit_cast(ieee754::traits::quietNaN);*/ + return bit_cast(ieee754::traits::quietNaN); } diff --git a/include/nbl/builtin/hlsl/emulated/emulated_float64_t_impl.hlsl b/include/nbl/builtin/hlsl/emulated/emulated_float64_t_impl.hlsl index d2b59e9607..6778cd34b5 100644 --- a/include/nbl/builtin/hlsl/emulated/emulated_float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/emulated/emulated_float64_t_impl.hlsl @@ -97,9 +97,14 @@ inline uint64_t castFloat32ToStorageType(float32_t val) } }; +NBL_CONSTEXPR_INLINE_FUNC bool isZero(uint64_t val) +{ + return (val << 1) == 0; +} + inline uint64_t castToUint64WithFloat64BitPattern(uint64_t val) { - if (val == 0) + if (isZero(val)) return val; #ifndef __HLSL_VERSION @@ -187,6 +192,11 @@ NBL_CONSTEXPR_INLINE_FUNC uint64_t propagateFloat64NaN(uint64_t lhs, uint64_t rh #endif } +NBL_CONSTEXPR_INLINE_FUNC uint64_t flushDenormToZero(uint64_t extractedBiasedExponent, uint64_t value) +{ + return extractedBiasedExponent ? 
value : ieee754::extractSignPreserveBitPattern(value); +} + static inline int countLeadingZeros32(uint32_t val) { #ifndef __HLSL_VERSION @@ -215,122 +225,17 @@ NBL_CONSTEXPR_INLINE_FUNC uint32_t2 shift64RightJamming(uint32_t2 val, int count return output; } -NBL_CONSTEXPR_INLINE_FUNC uint32_t3 shift64ExtraRightJamming(uint32_t3 val, int count) -{ - uint32_t3 output; - output.x = 0u; - - int negCount = (-count) & 31; - - output.z = glsl::mix(uint32_t(val.x != 0u), val.x, count == 64); - output.z = glsl::mix(output.z, val.x << negCount, count < 64); - output.z = glsl::mix(output.z, val.y << negCount, count < 32); - - output.y = glsl::mix(0u, (val.x >> (count & 31)), count < 64); - output.y = glsl::mix(output.y, (val.x << negCount) | (val.y >> count), count < 32); - - val.z = glsl::mix(val.z | val.y, val.z, count < 32); - output.x = glsl::mix(output.x, val.x >> count, count < 32); - output.z |= uint32_t(val.z != 0u); - - output.x = glsl::mix(output.x, 0u, (count == 32)); - output.y = glsl::mix(output.y, val.x, (count == 32)); - output.z = glsl::mix(output.z, val.y, (count == 32)); - output.x = glsl::mix(output.x, val.x, (count == 0)); - output.y = glsl::mix(output.y, val.y, (count == 0)); - output.z = glsl::mix(output.z, val.z, (count == 0)); - - return output; -} - -NBL_CONSTEXPR_INLINE_FUNC uint64_t shortShift64Left(uint64_t val, int count) -{ - const uint32_t2 packed = packUint64(val); - - uint32_t2 output; - output.y = packed.y << count; - // TODO: fix - output.x = glsl::mix((packed.x << count | (packed.y >> ((-count) & 31))), packed.x, count == 0); - - return unpackUint64(output); -}; - NBL_CONSTEXPR_INLINE_FUNC uint64_t assembleFloat64(uint64_t signShifted, uint64_t expShifted, uint64_t mantissa) { return signShifted + expShifted + mantissa; } -NBL_CONSTEXPR_INLINE_FUNC uint64_t roundAndPackFloat64(uint64_t zSign, int zExp, uint32_t3 mantissaExtended) -{ - bool roundNearestEven; - bool increment; - - roundNearestEven = true; - increment = int(mantissaExtended.z) < 0; - - // overflow handling? 
- // if biased exp is lesser then 2045 - if (0x7FD <= zExp) - { - if ((0x7FD < zExp) || ((zExp == 0x7FD) && (0x001FFFFFu == mantissaExtended.x && 0xFFFFFFFFu == mantissaExtended.y) && increment)) - return assembleFloat64(zSign, 0x7FE << ieee754::traits::mantissaBitCnt, 0x000FFFFFFFFFFFFFull); - - return assembleFloat64(zSign, ieee754::traits::exponentMask, 0ull); - } - - if (zExp < 0) - { - mantissaExtended = shift64ExtraRightJamming(mantissaExtended, -zExp); - zExp = 0; - } - - zExp = glsl::mix(zExp, 0, (mantissaExtended.x | mantissaExtended.y) == 0u); - - return assembleFloat64(zSign, uint64_t(zExp) << ieee754::traits::mantissaBitCnt, unpackUint64(mantissaExtended.xy)); -} - -static inline uint64_t normalizeRoundAndPackFloat64(uint64_t sign, int exp, uint32_t frac0, uint32_t frac1) -{ - int shiftCount; - uint32_t3 frac = uint32_t3(frac0, frac1, 0u); - - if (frac.x == 0u) - { - exp -= 32; - frac.x = frac.y; - frac.y = 0u; - } - - shiftCount = countLeadingZeros32(frac.x) - 11; - if (0 <= shiftCount) - { - // TODO: this is packing and unpacking madness, fix it - frac.xy = packUint64(shortShift64Left(unpackUint64(frac.xy), shiftCount)); - } - else - { - frac.xyz = shift64ExtraRightJamming(uint32_t3(frac.xy, 0), -shiftCount); - } - exp -= shiftCount; - return roundAndPackFloat64(sign, exp, frac); -} - +//TODO: remove static inline void normalizeFloat64Subnormal(uint64_t mantissa, NBL_REF_ARG(int) outExp, NBL_REF_ARG(uint64_t) outMantissa) { - uint32_t2 mantissaPacked = packUint64(mantissa); - int shiftCount; - uint32_t2 temp; - shiftCount = countLeadingZeros32(glsl::mix(mantissaPacked.x, mantissaPacked.y, mantissaPacked.x == 0u)) - 11; - outExp = glsl::mix(1 - shiftCount, -shiftCount - 31, mantissaPacked.x == 0u); - - temp.x = glsl::mix(mantissaPacked.y << shiftCount, mantissaPacked.y >> (-shiftCount), shiftCount < 0); - temp.y = glsl::mix(0u, mantissaPacked.y << (shiftCount & 31), shiftCount < 0); - - shortShift64Left(impl::unpackUint64(mantissaPacked), shiftCount); - - outMantissa = glsl::mix(outMantissa, unpackUint64(temp), mantissaPacked.x == 0); + return; } NBL_CONSTEXPR_INLINE_FUNC bool areBothInfinity(uint64_t lhs, uint64_t rhs) @@ -341,16 +246,6 @@ NBL_CONSTEXPR_INLINE_FUNC bool areBothInfinity(uint64_t lhs, uint64_t rhs) return lhs == rhs && lhs == ieee754::traits::inf; } -NBL_CONSTEXPR_INLINE_FUNC bool areBothSameSignInfinity(uint64_t lhs, uint64_t rhs) -{ - return lhs == rhs && (lhs & ~ieee754::traits::signMask) == ieee754::traits::inf; -} - -NBL_CONSTEXPR_INLINE_FUNC bool isZero(uint64_t val) -{ - return (val << 1) == 0; -} - NBL_CONSTEXPR_INLINE_FUNC bool areBothZero(uint64_t lhs, uint64_t rhs) { return ((lhs << 1) == 0ull) && ((rhs << 1) == 0ull); @@ -361,21 +256,6 @@ NBL_CONSTEXPR_INLINE_FUNC bool areBothSameSignZero(uint64_t lhs, uint64_t rhs) return ((lhs << 1) == 0ull) && (lhs == rhs); } -// TODO: find more efficient algorithm -static inline uint64_t nlz64(uint64_t x) -{ - static const uint64_t MASK = 1ull << 63; - - uint64_t counter = 0; - - while ((x & MASK) == 0) - { - x <<= 1; - ++counter; - } - return counter; -} - // returns pair of quotient and remainder static inline uint64_t divmod128by64(const uint64_t dividentHigh, const uint64_t dividentLow, uint64_t divisor) { @@ -383,9 +263,7 @@ static inline uint64_t divmod128by64(const uint64_t dividentHigh, const uint64_t uint64_t un1, un0, vn1, vn0, q1, q0, un32, un21, un10, rhat, left, right; uint64_t s; - //TODO: countl_zero s = countl_zero(divisor); - //s = nlz64(divisor); divisor <<= s; vn1 = divisor >> 32; vn0 = 
divisor & 0xFFFFFFFF; diff --git a/include/nbl/builtin/hlsl/emulated/emulated_float64_t_utils.hlsl b/include/nbl/builtin/hlsl/emulated/emulated_float64_t_utils.hlsl index ac7b79e74b..6bc017fb73 100644 --- a/include/nbl/builtin/hlsl/emulated/emulated_float64_t_utils.hlsl +++ b/include/nbl/builtin/hlsl/emulated/emulated_float64_t_utils.hlsl @@ -163,7 +163,6 @@ using emulated_vector_t3 = emulated_vector; template using emulated_vector_t4 = emulated_vector; -// TODO: works only for float, fix namespace impl { @@ -265,21 +264,19 @@ struct emulated_matrix // : emulated_matrix_base; using portable_matrix64_t2x2 = portable_matrix_t2x2 >; using portable_matrix64_t3x3 = portable_matrix_t3x3 >; - -// TODO: fix template NBL_CONSTEXPR_INLINE_FUNC portable_float64_t<> create_portable_float64_t(T val) { diff --git a/include/nbl/builtin/hlsl/tgmath.hlsl b/include/nbl/builtin/hlsl/tgmath.hlsl index 9fac5dce96..d603dea2c0 100644 --- a/include/nbl/builtin/hlsl/tgmath.hlsl +++ b/include/nbl/builtin/hlsl/tgmath.hlsl @@ -33,7 +33,7 @@ NBL_CONSTEXPR_INLINE_FUNC bool isinf(T val) using AsFloat = typename float_of_size::type; AsUint tmp = bit_cast(val); - return (tmp & ~ieee754::traits::signMask) == ieee754::traits::inf; + return (tmp & (~ieee754::traits::signMask)) == ieee754::traits::inf; } } From fbfc7c289cf02a272a3733f88ed2ab7cba7e71d4 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Fri, 6 Sep 2024 16:41:29 +0100 Subject: [PATCH 048/358] Updated examples --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 1750aefb29..5e47cf34a4 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 1750aefb2936657a4d5cd242399c7ba8895f43a0 +Subproject commit 5e47cf34a46c1d32237a7f4aa5222a795aa81fde From 2743cb981828573e3686bd3a5aed182ddbfb252b Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Fri, 13 Sep 2024 11:24:36 +0100 Subject: [PATCH 049/358] 64 bit float type is now determined by device capabilities --- examples_tests | 2 +- .../hlsl/emulated/emulated_float64_t.hlsl | 195 +++++++------ .../emulated/emulated_float64_t_impl.hlsl | 61 ++-- .../hlsl/math/equations/quadratic.hlsl | 2 +- .../nbl/builtin/hlsl/portable_float64_t.hlsl | 22 ++ ...tils.hlsl => portable_float64_t_math.hlsl} | 267 +++++++++--------- include/nbl/builtin/hlsl/shapes/beziers.hlsl | 23 +- src/nbl/builtin/CMakeLists.txt | 3 +- 8 files changed, 308 insertions(+), 267 deletions(-) create mode 100644 include/nbl/builtin/hlsl/portable_float64_t.hlsl rename include/nbl/builtin/hlsl/{emulated/emulated_float64_t_utils.hlsl => portable_float64_t_math.hlsl} (64%) diff --git a/examples_tests b/examples_tests index 5e47cf34a4..904da40cc7 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 5e47cf34a46c1d32237a7f4aa5222a795aa81fde +Subproject commit 904da40cc731d5f25ff6319ec7f497b06c579b00 diff --git a/include/nbl/builtin/hlsl/emulated/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/emulated_float64_t.hlsl index ecf9aa0d22..aa600e7eca 100644 --- a/include/nbl/builtin/hlsl/emulated/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/emulated_float64_t.hlsl @@ -7,6 +7,16 @@ namespace nbl { namespace hlsl { + /*enum E_ROUNDING_MODE + { + FLOAT_ROUND_NEAREST_EVEN, + FLOAT_ROUND_TO_ZERO, + FLOAT_ROUND_DOWN, + FLOAT_ROUND_UP + };*/ + + // currently only FLOAT_ROUND_TO_ZERO is supported, cannot implement partial specialization in this case due to dxc bug https://github.com/microsoft/DirectXShaderCompiler/issues/5563 + // TODO: partial 
specializations with new template parameter `E_ROUNDING_MODE RoundingMode` template struct emulated_float64_t { @@ -77,25 +87,28 @@ namespace hlsl { if (FlushDenormToZero) { - if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) - return bit_cast(ieee754::traits::quietNaN); - - if (!FastMath && impl::areBothInfinity(data, rhs.data)) + if(FastMath) { - uint64_t lhsSign = data & ieee754::traits::signMask; - uint64_t rhsSign = rhs.data & ieee754::traits::signMask; + if (tgmath::isnan(data) || tgmath::isnan(rhs.data)) + return bit_cast(ieee754::traits::quietNaN); - if(lhsSign == rhsSign) - return bit_cast(ieee754::traits::inf | lhsSign); - else if(lhsSign || rhsSign) - return bit_cast(ieee754::traits::quietNaN | ieee754::traits::signMask); - } + if (impl::areBothInfinity(data, rhs.data)) + { + uint64_t lhsSign = data & ieee754::traits::signMask; + uint64_t rhsSign = rhs.data & ieee754::traits::signMask; - if (!FastMath && tgmath::isinf(data)) - return bit_cast(data); + if (lhsSign == rhsSign) + return bit_cast(ieee754::traits::inf | lhsSign); + else if (lhsSign || rhsSign) + return bit_cast(ieee754::traits::quietNaN | ieee754::traits::signMask); + } - if (!FastMath && tgmath::isinf(rhs.data)) - return bit_cast(rhs.data); + if (tgmath::isinf(data)) + return bit_cast(data); + + if (tgmath::isinf(rhs.data)) + return bit_cast(rhs.data); + } const int lhsBiasedExp = ieee754::extractBiasedExponent(data); const int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); @@ -105,17 +118,20 @@ namespace hlsl uint64_t lhsSign = ieee754::extractSignPreserveBitPattern(lhsData); uint64_t rhsSign = ieee754::extractSignPreserveBitPattern(rhsData); - - if (!FastMath && impl::areBothZero(lhsData, rhsData)) + + if(FastMath) { - if (lhsSign == rhsSign) - return bit_cast(lhsSign); - else - return bit_cast(0ull); - } + if (impl::areBothZero(lhsData, rhsData)) + { + if (lhsSign == rhsSign) + return bit_cast(lhsSign); + else + return bit_cast(0ull); + } - if (!FastMath && tgmath::isinf(lhsData)) - return bit_cast(ieee754::traits::inf | ieee754::extractSignPreserveBitPattern(max(lhsData, rhsData))); + if (tgmath::isinf(lhsData)) + return bit_cast(ieee754::traits::inf | ieee754::extractSignPreserveBitPattern(max(lhsData, rhsData))); + } uint64_t lhsNormMantissa = ieee754::extractNormalizeMantissa(lhsData); uint64_t rhsNormMantissa = ieee754::extractNormalizeMantissa(rhsData); @@ -156,6 +172,9 @@ namespace hlsl uint64_t resultBiasedExp = uint64_t(exp) + ieee754::traits::exponentBias; + if (resultMantissa == 0ull) + return _static_cast(0ull); + if (resultMantissa & 1ull << 53) { ++resultBiasedExp; @@ -216,13 +235,16 @@ namespace hlsl int exp = int(lhsBiasedExp + rhsBiasedExp) - ieee754::traits::exponentBias; uint64_t sign = (lhsData ^ rhsData) & ieee754::traits::signMask; - - if (!FastMath && (tgmath::isnan(lhsData) || tgmath::isnan(rhsData))) - return bit_cast(ieee754::traits::quietNaN | sign); - if (!FastMath && (tgmath::isinf(lhsData) || tgmath::isinf(rhsData))) - return bit_cast(ieee754::traits::inf | sign); - if (!FastMath && impl::areBothZero(lhsData, rhsData)) - return bit_cast(sign); + if (FastMath) + { + if (tgmath::isnan(lhsData) || tgmath::isnan(rhsData)) + return bit_cast(ieee754::traits::quietNaN | sign); + if (tgmath::isinf(lhsData) || tgmath::isinf(rhsData)) + return bit_cast(ieee754::traits::inf | sign); + if (impl::areBothZero(lhsData, rhsData)) + return bit_cast(sign); + } + const uint64_t hi_l = (lhsMantissa >> 21) | (1ull << 31); const uint64_t lo_l = lhsMantissa & ((1ull << 21) - 1); 
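// For reference: operator* above keeps only the high part of the 106-bit product of two
// 53-bit mantissas, built from the hi/lo split shown here. Below is a standalone C++
// sketch (not from this patch series) of the exact full-width technique - a 64x64 -> 128-bit
// multiply assembled from 32-bit partial products - which could serve as a reference when
// validating that truncated product. All names are illustrative only.
#include <cstdint>
#include <cassert>

static void mul64wide(uint64_t a, uint64_t b, uint64_t& hi, uint64_t& lo)
{
    const uint64_t aLo = (uint32_t)a, aHi = a >> 32;
    const uint64_t bLo = (uint32_t)b, bHi = b >> 32;
    const uint64_t p0 = aLo * bLo;             // contributes to bits 0..63
    const uint64_t p1 = aLo * bHi;             // contributes to bits 32..95
    const uint64_t p2 = aHi * bLo;             // contributes to bits 32..95
    const uint64_t p3 = aHi * bHi;             // contributes to bits 64..127
    const uint64_t mid  = p1 + (p0 >> 32);     // cannot overflow 64 bits
    const uint64_t mid2 = p2 + (uint32_t)mid;  // cannot overflow 64 bits
    hi = p3 + (mid >> 32) + (mid2 >> 32);
    lo = (mid2 << 32) | (uint32_t)p0;
}

int main()
{
    uint64_t hi, lo;
    // (2^64 - 1)^2 = (2^64 - 2) * 2^64 + 1
    mul64wide(~0ull, ~0ull, hi, lo);
    assert(hi == 0xFFFFFFFFFFFFFFFEull && lo == 1ull);
    // two mantissas of exactly 1.0 (only the implicit leading bit set): 2^52 * 2^52 = 2^104
    mul64wide(1ull << 52, 1ull << 52, hi, lo);
    assert(hi == (1ull << 40) && lo == 0ull);
    return 0;
}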
@@ -232,6 +254,10 @@ namespace hlsl //const uint64_t RoundToNearest = (1ull << 31) - 1; uint64_t newPseudoMantissa = ((hi_l * hi_r) >> 10) + ((hi_l * lo_r + lo_l * hi_r/* + RoundToNearest*/) >> 31); + if (newPseudoMantissa == 0ull) + return _static_cast(0ull); + + if (newPseudoMantissa & (0x1ull << 53)) { newPseudoMantissa >>= 1; @@ -257,21 +283,23 @@ namespace hlsl { if (FlushDenormToZero) { - if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) - return bit_cast(ieee754::traits::quietNaN); - const uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; - if (!FastMath && impl::isZero(rhs.data)) - return bit_cast(ieee754::traits::quietNaN | sign); - if (!FastMath && impl::areBothInfinity(data, rhs.data)) - return bit_cast(ieee754::traits::quietNaN | ieee754::traits::signMask); - if (!FastMath && tgmath::isinf(data)) - return bit_cast(ieee754::traits::inf | sign); - if (!FastMath && tgmath::isinf(rhs.data)) - return bit_cast(0ull | sign); - if (!FastMath && impl::isZero(rhs.data)) - return bit_cast(ieee754::traits::quietNaN | sign); + if(FastMath) + { + if (tgmath::isnan(data) || tgmath::isnan(rhs.data)) + return bit_cast(ieee754::traits::quietNaN); + if (impl::isZero(rhs.data)) + return bit_cast(ieee754::traits::quietNaN | sign); + if (impl::areBothInfinity(data, rhs.data)) + return bit_cast(ieee754::traits::quietNaN | ieee754::traits::signMask); + if (tgmath::isinf(data)) + return bit_cast(ieee754::traits::inf | sign); + if (tgmath::isinf(rhs.data)) + return bit_cast(0ull | sign); + if (impl::isZero(rhs.data)) + return bit_cast(ieee754::traits::quietNaN | sign); + } int lhsBiasedExp = ieee754::extractBiasedExponent(data); int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); @@ -287,10 +315,13 @@ namespace hlsl uint64_t2 lhsMantissaShifted = impl::shiftMantissaLeftBy53(lhsRealMantissa); uint64_t mantissa = impl::divmod128by64(lhsMantissaShifted.x, lhsMantissaShifted.y, rhsRealMantissa); - while (mantissa < (1ull << 52)) + const int msb = impl::_findMSB(mantissa); + if(msb != -1) { - mantissa <<= 1; - exp--; + const int shiftAmount = 52 - msb; + assert(shiftAmount >= 0); + mantissa <<= shiftAmount; + exp -= shiftAmount; } mantissa &= ieee754::traits::mantissaMask; @@ -305,13 +336,15 @@ namespace hlsl } // relational operators - // TODO: should `FlushDenormToZero` affect relational operators? 
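// For reference: a standalone C++ sketch (not from this patch series) of the classic
// bit-pattern ordering trick that relational operators like the ones below are closely
// related to. Sign-magnitude bit patterns are mapped to a monotonic unsigned key (flip
// all bits of negatives, set the sign bit of non-negatives); NaNs and +/-0 still need
// the separate handling visible in the operators. All names are illustrative only.
#include <cstdint>
#include <cstring>
#include <cassert>

static uint64_t bitsOf(double x) { uint64_t u; std::memcpy(&u, &x, sizeof(u)); return u; }

static uint64_t orderKey(uint64_t bits)
{
    const uint64_t signMask = 0x8000000000000000ull;
    return (bits & signMask) ? ~bits : (bits | signMask);
}

static bool lessByBits(double a, double b)
{
    return orderKey(bitsOf(a)) < orderKey(bitsOf(b));
}

int main()
{
    assert( lessByBits(-2.0, -1.0));
    assert( lessByBits(-1.0,  0.5));
    assert( lessByBits( 0.5,  1.5));
    assert(!lessByBits( 1.5,  0.5));
    assert( lessByBits(-0.0,  0.0)); // unlike double's operator<, hence the areBothZero checks
    return 0;
}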
bool operator==(this_t rhs) NBL_CONST_MEMBER_FUNC { - if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) - return false; - if (!FastMath && impl::areBothZero(data, rhs.data)) - return true; + if (FastMath) + { + if (tgmath::isnan(data) || tgmath::isnan(rhs.data)) + return false; + if (impl::areBothZero(data, rhs.data)) + return true; + } const emulated_float64_t xored = bit_cast(data ^ rhs.data); if ((xored.data & 0x7FFFFFFFFFFFFFFFull) == 0ull) @@ -321,19 +354,22 @@ namespace hlsl } bool operator!=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { - if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) + if (FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return false; return !(bit_cast(data) == rhs); } bool operator<(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { - if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) - return false; - if (!FastMath && impl::areBothInfinity(data, rhs.data)) - return false; - if (!FastMath && impl::areBothZero(data, rhs.data)) - return false; + if (FastMath) + { + if (tgmath::isnan(data) || tgmath::isnan(rhs.data)) + return false; + if (impl::areBothInfinity(data, rhs.data)) + return false; + if (impl::areBothZero(data, rhs.data)) + return false; + } const uint64_t lhsSign = ieee754::extractSign(data); const uint64_t rhsSign = ieee754::extractSign(rhs.data); @@ -346,12 +382,15 @@ namespace hlsl } bool operator>(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { - if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) - return false; - if (!FastMath && impl::areBothInfinity(data, rhs.data)) - return false; - if (!FastMath && impl::areBothZero(data, rhs.data)) - return false; + if (FastMath) + { + if (tgmath::isnan(data) || tgmath::isnan(rhs.data)) + return false; + if (impl::areBothInfinity(data, rhs.data)) + return false; + if (impl::areBothZero(data, rhs.data)) + return false; + } const uint64_t lhsSign = ieee754::extractSign(data); const uint64_t rhsSign = ieee754::extractSign(rhs.data); @@ -364,43 +403,25 @@ namespace hlsl } bool operator<=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { - if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) + if (FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return false; return !(bit_cast(data) > bit_cast(rhs.data)); } bool operator>=(emulated_float64_t rhs) { - if (!FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) + if (FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) return false; return !(bit_cast(data) < bit_cast(rhs.data)); } - //logical operators - bool operator&&(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { return bool(data) && bool(rhs.data); } - bool operator||(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { return bool(data) || bool(rhs.data); } - bool operator!() NBL_CONST_MEMBER_FUNC { return !bool(data); } - emulated_float64_t flipSign() { return bit_cast(data ^ ieee754::traits::signMask); } - NBL_CONSTEXPR_STATIC_INLINE bool supportsFastMath() - { - return FastMath; - } - - enum E_ROUNDING_MODE - { - FLOAT_ROUND_NEAREST_EVEN, - FLOAT_ROUND_TO_ZERO, - FLOAT_ROUND_DOWN, - FLOAT_ROUND_UP - }; - - static const E_ROUNDING_MODE RoundingMode = E_ROUNDING_MODE::FLOAT_ROUND_TO_ZERO; + NBL_CONSTEXPR_STATIC bool isFastMathSupported = FastMath; }; #define IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(...) 
\ @@ -490,7 +511,7 @@ struct static_cast_helper,void { const int exponent = ieee754::extractExponent(v.data); - if (!From::supportsFastMath()) + if (!From::isFastMathSuppoerted) { if (exponent > ieee754::traits::exponentMax) return bit_cast(ieee754::traits::inf); diff --git a/include/nbl/builtin/hlsl/emulated/emulated_float64_t_impl.hlsl b/include/nbl/builtin/hlsl/emulated/emulated_float64_t_impl.hlsl index 6778cd34b5..acb5adf713 100644 --- a/include/nbl/builtin/hlsl/emulated/emulated_float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/emulated/emulated_float64_t_impl.hlsl @@ -63,16 +63,6 @@ NBL_CONSTEXPR_INLINE_FUNC uint64_t packFloat64(uint32_t zSign, int zExp, uint32_ return output; } -NBL_CONSTEXPR_INLINE_FUNC uint32_t2 packUint64(uint64_t val) -{ - return uint32_t2((val & 0xFFFFFFFF00000000ull) >> 32, val & 0x00000000FFFFFFFFull); -} - -NBL_CONSTEXPR_INLINE_FUNC uint64_t unpackUint64(uint32_t2 val) -{ - return ((uint64_t(val.x) & 0x00000000FFFFFFFFull) << 32) | uint64_t(val.y); -} - template inline uint64_t castFloat32ToStorageType(float32_t val) { @@ -102,27 +92,36 @@ NBL_CONSTEXPR_INLINE_FUNC bool isZero(uint64_t val) return (val << 1) == 0; } -inline uint64_t castToUint64WithFloat64BitPattern(uint64_t val) +// TODO: where do i move this function? also rename +template +static inline int _findMSB(Int val) { - if (isZero(val)) - return val; + //static_assert(is_integral::value); +#ifndef __HLSL_VERSION + return nbl::hlsl::findMSB(val); +#else + return firstbithigh(val); +#endif +} +template <> +static inline int _findMSB(uint64_t val) +{ #ifndef __HLSL_VERSION - int exp = findMSB(val); + return nbl::hlsl::findMSB(val); #else - int exp = 63; - uint64_t mask = ieee754::traits::signMask; - while (!(val & mask)) - { - --exp; - mask >>= 1; - } + int msbHigh = firstbithigh(uint32_t(val >> 32)); + int msbLow = firstbithigh(uint32_t(val)); + return msbHigh != -1 ? msbHigh + 32 : msbLow; +#endif +} +inline uint64_t castToUint64WithFloat64BitPattern(uint64_t val) +{ + if (isZero(val)) + return val; - //uint32_t2 valPacked = packUint64(val); - //int exp = valPacked.x ? firstbithigh(valPacked.x) + 32 : firstbithigh(valPacked.y); - //exp = 63 - exp; -#endif + int exp = _findMSB(val); uint64_t mantissa; int shiftCnt = 52 - exp; @@ -197,13 +196,13 @@ NBL_CONSTEXPR_INLINE_FUNC uint64_t flushDenormToZero(uint64_t extractedBiasedExp return extractedBiasedExponent ? 
value : ieee754::extractSignPreserveBitPattern(value); } -static inline int countLeadingZeros32(uint32_t val) +template +static inline int countLeadingZeros(Int val) { -#ifndef __HLSL_VERSION - return 31 - findMSB(val); -#else - return 31 - firstbithigh(val); -#endif + static_assert(is_integral::value); + + NBL_CONSTEXPR_STATIC int BitCntSubOne = sizeof(Int) * 8 - 1; + return BitCntSubOne - _findMSB(val); } NBL_CONSTEXPR_INLINE_FUNC uint32_t2 shift64RightJamming(uint32_t2 val, int count) diff --git a/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl b/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl index 4d40e6f327..5c7d60a870 100644 --- a/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl +++ b/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl @@ -5,7 +5,7 @@ #ifndef _NBL_BUILTIN_HLSL_MATH_EQUATIONS_QUADRATIC_INCLUDED_ #define _NBL_BUILTIN_HLSL_MATH_EQUATIONS_QUADRATIC_INCLUDED_ -#include +#include // TODO: Later include from correct hlsl header #ifndef nbl_hlsl_FLT_EPSILON diff --git a/include/nbl/builtin/hlsl/portable_float64_t.hlsl b/include/nbl/builtin/hlsl/portable_float64_t.hlsl new file mode 100644 index 0000000000..e82c0092ad --- /dev/null +++ b/include/nbl/builtin/hlsl/portable_float64_t.hlsl @@ -0,0 +1,22 @@ +#ifndef _NBL_BUILTIN_HLSL_PORTABLE_FLOAT64_T_INCLUDED_ +#define _NBL_BUILTIN_HLSL_PORTABLE_FLOAT64_T_INCLUDED_ + +#include +#include +namespace nbl +{ +namespace hlsl +{ +template +#ifdef __HLSL_VERSION +using portable_float64_t = typename conditional::shaderFloat64, float64_t, emulated_float64_t >::type; +#else +using portable_float64_t = float64_t; +#endif + +//static_assert(sizeof(portable_float64_t) == sizeof(float64_t)); + +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/emulated/emulated_float64_t_utils.hlsl b/include/nbl/builtin/hlsl/portable_float64_t_math.hlsl similarity index 64% rename from include/nbl/builtin/hlsl/emulated/emulated_float64_t_utils.hlsl rename to include/nbl/builtin/hlsl/portable_float64_t_math.hlsl index 6bc017fb73..1b21785833 100644 --- a/include/nbl/builtin/hlsl/emulated/emulated_float64_t_utils.hlsl +++ b/include/nbl/builtin/hlsl/portable_float64_t_math.hlsl @@ -1,25 +1,15 @@ -#ifndef _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_UTILS_INCLUDED_ -#define _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_UTILS_INCLUDED_ +#ifndef _NBL_BUILTIN_HLSL_PORTABLE_FLOAT64_T_MATH_INCLUDED_ +#define _NBL_BUILTIN_HLSL_PORTABLE_FLOAT64_T_MATH_INCLUDED_ #include #include #include +#include namespace nbl { namespace hlsl { -// TODO: enable -//template -//using portable_float64_t = conditional_t::shaderFloat64, float64_t, typename emulated_float64_t >; - -#ifndef __HLSL_VERSION -template -using portable_float64_t = typename conditional >::type; -#else -template -using portable_float64_t = typename conditional >::type; -#endif template struct emulated_vector {}; @@ -177,18 +167,11 @@ struct static_cast_helper,emulated_vector,void> } -//template -//struct emulated_matrix_base -//{ -// using vec_t = emulated_vector; -// vec_t columns[M]; -//}; - template -struct emulated_matrix {}; // : emulated_matrix_base {}; +struct emulated_matrix {}; template -struct emulated_matrix// : emulated_matrix_base +struct emulated_matrix { using vec_t = emulated_vector_t2; using type = emulated_matrix; @@ -208,7 +191,7 @@ struct emulated_matrix// : emulated_matrix_base// : emulated_matrix_base -struct emulated_matrix // : emulated_matrix_base +struct emulated_matrix { using vec_t = emulated_vector_t3; using type = emulated_matrix; @@ -261,7 +244,7 
@@ struct emulated_matrix // : emulated_matrix_base // : emulated_matrix_base; template using emulated_matrix_t3x3 = emulated_matrix; -namespace impl -{ - -template::value > +template::value> struct portable_vector -{ - using type = emulated_vector; -}; -// specialization for builtins -template -struct portable_vector { using type = vector; }; - -template::value > -struct portable_matrix -{ - using type = emulated_matrix; -}; - -template -struct portable_matrix +#ifdef __HLSL_VERSION +template +struct portable_vector { - using type = matrix; + using type = portable_vector; }; - -} +#endif template -using portable_vector_t = typename impl::portable_vector::type; +using portable_vector_t = typename portable_vector::type; template using portable_vector_t2 = portable_vector_t; @@ -342,88 +297,114 @@ using portable_vector_t3 = portable_vector_t; template using portable_vector_t4 = portable_vector_t; -using portable_vector64_t2 = portable_vector_t2 >; -using portable_vector64_t3 = portable_vector_t3 >; -using portable_vector64_t4 = portable_vector_t4 >; +#ifdef __HLSL_VERSION +template +using portable_vector64_t2 = portable_vector_t2 >; +template +using portable_vector64_t3 = portable_vector_t3 >; +template +using portable_vector64_t4 = portable_vector_t4 >; +#else +template +using portable_vector64_t2 = portable_vector_t2; +template +using portable_vector64_t3 = portable_vector_t3; +template +using portable_vector64_t4 = portable_vector_t4; +#endif +template::value> +struct portable_matrix +{ + using type = matrix; +}; +#ifdef __HLSL_VERSION template -using portable_matrix_t = typename impl::portable_matrix::type; +struct portable_matrix +{ + using type = emulated_matrix; +}; +#endif + +template +using portable_matrix_t = typename portable_matrix::type; template using portable_matrix_t2x2 = portable_matrix_t; template using portable_matrix_t3x3 = portable_matrix_t; -using portable_matrix64_t2x2 = portable_matrix_t2x2 >; -using portable_matrix64_t3x3 = portable_matrix_t3x3 >; - -template -NBL_CONSTEXPR_INLINE_FUNC portable_float64_t<> create_portable_float64_t(T val) -{ - return _static_cast >(val); -} - -template -NBL_CONSTEXPR_INLINE_FUNC portable_vector64_t2 create_portable_vector64_t2(T val) -{ - portable_vector64_t2 output; - output.x = create_portable_float64_t(val); - output.y = create_portable_float64_t(val); - - return output; -} - -template -NBL_CONSTEXPR_INLINE_FUNC portable_vector64_t2 create_portable_vector64_t2(X x, Y y) -{ - portable_vector64_t2 output; - output.x = create_portable_float64_t(x); - output.y = create_portable_float64_t(y); - - return output; -} - -template -NBL_CONSTEXPR_INLINE_FUNC portable_vector64_t2 create_portable_vector64_t2_from_2d_vec(VecType vec) -{ - portable_vector64_t2 output; - output.x = create_portable_float64_t(vec.x); - output.y = create_portable_float64_t(vec.y); - - return output; -} - -template -NBL_CONSTEXPR_INLINE_FUNC portable_vector64_t3 create_portable_vector64_t3(T val) -{ - portable_vector64_t3 output; - output.x = create_portable_float64_t(val); - output.y = create_portable_float64_t(val); - output.z = create_portable_float64_t(val); - - return output; -} - -template -NBL_CONSTEXPR_INLINE_FUNC portable_vector64_t3 create_portable_vector64_t3(X x, Y y, Z z) -{ - portable_vector64_t3 output; - output.x = create_portable_float64_t(x); - output.y = create_portable_float64_t(y); - output.z = create_portable_float64_t(z); - return output; -} +#ifdef __HLSL_VERSION +template +using portable_matrix64_t2x2 = portable_matrix_t2x2 >; +template 
+using portable_matrix64_t3x3 = portable_matrix_t3x3 >; +#else +template +using portable_matrix64_t2x2 = portable_matrix_t2x2; +template +using portable_matrix64_t3x3 = portable_matrix_t3x3; +#endif -template -NBL_CONSTEXPR_INLINE_FUNC portable_vector64_t3 create_portable_vector64_t2_from_3d_vec(VecType vec) +namespace impl { - portable_vector64_t3 output; - output.x = create_portable_float64_t(vec.x); - output.y = create_portable_float64_t(vec.y); - output.z = create_portable_float64_t(vec.z); - - return output; + template + struct static_cast_helper, vector, void> + { + static inline emulated_vector cast(vector vec) + { + return portable_vector_t(_static_cast(vec.x), _static_cast(vec.y)); + } + }; + + template + struct static_cast_helper, vector, void> + { + static inline emulated_vector cast(vector vec) + { + return portable_vector_t(_static_cast(vec.x), _static_cast(vec.y), _static_cast(vec.z)); + } + }; + + template + struct static_cast_helper, vector, void> + { + static inline emulated_vector cast(vector vec) + { + return portable_vector_t(_static_cast(vec.x), _static_cast(vec.y), _static_cast(vec.z), _static_cast(vec.w)); + } + }; + + /*template + struct static_cast_helper, From, void> + { + static inline emulated_vector cast(From val) + { + To vecComponent = To::create(val); + return emulated_vector(vecComponent, vecComponent); + } + }; + + template + struct static_cast_helper, From, void> + { + static inline emulated_vector cast(From val) + { + To vecComponent = To::create(val); + return emulated_vector(vecComponent, vecComponent, vecComponent); + } + }; + + template + struct static_cast_helper, vector, void> + { + static inline emulated_vector cast(From val) + { + To vecComponent = To::create(val); + return emulated_vector(vecComponent, vecComponent, vecComponent, vecComponent); + } + };*/ } namespace impl @@ -433,7 +414,14 @@ struct PortableMul64Helper { static inline V multiply(M mat, V vec) { - return mat * vec; + V output; + M matTransposed = mat.getTransposed(); + + output.x = (matTransposed.columns[0] * vec).calcComponentSum(); + output.y = (matTransposed.columns[1] * vec).calcComponentSum(); + output.z = (matTransposed.columns[2] * vec).calcComponentSum(); + + return output; } }; @@ -447,12 +435,19 @@ struct PortableMul64Helper }; } +#ifdef __HLSL_VERSION +template +V portableMul64(M mat, V vec) +{ + return impl::PortableMul64Helper >::multiply(mat, vec); +} +#else template V portableMul64(M mat, V vec) { - return impl::PortableMul64Helper >::multiply(mat, vec); + return impl::PortableMul64Helper::multiply(mat, vec); } - +#endif } } diff --git a/include/nbl/builtin/hlsl/shapes/beziers.hlsl b/include/nbl/builtin/hlsl/shapes/beziers.hlsl index 7fdab46942..92f3839240 100644 --- a/include/nbl/builtin/hlsl/shapes/beziers.hlsl +++ b/include/nbl/builtin/hlsl/shapes/beziers.hlsl @@ -11,7 +11,7 @@ #include #include #include -#include +#include // TODO: Later include from correct hlsl header (numeric_limits.hlsl) #ifndef nbl_hlsl_FLT_EPSILON @@ -511,10 +511,13 @@ struct Quadratic }; // This function returns the analytic quartic equation to solve for lhs bezier's t value for intersection with another bezier curve -template +template static math::equations::Quartic getBezierBezierIntersectionEquation(NBL_CONST_REF_ARG(QuadraticBezier) lhs, NBL_CONST_REF_ARG(QuadraticBezier) rhs) { using float_t2 = portable_vector_t2; + using float64 = portable_float64_t; + using float64_vec2 = portable_vector64_t2; + // Algorithm based on Computer Aided Geometric Design: // 
https://scholarsarchive.byu.edu/cgi/viewcontent.cgi?article=1000&context=facpub#page99 @@ -550,18 +553,18 @@ static math::equations::Quartic getBezierBezierIntersectionEquation(NBL Quadratic quadratic = Quadratic::constructFromBezier(lhs); // for convenience - const portable_vector64_t2 A = quadratic.A; - const portable_vector64_t2 B = quadratic.B; - const portable_vector64_t2 C = quadratic.C; + const float64_vec2 A = quadratic.A; + const float64_vec2 B = quadratic.B; + const float64_vec2 C = quadratic.C; // substitute parametric into implicit equation: // Getting the quartic params - portable_float64_t<> a = ((A.x * A.x) * k0) + (A.x * A.y * k1) + (A.y * A.y * k2); - portable_float64_t<> b = (A.x * B.x * k0 * 2.0f) + (A.x * B.y * k1) + (B.x * A.y * k1) + (A.y * B.y * k2 * 2.0f); - portable_float64_t<> c = (A.x * C.x * k0 * 2.0f) + (A.x * C.y * k1) + (A.x * k3) + ((B.x * B.x) * k0) + (B.x * B.y * k1) + (C.x * A.y * k1) + (A.y * C.y * k2 * 2.0f) + (A.y * k4) + ((B.y * B.y) * k2); - portable_float64_t<> d = (B.x * C.x * k0 * 2.0f) + (B.x * C.y * k1) + (B.x * k3) + (C.x * B.y * k1) + (B.y * C.y * k2 * 2.0f) + (B.y * k4); - portable_float64_t<> e = ((C.x * C.x) * k0) + (C.x * C.y * k1) + (C.x * k3) + ((C.y * C.y) * k2) + (C.y * k4) + (k5); + float64 a = ((A.x * A.x) * k0) + (A.x * A.y * k1) + (A.y * A.y * k2); + float64 b = (A.x * B.x * k0 * 2.0f) + (A.x * B.y * k1) + (B.x * A.y * k1) + (A.y * B.y * k2 * 2.0f); + float64 c = (A.x * C.x * k0 * 2.0f) + (A.x * C.y * k1) + (A.x * k3) + ((B.x * B.x) * k0) + (B.x * B.y * k1) + (C.x * A.y * k1) + (A.y * C.y * k2 * 2.0f) + (A.y * k4) + ((B.y * B.y) * k2); + float64 d = (B.x * C.x * k0 * 2.0f) + (B.x * C.y * k1) + (B.x * k3) + (C.x * B.y * k1) + (B.y * C.y * k2 * 2.0f) + (B.y * k4); + float64 e = ((C.x * C.x) * k0) + (C.x * C.y * k1) + (C.x * k3) + ((C.y * C.y) * k2) + (C.y * k4) + (k5); return math::equations::Quartic::construct( _static_cast(a), _static_cast(b), _static_cast(c), _static_cast(d), _static_cast(e)); diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index e06d9e2077..8c58add15a 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -233,12 +233,13 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/normalization/shared_nor LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/macros.h") #emulated LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/emulated_float64_t.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/emulated_float64_t_utils.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/emulated_float64_t_impl.hlsl") #ieee754 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ieee754/ieee754.hlsl") #utility LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/portable_float64_t.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/portable_float64_t_math.hlsl") #spirv intrinsics LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/core.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/fragment_shader_pixel_interlock.hlsl") From 826d39b25bc7e13a4792e99b4f6850142fac2eab Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Fri, 13 Sep 2024 18:17:03 +0100 Subject: [PATCH 050/358] Refactor --- examples_tests | 2 +- ...emulated_float64_t.hlsl => float64_t.hlsl} | 6 +- ...loat64_t_impl.hlsl => float64_t_impl.hlsl} | 36 +- .../nbl/builtin/hlsl/emulated/matrix_t.hlsl | 76 +++ .../nbl/builtin/hlsl/emulated/vector_t.hlsl | 185 +++++++ 
.../builtin/hlsl/{ieee754 => }/ieee754.hlsl | 58 +-- include/nbl/builtin/hlsl/ieee754/impl.hlsl | 54 +++ .../hlsl/math/equations/quadratic.hlsl | 2 +- .../float64_t.hlsl} | 2 +- .../nbl/builtin/hlsl/portable/matrix_t.hlsl | 92 ++++ .../nbl/builtin/hlsl/portable/vector_t.hlsl | 54 +++ .../builtin/hlsl/portable_float64_t_math.hlsl | 454 ------------------ include/nbl/builtin/hlsl/shapes/beziers.hlsl | 5 +- include/nbl/builtin/hlsl/tgmath.hlsl | 2 +- src/nbl/builtin/CMakeLists.txt | 15 +- 15 files changed, 495 insertions(+), 548 deletions(-) rename include/nbl/builtin/hlsl/emulated/{emulated_float64_t.hlsl => float64_t.hlsl} (99%) rename include/nbl/builtin/hlsl/emulated/{emulated_float64_t_impl.hlsl => float64_t_impl.hlsl} (85%) create mode 100644 include/nbl/builtin/hlsl/emulated/matrix_t.hlsl create mode 100644 include/nbl/builtin/hlsl/emulated/vector_t.hlsl rename include/nbl/builtin/hlsl/{ieee754 => }/ieee754.hlsl (62%) create mode 100644 include/nbl/builtin/hlsl/ieee754/impl.hlsl rename include/nbl/builtin/hlsl/{portable_float64_t.hlsl => portable/float64_t.hlsl} (90%) create mode 100644 include/nbl/builtin/hlsl/portable/matrix_t.hlsl create mode 100644 include/nbl/builtin/hlsl/portable/vector_t.hlsl delete mode 100644 include/nbl/builtin/hlsl/portable_float64_t_math.hlsl diff --git a/examples_tests b/examples_tests index 904da40cc7..d6f0c587af 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 904da40cc731d5f25ff6319ec7f497b06c579b00 +Subproject commit d6f0c587af12531deea906d8fd582d70ee06db6c diff --git a/include/nbl/builtin/hlsl/emulated/emulated_float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl similarity index 99% rename from include/nbl/builtin/hlsl/emulated/emulated_float64_t.hlsl rename to include/nbl/builtin/hlsl/emulated/float64_t.hlsl index aa600e7eca..73d41b0001 100644 --- a/include/nbl/builtin/hlsl/emulated/emulated_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl @@ -1,7 +1,7 @@ -#ifndef _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_INCLUDED_ -#define _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_INCLUDED_ +#ifndef _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_HLSL_INCLUDED_ +#define _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_HLSL_INCLUDED_ -#include +#include namespace nbl { diff --git a/include/nbl/builtin/hlsl/emulated/emulated_float64_t_impl.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl similarity index 85% rename from include/nbl/builtin/hlsl/emulated/emulated_float64_t_impl.hlsl rename to include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl index acb5adf713..4e5ad6c7be 100644 --- a/include/nbl/builtin/hlsl/emulated/emulated_float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl @@ -1,8 +1,8 @@ -#ifndef _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_IMPL_INCLUDED_ -#define _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_IMPL_INCLUDED_ +#ifndef _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_IMPL_HLSL_INCLUDED_ +#define _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_IMPL_HLSL_INCLUDED_ #include -#include +#include #include #include #include @@ -196,37 +196,9 @@ NBL_CONSTEXPR_INLINE_FUNC uint64_t flushDenormToZero(uint64_t extractedBiasedExp return extractedBiasedExponent ? 
value : ieee754::extractSignPreserveBitPattern(value); } -template -static inline int countLeadingZeros(Int val) -{ - static_assert(is_integral::value); - - NBL_CONSTEXPR_STATIC int BitCntSubOne = sizeof(Int) * 8 - 1; - return BitCntSubOne - _findMSB(val); -} - -NBL_CONSTEXPR_INLINE_FUNC uint32_t2 shift64RightJamming(uint32_t2 val, int count) -{ - uint32_t2 output; - const int negCount = (-count) & 31; - - output.x = glsl::mix(0u, val.x, count == 0); - output.x = glsl::mix(output.x, (val.x >> count), count < 32); - - output.y = uint32_t((val.x | val.y) != 0u); /* count >= 64 */ - uint32_t z1_lt64 = (val.x>>(count & 31)) | uint32_t(((val.x<>count) | uint32_t((val.y< + +namespace nbl +{ +namespace hlsl +{ + +template +struct emulated_matrix {}; + +template +struct emulated_matrix +{ + using vec_t = emulated_vector_t2; + using this_t = emulated_matrix; + + vec_t columns[2]; + + this_t getTransposed() NBL_CONST_MEMBER_FUNC + { + this_t output; + + output.columns[0].x = columns[0].x; + output.columns[1].x = columns[0].y; + + output.columns[0].y = columns[1].x; + output.columns[1].y = columns[1].y; + + return output; + } +}; + +template +struct emulated_matrix +{ + using vec_t = emulated_vector_t3; + using this_t = emulated_matrix; + + vec_t columns[3]; + + this_t getTransposed() NBL_CONST_MEMBER_FUNC + { + this_t output; + + output.columns[0].x = columns[0].x; + output.columns[1].x = columns[0].y; + output.columns[2].x = columns[0].z; + + output.columns[0].y = columns[1].x; + output.columns[1].y = columns[1].y; + output.columns[2].y = columns[1].z; + + output.columns[0].z = columns[2].x; + output.columns[1].z = columns[2].y; + output.columns[2].z = columns[2].z; + + return output; + } + + vec_t operator[](uint32_t columnIdx) + { + return columns[columnIdx]; + } +}; + +template +using emulated_matrix_t2x2 = emulated_matrix; +template +using emulated_matrix_t3x3 = emulated_matrix; + +} +} +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/emulated/vector_t.hlsl b/include/nbl/builtin/hlsl/emulated/vector_t.hlsl new file mode 100644 index 0000000000..d4809647e4 --- /dev/null +++ b/include/nbl/builtin/hlsl/emulated/vector_t.hlsl @@ -0,0 +1,185 @@ +#ifndef _NBL_BUILTIN_HLSL_EMULATED_VECTOR_T_HLSL_INCLUDED_ +#define _NBL_BUILTIN_HLSL_EMULATED_VECTOR_T_HLSL_INCLUDED_ + +#include + +namespace nbl +{ +namespace hlsl +{ + +template +struct emulated_vector {}; + +template +struct emulated_vector +{ + using this_t = emulated_vector; + + EmulatedType x; + EmulatedType y; + + EmulatedType calcComponentSum() NBL_CONST_MEMBER_FUNC + { + return x + y; + } + + NBL_CONSTEXPR_STATIC_INLINE this_t create(EmulatedType x, EmulatedType y) + { + this_t output; + output.x = x; + output.y = y; + + return output; + } + + this_t operator+(float rhs) + { + this_t output; + EmulatedType rhsAsEF64 = EmulatedType::create(rhs); + output.x = x + rhsAsEF64; + output.y = y + rhsAsEF64; + + return output; + } + + this_t operator+(EmulatedType rhs) + { + this_t output; + output.x = x + rhs; + output.y = y + rhs; + + return output; + } + + this_t operator+(this_t rhs) + { + this_t output; + output.x = x + rhs.x; + output.y = y + rhs.y; + + return output; + } + + this_t operator-(float rhs) + { + return create(x, y) + (-rhs); + } + + this_t operator-(EmulatedType rhs) + { + return create(x, y) + (rhs.flipSign()); + } + + this_t operator-(this_t rhs) + { + rhs.x = rhs.x.flipSign(); + rhs.y = rhs.y.flipSign(); + return create(x, y) + rhs; + } + + this_t operator*(float rhs) + { + this_t output; + EmulatedType 
rhsAsEF64 = EmulatedType::create(rhs); + output.x = x * rhsAsEF64; + output.y = y * rhsAsEF64; + + return output; + } + + this_t operator*(EmulatedType rhs) + { + this_t output; + output.x = x * rhs; + output.y = y * rhs; + + return output; + } + + this_t operator*(this_t rhs) + { + this_t output; + output.x = x * rhs.x; + output.y = y * rhs.y; + + return output; + } +}; + +template +struct emulated_vector +{ + using this_t = emulated_vector; + + EmulatedType x; + EmulatedType y; + EmulatedType z; + + EmulatedType calcComponentSum() NBL_CONST_MEMBER_FUNC + { + return x + y + z; + } + + this_t operator*(NBL_CONST_REF_ARG(this_t) rhs) NBL_CONST_MEMBER_FUNC + { + this_t output; + output.x = x * rhs.x; + output.y = y * rhs.y; + output.z = z * rhs.z; + + return output; + } +}; + +template +struct emulated_vector +{ + using type = emulated_vector; + + EmulatedType x; + EmulatedType y; + EmulatedType z; + EmulatedType w; +}; + +template +using emulated_vector_t2 = emulated_vector; +template +using emulated_vector_t3 = emulated_vector; +template +using emulated_vector_t4 = emulated_vector; + +namespace impl +{ +template +struct static_cast_helper, vector, void> +{ + static inline emulated_vector_t2 cast(vector vec) + { + return emulated_vector_t2(_static_cast(vec.x), _static_cast(vec.y)); + } +}; + +template +struct static_cast_helper, vector, void> +{ + static inline emulated_vector_t3 cast(vector vec) + { + return emulated_vector_t3(_static_cast(vec.x), _static_cast(vec.y), _static_cast(vec.z)); + } +}; + +template +struct static_cast_helper, vector, void> +{ + static inline emulated_vector_t4 cast(vector vec) + { + return emulated_vector_t4(_static_cast(vec.x), _static_cast(vec.y), _static_cast(vec.z), _static_cast(vec.w)); + } +}; +} + +} +} +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/ieee754/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754.hlsl similarity index 62% rename from include/nbl/builtin/hlsl/ieee754/ieee754.hlsl rename to include/nbl/builtin/hlsl/ieee754.hlsl index e6e6477954..26593a2f62 100644 --- a/include/nbl/builtin/hlsl/ieee754/ieee754.hlsl +++ b/include/nbl/builtin/hlsl/ieee754.hlsl @@ -1,9 +1,7 @@ -#ifndef _NBL_BUILTIN_HLSL_IEE754_H_INCLUDED_ -#define _NBL_BUILTIN_HLSL_IEE754_H_INCLUDED_ +#ifndef _NBL_BUILTIN_HLSL_IEE754_HLSL_INCLUDED_ +#define _NBL_BUILTIN_HLSL_IEE754_HLSL_INCLUDED_ -#include -#include -#include +#include namespace nbl { @@ -12,46 +10,10 @@ namespace hlsl namespace ieee754 { -// TODO: move to builtin/hlsl/impl/ieee754_impl.hlsl? 
-namespace impl -{ - template - NBL_CONSTEXPR_INLINE_FUNC bool isTypeAllowed() - { - return is_same::value || - is_same::value || - is_same::value || - is_same::value || - is_same::value || - is_same::value; - } - - template - NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type bitCastToUintType(T x) - { - using AsUint = typename unsigned_integer_of_size::type; - return bit_cast(x); - } - // to avoid bit cast from uintN_t to uintN_t - template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size<2>::type bitCastToUintType(uint16_t x) { return x; } - template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size<4>::type bitCastToUintType(uint32_t x) { return x; } - template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size<8>::type bitCastToUintType(uint64_t x) { return x; } - - template - NBL_CONSTEXPR_INLINE_FUNC T castBackToFloatType(T x) - { - using AsFloat = typename float_of_size::type; - return bit_cast(x); - } - template<> NBL_CONSTEXPR_INLINE_FUNC uint16_t castBackToFloatType(uint16_t x) { return x; } - template<> NBL_CONSTEXPR_INLINE_FUNC uint32_t castBackToFloatType(uint32_t x) { return x; } - template<> NBL_CONSTEXPR_INLINE_FUNC uint64_t castBackToFloatType(uint64_t x) { return x; } -} - template struct traits_base { - static_assert(is_same::value || is_same::value); + static_assert(is_same::value || is_same::value || is_same::value); NBL_CONSTEXPR_STATIC_INLINE int16_t exponentBitCnt = int16_t(0xbeef); NBL_CONSTEXPR_STATIC_INLINE int16_t mantissaBitCnt = int16_t(0xbeef); }; @@ -102,7 +64,7 @@ template inline uint32_t extractBiasedExponent(T x) { using AsUint = typename unsigned_integer_of_size::type; - return glsl::bitfieldExtract(impl::bitCastToUintType(x), traits::type>::mantissaBitCnt, traits::type>::exponentBitCnt); + return glsl::bitfieldExtract(ieee754::impl::bitCastToUintType(x), traits::type>::mantissaBitCnt, traits::type>::exponentBitCnt); } template<> @@ -115,7 +77,7 @@ inline uint32_t extractBiasedExponent(uint64_t x) template<> inline uint32_t extractBiasedExponent(float64_t x) { - return extractBiasedExponent(impl::bitCastToUintType(x)); + return extractBiasedExponent(ieee754::impl::bitCastToUintType(x)); } template @@ -131,7 +93,7 @@ NBL_CONSTEXPR_INLINE_FUNC T replaceBiasedExponent(T x, typename unsigned_integer // TODO: //staticAssertTmp(impl::isTypeAllowed(), "Invalid type! 
Only floating point or unsigned integer types are allowed."); using AsFloat = typename float_of_size::type; - return impl::castBackToFloatType(glsl::bitfieldInsert(impl::bitCastToUintType(x), biasedExp, traits::mantissaBitCnt, traits::exponentBitCnt)); + return impl::castBackToFloatType(glsl::bitfieldInsert(ieee754::impl::bitCastToUintType(x), biasedExp, traits::mantissaBitCnt, traits::exponentBitCnt)); } // performs no overflow tests, returns x*exp2(n) @@ -145,7 +107,7 @@ template NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type extractMantissa(T x) { using AsUint = typename unsigned_integer_of_size::type; - return impl::bitCastToUintType(x) & traits::type>::mantissaMask; + return ieee754::impl::bitCastToUintType(x) & traits::type>::mantissaMask; } template @@ -160,14 +122,14 @@ template NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type extractSign(T x) { using AsFloat = typename float_of_size::type; - return (impl::bitCastToUintType(x) & traits::signMask) >> ((sizeof(T) * 8) - 1); + return (ieee754::impl::bitCastToUintType(x) & traits::signMask) >> ((sizeof(T) * 8) - 1); } template NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type extractSignPreserveBitPattern(T x) { using AsFloat = typename float_of_size::type; - return impl::bitCastToUintType(x) & traits::signMask; + return ieee754::impl::bitCastToUintType(x) & traits::signMask; } } diff --git a/include/nbl/builtin/hlsl/ieee754/impl.hlsl b/include/nbl/builtin/hlsl/ieee754/impl.hlsl new file mode 100644 index 0000000000..e17eb9a8c7 --- /dev/null +++ b/include/nbl/builtin/hlsl/ieee754/impl.hlsl @@ -0,0 +1,54 @@ +#ifndef _NBL_BUILTIN_HLSL_IEE754_IMPL_HLSL_INCLUDED_ +#define _NBL_BUILTIN_HLSL_IEE754_IMPL_HLSL_INCLUDED_ + +#include +#include +#include + +namespace nbl +{ +namespace hlsl +{ +namespace ieee754 +{ + +namespace impl +{ +template +NBL_CONSTEXPR_INLINE_FUNC bool isTypeAllowed() +{ + return is_same::value || + is_same::value || + is_same::value || + is_same::value || + is_same::value || + is_same::value; +} + +template +NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type bitCastToUintType(T x) +{ + using AsUint = typename unsigned_integer_of_size::type; + return bit_cast(x); +} +// to avoid bit cast from uintN_t to uintN_t +template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size<2>::type bitCastToUintType(uint16_t x) { return x; } +template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size<4>::type bitCastToUintType(uint32_t x) { return x; } +template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size<8>::type bitCastToUintType(uint64_t x) { return x; } + +template +NBL_CONSTEXPR_INLINE_FUNC T castBackToFloatType(T x) +{ + using AsFloat = typename float_of_size::type; + return bit_cast(x); +} +template<> NBL_CONSTEXPR_INLINE_FUNC uint16_t castBackToFloatType(uint16_t x) { return x; } +template<> NBL_CONSTEXPR_INLINE_FUNC uint32_t castBackToFloatType(uint32_t x) { return x; } +template<> NBL_CONSTEXPR_INLINE_FUNC uint64_t castBackToFloatType(uint64_t x) { return x; } +} + +} +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl b/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl index 5c7d60a870..1f93e0c5ff 100644 --- a/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl +++ b/include/nbl/builtin/hlsl/math/equations/quadratic.hlsl @@ -5,7 +5,7 @@ #ifndef _NBL_BUILTIN_HLSL_MATH_EQUATIONS_QUADRATIC_INCLUDED_ #define _NBL_BUILTIN_HLSL_MATH_EQUATIONS_QUADRATIC_INCLUDED_ -#include +#include // 
TODO: Later include from correct hlsl header #ifndef nbl_hlsl_FLT_EPSILON diff --git a/include/nbl/builtin/hlsl/portable_float64_t.hlsl b/include/nbl/builtin/hlsl/portable/float64_t.hlsl similarity index 90% rename from include/nbl/builtin/hlsl/portable_float64_t.hlsl rename to include/nbl/builtin/hlsl/portable/float64_t.hlsl index e82c0092ad..92b6e53133 100644 --- a/include/nbl/builtin/hlsl/portable_float64_t.hlsl +++ b/include/nbl/builtin/hlsl/portable/float64_t.hlsl @@ -1,7 +1,7 @@ #ifndef _NBL_BUILTIN_HLSL_PORTABLE_FLOAT64_T_INCLUDED_ #define _NBL_BUILTIN_HLSL_PORTABLE_FLOAT64_T_INCLUDED_ -#include +#include #include namespace nbl { diff --git a/include/nbl/builtin/hlsl/portable/matrix_t.hlsl b/include/nbl/builtin/hlsl/portable/matrix_t.hlsl new file mode 100644 index 0000000000..8cf3d8233f --- /dev/null +++ b/include/nbl/builtin/hlsl/portable/matrix_t.hlsl @@ -0,0 +1,92 @@ +#ifndef _NBL_BUILTIN_HLSL_PORTABLE_MATRIX_T_HLSL_INCLUDED_ +#define _NBL_BUILTIN_HLSL_PORTABLE_MATRIX_T_HLSL_INCLUDED_ + +#include +#include + +namespace nbl +{ +namespace hlsl +{ + +template::value> +struct portable_matrix +{ + using type = matrix; +}; +#ifdef __HLSL_VERSION +template +struct portable_matrix +{ + using type = emulated_matrix; +}; +#endif + +template +using portable_matrix_t = typename portable_matrix::type; + +template +using portable_matrix_t2x2 = portable_matrix_t; +template +using portable_matrix_t3x3 = portable_matrix_t; + + +#ifdef __HLSL_VERSION +template +using portable_matrix64_t2x2 = portable_matrix_t2x2 >; +template +using portable_matrix64_t3x3 = portable_matrix_t3x3 >; +#else +template +using portable_matrix64_t2x2 = portable_matrix_t2x2; +template +using portable_matrix64_t3x3 = portable_matrix_t3x3; +#endif + +namespace impl +{ +// TODO: move to emulated/matrix.hlsl +template +struct PortableMul64Helper +{ + static inline V multiply(M mat, V vec) + { + V output; + M matTransposed = mat.getTransposed(); + + output.x = (matTransposed.columns[0] * vec).calcComponentSum(); + output.y = (matTransposed.columns[1] * vec).calcComponentSum(); + output.z = (matTransposed.columns[2] * vec).calcComponentSum(); + + return output; + } +}; + +template +struct PortableMul64Helper +{ + static inline V multiply(M mat, V vec) + { + return mul(mat, vec); + } +}; +} + +#ifdef __HLSL_VERSION +template +V portableMul64(M mat, V vec) +{ + return impl::PortableMul64Helper >::multiply(mat, vec); +} +#else +template +V portableMul64(M mat, V vec) +{ + return impl::PortableMul64Helper::multiply(mat, vec); +} +#endif + +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/portable/vector_t.hlsl b/include/nbl/builtin/hlsl/portable/vector_t.hlsl new file mode 100644 index 0000000000..474975fb21 --- /dev/null +++ b/include/nbl/builtin/hlsl/portable/vector_t.hlsl @@ -0,0 +1,54 @@ +#ifndef _NBL_BUILTIN_HLSL_PORTABLE_VECTOR_T_HLSL_INCLUDED_ +#define _NBL_BUILTIN_HLSL_PORTABLE_VECTOR_T_HLSL_INCLUDED_ + +#include +#include + +namespace nbl +{ +namespace hlsl +{ + +template::value> +struct portable_vector +{ + using type = vector; +}; +#ifdef __HLSL_VERSION +template +struct portable_vector +{ + using type = portable_vector; +}; +#endif + +template +using portable_vector_t = typename portable_vector::type; + +template +using portable_vector_t2 = portable_vector_t; +template +using portable_vector_t3 = portable_vector_t; +template +using portable_vector_t4 = portable_vector_t; + +#ifdef __HLSL_VERSION +template +using portable_vector64_t2 = portable_vector_t2 >; +template +using 
portable_vector64_t3 = portable_vector_t3 >; +template +using portable_vector64_t4 = portable_vector_t4 >; +#else +template +using portable_vector64_t2 = portable_vector_t2; +template +using portable_vector64_t3 = portable_vector_t3; +template +using portable_vector64_t4 = portable_vector_t4; +#endif + +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/portable_float64_t_math.hlsl b/include/nbl/builtin/hlsl/portable_float64_t_math.hlsl deleted file mode 100644 index 1b21785833..0000000000 --- a/include/nbl/builtin/hlsl/portable_float64_t_math.hlsl +++ /dev/null @@ -1,454 +0,0 @@ -#ifndef _NBL_BUILTIN_HLSL_PORTABLE_FLOAT64_T_MATH_INCLUDED_ -#define _NBL_BUILTIN_HLSL_PORTABLE_FLOAT64_T_MATH_INCLUDED_ - -#include -#include -#include -#include - -namespace nbl -{ -namespace hlsl -{ - -template -struct emulated_vector {}; - -template -struct emulated_vector -{ - using type = emulated_vector; - - EmulatedType x; - EmulatedType y; - - EmulatedType calcComponentSum() NBL_CONST_MEMBER_FUNC - { - return x + y; - } - - NBL_CONSTEXPR_STATIC_INLINE type create(EmulatedType x, EmulatedType y) - { - type output; - output.x = x; - output.y = y; - - return output; - } - - type operator+(float rhs) - { - type output; - EmulatedType rhsAsEF64 = EmulatedType::create(rhs); - output.x = x + rhsAsEF64; - output.y = y + rhsAsEF64; - - return output; - } - - type operator+(EmulatedType rhs) - { - type output; - output.x = x + rhs; - output.y = y + rhs; - - return output; - } - - type operator+(type rhs) - { - type output; - output.x = x + rhs.x; - output.y = y + rhs.y; - - return output; - } - - type operator-(float rhs) - { - return create(x, y) + (-rhs); - } - - type operator-(EmulatedType rhs) - { - return create(x, y) + (rhs.flipSign()); - } - - type operator-(type rhs) - { - rhs.x = rhs.x.flipSign(); - rhs.y = rhs.y.flipSign(); - return create(x, y) + rhs; - } - - type operator*(float rhs) - { - type output; - EmulatedType rhsAsEF64 = EmulatedType::create(rhs); - output.x = x * rhsAsEF64; - output.y = y * rhsAsEF64; - - return output; - } - - type operator*(EmulatedType rhs) - { - type output; - output.x = x * rhs; - output.y = y * rhs; - - return output; - } - - type operator*(type rhs) - { - type output; - output.x = x * rhs.x; - output.y = y * rhs.y; - - return output; - } -}; - -template -struct emulated_vector -{ - using type = emulated_vector; - - EmulatedType x; - EmulatedType y; - EmulatedType z; - - EmulatedType calcComponentSum() NBL_CONST_MEMBER_FUNC - { - return x + y + z; - } - - type operator*(NBL_CONST_REF_ARG(type) rhs) NBL_CONST_MEMBER_FUNC - { - type output; - output.x = x * rhs.x; - output.y = y * rhs.y; - output.z = z * rhs.z; - - return output; - } -}; - -template -struct emulated_vector -{ - using type = emulated_vector; - - EmulatedType x; - EmulatedType y; - EmulatedType z; - EmulatedType w; -}; - -template -using emulated_vector_t2 = emulated_vector; -template -using emulated_vector_t3 = emulated_vector; -template -using emulated_vector_t4 = emulated_vector; - -namespace impl -{ - -template -struct static_cast_helper,emulated_vector,void> -{ - static inline vector cast(emulated_vector vec) - { - return vector(_static_cast(vec.x), _static_cast(vec.y)); - } -}; - -} - -template -struct emulated_matrix {}; - -template -struct emulated_matrix -{ - using vec_t = emulated_vector_t2; - using type = emulated_matrix; - - vec_t columns[2]; - - type getTransposed() NBL_CONST_MEMBER_FUNC - { - type output; - - output.columns[0].x = columns[0].x; - 
output.columns[1].x = columns[0].y; - - output.columns[0].y = columns[1].x; - output.columns[1].y = columns[1].y; - - return output; - } - - /*type operator*(NBL_CONST_REF_ARG(type) rhs) NBL_CONST_MEMBER_FUNC - { - type output; - type lhsTransposed = getTransposed(); - - output.columns[0].x = (lhsTransposed.columns[0] * rhs.columns[0]).calcComponentSum(); - output.columns[0].y = (lhsTransposed.columns[0] * rhs.columns[1]).calcComponentSum(); - - output.columns[1].x = (lhsTransposed.columns[1] * rhs.columns[0]).calcComponentSum(); - output.columns[1].y = (lhsTransposed.columns[1] * rhs.columns[1]).calcComponentSum(); - - return output.getTransposed(); - } - - vec_t operator*(NBL_CONST_REF_ARG(vec_t) rhs) - { - vec_t output; - type lhsTransposed = getTransposed(); - - output.x = (columns[0] * rhs).calcComponentSum(); - output.y = (columns[1] * rhs).calcComponentSum(); - - return output; - }*/ -}; - -template -struct emulated_matrix -{ - using vec_t = emulated_vector_t3; - using type = emulated_matrix; - - vec_t columns[3]; - - type getTransposed() NBL_CONST_MEMBER_FUNC - { - type output; - - output.columns[0].x = columns[0].x; - output.columns[1].x = columns[0].y; - output.columns[2].x = columns[0].z; - - output.columns[0].y = columns[1].x; - output.columns[1].y = columns[1].y; - output.columns[2].y = columns[1].z; - - output.columns[0].z = columns[2].x; - output.columns[1].z = columns[2].y; - output.columns[2].z = columns[2].z; - - return output; - } - - /*type operator*(NBL_CONST_REF_ARG(type) rhs) NBL_CONST_MEMBER_FUNC - { - type output; - - output.columns[0].x = (columns[0] * rhs.columns[0]).calcComponentSum(); - output.columns[0].y = (columns[0] * rhs.columns[1]).calcComponentSum(); - output.columns[0].z = (columns[0] * rhs.columns[2]).calcComponentSum(); - - output.columns[1].x = (columns[1] * rhs.columns[0]).calcComponentSum(); - output.columns[1].y = (columns[1] * rhs.columns[1]).calcComponentSum(); - output.columns[1].z = (columns[1] * rhs.columns[2]).calcComponentSum(); - - output.columns[2].x = (columns[2] * rhs.columns[0]).calcComponentSum(); - output.columns[2].y = (columns[2] * rhs.columns[1]).calcComponentSum(); - output.columns[2].z = (columns[2] * rhs.columns[2]).calcComponentSum(); - - return output.getTransposed(); - }*/ - - vec_t operator[](uint32_t columnIdx) - { - return columns[columnIdx]; - } -}; - -template -using emulated_matrix_t2x2 = emulated_matrix; -template -using emulated_matrix_t3x3 = emulated_matrix; - -template::value> -struct portable_vector -{ - using type = vector; -}; -#ifdef __HLSL_VERSION -template -struct portable_vector -{ - using type = portable_vector; -}; -#endif - -template -using portable_vector_t = typename portable_vector::type; - -template -using portable_vector_t2 = portable_vector_t; -template -using portable_vector_t3 = portable_vector_t; -template -using portable_vector_t4 = portable_vector_t; - -#ifdef __HLSL_VERSION -template -using portable_vector64_t2 = portable_vector_t2 >; -template -using portable_vector64_t3 = portable_vector_t3 >; -template -using portable_vector64_t4 = portable_vector_t4 >; -#else -template -using portable_vector64_t2 = portable_vector_t2; -template -using portable_vector64_t3 = portable_vector_t3; -template -using portable_vector64_t4 = portable_vector_t4; -#endif - -template::value> -struct portable_matrix -{ - using type = matrix; -}; -#ifdef __HLSL_VERSION -template -struct portable_matrix -{ - using type = emulated_matrix; -}; -#endif - -template -using portable_matrix_t = typename portable_matrix::type; - 
-template -using portable_matrix_t2x2 = portable_matrix_t; -template -using portable_matrix_t3x3 = portable_matrix_t; - - -#ifdef __HLSL_VERSION -template -using portable_matrix64_t2x2 = portable_matrix_t2x2 >; -template -using portable_matrix64_t3x3 = portable_matrix_t3x3 >; -#else -template -using portable_matrix64_t2x2 = portable_matrix_t2x2; -template -using portable_matrix64_t3x3 = portable_matrix_t3x3; -#endif - -namespace impl -{ - template - struct static_cast_helper, vector, void> - { - static inline emulated_vector cast(vector vec) - { - return portable_vector_t(_static_cast(vec.x), _static_cast(vec.y)); - } - }; - - template - struct static_cast_helper, vector, void> - { - static inline emulated_vector cast(vector vec) - { - return portable_vector_t(_static_cast(vec.x), _static_cast(vec.y), _static_cast(vec.z)); - } - }; - - template - struct static_cast_helper, vector, void> - { - static inline emulated_vector cast(vector vec) - { - return portable_vector_t(_static_cast(vec.x), _static_cast(vec.y), _static_cast(vec.z), _static_cast(vec.w)); - } - }; - - /*template - struct static_cast_helper, From, void> - { - static inline emulated_vector cast(From val) - { - To vecComponent = To::create(val); - return emulated_vector(vecComponent, vecComponent); - } - }; - - template - struct static_cast_helper, From, void> - { - static inline emulated_vector cast(From val) - { - To vecComponent = To::create(val); - return emulated_vector(vecComponent, vecComponent, vecComponent); - } - }; - - template - struct static_cast_helper, vector, void> - { - static inline emulated_vector cast(From val) - { - To vecComponent = To::create(val); - return emulated_vector(vecComponent, vecComponent, vecComponent, vecComponent); - } - };*/ -} - -namespace impl -{ -template -struct PortableMul64Helper -{ - static inline V multiply(M mat, V vec) - { - V output; - M matTransposed = mat.getTransposed(); - - output.x = (matTransposed.columns[0] * vec).calcComponentSum(); - output.y = (matTransposed.columns[1] * vec).calcComponentSum(); - output.z = (matTransposed.columns[2] * vec).calcComponentSum(); - - return output; - } -}; - -template -struct PortableMul64Helper -{ - static inline V multiply(M mat, V vec) - { - return mul(mat, vec); - } -}; -} - -#ifdef __HLSL_VERSION -template -V portableMul64(M mat, V vec) -{ - return impl::PortableMul64Helper >::multiply(mat, vec); -} -#else -template -V portableMul64(M mat, V vec) -{ - return impl::PortableMul64Helper::multiply(mat, vec); -} -#endif - -} -} -#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/shapes/beziers.hlsl b/include/nbl/builtin/hlsl/shapes/beziers.hlsl index 92f3839240..936a37fa7e 100644 --- a/include/nbl/builtin/hlsl/shapes/beziers.hlsl +++ b/include/nbl/builtin/hlsl/shapes/beziers.hlsl @@ -10,8 +10,9 @@ #include #include #include -#include -#include +#include +#include +#include // TODO: Later include from correct hlsl header (numeric_limits.hlsl) #ifndef nbl_hlsl_FLT_EPSILON diff --git a/include/nbl/builtin/hlsl/tgmath.hlsl b/include/nbl/builtin/hlsl/tgmath.hlsl index d603dea2c0..0edb47eb1f 100644 --- a/include/nbl/builtin/hlsl/tgmath.hlsl +++ b/include/nbl/builtin/hlsl/tgmath.hlsl @@ -5,7 +5,7 @@ #define _NBL_BUILTIN_HLSL_TGMATH_INCLUDED_ #include -#include +#include #include namespace nbl diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index 8c58add15a..f06deb7c46 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -232,14 +232,19 @@ 
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/normalization/shared_nor # HLSL LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/macros.h") #emulated -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/emulated_float64_t.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/emulated_float64_t_impl.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/float64_t.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/float64_t_impl.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/vector_t_impl.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/matrix_t_impl.hlsl") +#portable +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/portable/float64_t.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/portable/vector_t.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/portable/matrix_t.hlsl") #ieee754 -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ieee754/ieee754.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ieee754.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ieee754/impl.hlsl") #utility LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/portable_float64_t.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/portable_float64_t_math.hlsl") #spirv intrinsics LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/core.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/fragment_shader_pixel_interlock.hlsl") From fb0ebd323c9cce1e5fc6502ceb4d13022b473f09 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Fri, 13 Sep 2024 21:51:03 +0100 Subject: [PATCH 051/358] More refactor --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated/float64_t.hlsl | 88 +++++++++---------- .../builtin/hlsl/emulated/float64_t_impl.hlsl | 38 +------- include/nbl/builtin/hlsl/tgmath.hlsl | 32 +++++-- 4 files changed, 71 insertions(+), 89 deletions(-) diff --git a/examples_tests b/examples_tests index d6f0c587af..7f7b48596a 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit d6f0c587af12531deea906d8fd582d70ee06db6c +Subproject commit 7f7b48596a61c43d5d72b1e032525cc07ece91a8 diff --git a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl index 73d41b0001..6051cee7b5 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl @@ -38,28 +38,28 @@ namespace hlsl NBL_CONSTEXPR_STATIC_INLINE this_t create(int32_t val) { - return bit_cast(impl::castToUint64WithFloat64BitPattern(int64_t(val))); + return bit_cast(emulated_float64_t_impl::castToUint64WithFloat64BitPattern(int64_t(val))); } NBL_CONSTEXPR_STATIC_INLINE this_t create(int64_t val) { - return bit_cast(impl::castToUint64WithFloat64BitPattern(val)); + return bit_cast(emulated_float64_t_impl::castToUint64WithFloat64BitPattern(val)); } NBL_CONSTEXPR_STATIC_INLINE this_t create(uint32_t val) { - return bit_cast(impl::castToUint64WithFloat64BitPattern(uint64_t(val))); + return bit_cast(emulated_float64_t_impl::castToUint64WithFloat64BitPattern(uint64_t(val))); } NBL_CONSTEXPR_STATIC_INLINE this_t create(uint64_t val) { - return bit_cast(impl::castToUint64WithFloat64BitPattern(val)); + return bit_cast(emulated_float64_t_impl::castToUint64WithFloat64BitPattern(val)); } NBL_CONSTEXPR_STATIC_INLINE this_t create(float32_t val) { this_t output; - output.data = impl::castFloat32ToStorageType(val); + output.data = 
emulated_float64_t_impl::castFloat32ToStorageType(val); return output; } @@ -89,10 +89,10 @@ namespace hlsl { if(FastMath) { - if (tgmath::isnan(data) || tgmath::isnan(rhs.data)) + if (tgmath::isNaN(data) || tgmath::isNaN(rhs.data)) return bit_cast(ieee754::traits::quietNaN); - if (impl::areBothInfinity(data, rhs.data)) + if (emulated_float64_t_impl::areBothInfinity(data, rhs.data)) { uint64_t lhsSign = data & ieee754::traits::signMask; uint64_t rhsSign = rhs.data & ieee754::traits::signMask; @@ -103,25 +103,25 @@ namespace hlsl return bit_cast(ieee754::traits::quietNaN | ieee754::traits::signMask); } - if (tgmath::isinf(data)) + if (tgmath::isInf(data)) return bit_cast(data); - if (tgmath::isinf(rhs.data)) + if (tgmath::isInf(rhs.data)) return bit_cast(rhs.data); } const int lhsBiasedExp = ieee754::extractBiasedExponent(data); const int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); - uint64_t lhsData = impl::flushDenormToZero(lhsBiasedExp, data); - uint64_t rhsData = impl::flushDenormToZero(rhsBiasedExp, rhs.data); + uint64_t lhsData = emulated_float64_t_impl::flushDenormToZero(lhsBiasedExp, data); + uint64_t rhsData = emulated_float64_t_impl::flushDenormToZero(rhsBiasedExp, rhs.data); uint64_t lhsSign = ieee754::extractSignPreserveBitPattern(lhsData); uint64_t rhsSign = ieee754::extractSignPreserveBitPattern(rhsData); if(FastMath) { - if (impl::areBothZero(lhsData, rhsData)) + if (emulated_float64_t_impl::areBothZero(lhsData, rhsData)) { if (lhsSign == rhsSign) return bit_cast(lhsSign); @@ -129,7 +129,7 @@ namespace hlsl return bit_cast(0ull); } - if (tgmath::isinf(lhsData)) + if (tgmath::isInf(lhsData)) return bit_cast(ieee754::traits::inf | ieee754::extractSignPreserveBitPattern(max(lhsData, rhsData))); } @@ -188,7 +188,7 @@ namespace hlsl } resultMantissa &= ieee754::traits::mantissaMask; - uint64_t output = impl::assembleFloat64(lhsSign, resultBiasedExp << ieee754::traits::mantissaBitCnt, resultMantissa); + uint64_t output = emulated_float64_t_impl::assembleFloat64(lhsSign, resultBiasedExp << ieee754::traits::mantissaBitCnt, resultMantissa); return bit_cast(output); } @@ -224,8 +224,8 @@ namespace hlsl int lhsBiasedExp = ieee754::extractBiasedExponent(data); int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); - uint64_t lhsData = impl::flushDenormToZero(lhsBiasedExp, data); - uint64_t rhsData = impl::flushDenormToZero(rhsBiasedExp, rhs.data); + uint64_t lhsData = emulated_float64_t_impl::flushDenormToZero(lhsBiasedExp, data); + uint64_t rhsData = emulated_float64_t_impl::flushDenormToZero(rhsBiasedExp, rhs.data); uint64_t lhsSign = lhsData & ieee754::traits::signMask; uint64_t rhsSign = rhsData & ieee754::traits::signMask; @@ -237,11 +237,11 @@ namespace hlsl uint64_t sign = (lhsData ^ rhsData) & ieee754::traits::signMask; if (FastMath) { - if (tgmath::isnan(lhsData) || tgmath::isnan(rhsData)) + if (tgmath::isNaN(lhsData) || tgmath::isNaN(rhsData)) return bit_cast(ieee754::traits::quietNaN | sign); - if (tgmath::isinf(lhsData) || tgmath::isinf(rhsData)) + if (tgmath::isInf(lhsData) || tgmath::isInf(rhsData)) return bit_cast(ieee754::traits::inf | sign); - if (impl::areBothZero(lhsData, rhsData)) + if (emulated_float64_t_impl::areBothZero(lhsData, rhsData)) return bit_cast(sign); } @@ -265,7 +265,7 @@ namespace hlsl } newPseudoMantissa &= (ieee754::traits::mantissaMask); - return bit_cast(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, newPseudoMantissa)); + return bit_cast(emulated_float64_t_impl::assembleFloat64(sign, uint64_t(exp) 
<< ieee754::traits::mantissaBitCnt, newPseudoMantissa)); } else { @@ -287,35 +287,35 @@ namespace hlsl if(FastMath) { - if (tgmath::isnan(data) || tgmath::isnan(rhs.data)) + if (tgmath::isNaN(data) || tgmath::isNaN(rhs.data)) return bit_cast(ieee754::traits::quietNaN); - if (impl::isZero(rhs.data)) + if (emulated_float64_t_impl::isZero(rhs.data)) return bit_cast(ieee754::traits::quietNaN | sign); - if (impl::areBothInfinity(data, rhs.data)) + if (emulated_float64_t_impl::areBothInfinity(data, rhs.data)) return bit_cast(ieee754::traits::quietNaN | ieee754::traits::signMask); - if (tgmath::isinf(data)) + if (tgmath::isInf(data)) return bit_cast(ieee754::traits::inf | sign); - if (tgmath::isinf(rhs.data)) + if (tgmath::isInf(rhs.data)) return bit_cast(0ull | sign); - if (impl::isZero(rhs.data)) + if (emulated_float64_t_impl::isZero(rhs.data)) return bit_cast(ieee754::traits::quietNaN | sign); } int lhsBiasedExp = ieee754::extractBiasedExponent(data); int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); - uint64_t lhsData = impl::flushDenormToZero(lhsBiasedExp, data); - uint64_t rhsData = impl::flushDenormToZero(rhsBiasedExp, rhs.data); + uint64_t lhsData = emulated_float64_t_impl::flushDenormToZero(lhsBiasedExp, data); + uint64_t rhsData = emulated_float64_t_impl::flushDenormToZero(rhsBiasedExp, rhs.data); const uint64_t lhsRealMantissa = (ieee754::extractMantissa(lhsData) | (1ull << ieee754::traits::mantissaBitCnt)); const uint64_t rhsRealMantissa = ieee754::extractMantissa(rhsData) | (1ull << ieee754::traits::mantissaBitCnt); int exp = lhsBiasedExp - rhsBiasedExp + int(ieee754::traits::exponentBias); - uint64_t2 lhsMantissaShifted = impl::shiftMantissaLeftBy53(lhsRealMantissa); - uint64_t mantissa = impl::divmod128by64(lhsMantissaShifted.x, lhsMantissaShifted.y, rhsRealMantissa); + uint64_t2 lhsMantissaShifted = emulated_float64_t_impl::shiftMantissaLeftBy53(lhsRealMantissa); + uint64_t mantissa = emulated_float64_t_impl::divmod128by64(lhsMantissaShifted.x, lhsMantissaShifted.y, rhsRealMantissa); - const int msb = impl::_findMSB(mantissa); + const int msb = emulated_float64_t_impl::_findMSB(mantissa); if(msb != -1) { const int shiftAmount = 52 - msb; @@ -326,7 +326,7 @@ namespace hlsl mantissa &= ieee754::traits::mantissaMask; - return bit_cast(impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, mantissa)); + return bit_cast(emulated_float64_t_impl::assembleFloat64(sign, uint64_t(exp) << ieee754::traits::mantissaBitCnt, mantissa)); } else { @@ -340,9 +340,9 @@ namespace hlsl { if (FastMath) { - if (tgmath::isnan(data) || tgmath::isnan(rhs.data)) + if (tgmath::isNaN(data) || tgmath::isNaN(rhs.data)) return false; - if (impl::areBothZero(data, rhs.data)) + if (emulated_float64_t_impl::areBothZero(data, rhs.data)) return true; } @@ -354,7 +354,7 @@ namespace hlsl } bool operator!=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { - if (FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) + if (FastMath && (tgmath::isNaN(data) || tgmath::isNaN(rhs.data))) return false; return !(bit_cast(data) == rhs); @@ -363,11 +363,11 @@ namespace hlsl { if (FastMath) { - if (tgmath::isnan(data) || tgmath::isnan(rhs.data)) + if (tgmath::isNaN(data) || tgmath::isNaN(rhs.data)) return false; - if (impl::areBothInfinity(data, rhs.data)) + if (emulated_float64_t_impl::areBothInfinity(data, rhs.data)) return false; - if (impl::areBothZero(data, rhs.data)) + if (emulated_float64_t_impl::areBothZero(data, rhs.data)) return false; } @@ -384,11 +384,11 @@ namespace hlsl { if 
(FastMath) { - if (tgmath::isnan(data) || tgmath::isnan(rhs.data)) + if (tgmath::isNaN(data) || tgmath::isNaN(rhs.data)) return false; - if (impl::areBothInfinity(data, rhs.data)) + if (emulated_float64_t_impl::areBothInfinity(data, rhs.data)) return false; - if (impl::areBothZero(data, rhs.data)) + if (emulated_float64_t_impl::areBothZero(data, rhs.data)) return false; } @@ -403,14 +403,14 @@ namespace hlsl } bool operator<=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { - if (FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) + if (FastMath && (tgmath::isNaN(data) || tgmath::isNaN(rhs.data))) return false; return !(bit_cast(data) > bit_cast(rhs.data)); } bool operator>=(emulated_float64_t rhs) { - if (FastMath && (tgmath::isnan(data) || tgmath::isnan(rhs.data))) + if (FastMath && (tgmath::isNaN(data) || tgmath::isNaN(rhs.data))) return false; return !(bit_cast(data) < bit_cast(rhs.data)); @@ -517,7 +517,7 @@ struct static_cast_helper,void return bit_cast(ieee754::traits::inf); if (exponent < ieee754::traits::exponentMin) return bit_cast(-ieee754::traits::inf); - if (tgmath::isnan(v.data)) + if (tgmath::isNaN(v.data)) return bit_cast(ieee754::traits::quietNaN); } diff --git a/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl index 4e5ad6c7be..def46a36c9 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl @@ -39,7 +39,7 @@ namespace nbl { namespace hlsl { -namespace impl +namespace emulated_float64_t_impl { NBL_CONSTEXPR_INLINE_FUNC uint64_t2 shiftMantissaLeftBy53(uint64_t mantissa64) { @@ -50,26 +50,13 @@ NBL_CONSTEXPR_INLINE_FUNC uint64_t2 shiftMantissaLeftBy53(uint64_t mantissa64) return output; } -NBL_CONSTEXPR_INLINE_FUNC uint64_t packFloat64(uint32_t zSign, int zExp, uint32_t zFrac0, uint32_t zFrac1) -{ - uint32_t2 z; - - z.x = zSign + (uint32_t(zExp) << 20) + zFrac0; - z.y = zFrac1; - - uint64_t output = 0u; - output |= (uint64_t(z.x) << 32) & 0xFFFFFFFF00000000ull; - output |= uint64_t(z.y); - return output; -} - template inline uint64_t castFloat32ToStorageType(float32_t val) { if (FlushDenormToZero) { const uint64_t sign = uint64_t(ieee754::extractSign(val)) << 63; - if (tgmath::isinf(val)) + if (tgmath::isInf(val)) return ieee754::traits::inf | sign; uint32_t asUint = ieee754::impl::bitCastToUintType(val); const int f32BiasedExp = ieee754::extractBiasedExponent(val); @@ -178,19 +165,6 @@ NBL_CONSTEXPR_INLINE_FUNC uint32_t2 umulExtended(uint32_t lhs, uint32_t rhs) return output; } -NBL_CONSTEXPR_INLINE_FUNC uint64_t propagateFloat64NaN(uint64_t lhs, uint64_t rhs) -{ -#if defined RELAXED_NAN_PROPAGATION - return lhs | rhs; -#else - - lhs |= 0x0008000000000000ull; - rhs |= 0x0008000000000000ull; - return glsl::mix(rhs, glsl::mix(lhs, rhs, tgmath::isnan(rhs)), tgmath::isnan(lhs)); - return 0; -#endif -} - NBL_CONSTEXPR_INLINE_FUNC uint64_t flushDenormToZero(uint64_t extractedBiasedExponent, uint64_t value) { return extractedBiasedExponent ? 
value : ieee754::extractSignPreserveBitPattern(value); @@ -201,14 +175,6 @@ NBL_CONSTEXPR_INLINE_FUNC uint64_t assembleFloat64(uint64_t signShifted, uint64_ return signShifted | expShifted | mantissa; } -//TODO: remove -static inline void normalizeFloat64Subnormal(uint64_t mantissa, - NBL_REF_ARG(int) outExp, - NBL_REF_ARG(uint64_t) outMantissa) -{ - return; -} - NBL_CONSTEXPR_INLINE_FUNC bool areBothInfinity(uint64_t lhs, uint64_t rhs) { lhs &= ~ieee754::traits::signMask; diff --git a/include/nbl/builtin/hlsl/tgmath.hlsl b/include/nbl/builtin/hlsl/tgmath.hlsl index 0edb47eb1f..4a78ba9aed 100644 --- a/include/nbl/builtin/hlsl/tgmath.hlsl +++ b/include/nbl/builtin/hlsl/tgmath.hlsl @@ -12,12 +12,32 @@ namespace nbl { namespace hlsl { - namespace tgmath { +namespace impl +{ + +template::value> +NBL_CONSTEXPR_INLINE_FUNC bool isInf(T val) +{ + using AsUint = typename unsigned_integer_of_size::type; + using AsFloat = typename float_of_size::type; + + if (IsTFundamental) + { + return isinf(bit_cast(val)); + } + else + { + AsUint tmp = bit_cast(val); + return (tmp & (~ieee754::traits::signMask)) == ieee754::traits::inf; + } +} + +} template -inline bool isnan(T val) +inline bool isNaN(T val) { using AsUint = typename unsigned_integer_of_size::type; using AsFloat = typename float_of_size::type; @@ -27,13 +47,9 @@ inline bool isnan(T val) } template -NBL_CONSTEXPR_INLINE_FUNC bool isinf(T val) +NBL_CONSTEXPR_INLINE_FUNC bool isInf(T val) { - using AsUint = typename unsigned_integer_of_size::type; - using AsFloat = typename float_of_size::type; - - AsUint tmp = bit_cast(val); - return (tmp & (~ieee754::traits::signMask)) == ieee754::traits::inf; + return impl::isInf(val); } } From 52993c4f21d6ff5e030b4901091fb4e3ed4e7aa6 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Sat, 14 Sep 2024 22:12:24 +0100 Subject: [PATCH 052/358] Saving work --- .../nbl/builtin/hlsl/emulated/matrix_t.hlsl | 32 +++++++++---------- .../nbl/builtin/hlsl/portable/matrix_t.hlsl | 6 ++-- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/include/nbl/builtin/hlsl/emulated/matrix_t.hlsl b/include/nbl/builtin/hlsl/emulated/matrix_t.hlsl index 241975285e..ba5156c962 100644 --- a/include/nbl/builtin/hlsl/emulated/matrix_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/matrix_t.hlsl @@ -17,17 +17,17 @@ struct emulated_matrix using vec_t = emulated_vector_t2; using this_t = emulated_matrix; - vec_t columns[2]; + vec_t rows[2]; this_t getTransposed() NBL_CONST_MEMBER_FUNC { this_t output; - output.columns[0].x = columns[0].x; - output.columns[1].x = columns[0].y; + output.rows[0].x = rows[0].x; + output.rows[1].x = rows[0].y; - output.columns[0].y = columns[1].x; - output.columns[1].y = columns[1].y; + output.rows[0].y = rows[1].x; + output.rows[1].y = rows[1].y; return output; } @@ -39,30 +39,30 @@ struct emulated_matrix using vec_t = emulated_vector_t3; using this_t = emulated_matrix; - vec_t columns[3]; + vec_t rows[3]; this_t getTransposed() NBL_CONST_MEMBER_FUNC { this_t output; - output.columns[0].x = columns[0].x; - output.columns[1].x = columns[0].y; - output.columns[2].x = columns[0].z; + output.rows[0].x = rows[0].x; + output.rows[1].x = rows[0].y; + output.rows[2].x = rows[0].z; - output.columns[0].y = columns[1].x; - output.columns[1].y = columns[1].y; - output.columns[2].y = columns[1].z; + output.rows[0].y = rows[1].x; + output.rows[1].y = rows[1].y; + output.rows[2].y = rows[1].z; - output.columns[0].z = columns[2].x; - output.columns[1].z = columns[2].y; - output.columns[2].z = columns[2].z; + output.rows[0].z 
= rows[2].x; + output.rows[1].z = rows[2].y; + output.rows[2].z = rows[2].z; return output; } vec_t operator[](uint32_t columnIdx) { - return columns[columnIdx]; + return rows[columnIdx]; } }; diff --git a/include/nbl/builtin/hlsl/portable/matrix_t.hlsl b/include/nbl/builtin/hlsl/portable/matrix_t.hlsl index 8cf3d8233f..854777e1f8 100644 --- a/include/nbl/builtin/hlsl/portable/matrix_t.hlsl +++ b/include/nbl/builtin/hlsl/portable/matrix_t.hlsl @@ -54,9 +54,9 @@ struct PortableMul64Helper V output; M matTransposed = mat.getTransposed(); - output.x = (matTransposed.columns[0] * vec).calcComponentSum(); - output.y = (matTransposed.columns[1] * vec).calcComponentSum(); - output.z = (matTransposed.columns[2] * vec).calcComponentSum(); + output.x = (matTransposed.rows[0] * vec).calcComponentSum(); + output.y = (matTransposed.rows[1] * vec).calcComponentSum(); + output.z = (matTransposed.rows[2] * vec).calcComponentSum(); return output; } From aa444c474603613ea299b1eb4d62db72e293569a Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz <34793522+AnastaZIuk@users.noreply.github.com> Date: Mon, 16 Sep 2024 10:08:05 +0200 Subject: [PATCH 053/358] Update CMakeLists.txt, change file names (vector_t.hlsl & matrix_t.hlsl), CMake had wrong ones --- src/nbl/builtin/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index f06deb7c46..5c3b3faf93 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -234,8 +234,8 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/macros.h") #emulated LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/float64_t.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/float64_t_impl.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/vector_t_impl.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/matrix_t_impl.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/vector_t.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/matrix_t.hlsl") #portable LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/portable/float64_t.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/portable/vector_t.hlsl") From 7ae2f526ee3d57827b44c0c0e7f6fb1d015e4a26 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Mon, 16 Sep 2024 12:18:56 -0300 Subject: [PATCH 054/358] Adding utils to get right order after FFT --- include/nbl/builtin/hlsl/fft/utils.hlsl | 0 include/nbl/builtin/hlsl/workgroup/fft.hlsl | 62 +++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 include/nbl/builtin/hlsl/fft/utils.hlsl diff --git a/include/nbl/builtin/hlsl/fft/utils.hlsl b/include/nbl/builtin/hlsl/fft/utils.hlsl new file mode 100644 index 0000000000..e69de29bb2 diff --git a/include/nbl/builtin/hlsl/workgroup/fft.hlsl b/include/nbl/builtin/hlsl/workgroup/fft.hlsl index a7a885e15f..2d00147ce3 100644 --- a/include/nbl/builtin/hlsl/workgroup/fft.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/fft.hlsl @@ -97,6 +97,68 @@ struct exchangeValues template NBL_CONSTEXPR uint32_t SharedMemoryDWORDs = (sizeof(complex_t) / sizeof(uint32_t)) * WorkgroupSize; + +template +enable_if_t bitShiftRightHigher(uint32_t i) +{ + // Highest H bits are numbered N-1 through N - H + // N - H is then the middle bit + // Lowest bits numbered from 0 through N - H - 1 + uint32_t low = i & ((1 << (N - H)) - 1); + uint32_t mid = i & (1 << (N - H)); + uint32_t high = i & ~((1 << (N - H + 1)) - 1); + + high >>= 1; + mid <<= H - 1; 
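+    // net effect (descriptive note, not part of the original commit): the top H bits of i are
+    // rotated right by one position - the old bit N - H wraps around to bit N - 1 while the
+    // remaining high bits each drop one place - and the low N - H bits are left untouched,
+    // e.g. for N = 3, H = 2 this maps 0b100 to 0b010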
+ + return mid | high | low; +} + +template +enable_if_t bitShiftLeftHigher(uint32_t i) +{ + // Highest H bits are numbered N-1 through N - H + // N - 1 is then the highest bit, and N - 2 through N - H are the middle bits + // Lowest bits numbered from 0 through N - H - 1 + uint32_t low = i & ((1 << (N - H)) - 1); + uint32_t mid = i & (~((1 << (N - H)) - 1) | ~(1 << (N - 1))); + uint32_t high = i & (1 << (N - 1)); + + mid <<= 1; + high >>= H - 1; + + return mid | high | low; +} + +// For an N-bit number, mirrors it around the Nyquist frequency, which for the range [0, 2^N - 1] is precisely 2^(N - 1) +template +uint32_t mirror(uint32_t i) +{ + return ((1 << N) - i) & ((1 << N) - 1) +} + +// This function maps the index `idx` in the output array of a Forward FFT to the index `freqIdx` in the DFT such that `DFT[freqIdx] = output[idx]` +// This is because Cooley-Tukey + subgroup operations end up spewing out the outputs in a weird order +template +uint32_t getFrequencyAt(uint32_t idx) +{ + NBL_CONSTEXPR_STATIC_INLINE uint32_t ELEMENTS_PER_INVOCATION_LOG_2 = uint32_t(mpl::log2::value); + NBL_CONSTEXPR_STATIC_INLINE uint32_t FFT_SIZE_LOG_2 = ELEMENTS_PER_INVOCATION_LOG_2 + uint32_t(mpl::log2::value); + + return mirror(bitShiftRightHigher(glsl::bitfieldReverse(idx) >> (32 - FFT_SIZE_LOG_2))); +} + +// This function maps the index `freqIdx` in the DFT to the index `idx` in the output array of a Forward FFT such that `DFT[freqIdx] = output[idx]` +// It is essentially the inverse of `getFrequencyAt` +template +uint32_t getOutputAt(uint32_t freqIdx) +{ + NBL_CONSTEXPR_STATIC_INLINE uint32_t ELEMENTS_PER_INVOCATION_LOG_2 = uint32_t(mpl::log2::value); + NBL_CONSTEXPR_STATIC_INLINE uint32_t FFT_SIZE_LOG_2 = ELEMENTS_PER_INVOCATION_LOG_2 + uint32_t(mpl::log2::value); + + return glsl::bitfieldReverse(bitShiftLeftHigher(mirror(freqIdx))) >> (32 - FFT_SIZE_LOG_2); +} + } //namespace fft // ----------------------------------- End Utils ----------------------------------------------- From bb3cae5daed8f5052fb34e532960e22f557116b4 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Mon, 16 Sep 2024 12:38:40 -0300 Subject: [PATCH 055/358] add NBL_OPENEXR_FORCE_SSH /Arek (some agents seems to not like passing keys to CMake it seems) --- 3rdparty/CMakeLists.txt | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 2c278407d1..4d804db5f7 100755 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -203,8 +203,16 @@ option(_NBL_COMPILE_WITH_OPEN_EXR_ "Build with OpenEXR library" ON) # but it's core # for new build system it doesn't matter since we no more fetch content stuff, temporary to fix current master CMake clones -set(OPENEXR_DEFLATE_REPO "git@github.com:ebiggers/libdeflate.git" CACHE STRING "Repo path for libdeflate source" FORCE) -set(OPENEXR_IMATH_REPO "git@github.com:AcademySoftwareFoundation/Imath.git" CACHE STRING "Repo for auto-build of Imath" FORCE) + +option(NBL_OPENEXR_FORCE_SSH "" ON) + +if(NBL_OPENEXR_FORCE_SSH) + set(OPENEXR_DEFLATE_REPO "git@github.com:ebiggers/libdeflate.git" CACHE STRING "Repo path for libdeflate source" FORCE) + set(OPENEXR_IMATH_REPO "git@github.com:AcademySoftwareFoundation/Imath.git" CACHE STRING "Repo for auto-build of Imath" FORCE) +else() + set(OPENEXR_DEFLATE_REPO "https://github.com/ebiggers/libdeflate.git" CACHE STRING "Repo path for libdeflate source" FORCE) + set(OPENEXR_IMATH_REPO "https://github.com/AcademySoftwareFoundation/Imath.git" CACHE STRING "Repo for auto-build of Imath" 
FORCE) +endif() set(_OLD_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) set(_OLD_BUILD_STATIC_LIBS ${BUILD_STATIC_LIBS}) From 29d07fba8248a4bbd529943df8ce15ed582936a8 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Sat, 21 Sep 2024 16:30:56 -0700 Subject: [PATCH 056/358] Fixes --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated/float64_t.hlsl | 2 +- .../nbl/builtin/hlsl/emulated/vector_t.hlsl | 407 ++++++++++++++---- .../nbl/builtin/hlsl/portable/float64_t.hlsl | 3 +- .../nbl/builtin/hlsl/portable/matrix_t.hlsl | 8 +- .../nbl/builtin/hlsl/portable/vector_t.hlsl | 36 +- include/nbl/builtin/hlsl/shapes/beziers.hlsl | 2 +- include/nbl/builtin/hlsl/tgmath.hlsl | 23 +- 8 files changed, 377 insertions(+), 106 deletions(-) diff --git a/examples_tests b/examples_tests index 7f7b48596a..cdafbbcc3b 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 7f7b48596a61c43d5d72b1e032525cc07ece91a8 +Subproject commit cdafbbcc3b5367e1139a5911f6a798d307128f5a diff --git a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl index 6051cee7b5..6da685654c 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl @@ -511,7 +511,7 @@ struct static_cast_helper,void { const int exponent = ieee754::extractExponent(v.data); - if (!From::isFastMathSuppoerted) + if (!From::isFastMathSupported) { if (exponent > ieee754::traits::exponentMax) return bit_cast(ieee754::traits::inf); diff --git a/include/nbl/builtin/hlsl/emulated/vector_t.hlsl b/include/nbl/builtin/hlsl/emulated/vector_t.hlsl index d4809647e4..8bbc6c4224 100644 --- a/include/nbl/builtin/hlsl/emulated/vector_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/vector_t.hlsl @@ -8,148 +8,396 @@ namespace nbl namespace hlsl { -template -struct emulated_vector {}; +namespace emulated_vector_impl +{ + + +template +struct _2_component_vec +{ + T x; + T y; + + static_assert(sizeof(T) <= 8); + + NBL_CONSTEXPR_INLINE_FUNC void setComponent(uint32_t componentIdx, T val) + { + if (componentIdx == 0) + x = val; + if (componentIdx == 1) + y = val; + } + + NBL_CONSTEXPR_INLINE_FUNC T getComponent(uint32_t componentIdx) + { + if (componentIdx == 0) + return x; + if (componentIdx == 1) + return y; + + // TODO: avoid code duplication, make it constexpr + using TAsUint = typename unsigned_integer_of_size::type; + uint64_t invalidComponentValue = nbl::hlsl::_static_cast(0xdeadbeefbadcaffeull >> (64 - sizeof(T) * 8)); + return nbl::hlsl::bit_cast(invalidComponentValue); + } + + NBL_CONSTEXPR_STATIC uint32_t Dimension = 2; +}; + +template +struct _3_component_vec +{ + T x; + T y; + T z; + + + NBL_CONSTEXPR_INLINE_FUNC void setComponent(uint32_t componentIdx, T val) + { + if (componentIdx == 0) + x = val; + if (componentIdx == 1) + y = val; + if (componentIdx == 2) + z = val; + } + + NBL_CONSTEXPR_INLINE_FUNC T getComponent(uint32_t componentIdx) + { + if (componentIdx == 0) + return x; + if (componentIdx == 1) + return y; + if (componentIdx == 1) + return z; + + // TODO: avoid code duplication, make it constexpr + using TAsUint = typename unsigned_integer_of_size::type; + uint64_t invalidComponentValue = nbl::hlsl::_static_cast(0xdeadbeefbadcaffeull >> (64 - sizeof(T) * 8)); + return nbl::hlsl::bit_cast(invalidComponentValue); + } + + NBL_CONSTEXPR_STATIC uint32_t Dimension = 3; +}; + +template +struct _4_component_vec +{ + T x; + T y; + T z; + T w; + + NBL_CONSTEXPR_INLINE_FUNC void setComponent(uint32_t componentIdx, T val) + { + if (componentIdx == 0) + x = 
val; + if (componentIdx == 1) + y = val; + if (componentIdx == 2) + z = val; + if (componentIdx == 3) + w = val; + } + + NBL_CONSTEXPR_INLINE_FUNC T getComponent(uint32_t componentIdx) + { + if (componentIdx == 0) + return x; + if (componentIdx == 1) + return y; + if (componentIdx == 1) + return z; + if (componentIdx == 3) + return w; + + // TODO: avoid code duplication, make it constexpr + using TAsUint = typename unsigned_integer_of_size::type; + uint64_t invalidComponentValue = nbl::hlsl::_static_cast(0xdeadbeefbadcaffeull >> (64 - sizeof(T) * 8)); + return nbl::hlsl::bit_cast(invalidComponentValue); + } + + NBL_CONSTEXPR_STATIC uint32_t Dimension = 4; +}; -template -struct emulated_vector +template ::value> +struct emulated_vector : CRTP { - using this_t = emulated_vector; + using this_t = emulated_vector; + using component_t = ComponentType; - EmulatedType x; - EmulatedType y; + NBL_CONSTEXPR_INLINE_FUNC this_t create(this_t other) + { + CRTP output; + + for (uint32_t i = 0u; i < CRTP::Dimension; ++i) + output.setComponent(i, other.getComponent(i)); + } - EmulatedType calcComponentSum() NBL_CONST_MEMBER_FUNC + template + NBL_CONSTEXPR_INLINE_FUNC this_t create(vector other) { - return x + y; + this_t output; + + for (uint32_t i = 0u; i < CRTP::Dimension; ++i) + output.setComponent(i, other[i]); + + return output; } - NBL_CONSTEXPR_STATIC_INLINE this_t create(EmulatedType x, EmulatedType y) + NBL_CONSTEXPR_INLINE_FUNC this_t operator+(ComponentType val) { this_t output; - output.x = x; - output.y = y; + + for (uint32_t i = 0u; i < CRTP::Dimension; ++i) + output.setComponent(i, this_t::getComponent(i) + val); return output; } + NBL_CONSTEXPR_INLINE_FUNC this_t operator+(this_t other) + { + this_t output; + + for (uint32_t i = 0u; i < CRTP::Dimension; ++i) + output.setComponent(i, this_t::getComponent(i) + other.getComponent(i)); - this_t operator+(float rhs) + return output; + } + NBL_CONSTEXPR_INLINE_FUNC this_t operator+(vector other) { this_t output; - EmulatedType rhsAsEF64 = EmulatedType::create(rhs); - output.x = x + rhsAsEF64; - output.y = y + rhsAsEF64; + + for (uint32_t i = 0u; i < CRTP::Dimension; ++i) + output.setComponent(i, this_t::getComponent(i) + other[i]); + + return output; + } + + NBL_CONSTEXPR_INLINE_FUNC this_t operator-(ComponentType val) + { + this_t output; + + for (uint32_t i = 0u; i < CRTP::Dimension; ++i) + output.setComponent(i, this_t::getComponent(i) - val); return output; } + NBL_CONSTEXPR_INLINE_FUNC this_t operator-(this_t other) + { + this_t output; - this_t operator+(EmulatedType rhs) + for (uint32_t i = 0u; i < CRTP::Dimension; ++i) + output.setComponent(i, this_t::getComponent(i) - other.getComponent(i)); + + return output; + } + NBL_CONSTEXPR_INLINE_FUNC this_t operator-(vector other) { this_t output; - output.x = x + rhs; - output.y = y + rhs; + + for (uint32_t i = 0u; i < CRTP::Dimension; ++i) + output.setComponent(i, this_t::getComponent(i) - other[i]); return output; } - this_t operator+(this_t rhs) + NBL_CONSTEXPR_INLINE_FUNC this_t operator*(ComponentType val) { this_t output; - output.x = x + rhs.x; - output.y = y + rhs.y; + + for (uint32_t i = 0u; i < CRTP::Dimension; ++i) + output.setComponent(i, this_t::getComponent(i) * val); return output; } + NBL_CONSTEXPR_INLINE_FUNC this_t operator*(this_t other) + { + this_t output; - this_t operator-(float rhs) + for (uint32_t i = 0u; i < CRTP::Dimension; ++i) + output.setComponent(i, this_t::getComponent(i) * other.getComponent(i)); + + return output; + } + NBL_CONSTEXPR_INLINE_FUNC this_t 
operator*(vector other) { - return create(x, y) + (-rhs); + this_t output; + + for (uint32_t i = 0u; i < CRTP::Dimension; ++i) + output.setComponent(i, this_t::getComponent(i) * other[i]); + + return output; } - this_t operator-(EmulatedType rhs) + NBL_CONSTEXPR_INLINE_FUNC component_t calcComponentSum() { - return create(x, y) + (rhs.flipSign()); + component_t sum = 0; + for (uint32_t i = 0u; i < CRTP::Dimension; ++i) + sum = sum + CRTP::getComponent(i); + + return sum; } +}; - this_t operator-(this_t rhs) +#define DEFINE_OPERATORS_FOR_TYPE(TYPE)\ +NBL_CONSTEXPR_INLINE_FUNC this_t operator+(TYPE val)\ +{\ + this_t output;\ + for (uint32_t i = 0u; i < CRTP::Dimension; ++i)\ + output.setComponent(i, CRTP::getComponent(i) + val);\ +\ + return output;\ +}\ +\ +NBL_CONSTEXPR_INLINE_FUNC this_t operator-(TYPE val)\ +{\ + this_t output;\ + for (uint32_t i = 0u; i < CRTP::Dimension; ++i)\ + output.setComponent(i, CRTP::getComponent(i) - val);\ +\ + return output;\ +}\ +\ +NBL_CONSTEXPR_INLINE_FUNC this_t operator*(TYPE val)\ +{\ + this_t output;\ + for (uint32_t i = 0u; i < CRTP::Dimension; ++i)\ + output.setComponent(i, CRTP::getComponent(i) * val);\ +\ + return output;\ +}\ +\ + +template +struct emulated_vector : CRTP +{ + using component_t = ComponentType; + using this_t = emulated_vector; + + NBL_CONSTEXPR_INLINE_FUNC this_t create(this_t other) { - rhs.x = rhs.x.flipSign(); - rhs.y = rhs.y.flipSign(); - return create(x, y) + rhs; + CRTP output; + + for (uint32_t i = 0u; i < CRTP::Dimension; ++i) + output.setComponent(i, other.getComponent(i)); } - this_t operator*(float rhs) + template + NBL_CONSTEXPR_INLINE_FUNC this_t create(vector other) { this_t output; - EmulatedType rhsAsEF64 = EmulatedType::create(rhs); - output.x = x * rhsAsEF64; - output.y = y * rhsAsEF64; + + for (uint32_t i = 0u; i < CRTP::Dimension; ++i) + output.setComponent(i, ComponentType::create(other[i])); return output; } - this_t operator*(EmulatedType rhs) + NBL_CONSTEXPR_INLINE_FUNC this_t operator+(this_t other) { this_t output; - output.x = x * rhs; - output.y = y * rhs; + + for (uint32_t i = 0u; i < CRTP::Dimension; ++i) + output.setComponent(i, CRTP::getComponent(i) + other.getComponent(i)); return output; } + NBL_CONSTEXPR_INLINE_FUNC this_t operator-(this_t other) + { + this_t output; + + for (uint32_t i = 0u; i < CRTP::Dimension; ++i) + output.setComponent(i, CRTP::getComponent(i) - other.getComponent(i)); - this_t operator*(this_t rhs) + return output; + } + NBL_CONSTEXPR_INLINE_FUNC this_t operator*(this_t other) { this_t output; - output.x = x * rhs.x; - output.y = y * rhs.y; + + for (uint32_t i = 0u; i < CRTP::Dimension; ++i) + output.setComponent(i, CRTP::getComponent(i) * other.getComponent(i)); return output; } + + DEFINE_OPERATORS_FOR_TYPE(float32_t) + DEFINE_OPERATORS_FOR_TYPE(float64_t) + DEFINE_OPERATORS_FOR_TYPE(uint16_t) + DEFINE_OPERATORS_FOR_TYPE(uint32_t) + DEFINE_OPERATORS_FOR_TYPE(uint64_t) + DEFINE_OPERATORS_FOR_TYPE(int16_t) + DEFINE_OPERATORS_FOR_TYPE(int32_t) + DEFINE_OPERATORS_FOR_TYPE(int64_t) + + NBL_CONSTEXPR_INLINE_FUNC ComponentType calcComponentSum() + { + ComponentType sum = ComponentType::create(0); + for (uint32_t i = 0u; i < CRTP::Dimension; ++i) + sum = sum + CRTP::getComponent(i); + + return sum; + } }; -template -struct emulated_vector -{ - using this_t = emulated_vector; +#undef DEFINE_OPERATORS_FOR_TYPE - EmulatedType x; - EmulatedType y; - EmulatedType z; +} - EmulatedType calcComponentSum() NBL_CONST_MEMBER_FUNC +template +struct emulated_vector_t {}; +template +struct 
emulated_vector_t : emulated_vector_impl::emulated_vector > {}; +template +struct emulated_vector_t : emulated_vector_impl::emulated_vector > {}; +template +struct emulated_vector_t : emulated_vector_impl::emulated_vector > {}; + +template +using emulated_vector_t2 = emulated_vector_impl::emulated_vector >; +template +using emulated_vector_t3 = emulated_vector_impl::emulated_vector >; +template +using emulated_vector_t4 = emulated_vector_impl::emulated_vector >; + +template +struct array_get +{ + T operator()(I index, NBL_CONST_REF_ARG(U) arr) { - return x + y + z; + return arr[index]; } +}; - this_t operator*(NBL_CONST_REF_ARG(this_t) rhs) NBL_CONST_MEMBER_FUNC +template +struct array_get::component_t, emulated_vector_t, uint32_t> +{ + T operator()(uint32_t index, NBL_CONST_REF_ARG(emulated_vector_t) vec) { - this_t output; - output.x = x * rhs.x; - output.y = y * rhs.y; - output.z = z * rhs.z; + return vec.getComponent(index); + } +}; - return output; +template +struct array_set +{ + void operator()(I index, NBL_REF_ARG(U) arr, T val) + { + arr[index] = val; } }; -template -struct emulated_vector +template +struct array_set, uint32_t> { - using type = emulated_vector; + using type_t = T; - EmulatedType x; - EmulatedType y; - EmulatedType z; - EmulatedType w; + T operator()(uint32_t index, NBL_CONST_REF_ARG(emulated_vector_t) vec, T value) + { + vec.setComponent(index, value); + } }; -template -using emulated_vector_t2 = emulated_vector; -template -using emulated_vector_t3 = emulated_vector; -template -using emulated_vector_t4 = emulated_vector; - namespace impl { template @@ -157,7 +405,11 @@ struct static_cast_helper, vector, void> { static inline emulated_vector_t2 cast(vector vec) { - return emulated_vector_t2(_static_cast(vec.x), _static_cast(vec.y)); + emulated_vector_t2 output; + output.x = _static_cast(vec.x); + output.y = _static_cast(vec.y); + + return output; } }; @@ -166,7 +418,12 @@ struct static_cast_helper, vector, void> { static inline emulated_vector_t3 cast(vector vec) { - return emulated_vector_t3(_static_cast(vec.x), _static_cast(vec.y), _static_cast(vec.z)); + emulated_vector_t2 output; + output.x = _static_cast(vec.x); + output.y = _static_cast(vec.y); + output.z = _static_cast(vec.z); + + return output; } }; @@ -175,7 +432,13 @@ struct static_cast_helper, vector, void> { static inline emulated_vector_t4 cast(vector vec) { - return emulated_vector_t4(_static_cast(vec.x), _static_cast(vec.y), _static_cast(vec.z), _static_cast(vec.w)); + emulated_vector_t2 output; + output.x = _static_cast(vec.x); + output.y = _static_cast(vec.y); + output.z = _static_cast(vec.z); + output.w = _static_cast(vec.w); + + return output; } }; } diff --git a/include/nbl/builtin/hlsl/portable/float64_t.hlsl b/include/nbl/builtin/hlsl/portable/float64_t.hlsl index 92b6e53133..131d37a87b 100644 --- a/include/nbl/builtin/hlsl/portable/float64_t.hlsl +++ b/include/nbl/builtin/hlsl/portable/float64_t.hlsl @@ -9,7 +9,8 @@ namespace hlsl { template #ifdef __HLSL_VERSION -using portable_float64_t = typename conditional::shaderFloat64, float64_t, emulated_float64_t >::type; +//using portable_float64_t = typename conditional::shaderFloat64, float64_t, emulated_float64_t >::type; +using portable_float64_t = typename conditional::shaderFloat64, emulated_float64_t, emulated_float64_t >::type; #else using portable_float64_t = float64_t; #endif diff --git a/include/nbl/builtin/hlsl/portable/matrix_t.hlsl b/include/nbl/builtin/hlsl/portable/matrix_t.hlsl index 854777e1f8..a01c16748d 100644 --- 
a/include/nbl/builtin/hlsl/portable/matrix_t.hlsl +++ b/include/nbl/builtin/hlsl/portable/matrix_t.hlsl @@ -33,14 +33,14 @@ using portable_matrix_t3x3 = portable_matrix_t; #ifdef __HLSL_VERSION template -using portable_matrix64_t2x2 = portable_matrix_t2x2 >; +using portable_float64_t2x2 = portable_matrix_t2x2 >; template -using portable_matrix64_t3x3 = portable_matrix_t3x3 >; +using portable_float64_t3x3 = portable_matrix_t3x3 >; #else template -using portable_matrix64_t2x2 = portable_matrix_t2x2; +using portable_float64_t2x2 = portable_matrix_t2x2; template -using portable_matrix64_t3x3 = portable_matrix_t3x3; +using portable_float64_t3x3 = portable_matrix_t3x3; #endif namespace impl diff --git a/include/nbl/builtin/hlsl/portable/vector_t.hlsl b/include/nbl/builtin/hlsl/portable/vector_t.hlsl index 474975fb21..74df16579b 100644 --- a/include/nbl/builtin/hlsl/portable/vector_t.hlsl +++ b/include/nbl/builtin/hlsl/portable/vector_t.hlsl @@ -9,6 +9,8 @@ namespace nbl namespace hlsl { +namespace portable_vector_impl +{ template::value> struct portable_vector { @@ -18,12 +20,30 @@ struct portable_vector template struct portable_vector { - using type = portable_vector; + using type = emulated_vector_t; }; + +template +struct portable_vector +{ + using type = emulated_vector_t2; +}; +template +struct portable_vector +{ + using type = emulated_vector_t3; +}; +template +struct portable_vector +{ + using type = emulated_vector_t4; +}; + #endif +} template -using portable_vector_t = typename portable_vector::type; +using portable_vector_t = typename portable_vector_impl::portable_vector::type; template using portable_vector_t2 = portable_vector_t; @@ -34,18 +54,18 @@ using portable_vector_t4 = portable_vector_t; #ifdef __HLSL_VERSION template -using portable_vector64_t2 = portable_vector_t2 >; +using portable_float64_t2 = portable_vector_t2 >; template -using portable_vector64_t3 = portable_vector_t3 >; +using portable_float64_t3 = portable_vector_t3 >; template -using portable_vector64_t4 = portable_vector_t4 >; +using portable_float64_t4 = portable_vector_t4 >; #else template -using portable_vector64_t2 = portable_vector_t2; +using portable_float64_t2 = portable_vector_t2; template -using portable_vector64_t3 = portable_vector_t3; +using portable_float64_t3 = portable_vector_t3; template -using portable_vector64_t4 = portable_vector_t4; +using portable_float64_t4 = portable_vector_t4; #endif } diff --git a/include/nbl/builtin/hlsl/shapes/beziers.hlsl b/include/nbl/builtin/hlsl/shapes/beziers.hlsl index 936a37fa7e..fa009074e0 100644 --- a/include/nbl/builtin/hlsl/shapes/beziers.hlsl +++ b/include/nbl/builtin/hlsl/shapes/beziers.hlsl @@ -517,7 +517,7 @@ static math::equations::Quartic getBezierBezierIntersectionEquation(NBL { using float_t2 = portable_vector_t2; using float64 = portable_float64_t; - using float64_vec2 = portable_vector64_t2; + using float64_vec2 = portable_float64_t2; // Algorithm based on Computer Aided Geometric Design: diff --git a/include/nbl/builtin/hlsl/tgmath.hlsl b/include/nbl/builtin/hlsl/tgmath.hlsl index 4a78ba9aed..68db6e932f 100644 --- a/include/nbl/builtin/hlsl/tgmath.hlsl +++ b/include/nbl/builtin/hlsl/tgmath.hlsl @@ -17,23 +17,6 @@ namespace tgmath namespace impl { -template::value> -NBL_CONSTEXPR_INLINE_FUNC bool isInf(T val) -{ - using AsUint = typename unsigned_integer_of_size::type; - using AsFloat = typename float_of_size::type; - - if (IsTFundamental) - { - return isinf(bit_cast(val)); - } - else - { - AsUint tmp = bit_cast(val); - return (tmp & 
(~ieee754::traits::signMask)) == ieee754::traits::inf; - } -} - } template @@ -49,7 +32,11 @@ inline bool isNaN(T val) template NBL_CONSTEXPR_INLINE_FUNC bool isInf(T val) { - return impl::isInf(val); + using AsUint = typename unsigned_integer_of_size::type; + using AsFloat = typename float_of_size::type; + + AsUint tmp = bit_cast(val); + return (tmp & (~ieee754::traits::signMask)) == ieee754::traits::inf; } } From 734b8eb60638fd32f49cd545c965a5779d0dc034 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Mon, 23 Sep 2024 15:43:54 -0700 Subject: [PATCH 057/358] More fixes --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated/float64_t.hlsl | 40 +----- .../builtin/hlsl/emulated/float64_t_impl.hlsl | 6 +- .../nbl/builtin/hlsl/emulated/matrix_t.hlsl | 4 +- .../nbl/builtin/hlsl/emulated/vector_t.hlsl | 136 +++++++++++------- .../nbl/builtin/hlsl/portable/matrix_t.hlsl | 4 +- .../nbl/builtin/hlsl/portable/vector_t.hlsl | 1 - 7 files changed, 96 insertions(+), 97 deletions(-) diff --git a/examples_tests b/examples_tests index cdafbbcc3b..aebab4a34f 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit cdafbbcc3b5367e1139a5911f6a798d307128f5a +Subproject commit aebab4a34f486f2ab7556a1fa47b33ddca5c7e83 diff --git a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl index 6da685654c..8302b5405e 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl @@ -503,7 +503,6 @@ struct static_cast_helper,void using ToAsFloat = typename float_of_size::type; using ToAsUint = typename unsigned_integer_of_size::type; - if (is_same_v) return To(bit_cast(v.data)); @@ -563,45 +562,12 @@ struct static_cast_helper,void } }; -template -struct static_cast_helper, float32_t, void> -{ - using To = emulated_float64_t; - - static inline To cast(float32_t v) - { - return To::create(v); - } -}; - -template -struct static_cast_helper, float64_t, void> -{ - using To = emulated_float64_t; - - static inline To cast(float64_t v) - { - return To::create(v); - } -}; - -template -struct static_cast_helper, uint32_t, void> +template +struct static_cast_helper, From, void> { using To = emulated_float64_t; - static inline To cast(uint32_t v) - { - return To::create(v); - } -}; - -template -struct static_cast_helper, uint64_t, void> -{ - using To = emulated_float64_t; - - static inline To cast(uint64_t v) + static inline To cast(From v) { return To::create(v); } diff --git a/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl index def46a36c9..8f6e37f26a 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl @@ -59,10 +59,10 @@ inline uint64_t castFloat32ToStorageType(float32_t val) if (tgmath::isInf(val)) return ieee754::traits::inf | sign; uint32_t asUint = ieee754::impl::bitCastToUintType(val); - const int f32BiasedExp = ieee754::extractBiasedExponent(val); - if (f32BiasedExp == 0) + const int f32Exp = ieee754::extractExponent(val); + if (f32Exp == 0) return sign; - const uint64_t biasedExp = uint64_t(f32BiasedExp - ieee754::traits::exponentBias + ieee754::traits::exponentBias) << (ieee754::traits::mantissaBitCnt); + const uint64_t biasedExp = uint64_t(f32Exp + ieee754::traits::exponentBias) << (ieee754::traits::mantissaBitCnt); const uint64_t mantissa = (uint64_t(ieee754::traits::mantissaMask) & asUint) << (ieee754::traits::mantissaBitCnt - ieee754::traits::mantissaBitCnt); 
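    // Worked example (illustrative note, assuming the standard IEEE-754 binary32/binary64
    // layouts encoded by the ieee754::traits constants above): for the input 1.5f, whose
    // binary32 bit pattern is 0x3FC00000,
    //   sign      = 0
    //   biasedExp = uint64_t(0 + 1023) << 52          = 0x3FF0000000000000   (the unbiased exponent of 1.5 is 0)
    //   mantissa  = uint64_t(0x00400000) << (52 - 23) = 0x0008000000000000
    // so the value assembled below is 0x3FF8000000000000, the binary64 bit pattern of 1.5.
    // Zero and denormal inputs never reach this point; they already returned just the sign
    // bit above, i.e. they are flushed to a signed zero.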
return sign | biasedExp | mantissa; diff --git a/include/nbl/builtin/hlsl/emulated/matrix_t.hlsl b/include/nbl/builtin/hlsl/emulated/matrix_t.hlsl index ba5156c962..2dde2bd90c 100644 --- a/include/nbl/builtin/hlsl/emulated/matrix_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/matrix_t.hlsl @@ -60,9 +60,9 @@ struct emulated_matrix return output; } - vec_t operator[](uint32_t columnIdx) + vec_t operator[](uint32_t rowIdx) { - return rows[columnIdx]; + return rows[rowIdx]; } }; diff --git a/include/nbl/builtin/hlsl/emulated/vector_t.hlsl b/include/nbl/builtin/hlsl/emulated/vector_t.hlsl index 8bbc6c4224..fe7ccaec42 100644 --- a/include/nbl/builtin/hlsl/emulated/vector_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/vector_t.hlsl @@ -68,7 +68,7 @@ struct _3_component_vec return x; if (componentIdx == 1) return y; - if (componentIdx == 1) + if (componentIdx == 2) return z; // TODO: avoid code duplication, make it constexpr @@ -106,7 +106,7 @@ struct _4_component_vec return x; if (componentIdx == 1) return y; - if (componentIdx == 1) + if (componentIdx == 2) return z; if (componentIdx == 3) return w; @@ -126,16 +126,14 @@ struct emulated_vector : CRTP using this_t = emulated_vector; using component_t = ComponentType; - NBL_CONSTEXPR_INLINE_FUNC this_t create(this_t other) + NBL_CONSTEXPR_STATIC_INLINE this_t create(this_t other) { CRTP output; for (uint32_t i = 0u; i < CRTP::Dimension; ++i) output.setComponent(i, other.getComponent(i)); } - - template - NBL_CONSTEXPR_INLINE_FUNC this_t create(vector other) + NBL_CONSTEXPR_STATIC_INLINE this_t create(vector other) { this_t output; @@ -145,7 +143,7 @@ struct emulated_vector : CRTP return output; } - NBL_CONSTEXPR_INLINE_FUNC this_t operator+(ComponentType val) + NBL_CONSTEXPR_INLINE_FUNC this_t operator+(component_t val) { this_t output; @@ -163,7 +161,7 @@ struct emulated_vector : CRTP return output; } - NBL_CONSTEXPR_INLINE_FUNC this_t operator+(vector other) + NBL_CONSTEXPR_INLINE_FUNC this_t operator+(vector other) { this_t output; @@ -173,12 +171,12 @@ struct emulated_vector : CRTP return output; } - NBL_CONSTEXPR_INLINE_FUNC this_t operator-(ComponentType val) + NBL_CONSTEXPR_INLINE_FUNC this_t operator-(component_t val) { this_t output; for (uint32_t i = 0u; i < CRTP::Dimension; ++i) - output.setComponent(i, this_t::getComponent(i) - val); + output.setComponent(i, CRTP::getComponent(i) - val); return output; } @@ -187,26 +185,26 @@ struct emulated_vector : CRTP this_t output; for (uint32_t i = 0u; i < CRTP::Dimension; ++i) - output.setComponent(i, this_t::getComponent(i) - other.getComponent(i)); + output.setComponent(i, CRTP::getComponent(i) - other.getComponent(i)); return output; } - NBL_CONSTEXPR_INLINE_FUNC this_t operator-(vector other) + NBL_CONSTEXPR_INLINE_FUNC this_t operator-(vector other) { this_t output; for (uint32_t i = 0u; i < CRTP::Dimension; ++i) - output.setComponent(i, this_t::getComponent(i) - other[i]); + output.setComponent(i, CRTP::getComponent(i) - other[i]); return output; } - NBL_CONSTEXPR_INLINE_FUNC this_t operator*(ComponentType val) + NBL_CONSTEXPR_INLINE_FUNC this_t operator*(component_t val) { this_t output; for (uint32_t i = 0u; i < CRTP::Dimension; ++i) - output.setComponent(i, this_t::getComponent(i) * val); + output.setComponent(i, CRTP::getComponent(i) * val); return output; } @@ -215,16 +213,16 @@ struct emulated_vector : CRTP this_t output; for (uint32_t i = 0u; i < CRTP::Dimension; ++i) - output.setComponent(i, this_t::getComponent(i) * other.getComponent(i)); + output.setComponent(i, 
CRTP::getComponent(i) * other.getComponent(i)); return output; } - NBL_CONSTEXPR_INLINE_FUNC this_t operator*(vector other) + NBL_CONSTEXPR_INLINE_FUNC this_t operator*(vector other) { this_t output; for (uint32_t i = 0u; i < CRTP::Dimension; ++i) - output.setComponent(i, this_t::getComponent(i) * other[i]); + output.setComponent(i, CRTP::getComponent(i) * other[i]); return output; } @@ -244,7 +242,7 @@ NBL_CONSTEXPR_INLINE_FUNC this_t operator+(TYPE val)\ {\ this_t output;\ for (uint32_t i = 0u; i < CRTP::Dimension; ++i)\ - output.setComponent(i, CRTP::getComponent(i) + val);\ + output.setComponent(i, CRTP::getComponent(i) + component_t::create(val));\ \ return output;\ }\ @@ -253,7 +251,7 @@ NBL_CONSTEXPR_INLINE_FUNC this_t operator-(TYPE val)\ {\ this_t output;\ for (uint32_t i = 0u; i < CRTP::Dimension; ++i)\ - output.setComponent(i, CRTP::getComponent(i) - val);\ + output.setComponent(i, CRTP::getComponent(i) - component_t::create(val));\ \ return output;\ }\ @@ -262,19 +260,20 @@ NBL_CONSTEXPR_INLINE_FUNC this_t operator*(TYPE val)\ {\ this_t output;\ for (uint32_t i = 0u; i < CRTP::Dimension; ++i)\ - output.setComponent(i, CRTP::getComponent(i) * val);\ + output.setComponent(i, CRTP::getComponent(i) * component_t::create(val));\ \ return output;\ }\ \ +// TODO: some of code duplication could be avoided template struct emulated_vector : CRTP { using component_t = ComponentType; using this_t = emulated_vector; - NBL_CONSTEXPR_INLINE_FUNC this_t create(this_t other) + NBL_CONSTEXPR_STATIC_INLINE this_t create(this_t other) { CRTP output; @@ -283,7 +282,7 @@ struct emulated_vector : CRTP } template - NBL_CONSTEXPR_INLINE_FUNC this_t create(vector other) + NBL_CONSTEXPR_STATIC_INLINE this_t create(vector other) { this_t output; @@ -340,25 +339,39 @@ struct emulated_vector : CRTP } }; -#undef DEFINE_OPERATORS_FOR_TYPE - -} - template -struct emulated_vector_t {}; +struct CRTPParentStructSelector +{ + using type = void; +}; template -struct emulated_vector_t : emulated_vector_impl::emulated_vector > {}; +struct CRTPParentStructSelector +{ + using type = _2_component_vec; +}; template -struct emulated_vector_t : emulated_vector_impl::emulated_vector > {}; +struct CRTPParentStructSelector +{ + using type = _3_component_vec; +}; template -struct emulated_vector_t : emulated_vector_impl::emulated_vector > {}; +struct CRTPParentStructSelector +{ + using type = _4_component_vec; +}; +#undef DEFINE_OPERATORS_FOR_TYPE + +} + +template +using emulated_vector_t = emulated_vector_impl::emulated_vector::type>; template -using emulated_vector_t2 = emulated_vector_impl::emulated_vector >; +using emulated_vector_t2 = emulated_vector_impl::emulated_vector::type>; template -using emulated_vector_t3 = emulated_vector_impl::emulated_vector >; +using emulated_vector_t3 = emulated_vector_impl::emulated_vector::type>; template -using emulated_vector_t4 = emulated_vector_impl::emulated_vector >; +using emulated_vector_t4 = emulated_vector_impl::emulated_vector::type>; template struct array_get @@ -369,14 +382,15 @@ struct array_get } }; -template -struct array_get::component_t, emulated_vector_t, uint32_t> -{ - T operator()(uint32_t index, NBL_CONST_REF_ARG(emulated_vector_t) vec) - { - return vec.getComponent(index); - } -}; +// TODO: fix +//template +//struct array_get::component_t, emulated_vector_t, uint32_t> +//{ +// T operator()(uint32_t index, NBL_CONST_REF_ARG(emulated_vector_t) vec) +// { +// return vec.getComponent(index); +// } +//}; template struct array_set @@ -387,16 +401,17 @@ struct array_set } 
}; -template -struct array_set, uint32_t> -{ - using type_t = T; - - T operator()(uint32_t index, NBL_CONST_REF_ARG(emulated_vector_t) vec, T value) - { - vec.setComponent(index, value); - } -}; +// TODO: fix +//template +//struct array_set, uint32_t> +//{ +// using type_t = T; +// +// T operator()(uint32_t index, NBL_CONST_REF_ARG(emulated_vector_t) vec, T value) +// { +// vec.setComponent(index, value); +// } +//}; namespace impl { @@ -441,6 +456,23 @@ struct static_cast_helper, vector, void> return output; } }; + +template +struct static_cast_helper, emulated_vector_t, void> +{ + using OutputVecType = vector; + using InputVecType = emulated_vector_t; + + static inline OutputVecType cast(InputVecType vec) + { + OutputVecType output; + output.x = _static_cast(vec.x); + output.y = _static_cast(vec.y); + + return output; + } +}; + } } diff --git a/include/nbl/builtin/hlsl/portable/matrix_t.hlsl b/include/nbl/builtin/hlsl/portable/matrix_t.hlsl index a01c16748d..e3c805d367 100644 --- a/include/nbl/builtin/hlsl/portable/matrix_t.hlsl +++ b/include/nbl/builtin/hlsl/portable/matrix_t.hlsl @@ -46,13 +46,15 @@ using portable_float64_t3x3 = portable_matrix_t3x3; namespace impl { // TODO: move to emulated/matrix.hlsl +// TODO: make one template for all dimensions template struct PortableMul64Helper { static inline V multiply(M mat, V vec) { V output; - M matTransposed = mat.getTransposed(); + //M matTransposed = mat.getTransposed(); + M matTransposed = mat; output.x = (matTransposed.rows[0] * vec).calcComponentSum(); output.y = (matTransposed.rows[1] * vec).calcComponentSum(); diff --git a/include/nbl/builtin/hlsl/portable/vector_t.hlsl b/include/nbl/builtin/hlsl/portable/vector_t.hlsl index 74df16579b..8a558be581 100644 --- a/include/nbl/builtin/hlsl/portable/vector_t.hlsl +++ b/include/nbl/builtin/hlsl/portable/vector_t.hlsl @@ -8,7 +8,6 @@ namespace nbl { namespace hlsl { - namespace portable_vector_impl { template::value> From 29dadfbc4e0cf9ddb1cc0db92f9302b3e9b0f013 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Tue, 24 Sep 2024 17:40:55 -0300 Subject: [PATCH 058/358] Changes made to bit ordering function --- examples_tests | 2 +- include/nbl/builtin/hlsl/fft/common.hlsl | 4 ++-- include/nbl/builtin/hlsl/workgroup/fft.hlsl | 17 +++++------------ 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/examples_tests b/examples_tests index 1849b547e0..9adee12bf7 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 1849b547e05cbbfb50e3f41545373bafa48cf260 +Subproject commit 9adee12bf72b925a25a64bff7461f29e34332e26 diff --git a/include/nbl/builtin/hlsl/fft/common.hlsl b/include/nbl/builtin/hlsl/fft/common.hlsl index b471b13a9f..fa54da5a3f 100644 --- a/include/nbl/builtin/hlsl/fft/common.hlsl +++ b/include/nbl/builtin/hlsl/fft/common.hlsl @@ -21,9 +21,9 @@ complex_t twiddle(uint32_t k, uint32_t halfN) const Scalar kthRootAngleRadians = numbers::pi * Scalar(k) / Scalar(halfN); retVal.real( cos(kthRootAngleRadians) ); if (! 
inverse) - retVal.imag( sin(kthRootAngleRadians) ); - else retVal.imag( sin(-kthRootAngleRadians) ); + else + retVal.imag( sin(kthRootAngleRadians) ); return retVal; } diff --git a/include/nbl/builtin/hlsl/workgroup/fft.hlsl b/include/nbl/builtin/hlsl/workgroup/fft.hlsl index 2d00147ce3..765a69034d 100644 --- a/include/nbl/builtin/hlsl/workgroup/fft.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/fft.hlsl @@ -130,33 +130,26 @@ enable_if_t bitShiftLeftHigher(uint32_t i) return mid | high | low; } -// For an N-bit number, mirrors it around the Nyquist frequency, which for the range [0, 2^N - 1] is precisely 2^(N - 1) -template -uint32_t mirror(uint32_t i) -{ - return ((1 << N) - i) & ((1 << N) - 1) -} - // This function maps the index `idx` in the output array of a Forward FFT to the index `freqIdx` in the DFT such that `DFT[freqIdx] = output[idx]` // This is because Cooley-Tukey + subgroup operations end up spewing out the outputs in a weird order template -uint32_t getFrequencyAt(uint32_t idx) +uint32_t getFrequencyIndex(uint32_t outputIdx) { NBL_CONSTEXPR_STATIC_INLINE uint32_t ELEMENTS_PER_INVOCATION_LOG_2 = uint32_t(mpl::log2::value); NBL_CONSTEXPR_STATIC_INLINE uint32_t FFT_SIZE_LOG_2 = ELEMENTS_PER_INVOCATION_LOG_2 + uint32_t(mpl::log2::value); - return mirror(bitShiftRightHigher(glsl::bitfieldReverse(idx) >> (32 - FFT_SIZE_LOG_2))); + return bitShiftRightHigher(glsl::bitfieldReverse(outputIdx) >> (32 - FFT_SIZE_LOG_2)); } // This function maps the index `freqIdx` in the DFT to the index `idx` in the output array of a Forward FFT such that `DFT[freqIdx] = output[idx]` -// It is essentially the inverse of `getFrequencyAt` +// It is essentially the inverse of `getFrequencyIndex` template -uint32_t getOutputAt(uint32_t freqIdx) +uint32_t getOutputIndex(uint32_t freqIdx) { NBL_CONSTEXPR_STATIC_INLINE uint32_t ELEMENTS_PER_INVOCATION_LOG_2 = uint32_t(mpl::log2::value); NBL_CONSTEXPR_STATIC_INLINE uint32_t FFT_SIZE_LOG_2 = ELEMENTS_PER_INVOCATION_LOG_2 + uint32_t(mpl::log2::value); - return glsl::bitfieldReverse(bitShiftLeftHigher(mirror(freqIdx))) >> (32 - FFT_SIZE_LOG_2); + return glsl::bitfieldReverse(bitShiftLeftHigher(freqIdx)) >> (32 - FFT_SIZE_LOG_2); } } //namespace fft From ab95289b42b704df23ab7e394097b26839fc6528 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Wed, 25 Sep 2024 11:55:31 -0700 Subject: [PATCH 059/358] Even more fixes --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated/float64_t.hlsl | 29 +++++++++++- .../builtin/hlsl/emulated/float64_t_impl.hlsl | 6 +-- .../nbl/builtin/hlsl/emulated/vector_t.hlsl | 44 ++++++++++++++++--- .../nbl/builtin/hlsl/portable/float64_t.hlsl | 4 +- .../nbl/builtin/hlsl/portable/matrix_t.hlsl | 8 ++-- .../nbl/builtin/hlsl/portable/vector_t.hlsl | 16 ------- 7 files changed, 74 insertions(+), 35 deletions(-) diff --git a/examples_tests b/examples_tests index aebab4a34f..62f8a5403c 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit aebab4a34f486f2ab7556a1fa47b33ddca5c7e83 +Subproject commit 62f8a5403c990edb114fa7e2f752ec89b815bbce diff --git a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl index 8302b5405e..af6dd1482c 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl @@ -69,7 +69,7 @@ namespace hlsl emulated_float64_t retval; uint32_t lo, hi; asuint(val, lo, hi); - retval.data = (uint64_t(hi) << 32) | lo; + retval.data = (uint64_t(hi) << 32) | uint64_t(lo); return retval; #else return 
bit_cast(reinterpret_cast(val)); @@ -85,6 +85,15 @@ namespace hlsl // arithmetic operators this_t operator+(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { + // TODO: REMOVE! + float64_t sum = bit_cast(data) + bit_cast(rhs.data); + uint64_t sumAsUint = bit_cast(sum); + + this_t output2; + output2.data = sumAsUint; + + return output2; + if (FlushDenormToZero) { if(FastMath) @@ -217,6 +226,12 @@ namespace hlsl emulated_float64_t operator*(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { + float64_t sum = bit_cast(data) * bit_cast(rhs.data); + uint64_t sumAsUint = bit_cast(sum); + + this_t output2; + output2.data = sumAsUint; + if(FlushDenormToZero) { emulated_float64_t retval = this_t::create(0ull); @@ -487,6 +502,18 @@ output.data = bit_cast(val);\ return output;\ }\ \ +template<>\ +NBL_CONSTEXPR_FUNC uint64_t bit_cast(NBL_CONST_REF_ARG( __VA_ARGS__ ) val)\ +{\ +return val.data;\ +}\ +\ +template<>\ +NBL_CONSTEXPR_FUNC float64_t bit_cast(NBL_CONST_REF_ARG( __VA_ARGS__ ) val)\ +{\ +return bit_cast(val.data);\ +}\ +\ namespace impl { diff --git a/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl index 8f6e37f26a..6ef891e58f 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl @@ -59,10 +59,10 @@ inline uint64_t castFloat32ToStorageType(float32_t val) if (tgmath::isInf(val)) return ieee754::traits::inf | sign; uint32_t asUint = ieee754::impl::bitCastToUintType(val); - const int f32Exp = ieee754::extractExponent(val); - if (f32Exp == 0) + const int f32BiasedExp = int(ieee754::extractBiasedExponent(val)); + if (f32BiasedExp == 0) return sign; - const uint64_t biasedExp = uint64_t(f32Exp + ieee754::traits::exponentBias) << (ieee754::traits::mantissaBitCnt); + const uint64_t biasedExp = uint64_t(f32BiasedExp - ieee754::traits::exponentBias + ieee754::traits::exponentBias) << (ieee754::traits::mantissaBitCnt); const uint64_t mantissa = (uint64_t(ieee754::traits::mantissaMask) & asUint) << (ieee754::traits::mantissaBitCnt - ieee754::traits::mantissaBitCnt); return sign | biasedExp | mantissa; diff --git a/include/nbl/builtin/hlsl/emulated/vector_t.hlsl b/include/nbl/builtin/hlsl/emulated/vector_t.hlsl index fe7ccaec42..537171ff3e 100644 --- a/include/nbl/builtin/hlsl/emulated/vector_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/vector_t.hlsl @@ -339,6 +339,8 @@ struct emulated_vector : CRTP } }; +#undef DEFINE_OPERATORS_FOR_TYPE + template struct CRTPParentStructSelector { @@ -360,8 +362,6 @@ struct CRTPParentStructSelector using type = _4_component_vec; }; -#undef DEFINE_OPERATORS_FOR_TYPE - } template @@ -373,20 +373,50 @@ using emulated_vector_t3 = emulated_vector_impl::emulated_vector using emulated_vector_t4 = emulated_vector_impl::emulated_vector::type>; -template +// TODO: better implementation +template +struct is_valid_emulated_vector +{ + NBL_CONSTEXPR_STATIC bool value = is_same_v > || + is_same_v > || + is_same_v >; +}; + +#ifdef __HLSL_VERSION +template struct array_get { - T operator()(I index, NBL_CONST_REF_ARG(U) arr) + T operator()(NBL_CONST_REF_ARG(U) vec, const I ix) { - return arr[index]; + return vec[ix]; } }; -// TODO: fix +template +struct array_get, TT, I> +{ + TT operator()(NBL_CONST_REF_ARG(emulated_vector_t) vec, const I ix) + { + return vec.getComponent(ix); + } +}; +#endif + +//template +//struct array_get +//{ +// T operator()(I index, NBL_CONST_REF_ARG(U) arr) +// { +// return arr[index]; +// } +//}; +// //template //struct 
array_get::component_t, emulated_vector_t, uint32_t> //{ -// T operator()(uint32_t index, NBL_CONST_REF_ARG(emulated_vector_t) vec) +// using vec_t = emulated_vector_t; +// +// T operator()(uint32_t index, NBL_CONST_REF_ARG(vec_t) vec) // { // return vec.getComponent(index); // } diff --git a/include/nbl/builtin/hlsl/portable/float64_t.hlsl b/include/nbl/builtin/hlsl/portable/float64_t.hlsl index 131d37a87b..c5389ef8cc 100644 --- a/include/nbl/builtin/hlsl/portable/float64_t.hlsl +++ b/include/nbl/builtin/hlsl/portable/float64_t.hlsl @@ -9,8 +9,8 @@ namespace hlsl { template #ifdef __HLSL_VERSION -//using portable_float64_t = typename conditional::shaderFloat64, float64_t, emulated_float64_t >::type; -using portable_float64_t = typename conditional::shaderFloat64, emulated_float64_t, emulated_float64_t >::type; +using portable_float64_t = typename conditional::shaderFloat64, float64_t, emulated_float64_t >::type; +//using portable_float64_t = typename conditional::shaderFloat64, emulated_float64_t, emulated_float64_t >::type; #else using portable_float64_t = float64_t; #endif diff --git a/include/nbl/builtin/hlsl/portable/matrix_t.hlsl b/include/nbl/builtin/hlsl/portable/matrix_t.hlsl index e3c805d367..c0b2596cb5 100644 --- a/include/nbl/builtin/hlsl/portable/matrix_t.hlsl +++ b/include/nbl/builtin/hlsl/portable/matrix_t.hlsl @@ -53,12 +53,10 @@ struct PortableMul64Helper static inline V multiply(M mat, V vec) { V output; - //M matTransposed = mat.getTransposed(); - M matTransposed = mat; - output.x = (matTransposed.rows[0] * vec).calcComponentSum(); - output.y = (matTransposed.rows[1] * vec).calcComponentSum(); - output.z = (matTransposed.rows[2] * vec).calcComponentSum(); + output.x = (mat.rows[0] * vec).calcComponentSum(); + output.y = (mat.rows[1] * vec).calcComponentSum(); + output.z = (mat.rows[2] * vec).calcComponentSum(); return output; } diff --git a/include/nbl/builtin/hlsl/portable/vector_t.hlsl b/include/nbl/builtin/hlsl/portable/vector_t.hlsl index 8a558be581..ace199e20b 100644 --- a/include/nbl/builtin/hlsl/portable/vector_t.hlsl +++ b/include/nbl/builtin/hlsl/portable/vector_t.hlsl @@ -22,22 +22,6 @@ struct portable_vector using type = emulated_vector_t; }; -template -struct portable_vector -{ - using type = emulated_vector_t2; -}; -template -struct portable_vector -{ - using type = emulated_vector_t3; -}; -template -struct portable_vector -{ - using type = emulated_vector_t4; -}; - #endif } From 02fe0ae2232cce1aa7d31d02a475356c9cf6fac9 Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Thu, 26 Sep 2024 21:12:11 +0100 Subject: [PATCH 060/358] Update shaderc --- 3rdparty/shaderc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/shaderc b/3rdparty/shaderc index e72186b66b..e166325b24 160000 --- a/3rdparty/shaderc +++ b/3rdparty/shaderc @@ -1 +1 @@ -Subproject commit e72186b66bb90ed06aaf15cbdc9a053581a0616b +Subproject commit e166325b24d79d64bfa47065328890ce116ea642 From f4328c977e899fb2e56ff5f24fc265103997fa87 Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Thu, 26 Sep 2024 21:13:11 +0100 Subject: [PATCH 061/358] Update glslang --- 3rdparty/glslang | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/glslang b/3rdparty/glslang index f397c9b6e9..7bc35fa99c 160000 --- a/3rdparty/glslang +++ b/3rdparty/glslang @@ -1 +1 @@ -Subproject commit f397c9b6e90bc53aa2e9feaef1a9cdf20ca43298 +Subproject commit 7bc35fa99cb8572715bea9d3e977f9b27423337b From 
71f4a8bea547fdadb0d0fcf1e3481913b89c31bc Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Fri, 27 Sep 2024 05:20:43 -0700 Subject: [PATCH 062/358] Saving work --- .../nbl/builtin/hlsl/emulated/float64_t.hlsl | 80 +++++++++---------- .../builtin/hlsl/emulated/float64_t_impl.hlsl | 62 +++++++++++++- .../nbl/builtin/hlsl/portable/float64_t.hlsl | 4 +- 3 files changed, 100 insertions(+), 46 deletions(-) diff --git a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl index af6dd1482c..9869cb00e5 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl @@ -86,13 +86,13 @@ namespace hlsl this_t operator+(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { // TODO: REMOVE! - float64_t sum = bit_cast(data) + bit_cast(rhs.data); + /*float64_t sum = bit_cast(data) + bit_cast(rhs.data); uint64_t sumAsUint = bit_cast(sum); this_t output2; output2.data = sumAsUint; - return output2; + return output2;*/ if (FlushDenormToZero) { @@ -157,9 +157,8 @@ namespace hlsl swap(lhsSign, rhsSign); } - rhsNormMantissa >>= shiftAmount; - uint64_t resultMantissa; + uint64_t resultBiasedExp = uint64_t(exp) + ieee754::traits::exponentBias; if (lhsSign != rhsSign) { int64_t mantissaDiff = lhsNormMantissa - rhsNormMantissa; @@ -169,31 +168,27 @@ namespace hlsl swap(lhsSign, rhsSign); } - lhsNormMantissa <<= 10; - rhsNormMantissa <<= 10; - resultMantissa = uint64_t(int64_t(lhsNormMantissa) - int64_t(rhsNormMantissa)); - resultMantissa >>= 10; + resultMantissa = emulated_float64_t_impl::subMantissas128NormalizeResult(lhsNormMantissa, rhsNormMantissa, shiftAmount, resultBiasedExp); + + if (resultMantissa == 0ull); + return _static_cast(0ull); } else { + rhsNormMantissa >>= shiftAmount; resultMantissa = lhsNormMantissa + rhsNormMantissa; - } - - uint64_t resultBiasedExp = uint64_t(exp) + ieee754::traits::exponentBias; - if (resultMantissa == 0ull) - return _static_cast(0ull); - - if (resultMantissa & 1ull << 53) - { - ++resultBiasedExp; - resultMantissa >>= 1; - } + if (resultMantissa & 1ull << 53) + { + ++resultBiasedExp; + resultMantissa >>= 1; + } - while (resultMantissa < (1ull << 52)) - { - --resultBiasedExp; - resultMantissa <<= 1; + while (resultMantissa < (1ull << 52)) + { + --resultBiasedExp; + resultMantissa <<= 1; + } } resultMantissa &= ieee754::traits::mantissaMask; @@ -226,12 +221,6 @@ namespace hlsl emulated_float64_t operator*(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { - float64_t sum = bit_cast(data) * bit_cast(rhs.data); - uint64_t sumAsUint = bit_cast(sum); - - this_t output2; - output2.data = sumAsUint; - if(FlushDenormToZero) { emulated_float64_t retval = this_t::create(0ull); @@ -244,12 +233,12 @@ namespace hlsl uint64_t lhsSign = lhsData & ieee754::traits::signMask; uint64_t rhsSign = rhsData & ieee754::traits::signMask; + uint64_t sign = (lhsData ^ rhsData) & ieee754::traits::signMask; uint64_t lhsMantissa = ieee754::extractMantissa(lhsData); uint64_t rhsMantissa = ieee754::extractMantissa(rhsData); int exp = int(lhsBiasedExp + rhsBiasedExp) - ieee754::traits::exponentBias; - uint64_t sign = (lhsData ^ rhsData) & ieee754::traits::signMask; if (FastMath) { if (tgmath::isNaN(lhsData) || tgmath::isNaN(rhsData)) @@ -260,6 +249,8 @@ namespace hlsl return bit_cast(sign); } + if (emulated_float64_t_impl::isZero(lhsData) || emulated_float64_t_impl::isZero(rhsData)) + return bit_cast(sign); const uint64_t hi_l = (lhsMantissa >> 21) | (1ull << 31); const uint64_t lo_l = lhsMantissa & ((1ull << 21) - 
1); @@ -300,27 +291,30 @@ namespace hlsl { const uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; + int lhsBiasedExp = ieee754::extractBiasedExponent(data); + int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); + + uint64_t lhsData = emulated_float64_t_impl::flushDenormToZero(lhsBiasedExp, data); + uint64_t rhsData = emulated_float64_t_impl::flushDenormToZero(rhsBiasedExp, rhs.data); + if(FastMath) { - if (tgmath::isNaN(data) || tgmath::isNaN(rhs.data)) + if (tgmath::isNaN(lhsData) || tgmath::isNaN(rhsData)) return bit_cast(ieee754::traits::quietNaN); - if (emulated_float64_t_impl::isZero(rhs.data)) + if (emulated_float64_t_impl::areBothZero(lhsData, rhsData)) return bit_cast(ieee754::traits::quietNaN | sign); - if (emulated_float64_t_impl::areBothInfinity(data, rhs.data)) + if (emulated_float64_t_impl::isZero(rhsData)) + return bit_cast(ieee754::traits::inf | sign); + if (emulated_float64_t_impl::areBothInfinity(lhsData, rhsData)) return bit_cast(ieee754::traits::quietNaN | ieee754::traits::signMask); - if (tgmath::isInf(data)) + if (tgmath::isInf(lhsData)) return bit_cast(ieee754::traits::inf | sign); - if (tgmath::isInf(rhs.data)) - return bit_cast(0ull | sign); - if (emulated_float64_t_impl::isZero(rhs.data)) - return bit_cast(ieee754::traits::quietNaN | sign); + if (tgmath::isInf(rhsData)) + return bit_cast(sign); } - int lhsBiasedExp = ieee754::extractBiasedExponent(data); - int rhsBiasedExp = ieee754::extractBiasedExponent(rhs.data); - - uint64_t lhsData = emulated_float64_t_impl::flushDenormToZero(lhsBiasedExp, data); - uint64_t rhsData = emulated_float64_t_impl::flushDenormToZero(rhsBiasedExp, rhs.data); + if (emulated_float64_t_impl::isZero(lhsData)) + return bit_cast(sign); const uint64_t lhsRealMantissa = (ieee754::extractMantissa(lhsData) | (1ull << ieee754::traits::mantissaBitCnt)); const uint64_t rhsRealMantissa = ieee754::extractMantissa(rhsData) | (1ull << ieee754::traits::mantissaBitCnt); diff --git a/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl index 6ef891e58f..0248a679b6 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl @@ -193,8 +193,9 @@ NBL_CONSTEXPR_INLINE_FUNC bool areBothSameSignZero(uint64_t lhs, uint64_t rhs) return ((lhs << 1) == 0ull) && (lhs == rhs); } +// TODO: remove, use Newton-Raphson instead // returns pair of quotient and remainder -static inline uint64_t divmod128by64(const uint64_t dividentHigh, const uint64_t dividentLow, uint64_t divisor) +inline uint64_t divmod128by64(const uint64_t dividentHigh, const uint64_t dividentLow, uint64_t divisor) { const uint64_t b = 1ull << 32; uint64_t un1, un0, vn1, vn0, q1, q0, un32, un21, un10, rhat, left, right; @@ -258,6 +259,65 @@ static inline uint64_t divmod128by64(const uint64_t dividentHigh, const uint64_t return (q1 << 32) | q0; } + +inline uint64_t subMantissas128NormalizeResult(const uint64_t greaterNumberMantissa, const uint64_t lesserNumberMantissa, const int shiftAmount, NBL_REF_ARG(uint64_t) resultBiasedExp) +{ + uint64_t greaterHigh, greaterLow, lesserHigh, lesserLow; + greaterHigh = greaterNumberMantissa << 9; + greaterLow = 0ull; + lesserHigh = lesserNumberMantissa << 9; + resultBiasedExp += 9; + + const uint64_t mask = (1ull << shiftAmount) - 1ull; + const uint64_t lostBits = lesserHigh & mask; + lesserLow = lostBits << (63 - shiftAmount); + lesserHigh >>= shiftAmount; + + uint64_t diffHigh, diffLow; + diffHigh = greaterHigh - 
lesserHigh; + diffLow = greaterLow - lesserLow; + + if (diffLow > greaterLow) + --diffHigh; + + int msbIdx = _findMSB(diffHigh); + if (msbIdx == -1) + { + msbIdx = _findMSB(diffLow); + if (msbIdx == -1) + return 0ull; // TODO: for sure? + } + else + { + msbIdx += 64; + } + + // TODO: optimize + while (msbIdx > 52) + { + uint64_t lostBit = (diffHigh & 0x1ull) << 63; + diffHigh >>= 1; + diffLow >>= 1; + diffLow |= lostBit; + + --resultBiasedExp; + --msbIdx; + } + + while (msbIdx < 52) + { + uint64_t lostBit = (diffLow >> 63) & 0x1ull; + diffHigh <<= 1; + diffHigh |= lostBit; + diffLow <<= 1; + + ++resultBiasedExp; + ++msbIdx; + } + + return diffLow; +} + } } } diff --git a/include/nbl/builtin/hlsl/portable/float64_t.hlsl b/include/nbl/builtin/hlsl/portable/float64_t.hlsl index c5389ef8cc..131d37a87b 100644 --- a/include/nbl/builtin/hlsl/portable/float64_t.hlsl +++ b/include/nbl/builtin/hlsl/portable/float64_t.hlsl @@ -9,8 +9,8 @@ namespace hlsl { template #ifdef __HLSL_VERSION -using portable_float64_t = typename conditional::shaderFloat64, float64_t, emulated_float64_t >::type; -//using portable_float64_t = typename conditional::shaderFloat64, emulated_float64_t, emulated_float64_t >::type; +//using portable_float64_t = typename conditional::shaderFloat64, float64_t, emulated_float64_t >::type; +using portable_float64_t = typename conditional::shaderFloat64, emulated_float64_t, emulated_float64_t >::type; #else using portable_float64_t = float64_t; #endif From ee1289887bab238c47f28b29b1c228a1a5281b61 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Fri, 27 Sep 2024 20:58:53 -0700 Subject: [PATCH 063/358] Modified add operator --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated/float64_t.hlsl | 19 +++----- .../builtin/hlsl/emulated/float64_t_impl.hlsl | 46 +++++++------------ 3 files changed, 25 insertions(+), 42 deletions(-) diff --git a/examples_tests b/examples_tests index 62f8a5403c..8c873fabf5 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 62f8a5403c990edb114fa7e2f752ec89b815bbce +Subproject commit 8c873fabf5fbcf893e33b828844fd593d4985f9e diff --git a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl index 9869cb00e5..d01b73f1a3 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl @@ -147,7 +147,7 @@ namespace hlsl const int expDiff = lhsBiasedExp - rhsBiasedExp; - const int exp = max(lhsBiasedExp, rhsBiasedExp) - ieee754::traits::exponentBias; + int exp = max(lhsBiasedExp, rhsBiasedExp) - ieee754::traits::exponentBias; const uint32_t shiftAmount = abs(expDiff); if (expDiff < 0) @@ -157,8 +157,9 @@ namespace hlsl swap(lhsSign, rhsSign); } + rhsNormMantissa >>= shiftAmount; + uint64_t resultMantissa; - uint64_t resultBiasedExp = uint64_t(exp) + ieee754::traits::exponentBias; if (lhsSign != rhsSign) { int64_t mantissaDiff = lhsNormMantissa - rhsNormMantissa; @@ -168,29 +169,23 @@ namespace hlsl swap(lhsSign, rhsSign); } - resultMantissa = emulated_float64_t_impl::subMantissas128NormalizeResult(lhsNormMantissa, rhsNormMantissa, shiftAmount, resultBiasedExp); + resultMantissa = emulated_float64_t_impl::subMantissas128NormalizeResult(lhsNormMantissa, rhsNormMantissa, exp); - if (resultMantissa == 0ull); + if (resultMantissa == 0ull) return _static_cast(0ull); } else { - rhsNormMantissa >>= shiftAmount; resultMantissa = lhsNormMantissa + rhsNormMantissa; if (resultMantissa & 1ull << 53) { - ++resultBiasedExp; + ++exp; resultMantissa >>= 1; } - - 
while (resultMantissa < (1ull << 52)) - { - --resultBiasedExp; - resultMantissa <<= 1; - } } + uint64_t resultBiasedExp = uint64_t(exp) + ieee754::traits::exponentBias; resultMantissa &= ieee754::traits::mantissaMask; uint64_t output = emulated_float64_t_impl::assembleFloat64(lhsSign, resultBiasedExp << ieee754::traits::mantissaBitCnt, resultMantissa); return bit_cast(output); diff --git a/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl index 0248a679b6..ccc49cd904 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl @@ -260,18 +260,13 @@ inline uint64_t divmod128by64(const uint64_t dividentHigh, const uint64_t divide return (q1 << 32) | q0; } -inline uint64_t subMantissas128NormalizeResult(const uint64_t greaterNumberMantissa, const uint64_t lesserNumberMantissa, const int shiftAmount, NBL_REF_ARG(uint64_t) resultBiasedExp) +inline uint64_t subMantissas128NormalizeResult(const uint64_t greaterNumberMantissa, const uint64_t lesserNumberMantissa, NBL_REF_ARG(int) resultExp) { uint64_t greaterHigh, greaterLow, lesserHigh, lesserLow; - greaterHigh = greaterNumberMantissa << 9; + greaterHigh = greaterNumberMantissa; greaterLow = 0ull; - lesserHigh = lesserNumberMantissa << 9; - resultBiasedExp += 9; - - const uint64_t mask = (1ull << shiftAmount) - 1ull; - const uint64_t lostBits = lesserHigh & mask; - lesserLow = lostBits << (63 - shiftAmount); - lesserHigh >>= shiftAmount; + lesserHigh = lesserNumberMantissa; + lesserLow = 0ull; uint64_t diffHigh, diffLow; diffHigh = greaterHigh - lesserHigh; @@ -285,37 +280,30 @@ inline uint64_t subMantissas128NormalizeResult(const uint64_t greaterNumberManti { msbIdx = _findMSB(diffLow); if (msbIdx == -1) - return 0ull; // TODO: for sure? 
+ return 0ull; } else { msbIdx += 64; } - // TODO: optimize - while (msbIdx > 52) - { - uint64_t lostBit = (diffHigh & 0x1ull) << 63; - diffHigh >>= 1; - diffLow >>= 1; - diffLow |= lostBit; + static const int TargetMSB = 52 + 64; + int shiftAmount = msbIdx - TargetMSB; + resultExp += shiftAmount; - --resultBiasedExp; - --msbIdx; + if (shiftAmount > 0) + { + diffHigh >>= shiftAmount; } - - while (msbIdx < 52) + else if (shiftAmount < 0) { - uint64_t lostBit = (diffLow >> 63) & 0x1ull; - diffHigh <<= 1; - diffHigh |= lostBit; - diffLow <<= 1; - - ++resultBiasedExp; - ++msbIdx; + shiftAmount = -shiftAmount; + diffHigh <<= shiftAmount; + const uint64_t shiftedOutBits = diffLow >> (64 - shiftAmount); + diffHigh |= shiftedOutBits; } - return diffLow; + return diffHigh; } } From 0cb210d1df827d12c5e0931986624703b411100c Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Fri, 27 Sep 2024 21:04:44 -0700 Subject: [PATCH 064/358] Updated examples --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 8c873fabf5..af5ae2717b 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 8c873fabf5fbcf893e33b828844fd593d4985f9e +Subproject commit af5ae2717b4de966ad761af846fade8cc55eef5f From 8095e05539bf65cede47b9e5ddf61405900e32a5 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Fri, 27 Sep 2024 23:05:53 -0700 Subject: [PATCH 065/358] Updated examples --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index af5ae2717b..846e7aa5af 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit af5ae2717b4de966ad761af846fade8cc55eef5f +Subproject commit 846e7aa5afa7ad1344ec8e18041e669ddfddee00 From 9e86de2dc837a468249c4d0aaf7875b9232de2ad Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Sat, 28 Sep 2024 12:17:48 -0700 Subject: [PATCH 066/358] Added ef64_benchmark --- examples_tests | 2 +- include/nbl/builtin/hlsl/emulated/float64_t.hlsl | 11 ++++++++--- .../quadrature/gauss_legendre/gauss_legendre.hlsl | 12 ++++++++++-- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/examples_tests b/examples_tests index 846e7aa5af..a4fe41dcb1 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 846e7aa5afa7ad1344ec8e18041e669ddfddee00 +Subproject commit a4fe41dcb18359ea6a9944f63858df86a4e987f4 diff --git a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl index d01b73f1a3..2f41e1b571 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl @@ -86,13 +86,13 @@ namespace hlsl this_t operator+(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { // TODO: REMOVE! 
- /*float64_t sum = bit_cast(data) + bit_cast(rhs.data); + float64_t sum = bit_cast(data) + bit_cast(rhs.data); uint64_t sumAsUint = bit_cast(sum); this_t output2; output2.data = sumAsUint; - return output2;*/ + return output2; if (FlushDenormToZero) { @@ -277,7 +277,7 @@ namespace hlsl emulated_float64_t operator*(float rhs) { - return _static_cast(data) * create(rhs); + return bit_cast(data) * create(rhs); } emulated_float64_t operator/(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC @@ -339,6 +339,11 @@ namespace hlsl } } + emulated_float64_t operator/(const float rhs) NBL_CONST_MEMBER_FUNC + { + return bit_cast(data) * create(rhs); + } + // relational operators bool operator==(this_t rhs) NBL_CONST_MEMBER_FUNC { diff --git a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl index 61055305da..3aa3c047dc 100644 --- a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl +++ b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl @@ -5,7 +5,8 @@ #define _NBL_BUILTIN_HLSL_MATH_QUADRATURE_GAUSS_LEGENDRE_INCLUDED_ #include - +// TODO: portable/float64_t.hlsl instead? +#include namespace nbl { @@ -24,7 +25,7 @@ struct GaussLegendreIntegration { static float_t calculateIntegral(NBL_CONST_REF_ARG(IntegrandFunc) func, float_t start, float_t end) { - float_t integral = 0.0; + float_t integral = _static_cast(0ull); for (uint32_t i = 0u; i < Order; ++i) { const float_t xi = GaussLegendreValues::xi(i) * ((end - start) / 2.0) + ((end + start) / 2.0); @@ -46,6 +47,13 @@ struct GaussLegendreIntegration #undef TYPED_NUMBER #undef float_t +// TODO: do for every emulated_float64_t +#define float_t emulated_float64_t +#define TYPED_NUMBER(N) emulated_float64_t::create(N); +#include +#undef TYPED_NUMBER +#undef float_t + } // quadrature } // math } // hlsl From cf2d4334a211dce4831ed444bba380442dc7a132 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Mon, 30 Sep 2024 09:52:52 +0200 Subject: [PATCH 067/358] correctly update shaderc (we was missing 1 year of commits causing shaderc to requesting still HLSL & OGLCompiler stub libraries which got removed either from glslang & shaderc on latest revisions --- 3rdparty/shaderc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/shaderc b/3rdparty/shaderc index e166325b24..d2564ba598 160000 --- a/3rdparty/shaderc +++ b/3rdparty/shaderc @@ -1 +1 @@ -Subproject commit e166325b24d79d64bfa47065328890ce116ea642 +Subproject commit d2564ba5989c9de1a76714b3e59ec60595e9be50 From 8afc5dbb8c3ca00e92d3cc518f1efa2efb6c3660 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Mon, 30 Sep 2024 10:12:16 +0200 Subject: [PATCH 068/358] and now update glslang to be up-to-date (still we had divergence) --- 3rdparty/glslang | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/glslang b/3rdparty/glslang index 7bc35fa99c..ff26c2e995 160000 --- a/3rdparty/glslang +++ b/3rdparty/glslang @@ -1 +1 @@ -Subproject commit 7bc35fa99cb8572715bea9d3e977f9b27423337b +Subproject commit ff26c2e995ae521cb9fbc902fb4d686a47e4eb53 From 6f092a0f9f8595a77d80b23b4251ea17135bc60b Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Mon, 30 Sep 2024 10:17:56 +0200 Subject: [PATCH 069/358] update build system, make Nabla build with fully updated glslang & shaderc --- 3rdparty/CMakeLists.txt | 9 --------- src/nbl/CMakeLists.txt | 2 -- 2 files changed, 11 deletions(-) diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt 
index 8f65e1b2e8..932660a36d 100755 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -161,11 +161,6 @@ target_include_directories(SPIRV PUBLIC "${GLSLANG_GENERATED_INCLUDEDIR}") set(SHADERC_SKIP_TESTS ON CACHE INTERNAL "Skip shaderc tests?") set(SHADERC_SKIP_INSTALL ON CACHE INTERNAL "Install shaderc?") -# if it doesn't work without the `touch` on Linux, then fetch the latest submodule head of shaderc and try again -# https://github.com/google/shaderc/issues/568 -if (UNIX) - file(WRITE ${THIRD_PARTY_SOURCE_DIR}/shaderc/libshaderc/libshaderc_combined.a "") -endif() add_subdirectory(shaderc shaderc EXCLUDE_FROM_ALL) # libjpeg-turbo @@ -468,7 +463,6 @@ set(NBL_3RDPARTY_TARGETS simdjson nlohmann_json glslang - OGLCompiler OSDependent MachineIndependent GenericCodeGen @@ -494,9 +488,6 @@ endif() if (NBL_BUILD_IMGUI) list(APPEND NBL_3RDPARTY_TARGETS imgui implot imtestsuite imtestengine imguizmo) endif() -if(ENABLE_HLSL) - list(APPEND NBL_3RDPARTY_TARGETS HLSL) -endif() foreach(trgt IN LISTS NBL_3RDPARTY_TARGETS) if(NBL_DYNAMIC_MSVC_RUNTIME) diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 6f38f7e1df..a454bc636a 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -714,7 +714,6 @@ if(NBL_STATIC_BUILD) nbl_install_lib(glslang) nbl_install_lib(GenericCodeGen) nbl_install_lib(MachineIndependent) - nbl_install_lib(HLSL) nbl_install_lib(jpeg-static) if (_NBL_COMPILE_WITH_OPEN_EXR_) nbl_install_lib(OpenEXR) @@ -728,7 +727,6 @@ if(NBL_STATIC_BUILD) nbl_install_lib(SPIRV) nbl_install_lib(SPIRV-Tools-static) # TODO: make this function/macro work with alias target nbl_install_lib(SPIRV-Tools-opt) - nbl_install_lib(OGLCompiler) nbl_install_lib(OSDependent) nbl_install_lib(zlibstatic) nbl_install_lib(simdjson) From 7e3fea07357b2ef5f0fd247ebec0b7661d56ff9e Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Mon, 30 Sep 2024 16:02:58 -0700 Subject: [PATCH 070/358] Saving work --- .../math/quadrature/gauss_legendre/gauss_legendre.hlsl | 7 ++++--- .../builtin/hlsl/math/quadrature/gauss_legendre/impl.hlsl | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl index 3aa3c047dc..543c1ed16a 100644 --- a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl +++ b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl @@ -29,7 +29,7 @@ struct GaussLegendreIntegration for (uint32_t i = 0u; i < Order; ++i) { const float_t xi = GaussLegendreValues::xi(i) * ((end - start) / 2.0) + ((end + start) / 2.0); - integral += GaussLegendreValues::wi(i) * func(xi); + integral = integral + GaussLegendreValues::wi(i) * func(xi); } return ((end - start) / 2.0) * integral; } @@ -48,8 +48,9 @@ struct GaussLegendreIntegration #undef float_t // TODO: do for every emulated_float64_t -#define float_t emulated_float64_t -#define TYPED_NUMBER(N) emulated_float64_t::create(N); + +#define float_t emulated_float64_t +#define TYPED_NUMBER(N) emulated_float64_t::create(N) #include #undef TYPED_NUMBER #undef float_t diff --git a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/impl.hlsl b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/impl.hlsl index 0c27ed3287..262468d19f 100644 --- a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/impl.hlsl +++ b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/impl.hlsl @@ -338,7 +338,7 @@ NBL_CONSTEXPR float_t wi_15[15] = { #define 
DEF_GAUSS_LEGENDRE_VALS(N) \ template<> \ -struct GaussLegendreValues \ +struct GaussLegendreValues \ { \ static float_t xi(uint32_t idx) { return NAMESPACE_NAME::xi_##N[idx]; } \ static float_t wi(uint32_t idx) { return NAMESPACE_NAME::wi_##N[idx]; } \ From a73b60e69d7fcf737f119cea9b006ebb39adc8c9 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Mon, 30 Sep 2024 22:54:08 -0700 Subject: [PATCH 071/358] Tests works --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated/float64_t.hlsl | 20 ++++++++++++++++++- .../builtin/hlsl/emulated/float64_t_impl.hlsl | 4 ++-- .../gauss_legendre/gauss_legendre.hlsl | 6 ++++++ .../math/quadrature/gauss_legendre/impl.hlsl | 11 +++++----- 5 files changed, 34 insertions(+), 9 deletions(-) diff --git a/examples_tests b/examples_tests index a4fe41dcb1..ca0216686d 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit a4fe41dcb18359ea6a9944f63858df86a4e987f4 +Subproject commit ca0216686d2f51bd5a0dcabea523f83dddf5f93e diff --git a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl index 2f41e1b571..c62b67d568 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl @@ -72,7 +72,7 @@ namespace hlsl retval.data = (uint64_t(hi) << 32) | uint64_t(lo); return retval; #else - return bit_cast(reinterpret_cast(val)); + return bit_cast(val); #endif } @@ -216,6 +216,15 @@ namespace hlsl emulated_float64_t operator*(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { + // TODO: remove + float64_t sum = bit_cast(data) * bit_cast(rhs.data); + uint64_t sumAsUint = bit_cast(sum); + + this_t output2; + output2.data = sumAsUint; + + return output2; + if(FlushDenormToZero) { emulated_float64_t retval = this_t::create(0ull); @@ -282,6 +291,15 @@ namespace hlsl emulated_float64_t operator/(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { + // TODO: remove + float64_t sum = bit_cast(data) / bit_cast(rhs.data); + uint64_t sumAsUint = bit_cast(sum); + + this_t output2; + output2.data = sumAsUint; + + return output2; + if (FlushDenormToZero) { const uint64_t sign = (data ^ rhs.data) & ieee754::traits::signMask; diff --git a/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl index ccc49cd904..dc32398248 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl @@ -81,7 +81,7 @@ NBL_CONSTEXPR_INLINE_FUNC bool isZero(uint64_t val) // TODO: where do i move this function? 
also rename template -static inline int _findMSB(Int val) +inline int _findMSB(Int val) { //static_assert(is_integral::value); #ifndef __HLSL_VERSION @@ -92,7 +92,7 @@ static inline int _findMSB(Int val) } template <> -static inline int _findMSB(uint64_t val) +inline int _findMSB(uint64_t val) { #ifndef __HLSL_VERSION return nbl::hlsl::findMSB(val); diff --git a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl index 543c1ed16a..ec223770be 100644 --- a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl +++ b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl @@ -36,23 +36,29 @@ struct GaussLegendreIntegration }; #define float_t float32_t +#define float_t_namespace impl_float32_t #define TYPED_NUMBER(N) NBL_CONCATENATE(N, f) // to add f after floating point numbers and avoid casting warnings and emitting ShaderFloat64 Caps #include #undef TYPED_NUMBER +#undef float_t_namespace #undef float_t #define float_t float64_t +#define float_t_namespace impl_float64_t #define TYPED_NUMBER(N) N #include #undef TYPED_NUMBER +#undef float_t_namespace #undef float_t // TODO: do for every emulated_float64_t #define float_t emulated_float64_t +#define float_t_namespace impl_emulated_float64_t_true_true #define TYPED_NUMBER(N) emulated_float64_t::create(N) #include #undef TYPED_NUMBER +#undef float_t_namespace #undef float_t } // quadrature diff --git a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/impl.hlsl b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/impl.hlsl index 262468d19f..3bcfbb2388 100644 --- a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/impl.hlsl +++ b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/impl.hlsl @@ -5,13 +5,14 @@ #ifndef float_t #error Define float_t before including #endif +#ifndef float_t_namespace +#error Define float_t_namespace before including +#endif #ifndef TYPED_NUMBER #error Define TYPED_NUMBER before including #endif -#define NAMESPACE_NAME NBL_CONCATENATE(impl_, float_t) - -namespace NAMESPACE_NAME +namespace float_t_namespace { NBL_CONSTEXPR float_t xi_2[2] = { TYPED_NUMBER(-0.5773502691896257), @@ -340,8 +341,8 @@ NBL_CONSTEXPR float_t wi_15[15] = { template<> \ struct GaussLegendreValues \ { \ - static float_t xi(uint32_t idx) { return NAMESPACE_NAME::xi_##N[idx]; } \ - static float_t wi(uint32_t idx) { return NAMESPACE_NAME::wi_##N[idx]; } \ + static float_t xi(uint32_t idx) { return float_t_namespace::xi_##N[idx]; } \ + static float_t wi(uint32_t idx) { return float_t_namespace::wi_##N[idx]; } \ } DEF_GAUSS_LEGENDRE_VALS(2); From 4eb1c05c8e10736efb29e86ed2196c6f58bba4ce Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Wed, 2 Oct 2024 11:53:49 -0700 Subject: [PATCH 072/358] Saving work --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index ca0216686d..742224863a 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit ca0216686d2f51bd5a0dcabea523f83dddf5f93e +Subproject commit 742224863a4ad9e66182893c57584774fdee830b From 0b93b15cf0ba9cfcb325d637c797dd5b29d982ce Mon Sep 17 00:00:00 2001 From: Fletterio Date: Wed, 2 Oct 2024 22:17:36 -0300 Subject: [PATCH 073/358] Impkementing stuff needed to run Bloom example --- examples_tests | 2 +- include/nbl/builtin/hlsl/fft/utils.hlsl | 0 include/nbl/builtin/hlsl/functional.hlsl | 2 +- include/nbl/builtin/hlsl/workgroup/fft.hlsl | 14 +++- 
.../nbl/builtin/hlsl/workgroup/shuffle.hlsl | 78 +++++++++++++++++-- 5 files changed, 83 insertions(+), 13 deletions(-) delete mode 100644 include/nbl/builtin/hlsl/fft/utils.hlsl diff --git a/examples_tests b/examples_tests index 9adee12bf7..8abd881bdf 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 9adee12bf72b925a25a64bff7461f29e34332e26 +Subproject commit 8abd881bdfed0b15f7a54c5175da915c844621ba diff --git a/include/nbl/builtin/hlsl/fft/utils.hlsl b/include/nbl/builtin/hlsl/fft/utils.hlsl deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/include/nbl/builtin/hlsl/functional.hlsl b/include/nbl/builtin/hlsl/functional.hlsl index 6ef88a7ac4..c1b347c721 100644 --- a/include/nbl/builtin/hlsl/functional.hlsl +++ b/include/nbl/builtin/hlsl/functional.hlsl @@ -13,7 +13,7 @@ namespace nbl { namespace hlsl { -#ifdef __HLSL_VERSION // CPP +#ifdef __HLSL_VERSION // HLSL template using __spv_ptr_t = spirv::pointer_t; diff --git a/include/nbl/builtin/hlsl/workgroup/fft.hlsl b/include/nbl/builtin/hlsl/workgroup/fft.hlsl index 765a69034d..0b16601088 100644 --- a/include/nbl/builtin/hlsl/workgroup/fft.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/fft.hlsl @@ -34,7 +34,7 @@ struct exchangeValues const bool topHalf = bool(threadID & stride); // Pack two halves into a single uint32_t uint32_t toExchange = bit_cast(topHalf ? float16_t2 (lo.real(), lo.imag()) : float16_t2 (hi.real(), hi.imag())); - shuffleXor::__call(toExchange, stride, sharedmemAdaptor); + shuffleXor(toExchange, stride, sharedmemAdaptor); float16_t2 exchanged = bit_cast(toExchange); if (topHalf) { @@ -57,7 +57,7 @@ struct exchangeValues const bool topHalf = bool(threadID & stride); // pack into `float32_t2` because ternary operator doesn't support structs float32_t2 exchanged = topHalf ? float32_t2(lo.real(), lo.imag()) : float32_t2(hi.real(), hi.imag()); - shuffleXor::__call(exchanged, stride, sharedmemAdaptor); + shuffleXor(exchanged, stride, sharedmemAdaptor); if (topHalf) { lo.real(exchanged.x); @@ -79,7 +79,7 @@ struct exchangeValues const bool topHalf = bool(threadID & stride); // pack into `float64_t2` because ternary operator doesn't support structs float64_t2 exchanged = topHalf ? 
float64_t2(lo.real(), lo.imag()) : float64_t2(hi.real(), hi.imag()); - shuffleXor::__call(exchanged, stride, sharedmemAdaptor); + shuffleXor(exchanged, stride, sharedmemAdaptor); if (topHalf) { lo.real(exchanged.x); @@ -152,6 +152,14 @@ uint32_t getOutputIndex(uint32_t freqIdx) return glsl::bitfieldReverse(bitShiftLeftHigher(freqIdx)) >> (32 - FFT_SIZE_LOG_2); } +// Mirrors an index about the Nyquist frequency +template +uint32_t mirror(uint32_t idx) +{ + NBL_CONSTEXPR_STATIC_INLINE uint32_t FFT_SIZE = WorkgroupSize * uint32_t(ElementsPerInvocation); + return (FFT_SIZE - idx) & (FFT_SIZE - 1); +} + } //namespace fft // ----------------------------------- End Utils ----------------------------------------------- diff --git a/include/nbl/builtin/hlsl/workgroup/shuffle.hlsl b/include/nbl/builtin/hlsl/workgroup/shuffle.hlsl index 5eb0035406..852241780b 100644 --- a/include/nbl/builtin/hlsl/workgroup/shuffle.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/shuffle.hlsl @@ -2,6 +2,7 @@ #define _NBL_BUILTIN_HLSL_WORKGROUP_SHUFFLE_INCLUDED_ #include "nbl/builtin/hlsl/memory_accessor.hlsl" +#include "nbl/builtin/hlsl/functional.hlsl" // TODO: Add other shuffles @@ -14,26 +15,87 @@ namespace hlsl namespace workgroup { +// ------------------------------------- Skeletons for implementing other Shuffles -------------------------------- + template -struct shuffleXor +struct Shuffle +{ + static void __call(NBL_REF_ARG(T) value, uint32_t storeIdx, uint32_t loadIdx, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor) + { + // TODO: optimization (optional) where we shuffle in the shared memory available (using rounds) + sharedmemAdaptor.template set(storeIdx, value); + + // Wait until all writes are done before reading + sharedmemAdaptor.workgroupExecutionAndMemoryBarrier(); + + sharedmemAdaptor.template get(loadIdx, value); + } + + // By default store to threadID in the workgroup + static void __call(NBL_REF_ARG(T) value, uint32_t loadIdx, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor) + { + __call(value, uint32_t(SubgroupContiguousIndex()), loadIdx, sharedmemAdaptor); + } +}; + +template +struct ShuffleUnOp +{ + static void __call(NBL_REF_ARG(T) value, uint32_t a, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor) + { + UnOp unop; + // TODO: optimization (optional) where we shuffle in the shared memory available (using rounds) + sharedmemAdaptor.template set(a, value); + + // Wait until all writes are done before reading + sharedmemAdaptor.workgroupExecutionAndMemoryBarrier(); + + sharedmemAdaptor.template get(unop(a), value); + } + + // By default store to threadID's index and load from unop(threadID) + static void __call(NBL_REF_ARG(T) value, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor) + { + __call(value, uint32_t(SubgroupContiguousIndex()), sharedmemAdaptor); + } +}; + +template +struct ShuffleBinOp { - static void __call(NBL_REF_ARG(T) value, uint32_t mask, uint32_t threadID, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor) + static void __call(NBL_REF_ARG(T) value, uint32_t a, uint32_t b, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor) { + BinOp binop; // TODO: optimization (optional) where we shuffle in the shared memory available (using rounds) - sharedmemAdaptor.template set(threadID, value); - + sharedmemAdaptor.template set(a, value); + // Wait until all writes are done before reading sharedmemAdaptor.workgroupExecutionAndMemoryBarrier(); - - sharedmemAdaptor.template get(threadID ^ mask, value); + + sharedmemAdaptor.template get(binop(a, b), value); } - static void 
__call(NBL_REF_ARG(T) value, uint32_t mask, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor) + // By default first argument of binary op is the thread's ID in the workgroup + static void __call(NBL_REF_ARG(T) value, uint32_t b, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor) { - __call(value, mask, uint32_t(SubgroupContiguousIndex()), sharedmemAdaptor); + __call(value, uint32_t(SubgroupContiguousIndex()), b, sharedmemAdaptor); } }; +// ------------------------------------------ ShuffleXor --------------------------------------------------------------- + +template +void shuffleXor(NBL_REF_ARG(T) value, uint32_t threadID, uint32_t mask, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor) +{ + return ShuffleBinOp, SharedMemoryAdaptor, T>::__call(value, threadID, mask, sharedmemAdaptor); +} + +template +void shuffleXor(NBL_REF_ARG(T) value, uint32_t mask, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor) +{ + return ShuffleBinOp, SharedMemoryAdaptor, T>::__call(value, mask, sharedmemAdaptor); +} + } } } From 2dc3c48fddca0b39ed32410eb7a922bcb8b13e4c Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Thu, 3 Oct 2024 14:12:58 +0100 Subject: [PATCH 074/358] Disable GL directives --- src/nbl/asset/utils/CGLSLCompiler.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/nbl/asset/utils/CGLSLCompiler.cpp b/src/nbl/asset/utils/CGLSLCompiler.cpp index f90be9f27a..a6ba06011a 100644 --- a/src/nbl/asset/utils/CGLSLCompiler.cpp +++ b/src/nbl/asset/utils/CGLSLCompiler.cpp @@ -103,9 +103,9 @@ namespace nbl::asset::impl { auto res_str = std::move(result.contents); //employ encloseWithinExtraInclGuards() in order to prevent infinite loop of (not necesarilly direct) self-inclusions while other # directives (incl guards among them) are disabled - CGLSLCompiler::disableAllDirectivesExceptIncludes(res_str); - disableGlDirectives(res_str); - res_str = CGLSLCompiler::encloseWithinExtraInclGuards(std::move(res_str), m_maxInclCnt, name.string().c_str()); + //CGLSLCompiler::disableAllDirectivesExceptIncludes(res_str); + //disableGlDirectives(res_str); + //res_str = CGLSLCompiler::encloseWithinExtraInclGuards(std::move(res_str), m_maxInclCnt, name.string().c_str()); res->content_length = res_str.size(); res->content = new char[res_str.size() + 1u]; @@ -145,8 +145,8 @@ std::string CGLSLCompiler::preprocessShader(std::string&& code, IShader::E_SHADE insertion << "#define " << define.identifier << " " << define.definition << "\n"; insertIntoStart(code,std::move(insertion)); } - disableAllDirectivesExceptIncludes(code); - disableGlDirectives(code); + //disableAllDirectivesExceptIncludes(code); + //disableGlDirectives(code); shaderc::Compiler comp; shaderc::CompileOptions options; options.SetTargetSpirv(shaderc_spirv_version_1_6); @@ -164,8 +164,8 @@ std::string CGLSLCompiler::preprocessShader(std::string&& code, IShader::E_SHADE } auto resolvedString = std::string(res.cbegin(), std::distance(res.cbegin(), res.cend())); - reenableDirectives(resolvedString); - reenableGlDirectives(resolvedString); + //reenableDirectives(resolvedString); + //reenableGlDirectives(resolvedString); return resolvedString; } From 75d6d9453117b7e0e2e845d583760667ddcf1ecd Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Thu, 3 Oct 2024 17:39:20 -0700 Subject: [PATCH 075/358] Benchmark works --- examples_tests | 2 +- include/nbl/builtin/hlsl/emulated/float64_t.hlsl | 5 ++++- .../math/quadrature/gauss_legendre/gauss_legendre.hlsl | 10 +++++++++- 
include/nbl/video/ILogicalDevice.h | 4 +++- src/nbl/video/CVulkanLogicalDevice.cpp | 2 +- 5 files changed, 18 insertions(+), 5 deletions(-) diff --git a/examples_tests b/examples_tests index 742224863a..69156640cd 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 742224863a4ad9e66182893c57584774fdee830b +Subproject commit 69156640cd7386084f99e9af09f280101590db45 diff --git a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl index c62b67d568..19f95f9323 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl @@ -291,6 +291,8 @@ namespace hlsl emulated_float64_t operator/(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { + printf("%llu", rhs.data); + // TODO: remove float64_t sum = bit_cast(data) / bit_cast(rhs.data); uint64_t sumAsUint = bit_cast(sum); @@ -359,7 +361,8 @@ namespace hlsl emulated_float64_t operator/(const float rhs) NBL_CONST_MEMBER_FUNC { - return bit_cast(data) * create(rhs); + printf("guwno"); + return bit_cast(data) / create(rhs); } // relational operators diff --git a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl index ec223770be..053ea02a47 100644 --- a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl +++ b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl @@ -28,9 +28,17 @@ struct GaussLegendreIntegration float_t integral = _static_cast(0ull); for (uint32_t i = 0u; i < Order; ++i) { - const float_t xi = GaussLegendreValues::xi(i) * ((end - start) / 2.0) + ((end + start) / 2.0); + const float_t xi = GaussLegendreValues::xi(i) * ((end - start) / 2.0f) + ((end + start) / 2.0f); integral = integral + GaussLegendreValues::wi(i) * func(xi); + + float_t a = GaussLegendreValues::xi(i); + float_t b = (end - start) / 2.0f; + + //printf("x = %ull, xi = %ull, ((end - start) / 2.0) = %ull", bit_cast(integral), bit_cast(a), bit_cast(b)); + //printf("start = %llu, end = %llu", bit_cast(start), bit_cast(end)); + printf("((end - start)) = %ull", bit_cast(b)); } + return ((end - start) / 2.0) * integral; } }; diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 02fad9abd7..70546ec86a 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -925,6 +925,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return nullptr; } break; + case IQueryPool::TYPE::TIMESTAMP: + break; default: NBL_LOG_ERROR("Unsupported query pool type"); return nullptr; @@ -939,7 +941,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe NBL_LOG_ERROR("The queryPool was not created by this device"); return false; } - if (firstQuery + queryCount >= queryPool->getCreationParameters().queryCount) + if (firstQuery + queryCount > queryPool->getCreationParameters().queryCount) { NBL_LOG_ERROR("Query index out of bounds"); return false; diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index c90ffb94a7..03f2c94494 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -1434,7 +1434,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createQueryPool_impl(co info.pipelineStatistics = CVulkanQueryPool::getVkPipelineStatisticsFlagsFrom(params.pipelineStatisticsFlags.value); VkQueryPool vk_queryPool = 
VK_NULL_HANDLE; - if (m_devf.vk.vkCreateQueryPool(m_vkdev,&info,nullptr,&vk_queryPool)!=VK_SUCCESS) + if (m_devf.vk.vkCreateQueryPool(m_vkdev,&info,nullptr,&vk_queryPool)==VK_SUCCESS) return core::make_smart_refctd_ptr(this,params,vk_queryPool); return nullptr; } From 01fff5443162adde8edb66b1ed7da42810ffe1e4 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Fri, 4 Oct 2024 11:09:40 +0330 Subject: [PATCH 076/358] stash Signed-off-by: Ali Cheraghi --- src/nbl/system/ISystem.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/system/ISystem.cpp b/src/nbl/system/ISystem.cpp index 39f7223693..64406c6473 100644 --- a/src/nbl/system/ISystem.cpp +++ b/src/nbl/system/ISystem.cpp @@ -122,7 +122,7 @@ bool ISystem::deleteDirectory(const system::path& p) bool nbl::system::ISystem::deleteFile(const system::path& p) { - if (std::filesystem::exists(p) && !std::filesystem::is_directory(p))) + if (std::filesystem::exists(p) && !std::filesystem::is_directory(p)) return std::filesystem::remove(p); else return false; From c1c490809756bb0a63aa4f0ff466055e68ee5531 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Fri, 4 Oct 2024 16:23:30 +0330 Subject: [PATCH 077/358] ShaderCompiler: Polishing --- include/nbl/asset/utils/IShaderCompiler.h | 31 +++++++++---------- src/nbl/asset/utils/IShaderCompiler.cpp | 12 +++++-- .../utils/shaderCompiler_serialization.h | 2 -- src/nbl/asset/utils/waveContext.h | 2 +- 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/include/nbl/asset/utils/IShaderCompiler.h b/include/nbl/asset/utils/IShaderCompiler.h index 1ed613f27b..0ac247a48c 100644 --- a/include/nbl/asset/utils/IShaderCompiler.h +++ b/include/nbl/asset/utils/IShaderCompiler.h @@ -33,7 +33,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted { system::path absolutePath = {}; std::string contents = {}; - std::array hash = {}; // TODO: we're not yet using IFile::getPrecomputedHash(), so for builtins we can maybe use that in the future + std::array hash = {}; // TODO: we're not yet using IFile::getPrecomputedHash(), so for builtins we can maybe use that in the future // Could be used in the future for early rejection of cache hit //nbl::system::IFileBase::time_point_t lastWriteTime = {}; @@ -185,7 +185,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted // Used to check compatibility of Caches before reading constexpr static inline std::string_view VERSION = "1.0.0"; - using hash_t = std::array; + using hash_t = std::array; static auto const SHADER_BUFFER_SIZE_BYTES = sizeof(uint64_t) / sizeof(uint8_t); // It's obviously 8 struct SEntry @@ -196,11 +196,9 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted { public: // Perf note: hashing while preprocessor lexing is likely to be slower than just hashing the whole array like this - inline SPreprocessingDependency(const system::path& _requestingSourceDir, const std::string_view& _identifier, const std::string_view& _contents, bool _standardInclude, std::array _hash) : - requestingSourceDir(_requestingSourceDir), identifier(_identifier), contents(_contents), standardInclude(_standardInclude), hash(_hash) - { - assert(!_contents.empty()); - } + inline SPreprocessingDependency(const system::path& _requestingSourceDir, const std::string_view& _identifier, bool _standardInclude, std::array _hash) : + requestingSourceDir(_requestingSourceDir), identifier(_identifier), standardInclude(_standardInclude), hash(_hash) + {} inline SPreprocessingDependency(SPreprocessingDependency&) = default; inline 
SPreprocessingDependency& operator=(SPreprocessingDependency&) = delete; @@ -218,11 +216,8 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted // path or identifier system::path requestingSourceDir = ""; std::string identifier = ""; - // file contents - // TODO: change to `core::vector` a compressed blob of LZMA, and store all contents together in the `SEntry` - std::string contents = ""; // hash of the contents - used to check against a found_t - std::array hash = {}; + std::array hash = {}; // If true, then `getIncludeStandard` was used to find, otherwise `getIncludeRelative` bool standardInclude = false; }; @@ -351,9 +346,13 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted // Now add the mainFileContents and produce both lookup and early equality rejection hashes hashable.insert(hashable.end(), mainFileContents.begin(), mainFileContents.end()); - hash = nbl::core::XXHash_256(hashable.data(), hashable.size()); - lookupHash = hash[0]; - for (auto i = 1u; i < 4; i++) { + + core::blake3_hasher hasher; + hasher.update(hashable.data(), hashable.size()); + hash = { *static_cast(hasher).data }; + // ALI:TODO + lookupHash = std::bit_cast({hash[0], hash[1], hash[2], hash[3], hash[4], hash[5], hash[6], hash[7]}); + for (auto i = 8u; i < 32; i++) { core::hash_combine(lookupHash, hash[i]); } } @@ -374,7 +373,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted // TODO: make some of these private std::string mainFileContents; SCompilerArgs compilerArgs; - std::array hash; + std::array hash; size_t lookupHash; dependency_container_t dependencies; core::smart_refctd_ptr cpuShader; @@ -429,7 +428,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted }; - using EntrySet = core::unordered_multiset; + using EntrySet = core::unordered_set; EntrySet m_container; NBL_API2 EntrySet::const_iterator find_impl(const SEntry& mainFile, const CIncludeFinder* finder) const; diff --git a/src/nbl/asset/utils/IShaderCompiler.cpp b/src/nbl/asset/utils/IShaderCompiler.cpp index ba23ba6281..d4efafd80b 100644 --- a/src/nbl/asset/utils/IShaderCompiler.cpp +++ b/src/nbl/asset/utils/IShaderCompiler.cpp @@ -116,7 +116,10 @@ auto IShaderCompiler::CIncludeFinder::getIncludeStandard(const system::path& req retVal = std::move(contents); else retVal = m_defaultFileSystemLoader->getInclude(requestingSourceDir.string(), includeName); - retVal.hash = nbl::core::XXHash_256((uint8_t*)(retVal.contents.data()), retVal.contents.size() * (sizeof(char) / sizeof(uint8_t))); + + core::blake3_hasher hasher; + hasher.update((uint8_t*)(retVal.contents.data()), retVal.contents.size() * (sizeof(char) / sizeof(uint8_t))); + retVal.hash = { *static_cast(hasher).data }; return retVal; } @@ -129,7 +132,10 @@ auto IShaderCompiler::CIncludeFinder::getIncludeRelative(const system::path& req if (auto contents = m_defaultFileSystemLoader->getInclude(requestingSourceDir.string(), includeName)) retVal = std::move(contents); else retVal = std::move(trySearchPaths(includeName)); - retVal.hash = nbl::core::XXHash_256((uint8_t*)(retVal.contents.data()), retVal.contents.size() * (sizeof(char) / sizeof(uint8_t))); + + core::blake3_hasher hasher; + hasher.update((uint8_t*)(retVal.contents.data()), retVal.contents.size() * (sizeof(char) / sizeof(uint8_t))); + retVal.hash = { *static_cast(hasher).data }; return retVal; } @@ -238,7 +244,7 @@ IShaderCompiler::CCache::EntrySet::const_iterator IShaderCompiler::CCache::find_ else header = finder->getIncludeRelative(dependency.requestingSourceDir, 
dependency.identifier); - if (header.hash != dependency.hash || header.contents != dependency.contents) + if (header.hash != dependency.hash) { allDependenciesMatch = false; break; diff --git a/src/nbl/asset/utils/shaderCompiler_serialization.h b/src/nbl/asset/utils/shaderCompiler_serialization.h index cd964d5689..11ca678b4a 100644 --- a/src/nbl/asset/utils/shaderCompiler_serialization.h +++ b/src/nbl/asset/utils/shaderCompiler_serialization.h @@ -116,7 +116,6 @@ inline void to_json(json& j, const SEntry::SPreprocessingDependency& dependency) j = json{ { "requestingSourceDir", dependency.requestingSourceDir }, { "identifier", dependency.identifier }, - { "contents", dependency.contents }, { "hash", dependency.hash }, { "standardInclude", dependency.standardInclude }, }; @@ -126,7 +125,6 @@ inline void from_json(const json& j, SEntry::SPreprocessingDependency& dependenc { j.at("requestingSourceDir").get_to(dependency.requestingSourceDir); j.at("identifier").get_to(dependency.identifier); - j.at("contents").get_to(dependency.contents); j.at("hash").get_to(dependency.hash); j.at("standardInclude").get_to(dependency.standardInclude); } diff --git a/src/nbl/asset/utils/waveContext.h b/src/nbl/asset/utils/waveContext.h index 696a2b9fb3..a374391576 100644 --- a/src/nbl/asset/utils/waveContext.h +++ b/src/nbl/asset/utils/waveContext.h @@ -533,7 +533,7 @@ template<> inline bool boost::wave::impl::pp_iterator_functor Date: Fri, 4 Oct 2024 16:58:12 +0330 Subject: [PATCH 078/358] ShaderCompiler: Remove comment --- include/nbl/asset/utils/IShaderCompiler.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/nbl/asset/utils/IShaderCompiler.h b/include/nbl/asset/utils/IShaderCompiler.h index 0ac247a48c..38e9d38dd6 100644 --- a/include/nbl/asset/utils/IShaderCompiler.h +++ b/include/nbl/asset/utils/IShaderCompiler.h @@ -350,7 +350,6 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted core::blake3_hasher hasher; hasher.update(hashable.data(), hashable.size()); hash = { *static_cast(hasher).data }; - // ALI:TODO lookupHash = std::bit_cast({hash[0], hash[1], hash[2], hash[3], hash[4], hash[5], hash[6], hash[7]}); for (auto i = 8u; i < 32; i++) { core::hash_combine(lookupHash, hash[i]); From 39b8bd2cd458a4a2f8304cd50374d36e4359a87b Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Fri, 4 Oct 2024 10:39:07 -0700 Subject: [PATCH 079/358] Saving work --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated/float64_t.hlsl | 15 ++++------ .../gauss_legendre/gauss_legendre.hlsl | 30 +++++++++++++++---- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/examples_tests b/examples_tests index 69156640cd..582f002cef 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 69156640cd7386084f99e9af09f280101590db45 +Subproject commit 582f002cef18e0a75bbcd16210745a783c197663 diff --git a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl index 19f95f9323..2c40c8ba2e 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl @@ -86,13 +86,13 @@ namespace hlsl this_t operator+(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { // TODO: REMOVE! 
- float64_t sum = bit_cast(data) + bit_cast(rhs.data); + /*float64_t sum = bit_cast(data) + bit_cast(rhs.data); uint64_t sumAsUint = bit_cast(sum); this_t output2; output2.data = sumAsUint; - return output2; + return output2;*/ if (FlushDenormToZero) { @@ -217,13 +217,13 @@ namespace hlsl emulated_float64_t operator*(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { // TODO: remove - float64_t sum = bit_cast(data) * bit_cast(rhs.data); + /*float64_t sum = bit_cast(data) * bit_cast(rhs.data); uint64_t sumAsUint = bit_cast(sum); this_t output2; output2.data = sumAsUint; - return output2; + return output2;*/ if(FlushDenormToZero) { @@ -291,16 +291,14 @@ namespace hlsl emulated_float64_t operator/(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { - printf("%llu", rhs.data); - // TODO: remove - float64_t sum = bit_cast(data) / bit_cast(rhs.data); + /*float64_t sum = bit_cast(data) / bit_cast(rhs.data); uint64_t sumAsUint = bit_cast(sum); this_t output2; output2.data = sumAsUint; - return output2; + return output2;*/ if (FlushDenormToZero) { @@ -361,7 +359,6 @@ namespace hlsl emulated_float64_t operator/(const float rhs) NBL_CONST_MEMBER_FUNC { - printf("guwno"); return bit_cast(data) / create(rhs); } diff --git a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl index 053ea02a47..823c368b4d 100644 --- a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl +++ b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl @@ -33,10 +33,6 @@ struct GaussLegendreIntegration float_t a = GaussLegendreValues::xi(i); float_t b = (end - start) / 2.0f; - - //printf("x = %ull, xi = %ull, ((end - start) / 2.0) = %ull", bit_cast(integral), bit_cast(a), bit_cast(b)); - //printf("start = %llu, end = %llu", bit_cast(start), bit_cast(end)); - printf("((end - start)) = %ull", bit_cast(b)); } return ((end - start) / 2.0) * integral; @@ -59,8 +55,6 @@ struct GaussLegendreIntegration #undef float_t_namespace #undef float_t -// TODO: do for every emulated_float64_t - #define float_t emulated_float64_t #define float_t_namespace impl_emulated_float64_t_true_true #define TYPED_NUMBER(N) emulated_float64_t::create(N) @@ -69,6 +63,30 @@ struct GaussLegendreIntegration #undef float_t_namespace #undef float_t +#define float_t emulated_float64_t +#define float_t_namespace impl_emulated_float64_t_false_false +#define TYPED_NUMBER(N) emulated_float64_t::create(N) +#include +#undef TYPED_NUMBER +#undef float_t_namespace +#undef float_t + +#define float_t emulated_float64_t +#define float_t_namespace impl_emulated_float64_t_false_true +#define TYPED_NUMBER(N) emulated_float64_t::create(N) +#include +#undef TYPED_NUMBER +#undef float_t_namespace +#undef float_t + +#define float_t emulated_float64_t +#define float_t_namespace impl_emulated_float64_t_true_false +#define TYPED_NUMBER(N) emulated_float64_t::create(N) +#include +#undef TYPED_NUMBER +#undef float_t_namespace +#undef float_t + } // quadrature } // math } // hlsl From 38952118cc143d230c05b04cad12212d3e8e247d Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Sat, 5 Oct 2024 01:56:26 -0700 Subject: [PATCH 080/358] Improved benchmark --- examples_tests | 2 +- include/nbl/asset/utils/CCompilerSet.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 582f002cef..8c371527bd 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 
582f002cef18e0a75bbcd16210745a783c197663 +Subproject commit 8c371527bd39600a85425f80a785fe875778dcdc diff --git a/include/nbl/asset/utils/CCompilerSet.h b/include/nbl/asset/utils/CCompilerSet.h index 0b65db6b4e..3e5fd0d6ce 100644 --- a/include/nbl/asset/utils/CCompilerSet.h +++ b/include/nbl/asset/utils/CCompilerSet.h @@ -33,6 +33,8 @@ namespace nbl::asset #ifdef _NBL_PLATFORM_WINDOWS_ return m_HLSLCompiler; +#else + return nullptr; #endif } else if (contentType == IShader::E_CONTENT_TYPE::ECT_GLSL) From d821878620bb2d45ab8b65aec8b937c641578b89 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Sat, 5 Oct 2024 11:27:12 +0330 Subject: [PATCH 081/358] use blake3_hash_t and std-specialization Signed-off-by: Ali Cheraghi --- include/nbl/asset/utils/IShaderCompiler.h | 16 ++++++---------- src/nbl/asset/utils/IShaderCompiler.cpp | 4 ++-- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/include/nbl/asset/utils/IShaderCompiler.h b/include/nbl/asset/utils/IShaderCompiler.h index 38e9d38dd6..05b42db344 100644 --- a/include/nbl/asset/utils/IShaderCompiler.h +++ b/include/nbl/asset/utils/IShaderCompiler.h @@ -33,7 +33,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted { system::path absolutePath = {}; std::string contents = {}; - std::array hash = {}; // TODO: we're not yet using IFile::getPrecomputedHash(), so for builtins we can maybe use that in the future + core::blake3_hash_t hash = {}; // TODO: we're not yet using IFile::getPrecomputedHash(), so for builtins we can maybe use that in the future // Could be used in the future for early rejection of cache hit //nbl::system::IFileBase::time_point_t lastWriteTime = {}; @@ -185,7 +185,6 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted // Used to check compatibility of Caches before reading constexpr static inline std::string_view VERSION = "1.0.0"; - using hash_t = std::array; static auto const SHADER_BUFFER_SIZE_BYTES = sizeof(uint64_t) / sizeof(uint8_t); // It's obviously 8 struct SEntry @@ -196,7 +195,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted { public: // Perf note: hashing while preprocessor lexing is likely to be slower than just hashing the whole array like this - inline SPreprocessingDependency(const system::path& _requestingSourceDir, const std::string_view& _identifier, bool _standardInclude, std::array _hash) : + inline SPreprocessingDependency(const system::path& _requestingSourceDir, const std::string_view& _identifier, bool _standardInclude, core::blake3_hash_t _hash) : requestingSourceDir(_requestingSourceDir), identifier(_identifier), standardInclude(_standardInclude), hash(_hash) {} @@ -217,7 +216,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted system::path requestingSourceDir = ""; std::string identifier = ""; // hash of the contents - used to check against a found_t - std::array hash = {}; + core::blake3_hash_t hash = {}; // If true, then `getIncludeStandard` was used to find, otherwise `getIncludeRelative` bool standardInclude = false; }; @@ -349,11 +348,8 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted core::blake3_hasher hasher; hasher.update(hashable.data(), hashable.size()); - hash = { *static_cast(hasher).data }; - lookupHash = std::bit_cast({hash[0], hash[1], hash[2], hash[3], hash[4], hash[5], hash[6], hash[7]}); - for (auto i = 8u; i < 32; i++) { - core::hash_combine(lookupHash, hash[i]); - } + hash = static_cast(hasher); + lookupHash = std::hash{}(hash); } // Needed to get the vector deserialization 
automatically @@ -372,7 +368,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted // TODO: make some of these private std::string mainFileContents; SCompilerArgs compilerArgs; - std::array hash; + core::blake3_hash_t hash; size_t lookupHash; dependency_container_t dependencies; core::smart_refctd_ptr cpuShader; diff --git a/src/nbl/asset/utils/IShaderCompiler.cpp b/src/nbl/asset/utils/IShaderCompiler.cpp index d4efafd80b..843207accd 100644 --- a/src/nbl/asset/utils/IShaderCompiler.cpp +++ b/src/nbl/asset/utils/IShaderCompiler.cpp @@ -119,7 +119,7 @@ auto IShaderCompiler::CIncludeFinder::getIncludeStandard(const system::path& req core::blake3_hasher hasher; hasher.update((uint8_t*)(retVal.contents.data()), retVal.contents.size() * (sizeof(char) / sizeof(uint8_t))); - retVal.hash = { *static_cast(hasher).data }; + retVal.hash = static_cast(hasher); return retVal; } @@ -135,7 +135,7 @@ auto IShaderCompiler::CIncludeFinder::getIncludeRelative(const system::path& req core::blake3_hasher hasher; hasher.update((uint8_t*)(retVal.contents.data()), retVal.contents.size() * (sizeof(char) / sizeof(uint8_t))); - retVal.hash = { *static_cast(hasher).data }; + retVal.hash = static_cast(hasher); return retVal; } From 8c70f06b971f4e80ccf5f8359d13f80ac6f743b8 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Sat, 5 Oct 2024 11:43:18 +0330 Subject: [PATCH 082/358] fix serialization Signed-off-by: Ali Cheraghi --- include/nbl/asset/utils/IShaderCompiler.h | 2 +- src/nbl/asset/utils/shaderCompiler_serialization.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/nbl/asset/utils/IShaderCompiler.h b/include/nbl/asset/utils/IShaderCompiler.h index 05b42db344..696478d56f 100644 --- a/include/nbl/asset/utils/IShaderCompiler.h +++ b/include/nbl/asset/utils/IShaderCompiler.h @@ -183,7 +183,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted public: // Used to check compatibility of Caches before reading - constexpr static inline std::string_view VERSION = "1.0.0"; + constexpr static inline std::string_view VERSION = "1.1.0"; static auto const SHADER_BUFFER_SIZE_BYTES = sizeof(uint64_t) / sizeof(uint8_t); // It's obviously 8 diff --git a/src/nbl/asset/utils/shaderCompiler_serialization.h b/src/nbl/asset/utils/shaderCompiler_serialization.h index 11ca678b4a..ab23588438 100644 --- a/src/nbl/asset/utils/shaderCompiler_serialization.h +++ b/src/nbl/asset/utils/shaderCompiler_serialization.h @@ -116,7 +116,7 @@ inline void to_json(json& j, const SEntry::SPreprocessingDependency& dependency) j = json{ { "requestingSourceDir", dependency.requestingSourceDir }, { "identifier", dependency.identifier }, - { "hash", dependency.hash }, + { "hash", dependency.hash.data }, { "standardInclude", dependency.standardInclude }, }; } @@ -125,7 +125,7 @@ inline void from_json(const json& j, SEntry::SPreprocessingDependency& dependenc { j.at("requestingSourceDir").get_to(dependency.requestingSourceDir); j.at("identifier").get_to(dependency.identifier); - j.at("hash").get_to(dependency.hash); + j.at("hash").get_to(dependency.hash.data); j.at("standardInclude").get_to(dependency.standardInclude); } @@ -177,7 +177,7 @@ inline void to_json(json& j, const SEntry& entry) j = json{ { "mainFileContents", entry.mainFileContents }, { "compilerArgs", entry.compilerArgs }, - { "hash", entry.hash }, + { "hash", entry.hash.data }, { "lookupHash", entry.lookupHash }, { "dependencies", entry.dependencies }, }; @@ -187,7 +187,7 @@ inline void from_json(const json& j, SEntry& entry) { 
j.at("mainFileContents").get_to(entry.mainFileContents); j.at("compilerArgs").get_to(entry.compilerArgs); - j.at("hash").get_to(entry.hash); + j.at("hash").get_to(entry.hash.data); j.at("lookupHash").get_to(entry.lookupHash); j.at("dependencies").get_to(entry.dependencies); entry.cpuShader = nullptr; From 99f93869fc2d871bc60d74121e7a6b5e076f5a11 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Sat, 5 Oct 2024 20:44:31 +0330 Subject: [PATCH 083/358] not working yet Signed-off-by: Ali Cheraghi --- include/nbl/asset/utils/IShaderCompiler.h | 52 +++------- src/nbl/asset/utils/CHLSLCompiler.cpp | 5 + src/nbl/asset/utils/IShaderCompiler.cpp | 97 +++++++++++++++++-- .../utils/shaderCompiler_serialization.h | 4 +- 4 files changed, 110 insertions(+), 48 deletions(-) diff --git a/include/nbl/asset/utils/IShaderCompiler.h b/include/nbl/asset/utils/IShaderCompiler.h index 696478d56f..0c0dfef3e0 100644 --- a/include/nbl/asset/utils/IShaderCompiler.h +++ b/include/nbl/asset/utils/IShaderCompiler.h @@ -239,6 +239,8 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted return true; } + std::string sourceIdentifier; + private: friend class SCompilerArgs; friend class SEntry; @@ -262,7 +264,6 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted // Sort them so equality and hashing are well defined std::sort(extraDefines.begin(), extraDefines.end(), [](const SMacroDefinition& lhs, const SMacroDefinition& rhs) {return lhs.identifier < rhs.identifier; }); }; - std::string sourceIdentifier; std::vector extraDefines; }; // TODO: SPreprocessorArgs could just be folded into `SCompilerArgs` to have less classes and operators @@ -282,6 +283,9 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted return retVal; } + IShader::E_SHADER_STAGE stage; + SPreprocessorArgs preprocessorArgs; + private: friend class SEntry; friend void to_json(nlohmann::json&, const SCompilerArgs&); @@ -306,11 +310,9 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted } } - IShader::E_SHADER_STAGE stage; E_SPIRV_VERSION targetSpirvVersion; std::vector optimizerPasses; core::bitflag debugInfoFlags; - SPreprocessorArgs preprocessorArgs; }; // The ordering is important here, the dependencies MUST be added to the array IN THE ORDER THE PREPROCESSOR INCLUDED THEM! 
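For context on that ordering requirement, a minimal sketch of the cache-hit validation it enables (an illustration, not the exact Nabla code; it assumes the SEntry, SPreprocessingDependency and CIncludeFinder types from this header): each recorded dependency is re-resolved through the same include finder, in the order the preprocessor originally pulled it in, and its blake3 hash is compared with the recorded one, so the first include that has changed or now resolves differently rejects the cached entry.

    // Sketch only: mirrors the dependency walk CCache::find_impl performs in this patch series.
    inline bool dependenciesStillMatch(const IShaderCompiler::CCache::SEntry& entry, const IShaderCompiler::CIncludeFinder* finder)
    {
        for (const auto& dep : entry.dependencies) // stored in preprocessing order
        {
            // re-resolve the include the same way it was originally found
            const auto header = dep.standardInclude ?
                finder->getIncludeStandard(dep.requestingSourceDir, dep.identifier) :
                finder->getIncludeRelative(dep.requestingSourceDir, dep.identifier);
            // contents are no longer stored in the entry, the blake3 hash alone detects a change
            if (header.hash != dep.hash)
                return false;
        }
        return true;
    }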
@@ -358,20 +360,23 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted // Making the copy constructor deep-copy everything but the shader inline SEntry(const SEntry& other) : mainFileContents(other.mainFileContents), compilerArgs(other.compilerArgs), hash(other.hash), lookupHash(other.lookupHash), - dependencies(other.dependencies), cpuShader(other.cpuShader) {} + dependencies(other.dependencies), spirv(other.spirv) {} inline SEntry& operator=(SEntry& other) = delete; inline SEntry(SEntry&& other) = default; // Used for late initialization while looking up a cache, so as not to always initialize an entry even if caching was not requested inline SEntry& operator=(SEntry&& other) = default; + core::smart_refctd_ptr decodeShader() const; + // TODO: make some of these private std::string mainFileContents; SCompilerArgs compilerArgs; core::blake3_hash_t hash; size_t lookupHash; dependency_container_t dependencies; - core::smart_refctd_ptr cpuShader; + core::smart_refctd_ptr spirv; + size_t uncompressedSize; }; inline void insert(SEntry&& entry) @@ -429,42 +434,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted NBL_API2 EntrySet::const_iterator find_impl(const SEntry& mainFile, const CIncludeFinder* finder) const; }; - inline core::smart_refctd_ptr compileToSPIRV(const std::string_view code, const SCompilerOptions& options) const - { - CCache::SEntry entry; - std::vector dependencies; - if (options.readCache || options.writeCache) - entry = std::move(CCache::SEntry(code, options)); - - if (options.readCache) - { - auto found = options.readCache->find_impl(entry, options.preprocessorOptions.includeFinder); - if (found != options.readCache->m_container.end()) - { - if (options.writeCache) - { - CCache::SEntry writeEntry = *found; - options.writeCache->insert(std::move(writeEntry)); - } - return found->cpuShader; - } - } - - auto retVal = compileToSPIRV_impl(code, options, options.writeCache ? 
&dependencies : nullptr); - // compute the SPIR-V shader content hash - { - auto backingBuffer = retVal->getContent(); - const_cast(backingBuffer)->setContentHash(backingBuffer->computeContentHash()); - } - - if (options.writeCache) - { - entry.dependencies = std::move(dependencies); - entry.cpuShader = retVal; - options.writeCache->insert(std::move(entry)); - } - return retVal; - } + core::smart_refctd_ptr compileToSPIRV(const std::string_view code, const SCompilerOptions& options) const; inline core::smart_refctd_ptr compileToSPIRV(const char* code, const SCompilerOptions& options) const { diff --git a/src/nbl/asset/utils/CHLSLCompiler.cpp b/src/nbl/asset/utils/CHLSLCompiler.cpp index ca0af41baf..b6d46768e9 100644 --- a/src/nbl/asset/utils/CHLSLCompiler.cpp +++ b/src/nbl/asset/utils/CHLSLCompiler.cpp @@ -18,6 +18,8 @@ #include #include +#include "lzma/C/LzmaEnc.h" + using namespace nbl; using namespace nbl::asset; using Microsoft::WRL::ComPtr; @@ -373,6 +375,9 @@ std::string CHLSLCompiler::preprocessShader(std::string&& code, IShader::E_SHADE return preprocessShader(std::move(code), stage, preprocessOptions, extra_dxc_compile_flags); } +static void* SzAlloc(ISzAllocPtr p, size_t size) { p = p; return _NBL_ALIGNED_MALLOC(size, _NBL_SIMD_ALIGNMENT); } +static void SzFree(ISzAllocPtr p, void* address) { p = p; _NBL_ALIGNED_FREE(address); } + core::smart_refctd_ptr CHLSLCompiler::compileToSPIRV_impl(const std::string_view code, const IShaderCompiler::SCompilerOptions& options, std::vector* dependencies) const { auto hlslOptions = option_cast(options); diff --git a/src/nbl/asset/utils/IShaderCompiler.cpp b/src/nbl/asset/utils/IShaderCompiler.cpp index 843207accd..8520580b78 100644 --- a/src/nbl/asset/utils/IShaderCompiler.cpp +++ b/src/nbl/asset/utils/IShaderCompiler.cpp @@ -13,6 +13,9 @@ #include #include +#include +#include + using namespace nbl; using namespace nbl::asset; @@ -22,6 +25,71 @@ IShaderCompiler::IShaderCompiler(core::smart_refctd_ptr&& syste m_defaultIncludeFinder = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_system)); } +static void* SzAlloc(ISzAllocPtr p, size_t size) { p = p; return _NBL_ALIGNED_MALLOC(size, _NBL_SIMD_ALIGNMENT); } +static void SzFree(ISzAllocPtr p, void* address) { p = p; _NBL_ALIGNED_FREE(address); } + +inline core::smart_refctd_ptr nbl::asset::IShaderCompiler::compileToSPIRV(const std::string_view code, const SCompilerOptions& options) const +{ + CCache::SEntry entry; + std::vector dependencies; + if (options.readCache || options.writeCache) + entry = std::move(CCache::SEntry(code, options)); + + if (options.readCache) + { + auto found = options.readCache->find_impl(entry, options.preprocessorOptions.includeFinder); + if (found != options.readCache->m_container.end()) + { + if (options.writeCache) + { + CCache::SEntry writeEntry = *found; + options.writeCache->insert(std::move(writeEntry)); + } + return found->decodeShader(); + } + } + + auto retVal = compileToSPIRV_impl(code, options, options.writeCache ? 
&dependencies : nullptr); + // compute the SPIR-V shader content hash + { + auto backingBuffer = retVal->getContent(); + const_cast(backingBuffer)->setContentHash(backingBuffer->computeContentHash()); + } + + std::cout << "writeCache: " << options.writeCache << "\n"; + if (options.writeCache) + { + auto* spirvBuffer = retVal->getContent(); + size_t propsSize = LZMA_PROPS_SIZE; + size_t destLen = spirvBuffer->getSize() + spirvBuffer->getSize() / 3 + 128; + auto compressedSpirvBuffer = core::make_smart_refctd_ptr(propsSize + destLen); + + CLzmaEncProps props; + LzmaEncProps_Init(&props); + props.dictSize = 1 << 16; // 64 KB + props.writeEndMark = 1; // 0 or 1 + + ISzAlloc alloc = { SzAlloc, SzFree }; + int res = LzmaEncode( + reinterpret_cast(compressedSpirvBuffer->getPointer()) + LZMA_PROPS_SIZE, &destLen, + reinterpret_cast(spirvBuffer->getPointer()), spirvBuffer->getSize(), + &props, reinterpret_cast(compressedSpirvBuffer->getPointer()), &propsSize, props.writeEndMark, + nullptr, &alloc, &alloc); + + assert(propsSize == LZMA_PROPS_SIZE); + assert(res == SZ_OK); + + entry.dependencies = std::move(dependencies); + entry.spirv = std::move(compressedSpirvBuffer); + entry.uncompressedSize = spirvBuffer->getSize(); + + std::cout << "original: " << spirvBuffer->getSize() << ", compressed: " << compressedSpirvBuffer->getSize() << "\n"; + + options.writeCache->insert(std::move(entry)); + } + return retVal; +} + std::string IShaderCompiler::preprocessShader( system::IFile* sourcefile, IShader::E_SHADER_STAGE stage, @@ -224,7 +292,7 @@ auto IShaderCompiler::CIncludeFinder::tryIncludeGenerators(const std::string& in core::smart_refctd_ptr IShaderCompiler::CCache::find(const SEntry& mainFile, const IShaderCompiler::CIncludeFinder* finder) const { - return find_impl(mainFile, finder)->cpuShader; + return find_impl(mainFile, finder)->decodeShader(); } IShaderCompiler::CCache::EntrySet::const_iterator IShaderCompiler::CCache::find_impl(const SEntry& mainFile, const IShaderCompiler::CIncludeFinder* finder) const @@ -273,10 +341,10 @@ core::smart_refctd_ptr IShaderCompiler::CCache::serialize() const // We keep a copy of the offsets and the sizes of each shader. This is so that later on, when we add the shaders to the buffer after json creation // (where the params array has been moved) we don't have to read the json to get the offsets again offsets[i] = shaderBufferSize; - sizes[i] = entry.cpuShader->getContent()->getSize(); + sizes[i] = entry.spirv->getSize(); // And add the params to the shader creation parameters array - shaderCreationParams.emplace_back(entry.cpuShader->getStage(), entry.cpuShader->getContentType(), entry.cpuShader->getFilepathHint(), sizes[i], shaderBufferSize); + shaderCreationParams.emplace_back(entry.compilerArgs.stage, IShader::E_CONTENT_TYPE::ECT_SPIRV, entry.compilerArgs.preprocessorArgs.sourceIdentifier.data(), sizes[i], shaderBufferSize); // Enlarge the shader buffer by the size of the current shader shaderBufferSize += sizes[i]; i++; @@ -300,7 +368,7 @@ core::smart_refctd_ptr IShaderCompiler::CCache::serialize() const // Loop over entries again, adding each one's shader to the buffer. 
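The spirv blobs this second pass copies were compressed back in compileToSPIRV with the LZMA SDK, with the 5-byte encoder props stored in front of the stream, and decodeShader undoes it. A minimal, self-contained round trip using the same SDK calls (a sketch; the include paths, buffer names and allocators here are assumptions, they depend on how the SDK is vendored) could look like:

    #include <cstdlib>
    #include <cassert>
    #include <vector>
    #include "LzmaEnc.h" // assumed vendored LZMA SDK headers
    #include "LzmaDec.h"

    static void* SzAlloc(ISzAllocPtr, size_t size) { return malloc(size); }
    static void SzFree(ISzAllocPtr, void* address) { free(address); }

    std::vector<Byte> lzmaRoundTrip(const std::vector<Byte>& src)
    {
        ISzAlloc alloc = { SzAlloc, SzFree };
        CLzmaEncProps props;
        LzmaEncProps_Init(&props);
        props.dictSize = 1 << 16; // 64KB, same as the cache code in this patch
        props.writeEndMark = 1;
        size_t propsSize = LZMA_PROPS_SIZE;
        size_t destLen = src.size() + src.size() / 3 + 128;
        std::vector<Byte> compressed(LZMA_PROPS_SIZE + destLen);
        // encoder props go at the front of the buffer, compressed payload right after them
        SRes res = LzmaEncode(compressed.data() + LZMA_PROPS_SIZE, &destLen, src.data(), src.size(),
            &props, compressed.data(), &propsSize, props.writeEndMark, nullptr, &alloc, &alloc);
        assert(res == SZ_OK && propsSize == LZMA_PROPS_SIZE);

        std::vector<Byte> decompressed(src.size()); // original size must be known, hence SEntry::uncompressedSize
        size_t dstLen = decompressed.size();
        size_t srcLen = destLen;
        ELzmaStatus status;
        res = LzmaDecode(decompressed.data(), &dstLen, compressed.data() + LZMA_PROPS_SIZE, &srcLen,
            compressed.data(), LZMA_PROPS_SIZE, LZMA_FINISH_ANY, &status, &alloc);
        assert(res == SZ_OK);
        return decompressed;
    }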
i = 0u; for (auto& entry : m_container) { - memcpy(retVal.data() + SHADER_BUFFER_SIZE_BYTES + offsets[i], entry.cpuShader->getContent()->getPointer(), sizes[i]); + memcpy(retVal.data() + SHADER_BUFFER_SIZE_BYTES + offsets[i], entry.spirv->getPointer(), sizes[i]); i++; } @@ -343,13 +411,30 @@ core::smart_refctd_ptr IShaderCompiler::CCache::deseria // Create buffer to hold the code auto code = core::make_smart_refctd_ptr(shaderCreationParams[i].codeByteSize); // Copy the shader bytecode into the buffer + memcpy(code->getPointer(), serializedCache.data() + SHADER_BUFFER_SIZE_BYTES + shaderCreationParams[i].offset, shaderCreationParams[i].codeByteSize); code->setContentHash(code->computeContentHash()); // Create the ICPUShader - entries[i].cpuShader = core::make_smart_refctd_ptr(std::move(code), shaderCreationParams[i].stage, shaderCreationParams[i].contentType, std::move(shaderCreationParams[i].filepathHint)); + entries[i].spirv = std::move(code); retVal->insert(std::move(entries[i])); } return retVal; -} \ No newline at end of file +} + +core::smart_refctd_ptr nbl::asset::IShaderCompiler::CCache::SEntry::decodeShader() const +{ + auto uncompressedBuf = core::make_smart_refctd_ptr(uncompressedSize); + size_t dstSize = uncompressedBuf->getSize(); + size_t srcSize = spirv->getSize() - LZMA_PROPS_SIZE; + ELzmaStatus status; + ISzAlloc alloc = { SzAlloc, SzFree }; + SRes res = LzmaDecode( + reinterpret_cast(uncompressedBuf->getPointer()), &dstSize, + reinterpret_cast(spirv->getPointer()) + LZMA_PROPS_SIZE, &srcSize, + reinterpret_cast(spirv->getPointer()), LZMA_PROPS_SIZE, + LZMA_FINISH_ANY, &status, &alloc); + assert(res == SZ_OK); + return core::make_smart_refctd_ptr(std::move(uncompressedBuf), compilerArgs.stage, IShader::E_CONTENT_TYPE::ECT_SPIRV, compilerArgs.preprocessorArgs.sourceIdentifier.data()); +} diff --git a/src/nbl/asset/utils/shaderCompiler_serialization.h b/src/nbl/asset/utils/shaderCompiler_serialization.h index ab23588438..e038d3bf9b 100644 --- a/src/nbl/asset/utils/shaderCompiler_serialization.h +++ b/src/nbl/asset/utils/shaderCompiler_serialization.h @@ -180,6 +180,7 @@ inline void to_json(json& j, const SEntry& entry) { "hash", entry.hash.data }, { "lookupHash", entry.lookupHash }, { "dependencies", entry.dependencies }, + { "uncompressedSize", entry.uncompressedSize }, }; } @@ -190,7 +191,8 @@ inline void from_json(const json& j, SEntry& entry) j.at("hash").get_to(entry.hash.data); j.at("lookupHash").get_to(entry.lookupHash); j.at("dependencies").get_to(entry.dependencies); - entry.cpuShader = nullptr; + j.at("uncompressedSize").get_to(entry.uncompressedSize); + entry.spirv = nullptr; } } From afa49359e09c67a1fbb72d9136d700dd0b485286 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Sat, 5 Oct 2024 21:06:45 +0330 Subject: [PATCH 084/358] maybe Signed-off-by: Ali Cheraghi --- src/nbl/asset/utils/CHLSLCompiler.cpp | 5 ----- src/nbl/asset/utils/IShaderCompiler.cpp | 1 - 2 files changed, 6 deletions(-) diff --git a/src/nbl/asset/utils/CHLSLCompiler.cpp b/src/nbl/asset/utils/CHLSLCompiler.cpp index b6d46768e9..ca0af41baf 100644 --- a/src/nbl/asset/utils/CHLSLCompiler.cpp +++ b/src/nbl/asset/utils/CHLSLCompiler.cpp @@ -18,8 +18,6 @@ #include #include -#include "lzma/C/LzmaEnc.h" - using namespace nbl; using namespace nbl::asset; using Microsoft::WRL::ComPtr; @@ -375,9 +373,6 @@ std::string CHLSLCompiler::preprocessShader(std::string&& code, IShader::E_SHADE return preprocessShader(std::move(code), stage, preprocessOptions, extra_dxc_compile_flags); } -static void* 
SzAlloc(ISzAllocPtr p, size_t size) { p = p; return _NBL_ALIGNED_MALLOC(size, _NBL_SIMD_ALIGNMENT); } -static void SzFree(ISzAllocPtr p, void* address) { p = p; _NBL_ALIGNED_FREE(address); } - core::smart_refctd_ptr CHLSLCompiler::compileToSPIRV_impl(const std::string_view code, const IShaderCompiler::SCompilerOptions& options, std::vector* dependencies) const { auto hlslOptions = option_cast(options); diff --git a/src/nbl/asset/utils/IShaderCompiler.cpp b/src/nbl/asset/utils/IShaderCompiler.cpp index 8520580b78..ef685d08fe 100644 --- a/src/nbl/asset/utils/IShaderCompiler.cpp +++ b/src/nbl/asset/utils/IShaderCompiler.cpp @@ -414,7 +414,6 @@ core::smart_refctd_ptr IShaderCompiler::CCache::deseria memcpy(code->getPointer(), serializedCache.data() + SHADER_BUFFER_SIZE_BYTES + shaderCreationParams[i].offset, shaderCreationParams[i].codeByteSize); code->setContentHash(code->computeContentHash()); - // Create the ICPUShader entries[i].spirv = std::move(code); retVal->insert(std::move(entries[i])); From b2927babcfb17bcabcda7cf5e95dc3a783221589 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Sun, 6 Oct 2024 12:11:25 +0330 Subject: [PATCH 085/358] ready Signed-off-by: Ali Cheraghi --- include/nbl/asset/utils/IShaderCompiler.h | 2 +- src/nbl/asset/utils/IShaderCompiler.cpp | 17 +++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/nbl/asset/utils/IShaderCompiler.h b/include/nbl/asset/utils/IShaderCompiler.h index 0c0dfef3e0..991d453510 100644 --- a/include/nbl/asset/utils/IShaderCompiler.h +++ b/include/nbl/asset/utils/IShaderCompiler.h @@ -360,7 +360,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted // Making the copy constructor deep-copy everything but the shader inline SEntry(const SEntry& other) : mainFileContents(other.mainFileContents), compilerArgs(other.compilerArgs), hash(other.hash), lookupHash(other.lookupHash), - dependencies(other.dependencies), spirv(other.spirv) {} + dependencies(other.dependencies), spirv(other.spirv), uncompressedSize(other.uncompressedSize) {} inline SEntry& operator=(SEntry& other) = delete; inline SEntry(SEntry&& other) = default; diff --git a/src/nbl/asset/utils/IShaderCompiler.cpp b/src/nbl/asset/utils/IShaderCompiler.cpp index ef685d08fe..d36c87f626 100644 --- a/src/nbl/asset/utils/IShaderCompiler.cpp +++ b/src/nbl/asset/utils/IShaderCompiler.cpp @@ -62,29 +62,31 @@ inline core::smart_refctd_ptr nbl::asset::IShaderCompiler::compileTo auto* spirvBuffer = retVal->getContent(); size_t propsSize = LZMA_PROPS_SIZE; size_t destLen = spirvBuffer->getSize() + spirvBuffer->getSize() / 3 + 128; - auto compressedSpirvBuffer = core::make_smart_refctd_ptr(propsSize + destLen); + std::vector compressedSpirv = {}; + compressedSpirv.resize(propsSize + destLen); CLzmaEncProps props; LzmaEncProps_Init(&props); - props.dictSize = 1 << 16; // 64 KB - props.writeEndMark = 1; // 0 or 1 + props.dictSize = 1 << 16; // 64KB + props.writeEndMark = 1; ISzAlloc alloc = { SzAlloc, SzFree }; int res = LzmaEncode( - reinterpret_cast(compressedSpirvBuffer->getPointer()) + LZMA_PROPS_SIZE, &destLen, + compressedSpirv.data() + LZMA_PROPS_SIZE, &destLen, reinterpret_cast(spirvBuffer->getPointer()), spirvBuffer->getSize(), - &props, reinterpret_cast(compressedSpirvBuffer->getPointer()), &propsSize, props.writeEndMark, + &props, compressedSpirv.data(), &propsSize, props.writeEndMark, nullptr, &alloc, &alloc); assert(propsSize == LZMA_PROPS_SIZE); assert(res == SZ_OK); + auto compressedSpirvBuffer = core::make_smart_refctd_ptr(propsSize + 
destLen); + memcpy(compressedSpirvBuffer->getPointer(), compressedSpirv.data(), compressedSpirvBuffer->getSize()); + entry.dependencies = std::move(dependencies); entry.spirv = std::move(compressedSpirvBuffer); entry.uncompressedSize = spirvBuffer->getSize(); - std::cout << "original: " << spirvBuffer->getSize() << ", compressed: " << compressedSpirvBuffer->getSize() << "\n"; - options.writeCache->insert(std::move(entry)); } return retVal; @@ -398,7 +400,6 @@ core::smart_refctd_ptr IShaderCompiler::CCache::deseria return nullptr; } } - // Now retrieve two vectors, one with the entries and one with the extra data to recreate the CPUShaders std::vector entries; From f3afeb4e08f65bf9fa4f53e030cb63f4e9dde7b7 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Mon, 7 Oct 2024 07:47:04 -0700 Subject: [PATCH 086/358] Updated benchmark example --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 8c371527bd..3912137730 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 8c371527bd39600a85425f80a785fe875778dcdc +Subproject commit 391213773098baf46552d3e4959ae744c9c973e5 From ff7ed067aa4b9e79958f6d0b84da24833d047c57 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Mon, 7 Oct 2024 23:36:27 +0330 Subject: [PATCH 087/358] review fixes Signed-off-by: Ali Cheraghi --- include/nbl/asset/utils/IShaderCompiler.h | 29 ++++++++++++------- src/nbl/asset/utils/IShaderCompiler.cpp | 17 +++++------ .../utils/shaderCompiler_serialization.h | 13 ++++----- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/include/nbl/asset/utils/IShaderCompiler.h b/include/nbl/asset/utils/IShaderCompiler.h index 991d453510..3899c12669 100644 --- a/include/nbl/asset/utils/IShaderCompiler.h +++ b/include/nbl/asset/utils/IShaderCompiler.h @@ -239,11 +239,10 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted return true; } - std::string sourceIdentifier; - private: friend class SCompilerArgs; friend class SEntry; + friend class CCache; friend void to_json(nlohmann::json&, const SPreprocessorArgs&); friend void from_json(const nlohmann::json&, SPreprocessorArgs&); @@ -264,9 +263,10 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted // Sort them so equality and hashing are well defined std::sort(extraDefines.begin(), extraDefines.end(), [](const SMacroDefinition& lhs, const SMacroDefinition& rhs) {return lhs.identifier < rhs.identifier; }); }; + std::string sourceIdentifier; std::vector extraDefines; }; - // TODO: SPreprocessorArgs could just be folded into `SCompilerArgs` to have less classes and operators + // TODO: SPreprocessorArgs could just be folded into `SCompilerArgs` to have less classes and decompressShader struct SCompilerArgs final { public: @@ -283,11 +283,9 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted return retVal; } - IShader::E_SHADER_STAGE stage; - SPreprocessorArgs preprocessorArgs; - private: friend class SEntry; + friend class CCache; friend void to_json(nlohmann::json&, const SCompilerArgs&); friend void from_json(const nlohmann::json&, SCompilerArgs&); @@ -310,9 +308,11 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted } } + IShader::E_SHADER_STAGE stage; E_SPIRV_VERSION targetSpirvVersion; std::vector optimizerPasses; core::bitflag debugInfoFlags; + SPreprocessorArgs preprocessorArgs; }; // The ordering is important here, the dependencies MUST be added to the array IN THE ORDER THE PREPROCESSOR INCLUDED THEM! 
@@ -354,20 +354,28 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted lookupHash = std::hash{}(hash); } + // Making an entry to insert into write cache + inline SEntry(const SEntry& other, dependency_container_t dependencies, core::smart_refctd_ptr spirv, + core::blake3_hash_t uncompressedContentHash, size_t uncompressedSize) + : mainFileContents(other.mainFileContents), compilerArgs(other.compilerArgs), hash(other.hash), + lookupHash(other.lookupHash), dependencies(dependencies), spirv(spirv), + uncompressedContentHash(uncompressedContentHash), uncompressedSize(uncompressedSize) {} + // Needed to get the vector deserialization automatically inline SEntry() {} // Making the copy constructor deep-copy everything but the shader - inline SEntry(const SEntry& other) - : mainFileContents(other.mainFileContents), compilerArgs(other.compilerArgs), hash(other.hash), lookupHash(other.lookupHash), - dependencies(other.dependencies), spirv(other.spirv), uncompressedSize(other.uncompressedSize) {} + inline SEntry(const SEntry& other) + : mainFileContents(other.mainFileContents), compilerArgs(other.compilerArgs), hash(other.hash), + lookupHash(other.lookupHash), dependencies(other.dependencies), spirv(other.spirv), + uncompressedContentHash(other.uncompressedContentHash), uncompressedSize(other.uncompressedSize) {} inline SEntry& operator=(SEntry& other) = delete; inline SEntry(SEntry&& other) = default; // Used for late initialization while looking up a cache, so as not to always initialize an entry even if caching was not requested inline SEntry& operator=(SEntry&& other) = default; - core::smart_refctd_ptr decodeShader() const; + core::smart_refctd_ptr decompressShader() const; // TODO: make some of these private std::string mainFileContents; @@ -376,6 +384,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted size_t lookupHash; dependency_container_t dependencies; core::smart_refctd_ptr spirv; + core::blake3_hash_t uncompressedContentHash; size_t uncompressedSize; }; diff --git a/src/nbl/asset/utils/IShaderCompiler.cpp b/src/nbl/asset/utils/IShaderCompiler.cpp index d36c87f626..20c8f20cfe 100644 --- a/src/nbl/asset/utils/IShaderCompiler.cpp +++ b/src/nbl/asset/utils/IShaderCompiler.cpp @@ -45,7 +45,7 @@ inline core::smart_refctd_ptr nbl::asset::IShaderCompiler::compileTo CCache::SEntry writeEntry = *found; options.writeCache->insert(std::move(writeEntry)); } - return found->decodeShader(); + return found->decompressShader(); } } @@ -56,7 +56,6 @@ inline core::smart_refctd_ptr nbl::asset::IShaderCompiler::compileTo const_cast(backingBuffer)->setContentHash(backingBuffer->computeContentHash()); } - std::cout << "writeCache: " << options.writeCache << "\n"; if (options.writeCache) { auto* spirvBuffer = retVal->getContent(); @@ -83,11 +82,7 @@ inline core::smart_refctd_ptr nbl::asset::IShaderCompiler::compileTo auto compressedSpirvBuffer = core::make_smart_refctd_ptr(propsSize + destLen); memcpy(compressedSpirvBuffer->getPointer(), compressedSpirv.data(), compressedSpirvBuffer->getSize()); - entry.dependencies = std::move(dependencies); - entry.spirv = std::move(compressedSpirvBuffer); - entry.uncompressedSize = spirvBuffer->getSize(); - - options.writeCache->insert(std::move(entry)); + options.writeCache->insert(std::move(SEntry(entry, std::move(dependencies), std::move(compressedSpirvBuffer), spirvBuffer->getContentHash(), spirvBuffer->getSize()))); } return retVal; } @@ -294,7 +289,7 @@ auto IShaderCompiler::CIncludeFinder::tryIncludeGenerators(const std::string& in 
core::smart_refctd_ptr IShaderCompiler::CCache::find(const SEntry& mainFile, const IShaderCompiler::CIncludeFinder* finder) const { - return find_impl(mainFile, finder)->decodeShader(); + return find_impl(mainFile, finder)->decompressShader(); } IShaderCompiler::CCache::EntrySet::const_iterator IShaderCompiler::CCache::find_impl(const SEntry& mainFile, const IShaderCompiler::CIncludeFinder* finder) const @@ -346,7 +341,7 @@ core::smart_refctd_ptr IShaderCompiler::CCache::serialize() const sizes[i] = entry.spirv->getSize(); // And add the params to the shader creation parameters array - shaderCreationParams.emplace_back(entry.compilerArgs.stage, IShader::E_CONTENT_TYPE::ECT_SPIRV, entry.compilerArgs.preprocessorArgs.sourceIdentifier.data(), sizes[i], shaderBufferSize); + shaderCreationParams.emplace_back(entry.compilerArgs.stage, entry.compilerArgs.preprocessorArgs.sourceIdentifier.data(), sizes[i], shaderBufferSize); // Enlarge the shader buffer by the size of the current shader shaderBufferSize += sizes[i]; i++; @@ -423,9 +418,11 @@ core::smart_refctd_ptr IShaderCompiler::CCache::deseria return retVal; } -core::smart_refctd_ptr nbl::asset::IShaderCompiler::CCache::SEntry::decodeShader() const +core::smart_refctd_ptr nbl::asset::IShaderCompiler::CCache::SEntry::decompressShader() const { auto uncompressedBuf = core::make_smart_refctd_ptr(uncompressedSize); + uncompressedBuf->setContentHash(uncompressedContentHash); + size_t dstSize = uncompressedBuf->getSize(); size_t srcSize = spirv->getSize() - LZMA_PROPS_SIZE; ELzmaStatus status; diff --git a/src/nbl/asset/utils/shaderCompiler_serialization.h b/src/nbl/asset/utils/shaderCompiler_serialization.h index e038d3bf9b..ca81df6408 100644 --- a/src/nbl/asset/utils/shaderCompiler_serialization.h +++ b/src/nbl/asset/utils/shaderCompiler_serialization.h @@ -133,13 +133,12 @@ inline void from_json(const json& j, SEntry::SPreprocessingDependency& dependenc struct CPUShaderCreationParams { IShader::E_SHADER_STAGE stage; - IShader::E_CONTENT_TYPE contentType; //I think this one could be skipped since it's always going to be SPIR-V std::string filepathHint; uint64_t codeByteSize = 0; uint64_t offset = 0; // Offset into the serialized .bin for the Cache where code starts - CPUShaderCreationParams(IShader::E_SHADER_STAGE _stage, IShader::E_CONTENT_TYPE _contentType, std::string_view _filepathHint, uint64_t _codeByteSize, uint64_t _offset) - : stage(_stage), contentType(_contentType), filepathHint(_filepathHint), codeByteSize(_codeByteSize), offset(_offset) + CPUShaderCreationParams(IShader::E_SHADER_STAGE _stage, std::string_view _filepathHint, uint64_t _codeByteSize, uint64_t _offset) + : stage(_stage), filepathHint(_filepathHint), codeByteSize(_codeByteSize), offset(_offset) {} CPUShaderCreationParams() {}; @@ -148,10 +147,8 @@ struct CPUShaderCreationParams { inline void to_json(json& j, const CPUShaderCreationParams& creationParams) { uint32_t stage = static_cast(creationParams.stage); - uint32_t contentType = static_cast(creationParams.contentType); j = json{ { "stage", stage }, - { "contentType", contentType }, { "filepathHint", creationParams.filepathHint }, { "codeByteSize", creationParams.codeByteSize }, { "offset", creationParams.offset }, @@ -160,14 +157,12 @@ inline void to_json(json& j, const CPUShaderCreationParams& creationParams) inline void from_json(const json& j, CPUShaderCreationParams& creationParams) { - uint32_t stage, contentType; + uint32_t stage; j.at("stage").get_to(stage); - j.at("contentType").get_to(contentType); 
j.at("filepathHint").get_to(creationParams.filepathHint); j.at("codeByteSize").get_to(creationParams.codeByteSize); j.at("offset").get_to(creationParams.offset); creationParams.stage = static_cast(stage); - creationParams.contentType = static_cast(stage); } // Serialize SEntry, keeping some fields as extra serialization to keep them separate on disk @@ -180,6 +175,7 @@ inline void to_json(json& j, const SEntry& entry) { "hash", entry.hash.data }, { "lookupHash", entry.lookupHash }, { "dependencies", entry.dependencies }, + { "uncompressedContentHash", entry.uncompressedContentHash.data }, { "uncompressedSize", entry.uncompressedSize }, }; } @@ -191,6 +187,7 @@ inline void from_json(const json& j, SEntry& entry) j.at("hash").get_to(entry.hash.data); j.at("lookupHash").get_to(entry.lookupHash); j.at("dependencies").get_to(entry.dependencies); + j.at("uncompressedContentHash").get_to(entry.uncompressedContentHash.data); j.at("uncompressedSize").get_to(entry.uncompressedSize); entry.spirv = nullptr; } From ab86fdfdbc3589083fa35baf91ebcfb96c5890e0 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Mon, 7 Oct 2024 15:13:47 -0700 Subject: [PATCH 088/358] Fixed substraction --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated/float64_t.hlsl | 40 +++++++++---------- .../builtin/hlsl/emulated/float64_t_impl.hlsl | 24 +++++------ .../gauss_legendre/gauss_legendre.hlsl | 1 - 4 files changed, 32 insertions(+), 35 deletions(-) diff --git a/examples_tests b/examples_tests index 846e7aa5af..239b3b0afb 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 846e7aa5afa7ad1344ec8e18041e669ddfddee00 +Subproject commit 239b3b0afbbecce4f878afd3e79e35d397e6ac2c diff --git a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl index d01b73f1a3..fec353b4e4 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl @@ -75,25 +75,10 @@ namespace hlsl return bit_cast(reinterpret_cast(val)); #endif } - - // TODO: unresolved external symbol imath_half_to_float_table - /*static emulated_float64_t create(float16_t val) - { - return emulated_float64_t(bit_cast(float64_t(val))); - }*/ // arithmetic operators this_t operator+(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { - // TODO: REMOVE! - /*float64_t sum = bit_cast(data) + bit_cast(rhs.data); - uint64_t sumAsUint = bit_cast(sum); - - this_t output2; - output2.data = sumAsUint; - - return output2;*/ - if (FlushDenormToZero) { if(FastMath) @@ -137,7 +122,10 @@ namespace hlsl else return bit_cast(0ull); } - + if(emulated_float64_t_impl::isZero(lhsData)) + return bit_cast(rhsData); + if (emulated_float64_t_impl::isZero(rhsData)) + return bit_cast(lhsData); if (tgmath::isInf(lhsData)) return bit_cast(ieee754::traits::inf | ieee754::extractSignPreserveBitPattern(max(lhsData, rhsData))); } @@ -157,25 +145,35 @@ namespace hlsl swap(lhsSign, rhsSign); } - rhsNormMantissa >>= shiftAmount; - uint64_t resultMantissa; if (lhsSign != rhsSign) { - int64_t mantissaDiff = lhsNormMantissa - rhsNormMantissa; + uint64_t rhsNormMantissaHigh = shiftAmount >= 64 ? 
0ull : rhsNormMantissa >> shiftAmount; + uint64_t rhsNormMantissaLow = 0ull; + if (shiftAmount < 128) + { + if (shiftAmount >= 64) + rhsNormMantissaLow = rhsNormMantissa >> (shiftAmount - 64); + else + rhsNormMantissaLow = rhsNormMantissa << (64 - shiftAmount); + } + + const int64_t mantissaDiff = int64_t(lhsNormMantissa) - int64_t(rhsNormMantissaHigh); + // can only happen when shiftAmount == 0, so it is safe to swap only high bits of rhs mantissa if (mantissaDiff < 0) { - swap(lhsNormMantissa, rhsNormMantissa); + swap(lhsNormMantissa, rhsNormMantissaHigh); swap(lhsSign, rhsSign); } - resultMantissa = emulated_float64_t_impl::subMantissas128NormalizeResult(lhsNormMantissa, rhsNormMantissa, exp); + resultMantissa = emulated_float64_t_impl::subMantissas128NormalizeResult(lhsNormMantissa, rhsNormMantissaHigh, rhsNormMantissaLow, exp); if (resultMantissa == 0ull) return _static_cast(0ull); } else { + rhsNormMantissa >>= shiftAmount; resultMantissa = lhsNormMantissa + rhsNormMantissa; if (resultMantissa & 1ull << 53) diff --git a/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl index ccc49cd904..2618789276 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl @@ -260,19 +260,23 @@ inline uint64_t divmod128by64(const uint64_t dividentHigh, const uint64_t divide return (q1 << 32) | q0; } -inline uint64_t subMantissas128NormalizeResult(const uint64_t greaterNumberMantissa, const uint64_t lesserNumberMantissa, NBL_REF_ARG(int) resultExp) +struct uint128_t { - uint64_t greaterHigh, greaterLow, lesserHigh, lesserLow; + uint64_t highBits; + uint64_t lowBits; +}; + +inline uint64_t subMantissas128NormalizeResult(const uint64_t greaterNumberMantissa, const uint64_t lesserNumberMantissaHigh, const uint64_t lesserNumberMantissaLow, NBL_REF_ARG(int) resultExp) +{ + uint64_t greaterHigh, greaterLow; greaterHigh = greaterNumberMantissa; greaterLow = 0ull; - lesserHigh = lesserNumberMantissa; - lesserLow = 0ull; uint64_t diffHigh, diffLow; - diffHigh = greaterHigh - lesserHigh; - diffLow = greaterLow - lesserLow; + diffHigh = greaterHigh - lesserNumberMantissaHigh; + diffLow = greaterLow - lesserNumberMantissaLow; - if (diffLow > greaterLow) + if (lesserNumberMantissaLow > greaterLow) --diffHigh; int msbIdx = _findMSB(diffHigh); @@ -291,11 +295,7 @@ inline uint64_t subMantissas128NormalizeResult(const uint64_t greaterNumberManti int shiftAmount = msbIdx - TargetMSB; resultExp += shiftAmount; - if (shiftAmount > 0) - { - diffHigh >>= shiftAmount; - } - else if (shiftAmount < 0) + if (shiftAmount < 0) { shiftAmount = -shiftAmount; diffHigh <<= shiftAmount; diff --git a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl index 61055305da..d97438d218 100644 --- a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl +++ b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl @@ -6,7 +6,6 @@ #include - namespace nbl { namespace hlsl From ff5c747dc01e4d2e0baf6fb62f823d268980d355 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Tue, 8 Oct 2024 11:25:29 -0700 Subject: [PATCH 089/358] Updated examples --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 3912137730..0b39d2d312 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 
391213773098baf46552d3e4959ae744c9c973e5 +Subproject commit 0b39d2d31233554e9dfa818b647119649c7ba065 From 3445f6123f0dce03af132311f49b63bbd1072f10 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Tue, 8 Oct 2024 22:27:39 +0330 Subject: [PATCH 090/358] resolve reviews Signed-off-by: Ali Cheraghi --- include/nbl/asset/utils/IShaderCompiler.h | 9 +-- src/nbl/asset/utils/IShaderCompiler.cpp | 68 +++++++++++++---------- 2 files changed, 41 insertions(+), 36 deletions(-) diff --git a/include/nbl/asset/utils/IShaderCompiler.h b/include/nbl/asset/utils/IShaderCompiler.h index 3899c12669..bac2379ed6 100644 --- a/include/nbl/asset/utils/IShaderCompiler.h +++ b/include/nbl/asset/utils/IShaderCompiler.h @@ -354,13 +354,6 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted lookupHash = std::hash{}(hash); } - // Making an entry to insert into write cache - inline SEntry(const SEntry& other, dependency_container_t dependencies, core::smart_refctd_ptr spirv, - core::blake3_hash_t uncompressedContentHash, size_t uncompressedSize) - : mainFileContents(other.mainFileContents), compilerArgs(other.compilerArgs), hash(other.hash), - lookupHash(other.lookupHash), dependencies(dependencies), spirv(spirv), - uncompressedContentHash(uncompressedContentHash), uncompressedSize(uncompressedSize) {} - // Needed to get the vector deserialization automatically inline SEntry() {} @@ -375,6 +368,8 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted // Used for late initialization while looking up a cache, so as not to always initialize an entry even if caching was not requested inline SEntry& operator=(SEntry&& other) = default; + void setContent(const asset::ICPUBuffer* uncompressedSpirvBuffer, dependency_container_t&& dependencies); + core::smart_refctd_ptr decompressShader() const; // TODO: make some of these private diff --git a/src/nbl/asset/utils/IShaderCompiler.cpp b/src/nbl/asset/utils/IShaderCompiler.cpp index 20c8f20cfe..2ce889189a 100644 --- a/src/nbl/asset/utils/IShaderCompiler.cpp +++ b/src/nbl/asset/utils/IShaderCompiler.cpp @@ -25,9 +25,6 @@ IShaderCompiler::IShaderCompiler(core::smart_refctd_ptr&& syste m_defaultIncludeFinder = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_system)); } -static void* SzAlloc(ISzAllocPtr p, size_t size) { p = p; return _NBL_ALIGNED_MALLOC(size, _NBL_SIMD_ALIGNMENT); } -static void SzFree(ISzAllocPtr p, void* address) { p = p; _NBL_ALIGNED_FREE(address); } - inline core::smart_refctd_ptr nbl::asset::IShaderCompiler::compileToSPIRV(const std::string_view code, const SCompilerOptions& options) const { CCache::SEntry entry; @@ -58,31 +55,8 @@ inline core::smart_refctd_ptr nbl::asset::IShaderCompiler::compileTo if (options.writeCache) { - auto* spirvBuffer = retVal->getContent(); - size_t propsSize = LZMA_PROPS_SIZE; - size_t destLen = spirvBuffer->getSize() + spirvBuffer->getSize() / 3 + 128; - std::vector compressedSpirv = {}; - compressedSpirv.resize(propsSize + destLen); - - CLzmaEncProps props; - LzmaEncProps_Init(&props); - props.dictSize = 1 << 16; // 64KB - props.writeEndMark = 1; - - ISzAlloc alloc = { SzAlloc, SzFree }; - int res = LzmaEncode( - compressedSpirv.data() + LZMA_PROPS_SIZE, &destLen, - reinterpret_cast(spirvBuffer->getPointer()), spirvBuffer->getSize(), - &props, compressedSpirv.data(), &propsSize, props.writeEndMark, - nullptr, &alloc, &alloc); - - assert(propsSize == LZMA_PROPS_SIZE); - assert(res == SZ_OK); - - auto compressedSpirvBuffer = core::make_smart_refctd_ptr(propsSize + destLen); - 
memcpy(compressedSpirvBuffer->getPointer(), compressedSpirv.data(), compressedSpirvBuffer->getSize()); - - options.writeCache->insert(std::move(SEntry(entry, std::move(dependencies), std::move(compressedSpirvBuffer), spirvBuffer->getContentHash(), spirvBuffer->getSize()))); + entry.setContent(retVal->getContent(), std::move(dependencies)); + options.writeCache->insert(std::move(entry)); } return retVal; } @@ -289,7 +263,10 @@ auto IShaderCompiler::CIncludeFinder::tryIncludeGenerators(const std::string& in core::smart_refctd_ptr IShaderCompiler::CCache::find(const SEntry& mainFile, const IShaderCompiler::CIncludeFinder* finder) const { - return find_impl(mainFile, finder)->decompressShader(); + const auto found = find_impl(mainFile, finder); + if (found==m_container.end()) + return nullptr; + return found->decompressShader(); } IShaderCompiler::CCache::EntrySet::const_iterator IShaderCompiler::CCache::find_impl(const SEntry& mainFile, const IShaderCompiler::CIncludeFinder* finder) const @@ -418,6 +395,39 @@ core::smart_refctd_ptr IShaderCompiler::CCache::deseria return retVal; } +static void* SzAlloc(ISzAllocPtr p, size_t size) { p = p; return _NBL_ALIGNED_MALLOC(size, _NBL_SIMD_ALIGNMENT); } +static void SzFree(ISzAllocPtr p, void* address) { p = p; _NBL_ALIGNED_FREE(address); } + +void nbl::asset::IShaderCompiler::CCache::SEntry::setContent(const asset::ICPUBuffer* uncompressedSpirvBuffer, dependency_container_t&& dependencies) +{ + dependencies = std::move(dependencies); + uncompressedContentHash = uncompressedSpirvBuffer->getContentHash(); + uncompressedSize = uncompressedSpirvBuffer->getSize(); + + size_t propsSize = LZMA_PROPS_SIZE; + size_t destLen = uncompressedSpirvBuffer->getSize() + uncompressedSpirvBuffer->getSize() / 3 + 128; + std::vector compressedSpirv = {}; + compressedSpirv.resize(propsSize + destLen); + + CLzmaEncProps props; + LzmaEncProps_Init(&props); + props.dictSize = 1 << 16; // 64KB + props.writeEndMark = 1; + + ISzAlloc alloc = { SzAlloc, SzFree }; + int res = LzmaEncode( + compressedSpirv.data() + LZMA_PROPS_SIZE, &destLen, + reinterpret_cast(uncompressedSpirvBuffer->getPointer()), uncompressedSpirvBuffer->getSize(), + &props, compressedSpirv.data(), &propsSize, props.writeEndMark, + nullptr, &alloc, &alloc); + + assert(propsSize == LZMA_PROPS_SIZE); + assert(res == SZ_OK); + + spirv = core::make_smart_refctd_ptr(propsSize + destLen); + memcpy(spirv->getPointer(), compressedSpirv.data(), spirv->getSize()); +} + core::smart_refctd_ptr nbl::asset::IShaderCompiler::CCache::SEntry::decompressShader() const { auto uncompressedBuf = core::make_smart_refctd_ptr(uncompressedSize); From f11d61e5fab3e9d7964a95cb9812fc0e407fca99 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Tue, 8 Oct 2024 17:23:58 -0700 Subject: [PATCH 091/358] Updated examples --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 0b39d2d312..879860a22b 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 0b39d2d31233554e9dfa818b647119649c7ba065 +Subproject commit 879860a22b2f2d62f97114dce46c80cefd3a217e From f3961db7ac57ded32e52c4f0581c28e5e0b50f23 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Wed, 9 Oct 2024 12:33:06 +0330 Subject: [PATCH 092/358] resolve reviews Signed-off-by: Ali Cheraghi --- include/nbl/asset/utils/IShaderCompiler.h | 2 +- src/nbl/asset/utils/IShaderCompiler.cpp | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/nbl/asset/utils/IShaderCompiler.h 
b/include/nbl/asset/utils/IShaderCompiler.h index bac2379ed6..ac7ed5eb13 100644 --- a/include/nbl/asset/utils/IShaderCompiler.h +++ b/include/nbl/asset/utils/IShaderCompiler.h @@ -368,7 +368,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted // Used for late initialization while looking up a cache, so as not to always initialize an entry even if caching was not requested inline SEntry& operator=(SEntry&& other) = default; - void setContent(const asset::ICPUBuffer* uncompressedSpirvBuffer, dependency_container_t&& dependencies); + bool setContent(const asset::ICPUBuffer* uncompressedSpirvBuffer, dependency_container_t&& dependencies); core::smart_refctd_ptr decompressShader() const; diff --git a/src/nbl/asset/utils/IShaderCompiler.cpp b/src/nbl/asset/utils/IShaderCompiler.cpp index 2ce889189a..f8c9e545d3 100644 --- a/src/nbl/asset/utils/IShaderCompiler.cpp +++ b/src/nbl/asset/utils/IShaderCompiler.cpp @@ -55,8 +55,8 @@ inline core::smart_refctd_ptr nbl::asset::IShaderCompiler::compileTo if (options.writeCache) { - entry.setContent(retVal->getContent(), std::move(dependencies)); - options.writeCache->insert(std::move(entry)); + if (entry.setContent(retVal->getContent(), std::move(dependencies))) + options.writeCache->insert(std::move(entry)); } return retVal; } @@ -398,7 +398,7 @@ core::smart_refctd_ptr IShaderCompiler::CCache::deseria static void* SzAlloc(ISzAllocPtr p, size_t size) { p = p; return _NBL_ALIGNED_MALLOC(size, _NBL_SIMD_ALIGNMENT); } static void SzFree(ISzAllocPtr p, void* address) { p = p; _NBL_ALIGNED_FREE(address); } -void nbl::asset::IShaderCompiler::CCache::SEntry::setContent(const asset::ICPUBuffer* uncompressedSpirvBuffer, dependency_container_t&& dependencies) +bool nbl::asset::IShaderCompiler::CCache::SEntry::setContent(const asset::ICPUBuffer* uncompressedSpirvBuffer, dependency_container_t&& dependencies) { dependencies = std::move(dependencies); uncompressedContentHash = uncompressedSpirvBuffer->getContentHash(); @@ -421,11 +421,12 @@ void nbl::asset::IShaderCompiler::CCache::SEntry::setContent(const asset::ICPUBu &props, compressedSpirv.data(), &propsSize, props.writeEndMark, nullptr, &alloc, &alloc); - assert(propsSize == LZMA_PROPS_SIZE); - assert(res == SZ_OK); + if (res != SZ_OK || propsSize != LZMA_PROPS_SIZE) return false; spirv = core::make_smart_refctd_ptr(propsSize + destLen); memcpy(spirv->getPointer(), compressedSpirv.data(), spirv->getSize()); + + return true; } core::smart_refctd_ptr nbl::asset::IShaderCompiler::CCache::SEntry::decompressShader() const From eecbbaa7a683dedafae561845ba22145756a89a0 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Wed, 9 Oct 2024 07:08:24 -0700 Subject: [PATCH 093/358] Updated examples --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated/float64_t.hlsl | 20 +++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples_tests b/examples_tests index 879860a22b..3c063f8e3f 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 879860a22b2f2d62f97114dce46c80cefd3a217e +Subproject commit 3c063f8e3f3ef8609f78b6c6da9c693bae344d98 diff --git a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl index cbf1584edd..19f02a5b10 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl @@ -81,7 +81,7 @@ namespace hlsl { if (FlushDenormToZero) { - if(FastMath) + if(!FastMath) { if (tgmath::isNaN(data) || tgmath::isNaN(rhs.data)) return 
bit_cast(ieee754::traits::quietNaN); @@ -113,7 +113,7 @@ namespace hlsl uint64_t lhsSign = ieee754::extractSignPreserveBitPattern(lhsData); uint64_t rhsSign = ieee754::extractSignPreserveBitPattern(rhsData); - if(FastMath) + if(!FastMath) { if (emulated_float64_t_impl::areBothZero(lhsData, rhsData)) { @@ -241,7 +241,7 @@ namespace hlsl uint64_t rhsMantissa = ieee754::extractMantissa(rhsData); int exp = int(lhsBiasedExp + rhsBiasedExp) - ieee754::traits::exponentBias; - if (FastMath) + if (!FastMath) { if (tgmath::isNaN(lhsData) || tgmath::isNaN(rhsData)) return bit_cast(ieee754::traits::quietNaN | sign); @@ -308,7 +308,7 @@ namespace hlsl uint64_t lhsData = emulated_float64_t_impl::flushDenormToZero(lhsBiasedExp, data); uint64_t rhsData = emulated_float64_t_impl::flushDenormToZero(rhsBiasedExp, rhs.data); - if(FastMath) + if(!FastMath) { if (tgmath::isNaN(lhsData) || tgmath::isNaN(rhsData)) return bit_cast(ieee754::traits::quietNaN); @@ -363,7 +363,7 @@ namespace hlsl // relational operators bool operator==(this_t rhs) NBL_CONST_MEMBER_FUNC { - if (FastMath) + if (!FastMath) { if (tgmath::isNaN(data) || tgmath::isNaN(rhs.data)) return false; @@ -379,14 +379,14 @@ namespace hlsl } bool operator!=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { - if (FastMath && (tgmath::isNaN(data) || tgmath::isNaN(rhs.data))) + if (!FastMath && (tgmath::isNaN(data) || tgmath::isNaN(rhs.data))) return false; return !(bit_cast(data) == rhs); } bool operator<(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { - if (FastMath) + if (!FastMath) { if (tgmath::isNaN(data) || tgmath::isNaN(rhs.data)) return false; @@ -407,7 +407,7 @@ namespace hlsl } bool operator>(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { - if (FastMath) + if (!FastMath) { if (tgmath::isNaN(data) || tgmath::isNaN(rhs.data)) return false; @@ -428,14 +428,14 @@ namespace hlsl } bool operator<=(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { - if (FastMath && (tgmath::isNaN(data) || tgmath::isNaN(rhs.data))) + if (!FastMath && (tgmath::isNaN(data) || tgmath::isNaN(rhs.data))) return false; return !(bit_cast(data) > bit_cast(rhs.data)); } bool operator>=(emulated_float64_t rhs) { - if (FastMath && (tgmath::isNaN(data) || tgmath::isNaN(rhs.data))) + if (!FastMath && (tgmath::isNaN(data) || tgmath::isNaN(rhs.data))) return false; return !(bit_cast(data) < bit_cast(rhs.data)); From 684bcc20169a44f7f709354ca7171b58154e7877 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Thu, 10 Oct 2024 10:39:01 -0700 Subject: [PATCH 094/358] Updated examples --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 3c063f8e3f..38282a3784 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 3c063f8e3f3ef8609f78b6c6da9c693bae344d98 +Subproject commit 38282a37847b56b655653832ffe4cae8c3b76863 From 6236fdd809376508e0e46794746269055acbdab1 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Thu, 10 Oct 2024 14:54:10 -0700 Subject: [PATCH 095/358] Fixed ef64 with fast math enabled --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated/float64_t.hlsl | 23 ++++++++++--------- .../gauss_legendre/gauss_legendre.hlsl | 3 --- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/examples_tests b/examples_tests index 38282a3784..fa5a77b1c6 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 38282a37847b56b655653832ffe4cae8c3b76863 +Subproject commit fa5a77b1c65db8b83ad6ba0e38508d29ddc788ac diff --git a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl 
b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl index 19f02a5b10..4f97f28102 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl @@ -115,21 +115,22 @@ namespace hlsl if(!FastMath) { - if (emulated_float64_t_impl::areBothZero(lhsData, rhsData)) - { - if (lhsSign == rhsSign) - return bit_cast(lhsSign); - else - return bit_cast(0ull); - } - if(emulated_float64_t_impl::isZero(lhsData)) - return bit_cast(rhsData); - if (emulated_float64_t_impl::isZero(rhsData)) - return bit_cast(lhsData); if (tgmath::isInf(lhsData)) return bit_cast(ieee754::traits::inf | ieee754::extractSignPreserveBitPattern(max(lhsData, rhsData))); } + if (emulated_float64_t_impl::areBothZero(lhsData, rhsData)) + { + if (lhsSign == rhsSign) + return bit_cast(lhsSign); + else + return bit_cast(0ull); + } + if (emulated_float64_t_impl::isZero(lhsData)) + return bit_cast(rhsData); + if (emulated_float64_t_impl::isZero(rhsData)) + return bit_cast(lhsData); + uint64_t lhsNormMantissa = ieee754::extractNormalizeMantissa(lhsData); uint64_t rhsNormMantissa = ieee754::extractNormalizeMantissa(rhsData); diff --git a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl index 823c368b4d..1eeba76546 100644 --- a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl +++ b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl @@ -30,9 +30,6 @@ struct GaussLegendreIntegration { const float_t xi = GaussLegendreValues::xi(i) * ((end - start) / 2.0f) + ((end + start) / 2.0f); integral = integral + GaussLegendreValues::wi(i) * func(xi); - - float_t a = GaussLegendreValues::xi(i); - float_t b = (end - start) / 2.0f; } return ((end - start) / 2.0) * integral; From b161dbb34a3f2820b48c1a06f49871dc8e3c766c Mon Sep 17 00:00:00 2001 From: Fletterio Date: Fri, 11 Oct 2024 20:49:44 -0300 Subject: [PATCH 096/358] Submodule pointer update --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 8abd881bdf..7818f1d694 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 8abd881bdfed0b15f7a54c5175da915c844621ba +Subproject commit 7818f1d69410db0c47f2e890a3169385df0ad64d From efe14d6da9568d00f83955cf77d18e3d5bff776b Mon Sep 17 00:00:00 2001 From: Fletterio Date: Sat, 12 Oct 2024 18:32:08 -0300 Subject: [PATCH 097/358] Add packing/unpacking utils --- examples_tests | 2 +- include/nbl/builtin/hlsl/workgroup/fft.hlsl | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 7818f1d694..3ad298aec1 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 7818f1d69410db0c47f2e890a3169385df0ad64d +Subproject commit 3ad298aec13203c2986e465d96a681fdb776bf8d diff --git a/include/nbl/builtin/hlsl/workgroup/fft.hlsl b/include/nbl/builtin/hlsl/workgroup/fft.hlsl index 0b16601088..e963e023ea 100644 --- a/include/nbl/builtin/hlsl/workgroup/fft.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/fft.hlsl @@ -160,6 +160,14 @@ uint32_t mirror(uint32_t idx) return (FFT_SIZE - idx) & (FFT_SIZE - 1); } +// When packing real FFTs a common operation is to get `DFT[T]` and `DFT[-T]` to unpack the result of a packed real FFT. 
+// Given an index `idx` into the Nabla-ordered DFT such that `output[idx] = DFT[T]`, this function is such that `output[getNegativeIndex(idx)] = DFT[-T]` +template +uint32_t getNegativeIndex(uint32_t idx) +{ + return getOutputIndex(mirror(getFrequencyIndex(idx))); +} + } //namespace fft // ----------------------------------- End Utils ----------------------------------------------- From 070ac3eaf73025a6eee8ae55c5e3c9ef8340dcce Mon Sep 17 00:00:00 2001 From: Fletterio Date: Sat, 12 Oct 2024 18:36:44 -0300 Subject: [PATCH 098/358] Another packing util --- examples_tests | 2 +- include/nbl/builtin/hlsl/workgroup/fft.hlsl | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 3ad298aec1..31e3bd689d 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 3ad298aec13203c2986e465d96a681fdb776bf8d +Subproject commit 31e3bd689dde469c9fc2a1cf6bc5d4c9e2f1f741 diff --git a/include/nbl/builtin/hlsl/workgroup/fft.hlsl b/include/nbl/builtin/hlsl/workgroup/fft.hlsl index e963e023ea..1e4a0dfc35 100644 --- a/include/nbl/builtin/hlsl/workgroup/fft.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/fft.hlsl @@ -168,6 +168,15 @@ uint32_t getNegativeIndex(uint32_t idx) return getOutputIndex(mirror(getFrequencyIndex(idx))); } +// Util to unpack two values from the packed FFT X + iY - get outputs in the same input arguments, storing x to lo and y to hi +template +void unpack(NBL_CONST_REF_ARG(complex_t) lo, NBL_CONST_REF_ARG(complex_t) hi) +{ + complex_t x = (lo + conj(hi)) * Scalar(0.5); + hi = rotateRight(lo - conj(hi)) * 0.5; + lo = x; +} + } //namespace fft // ----------------------------------- End Utils ----------------------------------------------- From 8caa917b60d6a5ff7511baa1284e19e6e4d6a905 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Tue, 15 Oct 2024 04:47:58 -0700 Subject: [PATCH 099/358] Fixed array_get and array_set --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated/float64_t.hlsl | 24 +++-- .../nbl/builtin/hlsl/emulated/vector_t.hlsl | 95 +++++++++++-------- 3 files changed, 72 insertions(+), 49 deletions(-) diff --git a/examples_tests b/examples_tests index fa5a77b1c6..e4a42f42ef 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit fa5a77b1c65db8b83ad6ba0e38508d29ddc788ac +Subproject commit e4a42f42efe0e0345a337870bae1e64c6357cefd diff --git a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl index 4f97f28102..eec7a27c46 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl @@ -79,6 +79,15 @@ namespace hlsl // arithmetic operators this_t operator+(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { + // TODO: remove + float64_t sum = bit_cast(data) + bit_cast(rhs.data); + uint64_t sumAsUint = bit_cast(sum); + + this_t output2; + output2.data = sumAsUint; + + return output2; + if (FlushDenormToZero) { if(!FastMath) @@ -216,13 +225,13 @@ namespace hlsl emulated_float64_t operator*(emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { // TODO: remove - /*float64_t sum = bit_cast(data) * bit_cast(rhs.data); + float64_t sum = bit_cast(data) * bit_cast(rhs.data); uint64_t sumAsUint = bit_cast(sum); this_t output2; output2.data = sumAsUint; - return output2;*/ + return output2; if(FlushDenormToZero) { @@ -291,13 +300,13 @@ namespace hlsl emulated_float64_t operator/(const emulated_float64_t rhs) NBL_CONST_MEMBER_FUNC { // TODO: remove - /*float64_t sum = bit_cast(data) / 
bit_cast(rhs.data); + float64_t sum = bit_cast(data) / bit_cast(rhs.data); uint64_t sumAsUint = bit_cast(sum); this_t output2; output2.data = sumAsUint; - return output2;*/ + return output2; if (FlushDenormToZero) { @@ -535,18 +544,20 @@ struct static_cast_helper,void static_assert(is_scalar::value); using From = emulated_float64_t; - + static inline To cast(From v) { using ToAsFloat = typename float_of_size::type; using ToAsUint = typename unsigned_integer_of_size::type; + if (emulated_float64_t_impl::isZero(v.data)) + return 0; + if (is_same_v) return To(bit_cast(v.data)); if (is_floating_point::value) { - const int exponent = ieee754::extractExponent(v.data); if (!From::isFastMathSupported) { @@ -558,7 +569,6 @@ struct static_cast_helper,void return bit_cast(ieee754::traits::quietNaN); } - const uint32_t toBitSize = sizeof(To) * 8; const ToAsUint sign = ToAsUint(ieee754::extractSign(v.data) << (toBitSize - 1)); const ToAsUint biasedExponent = ToAsUint(exponent + ieee754::traits::exponentBias) << ieee754::traits::mantissaBitCnt; diff --git a/include/nbl/builtin/hlsl/emulated/vector_t.hlsl b/include/nbl/builtin/hlsl/emulated/vector_t.hlsl index 537171ff3e..768c6b8c85 100644 --- a/include/nbl/builtin/hlsl/emulated/vector_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/vector_t.hlsl @@ -37,7 +37,7 @@ struct _2_component_vec // TODO: avoid code duplication, make it constexpr using TAsUint = typename unsigned_integer_of_size::type; - uint64_t invalidComponentValue = nbl::hlsl::_static_cast(0xdeadbeefbadcaffeull >> (64 - sizeof(T) * 8)); + uint64_t invalidComponentValue = nbl::hlsl::_static_cast(0xdeadbeefbadcaffeull); return nbl::hlsl::bit_cast(invalidComponentValue); } @@ -382,66 +382,79 @@ struct is_valid_emulated_vector is_same_v >; }; -#ifdef __HLSL_VERSION template struct array_get { - T operator()(NBL_CONST_REF_ARG(U) vec, const I ix) + T operator()(NBL_REF_ARG(U) vec, const I ix) { return vec[ix]; } }; -template -struct array_get, TT, I> +template +struct array_get, ComponentType, uint32_t> +{ + ComponentType operator()(NBL_REF_ARG(emulated_vector_t2) vec, const uint32_t ix) + { + return vec.getComponent(ix); + } +}; + +template +struct array_get, ComponentType, uint32_t> { - TT operator()(NBL_CONST_REF_ARG(emulated_vector_t) vec, const I ix) + ComponentType operator()(NBL_REF_ARG(emulated_vector_t3) vec, const uint32_t ix) { return vec.getComponent(ix); } }; -#endif - -//template -//struct array_get -//{ -// T operator()(I index, NBL_CONST_REF_ARG(U) arr) -// { -// return arr[index]; -// } -//}; -// -//template -//struct array_get::component_t, emulated_vector_t, uint32_t> -//{ -// using vec_t = emulated_vector_t; -// -// T operator()(uint32_t index, NBL_CONST_REF_ARG(vec_t) vec) -// { -// return vec.getComponent(index); -// } -//}; - -template + +template +struct array_get, ComponentType, uint32_t> +{ + ComponentType operator()(NBL_REF_ARG(emulated_vector_t4) vec, const uint32_t ix) + { + return vec.getComponent(ix); + } +}; + +#undef DEFINE_EMULATED_VECTOR_ARRAY_GET_SPECIALIZATION + +template struct array_set { - void operator()(I index, NBL_REF_ARG(U) arr, T val) + void operator()(NBL_REF_ARG(U) arr, I index, T val) { arr[index] = val; } }; -// TODO: fix -//template -//struct array_set, uint32_t> -//{ -// using type_t = T; -// -// T operator()(uint32_t index, NBL_CONST_REF_ARG(emulated_vector_t) vec, T value) -// { -// vec.setComponent(index, value); -// } -//}; +template +struct array_set, ComponentType, uint32_t> +{ + void operator()(NBL_REF_ARG(emulated_vector_t2) vec, uint32_t 
index, ComponentType value) + { + vec.setComponent(index, value); + } +}; + +template +struct array_set, ComponentType, uint32_t> +{ + void operator()(NBL_REF_ARG(emulated_vector_t3) vec, uint32_t index, ComponentType value) + { + vec.setComponent(index, value); + } +}; + +template +struct array_set, ComponentType, uint32_t> +{ + void operator()(NBL_REF_ARG(emulated_vector_t4) vec, uint32_t index, ComponentType value) + { + vec.setComponent(index, value); + } +}; namespace impl { From da885753a0b3b8dd345abab81015b153a44b6be6 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Mon, 21 Oct 2024 13:14:06 -0700 Subject: [PATCH 100/358] Saving work --- examples_tests | 2 +- .../transformation_matrix_utils.hlsl | 112 ++++++++++++++++++ .../builtin/hlsl/projection/projection.hlsl | 33 ++++++ include/quaternion.h | 3 +- 4 files changed, 148 insertions(+), 2 deletions(-) create mode 100644 include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl create mode 100644 include/nbl/builtin/hlsl/projection/projection.hlsl diff --git a/examples_tests b/examples_tests index a565619cd2..fedb9c138f 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit a565619cd23300181e48df34562f329bd93c538d +Subproject commit fedb9c138fdd41f5713b9b75fc4eeac19dd5d203 diff --git a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl new file mode 100644 index 0000000000..d50bc16869 --- /dev/null +++ b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl @@ -0,0 +1,112 @@ +#ifndef _NBL_BUILTIN_HLSL_TRANSFORMATION_MATRIX_UTILS_INCLUDED_ +#define _NBL_BUILTIN_HLSL_TRANSFORMATION_MATRIX_UTILS_INCLUDED_ + +#include + +namespace nbl +{ +namespace hlsl +{ + +template +matrix getMatrix3x4As4x4(const matrix& mat) +{ + matrix output; + for (int i = 0; i < 3; ++i) + output[i] = mat[i]; + output[3] = float32_t4(0.0f, 0.0f, 0.0f, 1.0f); + + return output; +} + +// TODO: use portable_float when merged +//! multiplies matrices a and b, 3x4 matrices are treated as 4x4 matrices with 4th row set to (0, 0, 0 ,1) +template +inline matrix concatenateBFollowedByA(const matrix& a, const matrix& b) +{ + const matrix a4x4 = getMatrix3x4As4x4(a); + const matrix b4x4 = getMatrix3x4As4x4(b); + return matrix(mul(a4x4, b4x4)); +} + +template +inline matrix buildCameraLookAtMatrixLH( + const vector& position, + const vector& target, + const vector& upVector) +{ + const vector zaxis = core::normalize(target - position); + const vector xaxis = core::normalize(core::cross(upVector, zaxis)); + const vector yaxis = core::cross(zaxis, xaxis); + + matrix r; + r[0] = vector(xaxis, -dot(xaxis, position)); + r[1] = vector(yaxis, -dot(yaxis, position)); + r[2] = vector(zaxis, -dot(zaxis, position)); + + return r; +} + +float32_t3x4 buildCameraLookAtMatrixRH( + const float32_t3& position, + const float32_t3& target, + const float32_t3& upVector) +{ + const float32_t3 zaxis = core::normalize(position - target); + const float32_t3 xaxis = core::normalize(core::cross(upVector, zaxis)); + const float32_t3 yaxis = core::cross(zaxis, xaxis); + + float32_t3x4 r; + r[0] = float32_t4(xaxis, -dot(xaxis, position)); + r[1] = float32_t4(yaxis, -dot(yaxis, position)); + r[2] = float32_t4(zaxis, -dot(zaxis, position)); + + return r; +} + +// TODO: test, check if there is better implementation +// TODO: move quaternion to nbl::hlsl +// TODO: why NBL_REF_ARG(MatType) doesn't work????? + +//! 
Replaces curent rocation and scale by rotation represented by quaternion `quat`, leaves 4th row and 4th colum unchanged +template +inline void setRotation(matrix& outMat, NBL_CONST_REF_ARG(core::quaternion) quat) +{ + static_assert(N == 3 || N == 4); + + outMat[0] = vector( + 1 - 2 * (quat.y * quat.y + quat.z * quat.z), + 2 * (quat.x * quat.y - quat.z * quat.w), + 2 * (quat.x * quat.z + quat.y * quat.w), + outMat[0][3] + ); + + outMat[1] = vector( + 2 * (quat.x * quat.y + quat.z * quat.w), + 1 - 2 * (quat.x * quat.x + quat.z * quat.z), + 2 * (quat.y * quat.z - quat.x * quat.w), + outMat[1][3] + ); + + outMat[2] = vector( + 2 * (quat.x * quat.z - quat.y * quat.w), + 2 * (quat.y * quat.z + quat.x * quat.w), + 1 - 2 * (quat.x * quat.x + quat.y * quat.y), + outMat[2][3] + ); +} + +template +inline void setTranslation(matrix& outMat, NBL_CONST_REF_ARG(vector) translation) +{ + static_assert(N == 3 || N == 4); + + outMat[0].w = translation.x; + outMat[1].w = translation.y; + outMat[2].w = translation.z; +} + +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/projection/projection.hlsl b/include/nbl/builtin/hlsl/projection/projection.hlsl new file mode 100644 index 0000000000..d973fda25d --- /dev/null +++ b/include/nbl/builtin/hlsl/projection/projection.hlsl @@ -0,0 +1,33 @@ +#ifndef _NBL_BUILTIN_HLSL_PROJECTION_INCLUDED_ +#define _NBL_BUILTIN_HLSL_PROJECTION_INCLUDED_ + +#include + +namespace nbl +{ +namespace hlsl +{ +// TODO: use glm instead for c++ +inline float32_t4x4 buildProjectionMatrixPerspectiveFovLH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar) +{ +#ifndef __HLSL_VERSION + _NBL_DEBUG_BREAK_IF(aspectRatio == 0.f); //division by zero + _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero +#endif + + const float h = core::reciprocal(tanf(fieldOfViewRadians * 0.5f)); + const float w = h / aspectRatio; + + float32_t4x4 m; + m[0] = float32_t4(w, 0.f, 0.f, 0.f); + m[1] = float32_t4(0.f, -h, 0.f, 0.f); + m[2] = float32_t4(0.f, 0.f, zFar / (zFar - zNear), -zNear * zFar / (zFar - zNear)); + m[3] = float32_t4(0.f, 0.f, 1.f, 0.f); + + return m; +} + +} +} + +#endif \ No newline at end of file diff --git a/include/quaternion.h b/include/quaternion.h index c1867235db..a0fd689dc6 100644 --- a/include/quaternion.h +++ b/include/quaternion.h @@ -23,7 +23,8 @@ class matrix3x4SIMD; //! Quaternion class for representing rotations. /** It provides cheap combinations and avoids gimbal locks. Also useful for interpolations. */ -class quaternion : private vectorSIMDf +// TODO: move this to nbl::hlsl +class quaternion : public vectorSIMDf { public: //! 
Default Constructor From ee338359d69dce12321a7c66ebb11f8d28bedb6f Mon Sep 17 00:00:00 2001 From: Fletterio Date: Mon, 21 Oct 2024 20:35:22 -0300 Subject: [PATCH 101/358] QOL update to FFT shuffles --- examples_tests | 2 +- include/nbl/builtin/hlsl/workgroup/fft.hlsl | 60 +++------------------ 2 files changed, 7 insertions(+), 55 deletions(-) diff --git a/examples_tests b/examples_tests index d21495ebe5..2504a93cce 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit d21495ebe554ccd70f418a80499f3f18363134d6 +Subproject commit 2504a93cce2c3c7cf634d6724c0f7f1535ef25ed diff --git a/include/nbl/builtin/hlsl/workgroup/fft.hlsl b/include/nbl/builtin/hlsl/workgroup/fft.hlsl index 1e4a0dfc35..63d367048d 100644 --- a/include/nbl/builtin/hlsl/workgroup/fft.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/fft.hlsl @@ -24,18 +24,14 @@ namespace fft // ---------------------------------- Utils ----------------------------------------------- template -struct exchangeValues; - -template -struct exchangeValues +struct exchangeValues { - static void __call(NBL_REF_ARG(complex_t) lo, NBL_REF_ARG(complex_t) hi, uint32_t threadID, uint32_t stride, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor) + static void __call(NBL_REF_ARG(complex_t) lo, NBL_REF_ARG(complex_t) hi, uint32_t threadID, uint32_t stride, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor) { const bool topHalf = bool(threadID & stride); - // Pack two halves into a single uint32_t - uint32_t toExchange = bit_cast(topHalf ? float16_t2 (lo.real(), lo.imag()) : float16_t2 (hi.real(), hi.imag())); - shuffleXor(toExchange, stride, sharedmemAdaptor); - float16_t2 exchanged = bit_cast(toExchange); + // Pack into float vector because ternary operator does not support structs + vector exchanged = topHalf ? vector(lo.real(), lo.imag()) : vector(hi.real(), hi.imag()); + shuffleXor >(exchanged, stride, sharedmemAdaptor); if (topHalf) { lo.real(exchanged.x); @@ -45,51 +41,7 @@ struct exchangeValues { hi.real(exchanged.x); lo.imag(exchanged.y); - } - } -}; - -template -struct exchangeValues -{ - static void __call(NBL_REF_ARG(complex_t) lo, NBL_REF_ARG(complex_t) hi, uint32_t threadID, uint32_t stride, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor) - { - const bool topHalf = bool(threadID & stride); - // pack into `float32_t2` because ternary operator doesn't support structs - float32_t2 exchanged = topHalf ? float32_t2(lo.real(), lo.imag()) : float32_t2(hi.real(), hi.imag()); - shuffleXor(exchanged, stride, sharedmemAdaptor); - if (topHalf) - { - lo.real(exchanged.x); - lo.imag(exchanged.y); } - else - { - hi.real(exchanged.x); - hi.imag(exchanged.y); - } - } -}; - -template -struct exchangeValues -{ - static void __call(NBL_REF_ARG(complex_t) lo, NBL_REF_ARG(complex_t) hi, uint32_t threadID, uint32_t stride, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor) - { - const bool topHalf = bool(threadID & stride); - // pack into `float64_t2` because ternary operator doesn't support structs - float64_t2 exchanged = topHalf ? 
float64_t2(lo.real(), lo.imag()) : float64_t2(hi.real(), hi.imag()); - shuffleXor(exchanged, stride, sharedmemAdaptor); - if (topHalf) - { - lo.real(exchanged.x); - lo.imag(exchanged.y); - } - else - { - hi.real(exchanged.x); - hi.imag(exchanged.y); - } } }; @@ -170,7 +122,7 @@ uint32_t getNegativeIndex(uint32_t idx) // Util to unpack two values from the packed FFT X + iY - get outputs in the same input arguments, storing x to lo and y to hi template -void unpack(NBL_CONST_REF_ARG(complex_t) lo, NBL_CONST_REF_ARG(complex_t) hi) +void unpack(NBL_REF_ARG(complex_t) lo, NBL_REF_ARG(complex_t) hi) { complex_t x = (lo + conj(hi)) * Scalar(0.5); hi = rotateRight(lo - conj(hi)) * 0.5; From 8e25bb1afb1d12578e63f0f2234b6d4bd56e01db Mon Sep 17 00:00:00 2001 From: Fletterio Date: Tue, 22 Oct 2024 21:39:43 -0300 Subject: [PATCH 102/358] Submodule pointer update --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 2504a93cce..5cf4ef6999 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 2504a93cce2c3c7cf634d6724c0f7f1535ef25ed +Subproject commit 5cf4ef6999e468b06383716e01f4329f33d7721f From ab77ca41a310ecbc105302172ee95bc21d13c02e Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Wed, 23 Oct 2024 13:51:08 -0700 Subject: [PATCH 103/358] Removed SIMD matrix classes --- examples_tests | 2 +- include/matrix3x4SIMD.h | 263 ---------- include/matrix3x4SIMD_impl.h | 470 ------------------ include/matrix4SIMD.h | 385 -------------- include/matrix4SIMD_impl.h | 299 ----------- .../nbl/builtin/hlsl/glsl_compat/math.hlsl | 65 +++ .../hlsl/math/quaternion/quaternion.hlsl | 74 +++ .../hlsl/math/quaternion/quaternion_impl.hlsl | 25 + 8 files changed, 165 insertions(+), 1418 deletions(-) delete mode 100644 include/matrix3x4SIMD.h delete mode 100644 include/matrix3x4SIMD_impl.h delete mode 100644 include/matrix4SIMD.h delete mode 100644 include/matrix4SIMD_impl.h create mode 100644 include/nbl/builtin/hlsl/glsl_compat/math.hlsl create mode 100644 include/nbl/builtin/hlsl/math/quaternion/quaternion.hlsl create mode 100644 include/nbl/builtin/hlsl/math/quaternion/quaternion_impl.hlsl diff --git a/examples_tests b/examples_tests index fedb9c138f..6b45fecf8c 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit fedb9c138fdd41f5713b9b75fc4eeac19dd5d203 +Subproject commit 6b45fecf8c64e8b11fef8c2b3a7e5618edaca68d diff --git a/include/matrix3x4SIMD.h b/include/matrix3x4SIMD.h deleted file mode 100644 index d52f305cec..0000000000 --- a/include/matrix3x4SIMD.h +++ /dev/null @@ -1,263 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef __NBL_MATRIX3X4SIMD_H_INCLUDED__ -#define __NBL_MATRIX3X4SIMD_H_INCLUDED__ - -#include "vectorSIMD.h" -#include "quaternion.h" - -namespace nbl::core -{ - -class matrix4x3; - -#define _NBL_MATRIX_ALIGNMENT _NBL_SIMD_ALIGNMENT -static_assert(_NBL_MATRIX_ALIGNMENT>=_NBL_VECTOR_ALIGNMENT,"Matrix must be equally or more aligned than vector!"); - -//! 
Equivalent of GLSL's mat4x3 -class matrix3x4SIMD// : private AllocationOverrideBase<_NBL_MATRIX_ALIGNMENT> EBO inheritance problem w.r.t `rows[3]` -{ - public: - _NBL_STATIC_INLINE_CONSTEXPR uint32_t VectorCount = 3u; - vectorSIMDf rows[VectorCount]; - - explicit matrix3x4SIMD( const vectorSIMDf& _r0 = vectorSIMDf(1.f, 0.f, 0.f, 0.f), - const vectorSIMDf& _r1 = vectorSIMDf(0.f, 1.f, 0.f, 0.f), - const vectorSIMDf& _r2 = vectorSIMDf(0.f, 0.f, 1.f, 0.f)) : rows{_r0, _r1, _r2} - { - } - - matrix3x4SIMD( float _a00, float _a01, float _a02, float _a03, - float _a10, float _a11, float _a12, float _a13, - float _a20, float _a21, float _a22, float _a23) - : matrix3x4SIMD(vectorSIMDf(_a00, _a01, _a02, _a03), - vectorSIMDf(_a10, _a11, _a12, _a13), - vectorSIMDf(_a20, _a21, _a22, _a23)) - { - } - - explicit matrix3x4SIMD(const float* const _data) - { - if (!_data) - return; - for (size_t i = 0u; i < VectorCount; ++i) - rows[i] = vectorSIMDf(_data + 4*i); - } - matrix3x4SIMD(const float* const _data, bool ALIGNED) - { - if (!_data) - return; - for (size_t i = 0u; i < VectorCount; ++i) - rows[i] = vectorSIMDf(_data + 4*i, ALIGNED); - } - - float* pointer() { return rows[0].pointer; } - const float* pointer() const { return rows[0].pointer; } - - inline matrix3x4SIMD& set(const matrix4x3& _retarded); - inline matrix4x3 getAsRetardedIrrlichtMatrix() const; - - static inline matrix3x4SIMD concatenateBFollowedByA(const matrix3x4SIMD& _a, const matrix3x4SIMD& _b); - - static inline matrix3x4SIMD concatenateBFollowedByAPrecisely(const matrix3x4SIMD& _a, const matrix3x4SIMD& _b); - - inline matrix3x4SIMD& concatenateAfter(const matrix3x4SIMD& _other) - { - return *this = concatenateBFollowedByA(*this, _other); - } - - inline matrix3x4SIMD& concatenateBefore(const matrix3x4SIMD& _other) - { - return *this = concatenateBFollowedByA(_other, *this); - } - - inline matrix3x4SIMD& concatenateAfterPrecisely(const matrix3x4SIMD& _other) - { - return *this = concatenateBFollowedByAPrecisely(*this, _other); - } - - inline matrix3x4SIMD& concatenateBeforePrecisely(const matrix3x4SIMD& _other) - { - return *this = concatenateBFollowedByAPrecisely(_other, *this); - } - - inline bool operator==(const matrix3x4SIMD& _other) - { - return !(*this != _other); - } - - inline bool operator!=(const matrix3x4SIMD& _other); - - - inline matrix3x4SIMD operator-() const - { - matrix3x4SIMD retval; - retval.rows[0] = -rows[0]; - retval.rows[1] = -rows[1]; - retval.rows[2] = -rows[2]; - return retval; - } - - - inline matrix3x4SIMD& operator+=(const matrix3x4SIMD& _other); - inline matrix3x4SIMD operator+(const matrix3x4SIMD& _other) const - { - matrix3x4SIMD retval(*this); - return retval += _other; - } - - inline matrix3x4SIMD& operator-=(const matrix3x4SIMD& _other); - inline matrix3x4SIMD operator-(const matrix3x4SIMD& _other) const - { - matrix3x4SIMD retval(*this); - return retval -= _other; - } - - inline matrix3x4SIMD& operator*=(float _scalar); - inline matrix3x4SIMD operator*(float _scalar) const - { - matrix3x4SIMD retval(*this); - return retval *= _scalar; - } - - inline matrix3x4SIMD& setTranslation(const vectorSIMDf& _translation) - { - // no faster way of doing it? 
- rows[0].w = _translation.x; - rows[1].w = _translation.y; - rows[2].w = _translation.z; - return *this; - } - inline vectorSIMDf getTranslation() const; - inline vectorSIMDf getTranslation3D() const; - - inline matrix3x4SIMD& setScale(const vectorSIMDf& _scale); - - inline vectorSIMDf getScale() const; - - inline void transformVect(vectorSIMDf& _out, const vectorSIMDf& _in) const; - inline void transformVect(vectorSIMDf& _in_out) const - { - transformVect(_in_out, _in_out); - } - - inline void pseudoMulWith4x1(vectorSIMDf& _out, const vectorSIMDf& _in) const; - inline void pseudoMulWith4x1(vectorSIMDf& _in_out) const - { - pseudoMulWith4x1(_in_out,_in_out); - } - - inline void mulSub3x3WithNx1(vectorSIMDf& _out, const vectorSIMDf& _in) const; - inline void mulSub3x3WithNx1(vectorSIMDf& _in_out) const - { - mulSub3x3WithNx1(_in_out, _in_out); - } - - inline static matrix3x4SIMD buildCameraLookAtMatrixLH( - const vectorSIMDf& position, - const vectorSIMDf& target, - const vectorSIMDf& upVector); - inline static matrix3x4SIMD buildCameraLookAtMatrixRH( - const vectorSIMDf& position, - const vectorSIMDf& target, - const vectorSIMDf& upVector); - - inline matrix3x4SIMD& setRotation(const quaternion& _quat); - - inline matrix3x4SIMD& setScaleRotationAndTranslation( const vectorSIMDf& _scale, - const quaternion& _quat, - const vectorSIMDf& _translation); - - inline vectorSIMDf getPseudoDeterminant() const - { - vectorSIMDf tmp; - return determinant_helper(tmp); - } - - inline bool getInverse(matrix3x4SIMD& _out) const; - bool makeInverse() - { - matrix3x4SIMD tmp; - - if (getInverse(tmp)) - { - *this = tmp; - return true; - } - return false; - } - - // - inline bool getSub3x3InverseTranspose(matrix3x4SIMD& _out) const; - - // - inline bool getSub3x3InverseTransposePacked(float outRows[9]) const - { - matrix3x4SIMD tmp; - if (!getSub3x3InverseTranspose(tmp)) - return false; - - float* _out = outRows; - for (auto i=0; i<3; i++) - { - const auto& row = tmp.rows[i]; - for (auto j=0; j<3; j++) - *(_out++) = row[j]; - } - - return true; - } - - // - inline core::matrix3x4SIMD getSub3x3TransposeCofactors() const; - - // - inline void setTransformationCenter(const vectorSIMDf& _center, const vectorSIMDf& _translation); - - // - static inline matrix3x4SIMD buildAxisAlignedBillboard( - const vectorSIMDf& camPos, - const vectorSIMDf& center, - const vectorSIMDf& translation, - const vectorSIMDf& axis, - const vectorSIMDf& from); - - - // - float& operator()(size_t _i, size_t _j) { return rows[_i].pointer[_j]; } - const float& operator()(size_t _i, size_t _j) const { return rows[_i].pointer[_j]; } - - // - inline const vectorSIMDf& operator[](size_t _rown) const { return rows[_rown]; } - inline vectorSIMDf& operator[](size_t _rown) { return rows[_rown]; } - - private: - static inline vectorSIMDf doJob(const __m128& a, const matrix3x4SIMD& _mtx); - - // really need that dvec<2> or wider - inline __m128d halfRowAsDouble(size_t _n, bool _0) const; - static inline __m128d doJob_d(const __m128d& _a0, const __m128d& _a1, const matrix3x4SIMD& _mtx, bool _xyHalf); - - vectorSIMDf determinant_helper(vectorSIMDf& r1crossr2) const - { - r1crossr2 = core::cross(rows[1], rows[2]); - return core::dot(rows[0], r1crossr2); - } -}; - -inline matrix3x4SIMD concatenateBFollowedByA(const matrix3x4SIMD& _a, const matrix3x4SIMD& _b) -{ - return matrix3x4SIMD::concatenateBFollowedByA(_a, _b); -} -/* -inline matrix3x4SIMD concatenateBFollowedByAPrecisely(const matrix3x4SIMD& _a, const matrix3x4SIMD& _b) -{ - return 
matrix3x4SIMD::concatenateBFollowedByAPrecisely(_a, _b); -} -*/ - -} - -#endif diff --git a/include/matrix3x4SIMD_impl.h b/include/matrix3x4SIMD_impl.h deleted file mode 100644 index 0e9022efd0..0000000000 --- a/include/matrix3x4SIMD_impl.h +++ /dev/null @@ -1,470 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef _NBL_MATRIX3X4SIMD_IMPL_H_INCLUDED_ -#define _NBL_MATRIX3X4SIMD_IMPL_H_INCLUDED_ - -#include "matrix3x4SIMD.h" -#include "nbl/core/math/glslFunctions.tcc" - -namespace nbl::core -{ - -// TODO: move to another implementation header -inline quaternion::quaternion(const matrix3x4SIMD& m) -{ - const vectorSIMDf one(1.f); - auto Qx = m.rows[0].xxxx()^vectorSIMDu32(0,0,0x80000000u,0x80000000u); - auto Qy = m.rows[1].yyyy()^vectorSIMDu32(0,0x80000000u,0,0x80000000u); - auto Qz = m.rows[2].zzzz()^vectorSIMDu32(0,0x80000000u,0x80000000u,0); - - auto tmp = one+Qx+Qy+Qz; - auto invscales = inversesqrt(tmp)*0.5f; - auto scales = tmp*invscales*0.5f; - - // TODO: speed this up - if (tmp.x > 0.0f) - { - X = (m(2, 1) - m(1, 2)) * invscales.x; - Y = (m(0, 2) - m(2, 0)) * invscales.x; - Z = (m(1, 0) - m(0, 1)) * invscales.x; - W = scales.x; - } - else - { - if (tmp.y>0.f) - { - X = scales.y; - Y = (m(0, 1) + m(1, 0)) * invscales.y; - Z = (m(2, 0) + m(0, 2)) * invscales.y; - W = (m(2, 1) - m(1, 2)) * invscales.y; - } - else if (tmp.z>0.f) - { - X = (m(0, 1) + m(1, 0)) * invscales.z; - Y = scales.z; - Z = (m(1, 2) + m(2, 1)) * invscales.z; - W = (m(0, 2) - m(2, 0)) * invscales.z; - } - else - { - X = (m(0, 2) + m(2, 0)) * invscales.w; - Y = (m(1, 2) + m(2, 1)) * invscales.w; - Z = scales.w; - W = (m(1, 0) - m(0, 1)) * invscales.w; - } - } - - *this = normalize(*this); -} - -inline bool matrix3x4SIMD::operator!=(const matrix3x4SIMD& _other) -{ - for (size_t i = 0u; i < VectorCount; ++i) - if ((rows[i] != _other.rows[i]).any()) - return true; - return false; -} - -inline matrix3x4SIMD& matrix3x4SIMD::operator+=(const matrix3x4SIMD& _other) -{ - for (size_t i = 0u; i < VectorCount; ++i) - rows[i] += _other.rows[i]; - return *this; -} -inline matrix3x4SIMD& matrix3x4SIMD::operator-=(const matrix3x4SIMD& _other) -{ - for (size_t i = 0u; i < VectorCount; ++i) - rows[i] -= _other.rows[i]; - return *this; -} -inline matrix3x4SIMD& matrix3x4SIMD::operator*=(float _scalar) -{ - for (size_t i = 0u; i < VectorCount; ++i) - rows[i] *= _scalar; - return *this; -} - -#ifdef __NBL_COMPILE_WITH_SSE3 -#define BROADCAST32(fpx) _MM_SHUFFLE(fpx, fpx, fpx, fpx) -#define BUILD_XORMASKF(_x_, _y_, _z_, _w_) _mm_setr_epi32(_x_ ? 0x80000000u:0x0u, _y_ ? 0x80000000u:0x0u, _z_ ? 0x80000000u:0x0u, _w_ ? 
0x80000000u:0x0u) -#define BUILD_MASKF(_x_, _y_, _z_, _w_) _mm_setr_epi32(_x_*0xffffffff, _y_*0xffffffff, _z_*0xffffffff, _w_*0xffffffff) - -inline matrix3x4SIMD matrix3x4SIMD::concatenateBFollowedByA(const matrix3x4SIMD& _a, const matrix3x4SIMD& _b) -{ -#ifdef _NBL_DEBUG - assert(is_aligned_to(&_a, _NBL_SIMD_ALIGNMENT)); - assert(is_aligned_to(&_b, _NBL_SIMD_ALIGNMENT)); -#endif // _NBL_DEBUG - __m128 r0 = _a.rows[0].getAsRegister(); - __m128 r1 = _a.rows[1].getAsRegister(); - __m128 r2 = _a.rows[2].getAsRegister(); - - matrix3x4SIMD out; - out.rows[0] = matrix3x4SIMD::doJob(r0, _b); - out.rows[1] = matrix3x4SIMD::doJob(r1, _b); - out.rows[2] = matrix3x4SIMD::doJob(r2, _b); - - return out; -} - -inline matrix3x4SIMD matrix3x4SIMD::concatenateBFollowedByAPrecisely(const matrix3x4SIMD& _a, const matrix3x4SIMD& _b) -{ - __m128d r00 = _a.halfRowAsDouble(0u, true); - __m128d r01 = _a.halfRowAsDouble(0u, false); - __m128d r10 = _a.halfRowAsDouble(1u, true); - __m128d r11 = _a.halfRowAsDouble(1u, false); - __m128d r20 = _a.halfRowAsDouble(2u, true); - __m128d r21 = _a.halfRowAsDouble(2u, false); - - matrix3x4SIMD out; - - const __m128i mask0011 = BUILD_MASKF(0, 0, 1, 1); - - __m128 second = _mm_cvtpd_ps(matrix3x4SIMD::doJob_d(r00, r01, _b, false)); - out.rows[0] = vectorSIMDf(_mm_cvtpd_ps(matrix3x4SIMD::doJob_d(r00, r01, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister()); - - second = _mm_cvtpd_ps(matrix3x4SIMD::doJob_d(r10, r11, _b, false)); - out.rows[1] = vectorSIMDf(_mm_cvtpd_ps(matrix3x4SIMD::doJob_d(r10, r11, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister()); - - second = _mm_cvtpd_ps(matrix3x4SIMD::doJob_d(r20, r21, _b, false)); - out.rows[2] = vectorSIMDf(_mm_cvtpd_ps(matrix3x4SIMD::doJob_d(r20, r21, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister()); - - return out; -} - -inline vectorSIMDf matrix3x4SIMD::getTranslation() const -{ - __m128 xmm0 = _mm_unpackhi_ps(rows[0].getAsRegister(), rows[1].getAsRegister()); // (0z,1z,0w,1w) - __m128 xmm1 = _mm_unpackhi_ps(rows[2].getAsRegister(), _mm_setr_ps(0.f, 0.f, 0.f, 1.f)); // (2z,3z,2w,3w) - __m128 xmm2 = _mm_movehl_ps(xmm1, xmm0);// (0w,1w,2w,3w) - - return xmm2; -} -inline vectorSIMDf matrix3x4SIMD::getTranslation3D() const -{ - __m128 xmm0 = _mm_unpackhi_ps(rows[0].getAsRegister(), rows[1].getAsRegister()); // (0z,1z,0w,1w) - __m128 xmm1 = _mm_unpackhi_ps(rows[2].getAsRegister(), _mm_setzero_ps()); // (2z,0,2w,0) - __m128 xmm2 = _mm_movehl_ps(xmm1, xmm0);// (0w,1w,2w,0) - - return xmm2; -} - -inline matrix3x4SIMD& matrix3x4SIMD::setScale(const core::vectorSIMDf& _scale) -{ - const vectorSIMDu32 mask0001 = vectorSIMDu32(BUILD_MASKF(0, 0, 0, 1)); - const vectorSIMDu32 mask0010 = vectorSIMDu32(BUILD_MASKF(0, 0, 1, 0)); - const vectorSIMDu32 mask0100 = vectorSIMDu32(BUILD_MASKF(0, 1, 0, 0)); - const vectorSIMDu32 mask1000 = vectorSIMDu32(BUILD_MASKF(1, 0, 0, 0)); - - const vectorSIMDu32& scaleAlias = reinterpret_cast(_scale); - - vectorSIMDu32& rowAlias0 = reinterpret_cast(rows[0]); - vectorSIMDu32& rowAlias1 = reinterpret_cast(rows[1]); - vectorSIMDu32& rowAlias2 = reinterpret_cast(rows[2]); - rowAlias0 = (scaleAlias & reinterpret_cast(mask1000)) | (rowAlias0 & reinterpret_cast(mask0001)); - rowAlias1 = (scaleAlias & reinterpret_cast(mask0100)) | (rowAlias1 & reinterpret_cast(mask0001)); - rowAlias2 = (scaleAlias & reinterpret_cast(mask0010)) | (rowAlias2 & 
reinterpret_cast(mask0001)); - - return *this; -} - -inline core::vectorSIMDf matrix3x4SIMD::getScale() const -{ - // xmm4-7 will now become columuns of B - __m128 xmm4 = rows[0].getAsRegister(); - __m128 xmm5 = rows[1].getAsRegister(); - __m128 xmm6 = rows[2].getAsRegister(); - __m128 xmm7 = _mm_setzero_ps(); - // g==0 - __m128 xmm0 = _mm_unpacklo_ps(xmm4, xmm5); - __m128 xmm1 = _mm_unpacklo_ps(xmm6, xmm7); // (2x,g,2y,g) - __m128 xmm2 = _mm_unpackhi_ps(xmm4, xmm5); - __m128 xmm3 = _mm_unpackhi_ps(xmm6, xmm7); // (2z,g,2w,g) - xmm4 = _mm_movelh_ps(xmm1, xmm0); //(0x,1x,2x,g) - xmm5 = _mm_movehl_ps(xmm1, xmm0); - xmm6 = _mm_movelh_ps(xmm3, xmm2); //(0z,1z,2z,g) - - // See http://www.robertblum.com/articles/2005/02/14/decomposing-matrices - // We have to do the full calculation. - xmm0 = _mm_mul_ps(xmm4, xmm4);// column 0 squared - xmm1 = _mm_mul_ps(xmm5, xmm5);// column 1 squared - xmm2 = _mm_mul_ps(xmm6, xmm6);// column 2 squared - xmm4 = _mm_hadd_ps(xmm0, xmm1); - xmm5 = _mm_hadd_ps(xmm2, xmm7); - xmm6 = _mm_hadd_ps(xmm4, xmm5); - - return _mm_sqrt_ps(xmm6); -} - -inline void matrix3x4SIMD::transformVect(vectorSIMDf& _out, const vectorSIMDf& _in) const -{ - vectorSIMDf r0 = rows[0] * _in, - r1 = rows[1] * _in, - r2 = rows[2] * _in; - - _out = - _mm_hadd_ps( - _mm_hadd_ps(r0.getAsRegister(), r1.getAsRegister()), - _mm_hadd_ps(r2.getAsRegister(), _mm_set1_ps(0.25f)) - ); -} - -inline void matrix3x4SIMD::pseudoMulWith4x1(vectorSIMDf& _out, const vectorSIMDf& _in) const -{ - __m128i mask1110 = BUILD_MASKF(1, 1, 1, 0); - _out = (_in & mask1110) | _mm_castps_si128(vectorSIMDf(0.f, 0.f, 0.f, 1.f).getAsRegister()); - transformVect(_out); -} - -inline void matrix3x4SIMD::mulSub3x3WithNx1(vectorSIMDf& _out, const vectorSIMDf& _in) const -{ - auto maskedIn = _in & BUILD_MASKF(1, 1, 1, 0); - vectorSIMDf r0 = rows[0] * maskedIn, - r1 = rows[1] * maskedIn, - r2 = rows[2] * maskedIn; - - _out = - _mm_hadd_ps( - _mm_hadd_ps(r0.getAsRegister(), r1.getAsRegister()), - _mm_hadd_ps(r2.getAsRegister(), _mm_setzero_ps()) - ); -} - - -inline matrix3x4SIMD matrix3x4SIMD::buildCameraLookAtMatrixLH( - const core::vectorSIMDf& position, - const core::vectorSIMDf& target, - const core::vectorSIMDf& upVector) -{ - const core::vectorSIMDf zaxis = core::normalize(target - position); - const core::vectorSIMDf xaxis = core::normalize(core::cross(upVector, zaxis)); - const core::vectorSIMDf yaxis = core::cross(zaxis, xaxis); - - matrix3x4SIMD r; - r.rows[0] = xaxis; - r.rows[1] = yaxis; - r.rows[2] = zaxis; - r.rows[0].w = -dot(xaxis, position)[0]; - r.rows[1].w = -dot(yaxis, position)[0]; - r.rows[2].w = -dot(zaxis, position)[0]; - - return r; -} -inline matrix3x4SIMD matrix3x4SIMD::buildCameraLookAtMatrixRH( - const core::vectorSIMDf& position, - const core::vectorSIMDf& target, - const core::vectorSIMDf& upVector) -{ - const core::vectorSIMDf zaxis = core::normalize(position - target); - const core::vectorSIMDf xaxis = core::normalize(core::cross(upVector, zaxis)); - const core::vectorSIMDf yaxis = core::cross(zaxis, xaxis); - - matrix3x4SIMD r; - r.rows[0] = xaxis; - r.rows[1] = yaxis; - r.rows[2] = zaxis; - r.rows[0].w = -dot(xaxis, position)[0]; - r.rows[1].w = -dot(yaxis, position)[0]; - r.rows[2].w = -dot(zaxis, position)[0]; - - return r; -} - -inline matrix3x4SIMD& matrix3x4SIMD::setRotation(const core::quaternion& _quat) -{ - const vectorSIMDu32 mask0001 = vectorSIMDu32(BUILD_MASKF(0, 0, 0, 1)); - const __m128i mask1110 = BUILD_MASKF(1, 1, 1, 0); - - const core::vectorSIMDf& quat = reinterpret_cast(_quat); - 
rows[0] = ((quat.yyyy() * ((quat.yxwx() & mask1110) * vectorSIMDf(2.f))) + (quat.zzzz() * (quat.zwxx() & mask1110) * vectorSIMDf(2.f, -2.f, 2.f, 0.f))) | (reinterpret_cast(rows[0]) & (mask0001)); - rows[0].x = 1.f - rows[0].x; - - rows[1] = ((quat.zzzz() * ((quat.wzyx() & mask1110) * vectorSIMDf(2.f))) + (quat.xxxx() * (quat.yxwx() & mask1110) * vectorSIMDf(2.f, 2.f, -2.f, 0.f))) | (reinterpret_cast(rows[1]) & (mask0001)); - rows[1].y = 1.f - rows[1].y; - - rows[2] = ((quat.xxxx() * ((quat.zwxx() & mask1110) * vectorSIMDf(2.f))) + (quat.yyyy() * (quat.wzyx() & mask1110) * vectorSIMDf(-2.f, 2.f, 2.f, 0.f))) | (reinterpret_cast(rows[2]) & (mask0001)); - rows[2].z = 1.f - rows[2].z; - - return *this; -} - -inline matrix3x4SIMD& matrix3x4SIMD::setScaleRotationAndTranslation(const vectorSIMDf& _scale, const core::quaternion& _quat, const vectorSIMDf& _translation) -{ - const __m128i mask1110 = BUILD_MASKF(1, 1, 1, 0); - - const vectorSIMDf& quat = reinterpret_cast(_quat); - const vectorSIMDf dblScale = (_scale * 2.f) & mask1110; - - vectorSIMDf mlt = dblScale ^ BUILD_XORMASKF(0, 1, 0, 0); - rows[0] = ((quat.yyyy() * ((quat.yxwx() & mask1110) * dblScale)) + (quat.zzzz() * (quat.zwxx() & mask1110) * mlt)); - rows[0].x = _scale.x - rows[0].x; - - mlt = dblScale ^ BUILD_XORMASKF(0, 0, 1, 0); - rows[1] = ((quat.zzzz() * ((quat.wzyx() & mask1110) * dblScale)) + (quat.xxxx() * (quat.yxwx() & mask1110) * mlt)); - rows[1].y = _scale.y - rows[1].y; - - mlt = dblScale ^ BUILD_XORMASKF(1, 0, 0, 0); - rows[2] = ((quat.xxxx() * ((quat.zwxx() & mask1110) * dblScale)) + (quat.yyyy() * (quat.wzyx() & mask1110) * mlt)); - rows[2].z = _scale.z - rows[2].z; - - setTranslation(_translation); - - return *this; -} - - -inline bool matrix3x4SIMD::getInverse(matrix3x4SIMD& _out) const //! SUBOPTIMAL - OPTIMIZE! 
-{ - auto translation = getTranslation(); - // `tmp` will have columns in its `rows` - core::matrix4SIMD tmp; - auto* cols = tmp.rows; - if (!getSub3x3InverseTranspose(reinterpret_cast(tmp))) - return false; - - // find inverse post-translation - cols[3] = -cols[0]*translation.xxxx()-cols[1]*translation.yyyy()-cols[2]*translation.zzzz(); - - // columns into rows - _out = transpose(tmp).extractSub3x4(); - - return true; -} - -inline bool matrix3x4SIMD::getSub3x3InverseTranspose(core::matrix3x4SIMD& _out) const -{ - vectorSIMDf r1crossr2; - const vectorSIMDf d = determinant_helper(r1crossr2); - if (core::iszero(d.x, FLT_MIN)) - return false; - auto rcp = core::reciprocal(d); - - // matrix of cofactors * 1/det - _out = getSub3x3TransposeCofactors(); - _out.rows[0] *= rcp; - _out.rows[1] *= rcp; - _out.rows[2] *= rcp; - - return true; -} - -inline core::matrix3x4SIMD matrix3x4SIMD::getSub3x3TransposeCofactors() const -{ - core::matrix3x4SIMD _out; - _out.rows[0] = core::cross(rows[1], rows[2]); - _out.rows[1] = core::cross(rows[2], rows[0]); - _out.rows[2] = core::cross(rows[0], rows[1]); - return _out; -} - -// TODO: Double check this!- -inline void matrix3x4SIMD::setTransformationCenter(const core::vectorSIMDf& _center, const core::vectorSIMDf& _translation) -{ - core::vectorSIMDf r0 = rows[0] * _center; - core::vectorSIMDf r1 = rows[1] * _center; - core::vectorSIMDf r2 = rows[2] * _center; - core::vectorSIMDf r3(0.f, 0.f, 0.f, 1.f); - - __m128 col3 = _mm_hadd_ps(_mm_hadd_ps(r0.getAsRegister(), r1.getAsRegister()), _mm_hadd_ps(r2.getAsRegister(), r3.getAsRegister())); - const vectorSIMDf vcol3 = _center - _translation - col3; - - for (size_t i = 0u; i < VectorCount; ++i) - rows[i].w = vcol3.pointer[i]; -} - - -// TODO: Double check this! -inline matrix3x4SIMD matrix3x4SIMD::buildAxisAlignedBillboard( - const core::vectorSIMDf& camPos, - const core::vectorSIMDf& center, - const core::vectorSIMDf& translation, - const core::vectorSIMDf& axis, - const core::vectorSIMDf& from) -{ - // axis of rotation - const core::vectorSIMDf up = core::normalize(axis); - const core::vectorSIMDf forward = core::normalize(camPos - center); - const core::vectorSIMDf right = core::normalize(core::cross(up, forward)); - - // correct look vector - const core::vectorSIMDf look = core::cross(right, up); - - // rotate from to - // axis multiplication by sin - const core::vectorSIMDf vs = core::cross(look, from); - - // cosinus angle - const core::vectorSIMDf ca = core::cross(from, look); - - const core::vectorSIMDf vt(up * (core::vectorSIMDf(1.f) - ca)); - const core::vectorSIMDf wt = vt * up.yzxx(); - const core::vectorSIMDf vtuppca = vt * up + ca; - - matrix3x4SIMD mat; - core::vectorSIMDf& row0 = mat.rows[0]; - core::vectorSIMDf& row1 = mat.rows[1]; - core::vectorSIMDf& row2 = mat.rows[2]; - - row0 = vtuppca & BUILD_MASKF(1, 0, 0, 0); - row1 = vtuppca & BUILD_MASKF(0, 1, 0, 0); - row2 = vtuppca & BUILD_MASKF(0, 0, 1, 0); - - row0 += (wt.xxzx() + vs.xzyx() * core::vectorSIMDf(1.f, 1.f, -1.f, 1.f)) & BUILD_MASKF(0, 1, 1, 0); - row1 += (wt.xxyx() + vs.zxxx() * core::vectorSIMDf(-1.f, 1.f, 1.f, 1.f)) & BUILD_MASKF(1, 0, 1, 0); - row2 += (wt.zyxx() + vs.yxxx() * core::vectorSIMDf(1.f, -1.f, 1.f, 1.f)) & BUILD_MASKF(1, 1, 0, 0); - - mat.setTransformationCenter(center, translation); - return mat; -} - - - -inline vectorSIMDf matrix3x4SIMD::doJob(const __m128& a, const matrix3x4SIMD& _mtx) -{ - __m128 r0 = _mtx.rows[0].getAsRegister(); - __m128 r1 = _mtx.rows[1].getAsRegister(); - __m128 r2 = _mtx.rows[2].getAsRegister(); - 
- const __m128i mask = _mm_setr_epi32(0, 0, 0, 0xffffffff); - - vectorSIMDf res; - res = _mm_mul_ps(_mm_shuffle_ps(a, a, BROADCAST32(0)), r0); - res += _mm_mul_ps(_mm_shuffle_ps(a, a, BROADCAST32(1)), r1); - res += _mm_mul_ps(_mm_shuffle_ps(a, a, BROADCAST32(2)), r2); - res += vectorSIMDf(a) & mask; // always 0 0 0 a3 -- no shuffle needed - return res; - } - -inline __m128d matrix3x4SIMD::halfRowAsDouble(size_t _n, bool _0) const -{ - return _mm_cvtps_pd(_0 ? rows[_n].xyxx().getAsRegister() : rows[_n].zwxx().getAsRegister()); -} -inline __m128d matrix3x4SIMD::doJob_d(const __m128d& _a0, const __m128d& _a1, const matrix3x4SIMD& _mtx, bool _xyHalf) -{ - __m128d r0 = _mtx.halfRowAsDouble(0u, _xyHalf); - __m128d r1 = _mtx.halfRowAsDouble(1u, _xyHalf); - __m128d r2 = _mtx.halfRowAsDouble(2u, _xyHalf); - - const __m128d mask01 = _mm_castsi128_pd(_mm_setr_epi32(0, 0, 0xffffffff, 0xffffffff)); - - __m128d res; - res = _mm_mul_pd(_mm_shuffle_pd(_a0, _a0, 0), r0); - res = _mm_add_pd(res, _mm_mul_pd(_mm_shuffle_pd(_a0, _a0, 3), r1)); - res = _mm_add_pd(res, _mm_mul_pd(_mm_shuffle_pd(_a1, _a1, 0), r2)); - if (!_xyHalf) - res = _mm_add_pd(res, _mm_and_pd(_a1, mask01)); - return res; -} - -#undef BUILD_MASKF -#undef BUILD_XORMASKF -#undef BROADCAST32 -#else -#error "no implementation" -#endif - -} // nbl::core - -#endif diff --git a/include/matrix4SIMD.h b/include/matrix4SIMD.h deleted file mode 100644 index 03126c61f7..0000000000 --- a/include/matrix4SIMD.h +++ /dev/null @@ -1,385 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef __NBL_MATRIX4SIMD_H_INCLUDED__ -#define __NBL_MATRIX4SIMD_H_INCLUDED__ - -#include "matrix3x4SIMD.h" - -namespace nbl -{ -namespace core -{ - -template -class aabbox3d; - - -class matrix4SIMD// : public AlignedBase<_NBL_SIMD_ALIGNMENT> don't inherit from AlignedBase (which is empty) because member `rows[4]` inherits from it as well -{ - public: - _NBL_STATIC_INLINE_CONSTEXPR uint32_t VectorCount = 4u; - vectorSIMDf rows[VectorCount]; - - inline explicit matrix4SIMD(const vectorSIMDf& _r0 = vectorSIMDf(1.f, 0.f, 0.f, 0.f), - const vectorSIMDf& _r1 = vectorSIMDf(0.f, 1.f, 0.f, 0.f), - const vectorSIMDf& _r2 = vectorSIMDf(0.f, 0.f, 1.f, 0.f), - const vectorSIMDf& _r3 = vectorSIMDf(0.f, 0.f, 0.f, 1.f)) - : rows{ _r0, _r1, _r2, _r3 } - { - } - - inline matrix4SIMD( float _a00, float _a01, float _a02, float _a03, - float _a10, float _a11, float _a12, float _a13, - float _a20, float _a21, float _a22, float _a23, - float _a30, float _a31, float _a32, float _a33) - : matrix4SIMD( vectorSIMDf(_a00, _a01, _a02, _a03), - vectorSIMDf(_a10, _a11, _a12, _a13), - vectorSIMDf(_a20, _a21, _a22, _a23), - vectorSIMDf(_a30, _a31, _a32, _a33)) - { - } - - inline explicit matrix4SIMD(const float* const _data) - { - if (!_data) - return; - for (size_t i = 0u; i < VectorCount; ++i) - rows[i] = vectorSIMDf(_data + 4 * i); - } - inline matrix4SIMD(const float* const _data, bool ALIGNED) - { - if (!_data) - return; - for (size_t i = 0u; i < VectorCount; ++i) - rows[i] = vectorSIMDf(_data + 4 * i, ALIGNED); - } - - inline explicit matrix4SIMD(const matrix3x4SIMD& smallMat) - { - *reinterpret_cast(this) = smallMat; - rows[3].set(0.f,0.f,0.f,1.f); - } - - inline matrix3x4SIMD extractSub3x4() const - { - return matrix3x4SIMD(rows[0],rows[1],rows[2]); - } - - //! 
Access by row - inline const vectorSIMDf& getRow(size_t _rown) const{ return rows[_rown]; } - inline vectorSIMDf& getRow(size_t _rown) { return rows[_rown]; } - - //! Access by element - inline float operator()(size_t _i, size_t _j) const { return rows[_i].pointer[_j]; } - inline float& operator()(size_t _i, size_t _j) { return rows[_i].pointer[_j]; } - - //! Access for memory - inline const float* pointer() const {return rows[0].pointer;} - inline float* pointer() {return rows[0].pointer;} - - - inline bool operator==(const matrix4SIMD& _other) const - { - return !(*this != _other); - } - inline bool operator!=(const matrix4SIMD& _other) const; - - inline matrix4SIMD& operator+=(const matrix4SIMD& _other); - inline matrix4SIMD operator+(const matrix4SIMD& _other) const - { - matrix4SIMD r{*this}; - return r += _other; - } - - inline matrix4SIMD& operator-=(const matrix4SIMD& _other); - inline matrix4SIMD operator-(const matrix4SIMD& _other) const - { - matrix4SIMD r{*this}; - return r -= _other; - } - - inline matrix4SIMD& operator*=(float _scalar); - inline matrix4SIMD operator*(float _scalar) const - { - matrix4SIMD r{*this}; - return r *= _scalar; - } - - static inline matrix4SIMD concatenateBFollowedByA(const matrix4SIMD& _a, const matrix4SIMD& _b); - static inline matrix4SIMD concatenateBFollowedByAPrecisely(const matrix4SIMD& _a, const matrix4SIMD& _b); - - inline bool isIdentity() const - { - return *this == matrix4SIMD(); - } - inline bool isIdentity(float _tolerance) const; - - inline bool isOrthogonal() const - { - return concatenateBFollowedByA(transpose(*this), *this).isIdentity(); - } - inline bool isOrthogonal(float _tolerance) const - { - return concatenateBFollowedByA(transpose(*this), *this).isIdentity(_tolerance); - } - - inline matrix4SIMD& setScale(const core::vectorSIMDf& _scale); - inline matrix4SIMD& setScale(float _scale) - { - return setScale(vectorSIMDf(_scale)); - } - - inline void setTranslation(const float* _t) - { - for (size_t i = 0u; i < 3u; ++i) - rows[i].w = _t[i]; - } - //! Takes into account only x,y,z components of _t - inline void setTranslation(const vectorSIMDf& _t) - { - setTranslation(_t.pointer); - } - inline void setTranslation(const vector3d& _t) - { - setTranslation(&_t.X); - } - - //! Returns last column of the matrix. - inline vectorSIMDf getTranslation() const; - - //! Returns translation part of the matrix (w component is always 0). 
- inline vectorSIMDf getTranslation3D() const; - - enum class E_MATRIX_INVERSE_PRECISION - { - EMIP_FAST_RECIPROCAL, - EMIP_32BIT, - EMIP_64BBIT - }; - - template - inline bool getInverseTransform(matrix4SIMD& _out) const - { - if constexpr (precision == E_MATRIX_INVERSE_PRECISION::EMIP_64BBIT) - { - double a = rows[0][0], b = rows[0][1], c = rows[0][2], d = rows[0][3]; - double e = rows[1][0], f = rows[1][1], g = rows[1][2], h = rows[1][3]; - double i = rows[2][0], j = rows[2][1], k = rows[2][2], l = rows[2][3]; - double m = rows[3][0], n = rows[3][1], o = rows[3][2], p = rows[3][3]; - - double kp_lo = k * p - l * o; - double jp_ln = j * p - l * n; - double jo_kn = j * o - k * n; - double ip_lm = i * p - l * m; - double io_km = i * o - k * m; - double in_jm = i * n - j * m; - - double a11 = +(f * kp_lo - g * jp_ln + h * jo_kn); - double a12 = -(e * kp_lo - g * ip_lm + h * io_km); - double a13 = +(e * jp_ln - f * ip_lm + h * in_jm); - double a14 = -(e * jo_kn - f * io_km + g * in_jm); - - double det = a * a11 + b * a12 + c * a13 + d * a14; - - if (core::iszero(det, DBL_MIN)) - return false; - - double invDet = 1.0 / det; - - _out.rows[0][0] = a11 * invDet; - _out.rows[1][0] = a12 * invDet; - _out.rows[2][0] = a13 * invDet; - _out.rows[3][0] = a14 * invDet; - - _out.rows[0][1] = -(b * kp_lo - c * jp_ln + d * jo_kn) * invDet; - _out.rows[1][1] = +(a * kp_lo - c * ip_lm + d * io_km) * invDet; - _out.rows[2][1] = -(a * jp_ln - b * ip_lm + d * in_jm) * invDet; - _out.rows[3][1] = +(a * jo_kn - b * io_km + c * in_jm) * invDet; - - double gp_ho = g * p - h * o; - double fp_hn = f * p - h * n; - double fo_gn = f * o - g * n; - double ep_hm = e * p - h * m; - double eo_gm = e * o - g * m; - double en_fm = e * n - f * m; - - _out.rows[0][2] = +(b * gp_ho - c * fp_hn + d * fo_gn) * invDet; - _out.rows[1][2] = -(a * gp_ho - c * ep_hm + d * eo_gm) * invDet; - _out.rows[2][2] = +(a * fp_hn - b * ep_hm + d * en_fm) * invDet; - _out.rows[3][2] = -(a * fo_gn - b * eo_gm + c * en_fm) * invDet; - - double gl_hk = g * l - h * k; - double fl_hj = f * l - h * j; - double fk_gj = f * k - g * j; - double el_hi = e * l - h * i; - double ek_gi = e * k - g * i; - double ej_fi = e * j - f * i; - - _out.rows[0][3] = -(b * gl_hk - c * fl_hj + d * fk_gj) * invDet; - _out.rows[1][3] = +(a * gl_hk - c * el_hi + d * ek_gi) * invDet; - _out.rows[2][3] = -(a * fl_hj - b * el_hi + d * ej_fi) * invDet; - _out.rows[3][3] = +(a * fk_gj - b * ek_gi + c * ej_fi) * invDet; - - return true; - } - else - { - auto mat2mul = [](vectorSIMDf _A, vectorSIMDf _B) - { - return _A*_B.xwxw()+_A.yxwz()*_B.zyzy(); - }; - auto mat2adjmul = [](vectorSIMDf _A, vectorSIMDf _B) - { - return _A.wwxx()*_B-_A.yyzz()*_B.zwxy(); - }; - auto mat2muladj = [](vectorSIMDf _A, vectorSIMDf _B) - { - return _A*_B.wxwx()-_A.yxwz()*_B.zyzy(); - }; - - vectorSIMDf A = _mm_movelh_ps(rows[0].getAsRegister(), rows[1].getAsRegister()); - vectorSIMDf B = _mm_movehl_ps(rows[1].getAsRegister(), rows[0].getAsRegister()); - vectorSIMDf C = _mm_movelh_ps(rows[2].getAsRegister(), rows[3].getAsRegister()); - vectorSIMDf D = _mm_movehl_ps(rows[3].getAsRegister(), rows[2].getAsRegister()); - - vectorSIMDf allDets = vectorSIMDf(_mm_shuffle_ps(rows[0].getAsRegister(),rows[2].getAsRegister(),_MM_SHUFFLE(2,0,2,0)))* - vectorSIMDf(_mm_shuffle_ps(rows[1].getAsRegister(),rows[3].getAsRegister(),_MM_SHUFFLE(3,1,3,1))) - - - vectorSIMDf(_mm_shuffle_ps(rows[0].getAsRegister(),rows[2].getAsRegister(),_MM_SHUFFLE(3,1,3,1)))* - 
vectorSIMDf(_mm_shuffle_ps(rows[1].getAsRegister(),rows[3].getAsRegister(),_MM_SHUFFLE(2,0,2,0))); - - auto detA = allDets.xxxx(); - auto detB = allDets.yyyy(); - auto detC = allDets.zzzz(); - auto detD = allDets.wwww(); - - // https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html - auto D_C = mat2adjmul(D, C); - // A#B - auto A_B = mat2adjmul(A, B); - // X# = |D|A - B(D#C) - auto X_ = detD*A - mat2mul(B, D_C); - // W# = |A|D - C(A#B) - auto W_ = detA*D - mat2mul(C, A_B); - - // |M| = |A|*|D| + ... (continue later) - auto detM = detA*detD; - - // Y# = |B|C - D(A#B)# - auto Y_ = detB*C - mat2muladj(D, A_B); - // Z# = |C|B - A(D#C)# - auto Z_ = detC*B - mat2muladj(A, D_C); - - // |M| = |A|*|D| + |B|*|C| ... (continue later) - detM += detB*detC; - - // tr((A#B)(D#C)) - __m128 tr = (A_B*D_C.xzyw()).getAsRegister(); - tr = _mm_hadd_ps(tr, tr); - tr = _mm_hadd_ps(tr, tr); - // |M| = |A|*|D| + |B|*|C| - tr((A#B)(D#C) - detM -= tr; - - if (core::iszero(detM.x, FLT_MIN)) - return false; - - vectorSIMDf rDetM; - - // (1/|M|, -1/|M|, -1/|M|, 1/|M|) - if constexpr (precision == E_MATRIX_INVERSE_PRECISION::EMIP_FAST_RECIPROCAL) - rDetM = vectorSIMDf(1.f, -1.f, -1.f, 1.f)*core::reciprocal(detM); - else if constexpr (precision == E_MATRIX_INVERSE_PRECISION::EMIP_32BIT) - rDetM = vectorSIMDf(1.f, -1.f, -1.f, 1.f).preciseDivision(detM); - - X_ *= rDetM; - Y_ *= rDetM; - Z_ *= rDetM; - W_ *= rDetM; - - // apply adjugate and store, here we combine adjugate shuffle and store shuffle - _out.rows[0] = _mm_shuffle_ps(X_.getAsRegister(), Y_.getAsRegister(), _MM_SHUFFLE(1, 3, 1, 3)); - _out.rows[1] = _mm_shuffle_ps(X_.getAsRegister(), Y_.getAsRegister(), _MM_SHUFFLE(0, 2, 0, 2)); - _out.rows[2] = _mm_shuffle_ps(Z_.getAsRegister(), W_.getAsRegister(), _MM_SHUFFLE(1, 3, 1, 3)); - _out.rows[3] = _mm_shuffle_ps(Z_.getAsRegister(), W_.getAsRegister(), _MM_SHUFFLE(0, 2, 0, 2)); - - return true; - } - } - - inline vectorSIMDf sub3x3TransformVect(const vectorSIMDf& _in) const; - - inline void transformVect(vectorSIMDf& _out, const vectorSIMDf& _in) const; - inline void transformVect(vectorSIMDf& _vector) const - { - transformVect(_vector, _vector); - } - - inline void translateVect(vectorSIMDf& _vect) const - { - _vect += getTranslation(); - } - - bool isBoxInFrustum(const aabbox3d& bbox); - - bool perspectiveTransformVect(core::vectorSIMDf& inOutVec) - { - transformVect(inOutVec); - const bool inFront = inOutVec[3] > 0.f; - inOutVec /= inOutVec.wwww(); - return inFront; - } - - core::vector2di fragCoordTransformVect(const core::vectorSIMDf& _in, const core::dimension2du& viewportDimensions) - { - core::vectorSIMDf pos(_in); - pos.w = 1.f; - if (perspectiveTransformVect(pos)) - core::vector2di(-0x80000000, -0x80000000); - - pos[0] *= 0.5f; - pos[1] *= 0.5f; - pos[0] += 0.5f; - pos[1] += 0.5f; - - return core::vector2di(pos[0] * float(viewportDimensions.Width), pos[1] * float(viewportDimensions.Height)); - } - - static inline matrix4SIMD buildProjectionMatrixPerspectiveFovRH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar); - static inline matrix4SIMD buildProjectionMatrixPerspectiveFovLH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar); - - static inline matrix4SIMD buildProjectionMatrixOrthoRH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar); - static inline matrix4SIMD buildProjectionMatrixOrthoLH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar); - - //! 
Access by row - inline const vectorSIMDf& operator[](size_t _rown) const { return rows[_rown]; } - //! Access by row - inline vectorSIMDf& operator[](size_t _rown) { return rows[_rown]; } - - private: - //! TODO: implement a dvec<2> - inline __m128d halfRowAsDouble(size_t _n, bool _firstHalf) const; - static inline __m128d concat64_helper(const __m128d& _a0, const __m128d& _a1, const matrix4SIMD& _mtx, bool _firstHalf); -}; - -inline matrix4SIMD operator*(float _scalar, const matrix4SIMD& _mtx) -{ - return _mtx * _scalar; -} - -inline matrix4SIMD concatenateBFollowedByA(const matrix4SIMD& _a, const matrix4SIMD& _b) -{ - return matrix4SIMD::concatenateBFollowedByA(_a, _b); -} -/* -inline matrix4SIMD concatenateBFollowedByAPrecisely(const matrix4SIMD& _a, const matrix4SIMD& _b) -{ - return matrix4SIMD::concatenateBFollowedByAPrecisely(_a, _b); -} -*/ - - -}} // nbl::core - -#endif diff --git a/include/matrix4SIMD_impl.h b/include/matrix4SIMD_impl.h deleted file mode 100644 index 02484e7a4c..0000000000 --- a/include/matrix4SIMD_impl.h +++ /dev/null @@ -1,299 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef __NBL_MATRIX4SIMD_IMPL_H_INCLUDED__ -#define __NBL_MATRIX4SIMD_IMPL_H_INCLUDED__ - -#include "matrix4SIMD.h" -#include "nbl/core/math/glslFunctions.tcc" -#include "aabbox3d.h" - -namespace nbl -{ -namespace core -{ - - -inline bool matrix4SIMD::operator!=(const matrix4SIMD& _other) const -{ - for (size_t i = 0u; i < VectorCount; ++i) - if ((rows[i] != _other.rows[i]).any()) - return true; - return false; -} - -inline matrix4SIMD& matrix4SIMD::operator+=(const matrix4SIMD& _other) -{ - for (size_t i = 0u; i < VectorCount; ++i) - rows[i] += _other.rows[i]; - return *this; -} - -inline matrix4SIMD& matrix4SIMD::operator-=(const matrix4SIMD& _other) -{ - for (size_t i = 0u; i < VectorCount; ++i) - rows[i] -= _other.rows[i]; - return *this; -} - -inline matrix4SIMD& matrix4SIMD::operator*=(float _scalar) -{ - for (size_t i = 0u; i < VectorCount; ++i) - rows[i] *= _scalar; - return *this; -} - -inline bool matrix4SIMD::isIdentity(float _tolerance) const -{ - return core::equals(*this, matrix4SIMD(), core::ROUNDING_ERROR()); -} - -#ifdef __NBL_COMPILE_WITH_SSE3 -#define BROADCAST32(fpx) _MM_SHUFFLE(fpx, fpx, fpx, fpx) -#define BUILD_MASKF(_x_, _y_, _z_, _w_) _mm_setr_epi32(_x_*0xffffffff, _y_*0xffffffff, _z_*0xffffffff, _w_*0xffffffff) -inline matrix4SIMD matrix4SIMD::concatenateBFollowedByA(const matrix4SIMD& _a, const matrix4SIMD& _b) -{ - auto calcRow = [](const __m128& _row, const matrix4SIMD& _mtx) - { - __m128 r0 = _mtx.rows[0].getAsRegister(); - __m128 r1 = _mtx.rows[1].getAsRegister(); - __m128 r2 = _mtx.rows[2].getAsRegister(); - __m128 r3 = _mtx.rows[3].getAsRegister(); - - __m128 res; - res = _mm_mul_ps(_mm_shuffle_ps(_row, _row, BROADCAST32(0)), r0); - res = _mm_add_ps(res, _mm_mul_ps(_mm_shuffle_ps(_row, _row, BROADCAST32(1)), r1)); - res = _mm_add_ps(res, _mm_mul_ps(_mm_shuffle_ps(_row, _row, BROADCAST32(2)), r2)); - res = _mm_add_ps(res, _mm_mul_ps(_mm_shuffle_ps(_row, _row, BROADCAST32(3)), r3)); - return res; - }; - - matrix4SIMD r; - for (size_t i = 0u; i < 4u; ++i) - r.rows[i] = calcRow(_a.rows[i].getAsRegister(), _b); - - return r; -} -inline matrix4SIMD matrix4SIMD::concatenateBFollowedByAPrecisely(const matrix4SIMD& _a, const matrix4SIMD& _b) -{ - matrix4SIMD out; - - __m128i mask0011 = BUILD_MASKF(0, 0, 1, 1); - __m128 
second; - - { - __m128d r00 = _a.halfRowAsDouble(0u, true); - __m128d r01 = _a.halfRowAsDouble(0u, false); - second = _mm_cvtpd_ps(concat64_helper(r00, r01, _b, false)); - out.rows[0] = vectorSIMDf(_mm_cvtpd_ps(concat64_helper(r00, r01, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister()); - } - - { - __m128d r10 = _a.halfRowAsDouble(1u, true); - __m128d r11 = _a.halfRowAsDouble(1u, false); - second = _mm_cvtpd_ps(concat64_helper(r10, r11, _b, false)); - out.rows[1] = vectorSIMDf(_mm_cvtpd_ps(concat64_helper(r10, r11, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister()); - } - - { - __m128d r20 = _a.halfRowAsDouble(2u, true); - __m128d r21 = _a.halfRowAsDouble(2u, false); - second = _mm_cvtpd_ps(concat64_helper(r20, r21, _b, false)); - out.rows[2] = vectorSIMDf(_mm_cvtpd_ps(concat64_helper(r20, r21, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister()); - } - - { - __m128d r30 = _a.halfRowAsDouble(3u, true); - __m128d r31 = _a.halfRowAsDouble(3u, false); - second = _mm_cvtpd_ps(concat64_helper(r30, r31, _b, false)); - out.rows[3] = vectorSIMDf(_mm_cvtpd_ps(concat64_helper(r30, r31, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister()); - } - - return out; -} - -inline matrix4SIMD& matrix4SIMD::setScale(const core::vectorSIMDf& _scale) -{ - const __m128i mask0001 = BUILD_MASKF(0, 0, 0, 1); - - rows[0] = (_scale & BUILD_MASKF(1, 0, 0, 0)) | _mm_castps_si128((rows[0] & mask0001).getAsRegister()); - rows[1] = (_scale & BUILD_MASKF(0, 1, 0, 0)) | _mm_castps_si128((rows[1] & mask0001).getAsRegister()); - rows[2] = (_scale & BUILD_MASKF(0, 0, 1, 0)) | _mm_castps_si128((rows[2] & mask0001).getAsRegister()); - rows[3] = vectorSIMDf(0.f, 0.f, 0.f, 1.f); - - return *this; -} - -//! Returns last column of the matrix. -inline vectorSIMDf matrix4SIMD::getTranslation() const -{ - __m128 tmp1 = _mm_unpackhi_ps(rows[0].getAsRegister(), rows[1].getAsRegister()); // (0z,1z,0w,1w) - __m128 tmp2 = _mm_unpackhi_ps(rows[2].getAsRegister(), rows[3].getAsRegister()); // (2z,3z,2w,3w) - __m128 col3 = _mm_movehl_ps(tmp1, tmp2);// (0w,1w,2w,3w) - - return col3; -} -//! Returns translation part of the matrix (w component is always 0). 
-inline vectorSIMDf matrix4SIMD::getTranslation3D() const -{ - __m128 tmp1 = _mm_unpackhi_ps(rows[0].getAsRegister(), rows[1].getAsRegister()); // (0z,1z,0w,1w) - __m128 tmp2 = _mm_unpackhi_ps(rows[2].getAsRegister(), _mm_setzero_ps()); // (2z,0,2w,0) - __m128 transl = _mm_movehl_ps(tmp1, tmp2);// (0w,1w,2w,0) - - return transl; -} - -inline vectorSIMDf matrix4SIMD::sub3x3TransformVect(const vectorSIMDf& _in) const -{ - matrix4SIMD cp{*this}; - vectorSIMDf out = _in & BUILD_MASKF(1, 1, 1, 0); - transformVect(out); - return out; -} - -inline void matrix4SIMD::transformVect(vectorSIMDf& _out, const vectorSIMDf& _in) const -{ - vectorSIMDf r[4]; - for (size_t i = 0u; i < VectorCount; ++i) - r[i] = rows[i] * _in; - - _out = _mm_hadd_ps( - _mm_hadd_ps(r[0].getAsRegister(), r[1].getAsRegister()), - _mm_hadd_ps(r[2].getAsRegister(), r[3].getAsRegister()) - ); -} - -inline matrix4SIMD matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar) -{ - const float h = core::reciprocal(tanf(fieldOfViewRadians*0.5f)); - _NBL_DEBUG_BREAK_IF(aspectRatio == 0.f); //division by zero - const float w = h / aspectRatio; - - _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero - - matrix4SIMD m; - m.rows[0] = vectorSIMDf(w, 0.f, 0.f, 0.f); - m.rows[1] = vectorSIMDf(0.f, -h, 0.f, 0.f); - m.rows[2] = vectorSIMDf(0.f, 0.f, -zFar/(zFar-zNear), -zNear*zFar/(zFar-zNear)); - m.rows[3] = vectorSIMDf(0.f, 0.f, -1.f, 0.f); - - return m; -} -inline matrix4SIMD matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar) -{ - const float h = core::reciprocal(tanf(fieldOfViewRadians*0.5f)); - _NBL_DEBUG_BREAK_IF(aspectRatio == 0.f); //division by zero - const float w = h / aspectRatio; - - _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero - - matrix4SIMD m; - m.rows[0] = vectorSIMDf(w, 0.f, 0.f, 0.f); - m.rows[1] = vectorSIMDf(0.f, -h, 0.f, 0.f); - m.rows[2] = vectorSIMDf(0.f, 0.f, zFar/(zFar-zNear), -zNear*zFar/(zFar-zNear)); - m.rows[3] = vectorSIMDf(0.f, 0.f, 1.f, 0.f); - - return m; -} - -inline matrix4SIMD matrix4SIMD::buildProjectionMatrixOrthoRH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar) -{ - _NBL_DEBUG_BREAK_IF(widthOfViewVolume == 0.f); //division by zero - _NBL_DEBUG_BREAK_IF(heightOfViewVolume == 0.f); //division by zero - _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero - - matrix4SIMD m; - m.rows[0] = vectorSIMDf(2.f/widthOfViewVolume, 0.f, 0.f, 0.f); - m.rows[1] = vectorSIMDf(0.f, -2.f/heightOfViewVolume, 0.f, 0.f); - m.rows[2] = vectorSIMDf(0.f, 0.f, -1.f/(zFar-zNear), -zNear/(zFar-zNear)); - m.rows[3] = vectorSIMDf(0.f, 0.f, 0.f, 1.f); - - return m; -} -inline matrix4SIMD matrix4SIMD::buildProjectionMatrixOrthoLH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar) -{ - _NBL_DEBUG_BREAK_IF(widthOfViewVolume == 0.f); //division by zero - _NBL_DEBUG_BREAK_IF(heightOfViewVolume == 0.f); //division by zero - _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero - - matrix4SIMD m; - m.rows[0] = vectorSIMDf(2.f/widthOfViewVolume, 0.f, 0.f, 0.f); - m.rows[1] = vectorSIMDf(0.f, -2.f/heightOfViewVolume, 0.f, 0.f); - m.rows[2] = vectorSIMDf(0.f, 0.f, 1.f/(zFar-zNear), -zNear/(zFar-zNear)); - m.rows[3] = vectorSIMDf(0.f, 0.f, 0.f, 1.f); - - return m; -} - - - -inline __m128d matrix4SIMD::halfRowAsDouble(size_t _n, bool _firstHalf) const -{ - return _mm_cvtps_pd(_firstHalf ? 
rows[_n].xyxx().getAsRegister() : rows[_n].zwxx().getAsRegister()); -} -inline __m128d matrix4SIMD::concat64_helper(const __m128d& _a0, const __m128d& _a1, const matrix4SIMD& _mtx, bool _firstHalf) -{ - __m128d r0 = _mtx.halfRowAsDouble(0u, _firstHalf); - __m128d r1 = _mtx.halfRowAsDouble(1u, _firstHalf); - __m128d r2 = _mtx.halfRowAsDouble(2u, _firstHalf); - __m128d r3 = _mtx.halfRowAsDouble(3u, _firstHalf); - - //const __m128d mask01 = _mm_castsi128_pd(_mm_setr_epi32(0, 0, 0xffffffff, 0xffffffff)); - - __m128d res; - res = _mm_mul_pd(_mm_shuffle_pd(_a0, _a0, 0), r0); - res = _mm_add_pd(res, _mm_mul_pd(_mm_shuffle_pd(_a0, _a0, 3/*0b11*/), r1)); - res = _mm_add_pd(res, _mm_mul_pd(_mm_shuffle_pd(_a1, _a1, 0), r2)); - res = _mm_add_pd(res, _mm_mul_pd(_mm_shuffle_pd(_a1, _a1, 3/*0b11*/), r3)); - return res; -} - -#undef BUILD_MASKF -#undef BROADCAST32 -#else -#error "no implementation" -#endif - -inline bool matrix4SIMD::isBoxInFrustum(const aabbox3d& bbox) -{ - vectorSIMDf MinEdge, MaxEdge; - MinEdge.set(bbox.MinEdge); - MaxEdge.set(bbox.MaxEdge); - MinEdge.w = 1.f; - MaxEdge.w = 1.f; - - - auto getClosestDP = [&MinEdge,&MaxEdge](const vectorSIMDf& toDot) -> float - { - return dot(mix(MaxEdge,MinEdge,toDot +#endif + +namespace nbl +{ +namespace hlsl +{ +namespace glsl +{ + +template +inline T radians(NBL_CONST_REF_ARG(T) degrees) +{ + static_assert( + is_floating_point::value, + "This code expects the type to be either a double or a float." + ); + + return degrees * PI() / T(180); +} + +template +inline T degrees(NBL_CONST_REF_ARG(T) radians) +{ + static_assert( + is_floating_point::value, + "This code expects the type to be either a double or a float." + ); + + return radians * T(180) / PI(); +} + +template +inline bool equals(NBL_CONST_REF_ARG(T) a, NBL_CONST_REF_ARG(T) b, NBL_CONST_REF_ARG(T) tolerance) +{ + return (a + tolerance >= b) && (a - tolerance <= b); +} + +#ifndef __HLSL_VERSION + +NBL_FORCE_INLINE bool equals(const core::vector3df& a, const core::vector3df& b, const core::vector3df& tolerance) +{ + auto ha = a + tolerance; + auto la = a - tolerance; + return ha.X >= b.X && ha.Y >= b.Y && ha.Z >= b.Z && la.X <= b.X && la.Y <= b.Y && la.Z <= b.Z; +} + +#endif + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/math/quaternion/quaternion.hlsl b/include/nbl/builtin/hlsl/math/quaternion/quaternion.hlsl new file mode 100644 index 0000000000..4979e35ef6 --- /dev/null +++ b/include/nbl/builtin/hlsl/math/quaternion/quaternion.hlsl @@ -0,0 +1,74 @@ +// Copyright (C) 2019 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine" +// For conditions of distribution and use, see copyright notice in nabla.h +// See the original file in irrlicht source for authors + +#ifndef _NBL_BUILTIN_HLSL_MATH_QUATERNION_INCLUDED_ +#define _NBL_BUILTIN_HLSL_MATH_QUATERNION_INCLUDED_ + +#include + +namespace nbl +{ +namespace hlsl +{ + + +//! Quaternion class for representing rotations. +/** It provides cheap combinations and avoids gimbal locks. +Also useful for interpolations. */ + +template +struct quaternion +{ + // i*data[0] + j*data[1] + k*data[2] + data[3] + vector data; + + //! 
creates identity quaternion
+    static inline quaternion create()
+    {
+        quaternion q;
+        q.data = vector(0.0f, 0.0f, 0.0f, 1.0f);
+
+        return q;
+    }
+
+    static inline quaternion create(T x, T y, T z, T w)
+    {
+        quaternion q;
+        q.data = vector(x, y, z, w);
+
+        return q;
+    }
+
+#define DEFINE_MUL_QUATERNION_BY_SCALAR_OPERATOR(TYPE)\
+    inline quaternion operator*(float scalar)\
+    {\
+        quaternion output;\
+        output.data = data * scalar;\
+        return output;\
+    }\
+
+    DEFINE_MUL_QUATERNION_BY_SCALAR_OPERATOR(uint32_t)
+    DEFINE_MUL_QUATERNION_BY_SCALAR_OPERATOR(uint64_t)
+    DEFINE_MUL_QUATERNION_BY_SCALAR_OPERATOR(float32_t)
+    DEFINE_MUL_QUATERNION_BY_SCALAR_OPERATOR(float64_t)
+
+#undef DEFINE_MUL_QUATERNION_BY_SCALAR_OPERATOR
+
+    inline quaternion operator*(NBL_CONST_REF_ARG(quaternion) other)
+    {
+        return quaternion::create(
+            w * q.w - x * q.x - y * q.y - z * q.z,
+            w * q.x + x * q.w + y * q.z - z * q.y,
+            w * q.y - x * q.z + y * q.w + z * q.x,
+            w * q.z + x * q.y - y * q.x + z * q.w
+        );
+    }
+}
+
+} // end namespace core
+} // nbl
+
+#endif
+
diff --git a/include/nbl/builtin/hlsl/math/quaternion/quaternion_impl.hlsl b/include/nbl/builtin/hlsl/math/quaternion/quaternion_impl.hlsl
new file mode 100644
index 0000000000..d00d9ce2c4
--- /dev/null
+++ b/include/nbl/builtin/hlsl/math/quaternion/quaternion_impl.hlsl
@@ -0,0 +1,25 @@
+// Copyright (C) 2019 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine"
+// For conditions of distribution and use, see copyright notice in nabla.h
+// See the original file in irrlicht source for authors
+
+#ifndef _NBL_BUILTIN_HLSL_MATH_QUATERNION_IMPL_INCLUDED_
+#define _NBL_BUILTIN_HLSL_MATH_QUATERNION_IMPL_INCLUDED_
+
+#include 
+
+namespace nbl
+{
+namespace hlsl
+{
+
+namespace quaternion_impl
+{
+
+}
+
+} // end namespace core
+} // nbl
+
+#endif
+
From 1397756bb8e218b1f7a39cb151f545a84f71b390 Mon Sep 17 00:00:00 2001
From: Przemog1 
Date: Fri, 25 Oct 2024 03:10:59 -0700
Subject: [PATCH 104/358] Saving work

---
 examples_tests                                |  2 +-
 include/nabla.h                               |  3 +-
 .../nbl/asset/utils/CQuantQuaternionCache.h   |  2 +-
 .../nbl/builtin/hlsl/cpp_compat/intrinsics.h  |  2 +
 .../nbl/builtin/hlsl/cpp_compat/matrix.hlsl   |  1 +
 .../hlsl/math/quaternion/quaternion.hlsl      | 12 ++---
 .../transformation_matrix_utils.hlsl          | 35 ++++++++++---
 include/nbl/core/declarations.h               |  1 -
 include/nbl/core/definitions.h                |  4 --
 include/nbl/core/math/floatutil.tcc           | 12 +----
 include/nbl/core/math/glslFunctions.h         | 22 ++++-----
 include/nbl/core/math/glslFunctions.tcc       | 49 +++++++------------
 include/nbl/core/math/matrixutil.h            | 29 -----------
 include/nbl/core/math/plane3dSIMD.h           | 18 ++++---
 14 files changed, 80 insertions(+), 112 deletions(-)
 delete mode 100644 include/nbl/core/math/matrixutil.h

diff --git a/examples_tests b/examples_tests
index 6b45fecf8c..13df456b76 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 6b45fecf8c64e8b11fef8c2b3a7e5618edaca68d
+Subproject commit 13df456b76c4e38996502fc71d9878334fd5ea18
diff --git a/include/nabla.h b/include/nabla.h
index dac3155e5e..c3981635ea 100644
--- a/include/nabla.h
+++ b/include/nabla.h
@@ -53,9 +53,8 @@
 #include "vector3d.h"
 #include "vectorSIMD.h"
 #include "line3d.h"
-#include "matrix4SIMD.h"
 #include "position2d.h"
-#include "quaternion.h"
+#include "nbl/builtin/hlsl/math/quaternion/quaternion.hlsl"
 #include "rect.h"
 #include "dimension2d.h"
 
diff --git a/include/nbl/asset/utils/CQuantQuaternionCache.h 
b/include/nbl/asset/utils/CQuantQuaternionCache.h
index 8e46dffb0a..a51549d24d 100644
--- a/include/nbl/asset/utils/CQuantQuaternionCache.h
+++ b/include/nbl/asset/utils/CQuantQuaternionCache.h
@@ -60,7 +60,7 @@ class CQuantQuaternionCache : public CDirQuantCacheBase
 
-	value_type_t quantize(const core::quaternion& quat)
+	value_type_t quantize(const hlsl::quaternion& quat)
 	{
 		return Base::quantize<4u,CacheFormat>(reinterpret_cast(quat));
 	}
diff --git a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h
index 3cc0bc7dab..a8f745fbae 100644
--- a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h
+++ b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h
@@ -4,6 +4,7 @@
 
 #include 
 #include 
+#include 
 
 // this is a C++ only header, hence the `.h` extension, it only implements HLSL's built-in functions
 #ifndef __HLSL_VERSION
@@ -62,6 +63,7 @@ NBL_BIT_OP_GLM_PASSTHROUGH(findMSB,findMSB)
 template
 inline matrix inverse(const matrix& m)
 {
+	static_assert(!(N == 3 && M == 4));
 	return reinterpret_cast&>(glm::inverse(reinterpret_cast::Base const&>(m)));
 }
 
diff --git a/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl b/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl
index b1d33f097b..455f8ef0c7 100644
--- a/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl
+++ b/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl
@@ -2,6 +2,7 @@
 #define _NBL_BUILTIN_HLSL_CPP_COMPAT_MATRIX_INCLUDED_
 
 #include 
+#include 
 
 namespace nbl
 {
diff --git a/include/nbl/builtin/hlsl/math/quaternion/quaternion.hlsl b/include/nbl/builtin/hlsl/math/quaternion/quaternion.hlsl
index 4979e35ef6..2991a79e35 100644
--- a/include/nbl/builtin/hlsl/math/quaternion/quaternion.hlsl
+++ b/include/nbl/builtin/hlsl/math/quaternion/quaternion.hlsl
@@ -42,7 +42,7 @@ struct quaternion
 }
 
 #define DEFINE_MUL_QUATERNION_BY_SCALAR_OPERATOR(TYPE)\
-    inline quaternion operator*(float scalar)\
+    inline quaternion operator*(TYPE scalar)\
     {\
         quaternion output;\
         output.data = data * scalar;\
@@ -59,13 +59,13 @@ struct quaternion
     inline quaternion operator*(NBL_CONST_REF_ARG(quaternion) other)
     {
         return quaternion::create(
-            w * q.w - x * q.x - y * q.y - z * q.z,
-            w * q.x + x * q.w + y * q.z - z * q.y,
-            w * q.y - x * q.z + y * q.w + z * q.x,
-            w * q.z + x * q.y - y * q.x + z * q.w
+            data.w * other.data.w - data.x * other.data.x - data.y * other.data.y - data.z * other.data.z,
+            data.w * other.data.x + data.x * other.data.w + data.y * other.data.z - data.z * other.data.y,
+            data.w * other.data.y - data.x * other.data.z + data.y * other.data.w + data.z * other.data.x,
+            data.w * other.data.z + data.x * other.data.y - data.y * other.data.x + data.z * other.data.w
         );
     }
-}
+};
 
 } // end namespace core
 } // nbl
diff --git a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl
index d50bc16869..9b3ce2f602 100644
--- a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl
+++ b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl
@@ -2,14 +2,33 @@
 #define _NBL_BUILTIN_HLSL_TRANSFORMATION_MATRIX_UTILS_INCLUDED_
 
 #include 
+#include 
+// TODO: remove this header when deleting vectorSIMDf.hlsl
+#include 
 
 namespace nbl
 {
 namespace hlsl
 {
+// TODO: this is temporary function, delete when removing vectorSIMD
+template
+core::vectorSIMDf transformVector(const matrix& mat, const core::vectorSIMDf& vec)
+{
+    core::vectorSIMDf output;
+    float32_t4 tmp;
+    for (int i = 0; i < 4; ++i) // rather do that than 
reinterpret_cast for safety + tmp[i] = output[i]; + + for (int i = 0; i < 4; ++i) + output[i] = dot(mat[i], tmp[i]); + + return output; +} + +// TODO: another idea is to create `getTransformationMatrixAs4x4` function, which will add 4th row to a 3x4 matrix and do nothing to 4x4 matrix, this way we will not have to deal with partial specialization later +template +matrix getMatrix3x4As4x4(matrix mat) { matrix output; for (int i = 0; i < 3; ++i) @@ -35,9 +54,9 @@ inline matrix buildCameraLookAtMatrixLH( const vector& target, const vector& upVector) { - const vector zaxis = core::normalize(target - position); - const vector xaxis = core::normalize(core::cross(upVector, zaxis)); - const vector yaxis = core::cross(zaxis, xaxis); + const vector zaxis = normalize(target - position); + const vector xaxis = normalize(cross(upVector, zaxis)); + const vector yaxis = cross(zaxis, xaxis); matrix r; r[0] = vector(xaxis, -dot(xaxis, position)); @@ -52,9 +71,9 @@ float32_t3x4 buildCameraLookAtMatrixRH( const float32_t3& target, const float32_t3& upVector) { - const float32_t3 zaxis = core::normalize(position - target); - const float32_t3 xaxis = core::normalize(core::cross(upVector, zaxis)); - const float32_t3 yaxis = core::cross(zaxis, xaxis); + const float32_t3 zaxis = normalize(position - target); + const float32_t3 xaxis = normalize(cross(upVector, zaxis)); + const float32_t3 yaxis = cross(zaxis, xaxis); float32_t3x4 r; r[0] = float32_t4(xaxis, -dot(xaxis, position)); @@ -70,7 +89,7 @@ float32_t3x4 buildCameraLookAtMatrixRH( //! Replaces curent rocation and scale by rotation represented by quaternion `quat`, leaves 4th row and 4th colum unchanged template -inline void setRotation(matrix& outMat, NBL_CONST_REF_ARG(core::quaternion) quat) +inline void setRotation(matrix& outMat, NBL_CONST_REF_ARG(nbl::hlsl::quaternion) quat) { static_assert(N == 3 || N == 4); diff --git a/include/nbl/core/declarations.h b/include/nbl/core/declarations.h index 9aa708a793..466ea988aa 100644 --- a/include/nbl/core/declarations.h +++ b/include/nbl/core/declarations.h @@ -50,7 +50,6 @@ #include "nbl/core/math/colorutil.h" #include "nbl/core/math/rational.h" #include "nbl/core/math/plane3dSIMD.h" -#include "nbl/core/math/matrixutil.h" // memory #include "nbl/core/memory/memory.h" #include "nbl/core/memory/new_delete.h" diff --git a/include/nbl/core/definitions.h b/include/nbl/core/definitions.h index c08af6ad74..5913c2c8f2 100644 --- a/include/nbl/core/definitions.h +++ b/include/nbl/core/definitions.h @@ -15,8 +15,4 @@ #include "nbl/core/math/floatutil.tcc" #include "nbl/core/math/glslFunctions.tcc" -// implementations [deprecated] -#include "matrix3x4SIMD_impl.h" -#include "matrix4SIMD_impl.h" - #endif \ No newline at end of file diff --git a/include/nbl/core/math/floatutil.tcc b/include/nbl/core/math/floatutil.tcc index 71c8bd2da7..86e42c7e73 100644 --- a/include/nbl/core/math/floatutil.tcc +++ b/include/nbl/core/math/floatutil.tcc @@ -7,7 +7,7 @@ #include "nbl/core/math/floatutil.h" -#include "matrix4SIMD.h" +#include "vectorSIMD.h" namespace nbl { @@ -29,16 +29,6 @@ NBL_FORCE_INLINE vectorSIMDf ROUNDING_ERROR() { return vectorSIMDf(ROUNDING_ERROR()); } -template<> -NBL_FORCE_INLINE matrix3x4SIMD ROUNDING_ERROR() -{ - return matrix3x4SIMD(ROUNDING_ERROR(),ROUNDING_ERROR(),ROUNDING_ERROR()); -} -template<> -NBL_FORCE_INLINE matrix4SIMD ROUNDING_ERROR() -{ - return matrix4SIMD(ROUNDING_ERROR(),ROUNDING_ERROR(),ROUNDING_ERROR(),ROUNDING_ERROR()); -} template NBL_FORCE_INLINE T ROUNDING_ERROR() { diff --git 
a/include/nbl/core/math/glslFunctions.h b/include/nbl/core/math/glslFunctions.h index 2bd17cd642..82402a0ae5 100644 --- a/include/nbl/core/math/glslFunctions.h +++ b/include/nbl/core/math/glslFunctions.h @@ -10,6 +10,7 @@ #include "nbl/type_traits.h" #include "nbl/core/math/floatutil.h" +#include "nbl/builtin/hlsl/cpp_compat.hlsl" namespace nbl { @@ -21,8 +22,6 @@ class vectorSIMDBool; template class vectorSIMD_32; class vectorSIMDf; -class matrix4SIMD; -class matrix3x4SIMD; template @@ -317,7 +316,6 @@ NBL_FORCE_INLINE T lerp(const T& a, const T& b, const U& t) return core::mix(a,b,t); } - // TODO : step,smoothstep,isnan,isinf,floatBitsToInt,floatBitsToUint,intBitsToFloat,uintBitsToFloat,frexp,ldexp // extra note, GCC breaks isfinite, isinf, isnan, isnormal, signbit in -ffast-math so need to implement ourselves // TODO : packUnorm2x16, packSnorm2x16, packUnorm4x8, packSnorm4x8, unpackUnorm2x16, unpackSnorm2x16, unpackUnorm4x8, unpackSnorm4x8, packHalf2x16, unpackHalf2x16, packDouble2x32, unpackDouble2x32 @@ -331,7 +329,6 @@ NBL_FORCE_INLINE vectorSIMD_32 dot>(const vector template<> NBL_FORCE_INLINE vectorSIMD_32 dot>(const vectorSIMD_32& a, const vectorSIMD_32& b); - template NBL_FORCE_INLINE T lengthsquared(const T& v) { @@ -355,14 +352,22 @@ NBL_FORCE_INLINE T distancesquared(const T& a, const T& b) } template -NBL_FORCE_INLINE T cross(const T& a, const T& b); +NBL_FORCE_INLINE T cross(const T& a, const T& b) +{ + hlsl::float32_t3 output; + output.x = a[1] * b[2] - a[2] * b[1]; + output.y = a[2] * b[0] - a[0] * b[2]; + output.y = a[0] * b[1] - a[1] * b[0]; + + return output; +} template<> NBL_FORCE_INLINE vectorSIMDf cross(const vectorSIMDf& a, const vectorSIMDf& b); template NBL_FORCE_INLINE T normalize(const T& v) { - auto d = dot(v, v); + auto d = core::dot(v, v); #ifdef __NBL_FAST_MATH return v * core::inversesqrt(d); #else @@ -373,11 +378,6 @@ NBL_FORCE_INLINE T normalize(const T& v) // TODO : matrixCompMult, outerProduct, inverse template NBL_FORCE_INLINE T transpose(const T& m); -template<> -NBL_FORCE_INLINE matrix4SIMD transpose(const matrix4SIMD& m); - - - // Extras diff --git a/include/nbl/core/math/glslFunctions.tcc b/include/nbl/core/math/glslFunctions.tcc index 205585965b..c25464b876 100644 --- a/include/nbl/core/math/glslFunctions.tcc +++ b/include/nbl/core/math/glslFunctions.tcc @@ -8,7 +8,6 @@ #include "nbl/core/declarations.h" #include "nbl/core/math/floatutil.tcc" -#include "matrix4SIMD.h" #include #include @@ -266,6 +265,23 @@ NBL_FORCE_INLINE vectorSIMDu32 dot(const vectorSIMDu32& a, const #endif } +// these functions will be removed with vectorSIMDf, would't worry about them +template<> +NBL_FORCE_INLINE hlsl::float32_t3 dot(const hlsl::float32_t3& a, const hlsl::float32_t3& b) +{ + hlsl::float32_t3 output = a * b; + output.x = output.x + output.y + output.z; + return output; +} + +template<> +NBL_FORCE_INLINE hlsl::float32_t4 dot(const hlsl::float32_t4& a, const hlsl::float32_t4& b) +{ + hlsl::float32_t4 output = a * b; + output.x = output.x + output.y + output.z + output.w; + return output; +} + template<> NBL_FORCE_INLINE vectorSIMDf cross(const vectorSIMDf& a, const vectorSIMDf& b) { @@ -280,21 +296,6 @@ NBL_FORCE_INLINE vectorSIMDf cross(const vectorSIMDf& a, const vect #endif } -template<> -NBL_FORCE_INLINE matrix4SIMD transpose(const matrix4SIMD& m) -{ - core::matrix4SIMD retval; - __m128 a0 = m.rows[0].getAsRegister(), a1 = m.rows[1].getAsRegister(), a2 = m.rows[2].getAsRegister(), a3 = m.rows[3].getAsRegister(); - _MM_TRANSPOSE4_PS(a0, a1, a2, a3); - 
retval.rows[0] = a0; - retval.rows[1] = a1; - retval.rows[2] = a2; - retval.rows[3] = a3; - return retval; -} - - - template<> NBL_FORCE_INLINE bool equals(const vectorSIMDf& a, const vectorSIMDf& b, const vectorSIMDf& tolerance) { @@ -307,22 +308,6 @@ NBL_FORCE_INLINE bool equals(const core::vector3df& a, const core::vector3df& b, auto la = a-tolerance; return ha.X>=b.X&&ha.Y>=b.Y&&ha.Z>=b.Z && la.X<=b.X&&la.Y<=b.Y&&la.Z<=b.Z; } -template<> -NBL_FORCE_INLINE bool equals(const matrix4SIMD& a, const matrix4SIMD& b, const matrix4SIMD& tolerance) -{ - for (size_t i = 0u; i(a.rows[i], b.rows[i], tolerance.rows[i])) - return false; - return true; -} -template<> -NBL_FORCE_INLINE bool equals(const matrix3x4SIMD& a, const matrix3x4SIMD& b, const matrix3x4SIMD& tolerance) -{ - for (size_t i = 0u; i(a.rows[i], b.rows[i], tolerance[i])) - return false; - return true; -} template NBL_FORCE_INLINE bool equals(const T& a, const T& b, const T& tolerance) { diff --git a/include/nbl/core/math/matrixutil.h b/include/nbl/core/math/matrixutil.h deleted file mode 100644 index afe7955c9b..0000000000 --- a/include/nbl/core/math/matrixutil.h +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef _NBL_MATRIX_UTIL_H_INCLUDED_ -#define _NBL_MATRIX_UTIL_H_INCLUDED_ - -#include "matrix4SIMD.h" -#include "matrix3x4SIMD.h" - -namespace nbl::core -{ - - -//! TODO: OPTIMIZE THIS, DON'T PROMOTE THE MATRIX IF DON'T HAVE TO -inline matrix4SIMD concatenateBFollowedByA(const matrix4SIMD& _a, const matrix3x4SIMD& _b) -{ - return concatenateBFollowedByA(_a, matrix4SIMD(_b)); -} -/* -inline matrix4SIMD concatenateBFollowedByAPrecisely(const matrix4SIMD& _a, const matrix3x4SIMD& _b) -{ - return concatenateBFollowedByAPrecisely(_a, matrix4SIMD(_b)); -} -*/ - -} - -#endif diff --git a/include/nbl/core/math/plane3dSIMD.h b/include/nbl/core/math/plane3dSIMD.h index 891ed1300c..f6afafd8de 100644 --- a/include/nbl/core/math/plane3dSIMD.h +++ b/include/nbl/core/math/plane3dSIMD.h @@ -6,7 +6,7 @@ #ifndef __NBL_CORE_PLANE_3D_H_INCLUDED__ #define __NBL_CORE_PLANE_3D_H_INCLUDED__ -#include "matrix3x4SIMD.h" +#include namespace nbl { @@ -99,14 +99,20 @@ class plane3dSIMDf : private vectorSIMDf } //! 
- static inline plane3dSIMDf transform(const plane3dSIMDf& _in, const matrix3x4SIMD& _mat) + static inline plane3dSIMDf transform(const plane3dSIMDf& in, const hlsl::float32_t3x4& mat) { - matrix3x4SIMD inv; - _mat.getInverse(inv); + hlsl::float32_t3x4 a = mat; + hlsl::float32_t4x4 inv = hlsl::getMatrix3x4As4x4(a); + hlsl::inverse(inv); - vectorSIMDf normal(_in.getNormal()); + vectorSIMDf normal(in.getNormal()); // transform by inverse transpose - return plane3dSIMDf(inv.rows[0]*normal.xxxx()+inv.rows[1]*normal.yyyy()+inv.rows[2]*normal.zzzz()+(normal.wwww()&BUILD_MASKF(0,0,0,1))); + hlsl::float32_t4 planeEq = inv[0] * hlsl::float32_t4(normal.x) + inv[1] * hlsl::float32_t4(normal.y) + inv[2] * hlsl::float32_t4(normal.z) + (hlsl::float32_t4(0, 0, 0, normal.w)); + vectorSIMDf planeEqSIMD; + for (int i = 0; i < 4; ++i) + planeEqSIMD[i] = planeEq[i]; + + return plane3dSIMDf(planeEqSIMD); #undef BUILD_MASKF } From 847dc080731a9369ab95c83c1a1c6a8bfc86a219 Mon Sep 17 00:00:00 2001 From: Erfan Ahmadi Date: Fri, 25 Oct 2024 16:39:38 +0400 Subject: [PATCH 105/358] update examples --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index a80dcf86cf..ceb08477ef 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit a80dcf86cfe3712fd8fdb4170f60afc53637fec6 +Subproject commit ceb08477ef578008a490067b613114ae1e469666 From bc8b2a43265e03e90eb33c8df6ad89efcca530f1 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Sat, 26 Oct 2024 09:12:46 -0700 Subject: [PATCH 106/358] Saving work --- examples_tests | 2 +- include/nbl/asset/IAccelerationStructure.h | 9 +-- include/nbl/asset/IAnimationLibrary.h | 6 +- include/nbl/asset/ICPUSkeleton.h | 8 +-- include/nbl/asset/asset_utils.h | 2 +- include/nbl/asset/metadata/IMeshMetadata.h | 2 +- include/nbl/asset/utils/IMeshManipulator.h | 2 +- .../nbl/builtin/hlsl/glsl_compat/math.hlsl | 65 ------------------- .../hlsl/math/quaternion/quaternion.hlsl | 39 ++++++++++- .../transformation_matrix_utils.hlsl | 12 ++++ include/nbl/core/math/glslFunctions.h | 26 +++----- include/nbl/core/math/glslFunctions.tcc | 20 ------ include/nbl/core/math/intutil.h | 4 +- include/nbl/ext/DebugDraw/CDraw3DLine.h | 2 +- include/nbl/ext/MitsubaLoader/CElementShape.h | 2 +- include/nbl/scene/ILevelOfDetailLibrary.h | 6 +- include/vector3d.h | 6 +- src/nbl/asset/interchange/CGLTFLoader.cpp | 2 +- src/nbl/asset/utils/CGeometryCreator.cpp | 6 +- src/nbl/asset/utils/CMeshManipulator.cpp | 17 ++--- 20 files changed, 95 insertions(+), 143 deletions(-) delete mode 100644 include/nbl/builtin/hlsl/glsl_compat/math.hlsl diff --git a/examples_tests b/examples_tests index 13df456b76..d1c0e9f7e6 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 13df456b76c4e38996502fc71d9878334fd5ea18 +Subproject commit d1c0e9f7e6bb05dbacd2318da86909d3a93e08f3 diff --git a/include/nbl/asset/IAccelerationStructure.h b/include/nbl/asset/IAccelerationStructure.h index 94e53238b2..9dc07b2675 100644 --- a/include/nbl/asset/IAccelerationStructure.h +++ b/include/nbl/asset/IAccelerationStructure.h @@ -10,6 +10,7 @@ #include #include "nbl/builtin/hlsl/cpp_compat/matrix.hlsl" +#include "nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl" #include "nbl/asset/ECommonEnums.h" #include "nbl/asset/IDescriptor.h" @@ -181,7 +182,7 @@ class ITopLevelAccelerationStructure : public AccelerationStructure FORCE_OPACITY_MICROMAP_2_STATE_BIT = 0x1u<<4u, FORCE_DISABLE_OPACITY_MICROMAPS_BIT = 0x1u<<5u, }; - // Note: 
`core::matrix3x4SIMD` is equvalent to VkTransformMatrixKHR, 4x3 row_major matrix + // Note: `hlsl::float32_t3x4` is equvalent to VkTransformMatrixKHR, 4x3 row_major matrix template struct Instance final { @@ -197,18 +198,18 @@ class ITopLevelAccelerationStructure : public AccelerationStructure template struct StaticInstance final { - core::matrix3x4SIMD transform = core::matrix3x4SIMD(); + hlsl::float32_t3x4 transform = hlsl::IdentityFloat32_t3x4; Instance base = {}; }; template struct MatrixMotionInstance final { - core::matrix3x4SIMD transform[2] = {core::matrix3x4SIMD(),core::matrix3x4SIMD()}; + hlsl::float32_t3x4 transform[2] = { hlsl::IdentityFloat32_t3x4, hlsl::IdentityFloat32_t3x4 }; Instance base = {}; }; struct SRT { - // TODO: some operators to convert back and forth from `core::matrix3x4SIMD + // TODO: some operators to convert back and forth from `hlsl::float32_t3x4 float sx; float a; diff --git a/include/nbl/asset/IAnimationLibrary.h b/include/nbl/asset/IAnimationLibrary.h index 9665349103..d650cb25d9 100644 --- a/include/nbl/asset/IAnimationLibrary.h +++ b/include/nbl/asset/IAnimationLibrary.h @@ -34,7 +34,7 @@ class IAnimationLibrary : public virtual core::IReferenceCounted translation[2] = translation[1] = translation[0] = 0.f; quat = core::vectorSIMDu32(128u,128u,128u,255u); // should be (0,0,0,1) encoded } - Keyframe(const core::vectorSIMDf& _scale, const core::quaternion& _quat, const CQuantQuaternionCache* quantCache, const core::vectorSIMDf& _translation) + Keyframe(const core::vectorSIMDf& _scale, const hlsl::quaternion& _quat, const CQuantQuaternionCache* quantCache, const core::vectorSIMDf& _translation) { std::copy(_translation.pointer,_translation.pointer+3,translation); quat = quantCache->template quantize(_quat); @@ -42,13 +42,13 @@ class IAnimationLibrary : public virtual core::IReferenceCounted //scale = ; } - inline core::quaternion getRotation() const + inline hlsl::quaternion getRotation() const { const void* _pix[4] = {&quat,nullptr,nullptr,nullptr}; double out[4]; decodePixels(_pix,out,0u,0u); auto q = core::normalize(core::vectorSIMDf(out[0],out[1],out[2],out[3])); - return reinterpret_cast(&q)[0]; + return reinterpret_cast*>(&q)[0]; } inline core::vectorSIMDf getScale() const diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h index 6f1c576ed8..53d7e66be0 100644 --- a/include/nbl/asset/ICPUSkeleton.h +++ b/include/nbl/asset/ICPUSkeleton.h @@ -42,15 +42,15 @@ class ICPUSkeleton final : public ISkeleton, public IAsset } //! - inline const core::matrix3x4SIMD& getDefaultTransformMatrix(base_t::joint_id_t jointID) const + inline const hlsl::float32_t3x4& getDefaultTransformMatrix(base_t::joint_id_t jointID) const { const uint8_t* ptr = reinterpret_cast(m_defaultTransforms.buffer->getPointer()); - return reinterpret_cast(ptr+m_defaultTransforms.offset)[jointID]; + return reinterpret_cast(ptr+m_defaultTransforms.offset)[jointID]; } - inline core::matrix3x4SIMD& getDefaultTransformMatrix(base_t::joint_id_t jointID) + inline hlsl::float32_t3x4& getDefaultTransformMatrix(base_t::joint_id_t jointID) { assert(isMutable()); - return const_cast(const_cast(this)->getDefaultTransformMatrix(jointID)); + return const_cast(const_cast(this)->getDefaultTransformMatrix(jointID)); } //! 
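For reference, a minimal self-contained sketch (not part of this patch; the struct and helper names are illustrative) of the row-major 3x4 layout that hlsl::float32_t3x4 and VkTransformMatrixKHR share: 3 rows of 4 floats with the translation in the last column, 48 bytes tightly packed. This is the layout assumption behind reinterpret_cast'ing the packed default-transform array above.

    // illustrative only, plain C++ with no Nabla dependencies
    #include <array>

    struct RowMajor3x4
    {
        // rows[r][c]; rows[r][3] holds the translation component for axis r
        float rows[3][4];

        static RowMajor3x4 identity()
        {
            return {{{1.f,0.f,0.f,0.f},
                     {0.f,1.f,0.f,0.f},
                     {0.f,0.f,1.f,0.f}}};
        }

        // affine transform of a point: out[r] = dot(rows[r].xyz, p) + rows[r][3]
        std::array<float,3> transformPoint(const std::array<float,3>& p) const
        {
            std::array<float,3> out{};
            for (int r = 0; r < 3; ++r)
                out[r] = rows[r][0]*p[0] + rows[r][1]*p[1] + rows[r][2]*p[2] + rows[r][3];
            return out;
        }
    };
    // reinterpreting an array of such transforms only holds if they are tightly packed
    static_assert(sizeof(RowMajor3x4) == 12*sizeof(float), "must be tightly packed");
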
diff --git a/include/nbl/asset/asset_utils.h b/include/nbl/asset/asset_utils.h index 8e4e35a733..84c8a8df45 100644 --- a/include/nbl/asset/asset_utils.h +++ b/include/nbl/asset/asset_utils.h @@ -31,7 +31,7 @@ inline void fillBufferWithDeadBeef(ICPUBuffer* _buf) #include "nbl/nblpack.h" //! Designed for use with interface blocks declared with `layout (row_major, std140)` -// TODO: change members to core::matrix3x4SIMD and core::matrix4SIMD +// TODO: change members to hlsl::float32_t3x4 and hlsl::float32_t4x4 struct SBasicViewParameters { float MVP[4*4]; diff --git a/include/nbl/asset/metadata/IMeshMetadata.h b/include/nbl/asset/metadata/IMeshMetadata.h index 5ce3c12980..25f72e05c0 100644 --- a/include/nbl/asset/metadata/IMeshMetadata.h +++ b/include/nbl/asset/metadata/IMeshMetadata.h @@ -18,7 +18,7 @@ class IMeshMetadata : public core::Interface public: struct SInstance { - core::matrix3x4SIMD worldTform; + hlsl::float32_t3x4 worldTform; }; core::SRange m_instances; diff --git a/include/nbl/asset/utils/IMeshManipulator.h b/include/nbl/asset/utils/IMeshManipulator.h index dfdd44e475..27569e4f60 100644 --- a/include/nbl/asset/utils/IMeshManipulator.h +++ b/include/nbl/asset/utils/IMeshManipulator.h @@ -351,7 +351,7 @@ class NBL_API2 IMeshManipulator : public virtual core::IReferenceCounted static float DistanceToLine(core::vectorSIMDf P0, core::vectorSIMDf P1, core::vectorSIMDf InPoint); static float DistanceToPlane(core::vectorSIMDf InPoint, core::vectorSIMDf PlanePoint, core::vectorSIMDf PlaneNormal); - static core::matrix3x4SIMD calculateOBB(const nbl::asset::ICPUMeshBuffer* meshbuffer); + static hlsl::float32_t3x4 calculateOBB(const nbl::asset::ICPUMeshBuffer* meshbuffer); //! Calculates bounding box of the meshbuffer static inline core::aabbox3df calculateBoundingBox( diff --git a/include/nbl/builtin/hlsl/glsl_compat/math.hlsl b/include/nbl/builtin/hlsl/glsl_compat/math.hlsl deleted file mode 100644 index 22d6314d0d..0000000000 --- a/include/nbl/builtin/hlsl/glsl_compat/math.hlsl +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_BUILTIN_HLSL_GLSL_COMPAT_CORE_INCLUDED_ -#define _NBL_BUILTIN_HLSL_GLSL_COMPAT_CORE_INCLUDED_ - -#include "nbl/builtin/hlsl/cpp_compat.hlsl" -#include "nbl/builtin/hlsl/spirv_intrinsics/core.hlsl" -#include "nbl/builtin/hlsl/type_traits.hlsl" - -#ifndef __HLSL_VERSION -#include -#endif - -namespace nbl -{ -namespace hlsl -{ -namespace glsl -{ - -template -inline T radians(NBL_CONST_REF_ARG(T) degrees) -{ - static_assert( - is_floating_point::value, - "This code expects the type to be either a double or a float." - ); - - return degrees * PI() / T(180); -} - -template -inline T degrees(NBL_CONST_REF_ARG(T) radians) -{ - static_assert( - is_floating_point::value, - "This code expects the type to be either a double or a float." 
- ); - - return radians * T(180) / PI(); -} - -template -inline bool equals(NBL_CONST_REF_ARG(T) a, NBL_CONST_REF_ARG(T) b, NBL_CONST_REF_ARG(T) tolerance) -{ - return (a + tolerance >= b) && (a - tolerance <= b); -} - -#ifndef __HLSL_VERSION - -NBL_FORCE_INLINE bool equals(const core::vector3df& a, const core::vector3df& b, const core::vector3df& tolerance) -{ - auto ha = a + tolerance; - auto la = a - tolerance; - return ha.X >= b.X && ha.Y >= b.Y && ha.Z >= b.Z && la.X <= b.X && la.Y <= b.Y && la.Z <= b.Z; -} - -#endif - -} -} -} - -#endif diff --git a/include/nbl/builtin/hlsl/math/quaternion/quaternion.hlsl b/include/nbl/builtin/hlsl/math/quaternion/quaternion.hlsl index 2991a79e35..2f24e52635 100644 --- a/include/nbl/builtin/hlsl/math/quaternion/quaternion.hlsl +++ b/include/nbl/builtin/hlsl/math/quaternion/quaternion.hlsl @@ -13,7 +13,6 @@ namespace nbl namespace hlsl { - //! Quaternion class for representing rotations. /** It provides cheap combinations and avoids gimbal locks. Also useful for interpolations. */ @@ -22,6 +21,7 @@ template struct quaternion { // i*data[0] + j*data[1] + k*data[2] + data[3] + using vec_t = vector; vector data; //! creates identity quaternion @@ -36,11 +36,46 @@ struct quaternion static inline quaternion create(T x, T y, T z, T w) { quaternion q; - q.data = vector(x, y, z, z); + q.data = vector(x, y, z, w); return q; } + static inline quaternion create(NBL_CONST_REF_ARG(quaternion) other) + { + return other; + } + + static inline quaternion create(T pitch, T yaw, T roll) + { + float angle; + + angle = roll * 0.5f; + const float sr = sinf(angle); + const float cr = cosf(angle); + + angle = pitch * 0.5f; + const float sp = sinf(angle); + const float cp = cos(angle); + + angle = yaw * 0.5f; + const float sy = sinf(angle); + const float cy = cosf(angle); + + const float cpcy = cp * cy; + const float spcy = sp * cy; + const float cpsy = cp * sy; + const float spsy = sp * sy; + + quaternion output; + output.data = float32_t4(sr, cr, cr, cr) * float32_t4(cpcy, spcy, cpsy, cpcy) + float32_t4(-cr, sr, -sr, sr) * float32_t4(spsy, cpsy, spcy, spsy); + + return output; + } + + // TODO: + //explicit quaternion(NBL_CONST_REF_ARG(float32_t3x4) m) {} + #define DEFINE_MUL_QUATERNION_BY_SCALAR_OPERATOR(TYPE)\ inline quaternion operator*(TYPE scalar)\ {\ diff --git a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl index 9b3ce2f602..cb3391c16f 100644 --- a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl +++ b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl @@ -5,12 +5,15 @@ #include // TODO: remove this header when deleting vectorSIMDf.hlsl #include +#include "vectorSIMD.h" namespace nbl { namespace hlsl { +NBL_CONSTEXPR hlsl::float32_t3x4 IdentityFloat32_t3x4 = hlsl::float32_t3x4(hlsl::float32_t4(1, 0, 0, 0), hlsl::float32_t4(0, 0, 1, 0), hlsl::float32_t4(0, 0, 1, 0)); + // TODO: this is temporary function, delete when removing vectorSIMD template core::vectorSIMDf transformVector(const matrix& mat, const core::vectorSIMDf& vec) @@ -125,6 +128,15 @@ inline void setTranslation(matrix& outMat, NBL_CONST_REF_ARG(vector +inline void setTranslation_tmp(matrix& outMat, NBL_CONST_REF_ARG(core::vectorSIMDf) translation) +{ + outMat[0].w = translation.x; + outMat[1].w = translation.y; + outMat[2].w = translation.z; +} + } } diff --git a/include/nbl/core/math/glslFunctions.h b/include/nbl/core/math/glslFunctions.h index 82402a0ae5..7fa7385dad 
100644 --- a/include/nbl/core/math/glslFunctions.h +++ b/include/nbl/core/math/glslFunctions.h @@ -122,17 +122,17 @@ NBL_FORCE_INLINE T mix(const T & a, const T & b, const U & t) } else { - if constexpr(nbl::is_any_of::value) + if constexpr(nbl::is_any_of::value) { for (uint32_t i=0u; i::value) + if constexpr(nbl::is_any_of::value) { - retval[i] = core::mix(a.rows[i], b.rows[i], t.rows[i]); + retval[i] = core::mix(a[i], b[i], t[i]); } else { - retval[i] = core::mix(a.rows[i], b.rows[i], t); + retval[i] = core::mix(a[i], b[i], t); } } @@ -316,6 +316,7 @@ NBL_FORCE_INLINE T lerp(const T& a, const T& b, const U& t) return core::mix(a,b,t); } + // TODO : step,smoothstep,isnan,isinf,floatBitsToInt,floatBitsToUint,intBitsToFloat,uintBitsToFloat,frexp,ldexp // extra note, GCC breaks isfinite, isinf, isnan, isnormal, signbit in -ffast-math so need to implement ourselves // TODO : packUnorm2x16, packSnorm2x16, packUnorm4x8, packSnorm4x8, unpackUnorm2x16, unpackSnorm2x16, unpackUnorm4x8, unpackSnorm4x8, packHalf2x16, unpackHalf2x16, packDouble2x32, unpackDouble2x32 @@ -329,6 +330,7 @@ NBL_FORCE_INLINE vectorSIMD_32 dot>(const vector template<> NBL_FORCE_INLINE vectorSIMD_32 dot>(const vectorSIMD_32& a, const vectorSIMD_32& b); + template NBL_FORCE_INLINE T lengthsquared(const T& v) { @@ -352,22 +354,14 @@ NBL_FORCE_INLINE T distancesquared(const T& a, const T& b) } template -NBL_FORCE_INLINE T cross(const T& a, const T& b) -{ - hlsl::float32_t3 output; - output.x = a[1] * b[2] - a[2] * b[1]; - output.y = a[2] * b[0] - a[0] * b[2]; - output.y = a[0] * b[1] - a[1] * b[0]; - - return output; -} +NBL_FORCE_INLINE T cross(const T& a, const T& b); template<> NBL_FORCE_INLINE vectorSIMDf cross(const vectorSIMDf& a, const vectorSIMDf& b); template NBL_FORCE_INLINE T normalize(const T& v) { - auto d = core::dot(v, v); + auto d = dot(v, v); #ifdef __NBL_FAST_MATH return v * core::inversesqrt(d); #else @@ -424,10 +418,6 @@ template NBL_FORCE_INLINE bool equals(const T& a, const T& b, const T& tolerance); template<> NBL_FORCE_INLINE bool equals(const vectorSIMDf& a, const vectorSIMDf& b, const vectorSIMDf& tolerance); -template<> -NBL_FORCE_INLINE bool equals(const matrix4SIMD& a, const matrix4SIMD& b, const matrix4SIMD& tolerance); -template<> -NBL_FORCE_INLINE bool equals(const matrix3x4SIMD& a, const matrix3x4SIMD& b, const matrix3x4SIMD& tolerance); //! 
returns if a equals zero, taking rounding errors into account diff --git a/include/nbl/core/math/glslFunctions.tcc b/include/nbl/core/math/glslFunctions.tcc index c25464b876..d2343f0322 100644 --- a/include/nbl/core/math/glslFunctions.tcc +++ b/include/nbl/core/math/glslFunctions.tcc @@ -265,23 +265,6 @@ NBL_FORCE_INLINE vectorSIMDu32 dot(const vectorSIMDu32& a, const #endif } -// these functions will be removed with vectorSIMDf, would't worry about them -template<> -NBL_FORCE_INLINE hlsl::float32_t3 dot(const hlsl::float32_t3& a, const hlsl::float32_t3& b) -{ - hlsl::float32_t3 output = a * b; - output.x = output.x + output.y + output.z; - return output; -} - -template<> -NBL_FORCE_INLINE hlsl::float32_t4 dot(const hlsl::float32_t4& a, const hlsl::float32_t4& b) -{ - hlsl::float32_t4 output = a * b; - output.x = output.x + output.y + output.z + output.w; - return output; -} - template<> NBL_FORCE_INLINE vectorSIMDf cross(const vectorSIMDf& a, const vectorSIMDf& b) { @@ -314,7 +297,6 @@ NBL_FORCE_INLINE bool equals(const T& a, const T& b, const T& tolerance) return (a + tolerance >= b) && (a - tolerance <= b); } - template<> NBL_FORCE_INLINE vectorSIMDf sin(const vectorSIMDf& a) { @@ -327,8 +309,6 @@ NBL_FORCE_INLINE T sin(const T& a) return std::sin(a); } - - // extras diff --git a/include/nbl/core/math/intutil.h b/include/nbl/core/math/intutil.h index d365835163..768883f2b0 100644 --- a/include/nbl/core/math/intutil.h +++ b/include/nbl/core/math/intutil.h @@ -3,8 +3,8 @@ // For conditions of distribution and use, see copyright notice in nabla.h // TODO: kill this file -#ifndef __NBL_MATH_H_INCLUDED__ -#define __NBL_MATH_H_INCLUDED__ +#ifndef __NBL_INTUTIL_H_INCLUDED__ +#define __NBL_INTUTIL_H_INCLUDED__ #include "BuildConfigOptions.h" diff --git a/include/nbl/ext/DebugDraw/CDraw3DLine.h b/include/nbl/ext/DebugDraw/CDraw3DLine.h index 68cd64e9c1..2437ce4bc5 100644 --- a/include/nbl/ext/DebugDraw/CDraw3DLine.h +++ b/include/nbl/ext/DebugDraw/CDraw3DLine.h @@ -91,7 +91,7 @@ class CDraw3DLine : public core::IReferenceCounted */ void recordToCommandBuffer(video::IGPUCommandBuffer* cmdBuffer, video::IGPUGraphicsPipeline* graphics_pipeline); - inline void addBox(const core::aabbox3df& box, float r, float g, float b, float a, const core::matrix3x4SIMD& tform=core::matrix3x4SIMD()) + inline void addBox(const core::aabbox3df& box, float r, float g, float b, float a, const hlsl::float32_t3x4& tform=hlsl::float32_t3x4()) { auto addLine = [&](auto s, auto e) -> void { diff --git a/include/nbl/ext/MitsubaLoader/CElementShape.h b/include/nbl/ext/MitsubaLoader/CElementShape.h index 205023afea..c1725963b2 100644 --- a/include/nbl/ext/MitsubaLoader/CElementShape.h +++ b/include/nbl/ext/MitsubaLoader/CElementShape.h @@ -225,7 +225,7 @@ class CElementShape : public IElement std::string getLogName() const override { return "shape"; } - inline core::matrix3x4SIMD getAbsoluteTransform() const + inline hlsl::float32_t3x4 getAbsoluteTransform() const { auto local = transform.matrix.extractSub3x4(); // TODO restore at some point (and make it actually work??) 
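The matrix specializations of equals() are deleted above without a direct replacement; below is a hedged sketch of how a caller could compare the new row-major matrix types with a per-component tolerance. The helper name is hypothetical, and the row-wise operator[] access only mirrors the indexing already used elsewhere in this series (e.g. proj[0].x); this is illustrative, not part of the diff.

    // hypothetical per-row tolerance compare for hlsl::float32_t3x4 / float32_t4x4 style types
    template<typename Mat, int Rows>
    inline bool rowsEqual(const Mat& a, const Mat& b, const Mat& tolerance)
    {
        for (int r = 0; r < Rows; ++r)
        {
            const auto hi = b[r] + tolerance[r];
            const auto lo = b[r] - tolerance[r];
            for (int c = 0; c < 4; ++c)
                if (a[r][c] > hi[c] || a[r][c] < lo[c])
                    return false;
        }
        return true;
    }
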
diff --git a/include/nbl/scene/ILevelOfDetailLibrary.h b/include/nbl/scene/ILevelOfDetailLibrary.h index 12fbf9dabf..6b68189e7d 100644 --- a/include/nbl/scene/ILevelOfDetailLibrary.h +++ b/include/nbl/scene/ILevelOfDetailLibrary.h @@ -26,11 +26,11 @@ class ILevelOfDetailLibrary : public virtual core::IReferenceCounted return distanceSqAtReferenceFoV(); - return abs(proj.rows[0].x*proj.rows[1].y-proj.rows[0].y*proj.rows[1].x)/dot(proj.rows[3],proj.rows[3]).x; + return abs(proj[0].x*proj[1].y-proj[0].y*proj[1].x)/core::dot(proj[3],proj[3]).x; } }; template class container=core::vector> diff --git a/include/vector3d.h b/include/vector3d.h index 327eaacdb2..e5a4905da3 100644 --- a/include/vector3d.h +++ b/include/vector3d.h @@ -6,7 +6,7 @@ #ifndef __NBL_POINT_3D_H_INCLUDED__ #define __NBL_POINT_3D_H_INCLUDED__ -#include "nbl/core/math/glslFunctions.h" +#include namespace nbl { @@ -63,7 +63,7 @@ namespace core //! use weak float compare bool operator==(const vector3d& other) const { - return core::equals >(*this,other,vector3d(core::ROUNDING_ERROR())); + return core::equals >(*this, other, vector3d(core::ROUNDING_ERROR())); } bool operator!=(const vector3d& other) const @@ -72,7 +72,6 @@ namespace core } // functions - vector3d& set(const T nx, const T ny, const T nz) {X=nx; Y=ny; Z=nz; return *this;} vector3d& set(const vector3d& p) {X=p.X; Y=p.Y; Z=p.Z;return *this;} @@ -275,7 +274,6 @@ namespace core template vector3d operator*(const S scalar, const vector3d& vector) { return vector*scalar; } - } // end namespace core } // end namespace nbl diff --git a/src/nbl/asset/interchange/CGLTFLoader.cpp b/src/nbl/asset/interchange/CGLTFLoader.cpp index 7382bd2f08..63fd42c552 100644 --- a/src/nbl/asset/interchange/CGLTFLoader.cpp +++ b/src/nbl/asset/interchange/CGLTFLoader.cpp @@ -2398,7 +2398,7 @@ using namespace nbl::asset; { core::vector3df_SIMD translation = {}; //!< The node's translation along the x, y, and z axes. core::vector3df_SIMD scale = core::vector3df_SIMD(1.f,1.f,1.f); //!< The node's non-uniform scale, given as the scaling factors along the x, y, and z axes. - core::quaternion rotation = {}; //!< The node's unit quaternion rotation in the order (x, y, z, w), where w is the scalar. + hlsl::quaternion rotation = {}; //!< The node's unit quaternion rotation in the order (x, y, z, w), where w is the scalar. 
} trs; if (translation.error() != simdjson::error_code::NO_SUCH_FIELD) diff --git a/src/nbl/asset/utils/CGeometryCreator.cpp b/src/nbl/asset/utils/CGeometryCreator.cpp index 50898be56f..c740b85d97 100644 --- a/src/nbl/asset/utils/CGeometryCreator.cpp +++ b/src/nbl/asset/utils/CGeometryCreator.cpp @@ -680,7 +680,7 @@ CGeometryCreator::return_type CGeometryCreator::createDiskMesh(float radius, uin DiskVertex* ptr = (DiskVertex*)vertices->getPointer(); const core::vectorSIMDf v0(0.0f, radius, 0.0f, 1.0f); - core::matrix3x4SIMD rotation; + hlsl::float32_t3x4 rotation; //center ptr[0] = DiskVertex(core::vector3df_SIMD(0.0f), video::SColor(0xFFFFFFFFu), @@ -697,8 +697,8 @@ CGeometryCreator::return_type CGeometryCreator::createDiskMesh(float radius, uin for (int i = 2; i < vertexCount-1; i++) { core::vectorSIMDf vn; - core::matrix3x4SIMD rotMatrix; - rotMatrix.setRotation(core::quaternion(0.0f, 0.0f, core::radians((i-1)*angle))); + hlsl::float32_t3x4 rotMatrix; + rotMatrix.setRotation(hlsl::quaternion(0.0f, 0.0f, core::radians((i-1)*angle))); rotMatrix.transformVect(vn, v0); ptr[i] = DiskVertex(vn, video::SColor(0xFFFFFFFFu), diff --git a/src/nbl/asset/utils/CMeshManipulator.cpp b/src/nbl/asset/utils/CMeshManipulator.cpp index 8996972a98..86ead8d302 100644 --- a/src/nbl/asset/utils/CMeshManipulator.cpp +++ b/src/nbl/asset/utils/CMeshManipulator.cpp @@ -11,6 +11,7 @@ #include "nbl/asset/asset.h" +#include "nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl" #include "nbl/asset/IRenderpassIndependentPipeline.h" #include "nbl/asset/utils/CMeshManipulator.h" #include "nbl/asset/utils/CSmoothNormalGenerator.h" @@ -1526,7 +1527,7 @@ float IMeshManipulator::DistanceToPlane(core::vectorSIMDf InPoint, core::vectorS return (core::dot(PointToPlane, PlaneNormal).x >= 0) ? 
core::abs(core::dot(PointToPlane, PlaneNormal).x) : 0; } -core::matrix3x4SIMD IMeshManipulator::calculateOBB(const nbl::asset::ICPUMeshBuffer* meshbuffer) +hlsl::float32_t3x4 IMeshManipulator::calculateOBB(const nbl::asset::ICPUMeshBuffer* meshbuffer) { auto FindMinMaxProj = [&](const core::vectorSIMDf& Dir, const core::vectorSIMDf Extrema[]) -> core::vectorSIMDf { @@ -1682,7 +1683,7 @@ core::matrix3x4SIMD IMeshManipulator::calculateOBB(const nbl::asset::ICPUMeshBuf ComputeAxis(P2, P1, Q1, BestAxis, BestQuality, Extrema); ComputeAxis(Q1, P2, P1, BestAxis, BestQuality, Extrema); - core::matrix3x4SIMD TransMat = core::matrix3x4SIMD( + hlsl::float32_t3x4 TransMat = hlsl::float32_t3x4( BestAxis[0].x, BestAxis[1].x, BestAxis[2].x, 0, BestAxis[0].y, BestAxis[1].y, BestAxis[2].y, 0, BestAxis[0].z, BestAxis[1].z, BestAxis[2].z, 0); @@ -1705,12 +1706,12 @@ core::matrix3x4SIMD IMeshManipulator::calculateOBB(const nbl::asset::ICPUMeshBuf core::vectorSIMDf ABBDiff = AABBMax - AABBMin; float ABBQuality = ABBDiff.x * ABBDiff.y + ABBDiff.y * ABBDiff.z + ABBDiff.z * ABBDiff.x; - core::matrix3x4SIMD scaleMat; - core::matrix3x4SIMD translationMat; - translationMat.setTranslation(-(MinPoint) / OBBDiff); - scaleMat.setScale(OBBDiff); - TransMat = core::concatenateBFollowedByA(TransMat, scaleMat); - TransMat = core::concatenateBFollowedByA(TransMat, translationMat); + hlsl::float32_t3x4 scaleMat; + hlsl::float32_t3x4 translationMat; + hlsl::setTranslation(translationMat, -(MinPoint) / OBBDiff); + hlsl::setScale(scaleMat, OBBDiff); + TransMat = hlsl::concatenateBFollowedByA(TransMat, scaleMat); + TransMat = hlsl::concatenateBFollowedByA(TransMat, translationMat); if (ABBQuality < OBBQuality) { translationMat.setTranslation(-(AABBMin) / ABBDiff); scaleMat.setScale(ABBDiff); From 5b6b09e2df8faf88d10dbe790bc1b0f9aea2826e Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 28 Oct 2024 15:44:59 +0100 Subject: [PATCH 107/358] sketch out pipeline caching --- include/nbl/video/utilities/CComputeBlit.h | 37 +++++----- src/nbl/video/utilities/CComputeBlit.cpp | 86 +++++++++++++++++++--- 2 files changed, 95 insertions(+), 28 deletions(-) diff --git a/include/nbl/video/utilities/CComputeBlit.h b/include/nbl/video/utilities/CComputeBlit.h index dc4c6f3c52..779dc2ddc0 100644 --- a/include/nbl/video/utilities/CComputeBlit.h +++ b/include/nbl/video/utilities/CComputeBlit.h @@ -41,7 +41,21 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted } // ctor - inline CComputeBlit(core::smart_refctd_ptr&& logicalDevice) : m_device(std::move(logicalDevice)) {} + CComputeBlit( + core::smart_refctd_ptr&& logicalDevice, + core::smart_refctd_ptr&& cache=nullptr, + core::smart_refctd_ptr&& logger=nullptr + ); + + // if you set the balues too small, we'll correct them ourselves anyway + struct STask + { + uint32_t workgroupSizeLog2 : 4 = 0; + // the TRUE output format, not the storage view format you might manually encode into + hlsl::format::TexelBlockFormat outputFormat : 8 = hlsl::format::TexelBlockFormat::TBF_UNKNOWN; + uint32_t sharedMemoryPerInvocation : 6 = 0; + uint32_t unused : 14 = 0; + }; //! Returns the original format if supports STORAGE_IMAGE otherwise returns a format in its compat class which supports STORAGE_IMAGE. 
inline asset::E_FORMAT getOutputViewFormat(const asset::E_FORMAT format) @@ -66,22 +80,7 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted return compatFormat; } } -/* - struct STask - { - hlsl::vector preloadWindow; - asset::E_FORMAT inFormat; - asset::E_FORMAT outFormat; - // default no coverage adjustment - uint8_t alphaBinCountLog2 : 4 = 0; - }; - inline void initializeTaskDefault(STask& task) const - { - auto physDev = m_device->getPhysicalDevice(); - const auto formatTrait = hlsl::format::getTraits(static_cast(task.outFormat)); - task.alphaBinCountLog2 = hlsl::max(,task.alphaBinCountLog2); - } -*/ + #if 0 // @param `alphaBinCount` is only required to size the histogram present in the default nbl_glsl_blit_AlphaStatistics_t in default_compute_common.comp core::smart_refctd_ptr createAlphaTestSpecializedShader(const asset::IImage::E_TYPE inImageType, const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount); @@ -666,7 +665,11 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted EBT_COUNT }; + void createAndCachePipelines(CAssetConverter* converter, core::smart_refctd_ptr* pipelines, const std::span tasks); + core::smart_refctd_ptr m_device; + system::logger_opt_smart_ptr m_logger; + core::smart_refctd_ptr m_shaderCache; //! This calculates the inclusive upper bound on the preload region i.e. it will be reachable for some cases. For the rest it will be bigger //! by a pixel in each dimension. diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp index 012f8a2305..5856f42fc5 100644 --- a/src/nbl/video/utilities/CComputeBlit.cpp +++ b/src/nbl/video/utilities/CComputeBlit.cpp @@ -5,6 +5,79 @@ using namespace nbl::system; using namespace nbl::asset; using namespace nbl::video; + +CComputeBlit::CComputeBlit(smart_refctd_ptr&& logicalDevice, smart_refctd_ptr&& cache, smart_refctd_ptr&& logger) : m_device(std::move(logicalDevice)), m_logger(nullptr) +{ + if (logger) + m_logger = std::move(logger); + else if (auto debugCb=m_device->getPhysicalDevice()->getDebugCallback(); debugCb->getLogger()) + m_logger = smart_refctd_ptr(debugCb->getLogger()); + + if (cache) + m_shaderCache = std::move(cache); + else + m_shaderCache = make_smart_refctd_ptr(); +} + +void CComputeBlit::createAndCachePipelines(CAssetConverter* converter, smart_refctd_ptr* pipelines, const std::span tasks) +{ + core::vector> cpuPplns; + cpuPplns.reserve(tasks.size()); + + const auto& limits = m_device->getPhysicalDevice()->getLimits(); + for (auto task : tasks) + { + // adjust task default values + { + if (task.workgroupSizeLog2(task.outputFormat),3,1.f); + const auto precisionAt0 = getFormatPrecision(static_cast(task.outputFormat),3,0.f); + if (limits.workgroupMemoryExplicitLayout16BitAccess && limits.shaderFloat16 && precisionAt1>=std::exp2f(-11.f) && precisionAt0>=std::numeric_limits::min()) + useFloat16 = true; + } + // the absolute minimum needed to store a single pixel + const auto singlePixelStorage = channels*(useFloat16 ? 
sizeof(hlsl::float16_t):sizeof(hlsl::float32_t)); + // also slightly more memory is needed + task.sharedMemoryPerInvocation = core::max(singlePixelStorage*2,task.sharedMemoryPerInvocation); + } + // create blit pipeline + cpuPplns.emplace_back(nullptr); + // create optional coverage normalization pipeline + cpuPplns.emplace_back(nullptr); + } + + CAssetConverter::SInputs inputs = {}; + inputs.readCache = converter; + inputs.logger = m_logger.getRaw(); + std::get>(inputs.assets) = {&cpuPplns.data()->get(),cpuPplns.size()}; + inputs.readShaderCache = m_shaderCache.get(); + inputs.writeShaderCache = m_shaderCache.get(); + // no pipeline cache, because we only make the same pipeline once, ever + auto reserveResults = converter->reserve(inputs); + assert(reserveResults.getRequiredQueueFlags().value==IQueue::FAMILY_FLAGS::NONE); + // copy over the results + { + auto rIt = reserveResults.getGPUObjects().data(); + // TODO: redo + for (size_t i=0; ivalue; + } + + // this just inserts the pipelines into the cache + { + CAssetConverter::SConvertParams params = {}; + auto convertResults = reserveResults.convert(params); + assert(!convertResults.blocking()); + } +} + #if 0 core::smart_refctd_ptr CComputeBlit::createAlphaTestSpecializedShader(const asset::IImage::E_TYPE imageType, const uint32_t alphaBinCount) { @@ -39,21 +112,14 @@ core::smart_refctd_ptr CComputeBlit::createAlphaTestSpecializ "}\n"; auto cpuShader = core::make_smart_refctd_ptr(shaderSourceStream.str().c_str(), IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, IGPUShader::E_CONTENT_TYPE::ECT_HLSL, "CComputeBlitGLSLGLSL::createAlphaTestSpecializedShader"); - - return m_device->createShader(std::move(cpuShader.get())); } -core::smart_refctd_ptr CComputeBlit::createNormalizationSpecializedShader(const asset::IImage::E_TYPE imageType, const asset::E_FORMAT outFormat, - const uint32_t alphaBinCount) +core::smart_refctd_ptr CComputeBlit::createNormalizationSpecializedShader(const asset::IImage::E_TYPE imageType, const uint32_t alphaBinCount) { const auto workgroupDims = getDefaultWorkgroupDims(imageType); const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount); const uint32_t blitDimCount = static_cast(imageType) + 1; - const auto castedFormat = getOutImageViewFormat(outFormat); - assert(outFormat == castedFormat); - const char* formatQualifier = asset::CHLSLCompiler::getStorageImageFormatQualifier(castedFormat); - std::ostringstream shaderSourceStream; shaderSourceStream @@ -67,7 +133,7 @@ core::smart_refctd_ptr CComputeBlit::createNormalizationSpeci "[[vk::binding(0, 0)]]\n" "nbl::hlsl::blit::impl::dim_to_image_properties::combined_sampler_t inCS;\n" - "[[vk::image_format(\"" << formatQualifier << "\")]]\n" + "[[vk::image_format(\"unknown\")]]\n" "[[vk::binding(1, 0)]]\n" "nbl::hlsl::blit::impl::dim_to_image_properties::image_t outImg;\n" @@ -90,7 +156,5 @@ core::smart_refctd_ptr CComputeBlit::createNormalizationSpeci "}\n"; auto cpuShader = core::make_smart_refctd_ptr(shaderSourceStream.str().c_str(), IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, IGPUShader::E_CONTENT_TYPE::ECT_HLSL, "CComputeBlitGLSL::createNormalizationSpecializedShader"); - - return m_device->createShader(std::move(cpuShader.get())); } #endif \ No newline at end of file From 492a0ad804b65e656c8a7f29fbd5b16d7d58dc92 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 29 Oct 2024 15:05:55 +0100 Subject: [PATCH 108/358] ladies and gentlemen we have C++20 concepts in HLSL ! 
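Ahead of the diff, a hypothetical usage sketch of the concept helper macros introduced below in concepts.hlsl (Foo, Bar and some_condition are placeholders, not part of the patch). It follows the placement rules spelled out in the macro comments, including repeating the identical condition in NBL_PARTIAL_REQ_TOP and NBL_PARTIAL_REQ_BOT so the C++20 and enable_if paths stay in sync.

    // constrained primary template: the macro replaces the closing '>' of the
    // template parameter list and carries the condition
    template<typename T NBL_PRIMARY_REQUIRES(some_condition<T>::value)
    struct Foo
    {
        T x;
    };

    // primary template that only exists to be partially specialized
    template<typename T NBL_STRUCT_CONSTRAINABLE>
    struct Bar;

    // constrained partial specialization: the same condition appears once after
    // the 'template<...>' declaration and once before the closing '>' of the
    // specialization's argument list
    template<typename T> NBL_PARTIAL_REQ_TOP(some_condition<T>::value)
    struct Bar<T NBL_PARTIAL_REQ_BOT(some_condition<T>::value) >
    {
        T x;
    };
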
--- include/nbl/builtin/hlsl/concepts.hlsl | 42 ++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/include/nbl/builtin/hlsl/concepts.hlsl b/include/nbl/builtin/hlsl/concepts.hlsl index b252b34379..91dc76970f 100644 --- a/include/nbl/builtin/hlsl/concepts.hlsl +++ b/include/nbl/builtin/hlsl/concepts.hlsl @@ -1,23 +1,40 @@ -// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h - #ifndef _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_ #define _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_ + #include #include #include -#if (__cplusplus >= 202002L && __cpp_concepts) +#ifndef __HLSL_VERSION +// TODO: old stuff, see how much we can remove #define NBL_CONCEPT_TYPE_PARAMS(...) template <__VA_ARGS__> #define NBL_CONCEPT_SIGNATURE(NAME, ...) concept NAME = requires(__VA_ARGS__) #define NBL_CONCEPT_BODY(...) { __VA_ARGS__ }; #define NBL_CONCEPT_ASSIGN(NAME, ...) concept NAME = __VA_ARGS__; #define NBL_REQUIRES(...) requires __VA_ARGS__ +// for struct definitions, use instead of closing `>` on the primary template parameter list +#define NBL_PRIMARY_REQUIRES(...) > requires (__VA_ARGS__) + +// to put right before the closing `>` of the primary template definition, otherwise `NBL_PARTIAL_REQUIRES` wont work on specializations +#define NBL_STRUCT_CONSTRAINABLE +// NOTE: C++20 requires and C++11 enable_if have to be in different places! ITS OF UTTMOST IMPORTANCE YOUR REQUIRE CLAUSES ARE IDENTICAL FOR BOTH MACROS +// put just after the closing `>` on the partial template specialization `template` declaration e.g. `template NBL_PARTIAL_REQ_TOP(SomeCond) +#define NBL_PARTIAL_REQ_TOP(...) requires (__VA_ARGS__) +// put just before closing `>` on the partial template specialization Type args, e.g. `MyStruct)> +#define NBL_PARTIAL_REQ_BOT(...) + +// condition +#define NBL_FUNC_REQUIRES_BEGIN(...) requires (__VA_ARGS__) +// return value +#define NBL_FUNC_REQUIRES_END(...) __VA_ARGS__ + #include namespace nbl @@ -77,12 +94,31 @@ concept matricial = is_matrix::value; #else +// TODO: old stuff, see how much we can remove // No C++20 support. Do nothing. #define NBL_CONCEPT_TYPE_PARAMS(...) #define NBL_CONCEPT_SIGNATURE(NAME, ...) #define NBL_CONCEPT_BODY(...) #define NBL_REQUIRES(...) + +// for struct definitions, use instead of closing `>` on the primary template parameter list +#define NBL_PRIMARY_REQUIRES(...) ,typename __requires=::nbl::hlsl::enable_if_t<(__VA_ARGS__),void> > + +// to put right before the closing `>` of the primary template definition, otherwise `NBL_PARTIAL_REQUIRES` wont work on specializations +#define NBL_STRUCT_CONSTRAINABLE ,typename __requires=void +// NOTE: C++20 requires and C++11 enable_if have to be in different places! ITS OF UTTMOST IMPORTANCE YOUR REQUIRE CLAUSES ARE IDENTICAL FOR BOTH MACROS +// put just after the closing `>` on the partial template specialization `template` declaration e.g. `template NBL_PARTIAL_REQ_TOP(SomeCond) +#define NBL_PARTIAL_REQ_TOP(...) +// put just before closing `>` on the partial template specialization Type args, e.g. `MyStruct)> +#define NBL_PARTIAL_REQ_BOT(...) ,std::enable_if_t<(__VA_ARGS__),void> + +// condition, use right after the closing `>` of a function template +#define NBL_FUNC_REQUIRES_BEGIN(...) 
::nbl::hlsl::enable_if_t<(__VA_ARGS__), +// return value, use `END(T)` instead of the return value type declaration +#define NBL_FUNC_REQUIRES_END(...) __VA_ARGS__> + #endif + #endif \ No newline at end of file From fb2f7c6cc9e380c761a8d52acd8099a37ef97f21 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 29 Oct 2024 15:30:31 +0100 Subject: [PATCH 109/358] forgot to amend the commit --- include/nbl/builtin/hlsl/concepts.hlsl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/nbl/builtin/hlsl/concepts.hlsl b/include/nbl/builtin/hlsl/concepts.hlsl index 91dc76970f..033709e726 100644 --- a/include/nbl/builtin/hlsl/concepts.hlsl +++ b/include/nbl/builtin/hlsl/concepts.hlsl @@ -19,6 +19,10 @@ #define NBL_CONCEPT_ASSIGN(NAME, ...) concept NAME = __VA_ARGS__; #define NBL_REQUIRES(...) requires __VA_ARGS__ + +// to define a concept using `concept Name = SomeContexprBoolCondition;` +#define NBL_BOOL_CONCEPT concept + // for struct definitions, use instead of closing `>` on the primary template parameter list #define NBL_PRIMARY_REQUIRES(...) > requires (__VA_ARGS__) @@ -102,6 +106,9 @@ concept matricial = is_matrix::value; #define NBL_REQUIRES(...) +// to define a concept using `concept Name = SomeContexprBoolCondition;` +#define NBL_BOOL_CONCEPT NBL_CONSTEXPR_STATIC_INLINE bool + // for struct definitions, use instead of closing `>` on the primary template parameter list #define NBL_PRIMARY_REQUIRES(...) ,typename __requires=::nbl::hlsl::enable_if_t<(__VA_ARGS__),void> > From 86e9a664a21c575eafa8406aba6e5d74c06a7585 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 29 Oct 2024 15:34:25 +0100 Subject: [PATCH 110/358] start reworking the descriptor declarations --- include/nbl/builtin/hlsl/binding_info.hlsl | 24 ++++ include/nbl/builtin/hlsl/blit/common.hlsl | 107 +++++++-------- .../builtin/hlsl/blit/default_blit.comp.hlsl | 57 ++++++++ .../hlsl/blit/default_normalize.comp.hlsl | 18 +++ include/nbl/builtin/hlsl/blit/temp.hlsl | 40 ------ include/nbl/video/utilities/CComputeBlit.h | 80 ----------- src/nbl/builtin/CMakeLists.txt | 5 +- src/nbl/video/utilities/CComputeBlit.cpp | 129 +++++++++++++++++- 8 files changed, 279 insertions(+), 181 deletions(-) create mode 100644 include/nbl/builtin/hlsl/binding_info.hlsl create mode 100644 include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl create mode 100644 include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl delete mode 100644 include/nbl/builtin/hlsl/blit/temp.hlsl diff --git a/include/nbl/builtin/hlsl/binding_info.hlsl b/include/nbl/builtin/hlsl/binding_info.hlsl new file mode 100644 index 0000000000..8702a32c3d --- /dev/null +++ b/include/nbl/builtin/hlsl/binding_info.hlsl @@ -0,0 +1,24 @@ +// Copyright (C) 2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_BINDING_INFO_INCLUDED_ +#define _NBL_BUILTIN_HLSL_BINDING_INFO_INCLUDED_ + +#include "nbl/builtin/hlsl/cpp_compat.hlsl" + +namespace nbl +{ +namespace hlsl +{ + +template +struct ConstevalBindingInfo +{ + NBL_CONSTEXPR_STATIC_INLINE uint32_t Set = set; + NBL_CONSTEXPR_STATIC_INLINE uint32_t Index = ix; + NBL_CONSTEXPR_STATIC_INLINE uint32_t Count = count; +}; + +} +} +#endif diff --git a/include/nbl/builtin/hlsl/blit/common.hlsl b/include/nbl/builtin/hlsl/blit/common.hlsl index 07bb3f9429..6295e68706 100644 --- a/include/nbl/builtin/hlsl/blit/common.hlsl +++ b/include/nbl/builtin/hlsl/blit/common.hlsl @@ -4,84 +4,77 @@ #ifndef _NBL_BUILTIN_HLSL_BLIT_COMMON_INCLUDED_ #define _NBL_BUILTIN_HLSL_BLIT_COMMON_INCLUDED_ -#include +#include namespace nbl { namespace hlsl { -namespace blit +namespace glsl { -namespace impl +uint32_t gl_WorkGroupSize() { + return uint32_t3(ConstevalParameters::WorkGroupSize,1,1); +} +} +} +} -template -struct dim_to_image_properties { }; +using namespace nbl::hlsl; + +[[vk::binding(ConstevalParameters::kernel_weight_binding_t::Index,ConstevalParameters::kernel_weight_binding_t::Set)]] +Buffer kernelWeights[ConstevalParameters::kernel_weight_binding_t::Count]; +[[vk::binding(ConstevalParameters::input_sampler_binding_t::Index,ConstevalParameters::input_sampler_binding_t::Set)]] +SamplerState inSamp[ConstevalParameters::input_sampler_binding_t::Count]; +// aliased +[[vk::binding(ConstevalParameters::input_image_binding_t::Index,ConstevalParameters::input_image_binding_t::Set)]] +Texture1DArray inAs1DArray[ConstevalParameters::input_image_binding_t::Count]; +[[vk::binding(ConstevalParameters::input_image_binding_t::Index,ConstevalParameters::input_image_binding_t::Set)]] +Texture2DArray inAs2DArray[ConstevalParameters::input_image_binding_t::Count]; +[[vk::binding(ConstevalParameters::input_image_binding_t::Index,ConstevalParameters::input_image_binding_t::Set)]] +Texture3D inAs3D[ConstevalParameters::input_image_binding_t::Count]; +// aliased +[[vk::binding(ConstevalParameters::output_binding_t::Index,ConstevalParameters::output_binding_t::Set)]] [[vk::image_format("unknown")]] +RWTexture1DArray outAs1DArray[ConstevalParameters::output_binding_t::Count]; +[[vk::binding(ConstevalParameters::output_binding_t::Index,ConstevalParameters::output_binding_t::Set)]] [[vk::image_format("unknown")]] +RWTexture2DArray outAs2DArray[ConstevalParameters::output_binding_t::Count]; +[[vk::binding(ConstevalParameters::output_binding_t::Index,ConstevalParameters::output_binding_t::Set)]] [[vk::image_format("unknown")]] +RWTexture3D outAs3D[ConstevalParameters::output_binding_t::Count]; -template <> -struct dim_to_image_properties<1> -{ - using combined_sampler_t = Texture1DArray; - using image_t = RWTexture1DArray; - template - static vector getIndexCoord(vector coords, uint32_t layer) +groupshared uint32_t sMem[ConstevalParameters::SharedMemoryDWORDs]; +/* +struct HistogramAccessor +{ + void atomicAdd(uint32_t wgID, uint32_t bucket, uint32_t v) { - return vector(coords.x, layer); + InterlockedAdd(statsBuff[wgID * (ConstevalParameters::AlphaBinCount + 1) + bucket], v); } }; - -template <> -struct dim_to_image_properties<2> +struct SharedAccessor { - using combined_sampler_t = Texture2DArray; - using image_t = RWTexture2DArray; - - template - static vector getIndexCoord(vector coords, uint32_t layer) + float32_t get(float32_t idx) { - return vector(coords.xy, layer); + return 
sMem[idx]; + } + void set(float32_t idx, float32_t val) + { + sMem[idx] = val; } }; - -template <> -struct dim_to_image_properties<3> +struct InCSAccessor { - using combined_sampler_t = Texture3D; - using image_t = RWTexture3D; - - template - static vector getIndexCoord(vector coords, uint32_t layer) + float32_t4 get(float32_t3 c, uint32_t l) { - return vector(coords); + return inCS.SampleLevel(inSamp, blit::impl::dim_to_image_properties::getIndexCoord(c, l), 0); } }; - -} - - -template< - uint32_t _WorkGroupSizeX, - uint32_t _WorkGroupSizeY, - uint32_t _WorkGroupSizeZ, - uint32_t _SMemFloatsPerChannel, - uint32_t _BlitOutChannelCount, - uint32_t _BlitDimCount, - uint32_t _AlphaBinCount> -struct consteval_parameters_t +struct OutImgAccessor { - NBL_CONSTEXPR_STATIC_INLINE uint32_t SMemFloatsPerChannel = _SMemFloatsPerChannel; - NBL_CONSTEXPR_STATIC_INLINE uint32_t BlitOutChannelCount = _BlitOutChannelCount; - NBL_CONSTEXPR_STATIC_INLINE uint32_t BlitDimCount = _BlitDimCount; - NBL_CONSTEXPR_STATIC_INLINE uint32_t AlphaBinCount = _AlphaBinCount; - NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkGroupSizeX = _WorkGroupSizeX; - NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkGroupSizeY = _WorkGroupSizeY; - NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkGroupSizeZ = _WorkGroupSizeZ; - NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkGroupSize = WorkGroupSizeX * WorkGroupSizeY * WorkGroupSizeZ; + void set(int32_t3 c, uint32_t l, float32_t4 v) + { + outImg[blit::impl::dim_to_image_properties::getIndexCoord(c, l)] = v; + } }; - -} -} -} - +*/ #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl b/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl new file mode 100644 index 0000000000..ad2749904c --- /dev/null +++ b/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl @@ -0,0 +1,57 @@ +// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +//#include "nbl/builtin/hlsl/blit/common.hlsl" +//#include "nbl/builtin/hlsl/blit/parameters.hlsl" +//#include "nbl/builtin/hlsl/blit/compute_blit.hlsl" + + +groupshared uint32_t sMem[ConstevalParameters::SharedMemoryDWORDs]; +/* +struct HistogramAccessor +{ + void atomicAdd(uint32_t wgID, uint32_t bucket, uint32_t v) + { + InterlockedAdd(statsBuff[wgID * (ConstevalParameters::AlphaBinCount + 1) + bucket], v); + } +}; +struct KernelWeightsAccessor +{ + float32_t4 get(uint32_t idx) + { + return kernelWeights[idx]; + } +}; +struct InCSAccessor +{ + float32_t4 get(float32_t3 c, uint32_t l) + { + return inCS.SampleLevel(inSamp, blit::impl::dim_to_image_properties::getIndexCoord(c, l), 0); + } +}; +struct OutImgAccessor +{ + void set(int32_t3 c, uint32_t l, float32_t4 v) + { + outImg[blit::impl::dim_to_image_properties::getIndexCoord(c, l)] = v; + } +}; +*/ + +using namespace nbl::hlsl::blit; + +// TODO: push constants + +[numthreads(ConstevalParameters::WorkGroupSize,1,1)] +void main() +{ +/* + blit::compute_blit_t blit = blit::compute_blit_t::create(params); + InCSAccessor inCSA; + OutImgAccessor outImgA; + KernelWeightsAccessor kwA; + HistogramAccessor hA; + SharedAccessor sA; + blit.execute(inCSA, outImgA, kwA, hA, sA, workGroupID, localInvocationIndex); +*/ +} \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl b/include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl new file mode 100644 index 0000000000..589f370c0e --- /dev/null +++ b/include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl @@ -0,0 +1,18 @@ +// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#include "nbl/builtin/hlsl/blit/common.hlsl" + + + +//#include "nbl/builtin/hlsl/blit/parameters.hlsl" +//#include "nbl/builtin/hlsl/blit/compute_blit.hlsl" + +using namespace nbl::hlsl::blit; + +// TODO: push constants + +[numthreads(ConstevalParameters::WorkGroupSize,1,1)] +void main() +{ +} \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/blit/temp.hlsl b/include/nbl/builtin/hlsl/blit/temp.hlsl deleted file mode 100644 index 4f8ced390a..0000000000 --- a/include/nbl/builtin/hlsl/blit/temp.hlsl +++ /dev/null @@ -1,40 +0,0 @@ -// TODO: Delete this file! -// This file is temporary file that defines all of the dependencies on PR #519 -// and should be deleted as soon as that's merged. -#ifndef _NBL_BUILTIN_HLSL_BLIT_TEMP_INCLUDED_ -#define _NBL_BUILTIN_HLSL_BLIT_TEMP_INCLUDED_ - - -namespace nbl -{ -namespace hlsl -{ - -namespace workgroup -{ - // This is slow naive scan but it doesn't matter as this file is going to - // be nuked. The interface is different than the one suggested in PR #519 - // because right now there's no easy hack-free way to access - // gl_localInvocationID globally. 
- template - T inclusive_scan(T value, NBL_REF_ARG(SharedAccessor) sharedAccessor, uint32_t localInvocationIndex) - { - Binop binop; - for (uint32_t i = 0; i < firstbithigh(WorkGroupSize); ++i) - { - sharedAccessor.main.set(localInvocationIndex, value); - GroupMemoryBarrierWithGroupSync(); - if (localInvocationIndex >= (1u << i)) - { - value = binop(sharedAccessor.main.get(localInvocationIndex - (1u << i)), value); - } - GroupMemoryBarrierWithGroupSync(); - } - return value; - } -} - -} -} - -#endif \ No newline at end of file diff --git a/include/nbl/video/utilities/CComputeBlit.h b/include/nbl/video/utilities/CComputeBlit.h index 779dc2ddc0..eae3f4bf0a 100644 --- a/include/nbl/video/utilities/CComputeBlit.h +++ b/include/nbl/video/utilities/CComputeBlit.h @@ -130,86 +130,6 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted return m_normalizationPipelines[key]; } - template - core::smart_refctd_ptr createBlitSpecializedShader( - const asset::E_FORMAT outFormat, - const asset::IImage::E_TYPE imageType, - const core::vectorSIMDu32& inExtent, - const core::vectorSIMDu32& outExtent, - const asset::IBlitUtilities::E_ALPHA_SEMANTIC alphaSemantic, - const typename BlitUtilities::convolution_kernels_t& kernels, - const uint32_t workgroupSize = 0, - const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount) - { - if (workgroupSize==0) - workgroupSize = m_device->getPhysicalDevice()->getLimits().maxWorkgroupSize; - - const auto workgroupDims = getDefaultWorkgroupDims(imageType); - const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount); - - const uint32_t outChannelCount = asset::getFormatChannelCount(outFormat); - const uint32_t smemFloatCount = m_availableSharedMemory / (sizeof(float) * outChannelCount); - const uint32_t blitDimCount = static_cast(imageType) + 1; - - const auto castedFormat = getOutImageViewFormat(outFormat); - assert(outFormat == castedFormat); - const char* formatQualifier = asset::CHLSLCompiler::getStorageImageFormatQualifier(castedFormat); - - std::ostringstream shaderSourceStream; - shaderSourceStream - << "#include \"nbl/builtin/hlsl/blit/common.hlsl\"\n" - "#include \"nbl/builtin/hlsl/blit/parameters.hlsl\"\n" - "#include \"nbl/builtin/hlsl/blit/compute_blit.hlsl\"\n"; - - shaderSourceStream - << "typedef nbl::hlsl::blit::consteval_parameters_t<" << workgroupSize << ", 1, 1, " << smemFloatCount << ", " - << outChannelCount << ", " << blitDimCount << ", " << paddedAlphaBinCount << "> ceval_params_t;\n"; - - shaderSourceStream - << "[[vk::combinedImageSampler]] [[vk::binding(0, 0)]]\n" - "nbl::hlsl::blit::impl::dim_to_image_properties::combined_sampler_t inCS;\n" - "[[vk::combinedImageSampler]] [[vk::binding(0, 0)]]\n" - "SamplerState inSamp;\n" - - "[[vk::image_format(\""<< formatQualifier << "\")]]\n" - "[[vk::binding(1, 0)]]\n" - "nbl::hlsl::blit::impl::dim_to_image_properties::image_t outImg;\n" - - "[[vk::binding(0, 1)]] Buffer kernelWeights;\n" - "[[vk::push_constant]] nbl::hlsl::blit::parameters_t params;" - "groupshared float32_t sMem[" << m_availableSharedMemory / sizeof(float) << "];\n"; - - if (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) - { - shaderSourceStream - << "[[vk::binding(2 , 0)]] RWStructuredBuffer statsBuff;\n" - "struct HistogramAccessor { void atomicAdd(uint32_t wgID, uint32_t bucket, uint32_t v) { InterlockedAdd(statsBuff[wgID * (ceval_params_t::AlphaBinCount + 1) + bucket], v); } };\n"; - } - else - { - shaderSourceStream << "struct HistogramAccessor { void 
atomicAdd(uint32_t wgID, uint32_t bucket, uint32_t v) { } };\n"; - } - - shaderSourceStream - << "struct KernelWeightsAccessor { float32_t4 get(float32_t idx) { return kernelWeights[idx]; } };\n" - "struct SharedAccessor { float32_t get(float32_t idx) { return sMem[idx]; } void set(float32_t idx, float32_t val) { sMem[idx] = val; } };\n" - "struct InCSAccessor { float32_t4 get(float32_t3 c, uint32_t l) { return inCS.SampleLevel(inSamp, nbl::hlsl::blit::impl::dim_to_image_properties::getIndexCoord(c, l), 0); } };\n" - "struct OutImgAccessor { void set(int32_t3 c, uint32_t l, float32_t4 v) { outImg[nbl::hlsl::blit::impl::dim_to_image_properties::getIndexCoord(c, l)] = v; } };\n" - - "[numthreads(ceval_params_t::WorkGroupSize, 1, 1)]\n" - "void main(uint32_t3 workGroupID : SV_GroupID, uint32_t localInvocationIndex : SV_GroupIndex)\n" - "{\n" - " nbl::hlsl::blit::compute_blit_t blit = nbl::hlsl::blit::compute_blit_t::create(params);\n" - " InCSAccessor inCSA; OutImgAccessor outImgA; KernelWeightsAccessor kwA; HistogramAccessor hA; SharedAccessor sA;\n" - " blit.execute(inCSA, outImgA, kwA, hA, sA, workGroupID, localInvocationIndex);\n" - "}\n"; - - auto cpuShader = core::make_smart_refctd_ptr(shaderSourceStream.str().c_str(), IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, IGPUShader::E_SHADER_STAGE::E_CONTENT_TYPE::ECT_HLSL, "CComputeBlit::createBlitSpecializedShader"); - auto gpuShader = m_device->createShader(std::move(cpuShader.get())); - - return gpuShader; - } - template core::smart_refctd_ptr getBlitPipeline( const asset::E_FORMAT outFormat, diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index 4705ca4426..b3ec566beb 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -263,7 +263,8 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/member_test_macros.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/blit/alpha_test.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/blit/compute_blit.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/blit/common.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/blit/temp.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/blit/default_blit.comp.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/blit/default_normalize.comp.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/blit/normalization.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/blit/parameters.hlsl") #device capability @@ -337,5 +338,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/memory.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/memory_accessor.hlsl") #enums LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/enums.hlsl") +# +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/binding_info.hlsl") ADD_CUSTOM_BUILTIN_RESOURCES(nblBuiltinResourceData NBL_RESOURCES_TO_EMBED "${NBL_ROOT_PATH}/include" "nbl/builtin" "nbl::builtin" "${NBL_ROOT_PATH_BINARY}/include" "${NBL_ROOT_PATH_BINARY}/src" "STATIC" "INTERNAL") diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp index 5856f42fc5..1ceb1ee415 100644 --- a/src/nbl/video/utilities/CComputeBlit.cpp +++ b/src/nbl/video/utilities/CComputeBlit.cpp @@ -1,4 +1,5 @@ #include "nbl/video/utilities/CComputeBlit.h" +#include "nbl/builtin/hlsl/binding_info.hlsl" using namespace nbl::core; using namespace nbl::system; @@ -47,10 +48,55 @@ void CComputeBlit::createAndCachePipelines(CAssetConverter* converter, smart_ref // also slightly more memory is needed 
task.sharedMemoryPerInvocation = core::max(singlePixelStorage*2,task.sharedMemoryPerInvocation); } + const auto common = [&]()->std::string + { + // TODO: introduce a common type between ImGUI and Blit for the descriptor infos + auto serializeBindingInfo = [](const hlsl::SBindingInfo& info={})->std::string + { + return "ConstevalBindingInfo<"+std::to_string(info.Set)+","+std::to_string(info.Set)+","+std::to_string(info.Count)+">"; + }; + + std::ostringstream tmp; + tmp << R"===( +#include "nbl/builtin/hlsl/binding_info.hlsl" + + +using namespace nbl::hlsl; + + +struct ConstevalParameters +{ + NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkGroupSize = )===" << (0x1u<smart_refctd_ptr + { + auto shader = make_smart_refctd_ptr( + (common+"\n#include \""+mainPath+"\"\n").c_str(), + IShader::E_SHADER_STAGE::ESS_COMPUTE, + IShader::E_CONTENT_TYPE::ECT_HLSL, + mainPath + ); + + ICPUComputePipeline::SCreationParams params = {}; + params.layout = nullptr; // TODO + params.shader.entryPoint = "main"; + params.shader.shader = shader.get(); + params.shader.requiredSubgroupSize = static_cast(hlsl::findMSB(limits.maxSubgroupSize)); + // needed for the prefix and reductions to work + params.shader.requireFullSubgroups = true; + return ICPUComputePipeline::create(params); + }; // create blit pipeline - cpuPplns.emplace_back(nullptr); - // create optional coverage normalization pipeline - cpuPplns.emplace_back(nullptr); + cpuPplns.emplace_back(createPipeline("default_blit.comp.hlsl")); + cpuPplns.emplace_back(createPipeline("default_normalize.comp.hlsl")); } CAssetConverter::SInputs inputs = {}; @@ -79,6 +125,83 @@ void CComputeBlit::createAndCachePipelines(CAssetConverter* converter, smart_ref } #if 0 + +template +core::smart_refctd_ptr createBlitSpecializedShader( + const asset::IImage::E_TYPE imageType, + const core::vectorSIMDu32& inExtent, + const core::vectorSIMDu32& outExtent, + const asset::IBlitUtilities::E_ALPHA_SEMANTIC alphaSemantic, + const typename BlitUtilities::convolution_kernels_t& kernels, + const uint32_t workgroupSize = 0, + const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount) +{ + if (workgroupSize==0) + workgroupSize = m_device->getPhysicalDevice()->getLimits().maxWorkgroupSize; + + const auto workgroupDims = getDefaultWorkgroupDims(imageType); + const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount); + + const uint32_t outChannelCount = asset::getFormatChannelCount(outFormat); + const uint32_t smemFloatCount = m_availableSharedMemory / (sizeof(float) * outChannelCount); + const uint32_t blitDimCount = static_cast(imageType) + 1; + + + std::ostringstream shaderSourceStream; + shaderSourceStream + << "#include \"nbl/builtin/hlsl/blit/common.hlsl\"\n" + "#include \"nbl/builtin/hlsl/blit/parameters.hlsl\"\n" + "#include \"nbl/builtin/hlsl/blit/compute_blit.hlsl\"\n"; + + shaderSourceStream + << "typedef nbl::hlsl::blit::consteval_parameters_t<" << workgroupSize << ", 1, 1, " << smemFloatCount << ", " + << outChannelCount << ", " << blitDimCount << ", " << paddedAlphaBinCount << "> ceval_params_t;\n"; + + shaderSourceStream + << "[[vk::combinedImageSampler]] [[vk::binding(0, 0)]]\n" + "nbl::hlsl::blit::impl::dim_to_image_properties::combined_sampler_t inCS;\n" + "[[vk::combinedImageSampler]] [[vk::binding(0, 0)]]\n" + "SamplerState inSamp;\n" + + "[[vk::image_format(\""<< formatQualifier << "\")]]\n" + "[[vk::binding(1, 0)]]\n" + "nbl::hlsl::blit::impl::dim_to_image_properties::image_t outImg;\n" + + "[[vk::binding(0, 1)]] Buffer 
kernelWeights;\n" + "[[vk::push_constant]] nbl::hlsl::blit::parameters_t params;" + "groupshared float32_t sMem[" << m_availableSharedMemory / sizeof(float) << "];\n"; + + if (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) + { + shaderSourceStream + << "[[vk::binding(2 , 0)]] RWStructuredBuffer statsBuff;\n" + "struct HistogramAccessor { void atomicAdd(uint32_t wgID, uint32_t bucket, uint32_t v) { InterlockedAdd(statsBuff[wgID * (ceval_params_t::AlphaBinCount + 1) + bucket], v); } };\n"; + } + else + { + shaderSourceStream << "struct HistogramAccessor { void atomicAdd(uint32_t wgID, uint32_t bucket, uint32_t v) { } };\n"; + } + + shaderSourceStream + << "struct KernelWeightsAccessor { float32_t4 get(float32_t idx) { return kernelWeights[idx]; } };\n" + "struct SharedAccessor { float32_t get(float32_t idx) { return sMem[idx]; } void set(float32_t idx, float32_t val) { sMem[idx] = val; } };\n" + "struct InCSAccessor { float32_t4 get(float32_t3 c, uint32_t l) { return inCS.SampleLevel(inSamp, nbl::hlsl::blit::impl::dim_to_image_properties::getIndexCoord(c, l), 0); } };\n" + "struct OutImgAccessor { void set(int32_t3 c, uint32_t l, float32_t4 v) { outImg[nbl::hlsl::blit::impl::dim_to_image_properties::getIndexCoord(c, l)] = v; } };\n" + + "[numthreads(ceval_params_t::WorkGroupSize, 1, 1)]\n" + "void main(uint32_t3 workGroupID : SV_GroupID, uint32_t localInvocationIndex : SV_GroupIndex)\n" + "{\n" + " nbl::hlsl::blit::compute_blit_t blit = nbl::hlsl::blit::compute_blit_t::create(params);\n" + " InCSAccessor inCSA; OutImgAccessor outImgA; KernelWeightsAccessor kwA; HistogramAccessor hA; SharedAccessor sA;\n" + " blit.execute(inCSA, outImgA, kwA, hA, sA, workGroupID, localInvocationIndex);\n" + "}\n"; + + auto cpuShader = core::make_smart_refctd_ptr(shaderSourceStream.str().c_str(), IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, IGPUShader::E_SHADER_STAGE::E_CONTENT_TYPE::ECT_HLSL, "CComputeBlit::createBlitSpecializedShader"); + auto gpuShader = m_device->createShader(std::move(cpuShader.get())); + + return gpuShader; +} + core::smart_refctd_ptr CComputeBlit::createAlphaTestSpecializedShader(const asset::IImage::E_TYPE imageType, const uint32_t alphaBinCount) { const auto workgroupDims = getDefaultWorkgroupDims(imageType); From fda286ad51311fcbe7430b6b460090727a93bdf2 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Tue, 29 Oct 2024 10:04:39 -0700 Subject: [PATCH 111/358] Saving work --- examples_tests | 2 +- include/ICameraSceneNode.h | 8 +- include/nbl/asset/ICPUMeshBuffer.h | 8 +- include/nbl/asset/IMeshBuffer.h | 4 +- include/nbl/asset/ISkeleton.h | 2 +- .../IRenderpassIndependentPipelineMetadata.h | 30 +- include/nbl/asset/utils/IMeshManipulator.h | 6 +- .../nbl/builtin/glsl/math/quaternions.glsl | 5 - .../nbl/builtin/hlsl/cpp_compat/intrinsics.h | 2 - .../nbl/builtin/hlsl/cpp_compat/matrix.hlsl | 1 - .../nbl/builtin/hlsl/cpp_compat/vector.hlsl | 3 +- .../transformation_matrix_utils.hlsl | 83 ++-- include/nbl/core/math/floatutil.h | 6 +- include/nbl/core/math/floatutil.tcc | 1 - include/nbl/core/math/glslFunctions.h | 9 +- include/nbl/core/math/glslFunctions.tcc | 1 - include/nbl/core/math/intutil.h | 2 +- include/nbl/core/math/plane3dSIMD.h | 2 + .../nbl/ext/MitsubaLoader/CMitsubaLoader.h | 10 +- include/nbl/ext/MitsubaLoader/SContext.h | 4 +- include/nbl/video/IGPUAccelerationStructure.h | 2 +- include/quaternion.h | 463 ------------------ include/vectorSIMD.h | 18 + src/nbl/asset/IAssetManager.cpp | 2 +- src/nbl/asset/utils/CGeometryCreator.cpp | 5 +- 
src/nbl/asset/utils/CMeshManipulator.cpp | 17 +- 26 files changed, 125 insertions(+), 571 deletions(-) delete mode 100644 include/quaternion.h diff --git a/examples_tests b/examples_tests index d1c0e9f7e6..6cbd85caa3 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit d1c0e9f7e6bb05dbacd2318da86909d3a93e08f3 +Subproject commit 6cbd85caa3d646969d7724ca7f07b0b6a72e0957 diff --git a/include/ICameraSceneNode.h b/include/ICameraSceneNode.h index e3975e3802..014c063bf2 100644 --- a/include/ICameraSceneNode.h +++ b/include/ICameraSceneNode.h @@ -46,17 +46,17 @@ class ICameraSceneNode : public ISceneNode The function will figure it out if you've set an orthogonal matrix. \param projection The new projection matrix of the camera. */ - virtual void setProjectionMatrix(const core::matrix4SIMD& projection) =0; + virtual void setProjectionMatrix(const hlsl::float32_t4x4& projection) =0; //! Gets the current projection matrix of the camera. /** \return The current projection matrix of the camera. */ - inline const core::matrix4SIMD& getProjectionMatrix() const { return projMatrix; } + inline const hlsl::float32_t4x4& getProjectionMatrix() const { return projMatrix; } //! Gets the current view matrix of the camera. /** \return The current view matrix of the camera. */ - virtual const core::matrix3x4SIMD& getViewMatrix() const =0; + virtual const hlsl::float32_t3x4& getViewMatrix() const =0; - virtual const core::matrix4SIMD& getConcatenatedMatrix() const =0; + virtual const hlsl::float32_t4x4& getConcatenatedMatrix() const =0; #if 0 //! It is possible to send mouse and key events to the camera. /** Most cameras may ignore this input, but camera scene nodes diff --git a/include/nbl/asset/ICPUMeshBuffer.h b/include/nbl/asset/ICPUMeshBuffer.h index 4ecc0f35ba..edfae1be39 100644 --- a/include/nbl/asset/ICPUMeshBuffer.h +++ b/include/nbl/asset/ICPUMeshBuffer.h @@ -582,18 +582,18 @@ class ICPUMeshBuffer final : public IMeshBuffer(m_inverseBindPoseBufferBinding.buffer->getPointer()); - return reinterpret_cast(ptr+m_inverseBindPoseBufferBinding.offset); + return reinterpret_cast(ptr+m_inverseBindPoseBufferBinding.offset); } - inline core::matrix3x4SIMD* getInverseBindPoses() + inline hlsl::float32_t3x4* getInverseBindPoses() { assert(isMutable()); - return const_cast(const_cast(this)->getInverseBindPoses()); + return const_cast(const_cast(this)->getInverseBindPoses()); } //! diff --git a/include/nbl/asset/IMeshBuffer.h b/include/nbl/asset/IMeshBuffer.h index abf5d156dd..cbcfa9c8f6 100644 --- a/include/nbl/asset/IMeshBuffer.h +++ b/include/nbl/asset/IMeshBuffer.h @@ -211,7 +211,7 @@ class IMeshBuffer : public virtual core::IReferenceCounted virtual inline bool isSkinned() const { return jointCount>0u && maxJointsPerVx>0u && m_inverseBindPoseBufferBinding.buffer && - m_inverseBindPoseBufferBinding.offset+jointCount*sizeof(core::matrix3x4SIMD)<=m_inverseBindPoseBufferBinding.buffer->getSize(); + m_inverseBindPoseBufferBinding.offset+jointCount*sizeof(hlsl::float32_t3x4)<=m_inverseBindPoseBufferBinding.buffer->getSize(); } //! 
@@ -228,7 +228,7 @@ class IMeshBuffer : public virtual core::IReferenceCounted if (_maxJointsPerVx==0u || _maxJointsPerVx>4u) return false; - if (_inverseBindPoseBufferBinding.offset+_jointCount*sizeof(core::matrix3x4SIMD)>_inverseBindPoseBufferBinding.buffer->getSize()) + if (_inverseBindPoseBufferBinding.offset+_jointCount*sizeof(hlsl::float32_t3x4)>_inverseBindPoseBufferBinding.buffer->getSize()) return false; m_inverseBindPoseBufferBinding = std::move(_inverseBindPoseBufferBinding); diff --git a/include/nbl/asset/ISkeleton.h b/include/nbl/asset/ISkeleton.h index 7960ca4eef..03ba3af4ea 100644 --- a/include/nbl/asset/ISkeleton.h +++ b/include/nbl/asset/ISkeleton.h @@ -62,7 +62,7 @@ class ISkeleton : public virtual core::IReferenceCounted return; assert(m_parentJointIDs.buffer->getSize()>=m_parentJointIDs.offset+sizeof(joint_id_t)*m_jointCount); - assert(m_defaultTransforms.buffer->getSize()>=m_defaultTransforms.offset+sizeof(core::matrix3x4SIMD)*m_jointCount); + assert(m_defaultTransforms.buffer->getSize()>=m_defaultTransforms.offset+sizeof(hlsl::float32_t3x4)*m_jointCount); } virtual ~ISkeleton() { diff --git a/include/nbl/asset/metadata/IRenderpassIndependentPipelineMetadata.h b/include/nbl/asset/metadata/IRenderpassIndependentPipelineMetadata.h index 416c04823b..7d0b63a141 100644 --- a/include/nbl/asset/metadata/IRenderpassIndependentPipelineMetadata.h +++ b/include/nbl/asset/metadata/IRenderpassIndependentPipelineMetadata.h @@ -140,35 +140,35 @@ class IRenderpassIndependentPipelineMetadata : public core::Interface //! A non exhaustive list of commonly used shader input semantics enum E_COMMON_SHADER_INPUT { - //! core::matrix4SIMD giving the total projection onto the screen from model-space coordinates + //! hlsl::float32_t4x4 giving the total projection onto the screen from model-space coordinates ECSI_WORLD_VIEW_PROJ, - //! core::matrix4SIMD giving the mapping from view-space into the pre-divide NDC space + //! hlsl::float32_t4x4 giving the mapping from view-space into the pre-divide NDC space ECSI_PROJ, - //! core::matrix3x4SIMD giving the view-space transformation from model-space coordinates + //! hlsl::float32_t3x4 giving the view-space transformation from model-space coordinates ECSI_WORLD_VIEW, - //! core::matrix3x4SIMD giving the view-space transformation from world-space + //! hlsl::float32_t3x4 giving the view-space transformation from world-space ECSI_VIEW, - //! core::matrix3x4SIMD giving the world-space transformation from model-space (last column is object world-space-position) + //! hlsl::float32_t3x4 giving the world-space transformation from model-space (last column is object world-space-position) ECSI_WORLD, - //! core::matrix4SIMD giving the total projection to model-space coordinates from screen-space + //! hlsl::float32_t4x4 giving the total projection to model-space coordinates from screen-space ECSI_WORLD_VIEW_PROJ_INVERSE, - //! core::matrix4SIMD giving the mapping from the pre-divide NDC space into view-space + //! hlsl::float32_t4x4 giving the mapping from the pre-divide NDC space into view-space ECSI_PROJ_INVERSE, - //! core::matrix3x4SIMD giving the model-space transformation from view-space coordinates + //! hlsl::float32_t3x4 giving the model-space transformation from view-space coordinates ECSI_WORLD_VIEW_INVERSE, - //! core::matrix3x4SIMD giving the world-space transformation from view-space (last column is camera world-space-position) + //! 
hlsl::float32_t3x4 giving the world-space transformation from view-space (last column is camera world-space-position) ECSI_VIEW_INVERSE, - //! core::matrix3x4SIMD giving the model-space transformation from world-space + //! hlsl::float32_t3x4 giving the model-space transformation from world-space ECSI_WORLD_INVERSE, - //! transpose of core::matrix4SIMD giving the total projection to model-space coordinates from screen-space + //! transpose of hlsl::float32_t4x4 giving the total projection to model-space coordinates from screen-space ECSI_WORLD_VIEW_PROJ_INVERSE_TRANSPOSE, - //! transpose of core::matrix4SIMD giving the mapping from the pre-divide NDC space into view-space + //! transpose of hlsl::float32_t4x4 giving the mapping from the pre-divide NDC space into view-space ECSI_PROJ_INVERSE_TRANSPOSE, - //! transpose of core::matrix3x4SIMD giving the model-space transformation from view-space coordinates (upper 3x3 matrix can be used instead of `gl_NormalMatrix`) + //! transpose of hlsl::float32_t3x4 giving the model-space transformation from view-space coordinates (upper 3x3 matrix can be used instead of `gl_NormalMatrix`) ECSI_WORLD_VIEW_INVERSE_TRANSPOSE, - //! transpose of core::matrix3x4SIMD giving the world-space transformation from view-space (last row is camera world-space-position) + //! transpose of hlsl::float32_t3x4 giving the world-space transformation from view-space (last row is camera world-space-position) ECSI_VIEW_INVERSE_TRANSPOSE, - //! transpose of core::matrix3x4SIMD giving the model-space transformation from world-space (upper 3x3 matrix can transform model space normals to world space) + //! transpose of hlsl::float32_t3x4 giving the model-space transformation from world-space (upper 3x3 matrix can transform model space normals to world space) ECSI_WORLD_INVERSE_TRANSPOSE, //! 
a simple non-filtered environment map as a cubemap diff --git a/include/nbl/asset/utils/IMeshManipulator.h b/include/nbl/asset/utils/IMeshManipulator.h index 27569e4f60..0a5bf12313 100644 --- a/include/nbl/asset/utils/IMeshManipulator.h +++ b/include/nbl/asset/utils/IMeshManipulator.h @@ -18,6 +18,9 @@ #include "nbl/asset/utils/CQuantNormalCache.h" #include "nbl/asset/utils/CQuantQuaternionCache.h" +#include +#include + namespace nbl { namespace asset @@ -408,8 +411,7 @@ class NBL_API2 IMeshManipulator : public virtual core::IReferenceCounted if (jointIDFLT_MIN) { - core::vectorSIMDf boneSpacePos; - inverseBindPoses[jointID].transformVect(boneSpacePos,pos); + core::vectorSIMDf boneSpacePos = hlsl::transformVector(hlsl::getMatrix3x4As4x4(inverseBindPoses[jointID]), pos); jointAABBs[jointID].addInternalPoint(boneSpacePos.getAsVector3df()); noJointInfluence = false; } diff --git a/include/nbl/builtin/glsl/math/quaternions.glsl b/include/nbl/builtin/glsl/math/quaternions.glsl index 7dc6ca0279..d94d48ecc9 100644 --- a/include/nbl/builtin/glsl/math/quaternions.glsl +++ b/include/nbl/builtin/glsl/math/quaternions.glsl @@ -1,18 +1,13 @@ #ifndef _NBL_BUILTIN_GLSL_MATH_QUATERNIONS_INCLUDED_ #define _NBL_BUILTIN_GLSL_MATH_QUATERNIONS_INCLUDED_ - - #include - - struct nbl_glsl_quaternion_t { vec4 data; }; - nbl_glsl_quaternion_t nbl_glsl_quaternion_t_constructFromTruncated(in vec3 first3Components) { nbl_glsl_quaternion_t quat; diff --git a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h index a8f745fbae..4d10a669c3 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h +++ b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h @@ -1,10 +1,8 @@ #ifndef _NBL_BUILTIN_HLSL_CPP_COMPAT_INTRINSICS_INCLUDED_ #define _NBL_BUILTIN_HLSL_CPP_COMPAT_INTRINSICS_INCLUDED_ - #include #include -#include // this is a C++ only header, hence the `.h` extension, it only implements HLSL's built-in functions #ifndef __HLSL_VERSION diff --git a/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl b/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl index 455f8ef0c7..b1d33f097b 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl +++ b/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl @@ -2,7 +2,6 @@ #define _NBL_BUILTIN_HLSL_CPP_COMPAT_MATRIX_INCLUDED_ #include -#include namespace nbl { diff --git a/include/nbl/builtin/hlsl/cpp_compat/vector.hlsl b/include/nbl/builtin/hlsl/cpp_compat/vector.hlsl index 0407cf95b6..c83b3893a6 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/vector.hlsl +++ b/include/nbl/builtin/hlsl/cpp_compat/vector.hlsl @@ -2,7 +2,7 @@ #define _NBL_BUILTIN_HLSL_CPP_COMPAT_VECTOR_INCLUDED_ // stuff for C++ -#ifndef __HLSL_VERSION +#ifndef __HLSL_VERSION #include #include @@ -90,5 +90,6 @@ struct blake3_hasher::update_impl,Dummy> }; } #endif + } #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl index cb3391c16f..8f687fb97a 100644 --- a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl +++ b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl @@ -1,5 +1,5 @@ -#ifndef _NBL_BUILTIN_HLSL_TRANSFORMATION_MATRIX_UTILS_INCLUDED_ -#define _NBL_BUILTIN_HLSL_TRANSFORMATION_MATRIX_UTILS_INCLUDED_ +#ifndef _NBL_BUILTIN_HLSL_MATRIX_UTILS_TRANSFORMATION_MATRIX_UTILS_INCLUDED_ +#define _NBL_BUILTIN_HLSL_MATRIX_UTILS_TRANSFORMATION_MATRIX_UTILS_INCLUDED_ #include #include @@ -16,7 +16,7 @@ 
NBL_CONSTEXPR hlsl::float32_t3x4 IdentityFloat32_t3x4 = hlsl::float32_t3x4(hlsl: // TODO: this is temporary function, delete when removing vectorSIMD template -core::vectorSIMDf transformVector(const matrix& mat, const core::vectorSIMDf& vec) +inline core::vectorSIMDf transformVector(const matrix& mat, const core::vectorSIMDf& vec) { core::vectorSIMDf output; float32_t4 tmp; @@ -24,14 +24,13 @@ core::vectorSIMDf transformVector(const matrix& mat, const core::vector tmp[i] = output[i]; for (int i = 0; i < 4; ++i) - output[i] = dot(mat[i], tmp[i]); + output[i] = hlsl::dot(mat[i], tmp); return output; } -// TODO: another idea is to create `getTransformationMatrixAs4x4` function, which will add 4th row to a 3x4 matrix and do nothing to 4x4 matrix, this way we will not have to deal with partial specialization later template -matrix getMatrix3x4As4x4(matrix mat) +inline matrix getMatrix3x4As4x4(matrix mat) { matrix output; for (int i = 0; i < 3; ++i) @@ -41,6 +40,16 @@ matrix getMatrix3x4As4x4(matrix mat) return output; } +template +inline matrix getAs64BitPrecisionMatrix(matrix mat) +{ + matrix output; + for (int i = 0; i < N; ++i) + output[i] = mat[i]; + + return output; +} + // TODO: use portable_float when merged //! multiplies matrices a and b, 3x4 matrices are treated as 4x4 matrices with 4th row set to (0, 0, 0 ,1) template @@ -62,26 +71,27 @@ inline matrix buildCameraLookAtMatrixLH( const vector yaxis = cross(zaxis, xaxis); matrix r; - r[0] = vector(xaxis, -dot(xaxis, position)); - r[1] = vector(yaxis, -dot(yaxis, position)); - r[2] = vector(zaxis, -dot(zaxis, position)); + r[0] = vector(xaxis, -dot(xaxis, position)); + r[1] = vector(yaxis, -dot(yaxis, position)); + r[2] = vector(zaxis, -dot(zaxis, position)); return r; } -float32_t3x4 buildCameraLookAtMatrixRH( - const float32_t3& position, - const float32_t3& target, - const float32_t3& upVector) +template +inline matrix buildCameraLookAtMatrixRH( + const vector& position, + const vector& target, + const vector& upVector) { - const float32_t3 zaxis = normalize(position - target); - const float32_t3 xaxis = normalize(cross(upVector, zaxis)); - const float32_t3 yaxis = cross(zaxis, xaxis); + const vector zaxis = normalize(position - target); + const vector xaxis = normalize(cross(upVector, zaxis)); + const vector yaxis = cross(zaxis, xaxis); float32_t3x4 r; - r[0] = float32_t4(xaxis, -dot(xaxis, position)); - r[1] = float32_t4(yaxis, -dot(yaxis, position)); - r[2] = float32_t4(zaxis, -dot(zaxis, position)); + r[0] = vector(xaxis, -dot(xaxis, position)); + r[1] = vector(yaxis, -dot(yaxis, position)); + r[2] = vector(zaxis, -dot(zaxis, position)); return r; } @@ -90,6 +100,14 @@ float32_t3x4 buildCameraLookAtMatrixRH( // TODO: move quaternion to nbl::hlsl // TODO: why NBL_REF_ARG(MatType) doesn't work????? +template +inline void setScale(matrix& outMat, NBL_CONST_REF_ARG(vector) scale) +{ + outMat[0][0] = scale[0]; + outMat[1][1] = scale[1]; + outMat[2][2] = scale[2]; +} + //! 
Replaces curent rocation and scale by rotation represented by quaternion `quat`, leaves 4th row and 4th colum unchanged template inline void setRotation(matrix& outMat, NBL_CONST_REF_ARG(nbl::hlsl::quaternion) quat) @@ -97,23 +115,23 @@ inline void setRotation(matrix& outMat, NBL_CONST_REF_ARG(nbl::hlsl::qu static_assert(N == 3 || N == 4); outMat[0] = vector( - 1 - 2 * (quat.y * quat.y + quat.z * quat.z), - 2 * (quat.x * quat.y - quat.z * quat.w), - 2 * (quat.x * quat.z + quat.y * quat.w), + 1 - 2 * (quat.data.y * quat.data.y + quat.data.z * quat.data.z), + 2 * (quat.data.x * quat.data.y - quat.data.z * quat.data.w), + 2 * (quat.data.x * quat.data.z + quat.data.y * quat.data.w), outMat[0][3] ); outMat[1] = vector( - 2 * (quat.x * quat.y + quat.z * quat.w), - 1 - 2 * (quat.x * quat.x + quat.z * quat.z), - 2 * (quat.y * quat.z - quat.x * quat.w), + 2 * (quat.data.x * quat.data.y + quat.data.z * quat.data.w), + 1 - 2 * (quat.data.x * quat.data.x + quat.data.z * quat.data.z), + 2 * (quat.data.y * quat.data.z - quat.data.x * quat.data.w), outMat[1][3] ); outMat[2] = vector( - 2 * (quat.x * quat.z - quat.y * quat.w), - 2 * (quat.y * quat.z + quat.x * quat.w), - 1 - 2 * (quat.x * quat.x + quat.y * quat.y), + 2 * (quat.data.x * quat.data.z - quat.data.y * quat.data.w), + 2 * (quat.data.y * quat.data.z + quat.data.x * quat.data.w), + 1 - 2 * (quat.data.x * quat.data.x + quat.data.y * quat.data.y), outMat[2][3] ); } @@ -128,15 +146,6 @@ inline void setTranslation(matrix& outMat, NBL_CONST_REF_ARG(vector -inline void setTranslation_tmp(matrix& outMat, NBL_CONST_REF_ARG(core::vectorSIMDf) translation) -{ - outMat[0].w = translation.x; - outMat[1].w = translation.y; - outMat[2].w = translation.z; -} - } } diff --git a/include/nbl/core/math/floatutil.h b/include/nbl/core/math/floatutil.h index b37b2cc463..9f310ba147 100644 --- a/include/nbl/core/math/floatutil.h +++ b/include/nbl/core/math/floatutil.h @@ -27,8 +27,6 @@ namespace nbl::core { class vectorSIMDf; -class matrix3x4SIMD; -class matrix4SIMD; //! Rounding error constant often used when comparing values. 
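The new matrix utilities above are free functions, so a typical caller builds each factor separately and multiplies them with concatenateBFollowedByA. A rough usage sketch, illustrative only: it assumes these helpers live in nbl::hlsl as the hunk suggests, and takes the quaternion as an opaque parameter since its exact API sits outside this hunk.

#include "nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl"

// Quat stands in for the engine's float quaternion type (assumption)
template<typename Quat>
nbl::hlsl::float32_t3x4 composeTRS(
    const nbl::hlsl::float32_t3& translation,
    const Quat& rotation,
    const nbl::hlsl::float32_t3& scale)
{
    using namespace nbl::hlsl;
    float32_t3x4 T = IdentityFloat32_t3x4;
    float32_t3x4 R = IdentityFloat32_t3x4;
    float32_t3x4 S = IdentityFloat32_t3x4;
    setTranslation(T, translation);
    setRotation(R, rotation); // overwrites the upper 3x3, leaves the 4th column alone
    setScale(S, scale);
    // 3x4 matrices multiply as if they carried an implicit (0,0,0,1) bottom row,
    // so this evaluates translate * rotate * scale
    return concatenateBFollowedByA(T, concatenateBFollowedByA(R, S));
}

// the camera helpers return the 3x4 view matrix directly
nbl::hlsl::float32_t3x4 lookAtLH(
    const nbl::hlsl::float32_t3& position,
    const nbl::hlsl::float32_t3& target,
    const nbl::hlsl::float32_t3& up)
{
    return nbl::hlsl::buildCameraLookAtMatrixLH<float>(position, target, up);
}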
(TODO: remove) template @@ -40,9 +38,9 @@ NBL_FORCE_INLINE double ROUNDING_ERROR(); template<> NBL_FORCE_INLINE vectorSIMDf ROUNDING_ERROR(); template<> -NBL_FORCE_INLINE matrix3x4SIMD ROUNDING_ERROR(); +NBL_FORCE_INLINE hlsl::float32_t3x4 ROUNDING_ERROR(); template<> -NBL_FORCE_INLINE matrix4SIMD ROUNDING_ERROR(); +NBL_FORCE_INLINE hlsl::float32_t4x4 ROUNDING_ERROR(); #ifdef PI // make sure we don't collide with a define #undef PI diff --git a/include/nbl/core/math/floatutil.tcc b/include/nbl/core/math/floatutil.tcc index 86e42c7e73..e2b43a25bb 100644 --- a/include/nbl/core/math/floatutil.tcc +++ b/include/nbl/core/math/floatutil.tcc @@ -5,7 +5,6 @@ #ifndef __NBL_CORE_FLOAT_UTIL_TCC_INCLUDED__ #define __NBL_CORE_FLOAT_UTIL_TCC_INCLUDED__ - #include "nbl/core/math/floatutil.h" #include "vectorSIMD.h" diff --git a/include/nbl/core/math/glslFunctions.h b/include/nbl/core/math/glslFunctions.h index 7fa7385dad..5fabd1126f 100644 --- a/include/nbl/core/math/glslFunctions.h +++ b/include/nbl/core/math/glslFunctions.h @@ -10,7 +10,7 @@ #include "nbl/type_traits.h" #include "nbl/core/math/floatutil.h" -#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/cpp_compat/matrix.hlsl" namespace nbl { @@ -23,7 +23,6 @@ template class vectorSIMD_32; class vectorSIMDf; - template NBL_FORCE_INLINE T radians(const T& degrees) { @@ -126,7 +125,7 @@ NBL_FORCE_INLINE T mix(const T & a, const T & b, const U & t) { for (uint32_t i=0u; i::value) + if constexpr(nbl::is_any_of::value) { retval[i] = core::mix(a[i], b[i], t[i]); } @@ -316,13 +315,13 @@ NBL_FORCE_INLINE T lerp(const T& a, const T& b, const U& t) return core::mix(a,b,t); } - // TODO : step,smoothstep,isnan,isinf,floatBitsToInt,floatBitsToUint,intBitsToFloat,uintBitsToFloat,frexp,ldexp // extra note, GCC breaks isfinite, isinf, isnan, isnormal, signbit in -ffast-math so need to implement ourselves // TODO : packUnorm2x16, packSnorm2x16, packUnorm4x8, packSnorm4x8, unpackUnorm2x16, unpackSnorm2x16, unpackUnorm4x8, unpackSnorm4x8, packHalf2x16, unpackHalf2x16, packDouble2x32, unpackDouble2x32 // MOVE : faceforward, reflect, refract, any, all, not template NBL_FORCE_INLINE T dot(const T& a, const T& b); + template<> NBL_FORCE_INLINE vectorSIMDf dot(const vectorSIMDf& a, const vectorSIMDf& b); template<> @@ -361,7 +360,7 @@ NBL_FORCE_INLINE vectorSIMDf cross(const vectorSIMDf& a, const vect template NBL_FORCE_INLINE T normalize(const T& v) { - auto d = dot(v, v); + auto d = core::dot(v, v); #ifdef __NBL_FAST_MATH return v * core::inversesqrt(d); #else diff --git a/include/nbl/core/math/glslFunctions.tcc b/include/nbl/core/math/glslFunctions.tcc index d2343f0322..5c86d4c6c1 100644 --- a/include/nbl/core/math/glslFunctions.tcc +++ b/include/nbl/core/math/glslFunctions.tcc @@ -228,7 +228,6 @@ NBL_FORCE_INLINE T max(const T& a, const T& b) return core::mix(a,vb,asmaller); } - template<> NBL_FORCE_INLINE vectorSIMDf dot(const vectorSIMDf& a, const vectorSIMDf& b) { diff --git a/include/nbl/core/math/intutil.h b/include/nbl/core/math/intutil.h index 768883f2b0..8e6bead24d 100644 --- a/include/nbl/core/math/intutil.h +++ b/include/nbl/core/math/intutil.h @@ -8,9 +8,9 @@ #include "BuildConfigOptions.h" +#include "nbl/builtin/hlsl/cpp_compat/intrinsics.h" #include "nbl/macros.h" #include "nbl/core/math/glslFunctions.h" -#include "nbl/builtin/hlsl/cpp_compat.hlsl" #include #include // For INT_MAX / UINT_MAX diff --git a/include/nbl/core/math/plane3dSIMD.h b/include/nbl/core/math/plane3dSIMD.h index f6afafd8de..2686c0a821 100644 --- 
a/include/nbl/core/math/plane3dSIMD.h +++ b/include/nbl/core/math/plane3dSIMD.h @@ -7,6 +7,8 @@ #define __NBL_CORE_PLANE_3D_H_INCLUDED__ #include +#include +#include namespace nbl { diff --git a/include/nbl/ext/MitsubaLoader/CMitsubaLoader.h b/include/nbl/ext/MitsubaLoader/CMitsubaLoader.h index 5deb0982db..3021cc42ec 100644 --- a/include/nbl/ext/MitsubaLoader/CMitsubaLoader.h +++ b/include/nbl/ext/MitsubaLoader/CMitsubaLoader.h @@ -13,12 +13,10 @@ #include "nbl/ext/MitsubaLoader/CMitsubaMetadata.h" #include "nbl/ext/MitsubaLoader/CElementShape.h" #include "nbl/ext/MitsubaLoader/SContext.h" - +#include namespace nbl::ext::MitsubaLoader { - - class CElementBSDF; class CMitsubaMaterialCompilerFrontend; @@ -27,7 +25,7 @@ class CMitsubaMaterialCompilerFrontend; //#include "nbl/builtin/glsl/ext/MitsubaLoader/instance_data_struct.glsl" #define uint uint32_t #define uvec2 uint64_t -#define mat4x3 nbl::core::matrix3x4SIMD +#define mat4x3 nbl::hlsl::float32_t3x4 #define nbl_glsl_MC_material_data_t asset::material_compiler::material_data_t struct nbl_glsl_ext_Mitsuba_Loader_instance_data_t { @@ -70,8 +68,8 @@ class CMitsubaLoader : public asset::IRenderpassIndependentPipelineLoader // core::vector getMesh(SContext& ctx, uint32_t hierarchyLevel, CElementShape* shape, const system::logger_opt_ptr& logger); - core::vector loadShapeGroup(SContext& ctx, uint32_t hierarchyLevel, const CElementShape::ShapeGroup* shapegroup, const core::matrix3x4SIMD& relTform, const system::logger_opt_ptr& _logger); - SContext::shape_ass_type loadBasicShape(SContext& ctx, uint32_t hierarchyLevel, CElementShape* shape, const core::matrix3x4SIMD& relTform, const system::logger_opt_ptr& logger); + core::vector loadShapeGroup(SContext& ctx, uint32_t hierarchyLevel, const CElementShape::ShapeGroup* shapegroup, const hlsl::float32_t3x4& relTform, const system::logger_opt_ptr& _logger); + SContext::shape_ass_type loadBasicShape(SContext& ctx, uint32_t hierarchyLevel, CElementShape* shape, const hlsl::float32_t3x4& relTform, const system::logger_opt_ptr& logger); void cacheTexture(SContext& ctx, uint32_t hierarchyLevel, const CElementTexture* texture, const CMitsubaMaterialCompilerFrontend::E_IMAGE_VIEW_SEMANTIC semantic); diff --git a/include/nbl/ext/MitsubaLoader/SContext.h b/include/nbl/ext/MitsubaLoader/SContext.h index 4069b03583..88129918cb 100644 --- a/include/nbl/ext/MitsubaLoader/SContext.h +++ b/include/nbl/ext/MitsubaLoader/SContext.h @@ -190,7 +190,7 @@ namespace MitsubaLoader struct SInstanceData { - SInstanceData(core::matrix3x4SIMD _tform, SContext::bsdf_type _bsdf, const std::string& _id, const CElementEmitter& _emitterFront, const CElementEmitter& _emitterBack) : + SInstanceData(hlsl::float32_t3x4 _tform, SContext::bsdf_type _bsdf, const std::string& _id, const CElementEmitter& _emitterFront, const CElementEmitter& _emitterBack) : tform(_tform), bsdf(_bsdf), #if defined(_NBL_DEBUG) || defined(_NBL_RELWITHDEBINFO) bsdf_id(_id), @@ -198,7 +198,7 @@ namespace MitsubaLoader emitter{_emitterFront, _emitterBack} {} - core::matrix3x4SIMD tform; + hlsl::float32_t3x4 tform; SContext::bsdf_type bsdf; #if defined(_NBL_DEBUG) || defined(_NBL_RELWITHDEBINFO) std::string bsdf_id; diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h index 8e451b7a8c..20b350be0b 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -279,7 +279,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat // 
https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdBuildAccelerationStructuresIndirectKHR-pInfos-03809 // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdBuildAccelerationStructuresIndirectKHR-pInfos-03810 // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkBuildAccelerationStructuresKHR-pInfos-03773 - if (Base::invalidInputBuffer(geometry.transform,buildRangeInfo.transformByteOffset,1u,sizeof(core::matrix3x4SIMD),sizeof(core::vectorSIMDf))) + if (Base::invalidInputBuffer(geometry.transform,buildRangeInfo.transformByteOffset,1u,sizeof(hlsl::float32_t3x4),sizeof(core::vectorSIMDf))) return false; } else diff --git a/include/quaternion.h b/include/quaternion.h deleted file mode 100644 index a0fd689dc6..0000000000 --- a/include/quaternion.h +++ /dev/null @@ -1,463 +0,0 @@ -// Copyright (C) 2019 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine" -// For conditions of distribution and use, see copyright notice in nabla.h -// See the original file in irrlicht source for authors - -#ifndef __NBL_QUATERNION_H_INCLUDED__ -#define __NBL_QUATERNION_H_INCLUDED__ - - -#include "vectorSIMD.h" - -#include "nbl/core/math/glslFunctions.h" - - -namespace nbl -{ -namespace core -{ - -class matrix3x4SIMD; - - -//! Quaternion class for representing rotations. -/** It provides cheap combinations and avoids gimbal locks. -Also useful for interpolations. */ -// TODO: move this to nbl::hlsl -class quaternion : public vectorSIMDf -{ - public: - //! Default Constructor - inline quaternion() : vectorSIMDf(0,0,0,1) {} - - inline quaternion(const quaternion& other) : vectorSIMDf(static_cast(other)) {} - - inline quaternion(const float* data) : vectorSIMDf(data) {} - - //! Constructor - inline quaternion(const float& x, const float& y, const float& z, const float& w) : vectorSIMDf(x,y,z,w) { } - - //! Constructor which converts euler angles (radians) to a quaternion - inline quaternion(const float& pitch, const float& yaw, const float& roll) {set(pitch,yaw,roll);} - - //! Constructor which converts a matrix to a quaternion - explicit quaternion(const matrix3x4SIMD& m); - - inline float* getPointer() {return pointer;} - - //! Equalilty operator - inline vector4db_SIMD operator==(const quaternion& other) const {return vectorSIMDf::operator==(other);} - - //! inequality operator - inline vector4db_SIMD operator!=(const quaternion& other) const {return vectorSIMDf::operator!=(other);} - - //! Assignment operator - inline quaternion& operator=(const quaternion& other) {return reinterpret_cast(vectorSIMDf::operator=(other));} - - //! Multiplication operator with scalar - inline quaternion operator*(const float& s) const - { - quaternion tmp; - reinterpret_cast(tmp) = reinterpret_cast(this)->operator*(s); - return tmp; - } - - //! Multiplication operator with scalar - inline quaternion& operator*=(const float& s) - { - *this = (*this)*s; - return *this; - } - - //! Multiplication operator - inline quaternion& operator*=(const quaternion& other) - { - *this = (*this)*other; - return *this; - } - - //! 
Multiplication operator - //http://momchil-velikov.blogspot.fr/2013/10/fast-sse-quternion-multiplication.html - inline quaternion operator*(const quaternion& other) const - { - __m128 xyzw = vectorSIMDf::getAsRegister(); - __m128 abcd = reinterpret_cast(other).getAsRegister(); - - __m128 t0 = FAST_FLOAT_SHUFFLE(abcd, _MM_SHUFFLE (3, 3, 3, 3)); /* 1, 0.5 */ - __m128 t1 = FAST_FLOAT_SHUFFLE(xyzw, _MM_SHUFFLE (2, 3, 0, 1)); /* 1, 0.5 */ - - __m128 t3 = FAST_FLOAT_SHUFFLE(abcd, _MM_SHUFFLE (0, 0, 0, 0)); /* 1, 0.5 */ - __m128 t4 = FAST_FLOAT_SHUFFLE(xyzw, _MM_SHUFFLE (1, 0, 3, 2)); /* 1, 0.5 */ - - __m128 t5 = FAST_FLOAT_SHUFFLE(abcd, _MM_SHUFFLE (1, 1, 1, 1)); /* 1, 0.5 */ - __m128 t6 = FAST_FLOAT_SHUFFLE(xyzw, _MM_SHUFFLE (2, 0, 3, 1)); /* 1, 0.5 */ - - /* [d,d,d,d]*[z,w,x,y] = [dz,dw,dx,dy] */ - __m128 m0 = _mm_mul_ps (t0, t1); /* 5/4, 1 */ - - /* [a,a,a,a]*[y,x,w,z] = [ay,ax,aw,az]*/ - __m128 m1 = _mm_mul_ps (t3, t4); /* 5/4, 1 */ - - /* [b,b,b,b]*[z,x,w,y] = [bz,bx,bw,by]*/ - __m128 m2 = _mm_mul_ps (t5, t6); /* 5/4, 1 */ - - /* [c,c,c,c]*[w,z,x,y] = [cw,cz,cx,cy] */ - __m128 t7 = FAST_FLOAT_SHUFFLE(abcd, _MM_SHUFFLE (2, 2, 2, 2)); /* 1, 0.5 */ - __m128 t8 = FAST_FLOAT_SHUFFLE(xyzw, _MM_SHUFFLE (3, 2, 0, 1)); /* 1, 0.5 */ - - __m128 m3 = _mm_mul_ps (t7, t8); /* 5/4, 1 */ - - /* 1 */ - /* [dz,dw,dx,dy]+-[ay,ax,aw,az] = [dz+ay,dw-ax,dx+aw,dy-az] */ - __m128 e = _mm_addsub_ps (m0, m1); /* 3, 1 */ - - /* 2 */ - /* [dx+aw,dz+ay,dy-az,dw-ax] */ - e = FAST_FLOAT_SHUFFLE(e, _MM_SHUFFLE (1, 3, 0, 2)); /* 1, 0.5 */ - - /* [dx+aw,dz+ay,dy-az,dw-ax]+-[bz,bx,bw,by] = [dx+aw+bz,dz+ay-bx,dy-az+bw,dw-ax-by]*/ - e = _mm_addsub_ps (e, m2); /* 3, 1 */ - - /* 2 */ - /* [dz+ay-bx,dw-ax-by,dy-az+bw,dx+aw+bz] */ - e = FAST_FLOAT_SHUFFLE(e, _MM_SHUFFLE (2, 0, 1, 3)); /* 1, 0.5 */ - - /* [dz+ay-bx,dw-ax-by,dy-az+bw,dx+aw+bz]+-[cw,cz,cx,cy] - = [dz+ay-bx+cw,dw-ax-by-cz,dy-az+bw+cx,dx+aw+bz-cy] */ - e = _mm_addsub_ps (e, m3); /* 3, 1 */ - - /* 2 */ - /* [dw-ax-by-cz,dz+ay-bx+cw,dy-az+bw+cx,dx+aw+bz-cy] */ - quaternion tmp; - reinterpret_cast(tmp) = FAST_FLOAT_SHUFFLE(e, _MM_SHUFFLE (2, 3, 1, 0)); /* 1, 0.5 */ - return tmp; - } - - inline vectorSIMDf transformVect(const vectorSIMDf& vec) - { - vectorSIMDf direction = *reinterpret_cast(this); - vectorSIMDf scale = core::length(direction); - direction.makeSafe3D(); - - return scale*vec+cross(direction,vec*W+cross(direction,vec))*2.f; - } - - //! Sets new quaternion - inline quaternion& set(const vectorSIMDf& xyzw) - { - *this = reinterpret_cast(xyzw); - return *this; - } - - //! Sets new quaternion based on euler angles (radians) - inline quaternion& set(const float& roll, const float& pitch, const float& yaw); - - //! Sets new quaternion from other quaternion - inline quaternion& set(const quaternion& quat) - { - *this = quat; - return *this; - } - - //! Inverts this quaternion - inline void makeInverse() - { - reinterpret_cast(*this) ^= _mm_set_epi32(0x0u,0x80000000u,0x80000000u,0x80000000u); - } - - //! Fills an angle (radians) around an axis (unit vector) - void toAngleAxis(float& angle, vector3df_SIMD& axis) const; - - //! Output this quaternion to an euler angle (radians) - void toEuler(vector3df_SIMD& euler) const; - - //! Set quaternion to identity - inline void makeIdentity() {vectorSIMDf::set(0,0,0,1);} - - - vectorSIMDf& getData() {return *((vectorSIMDf*)this);} - -//statics - inline static quaternion normalize(const quaternion& in) - { - quaternion tmp; - reinterpret_cast(tmp) = core::normalize(reinterpret_cast(in)); - return tmp; - } - - //! 
Helper func - static quaternion lerp(const quaternion &q1, const quaternion &q2, const float& interpolant, const bool& wrongDoubleCover); - - //! Set this quaternion to the linear interpolation between two quaternions - /** \param q1 First quaternion to be interpolated. - \param q2 Second quaternion to be interpolated. - \param interpolant Progress of interpolation. For interpolant=0 the result is - q1, for interpolant=1 the result is q2. Otherwise interpolation - between q1 and q2. - */ - static quaternion lerp(const quaternion &q1, const quaternion &q2, const float& interpolant); - - //! Helper func - static inline void flerp_interpolant_terms(float& interpolantPrecalcTerm2, float& interpolantPrecalcTerm3, const float& interpolant) - { - interpolantPrecalcTerm2 = (interpolant - 0.5f) * (interpolant - 0.5f); - interpolantPrecalcTerm3 = interpolant * (interpolant - 0.5f) * (interpolant - 1.f); - } - - static float flerp_adjustedinterpolant(const float& angle, const float& interpolant, const float& interpolantPrecalcTerm2, const float& interpolantPrecalcTerm3); - - //! Set this quaternion to the approximate slerp between two quaternions - /** \param q1 First quaternion to be interpolated. - \param q2 Second quaternion to be interpolated. - \param interpolant Progress of interpolation. For interpolant=0 the result is - q1, for interpolant=1 the result is q2. Otherwise interpolation - between q1 and q2. - */ - static quaternion flerp(const quaternion &q1, const quaternion &q2, const float& interpolant); - - //! Set this quaternion to the result of the spherical interpolation between two quaternions - /** \param q1 First quaternion to be interpolated. - \param q2 Second quaternion to be interpolated. - \param time Progress of interpolation. For interpolant=0 the result is - q1, for interpolant=1 the result is q2. Otherwise interpolation - between q1 and q2. - \param threshold To avoid inaccuracies the - interpolation switches to linear interpolation at some point. - This value defines how much of the interpolation will - be calculated with lerp. - */ - static quaternion slerp(const quaternion& q1, const quaternion& q2, - const float& interpolant, const float& threshold=.05f); - - inline static quaternion fromEuler(const vector3df_SIMD& euler) - { - quaternion tmp; - tmp.set(euler.X,euler.Y,euler.Z); - return tmp; - } - - inline static quaternion fromEuler(const vector3df& euler) - { - quaternion tmp; - tmp.set(euler.X,euler.Y,euler.Z); - return tmp; - } - - //! Set quaternion to represent a rotation from one vector to another. - static quaternion rotationFromTo(const vector3df_SIMD& from, const vector3df_SIMD& to); - - //! Create quaternion from rotation angle and rotation axis. - /** Axis must be unit length. - The quaternion representing the rotation is - q = cos(A/2)+sin(A/2)*(x*i+y*j+z*k). - \param angle Rotation Angle in radians. - \param axis Rotation axis. 
*/ - static quaternion fromAngleAxis(const float& angle, const vector3df_SIMD& axis); -}; -static_assert(sizeof(quaternion) == sizeof(vectorSIMDf), "Quaternion not same size as vec4"); - - -// set this quaternion to the result of the linear interpolation between two quaternions -inline quaternion quaternion::lerp(const quaternion &q1, const quaternion &q2, const float& interpolant, const bool& wrongDoubleCover) -{ - vectorSIMDf retval; - if (wrongDoubleCover) - retval = mix(reinterpret_cast(q1),-reinterpret_cast(q2),vectorSIMDf(interpolant)); - else - retval = mix(reinterpret_cast(q1), reinterpret_cast(q2),vectorSIMDf(interpolant)); - return reinterpret_cast(retval); -} - -// set this quaternion to the result of the linear interpolation between two quaternions -inline quaternion quaternion::lerp(const quaternion &q1, const quaternion &q2, const float& interpolant) -{ - const float angle = dot(q1,q2)[0]; - return lerp(q1,q2,interpolant,angle < 0.0f); -} - -// Arseny Kapoulkine -inline float quaternion::flerp_adjustedinterpolant(const float& angle, const float& interpolant, const float& interpolantPrecalcTerm2, const float& interpolantPrecalcTerm3) -{ - float A = 1.0904f + angle * (-3.2452f + angle * (3.55645f - angle * 1.43519f)); - float B = 0.848013f + angle * (-1.06021f + angle * 0.215638f); - float k = A * interpolantPrecalcTerm2 + B; - float ot = interpolant + interpolantPrecalcTerm3 * k; - return ot; -} - -// set this quaternion to the result of an approximate slerp -inline quaternion quaternion::flerp(const quaternion &q1, const quaternion &q2, const float& interpolant) -{ - const float angle = dot(q1,q2)[0]; - return lerp(q1,q2,flerp_adjustedinterpolant(fabsf(angle),interpolant,(interpolant - 0.5f) * (interpolant - 0.5f),interpolant * (interpolant - 0.5f) * (interpolant - 1.f)),angle < 0.0f); -} - - -// set this quaternion to the result of the interpolation between two quaternions -inline quaternion quaternion::slerp(const quaternion &q1, const quaternion &q2, const float& interpolant, const float& threshold) -{ - float angle = dot(q1,q2)[0]; - - // make sure we use the short rotation - bool wrongDoubleCover = angle < 0.0f; - if (wrongDoubleCover) - angle *= -1.f; - - if (angle <= (1.f-threshold)) // spherical interpolation - { // acosf + sinf - vectorSIMDf retval; - - const float sinARcp = inversesqrt(1.f-angle*angle); - const float sinAt = sinf(acosf(angle) * interpolant); // could this line be optimized? - //1sqrt 3min/add 5mul from now on - const float sinAt_over_sinA = sinAt*sinARcp; - - const float scale = core::sqrt(1.f-sinAt*sinAt)-angle*sinAt_over_sinA; //cosAt-cos(A)sin(tA)/sin(A) = (sin(A)cos(tA)-cos(A)sin(tA))/sin(A) - if (wrongDoubleCover) // make sure we use the short rotation - retval = reinterpret_cast(q1)*scale - reinterpret_cast(q2)*sinAt_over_sinA; - else - retval = reinterpret_cast(q1)*scale + reinterpret_cast(q2)*sinAt_over_sinA; - - return reinterpret_cast(retval); - } - else - return normalize(lerp(q1,q2,interpolant,wrongDoubleCover)); -} - - -#if !NBL_TEST_BROKEN_QUATERNION_USE -//! 
axis must be unit length, angle in radians -inline quaternion quaternion::fromAngleAxis(const float& angle, const vector3df_SIMD& axis) -{ - const float fHalfAngle = 0.5f*angle; - const float fSin = sinf(fHalfAngle); - quaternion retval; - reinterpret_cast(retval) = axis*fSin; - reinterpret_cast(retval).W = cosf(fHalfAngle); - return retval; -} - - -inline void quaternion::toAngleAxis(float& angle, vector3df_SIMD &axis) const -{ - vectorSIMDf scale = core::length(*reinterpret_cast(this)); - - if (scale.X==0.f) - { - angle = 0.0f; - axis.X = 0.0f; - axis.Y = 1.0f; - axis.Z = 0.0f; - } - else - { - axis = reinterpret_cast(this)->operator/(scale); - angle = 2.f * acosf(axis.W); - - axis.makeSafe3D(); - } -} - -inline void quaternion::toEuler(vector3df_SIMD& euler) const -{ - vectorSIMDf sqr = *reinterpret_cast(this); - sqr *= sqr; - const double test = 2.0 * (Y*W - X*Z); - - if (core::equals(test, 1.0, 0.000001)) - { - // heading = rotation about z-axis - euler.Z = (float) (-2.0*atan2(X, W)); - // bank = rotation about x-axis - euler.X = 0; - // attitude = rotation about y-axis - euler.Y = core::HALF_PI(); - } - else if (core::equals(test, -1.0, 0.000001)) - { - // heading = rotation about z-axis - euler.Z = (float) (2.0*atan2(X, W)); - // bank = rotation about x-axis - euler.X = 0; - // attitude = rotation about y-axis - euler.Y = -core::HALF_PI(); - } - else - { - // heading = rotation about z-axis - euler.Z = (float) atan2(2.0 * (X*Y +Z*W),(sqr.X - sqr.Y - sqr.Z + sqr.W)); - // bank = rotation about x-axis - euler.X = (float) atan2(2.0 * (Y*Z +X*W),(-sqr.X - sqr.Y + sqr.Z + sqr.W)); - // attitude = rotation about y-axis - euler.Y = (float) asin( core::clamp(test, -1.0, 1.0) ); - } -} - -inline quaternion quaternion::rotationFromTo(const vector3df_SIMD& from, const vector3df_SIMD& to) -{ - // Based on Stan Melax's article in Game Programming Gems - // Copy, since cannot modify local - vector3df_SIMD v0 = from; - vector3df_SIMD v1 = to; - v0 = core::normalize(v0); - v1 = core::normalize(v1); - - const vectorSIMDf dddd = core::dot(v0,v1); - quaternion tmp; - if (dddd.X >= 1.0f) // If dot == 1, vectors are the same - { - return tmp; - } - else if (dddd.X <= -1.0f) // exactly opposite - { - vector3df_SIMD axis(1.0f, 0.f, 0.f); - axis = cross(axis,v0); - if (length(axis)[0]==0.f) - { - axis.set(0.f,1.f,0.f); - axis = cross(axis,v0); - } - // same as fromAngleAxis(PI, axis).normalize(); - reinterpret_cast(tmp) = axis; - return normalize(tmp); - } - - vectorSIMDf s = core::sqrt(vectorSIMDf(2.f,2.f,2.f,0.f)+dddd*2.f); - reinterpret_cast(tmp) = cross(v0,v1)*reciprocal_approxim(s); - tmp.W = s.X*0.5f; - return normalize(tmp); -} -#endif - -// sets new quaternion based on euler angles -inline quaternion& quaternion::set(const float& roll, const float& pitch, const float& yaw) -{ - float angle; - - angle = roll * 0.5f; - const float sr = sinf(angle); - const float cr = cosf(angle); - - angle = pitch * 0.5f; - const float sp = sinf(angle); - const float cp = cos(angle); - - angle = yaw * 0.5f; - const float sy = sinf(angle); - const float cy = cosf(angle); - - const float cpcy = cp * cy; - const float spcy = sp * cy; - const float cpsy = cp * sy; - const float spsy = sp * sy; - - *reinterpret_cast(this) = vectorSIMDf(sr,cr,cr,cr)*vectorSIMDf(cpcy,spcy,cpsy,cpcy)+vectorSIMDf(-cr,sr,-sr,sr)*vectorSIMDf(spsy,cpsy,spcy,spsy); - - return *this; -} - -} // end namespace core -} // end namespace nbl - -#endif - diff --git a/include/vectorSIMD.h b/include/vectorSIMD.h index 4c9c90d236..7d0dc8d966 100644 --- 
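For reference, the quaternion::slerp removed above evaluates the standard spherical interpolation: with cos(A) = <q1,q2> (q2 negated first when the dot product is negative, so the short arc is taken), the result is sin((1-t)A)/sin(A) * q1 + sin(t*A)/sin(A) * q2, falling back to a normalized lerp once cos(A) approaches 1. A scalar reference sketch of that formula, illustrative only (none of these names are engine types):

#include <cmath>

struct quat_ref { float x, y, z, w; };

quat_ref slerp_reference(quat_ref q1, quat_ref q2, float t)
{
    float cosA = q1.x*q2.x + q1.y*q2.y + q1.z*q2.z + q1.w*q2.w;
    if (cosA < 0.f) // take the short arc
    {
        cosA = -cosA;
        q2 = {-q2.x, -q2.y, -q2.z, -q2.w};
    }
    // the real implementation switches to a normalized lerp near cosA == 1,
    // which also avoids the division by sin(A) below going singular
    const float A = std::acos(cosA);
    const float w1 = std::sin((1.f - t)*A)/std::sin(A);
    const float w2 = std::sin(t*A)/std::sin(A);
    return { w1*q1.x + w2*q2.x, w1*q1.y + w2*q2.y, w1*q1.z + w2*q2.z, w1*q1.w + w2*q2.w };
}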
a/include/vectorSIMD.h +++ b/include/vectorSIMD.h @@ -887,6 +887,24 @@ namespace core } }; + inline hlsl::float32_t4 getAsVec4(const vectorSIMDf& vec) + { + hlsl::float32_t4 output; + for (int i = 0; i < 4; ++i) + output[i] = vec[i]; + + return output; + } + + inline hlsl::float32_t3 getAsVec3(const vectorSIMDf& vec) + { + hlsl::float32_t3 output; + for (int i = 0; i < 3; ++i) + output[i] = vec[i]; + + return output; + } + } // end namespace core } // end namespace nbl diff --git a/src/nbl/asset/IAssetManager.cpp b/src/nbl/asset/IAssetManager.cpp index e6a23fdb44..8b65544136 100644 --- a/src/nbl/asset/IAssetManager.cpp +++ b/src/nbl/asset/IAssetManager.cpp @@ -297,7 +297,7 @@ void IAssetManager::insertBuiltinAssets() addBuiltInToCaches(ds3Layout, "nbl/builtin/material/lambertian/singletexture/descriptor_set_layout/3"); // TODO find everything what has been using it so far constexpr uint32_t pcCount = 1u; - asset::SPushConstantRange pcRanges[pcCount] = {asset::IShader::E_SHADER_STAGE::ESS_VERTEX,0u,sizeof(core::matrix4SIMD)}; + asset::SPushConstantRange pcRanges[pcCount] = {asset::IShader::E_SHADER_STAGE::ESS_VERTEX,0u,sizeof(hlsl::float32_t3x4)}; auto pLayout = core::make_smart_refctd_ptr( std::span(pcRanges,pcCount), nullptr,core::smart_refctd_ptr(ds1Layout),nullptr,core::smart_refctd_ptr(ds3Layout) diff --git a/src/nbl/asset/utils/CGeometryCreator.cpp b/src/nbl/asset/utils/CGeometryCreator.cpp index c740b85d97..ac065a8c97 100644 --- a/src/nbl/asset/utils/CGeometryCreator.cpp +++ b/src/nbl/asset/utils/CGeometryCreator.cpp @@ -696,10 +696,9 @@ CGeometryCreator::return_type CGeometryCreator::createDiskMesh(float radius, uin //v1, v2, ..., vn-1 for (int i = 2; i < vertexCount-1; i++) { - core::vectorSIMDf vn; hlsl::float32_t3x4 rotMatrix; - rotMatrix.setRotation(hlsl::quaternion(0.0f, 0.0f, core::radians((i-1)*angle))); - rotMatrix.transformVect(vn, v0); + hlsl::setRotation(rotMatrix, hlsl::quaternion::create(0.0f, 0.0f, core::radians((i - 1) * angle))); + core::vectorSIMDf vn = hlsl::transformVector(hlsl::getMatrix3x4As4x4(rotMatrix), v0); ptr[i] = DiskVertex(vn, video::SColor(0xFFFFFFFFu), core::vector2du32_SIMD(0u, 1u), core::vector3df_SIMD(0.0f, 0.0f, 1.0f)); diff --git a/src/nbl/asset/utils/CMeshManipulator.cpp b/src/nbl/asset/utils/CMeshManipulator.cpp index 86ead8d302..a76b1ed9d8 100644 --- a/src/nbl/asset/utils/CMeshManipulator.cpp +++ b/src/nbl/asset/utils/CMeshManipulator.cpp @@ -1708,19 +1708,20 @@ hlsl::float32_t3x4 IMeshManipulator::calculateOBB(const nbl::asset::ICPUMeshBuff float ABBQuality = ABBDiff.x * ABBDiff.y + ABBDiff.y * ABBDiff.z + ABBDiff.z * ABBDiff.x; hlsl::float32_t3x4 scaleMat; hlsl::float32_t3x4 translationMat; - hlsl::setTranslation(translationMat, -(MinPoint) / OBBDiff); - hlsl::setScale(scaleMat, OBBDiff); + hlsl::setTranslation(translationMat, core::getAsVec3(-(MinPoint) / OBBDiff)); + hlsl::setScale(scaleMat, core::getAsVec3(OBBDiff)); TransMat = hlsl::concatenateBFollowedByA(TransMat, scaleMat); TransMat = hlsl::concatenateBFollowedByA(TransMat, translationMat); - if (ABBQuality < OBBQuality) { - translationMat.setTranslation(-(AABBMin) / ABBDiff); - scaleMat.setScale(ABBDiff); - TransMat = core::matrix3x4SIMD( + if (ABBQuality < OBBQuality) + { + hlsl::setTranslation(translationMat, core::getAsVec3(-(AABBMin) / ABBDiff)); + hlsl::setScale(scaleMat, core::getAsVec3(ABBDiff)); + TransMat = hlsl::float32_t3x4( 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0); - TransMat = core::concatenateBFollowedByA(TransMat, scaleMat); - TransMat = 
core::concatenateBFollowedByA(TransMat, translationMat); + TransMat = hlsl::concatenateBFollowedByA(TransMat, scaleMat); + TransMat = hlsl::concatenateBFollowedByA(TransMat, translationMat); } return TransMat; From 6f9f099ae6d737d710a8a096562641421b5f4a80 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Tue, 29 Oct 2024 14:40:52 -0700 Subject: [PATCH 112/358] Removed core::matrixSIMD --- examples_tests | 2 +- .../nbl/builtin/hlsl/cpp_compat/intrinsics.h | 2 ++ .../transformation_matrix_utils.hlsl | 24 +++++++++---------- .../hlsl/vector_utils/vector_utils.hlsl | 21 ++++++++++++++++ include/nbl/core/math/glslFunctions.h | 8 +++++-- include/nbl/scene/ILevelOfDetailLibrary.h | 3 ++- 6 files changed, 44 insertions(+), 16 deletions(-) create mode 100644 include/nbl/builtin/hlsl/vector_utils/vector_utils.hlsl diff --git a/examples_tests b/examples_tests index 6cbd85caa3..b69152cb35 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 6cbd85caa3d646969d7724ca7f07b0b6a72e0957 +Subproject commit b69152cb350ddd994c5d7fde9a89a1d989fef61b diff --git a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h index 4d10a669c3..f3f0e9e3eb 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h +++ b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h @@ -41,6 +41,8 @@ NBL_BIT_OP_GLM_PASSTHROUGH(bitCount,bitCount) NBL_SIMPLE_GLM_PASSTHROUGH(cross,cross) NBL_SIMPLE_GLM_PASSTHROUGH(clamp,clamp) +NBL_SIMPLE_GLM_PASSTHROUGH(normalize, normalize) +NBL_SIMPLE_GLM_PASSTHROUGH(length, length) template inline typename scalar_type::type dot(const T& lhs, const T& rhs) {return glm::dot(lhs,rhs);} diff --git a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl index 8f687fb97a..63d764693d 100644 --- a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl +++ b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl @@ -66,14 +66,14 @@ inline matrix buildCameraLookAtMatrixLH( const vector& target, const vector& upVector) { - const vector zaxis = normalize(target - position); - const vector xaxis = normalize(cross(upVector, zaxis)); - const vector yaxis = cross(zaxis, xaxis); + const vector zaxis = hlsl::normalize(target - position); + const vector xaxis = hlsl::normalize(hlsl::cross(upVector, zaxis)); + const vector yaxis = hlsl::cross(zaxis, xaxis); matrix r; - r[0] = vector(xaxis, -dot(xaxis, position)); - r[1] = vector(yaxis, -dot(yaxis, position)); - r[2] = vector(zaxis, -dot(zaxis, position)); + r[0] = vector(xaxis, -hlsl::dot(xaxis, position)); + r[1] = vector(yaxis, -hlsl::dot(yaxis, position)); + r[2] = vector(zaxis, -hlsl::dot(zaxis, position)); return r; } @@ -84,14 +84,14 @@ inline matrix buildCameraLookAtMatrixRH( const vector& target, const vector& upVector) { - const vector zaxis = normalize(position - target); - const vector xaxis = normalize(cross(upVector, zaxis)); - const vector yaxis = cross(zaxis, xaxis); + const vector zaxis = hlsl::normalize(position - target); + const vector xaxis = hlsl::normalize(hlsl::cross(upVector, zaxis)); + const vector yaxis = hlsl::cross(zaxis, xaxis); float32_t3x4 r; - r[0] = vector(xaxis, -dot(xaxis, position)); - r[1] = vector(yaxis, -dot(yaxis, position)); - r[2] = vector(zaxis, -dot(zaxis, position)); + r[0] = vector(xaxis, -hlsl::dot(xaxis, position)); + r[1] = vector(yaxis, -hlsl::dot(yaxis, position)); + r[2] = vector(zaxis, -hlsl::dot(zaxis, position)); return r; } diff 
--git a/include/nbl/builtin/hlsl/vector_utils/vector_utils.hlsl b/include/nbl/builtin/hlsl/vector_utils/vector_utils.hlsl new file mode 100644 index 0000000000..e1fa9dd3a0 --- /dev/null +++ b/include/nbl/builtin/hlsl/vector_utils/vector_utils.hlsl @@ -0,0 +1,21 @@ +#ifndef _NBL_BUILTIN_HLSL_VECTOR_UTILS_VECTOR_UTILS_INCLUDED_ +#define _NBL_BUILTIN_HLSL_VECTOR_UTILS_VECTOR_UTILS_INCLUDED_ + +#include + +namespace nbl +{ +namespace hlsl +{ + +// TODO: why cant I NBL_CONST_REF_ARG(vector) +template +inline T lengthsquared(vector vec) +{ + return dot(vec, vec); +} + +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/core/math/glslFunctions.h b/include/nbl/core/math/glslFunctions.h index 5fabd1126f..30a0344501 100644 --- a/include/nbl/core/math/glslFunctions.h +++ b/include/nbl/core/math/glslFunctions.h @@ -320,7 +320,12 @@ NBL_FORCE_INLINE T lerp(const T& a, const T& b, const U& t) // TODO : packUnorm2x16, packSnorm2x16, packUnorm4x8, packSnorm4x8, unpackUnorm2x16, unpackSnorm2x16, unpackUnorm4x8, unpackSnorm4x8, packHalf2x16, unpackHalf2x16, packDouble2x32, unpackDouble2x32 // MOVE : faceforward, reflect, refract, any, all, not template -NBL_FORCE_INLINE T dot(const T& a, const T& b); +NBL_FORCE_INLINE T dot(const T& a, const T& b) +{ + static_assert(!(std::is_same_v || std::is_same_v || std::is_same_v)); + + return T(0); +} template<> NBL_FORCE_INLINE vectorSIMDf dot(const vectorSIMDf& a, const vectorSIMDf& b); @@ -329,7 +334,6 @@ NBL_FORCE_INLINE vectorSIMD_32 dot>(const vector template<> NBL_FORCE_INLINE vectorSIMD_32 dot>(const vectorSIMD_32& a, const vectorSIMD_32& b); - template NBL_FORCE_INLINE T lengthsquared(const T& v) { diff --git a/include/nbl/scene/ILevelOfDetailLibrary.h b/include/nbl/scene/ILevelOfDetailLibrary.h index 6b68189e7d..be3794f966 100644 --- a/include/nbl/scene/ILevelOfDetailLibrary.h +++ b/include/nbl/scene/ILevelOfDetailLibrary.h @@ -7,6 +7,7 @@ #include "nbl/video/ILogicalDevice.h" #include "nbl/video/utilities/IDrawIndirectAllocator.h" +#include "nbl/builtin/hlsl/cpp_compat/intrinsics.h" namespace nbl::scene { @@ -30,7 +31,7 @@ class ILevelOfDetailLibrary : public virtual core::IReferenceCounted { if (proj[3].w!=0.f) return core::nan(); - return abs(proj[0].x*proj[1].y-proj[0].y*proj[1].x)/core::dot(proj[3],proj[3]).x; + return abs(proj[0].x*proj[1].y-proj[0].y*proj[1].x)/hlsl::dot(proj[3],proj[3]); } }; template class container=core::vector> From 339ec8e33f1dabc6484a3bad1ae2a977bb06c773 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Tue, 29 Oct 2024 17:46:33 -0700 Subject: [PATCH 113/358] Fixed `length` function --- examples_tests | 2 +- include/nbl/builtin/hlsl/cpp_compat/intrinsics.h | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/examples_tests b/examples_tests index 9213b8933d..b1d968927d 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 9213b8933dee9fcdd415587bf1ca1c44ae1e9e70 +Subproject commit b1d968927dc75a83f8f2292ed03f48cdb9d10847 diff --git a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h index f56c3553cf..357b89b619 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h +++ b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h @@ -43,7 +43,12 @@ NBL_BIT_OP_GLM_PASSTHROUGH(bitCount,bitCount) NBL_SIMPLE_GLM_PASSTHROUGH(cross,cross) NBL_SIMPLE_GLM_PASSTHROUGH(clamp,clamp) NBL_SIMPLE_GLM_PASSTHROUGH(normalize, normalize) -NBL_SIMPLE_GLM_PASSTHROUGH(length, length) + +template +inline scalar_type_t length(const T& vec) +{ + 
return glm::length(vec); +} template inline scalar_type_t dot(const T& lhs, const T& rhs) From c6ba21128a27279b585f1966aa46fb169051810f Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 30 Oct 2024 12:46:26 +0100 Subject: [PATCH 114/358] improve the function template constraints a lot --- include/nbl/builtin/hlsl/concepts.hlsl | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/include/nbl/builtin/hlsl/concepts.hlsl b/include/nbl/builtin/hlsl/concepts.hlsl index 033709e726..fea6dacc92 100644 --- a/include/nbl/builtin/hlsl/concepts.hlsl +++ b/include/nbl/builtin/hlsl/concepts.hlsl @@ -34,10 +34,8 @@ // put just before closing `>` on the partial template specialization Type args, e.g. `MyStruct)> #define NBL_PARTIAL_REQ_BOT(...) -// condition -#define NBL_FUNC_REQUIRES_BEGIN(...) requires (__VA_ARGS__) -// return value -#define NBL_FUNC_REQUIRES_END(...) __VA_ARGS__ +// condition, use instead of the closing `>` of a function template +#define NBL_FUNC_REQUIRES(...) > requires (__VA_ARGS__) #include @@ -120,10 +118,8 @@ concept matricial = is_matrix::value; // put just before closing `>` on the partial template specialization Type args, e.g. `MyStruct)> #define NBL_PARTIAL_REQ_BOT(...) ,std::enable_if_t<(__VA_ARGS__),void> -// condition, use right after the closing `>` of a function template -#define NBL_FUNC_REQUIRES_BEGIN(...) ::nbl::hlsl::enable_if_t<(__VA_ARGS__), -// return value, use `END(T)` instead of the return value type declaration -#define NBL_FUNC_REQUIRES_END(...) __VA_ARGS__> +// condition, use instead of the closing `>` of a function template +#define NBL_FUNC_REQUIRES(...) ,std::enable_if_t<(__VA_ARGS__),bool> = true> #endif From 8497a5e6af89ff75a6784fa0faa1148c8c0f48ff Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Wed, 30 Oct 2024 23:31:45 +0330 Subject: [PATCH 115/358] remove redundant `move` and call to `equal_range` Signed-off-by: Ali Cheraghi --- src/nbl/asset/utils/IShaderCompiler.cpp | 36 +++++++++++-------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/src/nbl/asset/utils/IShaderCompiler.cpp b/src/nbl/asset/utils/IShaderCompiler.cpp index d4b88d3853..bde89557ec 100644 --- a/src/nbl/asset/utils/IShaderCompiler.cpp +++ b/src/nbl/asset/utils/IShaderCompiler.cpp @@ -29,7 +29,7 @@ inline core::smart_refctd_ptr nbl::asset::IShaderCompiler::compileTo CCache::SEntry entry; std::vector dependencies; if (options.readCache || options.writeCache) - entry = std::move(CCache::SEntry(code, options)); + entry = CCache::SEntry(code, options); if (options.readCache) { @@ -270,31 +270,25 @@ core::smart_refctd_ptr IShaderCompiler::CCache::find(const SE IShaderCompiler::CCache::EntrySet::const_iterator IShaderCompiler::CCache::find_impl(const SEntry& mainFile, const IShaderCompiler::CIncludeFinder* finder) const { - auto foundRange = m_container.equal_range(mainFile); - for (auto& found = foundRange.first; found != foundRange.second; found++) + auto found = m_container.find(mainFile); + // go through all dependencies + for (auto i = 0; i < found->dependencies.size(); i++) { - bool allDependenciesMatch = true; - // go through all dependencies - for (auto i = 0; i < found->dependencies.size(); i++) - { - const auto& dependency = found->dependencies[i]; + const auto& dependency = found->dependencies[i]; - IIncludeLoader::found_t header; - if (dependency.standardInclude) - header = finder->getIncludeStandard(dependency.requestingSourceDir, dependency.identifier); - else - header = finder->getIncludeRelative(dependency.requestingSourceDir, 
dependency.identifier); + IIncludeLoader::found_t header; + if (dependency.standardInclude) + header = finder->getIncludeStandard(dependency.requestingSourceDir, dependency.identifier); + else + header = finder->getIncludeRelative(dependency.requestingSourceDir, dependency.identifier); - if (header.hash != dependency.hash) - { - allDependenciesMatch = false; - break; - } + if (header.hash != dependency.hash) + { + return m_container.end(); } - if (allDependenciesMatch) - return found; } - return m_container.end(); + + return found; } core::smart_refctd_ptr IShaderCompiler::CCache::serialize() const From bfdbdbcd9a1284f5360eb20f4a95af2c351c8c2d Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Tue, 29 Oct 2024 12:51:27 -0700 Subject: [PATCH 116/358] Refactor --- examples_tests | 2 +- .../hlsl/math/quaternion/quaternion.hlsl | 36 ++++------- .../transformation_matrix_utils.hlsl | 64 ++++++++++++++++++- .../builtin/hlsl/projection/projection.hlsl | 51 +++++++++++++++ 4 files changed, 129 insertions(+), 24 deletions(-) diff --git a/examples_tests b/examples_tests index b1d968927d..1a694ebbd2 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit b1d968927dc75a83f8f2292ed03f48cdb9d10847 +Subproject commit 1a694ebbd26cd62a92d791fed4ac1b35f176d531 diff --git a/include/nbl/builtin/hlsl/math/quaternion/quaternion.hlsl b/include/nbl/builtin/hlsl/math/quaternion/quaternion.hlsl index 2f24e52635..bc0286e778 100644 --- a/include/nbl/builtin/hlsl/math/quaternion/quaternion.hlsl +++ b/include/nbl/builtin/hlsl/math/quaternion/quaternion.hlsl @@ -17,26 +17,26 @@ namespace hlsl /** It provides cheap combinations and avoids gimbal locks. Also useful for interpolations. */ -template +template struct quaternion { // i*data[0] + j*data[1] + k*data[2] + data[3] - using vec_t = vector; - vector data; + using vec_t = vector; + vector data; //! 
creates identity quaternion static inline quaternion create() { quaternion q; - q.data = vector(0.0f, 0.0f, 0.0f, 1.0f); + q.data = vector(0.0f, 0.0f, 0.0f, 1.0f); return q; } - static inline quaternion create(T x, T y, T z, T w) + static inline quaternion create(float_t x, float_t y, float_t z, float_t w) { quaternion q; - q.data = vector(x, y, z, w); + q.data = vector(x, y, z, w); return q; } @@ -46,7 +46,7 @@ struct quaternion return other; } - static inline quaternion create(T pitch, T yaw, T roll) + static inline quaternion create(float_t pitch, float_t yaw, float_t roll) { float angle; @@ -67,7 +67,7 @@ struct quaternion const float cpsy = cp * sy; const float spsy = sp * sy; - quaternion output; + quaternion output; output.data = float32_t4(sr, cr, cr, cr) * float32_t4(cpcy, spcy, cpsy, cpcy) + float32_t4(-cr, sr, -sr, sr) * float32_t4(spsy, cpsy, spcy, spsy); return output; @@ -76,20 +76,12 @@ struct quaternion // TODO: //explicit quaternion(NBL_CONST_REF_ARG(float32_t3x4) m) {} -#define DEFINE_MUL_QUATERNION_BY_SCALAR_OPERATOR(TYPE)\ - inline quaternion operator*(TYPE scalar)\ - {\ - quaternion output;\ - output.data = data * scalar;\ - return output;\ - }\ - - DEFINE_MUL_QUATERNION_BY_SCALAR_OPERATOR(uint32_t) - DEFINE_MUL_QUATERNION_BY_SCALAR_OPERATOR(uint64_t) - DEFINE_MUL_QUATERNION_BY_SCALAR_OPERATOR(float32_t) - DEFINE_MUL_QUATERNION_BY_SCALAR_OPERATOR(float64_t) - -#undef DEFINE_MUL_QUATERNION_BY_SCALAR_OPERATOR + inline quaternion operator*(float_t scalar) + { + quaternion output; + output.data = data * scalar; + return output; + } inline quaternion operator*(NBL_CONST_REF_ARG(quaternion) other) { diff --git a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl index 63d764693d..8fcccdafda 100644 --- a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl +++ b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl @@ -12,7 +12,16 @@ namespace nbl namespace hlsl { -NBL_CONSTEXPR hlsl::float32_t3x4 IdentityFloat32_t3x4 = hlsl::float32_t3x4(hlsl::float32_t4(1, 0, 0, 0), hlsl::float32_t4(0, 0, 1, 0), hlsl::float32_t4(0, 0, 1, 0)); +// TODO: if `IdentityFloat32_t3x4` and `IdentityFloat32_t3x4` constexprs are ok, then I can expand them into templated struct, not doing it untill the concept is approved +//template +//struct IdentityMatrix +//{ +// +//}; +NBL_CONSTEXPR hlsl::float32_t3x4 IdentityFloat32_t3x4 = + hlsl::float32_t3x4(hlsl::float32_t4(1, 0, 0, 0), hlsl::float32_t4(0, 0, 1, 0), hlsl::float32_t4(0, 0, 1, 0)); +NBL_CONSTEXPR hlsl::float32_t4x4 IdentityFloat32_t4x4 = + hlsl::float32_t4x4(hlsl::float32_t4(1, 0, 0, 0), hlsl::float32_t4(0, 0, 1, 0), hlsl::float32_t4(0, 0, 1, 0), hlsl::float32_t4(0, 0, 0, 1)); // TODO: this is temporary function, delete when removing vectorSIMD template @@ -40,6 +49,12 @@ inline matrix getMatrix3x4As4x4(matrix mat) return output; } +template +inline matrix getSub3x3(matrix mat) +{ + return matrix(mat); +} + template inline matrix getAs64BitPrecisionMatrix(matrix mat) { @@ -50,6 +65,53 @@ inline matrix getAs64BitPrecisionMatrix(matrix return output; } +namespace transformation_matrix_utils_impl +{ + template + inline T determinant_helper(matrix mat, vector& r1crossr2) + { + r1crossr2 = hlsl::cross(mat[1], mat[2]); + return hlsl::dot(mat[0], r1crossr2); + } +} + +template +inline matrix getSub3x3TransposeCofactors(const matrix& mat) +{ + static_assert(N >= 3 && M >= 3); + + matrix output; + vector row0 = vector(mat[0]); 
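+    // note: each row of the transpose-cofactor (adjugate) of a 3x3 matrix is the cross product of the other two input rows, hence the three cross() calls below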
+ vector row1 = vector(mat[1]); + vector row2 = vector(mat[2]); + output[0] = hlsl::cross(row1, row2); + output[1] = hlsl::cross(row2, row0); + output[2] = hlsl::cross(row0, row1); + + output[0] = hlsl::cross(row0, row1); + + return output; +} + +template +inline bool getSub3x3InverseTranspose(const matrix& matIn, matrix& matOut) +{ + matrix matIn3x3 = getSub3x3(matIn); + vector r1crossr2; + T d = transformation_matrix_utils_impl::determinant_helper(matIn3x3, r1crossr2); + if (core::iszero(d, FLT_MIN)) + return false; + auto rcp = core::reciprocal(d); + + // matrix of cofactors * 1/det + matOut = getSub3x3TransposeCofactors(matIn3x3); + matOut[0] *= rcp; + matOut[1] *= rcp; + matOut[2] *= rcp; + + return true; +} + // TODO: use portable_float when merged //! multiplies matrices a and b, 3x4 matrices are treated as 4x4 matrices with 4th row set to (0, 0, 0 ,1) template diff --git a/include/nbl/builtin/hlsl/projection/projection.hlsl b/include/nbl/builtin/hlsl/projection/projection.hlsl index d973fda25d..ecf180b5c6 100644 --- a/include/nbl/builtin/hlsl/projection/projection.hlsl +++ b/include/nbl/builtin/hlsl/projection/projection.hlsl @@ -27,6 +27,57 @@ inline float32_t4x4 buildProjectionMatrixPerspectiveFovLH(float fieldOfViewRadia return m; } +inline float32_t4x4 buildProjectionMatrixPerspectiveFovRH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar) +{ + const float h = core::reciprocal(tanf(fieldOfViewRadians * 0.5f)); +#ifndef __HLSL_VERSION + _NBL_DEBUG_BREAK_IF(aspectRatio == 0.f); //division by zero + _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero +#endif + const float w = h / aspectRatio; + + float32_t4x4 m; + m[0] = hlsl::float32_t4(w, 0.f, 0.f, 0.f); + m[1] = hlsl::float32_t4(0.f, -h, 0.f, 0.f); + m[2] = hlsl::float32_t4(0.f, 0.f, -zFar / (zFar - zNear), -zNear * zFar / (zFar - zNear)); + m[3] = hlsl::float32_t4(0.f, 0.f, -1.f, 0.f); + + return m; +} + +inline float32_t4x4 buildProjectionMatrixOrthoRH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar) +{ +#ifndef __HLSL_VERSION + _NBL_DEBUG_BREAK_IF(widthOfViewVolume == 0.f); //division by zero + _NBL_DEBUG_BREAK_IF(heightOfViewVolume == 0.f); //division by zero + _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero +#endif + + float32_t4x4 m; + m[0] = hlsl::float32_t4(2.f / widthOfViewVolume, 0.f, 0.f, 0.f); + m[1] = hlsl::float32_t4(0.f, -2.f / heightOfViewVolume, 0.f, 0.f); + m[2] = hlsl::float32_t4(0.f, 0.f, -1.f / (zFar - zNear), -zNear / (zFar - zNear)); + m[3] = hlsl::float32_t4(0.f, 0.f, 0.f, 1.f); + + return m; +} +inline float32_t4x4 buildProjectionMatrixOrthoLH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar) +{ +#ifndef __HLSL_VERSION + _NBL_DEBUG_BREAK_IF(widthOfViewVolume == 0.f); //division by zero + _NBL_DEBUG_BREAK_IF(heightOfViewVolume == 0.f); //division by zero + _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero +#endif + + float32_t4x4 m; + m[0] = hlsl::float32_t4(2.f / widthOfViewVolume, 0.f, 0.f, 0.f); + m[1] = hlsl::float32_t4(0.f, -2.f / heightOfViewVolume, 0.f, 0.f); + m[2] = hlsl::float32_t4(0.f, 0.f, 1.f / (zFar - zNear), -zNear / (zFar - zNear)); + m[3] = hlsl::float32_t4(0.f, 0.f, 0.f, 1.f); + + return m; +} + } } From f00262d5d3375720b5ed3ad7f2c7b32f209566a7 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Thu, 31 Oct 2024 15:45:05 -0700 Subject: [PATCH 117/358] Refactor --- .../hlsl/matrix_utils/transformation_matrix_utils.hlsl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl index 8fcccdafda..6938709e78 100644 --- a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl +++ b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl @@ -39,7 +39,7 @@ inline core::vectorSIMDf transformVector(const matrix& mat, const core: } template -inline matrix getMatrix3x4As4x4(matrix mat) +inline matrix getMatrix3x4As4x4(const matrix& mat) { matrix output; for (int i = 0; i < 3; ++i) @@ -50,13 +50,13 @@ inline matrix getMatrix3x4As4x4(matrix mat) } template -inline matrix getSub3x3(matrix mat) +inline matrix getSub3x3(const matrix& mat) { return matrix(mat); } template -inline matrix getAs64BitPrecisionMatrix(matrix mat) +inline matrix getAs64BitPrecisionMatrix(const matrix& mat) { matrix output; for (int i = 0; i < N; ++i) @@ -68,7 +68,7 @@ inline matrix getAs64BitPrecisionMatrix(matrix namespace transformation_matrix_utils_impl { template - inline T determinant_helper(matrix mat, vector& r1crossr2) + inline T determinant_helper(const matrix& mat, vector& r1crossr2) { r1crossr2 = hlsl::cross(mat[1], mat[2]); return hlsl::dot(mat[0], r1crossr2); From a4a9fc9dda20bbedfad320897c3d5e8a795db734 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 4 Nov 2024 12:13:35 +0100 Subject: [PATCH 118/358] add concept macros! --- include/nbl/builtin/hlsl/concepts.hlsl | 108 ++++++++++++++++------ include/nbl/builtin/hlsl/type_traits.hlsl | 3 + 2 files changed, 84 insertions(+), 27 deletions(-) diff --git a/include/nbl/builtin/hlsl/concepts.hlsl b/include/nbl/builtin/hlsl/concepts.hlsl index fea6dacc92..bf16d3d1c5 100644 --- a/include/nbl/builtin/hlsl/concepts.hlsl +++ b/include/nbl/builtin/hlsl/concepts.hlsl @@ -10,14 +10,31 @@ #include -#ifndef __HLSL_VERSION - -// TODO: old stuff, see how much we can remove -#define NBL_CONCEPT_TYPE_PARAMS(...) template <__VA_ARGS__> -#define NBL_CONCEPT_SIGNATURE(NAME, ...) concept NAME = requires(__VA_ARGS__) -#define NBL_CONCEPT_BODY(...) { __VA_ARGS__ }; -#define NBL_CONCEPT_ASSIGN(NAME, ...) concept NAME = __VA_ARGS__; -#define NBL_REQUIRES(...) requires __VA_ARGS__ +namespace nbl +{ +namespace hlsl +{ +namespace concepts +{ +// common implementation juice +#include +#define NBL_IMPL_CONCEPT_FULL_TPLT(z, n, unused) BOOST_PP_SEQ_ELEM(n,NBL_CONCEPT_TPLT_PRM_KINDS) BOOST_PP_SEQ_ELEM(n,NBL_CONCEPT_TPLT_PRM_NAMES) +#include +#define NBL_CONCEPT_FULL_TPLT() BOOST_PP_ENUM(BOOST_PP_SEQ_SIZE(NBL_CONCEPT_TPLT_PRM_NAMES),NBL_IMPL_CONCEPT_FULL_TPLT,DUMMY) +#include +#define NBL_CONCEPT_TPLT_PARAMS() BOOST_PP_SEQ_ENUM(NBL_CONCEPT_TPLT_PRM_NAMES) +#include +#include +#include +// +#define NBL_CONCEPT_REQ_TYPE 0 +#define NBL_CONCEPT_REQ_EXPR 1 +// +#define NBL_CONCEPT_REQ_EXPR_RET_TYPE 2 + + +//! Now diverge +#ifndef __cpp_concepts // to define a concept using `concept Name = SomeContexprBoolCondition;` @@ -37,14 +54,30 @@ // condition, use instead of the closing `>` of a function template #define NBL_FUNC_REQUIRES(...) > requires (__VA_ARGS__) -#include -namespace nbl -{ -namespace hlsl -{ -namespace concepts +// +#define NBL_CONCEPT_PARAM_T(ID,...) ID +// +#define NBL_IMPL_IMPL_CONCEPT_BEGIN(A,...) __VA_ARGS__ A +#define NBL_IMPL_CONCEPT_BEGIN(z,n,data) NBL_IMPL_IMPL_CONCEPT_BEGIN NBL_CONCEPT_PARAM_##n +// TODO: are empty local parameter lists valid? a.k.a. 
just a `()` +#define NBL_CONCEPT_BEGIN(LOCAL_PARAM_COUNT) template \ +concept NBL_CONCEPT_NAME = requires BOOST_PP_EXPR_IF(LOCAL_PARAM_COUNT,(BOOST_PP_ENUM(LOCAL_PARAM_COUNT,NBL_IMPL_CONCEPT_BEGIN,DUMMY))) \ { +// +#define NBL_IMPL_CONCEPT_REQ_TYPE(...) typename __VA_ARGS__; +#define NBL_IMPL_CONCEPT_REQ_EXPR(...) __VA_ARGS__; +#define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) {E}; C; +// +#define NBL_IMPL_CONCEPT (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE) +// +#define NBL_IMPL_CONCEPT_END_DEF(r,unused,i,e) NBL_EVAL(BOOST_PP_TUPLE_ELEM(BOOST_PP_SEQ_HEAD(e),NBL_IMPL_CONCEPT) BOOST_PP_SEQ_TAIL(e)) +// +#define NBL_CONCEPT_END(SEQ) BOOST_PP_SEQ_FOR_EACH_I(NBL_IMPL_CONCEPT_END_DEF, DUMMY, SEQ) \ +} + + +#include // Alias some of the std concepts in nbl. As this is C++20 only, we don't need to use // the macros here. @@ -90,22 +123,11 @@ concept vectorial = is_vector::value; template concept matricial = is_matrix::value; -} -} -} - #else -// TODO: old stuff, see how much we can remove -// No C++20 support. Do nothing. -#define NBL_CONCEPT_TYPE_PARAMS(...) -#define NBL_CONCEPT_SIGNATURE(NAME, ...) -#define NBL_CONCEPT_BODY(...) -#define NBL_REQUIRES(...) - // to define a concept using `concept Name = SomeContexprBoolCondition;` -#define NBL_BOOL_CONCEPT NBL_CONSTEXPR_STATIC_INLINE bool +#define NBL_BOOL_CONCEPT NBL_CONSTEXPR bool // for struct definitions, use instead of closing `>` on the primary template parameter list #define NBL_PRIMARY_REQUIRES(...) ,typename __requires=::nbl::hlsl::enable_if_t<(__VA_ARGS__),void> > @@ -121,7 +143,39 @@ concept matricial = is_matrix::value; // condition, use instead of the closing `>` of a function template #define NBL_FUNC_REQUIRES(...) ,std::enable_if_t<(__VA_ARGS__),bool> = true> -#endif +// +#define NBL_CONCEPT_BEGIN(LOCAL_PARAM_COUNT) namespace BOOST_PP_CAT(__concept__,NBL_CONCEPT_NAME) \ +{ +// +#define NBL_CONCEPT_PARAM_T(ID,...) ::nbl::hlsl::impl::declval<__VA_ARGS__ >() +// +#define NBL_IMPL_CONCEPT_REQ_TYPE(...) ::nbl::hlsl::make_void_t +#define NBL_IMPL_CONCEPT_REQ_EXPR(...) ::nbl::hlsl::make_void_t +#define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) 
C +// +#define NBL_IMPL_CONCEPT_SFINAE (typename=void,typename=void,bool=true) +#define NBL_IMPL_CONCEPT_SFINAE_SPEC (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE) +// +#define NBL_IMPL_CONCEPT_END_DEF(r,unused,i,e) template \ +struct BOOST_PP_CAT(__requirement,i) : ::nbl::hlsl::false_type {}; \ +template \ +struct BOOST_PP_CAT(__requirement,i) : ::nbl::hlsl::true_type {}; +// +#define NBL_IMPL_CONCEPT_END_GET(r,unused,i,e) BOOST_PP_EXPR_IF(i,&&) BOOST_PP_CAT(__concept__,NBL_CONCEPT_NAME)::BOOST_PP_CAT(__requirement,i)::value +// +#define NBL_CONCEPT_END(SEQ) BOOST_PP_SEQ_FOR_EACH_I(NBL_IMPL_CONCEPT_END_DEF, DUMMY, SEQ) \ +} \ +template \ +NBL_CONSTEXPR bool NBL_CONCEPT_NAME = BOOST_PP_SEQ_FOR_EACH_I(NBL_IMPL_CONCEPT_END_GET, DUMMY, SEQ) + +// TODO: counterparts of all the other concepts + +#endif +} +} +} #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/type_traits.hlsl b/include/nbl/builtin/hlsl/type_traits.hlsl index d7b1102afc..68cfc6476f 100644 --- a/include/nbl/builtin/hlsl/type_traits.hlsl +++ b/include/nbl/builtin/hlsl/type_traits.hlsl @@ -574,6 +574,9 @@ using enable_if = std::enable_if; template using alignment_of = std::alignment_of; +template +using make_void_t = typename make_void::type; + template using remove_const = std::remove_const; template using remove_volatile = std::remove_volatile; template using remove_cv = std::remove_cv; From 0b2e2f15e90ee8ff93a22eac6b1470f5b9176375 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 4 Nov 2024 15:07:31 +0100 Subject: [PATCH 119/358] fix up type traits --- include/nbl/builtin/hlsl/type_traits.hlsl | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/include/nbl/builtin/hlsl/type_traits.hlsl b/include/nbl/builtin/hlsl/type_traits.hlsl index 68cfc6476f..3a4e0eda70 100644 --- a/include/nbl/builtin/hlsl/type_traits.hlsl +++ b/include/nbl/builtin/hlsl/type_traits.hlsl @@ -156,10 +156,9 @@ namespace nbl { namespace hlsl { - +// namespace impl { - template class Trait, class T> struct base_type_forwarder : Trait {}; @@ -168,11 +167,14 @@ struct base_type_forwarder > : Trait {}; template class Trait, class T, uint16_t N, uint16_t M> struct base_type_forwarder > : Trait {}; - } -#ifdef __HLSL_VERSION // HLSL +// +template +struct make_void { using type = void; }; + +#ifdef __HLSL_VERSION // HLSL #define decltype(expr) __decltype(expr) @@ -391,9 +393,6 @@ struct enable_if : type_identity {}; template struct alignment_of; -template -struct make_void { using type = void; }; - // reference stuff needed for semantics // not for "human consumption" @@ -574,9 +573,6 @@ using enable_if = std::enable_if; template using alignment_of = std::alignment_of; -template -using make_void_t = typename make_void::type; - template using remove_const = std::remove_const; template using remove_volatile = std::remove_volatile; template using remove_cv = std::remove_cv; @@ -617,6 +613,9 @@ template NBL_CONSTEXPR uint32_t alignment_of_v = alignment_of::value; // Overlapping definitions +template +using make_void_t = typename make_void::type; + template struct conditional_value { From 4275c233ffdd6542a159ac8506037f8083e275d8 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 4 Nov 2024 15:09:05 +0100 Subject: [PATCH 120/358] add a general binding info --- include/nbl/asset/IPipelineLayout.h | 46 ++++++++++++++++++++-- include/nbl/builtin/hlsl/binding_info.hlsl | 9 +++++ include/nbl/ext/ImGui/ImGui.h | 1 + include/nbl/video/utilities/CComputeBlit.h | 32 +++++++++++---- 
src/nbl/ext/ImGui/ImGui.cpp | 1 + src/nbl/video/utilities/CComputeBlit.cpp | 12 ++---- 6 files changed, 82 insertions(+), 19 deletions(-) diff --git a/include/nbl/asset/IPipelineLayout.h b/include/nbl/asset/IPipelineLayout.h index 7628d0b481..fdbc97bbfa 100644 --- a/include/nbl/asset/IPipelineLayout.h +++ b/include/nbl/asset/IPipelineLayout.h @@ -4,12 +4,14 @@ #ifndef _NBL_ASSET_I_PIPELINE_LAYOUT_H_INCLUDED_ #define _NBL_ASSET_I_PIPELINE_LAYOUT_H_INCLUDED_ +#include "nbl/macros.h" +#include "nbl/core/declarations.h" #include #include -#include "nbl/macros.h" -#include "nbl/core/declarations.h" +#include "nbl/asset/IDescriptorSetLayout.h" +#include "nbl/builtin/hlsl/binding_info.hlsl" namespace nbl::asset @@ -21,7 +23,7 @@ namespace nbl::asset however they serve as a fast path with regard to data upload from the CPU and data access from the GPU. - Note that IrrlichtBaW limits push constant size to 128 bytes. + Note that Nabla limits push constant size to 128 bytes. Push Constants are an alternative to an UBO where it performs really poorly, mostly very small and very frequent updates. Examples of which are: @@ -140,6 +142,44 @@ class IPipelineLayout return static_cast(i)-1; } + // utility function, if you compile shaders for specific layouts, not create layouts given shaders + using desc_type_bitset_t = std::bitset(IDescriptor::E_TYPE::ET_COUNT)>; + // TODO: add constraints for stage and creation flags, or just return the storage index & redirect? + core::string getBindingInfoForHLSL(const hlsl::SBindingInfo& info, const desc_type_bitset_t allowedTypes=desc_type_bitset_t().set()) const + { + if (info.set>=DESCRIPTOR_SET_COUNT) + return "#error \"::nbl::hlsl::SBindingInfo::set out of range!\""; + const auto* layout = m_descSetLayouts[info.set]; + if (!layout) + return "#error \"::nbl::hlsl::SBindingInfo::set layout is nullptr!\""; + // + using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect; + using storage_range_index_t = redirect_t::storage_range_index_t; + const redirect_t* redirect; + storage_range_index_t found; + { + const redirect_t::binding_number_t binding(info.binding); + for (auto t=0u; t(IDescriptor::E_TYPE::ET_COUNT); t++) + if (allowedTypes.test(t)) + { + redirect = &layout->getDescriptorRedirect(static_cast(t)); + found = redirect->findBindingStorageIndex(binding); + if (found) + break; + } + if (!found && allowedTypes.test(static_cast(IDescriptor::E_TYPE::ET_SAMPLER))) + { + redirect = &layout->getImmutableSamplerRedirect(); + found = redirect->findBindingStorageIndex(binding); + } + if (!found) + return "#error \"Could not find `::nbl::hlsl::SBindingInfo::binding` in `::nbl::hlsl::SBindingInfo::set`'s layout!\""; + } + const auto count = redirect->getCount(found); + assert(count); // this layout should have never passed validation + return "::nbl::hlsl::ConstevalBindingInfo<"+std::to_string(info.set)+","+std::to_string(info.binding)+","+std::to_string(count)+">"; + } + protected: IPipelineLayout( const std::span _pcRanges, diff --git a/include/nbl/builtin/hlsl/binding_info.hlsl b/include/nbl/builtin/hlsl/binding_info.hlsl index 8702a32c3d..e037665162 100644 --- a/include/nbl/builtin/hlsl/binding_info.hlsl +++ b/include/nbl/builtin/hlsl/binding_info.hlsl @@ -19,6 +19,15 @@ struct ConstevalBindingInfo NBL_CONSTEXPR_STATIC_INLINE uint32_t Count = count; }; +// used for descriptor set layout lookups +struct SBindingInfo +{ + //! binding index for a given resource + uint32_t binding : 29; + //! 
descriptor set index for a resource + uint32_t set : 3; +}; + } } #endif diff --git a/include/nbl/ext/ImGui/ImGui.h b/include/nbl/ext/ImGui/ImGui.h index 58787b9d55..244195c01a 100644 --- a/include/nbl/ext/ImGui/ImGui.h +++ b/include/nbl/ext/ImGui/ImGui.h @@ -24,6 +24,7 @@ class UI final : public core::IReferenceCounted struct SResourceParameters { //! for a given pipeline layout we need to know what is intended for UI resources + // TODO: introduce a common type between ImGUI and Blit for the descriptor infos "binding_info.hlsl" struct SBindingInfo { //! descriptor set index for a resource diff --git a/include/nbl/video/utilities/CComputeBlit.h b/include/nbl/video/utilities/CComputeBlit.h index eae3f4bf0a..69b8d8ba29 100644 --- a/include/nbl/video/utilities/CComputeBlit.h +++ b/include/nbl/video/utilities/CComputeBlit.h @@ -47,16 +47,34 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted core::smart_refctd_ptr&& logger=nullptr ); - // if you set the balues too small, we'll correct them ourselves anyway - struct STask + // create your pipelines + struct SPipelines { + core::smart_refctd_ptr blit; + core::smart_refctd_ptr coverage; + }; + struct SPipelinesCreateInfo + { + // required + CAssetConverter* converter; + // in theory we _could_ accept either pipeline layout type (or just the base) and make the CPU one back from the GPU + const asset::ICPUPipelineLayout* layout; + // must be Uniform Texel Buffer descriptor type + hlsl::SBindingInfo kernelWeights; + // must be Sampled Image descriptor type + hlsl::SBindingInfo inputs; + // must be Sampler descriptor type + hlsl::SBindingInfo samplers; + // must be Storage Image descriptor type + hlsl::SBindingInfo outputs; + //! If you set the balues too small, we'll correct them ourselves anyway + // needs to be at least as big as the maximum subgroup size uint32_t workgroupSizeLog2 : 4 = 0; - // the TRUE output format, not the storage view format you might manually encode into - hlsl::format::TexelBlockFormat outputFormat : 8 = hlsl::format::TexelBlockFormat::TBF_UNKNOWN; + // uint32_t sharedMemoryPerInvocation : 6 = 0; - uint32_t unused : 14 = 0; }; - + SPipelines createAndCachePipelines(const SPipelinesCreateInfo& info); + //! Returns the original format if supports STORAGE_IMAGE otherwise returns a format in its compat class which supports STORAGE_IMAGE. 
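// Illustrative sketch (not part of this patch): how a caller might drive the new
// `SPipelinesCreateInfo`/`createAndCachePipelines` API above. The `blit`, `converter` and
// `cpuLayout` objects, as well as the set/binding numbers, are assumptions invented for the example.
video::CComputeBlit::SPipelinesCreateInfo info = {};
info.converter = converter.get();            // CAssetConverter that will create the GPU objects
info.layout = cpuLayout.get();               // asset::ICPUPipelineLayout the shaders get compiled against
info.kernelWeights = {.binding=0,.set=0};    // Uniform Texel Buffer
info.inputs = {.binding=1,.set=0};           // Sampled Image
info.samplers = {.binding=2,.set=0};         // Sampler
info.outputs = {.binding=3,.set=0};          // Storage Image
// workgroupSizeLog2/sharedMemoryPerInvocation left at their 0 defaults, the utility bumps too-small values itself
const auto pipelines = blit->createAndCachePipelines(info);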
inline asset::E_FORMAT getOutputViewFormat(const asset::E_FORMAT format) { @@ -585,8 +603,6 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted EBT_COUNT }; - void createAndCachePipelines(CAssetConverter* converter, core::smart_refctd_ptr* pipelines, const std::span tasks); - core::smart_refctd_ptr m_device; system::logger_opt_smart_ptr m_logger; core::smart_refctd_ptr m_shaderCache; diff --git a/src/nbl/ext/ImGui/ImGui.cpp b/src/nbl/ext/ImGui/ImGui.cpp index 91b9e41157..9e9f9f2e55 100644 --- a/src/nbl/ext/ImGui/ImGui.cpp +++ b/src/nbl/ext/ImGui/ImGui.cpp @@ -221,6 +221,7 @@ core::smart_refctd_ptr UI::createPipeline(SCreation std::stringstream stream; + // TODO: Use the `ConstevalBindingInfo` stream << "// -> this code has been autogenerated with Nabla ImGUI extension\n" << "#define NBL_TEXTURES_BINDING_IX " << creationParams.resources.texturesInfo.bindingIx << "\n" << "#define NBL_SAMPLER_STATES_BINDING_IX " << creationParams.resources.samplersInfo.bindingIx << "\n" diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp index 1ceb1ee415..1dd123952f 100644 --- a/src/nbl/video/utilities/CComputeBlit.cpp +++ b/src/nbl/video/utilities/CComputeBlit.cpp @@ -20,8 +20,9 @@ CComputeBlit::CComputeBlit(smart_refctd_ptr&& logicalDevice, sma m_shaderCache = make_smart_refctd_ptr(); } -void CComputeBlit::createAndCachePipelines(CAssetConverter* converter, smart_refctd_ptr* pipelines, const std::span tasks) +auto CComputeBlit::createAndCachePipelines(const SPipelinesCreateInfo& info) -> SPipelines { + SPipelines retval; core::vector> cpuPplns; cpuPplns.reserve(tasks.size()); @@ -50,12 +51,6 @@ void CComputeBlit::createAndCachePipelines(CAssetConverter* converter, smart_ref } const auto common = [&]()->std::string { - // TODO: introduce a common type between ImGUI and Blit for the descriptor infos - auto serializeBindingInfo = [](const hlsl::SBindingInfo& info={})->std::string - { - return "ConstevalBindingInfo<"+std::to_string(info.Set)+","+std::to_string(info.Set)+","+std::to_string(info.Count)+">"; - }; - std::ostringstream tmp; tmp << R"===( #include "nbl/builtin/hlsl/binding_info.hlsl" @@ -67,7 +62,7 @@ using namespace nbl::hlsl; struct ConstevalParameters { NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkGroupSize = )===" << (0x1u<getBindingInfoForHLSL() << R"===(; using input_sampler_binding_t = )===" << serializeBindingInfo() << R"===(; using input_image_binding_t = )===" << serializeBindingInfo() << R"===(; using output_binding_t = )===" << serializeBindingInfo() << R"===(; @@ -122,6 +117,7 @@ struct ConstevalParameters auto convertResults = reserveResults.convert(params); assert(!convertResults.blocking()); } + return retval; } #if 0 From 2eba89b8b7439fb5e5889fa30cf3d03cd777663e Mon Sep 17 00:00:00 2001 From: Fletterio Date: Mon, 4 Nov 2024 14:02:31 -0300 Subject: [PATCH 121/358] Final utils for FFT --- examples_tests | 2 +- include/nbl/builtin/hlsl/workgroup/fft.hlsl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples_tests b/examples_tests index 5cf4ef6999..e14fde322a 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 5cf4ef6999e468b06383716e01f4329f33d7721f +Subproject commit e14fde322aa127f862a2e7eb1dd9192d41dc2112 diff --git a/include/nbl/builtin/hlsl/workgroup/fft.hlsl b/include/nbl/builtin/hlsl/workgroup/fft.hlsl index 63d367048d..8ec1df2e21 100644 --- a/include/nbl/builtin/hlsl/workgroup/fft.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/fft.hlsl @@ -40,7 +40,7 @@ struct exchangeValues 
else { hi.real(exchanged.x); - lo.imag(exchanged.y); + hi.imag(exchanged.y); } } }; @@ -125,7 +125,7 @@ template void unpack(NBL_REF_ARG(complex_t) lo, NBL_REF_ARG(complex_t) hi) { complex_t x = (lo + conj(hi)) * Scalar(0.5); - hi = rotateRight(lo - conj(hi)) * 0.5; + hi = rotateRight(lo - conj(hi)) * Scalar(0.5); lo = x; } From 3fee9c1ee3bc730ea2160f6b021a23bbcfed52ee Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Mon, 4 Nov 2024 15:30:49 -0800 Subject: [PATCH 122/358] Updated examples_tests --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 88c3d34a86..424a71a55a 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 88c3d34a866c96a7037e9b916a71b37b7abe0b33 +Subproject commit 424a71a55a457231aa782a6fe6083a2fb9f082f4 From 9746e6adbbd3a83f01a22b1721c44e463377df1f Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Mon, 4 Nov 2024 15:38:50 -0800 Subject: [PATCH 123/358] Refactor --- examples_tests | 2 +- .../nbl/builtin/hlsl/camera/view_matrix.hlsl | 51 +++++++++ .../transformation_matrix_utils.hlsl | 107 ------------------ .../builtin/hlsl/projection/projection.hlsl | 73 ++++++------ 4 files changed, 87 insertions(+), 146 deletions(-) create mode 100644 include/nbl/builtin/hlsl/camera/view_matrix.hlsl diff --git a/examples_tests b/examples_tests index 424a71a55a..1946d5b2da 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 424a71a55a457231aa782a6fe6083a2fb9f082f4 +Subproject commit 1946d5b2dab693c54dd16eaaf93cc77b8de2ddf2 diff --git a/include/nbl/builtin/hlsl/camera/view_matrix.hlsl b/include/nbl/builtin/hlsl/camera/view_matrix.hlsl new file mode 100644 index 0000000000..27b2c63239 --- /dev/null +++ b/include/nbl/builtin/hlsl/camera/view_matrix.hlsl @@ -0,0 +1,51 @@ +#ifndef _NBL_BUILTIN_HLSL_PROJECTION_INCLUDED_ +#define _NBL_BUILTIN_HLSL_PROJECTION_INCLUDED_ + +#include + +namespace nbl +{ +namespace hlsl +{ + +// /Arek: glm:: for normalize till dot product is fixed (ambiguity with glm namespace + linker issues) +template +inline matrix buildCameraLookAtMatrixLH( + const vector& position, + const vector& target, + const vector& upVector) +{ + const vector zaxis = hlsl::normalize(target - position); + const vector xaxis = hlsl::normalize(hlsl::cross(upVector, zaxis)); + const vector yaxis = hlsl::cross(zaxis, xaxis); + + matrix r; + r[0] = vector(xaxis, -hlsl::dot(xaxis, position)); + r[1] = vector(yaxis, -hlsl::dot(yaxis, position)); + r[2] = vector(zaxis, -hlsl::dot(zaxis, position)); + + return r; +} + +template +inline matrix buildCameraLookAtMatrixRH( + const vector& position, + const vector& target, + const vector& upVector) +{ + const vector zaxis = hlsl::normalize(position - target); + const vector xaxis = hlsl::normalize(hlsl::cross(upVector, zaxis)); + const vector yaxis = hlsl::cross(zaxis, xaxis); + + matrix r; + r[0] = vector(xaxis, -hlsl::dot(xaxis, position)); + r[1] = vector(yaxis, -hlsl::dot(yaxis, position)); + r[2] = vector(zaxis, -hlsl::dot(zaxis, position)); + + return r; +} + +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl index dbbfe5c2df..add1471bb5 100644 --- a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl +++ b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl @@ -122,45 +122,6 @@ inline matrix concatenateBFollowedByA(const matrix& a, const m return 
matrix(mul(a4x4, b4x4)); } -// /Arek: glm:: for normalize till dot product is fixed (ambiguity with glm namespace + linker issues) -template -inline matrix buildCameraLookAtMatrixLH( - const vector& position, - const vector& target, - const vector& upVector) -{ - const vector zaxis = hlsl::normalize(target - position); - const vector xaxis = hlsl::normalize(hlsl::cross(upVector, zaxis)); - const vector yaxis = hlsl::cross(zaxis, xaxis); - - matrix r; - r[0] = vector(xaxis, -hlsl::dot(xaxis, position)); - r[1] = vector(yaxis, -hlsl::dot(yaxis, position)); - r[2] = vector(zaxis, -hlsl::dot(zaxis, position)); - - return r; -} - -template -inline matrix buildCameraLookAtMatrixRH( - const vector& position, - const vector& target, - const vector& upVector) -{ - const vector zaxis = hlsl::normalize(position - target); - const vector xaxis = hlsl::normalize(hlsl::cross(upVector, zaxis)); - const vector yaxis = hlsl::cross(zaxis, xaxis); - - matrix r; - r[0] = vector(xaxis, -hlsl::dot(xaxis, position)); - r[1] = vector(yaxis, -hlsl::dot(yaxis, position)); - r[2] = vector(zaxis, -hlsl::dot(zaxis, position)); - - return r; -} - -// TODO: test, check if there is better implementation -// TODO: move quaternion to nbl::hlsl // TODO: why NBL_REF_ARG(MatType) doesn't work????? template @@ -210,74 +171,6 @@ inline void setTranslation(matrix& outMat, NBL_CONST_REF_ARG(vector -inline matrix buildProjectionMatrixPerspectiveFovRH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar) -{ - const float h = core::reciprocal(tanf(fieldOfViewRadians * 0.5f)); - _NBL_DEBUG_BREAK_IF(aspectRatio == 0.f); //division by zero - const float w = h / aspectRatio; - - _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero - - matrix m; - m[0] = vector(w, 0.f, 0.f, 0.f); - m[1] = vector(0.f, -h, 0.f, 0.f); - m[2] = vector(0.f, 0.f, -zFar / (zFar - zNear), -zNear * zFar / (zFar - zNear)); - m[3] = vector(0.f, 0.f, -1.f, 0.f); - - return m; -} -template -inline matrix buildProjectionMatrixPerspectiveFovLH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar) -{ - const float h = core::reciprocal(tanf(fieldOfViewRadians * 0.5f)); - _NBL_DEBUG_BREAK_IF(aspectRatio == 0.f); //division by zero - const float w = h / aspectRatio; - - _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero - - matrix m; - m[0] = vector(w, 0.f, 0.f, 0.f); - m[1] = vector(0.f, -h, 0.f, 0.f); - m[2] = vector(0.f, 0.f, zFar / (zFar - zNear), -zNear * zFar / (zFar - zNear)); - m[3] = vector(0.f, 0.f, 1.f, 0.f); - - return m; -} - -template -inline matrix buildProjectionMatrixOrthoRH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar) -{ - _NBL_DEBUG_BREAK_IF(widthOfViewVolume == 0.f); //division by zero - _NBL_DEBUG_BREAK_IF(heightOfViewVolume == 0.f); //division by zero - _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero - - matrix m; - m[0] = vector(2.f / widthOfViewVolume, 0.f, 0.f, 0.f); - m[1] = vector(0.f, -2.f / heightOfViewVolume, 0.f, 0.f); - m[2] = vector(0.f, 0.f, -1.f / (zFar - zNear), -zNear / (zFar - zNear)); - m[3] = vector(0.f, 0.f, 0.f, 1.f); - - return m; -} - -template -inline matrix buildProjectionMatrixOrthoLH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar) -{ - _NBL_DEBUG_BREAK_IF(widthOfViewVolume == 0.f); //division by zero - _NBL_DEBUG_BREAK_IF(heightOfViewVolume == 0.f); //division by zero - _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero - - matrix m; - m[0] = vector(2.f / widthOfViewVolume, 0.f, 0.f, 0.f); - m[1] = vector(0.f, -2.f / 
heightOfViewVolume, 0.f, 0.f); - m[2] = vector(0.f, 0.f, 1.f / (zFar - zNear), -zNear / (zFar - zNear)); - m[3] = vector(0.f, 0.f, 0.f, 1.f); - - return m; -} - } } diff --git a/include/nbl/builtin/hlsl/projection/projection.hlsl b/include/nbl/builtin/hlsl/projection/projection.hlsl index ecf180b5c6..22d2872fde 100644 --- a/include/nbl/builtin/hlsl/projection/projection.hlsl +++ b/include/nbl/builtin/hlsl/projection/projection.hlsl @@ -8,72 +8,69 @@ namespace nbl namespace hlsl { // TODO: use glm instead for c++ -inline float32_t4x4 buildProjectionMatrixPerspectiveFovLH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar) +template +inline matrix buildProjectionMatrixPerspectiveFovRH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar) { -#ifndef __HLSL_VERSION + const float h = core::reciprocal(tanf(fieldOfViewRadians * 0.5f)); _NBL_DEBUG_BREAK_IF(aspectRatio == 0.f); //division by zero + const float w = h / aspectRatio; + _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero -#endif - const float h = core::reciprocal(tanf(fieldOfViewRadians * 0.5f)); - const float w = h / aspectRatio; - - float32_t4x4 m; - m[0] = float32_t4(w, 0.f, 0.f, 0.f); - m[1] = float32_t4(0.f, -h, 0.f, 0.f); - m[2] = float32_t4(0.f, 0.f, zFar / (zFar - zNear), -zNear * zFar / (zFar - zNear)); - m[3] = float32_t4(0.f, 0.f, 1.f, 0.f); - + matrix m; + m[0] = vector(w, 0.f, 0.f, 0.f); + m[1] = vector(0.f, -h, 0.f, 0.f); + m[2] = vector(0.f, 0.f, -zFar / (zFar - zNear), -zNear * zFar / (zFar - zNear)); + m[3] = vector(0.f, 0.f, -1.f, 0.f); + return m; } - -inline float32_t4x4 buildProjectionMatrixPerspectiveFovRH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar) +template +inline matrix buildProjectionMatrixPerspectiveFovLH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar) { const float h = core::reciprocal(tanf(fieldOfViewRadians * 0.5f)); -#ifndef __HLSL_VERSION _NBL_DEBUG_BREAK_IF(aspectRatio == 0.f); //division by zero - _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero -#endif const float w = h / aspectRatio; - float32_t4x4 m; - m[0] = hlsl::float32_t4(w, 0.f, 0.f, 0.f); - m[1] = hlsl::float32_t4(0.f, -h, 0.f, 0.f); - m[2] = hlsl::float32_t4(0.f, 0.f, -zFar / (zFar - zNear), -zNear * zFar / (zFar - zNear)); - m[3] = hlsl::float32_t4(0.f, 0.f, -1.f, 0.f); + _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero + + matrix m; + m[0] = vector(w, 0.f, 0.f, 0.f); + m[1] = vector(0.f, -h, 0.f, 0.f); + m[2] = vector(0.f, 0.f, zFar / (zFar - zNear), -zNear * zFar / (zFar - zNear)); + m[3] = vector(0.f, 0.f, 1.f, 0.f); return m; } -inline float32_t4x4 buildProjectionMatrixOrthoRH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar) +template +inline matrix buildProjectionMatrixOrthoRH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar) { -#ifndef __HLSL_VERSION _NBL_DEBUG_BREAK_IF(widthOfViewVolume == 0.f); //division by zero _NBL_DEBUG_BREAK_IF(heightOfViewVolume == 0.f); //division by zero _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero -#endif - float32_t4x4 m; - m[0] = hlsl::float32_t4(2.f / widthOfViewVolume, 0.f, 0.f, 0.f); - m[1] = hlsl::float32_t4(0.f, -2.f / heightOfViewVolume, 0.f, 0.f); - m[2] = hlsl::float32_t4(0.f, 0.f, -1.f / (zFar - zNear), -zNear / (zFar - zNear)); - m[3] = hlsl::float32_t4(0.f, 0.f, 0.f, 1.f); + matrix m; + m[0] = vector(2.f / widthOfViewVolume, 0.f, 0.f, 0.f); + m[1] = vector(0.f, -2.f / heightOfViewVolume, 0.f, 0.f); + m[2] = vector(0.f, 0.f, -1.f / (zFar 
- zNear), -zNear / (zFar - zNear)); + m[3] = vector(0.f, 0.f, 0.f, 1.f); return m; } -inline float32_t4x4 buildProjectionMatrixOrthoLH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar) + +template +inline matrix buildProjectionMatrixOrthoLH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar) { -#ifndef __HLSL_VERSION _NBL_DEBUG_BREAK_IF(widthOfViewVolume == 0.f); //division by zero _NBL_DEBUG_BREAK_IF(heightOfViewVolume == 0.f); //division by zero _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero -#endif - float32_t4x4 m; - m[0] = hlsl::float32_t4(2.f / widthOfViewVolume, 0.f, 0.f, 0.f); - m[1] = hlsl::float32_t4(0.f, -2.f / heightOfViewVolume, 0.f, 0.f); - m[2] = hlsl::float32_t4(0.f, 0.f, 1.f / (zFar - zNear), -zNear / (zFar - zNear)); - m[3] = hlsl::float32_t4(0.f, 0.f, 0.f, 1.f); + matrix m; + m[0] = vector(2.f / widthOfViewVolume, 0.f, 0.f, 0.f); + m[1] = vector(0.f, -2.f / heightOfViewVolume, 0.f, 0.f); + m[2] = vector(0.f, 0.f, 1.f / (zFar - zNear), -zNear / (zFar - zNear)); + m[3] = vector(0.f, 0.f, 0.f, 1.f); return m; } From d6c9716d62b8d7881220e904b947e375b226d08b Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Mon, 4 Nov 2024 17:56:11 -0800 Subject: [PATCH 124/358] Updated examples --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 1946d5b2da..d3d7bc4e3e 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 1946d5b2dab693c54dd16eaaf93cc77b8de2ddf2 +Subproject commit d3d7bc4e3ecf37b117a34f130a827bafeba139fa From 0995e6e0bf197d262371878b804e6918513ff524 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 5 Nov 2024 11:53:05 +0700 Subject: [PATCH 125/358] commit latest example --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 4735b731a7..2bfca3a1f2 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 4735b731a71490c7bcf074a3e4041c435975e942 +Subproject commit 2bfca3a1f2720373fd9ad5c0dfcb1fd673be3b20 From bb757d8c29015b2c3ef02c399c9da07067254172 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 5 Nov 2024 07:33:47 +0100 Subject: [PATCH 126/358] make concepts work P.S. also make the HLSL `decltype` macro forward perfectly --- include/nbl/builtin/hlsl/blit/parameters.hlsl | 32 ++++++++++++++++ include/nbl/builtin/hlsl/concepts.hlsl | 13 +++---- .../nbl/builtin/hlsl/member_test_macros.hlsl | 13 +++---- include/nbl/builtin/hlsl/type_traits.hlsl | 2 +- include/nbl/builtin/hlsl/utility.hlsl | 38 +++++++++++++++++++ src/nbl/builtin/CMakeLists.txt | 1 + 6 files changed, 83 insertions(+), 16 deletions(-) create mode 100644 include/nbl/builtin/hlsl/utility.hlsl diff --git a/include/nbl/builtin/hlsl/blit/parameters.hlsl b/include/nbl/builtin/hlsl/blit/parameters.hlsl index d280cc523f..3992fcd689 100644 --- a/include/nbl/builtin/hlsl/blit/parameters.hlsl +++ b/include/nbl/builtin/hlsl/blit/parameters.hlsl @@ -44,6 +44,38 @@ struct parameters_t } }; +struct parameters2_t +{ + float32_t3 fScale; + float32_t3 negativeSupportMinusHalf; + float32_t referenceAlpha; + uint32_t kernelWeightsOffsetY; + uint32_t kernelWeightsOffsetZ; + uint32_t inPixelCount; + uint32_t outPixelCount; + + uint16_t3 inputDims; + uint16_t3 outputDims; + uint16_t3 windowDims; + uint16_t3 phaseCount; + uint16_t3 preloadRegion; + uint16_t3 iterationRegionXPrefixProducts; + uint16_t3 iterationRegionYPrefixProducts; + uint16_t3 iterationRegionZPrefixProducts; + + //! 
Offset into the shared memory array which tells us from where the second buffer of shared memory begins + //! Given by max(memory_for_preload_region, memory_for_result_of_y_pass) + uint16_t secondScratchOffset; + uint16_t outputTexelsPerWGZ; + + uint32_t3 getOutputTexelsPerWG() + { + //! `outputTexelsPerWG.xy` just happens to be in the first components of `iterationRegionsXPrefixProducts` and `iterationRegionYPrefixProducts` --this is + //! the result of how we choose to iterate, i.e. if, in the future, we decide to iterate differently, this needs to change. + return uint32_t3(iterationRegionXPrefixProducts.x, iterationRegionYPrefixProducts.x, outputTexelsPerWGZ); + } +}; + } } diff --git a/include/nbl/builtin/hlsl/concepts.hlsl b/include/nbl/builtin/hlsl/concepts.hlsl index bf16d3d1c5..0aa1af7b55 100644 --- a/include/nbl/builtin/hlsl/concepts.hlsl +++ b/include/nbl/builtin/hlsl/concepts.hlsl @@ -7,7 +7,7 @@ #include #include -#include +#include namespace nbl @@ -148,20 +148,19 @@ concept matricial = is_matrix::value; #define NBL_CONCEPT_BEGIN(LOCAL_PARAM_COUNT) namespace BOOST_PP_CAT(__concept__,NBL_CONCEPT_NAME) \ { // -#define NBL_CONCEPT_PARAM_T(ID,...) ::nbl::hlsl::impl::declval<__VA_ARGS__ >() +#define NBL_CONCEPT_PARAM_T(ID,...) ::nbl::hlsl::experimental::declval<__VA_ARGS__ >() // #define NBL_IMPL_CONCEPT_REQ_TYPE(...) ::nbl::hlsl::make_void_t #define NBL_IMPL_CONCEPT_REQ_EXPR(...) ::nbl::hlsl::make_void_t -#define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) C +#define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) ::nbl::hlsl::enable_if_t > // -#define NBL_IMPL_CONCEPT_SFINAE (typename=void,typename=void,bool=true) -#define NBL_IMPL_CONCEPT_SFINAE_SPEC (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE) +#define NBL_IMPL_CONCEPT_SFINAE (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE) // -#define NBL_IMPL_CONCEPT_END_DEF(r,unused,i,e) template \ +#define NBL_IMPL_CONCEPT_END_DEF(r,unused,i,e) template \ struct BOOST_PP_CAT(__requirement,i) : ::nbl::hlsl::false_type {}; \ template \ struct BOOST_PP_CAT(__requirement,i) : ::nbl::hlsl::true_type {}; // #define NBL_IMPL_CONCEPT_END_GET(r,unused,i,e) BOOST_PP_EXPR_IF(i,&&) BOOST_PP_CAT(__concept__,NBL_CONCEPT_NAME)::BOOST_PP_CAT(__requirement,i)::value diff --git a/include/nbl/builtin/hlsl/member_test_macros.hlsl b/include/nbl/builtin/hlsl/member_test_macros.hlsl index f103d6d83d..7579fb0fa2 100644 --- a/include/nbl/builtin/hlsl/member_test_macros.hlsl +++ b/include/nbl/builtin/hlsl/member_test_macros.hlsl @@ -4,7 +4,7 @@ #ifndef _NBL_BUILTIN_HLSL_MEMBER_TEST_MACROS_INCLUDED_ #define _NBL_BUILTIN_HLSL_MEMBER_TEST_MACROS_INCLUDED_ -#include +#include #include #ifdef __HLSL_VERSION @@ -24,9 +24,6 @@ enum e_member_presence is_const = 1<<2, }; -template -T declval(){} - template struct if_2_else_1 : integral_constant {}; template<> @@ -53,7 +50,7 @@ struct is_static_member_##a:: template \ struct is_member_##a: false_type { using type = void; }; \ template \ -struct is_member_##a().a),void>::value,void>::type> : true_type { using type = decltype(declval().a); }; \ +struct is_member_##a().a),void>::value,void>::type> : true_type { using type = decltype(experimental::declval().a); }; \ } \ template \ struct has_member_##a { NBL_CONSTEXPR_STATIC_INLINE e_member_presence value = (e_member_presence)(impl::is_member_##a::value + 2*impl::is_static_member_##a::value + 4*is_const::type>::value); }; \ @@ -72,7 +69,7 @@ NBL_GENERATE_MEMBER_TESTER(w) #define NBL_TYPE_DECLARE(z, 
n, x) BOOST_PP_COMMA_IF(x) typename Arg##n #define NBL_TYPE_DECLARE_DEFAULT(z, n, x) BOOST_PP_COMMA_IF(x) typename Arg##n=void #define NBL_TYPE_FWD(z, n, x) BOOST_PP_COMMA_IF(x) Arg##n -#define NBL_DECLVAL_DECLARE(z, n, x) impl::declval() BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(BOOST_PP_INC(n), x)) +#define NBL_DECLVAL_DECLARE(z, n, x) experimental::declval() BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(BOOST_PP_INC(n), x)) #define GENERATE_STATIC_METHOD_TESTER_SPEC(z, n, x) \ template \ @@ -89,9 +86,9 @@ BOOST_PP_REPEAT(n, GENERATE_STATIC_METHOD_TESTER_SPEC, x) #define GENERATE_METHOD_TESTER_SPEC(z, n, x) \ template \ -struct has_method_##x().x(BOOST_PP_REPEAT(n, NBL_DECLVAL_DECLARE, n)))>::type> : impl::if_2_else_1::value> \ +struct has_method_##x().x(BOOST_PP_REPEAT(n, NBL_DECLVAL_DECLARE, n)))>::type> : impl::if_2_else_1::value> \ { \ - using return_type = decltype(impl::declval().x(BOOST_PP_REPEAT(n, NBL_DECLVAL_DECLARE, n))); \ + using return_type = decltype(experimental::declval().x(BOOST_PP_REPEAT(n, NBL_DECLVAL_DECLARE, n))); \ NBL_CONSTEXPR_STATIC_INLINE uint arg_count = n; \ }; diff --git a/include/nbl/builtin/hlsl/type_traits.hlsl b/include/nbl/builtin/hlsl/type_traits.hlsl index 3a4e0eda70..1481d087f3 100644 --- a/include/nbl/builtin/hlsl/type_traits.hlsl +++ b/include/nbl/builtin/hlsl/type_traits.hlsl @@ -176,7 +176,7 @@ struct make_void { using type = void; }; #ifdef __HLSL_VERSION // HLSL -#define decltype(expr) __decltype(expr) +#define decltype(...) __decltype(__VA_ARGS__) template struct type_identity diff --git a/include/nbl/builtin/hlsl/utility.hlsl b/include/nbl/builtin/hlsl/utility.hlsl new file mode 100644 index 0000000000..487d4a7d75 --- /dev/null +++ b/include/nbl/builtin/hlsl/utility.hlsl @@ -0,0 +1,38 @@ +// Copyright (C) 2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_UTILITY_INCLUDED_ +#define _NBL_BUILTIN_HLSL_UTILITY_INCLUDED_ + + +#include + + +// for now we only implement declval +namespace nbl +{ +namespace hlsl +{ +#ifndef __HLSL_VERSION + +template +std::add_rvalue_reference_t declval() noexcept +{ + static_assert(false,"Actually calling declval is ill-formed."); +} + +#else + +namespace experimental +{ + +template +T declval() {} + +} + +#endif +} +} + +#endif diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index b3ec566beb..53ab534971 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -256,6 +256,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/functional.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/limits.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/type_traits.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/utility.hlsl") #metaprogramming LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/mpl.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/member_test_macros.hlsl") From 45db07075b05290fa3db2c379ed10063fe7bfdf2 Mon Sep 17 00:00:00 2001 From: Erfan Ahmadi Date: Tue, 5 Nov 2024 12:00:34 +0400 Subject: [PATCH 127/358] update examples --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index e14fde322a..e171e855f0 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit e14fde322aa127f862a2e7eb1dd9192d41dc2112 +Subproject commit e171e855f07fa716cc2900e971c0e789be126e27 From 067e8a385750177bf91f92aa0a1c832ba0d1c6b6 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 5 Nov 2024 16:46:08 +0100 Subject: [PATCH 128/358] make pipeline layouts const and improve the `getBindingInfoForHLSL` --- include/nbl/asset/ICPUComputePipeline.h | 6 ++--- include/nbl/asset/ICPUGraphicsPipeline.h | 4 +-- include/nbl/asset/ICPUPipeline.h | 6 ++--- include/nbl/asset/IPipeline.h | 6 ++--- include/nbl/asset/IPipelineLayout.h | 32 ++++++++++++++++-------- 5 files changed, 32 insertions(+), 22 deletions(-) diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index f3af332c64..14b0277152 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -25,7 +25,7 @@ class ICPUComputePipeline : public ICPUPipeline,1> { if (!params.layout) return nullptr; - auto retval = new ICPUComputePipeline(core::smart_refctd_ptr(params.layout)); + auto retval = new ICPUComputePipeline(core::smart_refctd_ptr(params.layout)); if (!retval->setSpecInfo(params.shader)) { retval->drop(); @@ -48,7 +48,7 @@ class ICPUComputePipeline : public ICPUPipeline,1> using base_t::base_t; virtual ~ICPUComputePipeline() = default; - base_t* clone_impl(core::smart_refctd_ptr&& layout) const override + base_t* clone_impl(core::smart_refctd_ptr&& layout) const override { return new ICPUComputePipeline(std::move(layout)); } @@ -57,7 +57,7 @@ class ICPUComputePipeline : public ICPUPipeline,1> { if (ix!=0) return m_stages[0].shader.get(); - return m_layout.get(); + return const_cast(m_layout.get()); } inline int8_t stageToIndex(const ICPUShader::E_SHADER_STAGE stage) const override diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 8b922c5a4e..e319b27503 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ 
b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -65,7 +65,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline&& layout) const override + base_t* clone_impl(core::smart_refctd_ptr&& layout) const override { std::array _shaders; for (auto i=0; i(m_layout.get()); if (ix==1) return m_renderpass.get(); size_t stageCount = 0; diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index 7a0f0c5bf0..5c43df0170 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -51,11 +51,11 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase ICPUPipelineLayout* getLayout() { assert(isMutable()); - return PipelineNonAssetBase::m_layout.get(); + return const_cast(PipelineNonAssetBase::m_layout.get()); } const ICPUPipelineLayout* getLayout() const { return PipelineNonAssetBase::m_layout.get(); } - inline void setLayout(core::smart_refctd_ptr&& _layout) + inline void setLayout(core::smart_refctd_ptr&& _layout) { assert(isMutable()); PipelineNonAssetBase::m_layout = std::move(_layout); @@ -117,7 +117,7 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase using PipelineNonAssetBase::PipelineNonAssetBase; virtual ~ICPUPipeline() = default; - virtual this_t* clone_impl(core::smart_refctd_ptr&& layout) const = 0; + virtual this_t* clone_impl(core::smart_refctd_ptr&& layout) const = 0; virtual int8_t stageToIndex(const ICPUShader::E_SHADER_STAGE stage) const = 0; struct ShaderStage { diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h index 6af7b50bfc..40623876fe 100644 --- a/include/nbl/asset/IPipeline.h +++ b/include/nbl/asset/IPipeline.h @@ -35,7 +35,7 @@ class IPipeline struct SCreationParams { public: - PipelineLayout* layout = nullptr; + const PipelineLayout* layout = nullptr; protected: // This is not public to make sure that different pipelines only get the enums they support @@ -107,9 +107,9 @@ class IPipeline inline const PipelineLayout* getLayout() const {return m_layout.get();} protected: - inline IPipeline(core::smart_refctd_ptr&& _layout) : m_layout(std::move(_layout)) {} + inline IPipeline(core::smart_refctd_ptr&& _layout) : m_layout(std::move(_layout)) {} - core::smart_refctd_ptr m_layout; + core::smart_refctd_ptr m_layout; }; } diff --git a/include/nbl/asset/IPipelineLayout.h b/include/nbl/asset/IPipelineLayout.h index fdbc97bbfa..7cc9802900 100644 --- a/include/nbl/asset/IPipelineLayout.h +++ b/include/nbl/asset/IPipelineLayout.h @@ -143,41 +143,51 @@ class IPipelineLayout } // utility function, if you compile shaders for specific layouts, not create layouts given shaders - using desc_type_bitset_t = std::bitset(IDescriptor::E_TYPE::ET_COUNT)>; + struct SBindingKey + { + using type_bitset_t = std::bitset(IDescriptor::E_TYPE::ET_COUNT)>; + + hlsl::SBindingInfo binding = {}; + core::bitflag requiredStages = IShader::E_SHADER_STAGE::ESS_UNKNOWN; + // could have just initialized with `~type_bitset_t()` in C++23 + type_bitset_t allowedTypes = type_bitset_t((0x1u<(IDescriptor::E_TYPE::ET_COUNT))-1); + }; // TODO: add constraints for stage and creation flags, or just return the storage index & redirect? 
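For illustration, a minimal sketch of how the reworked lookup below might be consumed when stitching an HLSL preamble together; the `layout` pointer and the set/binding numbers are assumptions made up for this example, only the member and type names come from the patch itself:

    // `layout` is assumed to be a valid pipeline layout (CPU or GPU flavour) created elsewhere
    const core::string alias = layout->getBindingInfoForHLSL({
        .binding = {.set=0,.binding=2}, // hypothetical set/binding numbers
        .requiredStages = IShader::E_SHADER_STAGE::ESS_COMPUTE
        // `allowedTypes` left at its default, so any descriptor type may match
    });
    // `alias` is either "::nbl::hlsl::ConstevalBindingInfo<set,binding,count>" or an "#error ..." line
    const core::string preamble = "using output_binding_t = " + alias + ";\n";
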
- core::string getBindingInfoForHLSL(const hlsl::SBindingInfo& info, const desc_type_bitset_t allowedTypes=desc_type_bitset_t().set()) const + core::string getBindingInfoForHLSL(const SBindingKey& key) const { - if (info.set>=DESCRIPTOR_SET_COUNT) - return "#error \"::nbl::hlsl::SBindingInfo::set out of range!\""; - const auto* layout = m_descSetLayouts[info.set]; + if (key.binding.set>=DESCRIPTOR_SET_COUNT) + return "#error \"IPipelineLayout::SBindingKey::binding::set out of range!\""; + const auto* layout = m_descSetLayouts[key.binding.set].get(); if (!layout) - return "#error \"::nbl::hlsl::SBindingInfo::set layout is nullptr!\""; + return "#error \"IPipelineLayout::SBindingKey::binding::set layout is nullptr!\""; // using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect; using storage_range_index_t = redirect_t::storage_range_index_t; const redirect_t* redirect; storage_range_index_t found; { - const redirect_t::binding_number_t binding(info.binding); + const redirect_t::binding_number_t binding(key.binding.binding); for (auto t=0u; t(IDescriptor::E_TYPE::ET_COUNT); t++) - if (allowedTypes.test(t)) + if (key.allowedTypes.test(t)) { redirect = &layout->getDescriptorRedirect(static_cast(t)); found = redirect->findBindingStorageIndex(binding); if (found) break; } - if (!found && allowedTypes.test(static_cast(IDescriptor::E_TYPE::ET_SAMPLER))) + if (!found && key.allowedTypes.test(static_cast(IDescriptor::E_TYPE::ET_SAMPLER))) { redirect = &layout->getImmutableSamplerRedirect(); found = redirect->findBindingStorageIndex(binding); } if (!found) - return "#error \"Could not find `::nbl::hlsl::SBindingInfo::binding` in `::nbl::hlsl::SBindingInfo::set`'s layout!\""; + return "#error \"Could not find `IPipelineLayout::SBindingKey::binding::binding` in `IPipelineLayout::SBindingKey::binding::set`'s layout!\""; } + if (redirect->getStageFlags(found).hasFlags(key.requiredStages)) + return "#error \"Binding found in the layout doesn't have all the `IPipelineLayout::SBindingKey::binding::requiredStages` flags!\""; const auto count = redirect->getCount(found); assert(count); // this layout should have never passed validation - return "::nbl::hlsl::ConstevalBindingInfo<"+std::to_string(info.set)+","+std::to_string(info.binding)+","+std::to_string(count)+">"; + return "::nbl::hlsl::ConstevalBindingInfo<"+std::to_string(key.binding.set)+","+std::to_string(key.binding.binding)+","+std::to_string(count)+">"; } protected: From 395ac581fd5c28efaf8bb4c3a7123ea6b194d6e0 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 5 Nov 2024 16:46:58 +0100 Subject: [PATCH 129/358] start using the asset converter to make Blit shaders --- examples_tests | 2 +- include/nbl/asset/IGraphicsPipeline.h | 2 +- include/nbl/video/utilities/CComputeBlit.h | 293 +++------------------ src/nbl/builtin/CMakeLists.txt | 18 -- src/nbl/video/utilities/CComputeBlit.cpp | 204 +++++++++----- 5 files changed, 188 insertions(+), 331 deletions(-) diff --git a/examples_tests b/examples_tests index f6492b0de9..e95c56290e 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit f6492b0de975754f960a2761aaacf3a1a3354100 +Subproject commit e95c56290e7f31f3f2a2b6e07ccafd7feb2e686e diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index 5f6365525e..62861fdc9d 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -155,7 +155,7 @@ class IGraphicsPipeline : public IPipeline, public IGraphics protected: explicit IGraphicsPipeline(const 
SCreationParams& _params) : - IPipeline(core::smart_refctd_ptr(_params.layout)), + IPipeline(core::smart_refctd_ptr(_params.layout)), m_params(_params.cached), m_renderpass(core::smart_refctd_ptr(_params.renderpass)) {} SCachedCreationParams m_params; diff --git a/include/nbl/video/utilities/CComputeBlit.h b/include/nbl/video/utilities/CComputeBlit.h index 69b8d8ba29..4180ac4204 100644 --- a/include/nbl/video/utilities/CComputeBlit.h +++ b/include/nbl/video/utilities/CComputeBlit.h @@ -8,9 +8,16 @@ namespace nbl::video { -class NBL_API2 CComputeBlit : public core::IReferenceCounted +class CComputeBlit : public core::IReferenceCounted { public: + constexpr static inline asset::SPushConstantRange DefaultPushConstantRange = { + .stageFlags = IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, + .offset = 0ull, + .size = sizeof(hlsl::blit::parameters2_t) + }; + constexpr static inline std::span DefaultPushConstantRanges = {&DefaultPushConstantRange,1}; + // Coverage adjustment needs alpha to be stored in HDR with high precision static inline asset::E_FORMAT getCoverageAdjustmentIntermediateFormat(const asset::E_FORMAT format) { @@ -41,7 +48,7 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted } // ctor - CComputeBlit( + NBL_API2 CComputeBlit( core::smart_refctd_ptr&& logicalDevice, core::smart_refctd_ptr&& cache=nullptr, core::smart_refctd_ptr&& logger=nullptr @@ -52,6 +59,7 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted { core::smart_refctd_ptr blit; core::smart_refctd_ptr coverage; + uint16_t workgroupSize; }; struct SPipelinesCreateInfo { @@ -67,13 +75,13 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted hlsl::SBindingInfo samplers; // must be Storage Image descriptor type hlsl::SBindingInfo outputs; - //! If you set the balues too small, we'll correct them ourselves anyway + //! If you set the balues too small, we'll correct them ourselves anyway, default values of 0 means we guess and provide our defaults // needs to be at least as big as the maximum subgroup size - uint32_t workgroupSizeLog2 : 4 = 0; - // - uint32_t sharedMemoryPerInvocation : 6 = 0; + uint16_t workgroupSizeLog2 : 4 = 0; + // in bytes, needs to be at least enough to store two full input pixels per invocation + uint16_t sharedMemoryPerInvocation : 6 = 0; }; - SPipelines createAndCachePipelines(const SPipelinesCreateInfo& info); + NBL_API2 SPipelines createAndCachePipelines(const SPipelinesCreateInfo& info); //! Returns the original format if supports STORAGE_IMAGE otherwise returns a format in its compat class which supports STORAGE_IMAGE. 
inline asset::E_FORMAT getOutputViewFormat(const asset::E_FORMAT format) @@ -99,101 +107,38 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted } } -#if 0 - // @param `alphaBinCount` is only required to size the histogram present in the default nbl_glsl_blit_AlphaStatistics_t in default_compute_common.comp - core::smart_refctd_ptr createAlphaTestSpecializedShader(const asset::IImage::E_TYPE inImageType, const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount); - - core::smart_refctd_ptr getAlphaTestPipeline(const uint32_t alphaBinCount, const asset::IImage::E_TYPE imageType) - { - const auto workgroupDims = getDefaultWorkgroupDims(imageType); - const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount); - - assert(paddedAlphaBinCount >= asset::IBlitUtilities::MinAlphaBinCount); - const auto pipelineIndex = (paddedAlphaBinCount / asset::IBlitUtilities::MinAlphaBinCount) - 1; - - if (m_alphaTestPipelines[pipelineIndex][imageType]) - return m_alphaTestPipelines[pipelineIndex][imageType]; - - auto specShader = createAlphaTestSpecializedShader(imageType, paddedAlphaBinCount); - IGPUComputePipeline::SCreationParams creationParams; - creationParams.shader.shader = specShader.get(); - creationParams.shader.entryPoint = "main"; - creationParams.layout = m_blitPipelineLayout[EBT_COVERAGE_ADJUSTMENT].get(); - assert(m_device->createComputePipelines(nullptr, { &creationParams, &creationParams + 1 }, &m_alphaTestPipelines[pipelineIndex][imageType])); - - return m_alphaTestPipelines[pipelineIndex][imageType]; - } - - // @param `outFormat` dictates encoding. - core::smart_refctd_ptr createNormalizationSpecializedShader(const asset::IImage::E_TYPE inImageType, const asset::E_FORMAT outFormat, - const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount); - - core::smart_refctd_ptr getNormalizationPipeline(const asset::IImage::E_TYPE imageType, const asset::E_FORMAT outFormat, - const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount) + // Use the return values of `getOutputViewFormat` and `getCoverageAdjustmentIntermediateFormat` for this + static inline uint32_t getAlphaBinCount(const uint16_t workgroupSize, const asset::E_FORMAT intermediateAlpha, const uint32_t layersToBlit) { - const auto workgroupDims = getDefaultWorkgroupDims(imageType); - const uint32_t paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount); - const SNormalizationCacheKey key = { imageType, paddedAlphaBinCount, outFormat }; - - if (m_normalizationPipelines.find(key) == m_normalizationPipelines.end()) + uint16_t baseBucketCount; + using format_t = nbl::asset::E_FORMAT; + switch (intermediateAlpha) { - auto specShader = createNormalizationSpecializedShader(imageType, outFormat, paddedAlphaBinCount); - IGPUComputePipeline::SCreationParams creationParams; - creationParams.shader.shader = specShader.get(); - creationParams.shader.entryPoint = "main"; - creationParams.layout = m_blitPipelineLayout[EBT_COVERAGE_ADJUSTMENT].get(); - assert(m_device->createComputePipelines(nullptr, { &creationParams, &creationParams + 1 }, &m_normalizationPipelines[key])); + case format_t::EF_R8_UNORM: [[fallthrough]]; + case format_t::EF_R8_SNORM: + baseBucketCount = 256; + break; + case format_t::EF_R16_SFLOAT: + baseBucketCount = 512; + break; + case format_t::EF_R16_UNORM: [[fallthrough]]; + case format_t::EF_R16_SNORM: [[fallthrough]]; + baseBucketCount = 1024; + break; + case format_t::EF_R32_SFLOAT: + baseBucketCount = 2048; + break; + 
default: + return 0; } - - return m_normalizationPipelines[key]; + // the absolute minimum needed to store a single pixel of a worst case format (precise, all 4 channels) + constexpr auto singlePixelStorage = 4*sizeof(hlsl::float32_t); + constexpr auto ratio = singlePixelStorage/sizeof(uint16_t); + const auto paddedAlphaBinCount = core::min(core::roundUp(baseBucketCount,workgroupSize),workgroupSize*ratio); + return paddedAlphaBinCount*layersToBlit; } - template - core::smart_refctd_ptr getBlitPipeline( - const asset::E_FORMAT outFormat, - const asset::IImage::E_TYPE imageType, - const core::vectorSIMDu32& inExtent, - const core::vectorSIMDu32& outExtent, - const asset::IBlitUtilities::E_ALPHA_SEMANTIC alphaSemantic, - const typename BlitUtilities::convolution_kernels_t& kernels, - const uint32_t workgroupSize = 256, - const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount) - { - const auto paddedAlphaBinCount = getPaddedAlphaBinCount(core::vectorSIMDu32(workgroupSize, 1, 1, 1), alphaBinCount); - - const SBlitCacheKey key = - { - .wgSize = workgroupSize, - .imageType = imageType, - .alphaBinCount = paddedAlphaBinCount, - .outFormat = outFormat, - .smemSize = m_availableSharedMemory, - .coverageAdjustment = (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) - }; - - if (m_blitPipelines.find(key) == m_blitPipelines.end()) - { - const auto blitType = (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) ? EBT_COVERAGE_ADJUSTMENT : EBT_REGULAR; - - auto specShader = createBlitSpecializedShader( - outFormat, - imageType, - inExtent, - outExtent, - alphaSemantic, - kernels, - workgroupSize, - paddedAlphaBinCount); - - IGPUComputePipeline::SCreationParams creationParams; - creationParams.shader.shader = specShader.get(); - creationParams.shader.entryPoint = "main"; - creationParams.layout = m_blitPipelineLayout[blitType].get(); - m_device->createComputePipelines(nullptr, { &creationParams, &creationParams + 1 }, &m_blitPipelines[key]); - } - - return m_blitPipelines[key]; - } +#if 0 //! Returns the number of output texels produced by one workgroup, deciding factor is `m_availableSharedMemory`. //! @param outImageFormat is the format of output (of the blit step) image. 
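As a quick worked example of the `getAlphaBinCount` helper added above (numbers purely illustrative): with `workgroupSize = 256`, an intermediate alpha format of `EF_R16_SFLOAT` (512 base buckets) and `layersToBlit = 2`, `singlePixelStorage` is 16 bytes so `ratio` is 8 and the cap is `256*8 = 2048` bins; `roundUp(512,256) = 512` stays below that cap, so the function returns `512*2 = 1024` bins in total.
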
@@ -368,152 +313,10 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted outDispatchInfo.wgCount[2] = workgroupCount[2]; } - static inline core::vectorSIMDu32 getDefaultWorkgroupDims(const asset::IImage::E_TYPE imageType) - { - switch (imageType) - { - case asset::IImage::ET_1D: - return core::vectorSIMDu32(256, 1, 1, 1); - case asset::IImage::ET_2D: - return core::vectorSIMDu32(16, 16, 1, 1); - case asset::IImage::ET_3D: - return core::vectorSIMDu32(8, 8, 4, 1); - default: - return core::vectorSIMDu32(1, 1, 1, 1); - } - } - - static inline size_t getCoverageAdjustmentScratchSize(const asset::IBlitUtilities::E_ALPHA_SEMANTIC alphaSemantic, const asset::IImage::E_TYPE imageType, const uint32_t alphaBinCount, const uint32_t layersToBlit) - { - if (alphaSemantic != asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) - return 0; - - const auto workgroupDims = getDefaultWorkgroupDims(imageType); - const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount); - const auto requiredSize = (sizeof(uint32_t) + paddedAlphaBinCount * sizeof(uint32_t)) * layersToBlit; - return requiredSize; - } - - bool updateDescriptorSet( - video::IGPUDescriptorSet* blitDS, - video::IGPUDescriptorSet* kernelWeightsDS, - core::smart_refctd_ptr inImageView, - core::smart_refctd_ptr outImageView, - core::smart_refctd_ptr coverageAdjustmentScratchBuffer, - core::smart_refctd_ptr kernelWeightsUTB, - const asset::ISampler::E_TEXTURE_CLAMP wrapU = asset::ISampler::ETC_CLAMP_TO_EDGE, - const asset::ISampler::E_TEXTURE_CLAMP wrapV = asset::ISampler::ETC_CLAMP_TO_EDGE, - const asset::ISampler::E_TEXTURE_CLAMP wrapW = asset::ISampler::ETC_CLAMP_TO_EDGE, - const asset::ISampler::E_TEXTURE_BORDER_COLOR borderColor = asset::ISampler::ETBC_FLOAT_OPAQUE_BLACK) - { - constexpr auto MAX_DESCRIPTOR_COUNT = 3; - - auto updateDS = [this, coverageAdjustmentScratchBuffer](video::IGPUDescriptorSet* ds, video::IGPUDescriptorSet::SDescriptorInfo* infos) -> bool - { - const auto bindingCount = ds->getLayout()->getTotalBindingCount(); - if ((bindingCount == 3) && !coverageAdjustmentScratchBuffer) - return false; - - video::IGPUDescriptorSet::SWriteDescriptorSet writes[MAX_DESCRIPTOR_COUNT] = {}; - - uint32_t infoIdx = 0; - uint32_t writeCount = 0; - for (uint32_t t = 0; t < static_cast(asset::IDescriptor::E_TYPE::ET_COUNT); ++t) - { - const auto type = static_cast(t); - const auto& redirect = ds->getLayout()->getDescriptorRedirect(type); - const auto declaredBindingCount = redirect.getBindingCount(); - - for (uint32_t i = 0; i < declaredBindingCount; ++i) - { - auto& write = writes[writeCount++]; - write.dstSet = ds; - write.binding = redirect.getBinding(IGPUDescriptorSetLayout::CBindingRedirect::storage_range_index_t{ i }).data; - write.arrayElement = 0u; - write.count = redirect.getCount(IGPUDescriptorSetLayout::CBindingRedirect::storage_range_index_t{ i }); - write.info = &infos[infoIdx]; - - infoIdx += write.count; - } - } - assert(writeCount == bindingCount); - m_device->updateDescriptorSets(writeCount, writes, 0u, nullptr); - - return true; - }; - - if (blitDS) - { - if (!inImageView || !outImageView) - return false; - - video::IGPUDescriptorSet::SDescriptorInfo infos[MAX_DESCRIPTOR_COUNT] = {}; - - if (!samplers[wrapU][wrapV][wrapW][borderColor]) - { - video::IGPUSampler::SParams params = {}; - params.TextureWrapU = wrapU; - params.TextureWrapV = wrapV; - params.TextureWrapW = wrapW; - params.BorderColor = borderColor; - params.MinFilter = asset::ISampler::ETF_NEAREST; - params.MaxFilter = 
asset::ISampler::ETF_NEAREST; - params.MipmapMode = asset::ISampler::ESMM_NEAREST; - params.AnisotropicFilter = 0u; - params.CompareEnable = 0u; - params.CompareFunc = asset::ISampler::ECO_ALWAYS; - - samplers[wrapU][wrapV][wrapW][borderColor] = m_device->createSampler(params); - if (!samplers[wrapU][wrapV][wrapW][borderColor]) - return false; - } - - infos[0].desc = inImageView; - infos[0].info.image.imageLayout = asset::IImage::LAYOUT::READ_ONLY_OPTIMAL; - infos[0].info.combinedImageSampler.sampler = samplers[wrapU][wrapV][wrapW][borderColor]; - - infos[1].desc = outImageView; - infos[1].info.image.imageLayout = asset::IImage::LAYOUT::GENERAL; - infos[1].info.combinedImageSampler.sampler = nullptr; - - if (coverageAdjustmentScratchBuffer) - { - infos[2].desc = coverageAdjustmentScratchBuffer; - infos[2].info.buffer.offset = 0; - infos[2].info.buffer.size = coverageAdjustmentScratchBuffer->getSize(); - } - - if (!updateDS(blitDS, infos)) - return false; - } - - if (kernelWeightsDS) - { - video::IGPUDescriptorSet::SDescriptorInfo info = {}; - info.desc = kernelWeightsUTB; - info.info.buffer.offset = 0ull; - info.info.buffer.size = kernelWeightsUTB->getUnderlyingBuffer()->getSize(); - - if (!updateDS(kernelWeightsDS, &info)) - return false; - } - - return true; - } - //! User is responsible for the memory barriers between previous writes and the first //! dispatch on the input image, and future reads of output image and the last dispatch. template inline void blit( - video::IGPUCommandBuffer* cmdbuf, - const asset::IBlitUtilities::E_ALPHA_SEMANTIC alphaSemantic, - video::IGPUDescriptorSet* alphaTestDS, - video::IGPUComputePipeline* alphaTestPipeline, - video::IGPUDescriptorSet* blitDS, - video::IGPUDescriptorSet* blitWeightsDS, - video::IGPUComputePipeline* blitPipeline, - video::IGPUDescriptorSet* normalizationDS, - video::IGPUComputePipeline* normalizationPipeline, const core::vectorSIMDu32& inImageExtent, const asset::IImage::E_TYPE inImageType, const asset::E_FORMAT inImageFormat, @@ -627,7 +430,7 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted } //! Query shared memory size for a given `outputTexelsPerWG`. 
- size_t getRequiredSharedMemorySize( + inline size_t getRequiredSharedMemorySize( const core::vectorSIMDu32& outputTexelsPerWG, const core::vectorSIMDu32& outExtent, const asset::IImage::E_TYPE imageType, @@ -641,16 +444,6 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted const size_t requiredSmem = (core::max(preloadRegion.x * preloadRegion.y * preloadRegion.z, outputTexelsPerWG.x * outputTexelsPerWG.y * preloadRegion.z) + outputTexelsPerWG.x * preloadRegion.y * preloadRegion.z) * channelCount * sizeof(float); return requiredSmem; }; - - static inline uint32_t getPaddedAlphaBinCount(const core::vectorSIMDu32& workgroupDims, const uint32_t oldAlphaBinCount) - { - // For the normalization shader, it should be that: - // alphaBinCount = k*workGroupSize, k is integer, k >= 1, - assert(workgroupDims.x != 0 && workgroupDims.y != 0 && workgroupDims.z != 0); - const auto wgSize = workgroupDims.x * workgroupDims.y * workgroupDims.z; - const auto paddedAlphaBinCount = core::roundUp(oldAlphaBinCount, wgSize); - return paddedAlphaBinCount; - } }; } diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index 53ab534971..4dbd039b54 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -207,24 +207,6 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/DepthPyramidGenerator/com LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/DepthPyramidGenerator/push_constants_struct_common.h") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/DepthPyramidGenerator/depth_pyramid_generator_impl.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/DepthPyramidGenerator/virtual_work_group.glsl") -# blit -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/formats_encode.glsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/parameters.glsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/multi_dimensional_array_addressing.glsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/default_compute_common.comp") - -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/default_compute_blit.comp") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/blit/blit.glsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/blit/descriptors.glsl") - -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/default_compute_alpha_test.comp") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/alpha_test/alpha_test.glsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/alpha_test/descriptors.glsl") - -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/default_compute_normalization.comp") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/normalization/normalization.glsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/normalization/descriptors.glsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/normalization/shared_normalization.glsl") # HLSL LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/macros.h") diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp index 1dd123952f..2ad5656005 100644 --- a/src/nbl/video/utilities/CComputeBlit.cpp +++ b/src/nbl/video/utilities/CComputeBlit.cpp @@ -23,36 +23,25 @@ CComputeBlit::CComputeBlit(smart_refctd_ptr&& logicalDevice, sma auto CComputeBlit::createAndCachePipelines(const SPipelinesCreateInfo& info) -> SPipelines { SPipelines retval; - core::vector> cpuPplns; - cpuPplns.reserve(tasks.size()); + + std::array,2> cpuPplns; const auto& limits = 
m_device->getPhysicalDevice()->getLimits(); - for (auto task : tasks) + retval.workgroupSize = 0x1u<std::string { - // adjust task default values - { - if (task.workgroupSizeLog2(task.outputFormat),3,1.f); - const auto precisionAt0 = getFormatPrecision(static_cast(task.outputFormat),3,0.f); - if (limits.workgroupMemoryExplicitLayout16BitAccess && limits.shaderFloat16 && precisionAt1>=std::exp2f(-11.f) && precisionAt0>=std::numeric_limits::min()) - useFloat16 = true; - } - // the absolute minimum needed to store a single pixel - const auto singlePixelStorage = channels*(useFloat16 ? sizeof(hlsl::float16_t):sizeof(hlsl::float32_t)); - // also slightly more memory is needed - task.sharedMemoryPerInvocation = core::max(singlePixelStorage*2,task.sharedMemoryPerInvocation); - } - const auto common = [&]()->std::string - { - std::ostringstream tmp; - tmp << R"===( + std::ostringstream tmp; + tmp << R"===( #include "nbl/builtin/hlsl/binding_info.hlsl" @@ -61,54 +50,58 @@ using namespace nbl::hlsl; struct ConstevalParameters { - NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkGroupSize = )===" << (0x1u<getBindingInfoForHLSL() << R"===(; - using input_sampler_binding_t = )===" << serializeBindingInfo() << R"===(; - using input_image_binding_t = )===" << serializeBindingInfo() << R"===(; - using output_binding_t = )===" << serializeBindingInfo() << R"===(; - NBL_CONSTEXPR_STATIC_INLINE uint32_t uint32_t SharedMemoryDWORDs = )===" << task.sharedMemoryPerInvocation/sizeof(uint32_t) << R"===(; +NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkGroupSize = )===" << retval.workgroupSize << R"===(; +using kernel_weight_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.kernelWeights,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(; +using input_sampler_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.samplers,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(; +using input_image_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.inputs,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(; +using output_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.outputs,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(; +NBL_CONSTEXPR_STATIC_INLINE uint32_t uint32_t SharedMemoryDWORDs = )===" << (sharedMemoryPerInvocation* retval.workgroupSize)/sizeof(uint32_t) << R"===(; }; )==="; - return tmp.str(); - }(); - auto createPipeline = [&limits,&common](const char* mainPath)->smart_refctd_ptr + return tmp.str(); + }(); + auto createPipeline = [&limits,layout,&common](const char* mainPath)->smart_refctd_ptr + { + auto shader = make_smart_refctd_ptr( + (common+"\n#include \""+mainPath+"\"\n").c_str(), + IShader::E_SHADER_STAGE::ESS_COMPUTE, + IShader::E_CONTENT_TYPE::ECT_HLSL, + mainPath + ); + // make sure there's a hash so asset converter doesn't fail { - auto shader = make_smart_refctd_ptr( - (common+"\n#include \""+mainPath+"\"\n").c_str(), - IShader::E_SHADER_STAGE::ESS_COMPUTE, - IShader::E_CONTENT_TYPE::ECT_HLSL, - mainPath - ); - - ICPUComputePipeline::SCreationParams params = {}; - params.layout = nullptr; // TODO - params.shader.entryPoint = "main"; - params.shader.shader = shader.get(); - params.shader.requiredSubgroupSize = static_cast(hlsl::findMSB(limits.maxSubgroupSize)); - // needed for the prefix and reductions to work - params.shader.requireFullSubgroups = true; - return ICPUComputePipeline::create(params); - }; - // create blit pipeline - cpuPplns.emplace_back(createPipeline("default_blit.comp.hlsl")); 
- cpuPplns.emplace_back(createPipeline("default_normalize.comp.hlsl")); - } + auto source = const_cast(shader->getContent()); + source->setContentHash(source->computeContentHash()); + } + + ICPUComputePipeline::SCreationParams params = {}; + params.layout = layout; + params.shader.entryPoint = "main"; + params.shader.shader = shader.get(); + params.shader.requiredSubgroupSize = static_cast(hlsl::findMSB(limits.maxSubgroupSize)); + // needed for the prefix and reductions to work + params.shader.requireFullSubgroups = true; + return ICPUComputePipeline::create(params); + }; + // create blit pipeline + cpuPplns[0] = createPipeline("nbl/builtin/hlsl/blit/default_blit.comp.hlsl"); + cpuPplns[1] = createPipeline("nbl/builtin/hlsl/blit/default_normalize.comp.hlsl"); CAssetConverter::SInputs inputs = {}; - inputs.readCache = converter; + inputs.readCache = info.converter; inputs.logger = m_logger.getRaw(); std::get>(inputs.assets) = {&cpuPplns.data()->get(),cpuPplns.size()}; inputs.readShaderCache = m_shaderCache.get(); inputs.writeShaderCache = m_shaderCache.get(); // no pipeline cache, because we only make the same pipeline once, ever - auto reserveResults = converter->reserve(inputs); + auto reserveResults = info.converter->reserve(inputs); assert(reserveResults.getRequiredQueueFlags().value==IQueue::FAMILY_FLAGS::NONE); + // copy over the results { auto rIt = reserveResults.getGPUObjects().data(); - // TODO: redo - for (size_t i=0; ivalue; + retval.blit = (rIt++)->value; + retval.coverage = (rIt++)->value; } // this just inserts the pipelines into the cache @@ -198,6 +191,53 @@ core::smart_refctd_ptr createBlitSpecializedShader( return gpuShader; } +template +core::smart_refctd_ptr getBlitPipeline( + const asset::E_FORMAT outFormat, + const asset::IImage::E_TYPE imageType, + const core::vectorSIMDu32& inExtent, + const core::vectorSIMDu32& outExtent, + const asset::IBlitUtilities::E_ALPHA_SEMANTIC alphaSemantic, + const typename BlitUtilities::convolution_kernels_t& kernels, + const uint32_t workgroupSize = 256, + const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount) +{ + const auto paddedAlphaBinCount = getPaddedAlphaBinCount(core::vectorSIMDu32(workgroupSize, 1, 1, 1), alphaBinCount); + + const SBlitCacheKey key = + { + .wgSize = workgroupSize, + .imageType = imageType, + .alphaBinCount = paddedAlphaBinCount, + .outFormat = outFormat, + .smemSize = m_availableSharedMemory, + .coverageAdjustment = (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) + }; + + if (m_blitPipelines.find(key) == m_blitPipelines.end()) + { + const auto blitType = (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) ? 
EBT_COVERAGE_ADJUSTMENT : EBT_REGULAR; + + auto specShader = createBlitSpecializedShader( + outFormat, + imageType, + inExtent, + outExtent, + alphaSemantic, + kernels, + workgroupSize, + paddedAlphaBinCount); + + IGPUComputePipeline::SCreationParams creationParams; + creationParams.shader.shader = specShader.get(); + creationParams.shader.entryPoint = "main"; + creationParams.layout = m_blitPipelineLayout[blitType].get(); + m_device->createComputePipelines(nullptr, { &creationParams, &creationParams + 1 }, &m_blitPipelines[key]); + } + + return m_blitPipelines[key]; +} + core::smart_refctd_ptr CComputeBlit::createAlphaTestSpecializedShader(const asset::IImage::E_TYPE imageType, const uint32_t alphaBinCount) { const auto workgroupDims = getDefaultWorkgroupDims(imageType); @@ -233,6 +273,28 @@ core::smart_refctd_ptr CComputeBlit::createAlphaTestSpecializ auto cpuShader = core::make_smart_refctd_ptr(shaderSourceStream.str().c_str(), IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, IGPUShader::E_CONTENT_TYPE::ECT_HLSL, "CComputeBlitGLSLGLSL::createAlphaTestSpecializedShader"); } +core::smart_refctd_ptr getAlphaTestPipeline(const uint32_t alphaBinCount, const asset::IImage::E_TYPE imageType) +{ + const auto workgroupDims = getDefaultWorkgroupDims(imageType); + const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount); + + assert(paddedAlphaBinCount >= asset::IBlitUtilities::MinAlphaBinCount); + const auto pipelineIndex = (paddedAlphaBinCount / asset::IBlitUtilities::MinAlphaBinCount) - 1; + + if (m_alphaTestPipelines[pipelineIndex][imageType]) + return m_alphaTestPipelines[pipelineIndex][imageType]; + + auto specShader = createAlphaTestSpecializedShader(imageType, paddedAlphaBinCount); + IGPUComputePipeline::SCreationParams creationParams; + creationParams.shader.shader = specShader.get(); + creationParams.shader.entryPoint = "main"; + creationParams.layout = m_blitPipelineLayout[EBT_COVERAGE_ADJUSTMENT].get(); + assert(m_device->createComputePipelines(nullptr, { &creationParams, &creationParams + 1 }, &m_alphaTestPipelines[pipelineIndex][imageType])); + + return m_alphaTestPipelines[pipelineIndex][imageType]; +} + +// @param `outFormat` dictates encoding. 
core::smart_refctd_ptr CComputeBlit::createNormalizationSpecializedShader(const asset::IImage::E_TYPE imageType, const uint32_t alphaBinCount) { const auto workgroupDims = getDefaultWorkgroupDims(imageType); @@ -276,4 +338,24 @@ core::smart_refctd_ptr CComputeBlit::createNormalizationSpeci auto cpuShader = core::make_smart_refctd_ptr(shaderSourceStream.str().c_str(), IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, IGPUShader::E_CONTENT_TYPE::ECT_HLSL, "CComputeBlitGLSL::createNormalizationSpecializedShader"); } + +core::smart_refctd_ptr getNormalizationPipeline(const asset::IImage::E_TYPE imageType, const asset::E_FORMAT outFormat, + const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount) +{ + const auto workgroupDims = getDefaultWorkgroupDims(imageType); + const uint32_t paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount); + const SNormalizationCacheKey key = { imageType, paddedAlphaBinCount, outFormat }; + + if (m_normalizationPipelines.find(key) == m_normalizationPipelines.end()) + { + auto specShader = createNormalizationSpecializedShader(imageType, outFormat, paddedAlphaBinCount); + IGPUComputePipeline::SCreationParams creationParams; + creationParams.shader.shader = specShader.get(); + creationParams.shader.entryPoint = "main"; + creationParams.layout = m_blitPipelineLayout[EBT_COVERAGE_ADJUSTMENT].get(); + assert(m_device->createComputePipelines(nullptr, { &creationParams, &creationParams + 1 }, &m_normalizationPipelines[key])); + } + + return m_normalizationPipelines[key]; +} #endif \ No newline at end of file From e1a87e757995a1d531d5517c9a23e6d07b9dbafe Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 5 Nov 2024 17:16:24 +0100 Subject: [PATCH 130/358] stupid typos are the bane of my existence --- examples_tests | 2 +- include/nbl/asset/IPipelineLayout.h | 2 +- include/nbl/builtin/hlsl/blit/common.hlsl | 2 +- include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl | 7 +++---- include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl | 6 ++---- src/nbl/video/utilities/CComputeBlit.cpp | 2 +- 6 files changed, 9 insertions(+), 12 deletions(-) diff --git a/examples_tests b/examples_tests index e95c56290e..e77ed5d468 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit e95c56290e7f31f3f2a2b6e07ccafd7feb2e686e +Subproject commit e77ed5d468f929ac5e7f1909f728895c923eb2c4 diff --git a/include/nbl/asset/IPipelineLayout.h b/include/nbl/asset/IPipelineLayout.h index 7cc9802900..0eaba46f7c 100644 --- a/include/nbl/asset/IPipelineLayout.h +++ b/include/nbl/asset/IPipelineLayout.h @@ -183,7 +183,7 @@ class IPipelineLayout if (!found) return "#error \"Could not find `IPipelineLayout::SBindingKey::binding::binding` in `IPipelineLayout::SBindingKey::binding::set`'s layout!\""; } - if (redirect->getStageFlags(found).hasFlags(key.requiredStages)) + if (!redirect->getStageFlags(found).hasFlags(key.requiredStages)) return "#error \"Binding found in the layout doesn't have all the `IPipelineLayout::SBindingKey::binding::requiredStages` flags!\""; const auto count = redirect->getCount(found); assert(count); // this layout should have never passed validation diff --git a/include/nbl/builtin/hlsl/blit/common.hlsl b/include/nbl/builtin/hlsl/blit/common.hlsl index 6295e68706..93ed579312 100644 --- a/include/nbl/builtin/hlsl/blit/common.hlsl +++ b/include/nbl/builtin/hlsl/blit/common.hlsl @@ -12,7 +12,7 @@ namespace hlsl { namespace glsl { -uint32_t gl_WorkGroupSize() +uint32_t3 gl_WorkGroupSize() { return 
uint32_t3(ConstevalParameters::WorkGroupSize,1,1); } diff --git a/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl b/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl index ad2749904c..c9184d0164 100644 --- a/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl +++ b/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl @@ -1,12 +1,11 @@ // Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -//#include "nbl/builtin/hlsl/blit/common.hlsl" -//#include "nbl/builtin/hlsl/blit/parameters.hlsl" -//#include "nbl/builtin/hlsl/blit/compute_blit.hlsl" +#include "nbl/builtin/hlsl/blit/parameters.hlsl" +#include "nbl/builtin/hlsl/blit/common.hlsl" +//#include "nbl/builtin/hlsl/blit/compute_blit.hlsl" -groupshared uint32_t sMem[ConstevalParameters::SharedMemoryDWORDs]; /* struct HistogramAccessor { diff --git a/include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl b/include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl index 589f370c0e..8e2f4beb23 100644 --- a/include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl +++ b/include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl @@ -1,11 +1,9 @@ // Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/builtin/hlsl/blit/common.hlsl" - +#include "nbl/builtin/hlsl/blit/parameters.hlsl" - -//#include "nbl/builtin/hlsl/blit/parameters.hlsl" +#include "nbl/builtin/hlsl/blit/common.hlsl" //#include "nbl/builtin/hlsl/blit/compute_blit.hlsl" using namespace nbl::hlsl::blit; diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp index 2ad5656005..c3ceb66675 100644 --- a/src/nbl/video/utilities/CComputeBlit.cpp +++ b/src/nbl/video/utilities/CComputeBlit.cpp @@ -55,7 +55,7 @@ using kernel_weight_binding_t = )===" << layout->getBindingInfoForHLSL({.binding using input_sampler_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.samplers,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(; using input_image_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.inputs,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(; using output_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.outputs,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(; -NBL_CONSTEXPR_STATIC_INLINE uint32_t uint32_t SharedMemoryDWORDs = )===" << (sharedMemoryPerInvocation* retval.workgroupSize)/sizeof(uint32_t) << R"===(; +NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedMemoryDWORDs = )===" << (sharedMemoryPerInvocation* retval.workgroupSize)/sizeof(uint32_t) << R"===(; }; )==="; return tmp.str(); From c700f67ccc1dc13f7bb4127ace5c6449987ce03e Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 5 Nov 2024 19:13:48 +0100 Subject: [PATCH 131/358] Updated DXC --- 3rdparty/dxc/dxc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index 7acfe6f4fc..5adc27f9e4 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit 7acfe6f4fc724265db8026256fad18afeb282b97 +Subproject commit 5adc27f9e42de7681d65a98873048af661b9b367 From 0c0b9ab86beec5595e09ae594a58e529a3f4cbb7 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 5 Nov 2024 19:22:27 +0100 Subject: [PATCH 132/358] change last place's usage of `impl::declval` to `experimental::declval` 
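In both the member-test macros and the sort code this touches, `declval` only ever appears inside an unevaluated `decltype(...)`, so the accessor type never needs to be constructible just to name a member function's return type. A minimal sketch of the pattern, with a made-up `Accessor` parameter for illustration:

    // never evaluated at runtime, only inspected by decltype
    template<typename Accessor>
    struct key_of
    {
        using type = decltype(experimental::declval<Accessor>().get(0));
    };
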
--- include/nbl/builtin/hlsl/sort/counting.hlsl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/sort/counting.hlsl b/include/nbl/builtin/hlsl/sort/counting.hlsl index 12da2e9d1d..1cd916ccc8 100644 --- a/include/nbl/builtin/hlsl/sort/counting.hlsl +++ b/include/nbl/builtin/hlsl/sort/counting.hlsl @@ -22,7 +22,7 @@ template< typename ValueAccessor, typename HistogramAccessor, typename SharedAccessor, - typename key_t = decltype(impl::declval().get(0)), + typename key_t = decltype(experimental::declval().get(0)), bool robust=false > struct counting From 565bf0e44c3b0e74701d8a0ee63803e13e7c9b93 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 6 Nov 2024 07:48:40 +0100 Subject: [PATCH 133/358] revert whatever @keptsecret did to DXC (remember to update submodules after merges with Mmaster!) --- 3rdparty/dxc/dxc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index 7acfe6f4fc..5adc27f9e4 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit 7acfe6f4fc724265db8026256fad18afeb282b97 +Subproject commit 5adc27f9e42de7681d65a98873048af661b9b367 From 0ce6cfcc1745f246fe79933acf363107a6482e28 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 6 Nov 2024 07:53:03 +0100 Subject: [PATCH 134/358] fix a small bug in template code --- include/nbl/video/ILogicalDevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 02fad9abd7..20ebcce272 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -406,7 +406,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe const uint32_t* const pMaxPrimitiveCounts ) const { - if (invalidFeaturesForASBuild(motionBlur)) + if (invalidFeaturesForASBuild(motionBlur)) { NBL_LOG_ERROR("Required features are not enabled"); return {}; From 294501cf529498874eab458d1fb1977d4d8e8b62 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 6 Nov 2024 08:02:14 +0100 Subject: [PATCH 135/358] update example submodule --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index e59c4c7e4c..8e66e10dcd 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit e59c4c7e4cfcf69d4db19edceda53ccc17ae351d +Subproject commit 8e66e10dcd7a456d4a5b7e34d0a6f89299668d21 From 76faa785128acfa8c28e11ceb4ec30d4fb554f05 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 6 Nov 2024 12:16:59 +0100 Subject: [PATCH 136/358] remove GLSL blit code --- examples_tests | 2 +- .../nbl/video/utilities/CComputeBlitGLSL.h | 898 ------------------ src/nbl/video/utilities/CComputeBlitGLSL.cpp | 57 -- 3 files changed, 1 insertion(+), 956 deletions(-) delete mode 100644 include/nbl/video/utilities/CComputeBlitGLSL.h delete mode 100644 src/nbl/video/utilities/CComputeBlitGLSL.cpp diff --git a/examples_tests b/examples_tests index e77ed5d468..1585888070 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit e77ed5d468f929ac5e7f1909f728895c923eb2c4 +Subproject commit 15858880704a7a8f641e2cd792a41708f617c3e3 diff --git a/include/nbl/video/utilities/CComputeBlitGLSL.h b/include/nbl/video/utilities/CComputeBlitGLSL.h deleted file mode 100644 index d987799b00..0000000000 --- a/include/nbl/video/utilities/CComputeBlitGLSL.h +++ /dev/null @@ -1,898 +0,0 @@ -#ifndef _NBL_VIDEO_C_COMPUTE_BLIT_GLSL_H_INCLUDED_ - -#include "nbl/asset/filters/CBlitUtilities.h" - 
-// TODO: Port to vk1.3 or delete? -#if 0 - -namespace nbl::video -{ - class NBL_API2 CComputeBlitGLSL : public core::IReferenceCounted - { - private: - struct vec3 { float x, y, z; }; - struct uvec3 { uint32_t x, y, z; }; - - public: - // This default is only for the blitting step (not alpha test or normalization steps) which always uses a 1D workgroup. - // For the default values of alpha test and normalization steps, see getDefaultWorkgroupDims. - static constexpr uint32_t DefaultBlitWorkgroupSize = 256u; - -#include "nbl/builtin/glsl/blit/parameters.glsl" - - struct dispatch_info_t - { - uint32_t wgCount[3]; - }; - - //! Set smemSize param to ~0u to use all the shared memory available. - static core::smart_refctd_ptr create(core::smart_refctd_ptr&& logicalDevice, const uint32_t smemSize = ~0u) - { - auto result = core::smart_refctd_ptr(new CComputeBlitGLSL(std::move(logicalDevice)), core::dont_grab); - - result->setAvailableSharedMemory(smemSize); - - { - constexpr auto BlitDescriptorCount = 3; - const asset::IDescriptor::E_TYPE types[BlitDescriptorCount] = { asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER, asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER }; // input image, output image, alpha statistics - - for (auto i = 0; i < static_cast(EBT_COUNT); ++i) - { - result->m_blitDSLayout[i] = result->createDSLayout(i == static_cast(EBT_COVERAGE_ADJUSTMENT) ? 3 : 2, types, result->m_device.get()); - if (!result->m_blitDSLayout[i]) - return nullptr; - } - } - - { - constexpr auto KernelWeightsDescriptorCount = 1; - asset::IDescriptor::E_TYPE types[KernelWeightsDescriptorCount] = { asset::IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER }; - result->m_kernelWeightsDSLayout = result->createDSLayout(KernelWeightsDescriptorCount, types, result->m_device.get()); - - if (!result->m_kernelWeightsDSLayout) - return nullptr; - } - - asset::SPushConstantRange pcRange = {}; - { - pcRange.stageFlags = asset::IShader::ESS_COMPUTE; - pcRange.offset = 0u; - pcRange.size = sizeof(nbl_glsl_blit_parameters_t); - } - - for (auto i = 0; i < static_cast(EBT_COUNT); ++i) - { - result->m_blitPipelineLayout[i] = result->m_device->createPipelineLayout(&pcRange, &pcRange + 1ull, core::smart_refctd_ptr(result->m_blitDSLayout[i]), core::smart_refctd_ptr(result->m_kernelWeightsDSLayout)); - if (!result->m_blitPipelineLayout[i]) - return nullptr; - } - - result->m_coverageAdjustmentPipelineLayout = result->m_device->createPipelineLayout(&pcRange, &pcRange + 1ull, core::smart_refctd_ptr(result->m_blitDSLayout[EBT_COVERAGE_ADJUSTMENT])); - if (!result->m_coverageAdjustmentPipelineLayout) - return nullptr; - - return result; - } - - inline void setAvailableSharedMemory(const uint32_t smemSize) - { - if (smemSize == ~0u) - m_availableSharedMemory = m_device->getPhysicalDevice()->getProperties().limits.maxComputeSharedMemorySize; - else - m_availableSharedMemory = core::min(core::roundUp(smemSize, static_cast(sizeof(float) * 64)), m_device->getPhysicalDevice()->getLimits().maxComputeSharedMemorySize); - } - - inline core::smart_refctd_ptr getDefaultBlitDescriptorSetLayout(const asset::IBlitUtilities::E_ALPHA_SEMANTIC alphaSemantic) const - { - if (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) - return m_blitDSLayout[EBT_COVERAGE_ADJUSTMENT]; - else - return m_blitDSLayout[EBT_REGULAR]; - } - - inline core::smart_refctd_ptr getDefaultKernelWeightsDescriptorSetLayout() const - { - return m_kernelWeightsDSLayout; - } - - inline core::smart_refctd_ptr 
getDefaultBlitPipelineLayout(const asset::IBlitUtilities::E_ALPHA_SEMANTIC alphaSemantic) const - { - if (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) - return m_blitPipelineLayout[EBT_COVERAGE_ADJUSTMENT]; - else - return m_blitPipelineLayout[EBT_REGULAR]; - } - - inline core::smart_refctd_ptr getDefaultCoverageAdjustmentPipelineLayout() const - { - return m_coverageAdjustmentPipelineLayout; - } - - // @param `alphaBinCount` is only required to size the histogram present in the default nbl_glsl_blit_AlphaStatistics_t in default_compute_common.comp - core::smart_refctd_ptr createAlphaTestSpecializedShader(const asset::IImage::E_TYPE inImageType, const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount); - - core::smart_refctd_ptr getAlphaTestPipeline(const uint32_t alphaBinCount, const asset::IImage::E_TYPE imageType) - { - const auto workgroupDims = getDefaultWorkgroupDims(imageType); - const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount); - - assert(paddedAlphaBinCount >= asset::IBlitUtilities::MinAlphaBinCount); - const auto pipelineIndex = (paddedAlphaBinCount / asset::IBlitUtilities::MinAlphaBinCount) - 1; - - if (m_alphaTestPipelines[pipelineIndex][imageType]) - return m_alphaTestPipelines[pipelineIndex][imageType]; - - auto specShader = createAlphaTestSpecializedShader(imageType, paddedAlphaBinCount); - m_alphaTestPipelines[pipelineIndex][imageType] = m_device->createComputePipeline(nullptr, core::smart_refctd_ptr(m_blitPipelineLayout[EBT_COVERAGE_ADJUSTMENT]), std::move(specShader)); - - return m_alphaTestPipelines[pipelineIndex][imageType]; - } - - // @param `outFormat` dictates encoding. - core::smart_refctd_ptr createNormalizationSpecializedShader(const asset::IImage::E_TYPE inImageType, const asset::E_FORMAT outFormat, - const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount); - - core::smart_refctd_ptr getNormalizationPipeline(const asset::IImage::E_TYPE imageType, const asset::E_FORMAT outFormat, - const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount) - { - const auto workgroupDims = getDefaultWorkgroupDims(imageType); - const uint32_t paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount); - const SNormalizationCacheKey key = { imageType, paddedAlphaBinCount, outFormat }; - - if (m_normalizationPipelines.find(key) == m_normalizationPipelines.end()) - { - auto specShader = createNormalizationSpecializedShader(imageType, outFormat, paddedAlphaBinCount); - m_normalizationPipelines[key] = m_device->createComputePipeline(nullptr, core::smart_refctd_ptr(m_blitPipelineLayout[EBT_COVERAGE_ADJUSTMENT]), std::move(specShader)); - } - - return m_normalizationPipelines[key]; - } - - template - core::smart_refctd_ptr createBlitSpecializedShader( - const asset::E_FORMAT outFormat, - const asset::IImage::E_TYPE imageType, - const core::vectorSIMDu32& inExtent, - const core::vectorSIMDu32& outExtent, - const asset::IBlitUtilities::E_ALPHA_SEMANTIC alphaSemantic, - const typename BlitUtilities::convolution_kernels_t& kernels, - const uint32_t workgroupSize = DefaultBlitWorkgroupSize, - const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount) - { - const auto workgroupDims = getDefaultWorkgroupDims(imageType); - const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount); - - std::ostringstream shaderSourceStream; - shaderSourceStream - << "#version 460 core\n" - << "#define _NBL_GLSL_WORKGROUP_SIZE_X_ " << 
workgroupSize << "\n" - << "#define _NBL_GLSL_WORKGROUP_SIZE_Y_ " << 1 << "\n" - << "#define _NBL_GLSL_WORKGROUP_SIZE_Z_ " << 1 << "\n" - << "#define _NBL_GLSL_BLIT_DIM_COUNT_ " << static_cast(imageType) + 1 << "\n" - << "#define _NBL_GLSL_BLIT_ALPHA_BIN_COUNT_ " << paddedAlphaBinCount << "\n"; - - const auto castedFormat = getOutImageViewFormat(outFormat); - const uint32_t outChannelCount = asset::getFormatChannelCount(outFormat); - - const char* glslFormatQualifier = asset::CGLSLCompiler::getStorageImageFormatQualifier(castedFormat); - - shaderSourceStream - << "#define _NBL_GLSL_BLIT_OUT_CHANNEL_COUNT_ " << outChannelCount << "\n" - << "#define _NBL_GLSL_BLIT_OUT_IMAGE_FORMAT_ " << glslFormatQualifier << "\n"; - - const core::vectorSIMDf minSupport(std::get<0>(kernels).getMinSupport(), std::get<1>(kernels).getMinSupport(), std::get<2>(kernels).getMinSupport()); - const core::vectorSIMDf maxSupport(std::get<0>(kernels).getMaxSupport(), std::get<1>(kernels).getMaxSupport(), std::get<2>(kernels).getMaxSupport()); - - const uint32_t smemFloatCount = m_availableSharedMemory / (sizeof(float) * outChannelCount); - shaderSourceStream << "#define _NBL_GLSL_BLIT_SMEM_FLOAT_COUNT_ " << smemFloatCount << "\n"; - - if (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) - shaderSourceStream << "#define _NBL_GLSL_BLIT_COVERAGE_SEMANTIC_\n"; - if (outFormat != castedFormat) - shaderSourceStream << "#define _NBL_GLSL_BLIT_SOFTWARE_ENCODE_FORMAT_ " << outFormat << "\n"; - shaderSourceStream << "#include \n"; - - auto cpuShader = core::make_smart_refctd_ptr(shaderSourceStream.str().c_str(), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_GLSL, "CComputeBlitGLSL::createBlitSpecializedShader"); - auto gpuUnspecShader = m_device->createShader(std::move(cpuShader)); - auto specShader = m_device->createSpecializedShader(gpuUnspecShader.get(), { nullptr, nullptr, "main" }); - - return specShader; - } - - template - core::smart_refctd_ptr getBlitPipeline( - const asset::E_FORMAT outFormat, - const asset::IImage::E_TYPE imageType, - const core::vectorSIMDu32& inExtent, - const core::vectorSIMDu32& outExtent, - const asset::IBlitUtilities::E_ALPHA_SEMANTIC alphaSemantic, - const typename BlitUtilities::convolution_kernels_t& kernels, - const uint32_t workgroupSize = DefaultBlitWorkgroupSize, - const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount) - { - const auto paddedAlphaBinCount = getPaddedAlphaBinCount(core::vectorSIMDu32(workgroupSize, 1, 1, 1), alphaBinCount); - - const SBlitCacheKey key = - { - .wgSize = workgroupSize, - .imageType = imageType, - .alphaBinCount = paddedAlphaBinCount, - .outFormat = outFormat, - .smemSize = m_availableSharedMemory, - .coverageAdjustment = (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) - }; - - if (m_blitPipelines.find(key) == m_blitPipelines.end()) - { - const auto blitType = (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) ? EBT_COVERAGE_ADJUSTMENT : EBT_REGULAR; - - auto specShader = createBlitSpecializedShader( - outFormat, - imageType, - inExtent, - outExtent, - alphaSemantic, - kernels, - workgroupSize, - paddedAlphaBinCount); - - m_blitPipelines[key] = m_device->createComputePipeline(nullptr, core::smart_refctd_ptr(m_blitPipelineLayout[blitType]), std::move(specShader)); - } - - return m_blitPipelines[key]; - } - - //! Returns the number of output texels produced by one workgroup, deciding factor is `m_availableSharedMemory`. - //! 
@param outImageFormat is the format of output (of the blit step) image. - //! If a normalization step is involved then this will be the same as the format of normalization step's input image --which may differ from the - //! final output format, because we blit to a higher precision format for normalization. - template - bool getOutputTexelsPerWorkGroup( - core::vectorSIMDu32& outputTexelsPerWG, - const core::vectorSIMDu32& inExtent, - const core::vectorSIMDu32& outExtent, - const asset::E_FORMAT outImageFormat, - const asset::IImage::E_TYPE imageType, - const typename BlitUtilities::convolution_kernels_t& kernels) - { - core::vectorSIMDf scale = static_cast(inExtent).preciseDivision(static_cast(outExtent)); - - const core::vectorSIMDf minSupport(std::get<0>(kernels).getMinSupport(), std::get<1>(kernels).getMinSupport(), std::get<2>(kernels).getMinSupport()); - const core::vectorSIMDf maxSupport(std::get<0>(kernels).getMaxSupport(), std::get<1>(kernels).getMaxSupport(), std::get<2>(kernels).getMaxSupport()); - - outputTexelsPerWG = core::vectorSIMDu32(1, 1, 1, 1); - size_t requiredSmem = getRequiredSharedMemorySize(outputTexelsPerWG, outExtent, imageType, minSupport, maxSupport, scale, asset::getFormatChannelCount(outImageFormat)); - bool failed = true; - asset::IImage::E_TYPE minDimAxes[3] = { asset::IImage::ET_1D, asset::IImage::ET_2D, asset::IImage::ET_3D }; - while (requiredSmem < m_availableSharedMemory) - { - failed = false; - - std::sort(minDimAxes, minDimAxes + imageType + 1, [&outputTexelsPerWG](const asset::IImage::E_TYPE a, const asset::IImage::E_TYPE b) -> bool { return outputTexelsPerWG[a] < outputTexelsPerWG[b]; }); - - int i = 0; - for (; i < imageType + 1; ++i) - { - const auto axis = minDimAxes[i]; - - core::vectorSIMDu32 delta(0, 0, 0, 0); - delta[axis] = 1; - - if (outputTexelsPerWG[axis] < outExtent[axis]) - { - // Note: we use outImageFormat's channel count as opposed to its image view's format because, even in the event that they are different, we blit - // as if we will be writing to a storage image of out - requiredSmem = getRequiredSharedMemorySize(outputTexelsPerWG + delta, outExtent, imageType, minSupport, maxSupport, scale, asset::getFormatChannelCount(outImageFormat)); - - if (requiredSmem <= m_availableSharedMemory) - { - outputTexelsPerWG += delta; - break; - } - } - } - if (i == imageType + 1) // If we cannot find any axis to increment outputTexelsPerWG along, then break - break; - } - - return !failed; - } - - template - inline void buildParameters( - nbl_glsl_blit_parameters_t& outPC, - const core::vectorSIMDu32& inImageExtent, - const core::vectorSIMDu32& outImageExtent, - const asset::IImage::E_TYPE imageType, - const asset::E_FORMAT inImageFormat, - const typename BlitUtilities::convolution_kernels_t& kernels, - const uint32_t layersToBlit = 1, - const float referenceAlpha = 0.f) - { - core::vectorSIMDu32 inDim(inImageExtent.x, inImageExtent.y, inImageExtent.z); - core::vectorSIMDu32 outDim(outImageExtent.x, outImageExtent.y, outImageExtent.z); - - if (imageType < asset::IImage::ET_3D) - { - inDim.z = layersToBlit; - outDim.z = layersToBlit; - } - - constexpr auto MaxImageDim = 1 << 16; - const auto maxImageDims = core::vectorSIMDu32(MaxImageDim, MaxImageDim, MaxImageDim, MaxImageDim); - assert((inDim.x < maxImageDims.x) && (inDim.y < maxImageDims.y) && (inDim.z < maxImageDims.z)); - assert((outDim.x < maxImageDims.x) && (outDim.y < maxImageDims.y) && (outDim.z < maxImageDims.z)); - - outPC.dims.x = (outDim.x << 16) | inDim.x; - outPC.dims.y = (outDim.y 
<< 16) | inDim.y; - outPC.dims.z = (outDim.z << 16) | inDim.z; - - core::vectorSIMDf scale = static_cast(inImageExtent).preciseDivision(static_cast(outImageExtent)); - - const core::vectorSIMDf minSupport(std::get<0>(kernels).getMinSupport(), std::get<1>(kernels).getMinSupport(), std::get<2>(kernels).getMinSupport()); - const core::vectorSIMDf maxSupport(std::get<0>(kernels).getMaxSupport(), std::get<1>(kernels).getMaxSupport(), std::get<2>(kernels).getMaxSupport()); - - core::vectorSIMDu32 outputTexelsPerWG; - getOutputTexelsPerWorkGroup(outputTexelsPerWG, inImageExtent, outImageExtent, inImageFormat, imageType, kernels); - const auto preloadRegion = getPreloadRegion(outputTexelsPerWG, imageType, minSupport, maxSupport, scale); - - outPC.secondScratchOffset = core::max(preloadRegion.x * preloadRegion.y * preloadRegion.z, outputTexelsPerWG.x * outputTexelsPerWG.y * preloadRegion.z); - outPC.iterationRegionXPrefixProducts = { outputTexelsPerWG.x, outputTexelsPerWG.x * preloadRegion.y, outputTexelsPerWG.x * preloadRegion.y * preloadRegion.z }; - outPC.referenceAlpha = referenceAlpha; - outPC.fScale = { scale.x, scale.y, scale.z }; - outPC.inPixelCount = inImageExtent.x * inImageExtent.y * inImageExtent.z; - outPC.negativeSupport.x = minSupport.x; outPC.negativeSupport.y = minSupport.y; outPC.negativeSupport.z = minSupport.z; - outPC.outPixelCount = outImageExtent.x * outImageExtent.y * outImageExtent.z; - - const core::vectorSIMDi32 windowDim = core::max(BlitUtilities::getWindowSize(imageType, kernels), core::vectorSIMDi32(1, 1, 1, 1)); - assert((windowDim.x < maxImageDims.x) && (windowDim.y < maxImageDims.y) && (windowDim.z < maxImageDims.z)); - - const core::vectorSIMDu32 phaseCount = asset::IBlitUtilities::getPhaseCount(inImageExtent, outImageExtent, imageType); - assert((phaseCount.x < maxImageDims.x) && (phaseCount.y < maxImageDims.y) && (phaseCount.z < maxImageDims.z)); - - outPC.windowDimPhaseCount.x = (phaseCount.x << 16) | windowDim.x; - outPC.windowDimPhaseCount.y = (phaseCount.y << 16) | windowDim.y; - outPC.windowDimPhaseCount.z = (phaseCount.z << 16) | windowDim.z; - - outPC.kernelWeightsOffsetY = phaseCount.x * windowDim.x; - outPC.iterationRegionYPrefixProducts = { outputTexelsPerWG.y, outputTexelsPerWG.y * outputTexelsPerWG.x, outputTexelsPerWG.y * outputTexelsPerWG.x * preloadRegion.z }; - outPC.kernelWeightsOffsetZ = outPC.kernelWeightsOffsetY + phaseCount.y * windowDim.y; - outPC.iterationRegionZPrefixProducts = { outputTexelsPerWG.y, outputTexelsPerWG.y * outputTexelsPerWG.x, outputTexelsPerWG.y * outputTexelsPerWG.x * outputTexelsPerWG.z }; - outPC.outputTexelsPerWGZ = outputTexelsPerWG.z; - outPC.preloadRegion = { preloadRegion.x, preloadRegion.y, preloadRegion.z }; - } - - static inline void buildAlphaTestDispatchInfo(dispatch_info_t& outDispatchInfo, const core::vectorSIMDu32& inImageExtent, const asset::IImage::E_TYPE imageType, const uint32_t layersToBlit = 1) - { - const core::vectorSIMDu32 workgroupDims = getDefaultWorkgroupDims(imageType); - const core::vectorSIMDu32 workgroupCount = (inImageExtent + workgroupDims - core::vectorSIMDu32(1u, 1u, 1u, 1u)) / workgroupDims; - - outDispatchInfo.wgCount[0] = workgroupCount.x; - outDispatchInfo.wgCount[1] = workgroupCount.y; - if (imageType < asset::IImage::ET_3D) - outDispatchInfo.wgCount[2] = layersToBlit; - else - outDispatchInfo.wgCount[2] = workgroupCount[2]; - } - - template - inline void buildBlitDispatchInfo( - dispatch_info_t& dispatchInfo, - const core::vectorSIMDu32& inImageExtent, - const core::vectorSIMDu32& 
outImageExtent, - const asset::E_FORMAT inImageFormat, - const asset::IImage::E_TYPE imageType, - const typename BlitUtilities::convolution_kernels_t& kernels, - const uint32_t workgroupSize = DefaultBlitWorkgroupSize, - const uint32_t layersToBlit = 1) - { - core::vectorSIMDu32 outputTexelsPerWG; - getOutputTexelsPerWorkGroup(outputTexelsPerWG, inImageExtent, outImageExtent, inImageFormat, imageType, kernels); - const auto wgCount = (outImageExtent + outputTexelsPerWG - core::vectorSIMDu32(1, 1, 1)) / core::vectorSIMDu32(outputTexelsPerWG.x, outputTexelsPerWG.y, outputTexelsPerWG.z, 1); - - dispatchInfo.wgCount[0] = wgCount[0]; - dispatchInfo.wgCount[1] = wgCount[1]; - if (imageType < asset::IImage::ET_3D) - dispatchInfo.wgCount[2] = layersToBlit; - else - dispatchInfo.wgCount[2] = wgCount[2]; - } - - static inline void buildNormalizationDispatchInfo(dispatch_info_t& outDispatchInfo, const core::vectorSIMDu32& outImageExtent, const asset::IImage::E_TYPE imageType, const uint32_t layersToBlit = 1) - { - const core::vectorSIMDu32 workgroupDims = getDefaultWorkgroupDims(imageType); - const core::vectorSIMDu32 workgroupCount = (outImageExtent + workgroupDims - core::vectorSIMDu32(1u, 1u, 1u, 1u)) / workgroupDims; - - outDispatchInfo.wgCount[0] = workgroupCount.x; - outDispatchInfo.wgCount[1] = workgroupCount.y; - if (imageType < asset::IImage::ET_3D) - outDispatchInfo.wgCount[2] = layersToBlit; - else - outDispatchInfo.wgCount[2] = workgroupCount[2]; - } - - static inline core::vectorSIMDu32 getDefaultWorkgroupDims(const asset::IImage::E_TYPE imageType) - { - switch (imageType) - { - case asset::IImage::ET_1D: - return core::vectorSIMDu32(256, 1, 1, 1); - case asset::IImage::ET_2D: - return core::vectorSIMDu32(16, 16, 1, 1); - case asset::IImage::ET_3D: - return core::vectorSIMDu32(8, 8, 4, 1); - default: - return core::vectorSIMDu32(1, 1, 1, 1); - } - } - - static inline size_t getCoverageAdjustmentScratchSize(const asset::IBlitUtilities::E_ALPHA_SEMANTIC alphaSemantic, const asset::IImage::E_TYPE imageType, const uint32_t alphaBinCount, const uint32_t layersToBlit) - { - if (alphaSemantic != asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) - return 0; - - const auto workgroupDims = getDefaultWorkgroupDims(imageType); - const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount); - const auto requiredSize = (sizeof(uint32_t) + paddedAlphaBinCount * sizeof(uint32_t)) * layersToBlit; - return requiredSize; - } - - bool updateDescriptorSet( - video::IGPUDescriptorSet* blitDS, - video::IGPUDescriptorSet* kernelWeightsDS, - core::smart_refctd_ptr inImageView, - core::smart_refctd_ptr outImageView, - core::smart_refctd_ptr coverageAdjustmentScratchBuffer, - core::smart_refctd_ptr kernelWeightsUTB, - const asset::ISampler::E_TEXTURE_CLAMP wrapU = asset::ISampler::ETC_CLAMP_TO_EDGE, - const asset::ISampler::E_TEXTURE_CLAMP wrapV = asset::ISampler::ETC_CLAMP_TO_EDGE, - const asset::ISampler::E_TEXTURE_CLAMP wrapW = asset::ISampler::ETC_CLAMP_TO_EDGE, - const asset::ISampler::E_TEXTURE_BORDER_COLOR borderColor = asset::ISampler::ETBC_FLOAT_OPAQUE_BLACK) - { - constexpr auto MAX_DESCRIPTOR_COUNT = 3; - - auto updateDS = [this, coverageAdjustmentScratchBuffer](video::IGPUDescriptorSet* ds, video::IGPUDescriptorSet::SDescriptorInfo* infos) -> bool - { - const auto bindingCount = ds->getLayout()->getTotalBindingCount(); - if ((bindingCount == 3) && !coverageAdjustmentScratchBuffer) - return false; - - video::IGPUDescriptorSet::SWriteDescriptorSet writes[MAX_DESCRIPTOR_COUNT] = 
{}; - - uint32_t infoIdx = 0; - uint32_t writeCount = 0; - for (uint32_t t = 0; t < static_cast(asset::IDescriptor::E_TYPE::ET_COUNT); ++t) - { - const auto type = static_cast(t); - const auto& redirect = ds->getLayout()->getDescriptorRedirect(type); - const auto declaredBindingCount = redirect.getBindingCount(); - - for (uint32_t i = 0; i < declaredBindingCount; ++i) - { - auto& write = writes[writeCount++]; - write.dstSet = ds; - write.binding = redirect.getBinding(IGPUDescriptorSetLayout::CBindingRedirect::storage_range_index_t{ i }).data; - write.arrayElement = 0u; - write.count = redirect.getCount(IGPUDescriptorSetLayout::CBindingRedirect::storage_range_index_t{ i }); - write.info = &infos[infoIdx]; - write.descriptorType = type; - - infoIdx += write.count; - } - } - assert(writeCount == bindingCount); - m_device->updateDescriptorSets(writeCount, writes, 0u, nullptr); - - return true; - }; - - if (blitDS) - { - if (!inImageView || !outImageView) - return false; - - video::IGPUDescriptorSet::SDescriptorInfo infos[MAX_DESCRIPTOR_COUNT] = {}; - - if (!samplers[wrapU][wrapV][wrapW][borderColor]) - { - video::IGPUSampler::SParams params = {}; - params.TextureWrapU = wrapU; - params.TextureWrapV = wrapV; - params.TextureWrapW = wrapW; - params.BorderColor = borderColor; - params.MinFilter = asset::ISampler::ETF_NEAREST; - params.MaxFilter = asset::ISampler::ETF_NEAREST; - params.MipmapMode = asset::ISampler::ESMM_NEAREST; - params.AnisotropicFilter = 0u; - params.CompareEnable = 0u; - params.CompareFunc = asset::ISampler::ECO_ALWAYS; - - samplers[wrapU][wrapV][wrapW][borderColor] = m_device->createSampler(params); - if (!samplers[wrapU][wrapV][wrapW][borderColor]) - return false; - } - - infos[0].desc = inImageView; - infos[0].info.image.imageLayout = asset::IImage::EL_SHADER_READ_ONLY_OPTIMAL; - infos[0].info.image.sampler = samplers[wrapU][wrapV][wrapW][borderColor]; - - infos[1].desc = outImageView; - infos[1].info.image.imageLayout = asset::IImage::EL_GENERAL; - infos[1].info.image.sampler = nullptr; - - if (coverageAdjustmentScratchBuffer) - { - infos[2].desc = coverageAdjustmentScratchBuffer; - infos[2].info.buffer.offset = 0; - infos[2].info.buffer.size = coverageAdjustmentScratchBuffer->getSize(); - } - - if (!updateDS(blitDS, infos)) - return false; - } - - if (kernelWeightsDS) - { - video::IGPUDescriptorSet::SDescriptorInfo info = {}; - info.desc = kernelWeightsUTB; - info.info.buffer.offset = 0ull; - info.info.buffer.size = kernelWeightsUTB->getUnderlyingBuffer()->getSize(); - - if (!updateDS(kernelWeightsDS, &info)) - return false; - } - - return true; - } - - //! User is responsible for the memory barriers between previous writes and the first - //! dispatch on the input image, and future reads of output image and the last dispatch. 
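        //! A minimal sketch of such a barrier, using the same pipelineBarrier API as the coverage-adjustment
        //! path further down; `cmdbuf` and `inImage` are stand-ins for the caller's command buffer and the image
        //! backing `inImageView`, and we assume the input was last written as a storage image (GENERAL layout)
        //! by another compute dispatch:
        //!   video::IGPUCommandBuffer::SImageMemoryBarrier inReady = {};
        //!   inReady.barrier.srcAccessMask = asset::EAF_SHADER_WRITE_BIT;
        //!   inReady.barrier.dstAccessMask = asset::EAF_SHADER_READ_BIT;
        //!   inReady.oldLayout = asset::IImage::EL_GENERAL;
        //!   inReady.newLayout = asset::IImage::EL_SHADER_READ_ONLY_OPTIMAL;
        //!   inReady.srcQueueFamilyIndex = ~0u;
        //!   inReady.dstQueueFamilyIndex = ~0u;
        //!   inReady.image = inImage;
        //!   inReady.subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT;
        //!   inReady.subresourceRange.levelCount = 1u;
        //!   inReady.subresourceRange.layerCount = layersToBlit;
        //!   cmdbuf->pipelineBarrier(asset::EPSF_COMPUTE_SHADER_BIT, asset::EPSF_COMPUTE_SHADER_BIT, asset::EDF_NONE, 0u, nullptr, 0u, nullptr, 1u, &inReady);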
- template - inline void blit( - video::IGPUCommandBuffer* cmdbuf, - const asset::IBlitUtilities::E_ALPHA_SEMANTIC alphaSemantic, - video::IGPUDescriptorSet* alphaTestDS, - video::IGPUComputePipeline* alphaTestPipeline, - video::IGPUDescriptorSet* blitDS, - video::IGPUDescriptorSet* blitWeightsDS, - video::IGPUComputePipeline* blitPipeline, - video::IGPUDescriptorSet* normalizationDS, - video::IGPUComputePipeline* normalizationPipeline, - const core::vectorSIMDu32& inImageExtent, - const asset::IImage::E_TYPE inImageType, - const asset::E_FORMAT inImageFormat, - core::smart_refctd_ptr normalizationInImage, - const typename BlitUtilities::convolution_kernels_t& kernels, - const uint32_t layersToBlit = 1, - core::smart_refctd_ptr coverageAdjustmentScratchBuffer = nullptr, - const float referenceAlpha = 0.f, - const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount, - const uint32_t workgroupSize = DefaultBlitWorkgroupSize) - { - const core::vectorSIMDu32 outImageExtent(normalizationInImage->getCreationParameters().extent.width, normalizationInImage->getCreationParameters().extent.height, normalizationInImage->getCreationParameters().extent.depth, 1u); - - nbl_glsl_blit_parameters_t pushConstants; - buildParameters(pushConstants, inImageExtent, outImageExtent, inImageType, inImageFormat, kernels, layersToBlit, referenceAlpha); - - if (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) - { - dispatch_info_t dispatchInfo; - buildAlphaTestDispatchInfo(dispatchInfo, inImageExtent, inImageType, layersToBlit); - - cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, alphaTestPipeline->getLayout(), 0u, 1u, &alphaTestDS); - cmdbuf->bindComputePipeline(alphaTestPipeline); - dispatchHelper(cmdbuf, alphaTestPipeline->getLayout(), pushConstants, dispatchInfo); - } - - { - dispatch_info_t dispatchInfo; - buildBlitDispatchInfo(dispatchInfo, inImageExtent, outImageExtent, inImageFormat, inImageType, kernels, workgroupSize, layersToBlit); - - video::IGPUDescriptorSet* ds_raw[] = { blitDS, blitWeightsDS }; - cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, blitPipeline->getLayout(), 0, 2, ds_raw); - cmdbuf->bindComputePipeline(blitPipeline); - dispatchHelper(cmdbuf, blitPipeline->getLayout(), pushConstants, dispatchInfo); - } - - // After this dispatch ends and finishes writing to outImage, normalize outImage - if (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) - { - dispatch_info_t dispatchInfo; - buildNormalizationDispatchInfo(dispatchInfo, outImageExtent, inImageType, layersToBlit); - - assert(coverageAdjustmentScratchBuffer); - - // Memory dependency to ensure the alpha test pass has finished writing to alphaTestCounterBuffer - video::IGPUCommandBuffer::SBufferMemoryBarrier alphaTestBarrier = {}; - alphaTestBarrier.barrier.srcAccessMask = asset::EAF_SHADER_WRITE_BIT; - alphaTestBarrier.barrier.dstAccessMask = asset::EAF_SHADER_READ_BIT; - alphaTestBarrier.srcQueueFamilyIndex = ~0u; - alphaTestBarrier.dstQueueFamilyIndex = ~0u; - alphaTestBarrier.buffer = coverageAdjustmentScratchBuffer; - alphaTestBarrier.size = coverageAdjustmentScratchBuffer->getSize(); - alphaTestBarrier.offset = 0; - - // Memory dependency to ensure that the previous compute pass has finished writing to the output image, - // also transitions the layout of said image: GENERAL -> SHADER_READ_ONLY_OPTIMAL - video::IGPUCommandBuffer::SImageMemoryBarrier readyForNorm = {}; - readyForNorm.barrier.srcAccessMask = asset::EAF_SHADER_WRITE_BIT; - readyForNorm.barrier.dstAccessMask = 
static_cast(asset::EAF_SHADER_READ_BIT); - readyForNorm.oldLayout = asset::IImage::EL_GENERAL; - readyForNorm.newLayout = asset::IImage::EL_SHADER_READ_ONLY_OPTIMAL; - readyForNorm.srcQueueFamilyIndex = ~0u; - readyForNorm.dstQueueFamilyIndex = ~0u; - readyForNorm.image = normalizationInImage; - readyForNorm.subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT; - readyForNorm.subresourceRange.levelCount = 1u; - readyForNorm.subresourceRange.layerCount = normalizationInImage->getCreationParameters().arrayLayers; - cmdbuf->pipelineBarrier(asset::EPSF_COMPUTE_SHADER_BIT, asset::EPSF_COMPUTE_SHADER_BIT, asset::EDF_NONE, 0u, nullptr, 1u, &alphaTestBarrier, 1u, &readyForNorm); - - cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, normalizationPipeline->getLayout(), 0u, 1u, &normalizationDS); - cmdbuf->bindComputePipeline(normalizationPipeline); - dispatchHelper(cmdbuf, normalizationPipeline->getLayout(), pushConstants, dispatchInfo); - } - } - - //! WARNING: This function blocks and stalls the GPU! - template - inline void blit(video::IGPUQueue* computeQueue, Args&&... args) - { - auto cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(), video::IGPUCommandPool::ECF_NONE); - core::smart_refctd_ptr cmdbuf; - m_device->createCommandBuffers(cmdpool.get(), video::IGPUCommandBuffer::EL_PRIMARY, 1u, &cmdbuf); - - auto fence = m_device->createFence(video::IGPUFence::ECF_UNSIGNALED); - - cmdbuf->begin(video::IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); - blit(cmdbuf.get(), std::forward(args)...); - cmdbuf->end(); - - video::IGPUQueue::SSubmitInfo submitInfo = {}; - submitInfo.commandBufferCount = 1u; - submitInfo.commandBuffers = &cmdbuf.get(); - computeQueue->submit(1u, &submitInfo, fence.get()); - - m_device->blockForFences(1u, &fence.get()); - } - - //! Returns the original format if supports STORAGE_IMAGE otherwise returns a format in its compat class which supports STORAGE_IMAGE. 
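        //! e.g. if EF_R8G8B8A8_SRGB lacks STORAGE_IMAGE support on the device, its 32-bit compat class maps to
        //! EF_R32_UINT (see getCompatClassFormat below) and the shaders then software-encode texels into the true
        //! output format. A caller sketch, with `blitFilter` and `outImage` standing in for the utility instance
        //! and the output image:
        //!   const auto viewFormat = blitFilter->getOutImageViewFormat(outImage->getCreationParameters().format);
        //!   if (viewFormat == asset::EF_UNKNOWN)
        //!       return false; // neither the format nor its compat class can be written as a storage image
        //!   // create the IGPUImageView bound as `outImg` with `viewFormat` rather than the original format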
- inline asset::E_FORMAT getOutImageViewFormat(const asset::E_FORMAT format) - { - const auto& formatUsages = m_device->getPhysicalDevice()->getImageFormatUsagesOptimalTiling()[format]; - - if (formatUsages.storageImage) - { - return format; - } - else - { - const asset::E_FORMAT compatFormat = getCompatClassFormat(format); - - const auto& compatClassFormatUsages = m_device->getPhysicalDevice()->getImageFormatUsagesOptimalTiling()[compatFormat]; - if (!compatClassFormatUsages.storageImage) - return asset::EF_UNKNOWN; - else - return compatFormat; - } - } - - static inline asset::E_FORMAT getCoverageAdjustmentIntermediateFormat(const asset::E_FORMAT format) - { - using namespace nbl::asset; - - switch (format) - { - case EF_R32G32B32A32_SFLOAT: - case EF_R16G16B16A16_SFLOAT: - case EF_R16G16B16A16_UNORM: - case EF_R16G16B16A16_SNORM: - return EF_R32G32B32A32_SFLOAT; - - case EF_R32G32_SFLOAT: - case EF_R16G16_SFLOAT: - case EF_R16G16_UNORM: - case EF_R16G16_SNORM: - return EF_R32G32_SFLOAT; - - case EF_B10G11R11_UFLOAT_PACK32: - return EF_R16G16B16A16_SFLOAT; - - case EF_R32_SFLOAT: - case EF_R16_SFLOAT: - case EF_R16_UNORM: - case EF_R16_SNORM: - return EF_R32_SFLOAT; - - case EF_A2B10G10R10_UNORM_PACK32: - case EF_R8G8B8A8_UNORM: - return EF_R16G16B16A16_UNORM; - - case EF_R8G8_UNORM: - return EF_R16G16_UNORM; - - case EF_R8_UNORM: - return EF_R16_UNORM; - - case EF_R8G8B8A8_SNORM: - return EF_R16G16B16A16_SNORM; - - case EF_R8G8_SNORM: - return EF_R16G16_SNORM; - - default: - return EF_UNKNOWN; - } - } - - private: - enum E_BLIT_TYPE : uint8_t - { - EBT_REGULAR = 0, - EBT_COVERAGE_ADJUSTMENT, - EBT_COUNT - }; - - core::smart_refctd_ptr m_blitDSLayout[EBT_COUNT]; - core::smart_refctd_ptr m_kernelWeightsDSLayout; - - core::smart_refctd_ptr m_blitPipelineLayout[EBT_COUNT]; - core::smart_refctd_ptr m_coverageAdjustmentPipelineLayout; - - core::smart_refctd_ptr m_alphaTestPipelines[asset::IBlitUtilities::MaxAlphaBinCount / asset::IBlitUtilities::MinAlphaBinCount][asset::IImage::ET_COUNT] = { nullptr }; - - struct SNormalizationCacheKey - { - asset::IImage::E_TYPE imageType; - uint32_t alphaBinCount; - asset::E_FORMAT outFormat; - - inline bool operator==(const SNormalizationCacheKey& other) const - { - return (imageType == other.imageType) && (alphaBinCount == other.alphaBinCount) && (outFormat == other.outFormat); - } - }; - struct SNormalizationCacheHash - { - inline size_t operator() (const SNormalizationCacheKey& key) const - { - return - std::hash{}(key.imageType) ^ - std::hash{}(key.alphaBinCount) ^ - std::hash{}(key.outFormat); - } - }; - core::unordered_map, SNormalizationCacheHash> m_normalizationPipelines; - - struct SBlitCacheKey - { - uint32_t wgSize; - asset::IImage::E_TYPE imageType; - uint32_t alphaBinCount; - asset::E_FORMAT outFormat; - uint32_t smemSize; - bool coverageAdjustment; - - inline bool operator==(const SBlitCacheKey& other) const - { - return (wgSize == other.wgSize) && (imageType == other.imageType) && (alphaBinCount == other.alphaBinCount) && (outFormat == other.outFormat) - && (smemSize == other.smemSize) && (coverageAdjustment == other.coverageAdjustment); - } - }; - struct SBlitCacheHash - { - inline size_t operator()(const SBlitCacheKey& key) const - { - return - std::hash{}(key.wgSize) ^ - std::hash{}(key.imageType) ^ - std::hash{}(key.alphaBinCount) ^ - std::hash{}(key.outFormat) ^ - std::hash{}(key.smemSize) ^ - std::hash{}(key.coverageAdjustment); - } - }; - core::unordered_map, SBlitCacheHash> m_blitPipelines; - - uint32_t m_availableSharedMemory; - 
core::smart_refctd_ptr m_device; - - core::smart_refctd_ptr samplers[video::IGPUSampler::ETC_COUNT][video::IGPUSampler::ETC_COUNT][video::IGPUSampler::ETC_COUNT][video::IGPUSampler::ETBC_COUNT] = { nullptr }; - - CComputeBlitGLSL(core::smart_refctd_ptr&& logicalDevice) : m_device(std::move(logicalDevice)) {} - - static inline void dispatchHelper(video::IGPUCommandBuffer* cmdbuf, const video::IGPUPipelineLayout* pipelineLayout, const nbl_glsl_blit_parameters_t& pushConstants, const dispatch_info_t& dispatchInfo) - { - cmdbuf->pushConstants(pipelineLayout, asset::IShader::ESS_COMPUTE, 0u, sizeof(nbl_glsl_blit_parameters_t), &pushConstants); - cmdbuf->dispatch(dispatchInfo.wgCount[0], dispatchInfo.wgCount[1], dispatchInfo.wgCount[2]); - } - - core::smart_refctd_ptr createDSLayout(const uint32_t descriptorCount, const asset::IDescriptor::E_TYPE* descriptorTypes, video::ILogicalDevice* logicalDevice) const - { - constexpr uint32_t MAX_DESCRIPTOR_COUNT = 5; - assert(descriptorCount < MAX_DESCRIPTOR_COUNT); - - video::IGPUDescriptorSetLayout::SBinding bindings[MAX_DESCRIPTOR_COUNT] = {}; - - for (uint32_t i = 0u; i < descriptorCount; ++i) - { - bindings[i].binding = i; - bindings[i].count = 1u; - bindings[i].stageFlags = asset::IShader::ESS_COMPUTE; - bindings[i].type = descriptorTypes[i]; - } - - auto dsLayout = logicalDevice->createDescriptorSetLayout(bindings, bindings + descriptorCount); - return dsLayout; - } - - static inline asset::E_FORMAT getCompatClassFormat(const asset::E_FORMAT format) - { - const asset::E_FORMAT_CLASS formatClass = asset::getFormatClass(format); - switch (formatClass) - { - case asset::EFC_8_BIT: - return asset::EF_R8_UINT; - case asset::EFC_16_BIT: - return asset::EF_R16_UINT; - case asset::EFC_32_BIT: - return asset::EF_R32_UINT; - case asset::EFC_64_BIT: - return asset::EF_R32G32_UINT; - case asset::EFC_128_BIT: - return asset::EF_R32G32B32A32_UINT; - default: - return asset::EF_UNKNOWN; - } - } - - //! This calculates the inclusive upper bound on the preload region i.e. it will be reachable for some cases. For the rest it will be bigger - //! by a pixel in each dimension. - //! Used to size the shared memory. - inline core::vectorSIMDu32 getPreloadRegion( - const core::vectorSIMDu32& outputTexelsPerWG, - const asset::IImage::E_TYPE imageType, - const core::vectorSIMDf& scaledMinSupport, - const core::vectorSIMDf& scaledMaxSupport, - const core::vectorSIMDf& scale) - { - core::vectorSIMDu32 preloadRegion = core::vectorSIMDu32(core::floor((scaledMaxSupport - scaledMinSupport) + core::vectorSIMDf(outputTexelsPerWG - core::vectorSIMDu32(1, 1, 1, 1)) * scale)) + core::vectorSIMDu32(1, 1, 1, 1); - - // Set the unused dimensions to 1 to avoid weird behaviours with scaled kernels - for (auto axis = imageType + 1; axis < 3u; ++axis) - preloadRegion[axis] = 1; - - return preloadRegion; - } - - //! Query shared memory size for a given `outputTexelsPerWG`. 
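        //! Worked example (hypothetical numbers): a 1D blit with a 2:1 downscale (scale.x = 2), a scaled kernel
        //! support spanning 2 input texels (scaledMaxSupport.x - scaledMinSupport.x = 2) and outputTexelsPerWG = (16,1,1)
        //! gives, via getPreloadRegion above, preloadRegion.x = floor(2 + 15*2) + 1 = 33, i.e. preloadRegion = (33,1,1);
        //! the function below then reports (max(33*1*1, 16*1*1) + 16*1*1) * 4 channels * sizeof(float) = 784 bytes.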
- size_t getRequiredSharedMemorySize( - const core::vectorSIMDu32& outputTexelsPerWG, - const core::vectorSIMDu32& outExtent, - const asset::IImage::E_TYPE imageType, - const core::vectorSIMDf& scaledMinSupport, - const core::vectorSIMDf& scaledMaxSupport, - const core::vectorSIMDf& scale, - const uint32_t channelCount) - { - const auto preloadRegion = getPreloadRegion(outputTexelsPerWG, imageType, scaledMinSupport, scaledMaxSupport, scale); - - const size_t requiredSmem = (core::max(preloadRegion.x * preloadRegion.y * preloadRegion.z, outputTexelsPerWG.x * outputTexelsPerWG.y * preloadRegion.z) + outputTexelsPerWG.x * preloadRegion.y * preloadRegion.z) * channelCount * sizeof(float); - return requiredSmem; - }; - - static inline uint32_t getPaddedAlphaBinCount(const core::vectorSIMDu32& workgroupDims, const uint32_t oldAlphaBinCount) - { - // For the normalization shader, it should be that: - // alphaBinCount = k*workGroupSize, k is integer, k >= 1, - assert(workgroupDims.x != 0 && workgroupDims.y != 0 && workgroupDims.z != 0); - const auto wgSize = workgroupDims.x * workgroupDims.y * workgroupDims.z; - const auto paddedAlphaBinCount = core::roundUp(oldAlphaBinCount, wgSize); - return paddedAlphaBinCount; - } - }; -} - -#endif - -#define _NBL_VIDEO_C_COMPUTE_BLIT_GLSL_H_INCLUDED_ -#endif diff --git a/src/nbl/video/utilities/CComputeBlitGLSL.cpp b/src/nbl/video/utilities/CComputeBlitGLSL.cpp deleted file mode 100644 index c67bf09b7f..0000000000 --- a/src/nbl/video/utilities/CComputeBlitGLSL.cpp +++ /dev/null @@ -1,57 +0,0 @@ -#include "nbl/video/utilities/CComputeBlitGLSL.h" - -using namespace nbl; -using namespace video; - -// TODO: Port to vk 1.3 or delete? -#if 0 -core::smart_refctd_ptr CComputeBlitGLSL::createAlphaTestSpecializedShader(const asset::IImage::E_TYPE imageType, const uint32_t alphaBinCount) -{ - const auto workgroupDims = getDefaultWorkgroupDims(imageType); - const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount); - - std::ostringstream shaderSourceStream; - shaderSourceStream - << "#version 460 core\n" - << "#define _NBL_GLSL_WORKGROUP_SIZE_X_ " << ((imageType >= asset::IImage::ET_1D) ? workgroupDims.x : 1) << "\n" - << "#define _NBL_GLSL_WORKGROUP_SIZE_Y_ " << ((imageType >= asset::IImage::ET_2D) ? workgroupDims.y : 1) << "\n" - << "#define _NBL_GLSL_WORKGROUP_SIZE_Z_ " << ((imageType >= asset::IImage::ET_3D) ? 
workgroupDims.z : 1) << "\n" - << "#define _NBL_GLSL_BLIT_DIM_COUNT_ " << static_cast(imageType) + 1 << "\n" - << "#define _NBL_GLSL_BLIT_ALPHA_BIN_COUNT_ " << paddedAlphaBinCount << "\n" - << "#include \n"; - - auto cpuShader = core::make_smart_refctd_ptr(shaderSourceStream.str().c_str(), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_GLSL, "CComputeBlitGLSLGLSL::createAlphaTestSpecializedShader"); - auto gpuUnspecShader = m_device->createShader(std::move(cpuShader)); - - return m_device->createSpecializedShader(gpuUnspecShader.get(), { nullptr, nullptr, "main" }); -} - -core::smart_refctd_ptr CComputeBlitGLSL::createNormalizationSpecializedShader(const asset::IImage::E_TYPE imageType, const asset::E_FORMAT outFormat, - const uint32_t alphaBinCount) -{ - const auto workgroupDims = getDefaultWorkgroupDims(imageType); - const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount); - - const auto castedFormat = getOutImageViewFormat(outFormat); - const char* glslFormatQualifier = asset::CGLSLCompiler::getStorageImageFormatQualifier(castedFormat); - - std::ostringstream shaderSourceStream; - shaderSourceStream - << "#version 460 core\n" - << "#define _NBL_GLSL_WORKGROUP_SIZE_X_ " << ((imageType >= asset::IImage::ET_1D) ? workgroupDims.x : 1) << "\n" - << "#define _NBL_GLSL_WORKGROUP_SIZE_Y_ " << ((imageType >= asset::IImage::ET_2D) ? workgroupDims.y : 1) << "\n" - << "#define _NBL_GLSL_WORKGROUP_SIZE_Z_ " << ((imageType >= asset::IImage::ET_3D) ? workgroupDims.z : 1) << "\n" - << "#define _NBL_GLSL_BLIT_DIM_COUNT_ " << static_cast(imageType) + 1 << "\n" - << "#define _NBL_GLSL_BLIT_ALPHA_BIN_COUNT_ " << paddedAlphaBinCount << "\n" - << "#define _NBL_GLSL_BLIT_NORMALIZATION_OUT_IMAGE_FORMAT_ " << glslFormatQualifier << "\n"; - if (outFormat != castedFormat) - shaderSourceStream << "#define _NBL_GLSL_BLIT_SOFTWARE_ENCODE_FORMAT_ " << outFormat << "\n"; - shaderSourceStream << "#include \n"; - - auto cpuShader = core::make_smart_refctd_ptr(shaderSourceStream.str().c_str(), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_GLSL, "CComputeBlitGLSL::createNormalizationSpecializedShader"); - auto gpuUnspecShader = m_device->createShader(std::move(cpuShader)); - - return m_device->createSpecializedShader(gpuUnspecShader.get(), { nullptr, nullptr, "main" }); -} - -#endif \ No newline at end of file From afcc423c0fd8786cbbf8b841f3bb856f6ca8e0b0 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Wed, 6 Nov 2024 17:23:04 -0300 Subject: [PATCH 137/358] Adding strip debug flag to optimizer --- include/nbl/asset/utils/ISPIRVOptimizer.h | 3 ++- src/nbl/asset/utils/ISPIRVOptimizer.cpp | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/nbl/asset/utils/ISPIRVOptimizer.h b/include/nbl/asset/utils/ISPIRVOptimizer.h index 542b60958e..796bc3db6a 100644 --- a/include/nbl/asset/utils/ISPIRVOptimizer.h +++ b/include/nbl/asset/utils/ISPIRVOptimizer.h @@ -24,7 +24,6 @@ class ISPIRVOptimizer final : public core::IReferenceCounted EOP_SIMPLIFICATION, EOP_VECTOR_DCE, EOP_DEAD_INSERT_ELIM, - EOP_AGGRESSIVE_DCE, EOP_DEAD_BRANCH_ELIM, EOP_BLOCK_MERGE, EOP_LOCAL_MULTI_STORE_ELIM, @@ -34,6 +33,8 @@ class ISPIRVOptimizer final : public core::IReferenceCounted EOP_REDUCE_LOAD_SIZE, EOP_STRENGTH_REDUCTION, EOP_IF_CONVERSION, + EOP_STRIP_DEBUG, + EOP_AGGRESSIVE_DCE, EOP_COUNT }; diff --git a/src/nbl/asset/utils/ISPIRVOptimizer.cpp b/src/nbl/asset/utils/ISPIRVOptimizer.cpp index b504bed7ff..4036774206 100644 --- 
a/src/nbl/asset/utils/ISPIRVOptimizer.cpp +++ b/src/nbl/asset/utils/ISPIRVOptimizer.cpp @@ -32,7 +32,6 @@ nbl::core::smart_refctd_ptr ISPIRVOptimizer::optimize(const uint32_t &spvtools::CreateSimplificationPass, &spvtools::CreateVectorDCEPass, &spvtools::CreateDeadInsertElimPass, - //&spvtools::CreateAggressiveDCEPass, &spvtools::CreateDeadBranchElimPass, &spvtools::CreateBlockMergePass, &spvtools::CreateLocalMultiStoreElimPass, @@ -41,7 +40,9 @@ nbl::core::smart_refctd_ptr ISPIRVOptimizer::optimize(const uint32_t &spvtools::CreateCCPPass, CreateReduceLoadSizePass, &spvtools::CreateStrengthReductionPass, - &spvtools::CreateIfConversionPass + &spvtools::CreateIfConversionPass, + &spvtools::CreateStripDebugInfoPass, + //&spvtools::CreateAggressiveDCEPass }; auto msgConsumer = [&logger](spv_message_level_t level, const char* src, const spv_position_t& pos, const char* msg) From 1380bd08b4149902d8f08dac125ccecde6b8fb19 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Wed, 6 Nov 2024 23:53:03 -0300 Subject: [PATCH 138/358] Small fix for SPIRV Optimizer + small fix to pass serialization --- include/nbl/asset/utils/ISPIRVOptimizer.h | 5 +++-- src/nbl/asset/utils/ISPIRVOptimizer.cpp | 13 +++++++++---- src/nbl/asset/utils/shaderCompiler_serialization.h | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/include/nbl/asset/utils/ISPIRVOptimizer.h b/include/nbl/asset/utils/ISPIRVOptimizer.h index 542b60958e..a18343ab1b 100644 --- a/include/nbl/asset/utils/ISPIRVOptimizer.h +++ b/include/nbl/asset/utils/ISPIRVOptimizer.h @@ -24,7 +24,6 @@ class ISPIRVOptimizer final : public core::IReferenceCounted EOP_SIMPLIFICATION, EOP_VECTOR_DCE, EOP_DEAD_INSERT_ELIM, - EOP_AGGRESSIVE_DCE, EOP_DEAD_BRANCH_ELIM, EOP_BLOCK_MERGE, EOP_LOCAL_MULTI_STORE_ELIM, @@ -34,11 +33,13 @@ class ISPIRVOptimizer final : public core::IReferenceCounted EOP_REDUCE_LOAD_SIZE, EOP_STRENGTH_REDUCTION, EOP_IF_CONVERSION, + EOP_STRIP_DEBUG_INFO, + EOP_AGGRESSIVE_DCE, EOP_COUNT }; - ISPIRVOptimizer(std::initializer_list _passes) : m_passes(std::move(_passes)) {} + ISPIRVOptimizer(core::vector&& _passes) : m_passes(_passes) {} core::smart_refctd_ptr optimize(const uint32_t* _spirv, uint32_t _dwordCount, system::logger_opt_ptr logger) const; core::smart_refctd_ptr optimize(const ICPUBuffer* _spirv, system::logger_opt_ptr logger) const; diff --git a/src/nbl/asset/utils/ISPIRVOptimizer.cpp b/src/nbl/asset/utils/ISPIRVOptimizer.cpp index b504bed7ff..2716bcd70f 100644 --- a/src/nbl/asset/utils/ISPIRVOptimizer.cpp +++ b/src/nbl/asset/utils/ISPIRVOptimizer.cpp @@ -7,7 +7,7 @@ using namespace nbl::asset; -static constexpr spv_target_env SPIRV_VERSION = spv_target_env::SPV_ENV_UNIVERSAL_1_5; +static constexpr spv_target_env SPIRV_VERSION = spv_target_env::SPV_ENV_UNIVERSAL_1_6; nbl::core::smart_refctd_ptr ISPIRVOptimizer::optimize(const uint32_t* _spirv, uint32_t _dwordCount, system::logger_opt_ptr logger) const { @@ -32,7 +32,6 @@ nbl::core::smart_refctd_ptr ISPIRVOptimizer::optimize(const uint32_t &spvtools::CreateSimplificationPass, &spvtools::CreateVectorDCEPass, &spvtools::CreateDeadInsertElimPass, - //&spvtools::CreateAggressiveDCEPass, &spvtools::CreateDeadBranchElimPass, &spvtools::CreateBlockMergePass, &spvtools::CreateLocalMultiStoreElimPass, @@ -41,7 +40,9 @@ nbl::core::smart_refctd_ptr ISPIRVOptimizer::optimize(const uint32_t &spvtools::CreateCCPPass, CreateReduceLoadSizePass, &spvtools::CreateStrengthReductionPass, - &spvtools::CreateIfConversionPass + &spvtools::CreateIfConversionPass, + 
&spvtools::CreateStripDebugInfoPass, + //&spvtools::CreateAggressiveDCEPass }; auto msgConsumer = [&logger](spv_message_level_t level, const char* src, const spv_position_t& pos, const char* msg) @@ -58,7 +59,11 @@ nbl::core::smart_refctd_ptr ISPIRVOptimizer::optimize(const uint32_t system::ILogger::ELL_DEBUG }; const auto lvl = lvl2lvl[level]; - const std::string location = src + ":"s + std::to_string(pos.line) + ":" + std::to_string(pos.column); + std::string location; + if (src) + location = src + ":"s + std::to_string(pos.line) + ":" + std::to_string(pos.column); + else + location = ""; logger.log(location, lvl, msg); }; diff --git a/src/nbl/asset/utils/shaderCompiler_serialization.h b/src/nbl/asset/utils/shaderCompiler_serialization.h index ca81df6408..6ad33a2ff5 100644 --- a/src/nbl/asset/utils/shaderCompiler_serialization.h +++ b/src/nbl/asset/utils/shaderCompiler_serialization.h @@ -50,7 +50,7 @@ inline void to_json(json& j, const ISPIRVOptimizer::E_OPTIMIZER_PASS& optPass) { uint32_t value = static_cast(optPass); j = json{ - { "optPass", optPass }, + { "optPass", value }, }; } From c7ea9662ad29b9f0edb204b132a581ef499d6b39 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Thu, 7 Nov 2024 00:25:36 -0300 Subject: [PATCH 139/358] Roll back changes to SPIRV optimizer so there's no merge conflicts later --- include/nbl/asset/utils/ISPIRVOptimizer.h | 3 +-- src/nbl/asset/utils/ISPIRVOptimizer.cpp | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/nbl/asset/utils/ISPIRVOptimizer.h b/include/nbl/asset/utils/ISPIRVOptimizer.h index 796bc3db6a..542b60958e 100644 --- a/include/nbl/asset/utils/ISPIRVOptimizer.h +++ b/include/nbl/asset/utils/ISPIRVOptimizer.h @@ -24,6 +24,7 @@ class ISPIRVOptimizer final : public core::IReferenceCounted EOP_SIMPLIFICATION, EOP_VECTOR_DCE, EOP_DEAD_INSERT_ELIM, + EOP_AGGRESSIVE_DCE, EOP_DEAD_BRANCH_ELIM, EOP_BLOCK_MERGE, EOP_LOCAL_MULTI_STORE_ELIM, @@ -33,8 +34,6 @@ class ISPIRVOptimizer final : public core::IReferenceCounted EOP_REDUCE_LOAD_SIZE, EOP_STRENGTH_REDUCTION, EOP_IF_CONVERSION, - EOP_STRIP_DEBUG, - EOP_AGGRESSIVE_DCE, EOP_COUNT }; diff --git a/src/nbl/asset/utils/ISPIRVOptimizer.cpp b/src/nbl/asset/utils/ISPIRVOptimizer.cpp index 4036774206..98cefe2ac6 100644 --- a/src/nbl/asset/utils/ISPIRVOptimizer.cpp +++ b/src/nbl/asset/utils/ISPIRVOptimizer.cpp @@ -32,6 +32,7 @@ nbl::core::smart_refctd_ptr ISPIRVOptimizer::optimize(const uint32_t &spvtools::CreateSimplificationPass, &spvtools::CreateVectorDCEPass, &spvtools::CreateDeadInsertElimPass, + //&spvtools::CreateAggressiveDCEPass &spvtools::CreateDeadBranchElimPass, &spvtools::CreateBlockMergePass, &spvtools::CreateLocalMultiStoreElimPass, @@ -41,8 +42,6 @@ nbl::core::smart_refctd_ptr ISPIRVOptimizer::optimize(const uint32_t CreateReduceLoadSizePass, &spvtools::CreateStrengthReductionPass, &spvtools::CreateIfConversionPass, - &spvtools::CreateStripDebugInfoPass, - //&spvtools::CreateAggressiveDCEPass }; auto msgConsumer = [&logger](spv_message_level_t level, const char* src, const spv_position_t& pos, const char* msg) From a383f5a2770edb1f8966306b3a107f1ad61c0bd1 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 7 Nov 2024 07:43:02 +0100 Subject: [PATCH 140/358] fix query pool typo --- examples_tests | 2 +- src/nbl/video/CVulkanLogicalDevice.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples_tests b/examples_tests index 8e66e10dcd..fee781b998 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 
8e66e10dcd7a456d4a5b7e34d0a6f89299668d21 +Subproject commit fee781b998e8c78e0f141de15e937d688cc930fb diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index c90ffb94a7..03f2c94494 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -1434,7 +1434,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createQueryPool_impl(co info.pipelineStatistics = CVulkanQueryPool::getVkPipelineStatisticsFlagsFrom(params.pipelineStatisticsFlags.value); VkQueryPool vk_queryPool = VK_NULL_HANDLE; - if (m_devf.vk.vkCreateQueryPool(m_vkdev,&info,nullptr,&vk_queryPool)!=VK_SUCCESS) + if (m_devf.vk.vkCreateQueryPool(m_vkdev,&info,nullptr,&vk_queryPool)==VK_SUCCESS) return core::make_smart_refctd_ptr(this,params,vk_queryPool); return nullptr; } From 9844019d26d24e215694c7a61a0d1c8015fffd63 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 7 Nov 2024 08:18:12 +0100 Subject: [PATCH 141/358] Note that `StackAllocation::size` returns what `sizeof(T[N])` would have, fix bugs Also add links to Vulkan spec about why we error out in `ILogicalDevice::writeAccelerationStructuresProperties` --- examples_tests | 2 +- include/nbl/video/ILogicalDevice.h | 12 +++++++----- src/nbl/video/CVulkanCommandBuffer.cpp | 4 ++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/examples_tests b/examples_tests index fee781b998..7e3b373a96 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit fee781b998e8c78e0f141de15e937d688cc930fb +Subproject commit 7e3b373a96a6ef512f82404f9b60545fe103a036 diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 20ebcce272..7c4c7cd264 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -567,17 +567,19 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return false; break; } + // https://vulkan.lunarg.com/doc/view/1.3.290.0/windows/1.3-extensions/vkspec.html#VUID-vkWriteAccelerationStructuresPropertiesKHR-accelerationStructureHostCommands-03585 if (!getEnabledFeatures().accelerationStructureHostCommands) { NBL_LOG_ERROR("Feature `acceleration structure` host commands is not enabled"); return false; } + // https://vulkan.lunarg.com/doc/view/1.3.290.0/windows/1.3-extensions/vkspec.html#VUID-vkWriteAccelerationStructuresPropertiesKHR-buffer-03733 for (const auto& as : accelerationStructures) - if (invalidAccelerationStructureForHostOperations(as)) - { - NBL_LOG_ERROR("Invalid acceleration structure for host operations"); - return false; - } + if (invalidAccelerationStructureForHostOperations(as)) + { + NBL_LOG_ERROR("Invalid acceleration structure for host operations"); + return false; + } // unfortunately cannot validate if they're built and if they're built with the right flags return writeAccelerationStructuresProperties_impl(accelerationStructures,type,data,stride); } diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index 5716653002..75a24a8a61 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -583,7 +583,7 @@ bool CVulkanCommandBuffer::writeAccelerationStructureProperties_impl(const std:: vk_accelerationStructures[i] = *reinterpret_cast(static_cast(pAccelerationStructures[i])->getNativeHandle()); getFunctionTable().vkCmdWriteAccelerationStructuresPropertiesKHR( - m_cmdbuf, vk_accelerationStructures.size(), vk_accelerationStructures.data(), + m_cmdbuf, 
vk_accelerationStructures.size()/sizeof(VkAccelerationStructureKHR), vk_accelerationStructures.data(), CVulkanQueryPool::getVkQueryTypeFrom(queryType), static_cast(queryPool)->getInternalObject(), firstQuery ); return true; @@ -643,7 +643,7 @@ bool CVulkanCommandBuffer::beginRenderPass_impl(const SRenderpassBeginInfo& info .renderArea = info.renderArea, // Implicitly but could be optimizedif needed // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkRenderPassBeginInfo.html#VUID-VkRenderPassBeginInfo-clearValueCount-00902 - .clearValueCount = vk_clearValues.size(), + .clearValueCount = vk_clearValues.size()/sizeof(VkClearValue), // Implicit // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkRenderPassBeginInfo.html#VUID-VkRenderPassBeginInfo-clearValueCount-04962 .pClearValues = vk_clearValues.data() From 886240ea7c8797f09ee4a03630a319e8de116f56 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 7 Nov 2024 08:50:47 +0100 Subject: [PATCH 142/358] require `KHR_ray_tracing_maintenance1` --- src/nbl/video/CVulkanPhysicalDevice.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index 541a600b03..b7bfe94974 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -703,6 +703,7 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart VkPhysicalDeviceShaderSubgroupUniformControlFlowFeaturesKHR subgroupUniformControlFlowFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SUBGROUP_UNIFORM_CONTROL_FLOW_FEATURES_KHR }; VkPhysicalDeviceRayTracingMotionBlurFeaturesNV rayTracingMotionBlurFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_TRACING_MOTION_BLUR_FEATURES_NV }; VkPhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR workgroupMemoryExplicitLayout = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_FEATURES_KHR }; + VkPhysicalDeviceRayTracingMaintenance1FeaturesKHR raytracingMaintenance1Features = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_TRACING_MAINTENANCE_1_FEATURES_KHR }; VkPhysicalDeviceRasterizationOrderAttachmentAccessFeaturesARM rasterizationOrderAttachmentAccessFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_FEATURES_ARM }; VkPhysicalDeviceColorWriteEnableFeaturesEXT colorWriteEnableFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COLOR_WRITE_ENABLE_FEATURES_EXT }; #if 0 @@ -712,8 +713,11 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart addToPNextChain(&conditionalRenderingFeatures); if (isExtensionSupported(VK_KHR_PERFORMANCE_QUERY_EXTENSION_NAME)) addToPNextChain(&performanceQueryFeatures); - if (isExtensionSupported(VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME)) + if (isExtensionSupported(VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME) && isExtensionSupported(VK_KHR_RAY_TRACING_MAINTENANCE_1_EXTENSION_NAME)) + { addToPNextChain(&accelerationStructureFeatures); + addToPNextChain(&raytracingMaintenance1Features); + } if (isExtensionSupported(VK_KHR_RAY_TRACING_PIPELINE_EXTENSION_NAME)) addToPNextChain(&rayTracingPipelineFeatures); if (isExtensionSupported(VK_NV_SHADER_SM_BUILTINS_EXTENSION_NAME)) @@ -1027,7 +1031,7 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart features.mixedAttachmentSamples = isExtensionSupported(VK_AMD_MIXED_ATTACHMENT_SAMPLES_EXTENSION_NAME) || isExtensionSupported(VK_NV_FRAMEBUFFER_MIXED_SAMPLES_EXTENSION_NAME); - if 
(isExtensionSupported(VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME)) + if (isExtensionSupported(VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME) && isExtensionSupported(VK_KHR_RAY_TRACING_MAINTENANCE_1_EXTENSION_NAME)) { features.accelerationStructure = accelerationStructureFeatures.accelerationStructure; features.accelerationStructureIndirectBuild = accelerationStructureFeatures.accelerationStructureIndirectBuild; @@ -1039,10 +1043,10 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart } } - if (isExtensionSupported(VK_KHR_RAY_TRACING_PIPELINE_EXTENSION_NAME)) + if (isExtensionSupported(VK_KHR_RAY_TRACING_PIPELINE_EXTENSION_NAME) && isExtensionSupported(VK_KHR_RAY_TRACING_MAINTENANCE_1_EXTENSION_NAME)) { features.rayTracingPipeline = rayTracingPipelineFeatures.rayTracingPipeline; - if (!rayTracingPipelineFeatures.rayTracingPipelineTraceRaysIndirect) + if (!rayTracingPipelineFeatures.rayTracingPipelineTraceRaysIndirect && !raytracingMaintenance1Features.rayTracingPipelineTraceRaysIndirect2) { logger.log("Not enumerating VkPhysicalDevice %p because it reports features contrary to Vulkan specification!", system::ILogger::ELL_INFO, vk_physicalDevice); return nullptr; @@ -1056,7 +1060,7 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart features.rayTraversalPrimitiveCulling = rayTracingPipelineFeatures.rayTraversalPrimitiveCulling; } - features.rayQuery = isExtensionSupported(VK_KHR_RAY_QUERY_EXTENSION_NAME); + features.rayQuery = isExtensionSupported(VK_KHR_RAY_QUERY_EXTENSION_NAME) && isExtensionSupported(VK_KHR_RAY_TRACING_MAINTENANCE_1_EXTENSION_NAME); if (isExtensionSupported(VK_NV_REPRESENTATIVE_FRAGMENT_TEST_EXTENSION_NAME)) features.representativeFragmentTest = representativeFragmentTestFeatures.representativeFragmentTest; @@ -1449,7 +1453,9 @@ core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevic enableExtensionIfAvailable(VK_EXT_SHADER_STENCIL_EXPORT_EXTENSION_NAME); VkPhysicalDeviceAccelerationStructureFeaturesKHR accelerationStructureFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_FEATURES_KHR,nullptr }; + VkPhysicalDeviceRayTracingMaintenance1FeaturesKHR rayTracingMaintenance1Features = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_TRACING_MAINTENANCE_1_FEATURES_KHR,nullptr }; REQUIRE_EXTENSION_IF(enabledFeatures.accelerationStructure,VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME,&accelerationStructureFeatures); // feature dependency taken care of + REQUIRE_EXTENSION_IF(enabledFeatures.accelerationStructure,VK_KHR_RAY_TRACING_MAINTENANCE_1_EXTENSION_NAME,&rayTracingMaintenance1Features); // feature dependency taken care of VkPhysicalDeviceRayTracingPipelineFeaturesKHR rayTracingPipelineFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_TRACING_PIPELINE_FEATURES_KHR,nullptr }; REQUIRE_EXTENSION_IF(enabledFeatures.rayTracingPipeline,VK_KHR_RAY_TRACING_PIPELINE_EXTENSION_NAME,&rayTracingPipelineFeatures); // feature dependency taken care of @@ -1797,6 +1803,10 @@ core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevic //workgroupMemoryExplicitLayoutFeatures [LIMIT SO ENABLE EVERYTHING BY DEFAULT] + // we require this extension if the base ones and their features are enabled + rayTracingMaintenance1Features.rayTracingMaintenance1 = accelerationStructureFeatures.accelerationStructure; + rayTracingMaintenance1Features.rayTracingPipelineTraceRaysIndirect2 = rayTracingPipelineFeatures.rayTracingPipelineTraceRaysIndirect; + rasterizationOrderAttachmentAccessFeatures.rasterizationOrderColorAttachmentAccess = 
enabledFeatures.rasterizationOrderColorAttachmentAccess; rasterizationOrderAttachmentAccessFeatures.rasterizationOrderDepthAttachmentAccess = enabledFeatures.rasterizationOrderDepthAttachmentAccess; rasterizationOrderAttachmentAccessFeatures.rasterizationOrderStencilAttachmentAccess = enabledFeatures.rasterizationOrderStencilAttachmentAccess; From 9d5fd07f1eb376be70f8ad1bfa02b29ef14f77a4 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 7 Nov 2024 15:01:21 +0700 Subject: [PATCH 143/358] fix for getAccelerationStructureBuildSizes for tlas creation --- src/nbl/video/CVulkanLogicalDevice.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 03f2c94494..cc96eb6e51 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -459,7 +459,7 @@ auto CVulkanLogicalDevice::getAccelerationStructureBuildSizes_impl( ) const -> AccelerationStructureBuildSizes { VkAccelerationStructureGeometryKHR geometry = {VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_KHR,nullptr,VK_GEOMETRY_TYPE_INSTANCES_KHR}; - geometry.geometry.instances = {}; + geometry.geometry.instances = { .sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_INSTANCES_DATA_KHR }; // no "geometry flags" are valid for all instances! geometry.flags = static_cast(0); From 6181735f36b06de1dbe4cd12afdbe9a1af7bd9b2 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 7 Nov 2024 09:16:18 +0100 Subject: [PATCH 144/358] add some logging of AS Build failures --- src/nbl/video/IGPUCommandBuffer.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index c5b79e1d93..f1ed3a9a73 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -585,21 +585,36 @@ uint32_t IGPUCommandBuffer::buildAccelerationStructures_common(const std::spangetObjectDebugName():"nullptr" + ); return false; + } resourcesToTrack += toAdd; totalGeometries += infos[i].inputCount(); } // infos array was empty if (resourcesToTrack==0u) + { + m_logger.log("Acceleration Structure Build Info span was empty!", system::ILogger::ELL_ERROR); return false; + } if (indirectBuffer) { if (!features.accelerationStructureIndirectBuild) + { + m_logger.log("Indirect Acceleration Structure Build Feature not enabled!", system::ILogger::ELL_ERROR); return false; + } resourcesToTrack++; } From 2fb4ec923cb63c602e5fc5f96da77685564d6553 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 7 Nov 2024 09:25:23 +0100 Subject: [PATCH 145/358] make sure the esitmated vertex alignment is PoT --- include/nbl/video/IGPUAccelerationStructure.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h index 8e451b7a8c..5bace85fe0 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -228,7 +228,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat const size_t vertexSize = asset::getTexelOrBlockBytesize(geometry.vertexFormat); // TODO: improve in line with the spec https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdBuildAccelerationStructuresKHR-pInfos-03711 - const size_t vertexAlignment = core::max(vertexSize/4u,1ull); + const size_t vertexAlignment = 
core::max(core::roundDownToPoT(vertexSize/asset::getFormatChannelCount(geometry.vertexFormat)),1ull); // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureGeometryTrianglesDataKHR-vertexStride-03735 if (!core::is_aligned_to(geometry.vertexStride,vertexAlignment)) return false; From f6fa65fc70ea4e4b616c6f87b766fad002e65336 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 7 Nov 2024 09:37:01 +0100 Subject: [PATCH 146/358] fix typos in `BuildInfo::invalidInputBuffer` validation --- include/nbl/video/IGPUAccelerationStructure.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h index 5bace85fe0..5237b5d6f1 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -81,13 +81,13 @@ class IGPUAccelerationStructure : public asset::IAccelerationStructure, public I // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdBuildAccelerationStructuresIndirectKHR-geometry-03673 static inline bool invalidInputBuffer(const asset::SBufferBinding& binding, const size_t byteOffset, const size_t count, const size_t elementSize, const size_t alignment) { - if (!binding.buffer || binding.offset+byteOffset+count*elementSizegetSize()) + if (!binding.buffer || binding.offset+byteOffset+count*elementSize>binding.buffer->getSize()) return true; if constexpr (std::is_same_v) { const auto deviceAddr = binding.buffer->getDeviceAddress(); - if (!deviceAddr==0ull || !core::is_aligned_to(deviceAddr,alignment)) + if (deviceAddr==0ull || !core::is_aligned_to(deviceAddr,alignment)) return true; if (!binding.buffer->getCreationParams().usage.hasFlags(IGPUBuffer::E_USAGE_FLAGS::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT)) From 455715ed46941d7e86291148b45a41e40afdd281 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 7 Nov 2024 10:02:17 +0100 Subject: [PATCH 147/358] they call me the typo king --- src/nbl/video/CVulkanAccelerationStructure.h | 2 +- src/nbl/video/CVulkanCommandBuffer.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nbl/video/CVulkanAccelerationStructure.h b/src/nbl/video/CVulkanAccelerationStructure.h index ad0608bc78..69df8b6e3b 100644 --- a/src/nbl/video/CVulkanAccelerationStructure.h +++ b/src/nbl/video/CVulkanAccelerationStructure.h @@ -205,7 +205,7 @@ inline VkAccelerationStructureBuildGeometryInfoKHR getVkASBuildGeometryInfo(cons vk_info.type = IsTLAS ? VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR:VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR; vk_info.flags = getVkASBuildFlagsFrom(info.buildFlags,info.dstAS); vk_info.mode = info.isUpdate ? VK_BUILD_ACCELERATION_STRUCTURE_MODE_UPDATE_KHR:VK_BUILD_ACCELERATION_STRUCTURE_MODE_BUILD_KHR; - vk_info.srcAccelerationStructure = static_cast*>(info.srcAS)->getInternalObject(); + vk_info.srcAccelerationStructure = info.srcAS ? 
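	// note: srcAccelerationStructure is only read for VK_BUILD_ACCELERATION_STRUCTURE_MODE_UPDATE_KHR,
	// so on a fresh build `info.srcAS` may legitimately be nullptr and must translate to VK_NULL_HANDLE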
static_cast*>(info.srcAS)->getInternalObject():VK_NULL_HANDLE; vk_info.dstAccelerationStructure = static_cast*>(info.dstAS)->getInternalObject(); vk_info.geometryCount = info.inputCount(); vk_info.pGeometries = p_vk_geometry; diff --git a/src/nbl/video/CVulkanCommandBuffer.h b/src/nbl/video/CVulkanCommandBuffer.h index f5bd57264e..8f83b28850 100644 --- a/src/nbl/video/CVulkanCommandBuffer.h +++ b/src/nbl/video/CVulkanCommandBuffer.h @@ -118,7 +118,7 @@ class CVulkanCommandBuffer final : public IGPUCommandBuffer auto out_vk_geoms = vk_geometries.data(); for (auto i=0u; i(infos[i],out_vk_geoms,out_vk_vertexMotions); + vk_buildGeomsInfos[i] = getVkASBuildGeometryInfo(infos[i],out_vk_geoms,out_vk_vertexMotions); getFunctionTable().vkCmdBuildAccelerationStructuresKHR(m_cmdbuf,infoCount,vk_buildGeomsInfos.data(),vk_ppBuildRangeInfos); return true; @@ -170,7 +170,7 @@ class CVulkanCommandBuffer final : public IGPUCommandBuffer auto out_vk_geoms = vk_geometries.data(); for (auto i=0u; i(infos[i],out_vk_geoms,out_vk_vertexMotions); + vk_buildGeomsInfos[i] = getVkASBuildGeometryInfo(infos[i],out_vk_geoms,out_vk_vertexMotions); indirectDeviceAddresses[i] = baseIndirectAddress+pIndirectOffsets[i]; } getFunctionTable().vkCmdBuildAccelerationStructuresIndirectKHR(m_cmdbuf,infoCount,vk_buildGeomsInfos.data(),indirectDeviceAddresses.data(),pIndirectStrides,ppMaxPrimitiveOrInstanceCounts); From 189cfaecc64a1f45887e9a37f2acf5c2a6b98e0a Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 7 Nov 2024 16:57:58 +0700 Subject: [PATCH 148/358] fixed typo --- src/nbl/video/IGPUAccelerationStructure.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/IGPUAccelerationStructure.cpp b/src/nbl/video/IGPUAccelerationStructure.cpp index b366f66515..eafbe08d6f 100644 --- a/src/nbl/video/IGPUAccelerationStructure.cpp +++ b/src/nbl/video/IGPUAccelerationStructure.cpp @@ -34,7 +34,7 @@ bool IGPUAccelerationStructure::BuildInfo::invalid(const IGPUAcceler if constexpr (std::is_same_v) { // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdBuildAccelerationStructuresIndirectKHR-pInfos-03674 - if (scratch.buffer->getCreationParams().usage.hasFlags(IGPUBuffer::EUF_STORAGE_BUFFER_BIT)) + if (!scratch.buffer->getCreationParams().usage.hasFlags(IGPUBuffer::EUF_STORAGE_BUFFER_BIT)) return true; const auto scratchAddress = scratch.buffer->getDeviceAddress(); // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdBuildAccelerationStructuresIndirectKHR-pInfos-03802 From 7ed9c5b03609fc5488b3d00ec271477c6dca8d18 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 7 Nov 2024 14:46:50 +0100 Subject: [PATCH 149/358] add a util function to compute blit, and update examples_tests (pipelien barriers written) --- examples_tests | 2 +- include/nbl/video/utilities/CComputeBlit.h | 51 ++----- src/nbl/video/utilities/CComputeBlit.cpp | 167 ++------------------- 3 files changed, 19 insertions(+), 201 deletions(-) diff --git a/examples_tests b/examples_tests index 1585888070..54e0ab15f3 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 15858880704a7a8f641e2cd792a41708f617c3e3 +Subproject commit 54e0ab15f3ee5734be10d73ca236322013797a2e diff --git a/include/nbl/video/utilities/CComputeBlit.h b/include/nbl/video/utilities/CComputeBlit.h index 4180ac4204..f7775ef3d6 100644 --- a/include/nbl/video/utilities/CComputeBlit.h +++ b/include/nbl/video/utilities/CComputeBlit.h @@ -134,10 +134,15 @@ class CComputeBlit : public 
core::IReferenceCounted // the absolute minimum needed to store a single pixel of a worst case format (precise, all 4 channels) constexpr auto singlePixelStorage = 4*sizeof(hlsl::float32_t); constexpr auto ratio = singlePixelStorage/sizeof(uint16_t); - const auto paddedAlphaBinCount = core::min(core::roundUp(baseBucketCount,workgroupSize),workgroupSize*ratio); + // atomicAdd gets performed on MSB or LSB of a single DWORD + const auto paddedAlphaBinCount = core::min(core::roundUp(baseBucketCount,workgroupSize*2),workgroupSize*ratio); return paddedAlphaBinCount*layersToBlit; } - + + static inline uint32_t getNormalizationByteSize(const uint16_t workgroupSize, const asset::E_FORMAT intermediateAlpha, const uint32_t layersToBlit) + { + return getAlphaBinCount(workgroupSize,intermediateAlpha,layersToBlit)*sizeof(uint16_t)+sizeof(uint32_t)+sizeof(uint32_t); + } #if 0 //! Returns the number of output texels produced by one workgroup, deciding factor is `m_availableSharedMemory`. @@ -337,19 +342,14 @@ class CComputeBlit : public core::IReferenceCounted { dispatch_info_t dispatchInfo; buildAlphaTestDispatchInfo(dispatchInfo, inImageExtent, inImageType, layersToBlit); - - cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, alphaTestPipeline->getLayout(), 0u, 1u, &alphaTestDS); - cmdbuf->bindComputePipeline(alphaTestPipeline); +// bind omitted dispatchHelper(cmdbuf, alphaTestPipeline->getLayout(), pushConstants, dispatchInfo); } { dispatch_info_t dispatchInfo; buildBlitDispatchInfo(dispatchInfo, inImageExtent, outImageExtent, inImageFormat, inImageType, kernels, workgroupSize, layersToBlit); - - video::IGPUDescriptorSet* ds_raw[] = { blitDS, blitWeightsDS }; - cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, blitPipeline->getLayout(), 0, 2, ds_raw); - cmdbuf->bindComputePipeline(blitPipeline); +// bind omitted dispatchHelper(cmdbuf, blitPipeline->getLayout(), pushConstants, dispatchInfo); } @@ -359,39 +359,6 @@ class CComputeBlit : public core::IReferenceCounted dispatch_info_t dispatchInfo; buildNormalizationDispatchInfo(dispatchInfo, outImageExtent, inImageType, layersToBlit); - assert(coverageAdjustmentScratchBuffer); - IGPUCommandBuffer::SPipelineBarrierDependencyInfo depInfo; - // Memory dependency to ensure the alpha test pass has finished writing to alphaTestCounterBuffer - video::IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t alphaTestBarrier = {}; - alphaTestBarrier.barrier.dep.srcStageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; - alphaTestBarrier.barrier.dep.srcAccessMask = asset::ACCESS_FLAGS::SHADER_WRITE_BITS; - alphaTestBarrier.barrier.dep.dstStageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; - alphaTestBarrier.barrier.dep.dstAccessMask = asset::ACCESS_FLAGS::SHADER_READ_BITS; - alphaTestBarrier.range.buffer = coverageAdjustmentScratchBuffer; - alphaTestBarrier.range.size = coverageAdjustmentScratchBuffer->getSize(); - alphaTestBarrier.range.offset = 0; - - // Memory dependency to ensure that the previous compute pass has finished writing to the output image, - // also transitions the layout of said image: GENERAL -> SHADER_READ_ONLY_OPTIMAL - video::IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t readyForNorm = {}; - readyForNorm.barrier.dep.srcStageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; - readyForNorm.barrier.dep.srcAccessMask = asset::ACCESS_FLAGS::SHADER_WRITE_BITS; - readyForNorm.barrier.dep.dstStageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; - readyForNorm.barrier.dep.dstAccessMask = 
asset::ACCESS_FLAGS::SHADER_READ_BITS; - readyForNorm.oldLayout = video::IGPUImage::LAYOUT::GENERAL; - readyForNorm.newLayout = video::IGPUImage::LAYOUT::READ_ONLY_OPTIMAL; - readyForNorm.image = normalizationInImage.get(); - readyForNorm.subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT; - readyForNorm.subresourceRange.levelCount = 1u; - readyForNorm.subresourceRange.layerCount = normalizationInImage->getCreationParameters().arrayLayers; - - depInfo.bufBarriers = { &alphaTestBarrier, &alphaTestBarrier + 1 }; - depInfo.imgBarriers = { &readyForNorm, &readyForNorm + 1 }; - - cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, depInfo); - - cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, normalizationPipeline->getLayout(), 0u, 1u, &normalizationDS); - cmdbuf->bindComputePipeline(normalizationPipeline); dispatchHelper(cmdbuf, normalizationPipeline->getLayout(), pushConstants, dispatchInfo); } } diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp index c3ceb66675..38b9479350 100644 --- a/src/nbl/video/utilities/CComputeBlit.cpp +++ b/src/nbl/video/utilities/CComputeBlit.cpp @@ -50,12 +50,12 @@ using namespace nbl::hlsl; struct ConstevalParameters { -NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkGroupSize = )===" << retval.workgroupSize << R"===(; -using kernel_weight_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.kernelWeights,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(; -using input_sampler_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.samplers,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(; -using input_image_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.inputs,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(; -using output_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.outputs,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(; -NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedMemoryDWORDs = )===" << (sharedMemoryPerInvocation* retval.workgroupSize)/sizeof(uint32_t) << R"===(; + NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkGroupSize = )===" << retval.workgroupSize << R"===(; + using kernel_weight_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.kernelWeights,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(; + using input_sampler_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.samplers,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(; + using input_image_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.inputs,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(; + using output_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.outputs,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(; + NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedMemoryDWORDs = )===" << (sharedMemoryPerInvocation* retval.workgroupSize)/sizeof(uint32_t) << R"===(; }; )==="; return tmp.str(); @@ -135,30 +135,7 @@ core::smart_refctd_ptr createBlitSpecializedShader( const uint32_t smemFloatCount = m_availableSharedMemory / (sizeof(float) * outChannelCount); const uint32_t blitDimCount = static_cast(imageType) + 1; - - std::ostringstream shaderSourceStream; - shaderSourceStream - << "#include \"nbl/builtin/hlsl/blit/common.hlsl\"\n" - "#include \"nbl/builtin/hlsl/blit/parameters.hlsl\"\n" - "#include \"nbl/builtin/hlsl/blit/compute_blit.hlsl\"\n"; - - shaderSourceStream - << "typedef 
nbl::hlsl::blit::consteval_parameters_t<" << workgroupSize << ", 1, 1, " << smemFloatCount << ", " - << outChannelCount << ", " << blitDimCount << ", " << paddedAlphaBinCount << "> ceval_params_t;\n"; - - shaderSourceStream - << "[[vk::combinedImageSampler]] [[vk::binding(0, 0)]]\n" - "nbl::hlsl::blit::impl::dim_to_image_properties::combined_sampler_t inCS;\n" - "[[vk::combinedImageSampler]] [[vk::binding(0, 0)]]\n" - "SamplerState inSamp;\n" - - "[[vk::image_format(\""<< formatQualifier << "\")]]\n" - "[[vk::binding(1, 0)]]\n" - "nbl::hlsl::blit::impl::dim_to_image_properties::image_t outImg;\n" - - "[[vk::binding(0, 1)]] Buffer kernelWeights;\n" - "[[vk::push_constant]] nbl::hlsl::blit::parameters_t params;" - "groupshared float32_t sMem[" << m_availableSharedMemory / sizeof(float) << "];\n"; +....... if (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) { @@ -184,58 +161,6 @@ core::smart_refctd_ptr createBlitSpecializedShader( " InCSAccessor inCSA; OutImgAccessor outImgA; KernelWeightsAccessor kwA; HistogramAccessor hA; SharedAccessor sA;\n" " blit.execute(inCSA, outImgA, kwA, hA, sA, workGroupID, localInvocationIndex);\n" "}\n"; - - auto cpuShader = core::make_smart_refctd_ptr(shaderSourceStream.str().c_str(), IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, IGPUShader::E_SHADER_STAGE::E_CONTENT_TYPE::ECT_HLSL, "CComputeBlit::createBlitSpecializedShader"); - auto gpuShader = m_device->createShader(std::move(cpuShader.get())); - - return gpuShader; -} - -template -core::smart_refctd_ptr getBlitPipeline( - const asset::E_FORMAT outFormat, - const asset::IImage::E_TYPE imageType, - const core::vectorSIMDu32& inExtent, - const core::vectorSIMDu32& outExtent, - const asset::IBlitUtilities::E_ALPHA_SEMANTIC alphaSemantic, - const typename BlitUtilities::convolution_kernels_t& kernels, - const uint32_t workgroupSize = 256, - const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount) -{ - const auto paddedAlphaBinCount = getPaddedAlphaBinCount(core::vectorSIMDu32(workgroupSize, 1, 1, 1), alphaBinCount); - - const SBlitCacheKey key = - { - .wgSize = workgroupSize, - .imageType = imageType, - .alphaBinCount = paddedAlphaBinCount, - .outFormat = outFormat, - .smemSize = m_availableSharedMemory, - .coverageAdjustment = (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) - }; - - if (m_blitPipelines.find(key) == m_blitPipelines.end()) - { - const auto blitType = (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) ? 
EBT_COVERAGE_ADJUSTMENT : EBT_REGULAR; - - auto specShader = createBlitSpecializedShader( - outFormat, - imageType, - inExtent, - outExtent, - alphaSemantic, - kernels, - workgroupSize, - paddedAlphaBinCount); - - IGPUComputePipeline::SCreationParams creationParams; - creationParams.shader.shader = specShader.get(); - creationParams.shader.entryPoint = "main"; - creationParams.layout = m_blitPipelineLayout[blitType].get(); - m_device->createComputePipelines(nullptr, { &creationParams, &creationParams + 1 }, &m_blitPipelines[key]); - } - - return m_blitPipelines[key]; } core::smart_refctd_ptr CComputeBlit::createAlphaTestSpecializedShader(const asset::IImage::E_TYPE imageType, const uint32_t alphaBinCount) @@ -244,21 +169,7 @@ core::smart_refctd_ptr CComputeBlit::createAlphaTestSpecializ const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount); const uint32_t blitDimCount = static_cast(imageType) + 1; - std::ostringstream shaderSourceStream; - - shaderSourceStream - << "#include \"nbl/builtin/hlsl/blit/common.hlsl\"\n" - "#include \"nbl/builtin/hlsl/blit/parameters.hlsl\"\n" - "#include \"nbl/builtin/hlsl/blit/alpha_test.hlsl\"\n" - - "typedef nbl::hlsl::blit::consteval_parameters_t<" << workgroupDims.x << ", " << workgroupDims.y << ", " << workgroupDims.z << ", " - "0, 0, " << blitDimCount << ", " << paddedAlphaBinCount << "> ceval_params_t;\n" - - "[[vk::binding(0, 0)]]\n" - "nbl::hlsl::blit::impl::dim_to_image_properties::combined_sampler_t inCS;\n" - - "[[vk::binding(2 , 0)]] RWStructuredBuffer statsBuff;\n" - "[[vk::push_constant]] nbl::hlsl::blit::parameters_t params;" +........ "struct PassedPixelsAccessor { void atomicAdd(uint32_t wgID, uint32_t v) { InterlockedAdd(statsBuff[wgID * (ceval_params_t::AlphaBinCount + 1) + ceval_params_t::AlphaBinCount], v); } };\n" "struct InCSAccessor { float32_t4 get(int32_t3 c, uint32_t l) { return inCS[nbl::hlsl::blit::impl::dim_to_image_properties::getIndexCoord(c, l)]; } };\n" @@ -269,29 +180,6 @@ core::smart_refctd_ptr CComputeBlit::createAlphaTestSpecializ " InCSAccessor inCSA;PassedPixelsAccessor ppA;\n" " nbl::hlsl::blit::alpha_test(ppA, inCSA, params.inputDims, params.referenceAlpha, globalInvocationID, workGroupID);\n" "}\n"; - - auto cpuShader = core::make_smart_refctd_ptr(shaderSourceStream.str().c_str(), IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, IGPUShader::E_CONTENT_TYPE::ECT_HLSL, "CComputeBlitGLSLGLSL::createAlphaTestSpecializedShader"); -} - -core::smart_refctd_ptr getAlphaTestPipeline(const uint32_t alphaBinCount, const asset::IImage::E_TYPE imageType) -{ - const auto workgroupDims = getDefaultWorkgroupDims(imageType); - const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount); - - assert(paddedAlphaBinCount >= asset::IBlitUtilities::MinAlphaBinCount); - const auto pipelineIndex = (paddedAlphaBinCount / asset::IBlitUtilities::MinAlphaBinCount) - 1; - - if (m_alphaTestPipelines[pipelineIndex][imageType]) - return m_alphaTestPipelines[pipelineIndex][imageType]; - - auto specShader = createAlphaTestSpecializedShader(imageType, paddedAlphaBinCount); - IGPUComputePipeline::SCreationParams creationParams; - creationParams.shader.shader = specShader.get(); - creationParams.shader.entryPoint = "main"; - creationParams.layout = m_blitPipelineLayout[EBT_COVERAGE_ADJUSTMENT].get(); - assert(m_device->createComputePipelines(nullptr, { &creationParams, &creationParams + 1 }, &m_alphaTestPipelines[pipelineIndex][imageType])); - - return m_alphaTestPipelines[pipelineIndex][imageType]; } // 
@param `outFormat` dictates encoding. @@ -301,22 +189,7 @@ core::smart_refctd_ptr CComputeBlit::createNormalizationSpeci const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount); const uint32_t blitDimCount = static_cast(imageType) + 1; - std::ostringstream shaderSourceStream; - - shaderSourceStream - << "#include \"nbl/builtin/hlsl/blit/common.hlsl\"\n" - "#include \"nbl/builtin/hlsl/blit/parameters.hlsl\"\n" - "#include \"nbl/builtin/hlsl/blit/normalization.hlsl\"\n" - - "typedef nbl::hlsl::blit::consteval_parameters_t<" << workgroupDims.x << ", " << workgroupDims.y << ", " << workgroupDims.z << ", " - "0, 0, " << blitDimCount << ", " << paddedAlphaBinCount << "> ceval_params_t;\n" - - "[[vk::binding(0, 0)]]\n" - "nbl::hlsl::blit::impl::dim_to_image_properties::combined_sampler_t inCS;\n" - - "[[vk::image_format(\"unknown\")]]\n" - "[[vk::binding(1, 0)]]\n" - "nbl::hlsl::blit::impl::dim_to_image_properties::image_t outImg;\n" +.... "[[vk::binding(2 , 0)]] RWStructuredBuffer statsBuff;\n" "[[vk::push_constant]] nbl::hlsl::blit::parameters_t params;" @@ -335,27 +208,5 @@ core::smart_refctd_ptr CComputeBlit::createNormalizationSpeci " InCSAccessor inCSA; OutImgAccessor outImgA; HistogramAccessor hA; PassedPixelsAccessor ppA; SharedAccessor sA;\n" " blit.execute(inCSA, outImgA, hA, ppA, sA, workGroupID, globalInvocationID, localInvocationIndex);\n" "}\n"; - - auto cpuShader = core::make_smart_refctd_ptr(shaderSourceStream.str().c_str(), IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, IGPUShader::E_CONTENT_TYPE::ECT_HLSL, "CComputeBlitGLSL::createNormalizationSpecializedShader"); -} - -core::smart_refctd_ptr getNormalizationPipeline(const asset::IImage::E_TYPE imageType, const asset::E_FORMAT outFormat, - const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount) -{ - const auto workgroupDims = getDefaultWorkgroupDims(imageType); - const uint32_t paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount); - const SNormalizationCacheKey key = { imageType, paddedAlphaBinCount, outFormat }; - - if (m_normalizationPipelines.find(key) == m_normalizationPipelines.end()) - { - auto specShader = createNormalizationSpecializedShader(imageType, outFormat, paddedAlphaBinCount); - IGPUComputePipeline::SCreationParams creationParams; - creationParams.shader.shader = specShader.get(); - creationParams.shader.entryPoint = "main"; - creationParams.layout = m_blitPipelineLayout[EBT_COVERAGE_ADJUSTMENT].get(); - assert(m_device->createComputePipelines(nullptr, { &creationParams, &creationParams + 1 }, &m_normalizationPipelines[key])); - } - - return m_normalizationPipelines[key]; } #endif \ No newline at end of file From c2459bb31724022613c411a96dca5b075bd3dfa5 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 7 Nov 2024 17:00:10 +0100 Subject: [PATCH 150/358] actually test using concepts --- include/nbl/builtin/hlsl/blit/common.hlsl | 38 ++++++++--- .../builtin/hlsl/blit/default_blit.comp.hlsl | 39 ++++++++--- include/nbl/builtin/hlsl/concepts.hlsl | 7 +- include/nbl/builtin/hlsl/concepts/__end.hlsl | 38 +++++++++++ .../concepts/anisotropically_sampled.hlsl | 47 +++++++++++++ .../builtin/hlsl/concepts/loadable_image.hlsl | 66 +++++++++++++++++++ .../nbl/builtin/hlsl/concepts/mip_mapped.hlsl | 44 +++++++++++++ .../builtin/hlsl/concepts/storable_image.hlsl | 44 +++++++++++++ src/nbl/builtin/CMakeLists.txt | 6 ++ 9 files changed, 306 insertions(+), 23 deletions(-) create mode 100644 include/nbl/builtin/hlsl/concepts/__end.hlsl create mode 100644 
include/nbl/builtin/hlsl/concepts/anisotropically_sampled.hlsl create mode 100644 include/nbl/builtin/hlsl/concepts/loadable_image.hlsl create mode 100644 include/nbl/builtin/hlsl/concepts/mip_mapped.hlsl create mode 100644 include/nbl/builtin/hlsl/concepts/storable_image.hlsl diff --git a/include/nbl/builtin/hlsl/blit/common.hlsl b/include/nbl/builtin/hlsl/blit/common.hlsl index 93ed579312..fe1019134b 100644 --- a/include/nbl/builtin/hlsl/blit/common.hlsl +++ b/include/nbl/builtin/hlsl/blit/common.hlsl @@ -43,6 +43,9 @@ RWTexture3D outAs3D[ConstevalParameters::output_binding_t::Count]; groupshared uint32_t sMem[ConstevalParameters::SharedMemoryDWORDs]; + + +#include /* struct HistogramAccessor { @@ -62,19 +65,34 @@ struct SharedAccessor sMem[idx] = val; } }; -struct InCSAccessor -{ - float32_t4 get(float32_t3 c, uint32_t l) - { - return inCS.SampleLevel(inSamp, blit::impl::dim_to_image_properties::getIndexCoord(c, l), 0); - } -}; +*/ + struct OutImgAccessor { - void set(int32_t3 c, uint32_t l, float32_t4 v) + template) + void set(const vector uv, uint16_t layer, const vector data) { - outImg[blit::impl::dim_to_image_properties::getIndexCoord(c, l)] = v; + return __set_impl(uv,layer,data); } + + template + void __set_impl(const vector uv, uint16_t layer, const float32_t4 data); + + uint32_t descIx; }; -*/ +template<> +void OutImgAccessor::__set_impl<1>(const uint16_t1 uv, uint16_t layer, const float32_t4 data) +{ + outAs1DArray[descIx][uint32_t2(uv,layer)] = data; +} +template<> +void OutImgAccessor::__set_impl<2>(const uint16_t2 uv, uint16_t layer, const float32_t4 data) +{ + outAs2DArray[descIx][uint32_t3(uv,layer)] = data; +} +template<> +void OutImgAccessor::__set_impl<3>(const uint16_t3 uv, uint16_t layer, const float32_t4 data) +{ + outAs3D[descIx][uv] = data; +} #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl b/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl index c9184d0164..b9ed374873 100644 --- a/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl +++ b/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl @@ -21,21 +21,38 @@ struct KernelWeightsAccessor return kernelWeights[idx]; } }; -struct InCSAccessor + +inCS.SampleLevel(inSamp, blit::impl::dim_to_image_properties::getIndexCoord(c, l), 0); +*/ +struct InImgAccessor { - float32_t4 get(float32_t3 c, uint32_t l) + template) + vector data get(const vector uv, uint16_t layer, uint16_t level) { - return inCS.SampleLevel(inSamp, blit::impl::dim_to_image_properties::getIndexCoord(c, l), 0); + return __get_impl(uv,_static_cast(layer),_static_cast(level),data); } + + template + float32_t4 __get_impl(const vector uv, float layer, float level); + + uint32_t descIx : 20; + uint32_t samplerIx : 12; }; -struct OutImgAccessor +template<> +float32_t4 InImgAccessor::__get_impl<1>(const float32_t1 uv, float layer, float level) { - void set(int32_t3 c, uint32_t l, float32_t4 v) - { - outImg[blit::impl::dim_to_image_properties::getIndexCoord(c, l)] = v; - } -}; -*/ + return inAs1DArray[descIx].SampleLevel(inSamp[samplerIx],float32_t2(uv,layer),level); +} +template<> +float32_t4 InImgAccessor::__get_impl<2>(const float32_t2 uv, float layer, float level) +{ + return inAs2DArray[descIx].SampleLevel(inSamp[samplerIx],float32_t3(uv,layer),level); +} +template<> +float32_t4 InImgAccessor::__get_impl<3>(const float32_t3 uv, float layer, float level) +{ + return inAs3D[descIx].SampleLevel(inSamp[samplerIx],uv,level); +} using namespace nbl::hlsl::blit; @@ -44,6 +61,8 @@ using namespace 
nbl::hlsl::blit; [numthreads(ConstevalParameters::WorkGroupSize,1,1)] void main() { + InImgAccessor inImgA; + OutImgAccessor outImgA; /* blit::compute_blit_t blit = blit::compute_blit_t::create(params); InCSAccessor inCSA; diff --git a/include/nbl/builtin/hlsl/concepts.hlsl b/include/nbl/builtin/hlsl/concepts.hlsl index 0aa1af7b55..29660b8b45 100644 --- a/include/nbl/builtin/hlsl/concepts.hlsl +++ b/include/nbl/builtin/hlsl/concepts.hlsl @@ -17,6 +17,7 @@ namespace hlsl namespace concepts { // common implementation juice +#define NBL_CONCEPT_MAX_PARAM_COUNT 32 #include #define NBL_IMPL_CONCEPT_FULL_TPLT(z, n, unused) BOOST_PP_SEQ_ELEM(n,NBL_CONCEPT_TPLT_PRM_KINDS) BOOST_PP_SEQ_ELEM(n,NBL_CONCEPT_TPLT_PRM_NAMES) #include @@ -34,7 +35,7 @@ namespace concepts //! Now diverge -#ifndef __cpp_concepts +#ifdef __cpp_concepts // to define a concept using `concept Name = SomeContexprBoolCondition;` @@ -138,10 +139,10 @@ concept matricial = is_matrix::value; // put just after the closing `>` on the partial template specialization `template` declaration e.g. `template NBL_PARTIAL_REQ_TOP(SomeCond) #define NBL_PARTIAL_REQ_TOP(...) // put just before closing `>` on the partial template specialization Type args, e.g. `MyStruct)> -#define NBL_PARTIAL_REQ_BOT(...) ,std::enable_if_t<(__VA_ARGS__),void> +#define NBL_PARTIAL_REQ_BOT(...) ,::nbl::hlsl::enable_if_t<(__VA_ARGS__),void> // condition, use instead of the closing `>` of a function template -#define NBL_FUNC_REQUIRES(...) ,std::enable_if_t<(__VA_ARGS__),bool> = true> +#define NBL_FUNC_REQUIRES(...) ,::nbl::hlsl::enable_if_t<(__VA_ARGS__),bool> = true> // diff --git a/include/nbl/builtin/hlsl/concepts/__end.hlsl b/include/nbl/builtin/hlsl/concepts/__end.hlsl new file mode 100644 index 0000000000..9a2e2bff50 --- /dev/null +++ b/include/nbl/builtin/hlsl/concepts/__end.hlsl @@ -0,0 +1,38 @@ +// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#undef NBL_CONCEPT_PARAM_31 +#undef NBL_CONCEPT_PARAM_30 +#undef NBL_CONCEPT_PARAM_29 +#undef NBL_CONCEPT_PARAM_28 +#undef NBL_CONCEPT_PARAM_27 +#undef NBL_CONCEPT_PARAM_26 +#undef NBL_CONCEPT_PARAM_25 +#undef NBL_CONCEPT_PARAM_24 +#undef NBL_CONCEPT_PARAM_23 +#undef NBL_CONCEPT_PARAM_22 +#undef NBL_CONCEPT_PARAM_21 +#undef NBL_CONCEPT_PARAM_20 +#undef NBL_CONCEPT_PARAM_19 +#undef NBL_CONCEPT_PARAM_18 +#undef NBL_CONCEPT_PARAM_17 +#undef NBL_CONCEPT_PARAM_16 +#undef NBL_CONCEPT_PARAM_15 +#undef NBL_CONCEPT_PARAM_14 +#undef NBL_CONCEPT_PARAM_13 +#undef NBL_CONCEPT_PARAM_12 +#undef NBL_CONCEPT_PARAM_11 +#undef NBL_CONCEPT_PARAM_10 +#undef NBL_CONCEPT_PARAM_9 +#undef NBL_CONCEPT_PARAM_8 +#undef NBL_CONCEPT_PARAM_7 +#undef NBL_CONCEPT_PARAM_6 +#undef NBL_CONCEPT_PARAM_5 +#undef NBL_CONCEPT_PARAM_4 +#undef NBL_CONCEPT_PARAM_3 +#undef NBL_CONCEPT_PARAM_2 +#undef NBL_CONCEPT_PARAM_1 +#undef NBL_CONCEPT_PARAM_0 +#undef NBL_CONCEPT_TPLT_PRM_NAMES +#undef NBL_CONCEPT_TPLT_PRM_KINDS +#undef NBL_CONCEPT_NAME \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/concepts/anisotropically_sampled.hlsl b/include/nbl/builtin/hlsl/concepts/anisotropically_sampled.hlsl new file mode 100644 index 0000000000..b04d1786d8 --- /dev/null +++ b/include/nbl/builtin/hlsl/concepts/anisotropically_sampled.hlsl @@ -0,0 +1,47 @@ +// Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ANISOTROPICALLY_SAMPLED_INCLUDED_
+#define _NBL_BUILTIN_HLSL_CONCEPTS_ANISOTROPICALLY_SAMPLED_INCLUDED_
+
+
+#include 
+
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace concepts
+{
+// declare concept
+#define NBL_CONCEPT_NAME AnisotropicallySampled
+#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(int32_t)
+#define NBL_CONCEPT_TPLT_PRM_NAMES (U)(Dims)
+// not the greatest syntax but works
+#define NBL_CONCEPT_PARAM_0 (a,U)
+#define NBL_CONCEPT_PARAM_1 (uv,vector)
+#define NBL_CONCEPT_PARAM_2 (layer,uint16_t)
+#define NBL_CONCEPT_PARAM_3 (dU,vector)
+#define NBL_CONCEPT_PARAM_4 (dV,vector)
+// start concept
+NBL_CONCEPT_BEGIN(5)
+// need to be defined AFTER the cocnept begins
+#define a NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0
+#define uv NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1
+#define layer NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2
+#define dU NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_3
+#define dV NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_4
+NBL_CONCEPT_END(
+ ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((a.template get(uv,layer,dU,dV)) , ::nbl::hlsl::is_same_v, float32_t4>))
+);
+#undef dV
+#undef dU
+#undef layer
+#undef uv
+#undef a
+#include 
+}
+}
+}
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/concepts/loadable_image.hlsl b/include/nbl/builtin/hlsl/concepts/loadable_image.hlsl
new file mode 100644
index 0000000000..2f32ba48d1
--- /dev/null
+++ b/include/nbl/builtin/hlsl/concepts/loadable_image.hlsl
@@ -0,0 +1,66 @@
+// Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_LOADABLE_IMAGE_INCLUDED_
+#define _NBL_BUILTIN_HLSL_CONCEPTS_LOADABLE_IMAGE_INCLUDED_
+
+
+#include 
+
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace concepts
+{
+// declare concept
+#define NBL_CONCEPT_NAME StorableImage
+#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(int32_t)
+#define NBL_CONCEPT_TPLT_PRM_NAMES (U)(T)(Dims)
+// not the greatest syntax but works
+#define NBL_CONCEPT_PARAM_0 (a,U)
+#define NBL_CONCEPT_PARAM_1 (uv,vector)
+#define NBL_CONCEPT_PARAM_2 (layer,uint16_t)
+// start concept
+NBL_CONCEPT_BEGIN(3)
+// need to be defined AFTER the cocnept begins
+#define a NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0
+#define uv NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1
+#define layer NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2
+NBL_CONCEPT_END(
+ ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((a.template get(uv,layer)) , ::nbl::hlsl::is_same_v, vector))
+);
+#undef layer
+#undef uv
+#undef a
+#include 
+
+// declare concept
+#define NBL_CONCEPT_NAME MipmappedStorableImage
+#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(int32_t)
+#define NBL_CONCEPT_TPLT_PRM_NAMES (U)(T)(Dims)
+// not the greatest syntax but works
+#define NBL_CONCEPT_PARAM_0 (a,U)
+#define NBL_CONCEPT_PARAM_1 (uv,vector)
+#define NBL_CONCEPT_PARAM_2 (layer,uint16_t)
+#define NBL_CONCEPT_PARAM_3 (level,uint16_t)
+// start concept
+NBL_CONCEPT_BEGIN(4)
+// need to be defined AFTER the cocnept begins
+#define a NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0
+#define uv NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1
+#define layer NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2
+#define level NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_3
+NBL_CONCEPT_END(
+ ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((a.template get(uv,layer,level)) , ::nbl::hlsl::is_same_v, vector))
+);
+#undef level
+#undef layer
+#undef uv
+#undef a
+#include 
+}
+}
+}
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/concepts/mip_mapped.hlsl b/include/nbl/builtin/hlsl/concepts/mip_mapped.hlsl
new file mode 100644
index 0000000000..5aa9a7da88
--- /dev/null
+++ b/include/nbl/builtin/hlsl/concepts/mip_mapped.hlsl
@@ -0,0 +1,44 @@
+// Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_MIP_MAPPED_INCLUDED_
+#define _NBL_BUILTIN_HLSL_CONCEPTS_MIP_MAPPED_INCLUDED_
+
+
+#include 
+
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace concepts
+{
+// declare concept
+#define NBL_CONCEPT_NAME MipMapped
+#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(int32_t)
+#define NBL_CONCEPT_TPLT_PRM_NAMES (U)(Dims)
+// not the greatest syntax but works
+#define NBL_CONCEPT_PARAM_0 (a,U)
+#define NBL_CONCEPT_PARAM_1 (uv,vector)
+#define NBL_CONCEPT_PARAM_2 (layer,uint16_t)
+#define NBL_CONCEPT_PARAM_3 (level,float)
+// start concept
+NBL_CONCEPT_BEGIN(4)
+// need to be defined AFTER the cocnept begins
+#define a NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0
+#define uv NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1
+#define layer NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2
+#define level NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_3
+NBL_CONCEPT_END(
+ ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((a.template get(uv,layer,level)) , ::nbl::hlsl::is_same_v, float32_t4>))
+);
+#undef level
+#undef layer
+#undef uv
+#undef a
+#include 
+}
+}
+}
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/concepts/storable_image.hlsl b/include/nbl/builtin/hlsl/concepts/storable_image.hlsl
new file mode 100644
index 0000000000..a46d211b82
--- /dev/null
+++ b/include/nbl/builtin/hlsl/concepts/storable_image.hlsl
@@ -0,0 +1,44 @@
+// Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_STORABLE_IMAGE_INCLUDED_ +#define _NBL_BUILTIN_HLSL_CONCEPTS_STORABLE_IMAGE_INCLUDED_ + + +#include + + +namespace nbl +{ +namespace hlsl +{ +namespace concepts +{ +// declare concept +#define NBL_CONCEPT_NAME StorableImage +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(int32_t) +#define NBL_CONCEPT_TPLT_PRM_NAMES (U)(T)(Dims) +// not the greatest syntax but works +#define NBL_CONCEPT_PARAM_0 (a,U) +#define NBL_CONCEPT_PARAM_1 (uv,vector) +#define NBL_CONCEPT_PARAM_2 (layer,uint16_t) +#define NBL_CONCEPT_PARAM_3 (data,vector) +// start concept +NBL_CONCEPT_BEGIN(4) +// need to be defined AFTER the cocnept begins +#define a NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define uv NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define layer NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +#define data NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_3 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR)(a.template set(uv,layer,data))) +); +#undef data +#undef layer +#undef uv +#undef a +#include +} +} +} +#endif \ No newline at end of file diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index 4dbd039b54..1ec29c4e8b 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -323,5 +323,11 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/memory_accessor.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/enums.hlsl") # LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/binding_info.hlsl") +# +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/__end.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/anisotropically_sampled.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/loadable_image.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/mip_mapped.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/storable_image.hlsl") ADD_CUSTOM_BUILTIN_RESOURCES(nblBuiltinResourceData NBL_RESOURCES_TO_EMBED "${NBL_ROOT_PATH}/include" "nbl/builtin" "nbl::builtin" "${NBL_ROOT_PATH_BINARY}/include" "${NBL_ROOT_PATH_BINARY}/src" "STATIC" "INTERNAL") From 0fac0cb73ce73deeb26830169448cd6dfcd4be9f Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 7 Nov 2024 17:04:02 +0100 Subject: [PATCH 151/358] aaaand the typos --- include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl b/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl index b9ed374873..1a519b8730 100644 --- a/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl +++ b/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl @@ -27,9 +27,9 @@ inCS.SampleLevel(inSamp, blit::impl::dim_to_image_properties) - vector data get(const vector uv, uint16_t layer, uint16_t level) + vector get(const vector uv, uint16_t layer, uint16_t level) { - return __get_impl(uv,_static_cast(layer),_static_cast(level),data); + return __get_impl(uv,_static_cast(layer),_static_cast(level)); } template From 25d1b7f9aa13b50fa31ee39438ef53394fff4b92 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Thu, 7 Nov 2024 16:48:45 -0300 Subject: [PATCH 152/358] Span change --- include/nbl/asset/utils/ISPIRVOptimizer.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/nbl/asset/utils/ISPIRVOptimizer.h b/include/nbl/asset/utils/ISPIRVOptimizer.h index a18343ab1b..f1eb340d8f 100644 --- a/include/nbl/asset/utils/ISPIRVOptimizer.h +++ 
b/include/nbl/asset/utils/ISPIRVOptimizer.h @@ -39,7 +39,9 @@ class ISPIRVOptimizer final : public core::IReferenceCounted EOP_COUNT }; - ISPIRVOptimizer(core::vector&& _passes) : m_passes(_passes) {} + ISPIRVOptimizer(std::span _passes) : m_passes(_passes.size()) { + std::copy(_passes.begin(), _passes.end(), m_passes.begin()); + } core::smart_refctd_ptr optimize(const uint32_t* _spirv, uint32_t _dwordCount, system::logger_opt_ptr logger) const; core::smart_refctd_ptr optimize(const ICPUBuffer* _spirv, system::logger_opt_ptr logger) const; From 9ec93a91b82af0e6847fa8f1bbe90b2a375994e8 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Thu, 7 Nov 2024 17:12:28 -0300 Subject: [PATCH 153/358] Span change to SPIRV Optimizer --- include/nbl/asset/utils/ISPIRVOptimizer.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/nbl/asset/utils/ISPIRVOptimizer.h b/include/nbl/asset/utils/ISPIRVOptimizer.h index f1eb340d8f..da14a3c917 100644 --- a/include/nbl/asset/utils/ISPIRVOptimizer.h +++ b/include/nbl/asset/utils/ISPIRVOptimizer.h @@ -39,9 +39,7 @@ class ISPIRVOptimizer final : public core::IReferenceCounted EOP_COUNT }; - ISPIRVOptimizer(std::span _passes) : m_passes(_passes.size()) { - std::copy(_passes.begin(), _passes.end(), m_passes.begin()); - } + ISPIRVOptimizer(std::span _passes) : m_passes(_passes.begin(), _passes.end()) {} core::smart_refctd_ptr optimize(const uint32_t* _spirv, uint32_t _dwordCount, system::logger_opt_ptr logger) const; core::smart_refctd_ptr optimize(const ICPUBuffer* _spirv, system::logger_opt_ptr logger) const; From 65cd93958d4a33e4297edf5bbaef02e4482e0299 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Thu, 7 Nov 2024 17:32:59 +0330 Subject: [PATCH 154/358] video: fix vkspec doc links Signed-off-by: Ali Cheraghi --- src/nbl/video/IGPUCommandBuffer.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index f1ed3a9a73..a690020bb2 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -46,7 +46,7 @@ bool IGPUCommandBuffer::checkStateBeforeRecording(const core::bitflag flags, const SInheritanceInfo* inheritanceInfo) { // Using Vulkan 1.2 VUIDs here because we don't want to confuse ourselves with Dynamic Rendering being core - // https://vulkan.lunarg.com/doc/view/1.2.176.1/linux/1.2-extensions/vkspec.html#VUID-vkBeginCommandBuffer-commandBuffer-00049 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkBeginCommandBuffer-commandBuffer-00049 if (m_state == STATE::RECORDING || m_state == STATE::PENDING) { m_logger.log("Failed to begin command buffer: command buffer must not be in RECORDING or PENDING state.", system::ILogger::ELL_ERROR); @@ -57,7 +57,7 @@ bool IGPUCommandBuffer::begin(const core::bitflag flags, const SInheritan const auto physDev = getOriginDevice()->getPhysicalDevice(); if (m_level==IGPUCommandPool::BUFFER_LEVEL::PRIMARY) { - // https://vulkan.lunarg.com/doc/view/1.2.176.1/linux/1.2-extensions/vkspec.html#VUID-vkBeginCommandBuffer-commandBuffer-02840 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkBeginCommandBuffer-commandBuffer-02840 if (flags.hasFlags(USAGE::ONE_TIME_SUBMIT_BIT|USAGE::SIMULTANEOUS_USE_BIT)) { m_logger.log("Failed to begin command buffer: a primary command buffer must not have both USAGE::ONE_TIME_SUBMIT_BIT and USAGE::SIMULTANEOUS_USE_BIT set.", system::ILogger::ELL_ERROR); @@ -76,14 +76,14 @@ bool 
IGPUCommandBuffer::begin(const core::bitflag flags, const SInheritan } else if (inheritanceInfo) { - // https://vulkan.lunarg.com/doc/view/1.2.176.1/linux/1.2-extensions/vkspec.html#VUID-vkBeginCommandBuffer-commandBuffer-00052 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkBeginCommandBuffer-commandBuffer-00052 if (inheritanceInfo->queryFlags.hasFlags(QUERY_CONTROL_FLAGS::PRECISE_BIT) && (!inheritanceInfo->occlusionQueryEnable/*|| TODO: precise occlusion queries limit/feature*/)) { m_logger.log("Failed to begin command buffer: Precise Occlusion Queries cannot be used!", system::ILogger::ELL_ERROR); return false; } } - // https://vulkan.lunarg.com/doc/view/1.2.176.1/linux/1.2-extensions/vkspec.html#VUID-vkBeginCommandBuffer-commandBuffer-00051 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkBeginCommandBuffer-commandBuffer-00051 else { m_logger.log("Failed to begin command buffer: a secondary command buffer requires an inheritance info structure!", system::ILogger::ELL_ERROR); @@ -98,15 +98,15 @@ bool IGPUCommandBuffer::begin(const core::bitflag flags, const SInheritan return false; } - // https://vulkan.lunarg.com/doc/view/1.2.176.1/linux/1.2-extensions/vkspec.html#VUID-VkCommandBufferBeginInfo-flags-00053 - // https://vulkan.lunarg.com/doc/view/1.2.176.1/linux/1.2-extensions/vkspec.html#VUID-VkCommandBufferBeginInfo-flags-00054 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkCommandBufferBeginInfo-flags-00053 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkCommandBufferBeginInfo-flags-00054 if (!inheritanceInfo || !inheritanceInfo->renderpass || !inheritanceInfo->renderpass->isCompatibleDevicewise(this) || inheritanceInfo->subpassrenderpass->getSubpassCount()) { m_logger.log("Failed to begin command buffer: a secondary command buffer must have valid inheritance info with a valid renderpass.", system::ILogger::ELL_ERROR); return false; } - // https://vulkan.lunarg.com/doc/view/1.2.176.1/linux/1.2-extensions/vkspec.html#VUID-VkCommandBufferBeginInfo-flags-00055 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkCommandBufferBeginInfo-flags-00055 if (inheritanceInfo->framebuffer && !inheritanceInfo->framebuffer->isCompatibleDevicewise(this)/* TODO: better check needed || inheritanceInfo->framebuffer->getCreationParameters().renderpass != inheritanceInfo->renderpass*/) return false; } From 8d5e0b06c2a03ee2c2b19567d726a099e603d756 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Fri, 8 Nov 2024 01:03:25 +0330 Subject: [PATCH 155/358] video: improve error logging in `IGPUCommandBuffer` Signed-off-by: Ali Cheraghi --- include/nbl/video/IGPUCommandBuffer.h | 11 + src/nbl/video/IGPUCommandBuffer.cpp | 471 +++++++++++++++++++++++--- 2 files changed, 436 insertions(+), 46 deletions(-) diff --git a/include/nbl/video/IGPUCommandBuffer.h b/include/nbl/video/IGPUCommandBuffer.h index 620206349a..932c02a272 100644 --- a/include/nbl/video/IGPUCommandBuffer.h +++ b/include/nbl/video/IGPUCommandBuffer.h @@ -737,14 +737,20 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject if (invalidBufferBinding({range.offset,range.buffer},alignment,usages)) return true; if ((range.size&(alignment-1)) && range.size!=asset::SBufferRange::WholeBuffer) + { + m_logger.log("Size %d not aligned to %d for the command!", system::ILogger::ELL_ERROR, range.size, alignment); return true; + } return false; } inline bool invalidImage(const 
IGPUImage* image, const IGPUImage::E_USAGE_FLAGS usages) const { if (!image || !this->isCompatibleDevicewise(image)) + { + m_logger.log("invalid image!", system::ILogger::ELL_ERROR); return true; + } if (!image->getCreationParameters().usage.hasFlags(usages)) { m_logger.log("Incorrect `IGPUImage` usage flags for the command!", system::ILogger::ELL_ERROR); @@ -762,6 +768,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject case IGPUImage::LAYOUT::SHARED_PRESENT: break; default: + m_logger.log("invalid destination image layout!", system::ILogger::ELL_ERROR); return true; } if (invalidImage(image,IGPUImage::EUF_TRANSFER_DST_BIT)) @@ -769,7 +776,10 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject if constexpr (!clear) { if (image->getCreationParameters().samples!=IGPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT) + { + m_logger.log("destination image sample count must be 1!", system::ILogger::ELL_ERROR); return true; + } } return false; } @@ -782,6 +792,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject case IGPUImage::LAYOUT::SHARED_PRESENT: break; default: + m_logger.log("invalid source image layout!", system::ILogger::ELL_ERROR); return true; } return invalidImage(image,IGPUImage::EUF_TRANSFER_SRC_BIT); diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index a690020bb2..51453311f5 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -2,6 +2,9 @@ #include "nbl/video/ILogicalDevice.h" #include "nbl/video/IPhysicalDevice.h" +#define NBL_LOG_FUNCTION m_logger.log +#include "nbl/logging_macros.h" + namespace nbl::video { @@ -16,27 +19,27 @@ bool IGPUCommandBuffer::checkStateBeforeRecording(const core::bitflag(renderpassScope.value), withinSubpass ? "":" not" ); return false; } if (checkForParentPoolReset()) { - m_logger.log("Failed to record into command buffer: pool was reset since the recording begin() call.", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("pool was reset since the recording begin() call!"); return false; } const auto& queueFamilyProps = getOriginDevice()->getPhysicalDevice()->getQueueFamilyProperties()[m_cmdpool->getQueueFamilyIndex()]; if (!bool(queueFamilyProps.queueFlags&allowedQueueFlags)) { - m_logger.log("Failed to record into command buffer: this command is not supported by the Queue Family of the Command Pool.", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("this command is not supported by the Queue Family of the Command Pool!"); return false; } return true; @@ -49,7 +52,7 @@ bool IGPUCommandBuffer::begin(const core::bitflag flags, const SInheritan // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkBeginCommandBuffer-commandBuffer-00049 if (m_state == STATE::RECORDING || m_state == STATE::PENDING) { - m_logger.log("Failed to begin command buffer: command buffer must not be in RECORDING or PENDING state.", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("command buffer must not be in RECORDING or PENDING state!"); return false; } @@ -60,13 +63,13 @@ bool IGPUCommandBuffer::begin(const core::bitflag flags, const SInheritan // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkBeginCommandBuffer-commandBuffer-02840 if (flags.hasFlags(USAGE::ONE_TIME_SUBMIT_BIT|USAGE::SIMULTANEOUS_USE_BIT)) { - m_logger.log("Failed to begin command buffer: a primary command buffer must not have both USAGE::ONE_TIME_SUBMIT_BIT and USAGE::SIMULTANEOUS_USE_BIT set.", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("a primary command 
buffer must not have both USAGE::ONE_TIME_SUBMIT_BIT and USAGE::SIMULTANEOUS_USE_BIT set!"); return false; } // this is an extra added by me (devsh) if (whollyInsideRenderpass) { - m_logger.log("Failed to begin command buffer: a primary command buffer must not have the USAGE::RENDER_PASS_CONTINUE_BIT set.", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("a primary command buffer must not have the USAGE::RENDER_PASS_CONTINUE_BIT set!"); return false; } #ifdef _NBL_DEBUG @@ -79,14 +82,14 @@ bool IGPUCommandBuffer::begin(const core::bitflag flags, const SInheritan // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkBeginCommandBuffer-commandBuffer-00052 if (inheritanceInfo->queryFlags.hasFlags(QUERY_CONTROL_FLAGS::PRECISE_BIT) && (!inheritanceInfo->occlusionQueryEnable/*|| TODO: precise occlusion queries limit/feature*/)) { - m_logger.log("Failed to begin command buffer: Precise Occlusion Queries cannot be used!", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("Precise Occlusion Queries cannot be used!"); return false; } } // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkBeginCommandBuffer-commandBuffer-00051 else { - m_logger.log("Failed to begin command buffer: a secondary command buffer requires an inheritance info structure!", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("a secondary command buffer requires an inheritance info structure!"); return false; } @@ -94,7 +97,7 @@ bool IGPUCommandBuffer::begin(const core::bitflag flags, const SInheritan { if (!physDev->getQueueFamilyProperties()[m_cmdpool->getQueueFamilyIndex()].queueFlags.hasFlags(queue_flags_t::GRAPHICS_BIT)) { - m_logger.log("Failed to begin command buffer: a secondary command buffer which continues a Render Pass is requires a Graphics Queue Family.", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("a secondary command buffer which continues a Render Pass is requires a Graphics Queue Family!"); return false; } @@ -102,18 +105,21 @@ bool IGPUCommandBuffer::begin(const core::bitflag flags, const SInheritan // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkCommandBufferBeginInfo-flags-00054 if (!inheritanceInfo || !inheritanceInfo->renderpass || !inheritanceInfo->renderpass->isCompatibleDevicewise(this) || inheritanceInfo->subpassrenderpass->getSubpassCount()) { - m_logger.log("Failed to begin command buffer: a secondary command buffer must have valid inheritance info with a valid renderpass.", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("a secondary command buffer must have valid inheritance info with a valid renderpass!"); return false; } // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkCommandBufferBeginInfo-flags-00055 if (inheritanceInfo->framebuffer && !inheritanceInfo->framebuffer->isCompatibleDevicewise(this)/* TODO: better check needed || inheritanceInfo->framebuffer->getCreationParameters().renderpass != inheritanceInfo->renderpass*/) + { + NBL_LOG_ERROR("a secondary command buffer must have compatible framebuffer!"); return false; + } } // extras from me (devsh) else if (inheritanceInfo && (inheritanceInfo->renderpass||inheritanceInfo->framebuffer)) { - m_logger.log("Failed to begin command buffer: Do not provide renderpass or framebuffer to a Command Buffer begin without also the USAGE::RENDER_PASS_CONTINUE_BIT bitflag.", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("Do not provide renderpass or framebuffer to a Command Buffer begin without also the USAGE::RENDER_PASS_CONTINUE_BIT 
bitflag!"); return false; } @@ -125,7 +131,7 @@ bool IGPUCommandBuffer::begin(const core::bitflag flags, const SInheritan releaseResourcesBackToPool(); if (!canReset()) { - m_logger.log("Failed to begin command buffer: command buffer allocated from a command pool with ECF_RESET_COMMAND_BUFFER_BIT flag not set cannot be reset, and command buffer not in INITIAL state.", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("command buffer allocated from a command pool with ECF_RESET_COMMAND_BUFFER_BIT flag not set cannot be reset, and command buffer not in INITIAL state!"); m_state = STATE::INVALID; return false; } @@ -142,9 +148,15 @@ bool IGPUCommandBuffer::begin(const core::bitflag flags, const SInheritan if (inheritanceInfo) { if (inheritanceInfo->framebuffer && !inheritanceInfo->framebuffer->getCreationParameters().renderpass->compatible(inheritanceInfo->renderpass)) + { + NBL_LOG_ERROR("a secondary command buffer must have compatible renderpass!"); return false; + } if (!m_cmdpool->m_commandListPool.emplace(m_commandList,core::smart_refctd_ptr(inheritanceInfo->renderpass),core::smart_refctd_ptr(inheritanceInfo->framebuffer))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_cachedInheritanceInfo = *inheritanceInfo; } else @@ -157,7 +169,7 @@ bool IGPUCommandBuffer::reset(const core::bitflag flags) { if (!canReset()) { - m_logger.log("Failed to reset command buffer, state is: %d", system::ILogger::ELL_ERROR, (uint32_t)m_state); + NBL_LOG_ERROR("%d!", (uint32_t)m_state); m_state = STATE::INVALID; return false; } @@ -230,7 +242,10 @@ bool IGPUCommandBuffer::setEvent(IEvent* _event, const SEventDependencyInfo& dep return false; if (!_event || !this->isCompatibleDevicewise(_event)) + { + NBL_LOG_ERROR("incompatible event!"); return false; + } // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdSetEvent2-srcStageMask-03827 // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdSetEvent2-srcStageMask-03828 @@ -238,7 +253,10 @@ bool IGPUCommandBuffer::setEvent(IEvent* _event, const SEventDependencyInfo& dep return false; if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(_event))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return setEvent_impl(_event,depInfo); @@ -250,10 +268,16 @@ bool IGPUCommandBuffer::resetEvent(IEvent* _event, const core::bitflagisCompatibleDevicewise(_event)) + { + NBL_LOG_ERROR("incompatible event!"); return false; + } if (stageMask.hasFlags(stage_flags_t::HOST_BIT)) + { + NBL_LOG_ERROR("stageMask must not include HOST_BIT!"); return false; + } // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdResetEvent2-stageMask-03929 // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdResetEvent2-stageMask-03930 @@ -264,10 +288,16 @@ bool IGPUCommandBuffer::resetEvent(IEvent* _event, const core::bitflagsupportsMask(m_cmdpool->getQueueFamilyIndex(),stageMask)) + { + NBL_LOG_ERROR("unsupported stageMask!"); return false; + } if (!m_cmdpool->m_commandListPool.emplace(m_commandList,core::smart_refctd_ptr(_event))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return resetEvent_impl(_event,stageMask); @@ -279,14 +309,20 @@ bool IGPUCommandBuffer::waitEvents(const std::span events, const SEvent return false; if (events.empty()) + { + NBL_LOG_ERROR("no events to wait for!"); return false; + } uint32_t totalBufferCount = 0u; uint32_t 
totalImageCount = 0u; for (auto i=0u; iisCompatibleDevicewise(events[i])) + { + NBL_LOG_ERROR("incompatible event!"); return false; + } const auto& depInfo = depInfos[i]; // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdWaitEvents2-srcStageMask-03842 @@ -301,7 +337,10 @@ bool IGPUCommandBuffer::waitEvents(const std::span events, const SEvent auto* cmd = m_cmdpool->m_commandListPool.emplace(m_commandList,events.size(),events.data(),totalBufferCount,totalImageCount); if (!cmd) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } auto outIt = cmd->getDeviceMemoryBacked(); for (auto i=0u; i bool + auto invalidSubpassMemoryBarrier = [dependencyFlags](nbl::system::logger_opt_smart_ptr logger, const asset::SMemoryBarrier& barrier) -> bool { if (barrier.srcStageMask&stage_flags_t::FRAMEBUFFER_SPACE_BITS) { // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdPipelineBarrier2-None-07890 if (barrier.dstStageMask&(~stage_flags_t::FRAMEBUFFER_SPACE_BITS)) + { + logger.log("destination stage masks of memory barriers included non FRAMEBUFFER_SPACE_BITS stages while source stage masks has FRAMEBUFFER_SPACE_BITS within a subpass!"); return true; + } // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdPipelineBarrier2-dependencyFlags-07891 if (!dependencyFlags.hasFlags(asset::EDF_BY_REGION_BIT)) + { + logger.log("dependency flags of memory barriers must includ EDF_BY_REGION_BIT while source stage masks has FRAMEBUFFER_SPACE_BITS within a subpass!"); return true; + } } // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdPipelineBarrier2-None-07892 constexpr auto NotGraphicsBits = ~stage_flags_t::ALL_GRAPHICS_BITS; - if ((barrier.srcStageMask&NotGraphicsBits) || (barrier.dstStageMask&NotGraphicsBits)) + if ((barrier.srcStageMask&NotGraphicsBits) || (barrier.dstStageMask&NotGraphicsBits)) { + logger.log("source & destination stage masks of memory barriers must only include graphics pipeline stages!"); return true; + } return false; }; for (const auto& barrier : depInfo.memBarriers) { - if (invalidSubpassMemoryBarrier(barrier)) + if (invalidSubpassMemoryBarrier(m_logger, barrier)) return false; } for (const auto& barrier : depInfo.imgBarriers) { - if (invalidSubpassMemoryBarrier(barrier.barrier.dep)) + if (invalidSubpassMemoryBarrier(m_logger, barrier.barrier.dep)) return false; // TODO: under NBL_DEBUG, cause waay too expensive to validate @@ -367,12 +420,18 @@ bool IGPUCommandBuffer::pipelineBarrier(const core::bitflagm_commandListPool.emplace(m_commandList,depInfo.bufBarriers.size(),depInfo.imgBarriers.size()); if (!cmd) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } auto outIt = cmd->getVariableCountResources(); for (const auto& barrier : depInfo.bufBarriers) @@ -403,12 +468,15 @@ bool IGPUCommandBuffer::fillBuffer(const asset::SBufferRange& range, if (invalidBufferRange(range,4u,IGPUBuffer::EUF_TRANSFER_DST_BIT)) { - m_logger.log("Invalid arguments see `IGPUCommandBuffer::invalidBufferRange`.", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("Invalid arguments see `IGPUCommandBuffer::invalidBufferRange`!"); return false; } if (!m_cmdpool->m_commandListPool.emplace(m_commandList,core::smart_refctd_ptr(range.buffer))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return fillBuffer_impl(range,data); } @@ -420,17 +488,20 @@ bool IGPUCommandBuffer::updateBuffer(const asset::SBufferRange& rang if 
(invalidBufferRange(range,4u,IGPUBuffer::EUF_TRANSFER_DST_BIT|IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF)) { - m_logger.log("Invalid arguments see `IGPUCommandBuffer::validate_updateBuffer`.", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("Invalid arguments see `IGPUCommandBuffer::validate_updateBuffer`!"); return false; } if (range.actualSize()>0x10000ull) { - m_logger.log("Inline Buffer Updates are limited to 64kb!", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("Inline Buffer Updates are limited to 64kb!"); return false; } if (!m_cmdpool->m_commandListPool.emplace(m_commandList,core::smart_refctd_ptr(range.buffer))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return updateBuffer_impl(range,pData); } @@ -439,7 +510,10 @@ bool IGPUCommandBuffer::copyBuffer(const IGPUBuffer* const srcBuffer, IGPUBuffer { // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBuffer.html#VUID-vkCmdCopyBuffer-regionCount-arraylength if (regionCount==0u) + { + NBL_LOG_ERROR("regionCount must be larger than 0!"); return false; + } if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::GRAPHICS_BIT|queue_flags_t::TRANSFER_BIT,RENDERPASS_SCOPE::OUTSIDE)) return false; @@ -456,7 +530,10 @@ bool IGPUCommandBuffer::copyBuffer(const IGPUBuffer* const srcBuffer, IGPUBuffer // pRegions is too expensive to validate if (!m_cmdpool->m_commandListPool.emplace(m_commandList,core::smart_refctd_ptr(srcBuffer),core::smart_refctd_ptr(dstBuffer))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return copyBuffer_impl(srcBuffer, dstBuffer, regionCount, pRegions); } @@ -471,10 +548,16 @@ bool IGPUCommandBuffer::clearColorImage(IGPUImage* const image, const IGPUImage: return false; const auto format = image->getCreationParameters().format; if (asset::isDepthOrStencilFormat(format) || asset::isBlockCompressionFormat(format)) + { + NBL_LOG_ERROR("invalid format!"); return false; + } if (!m_cmdpool->m_commandListPool.emplace(m_commandList,core::smart_refctd_ptr(image))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return clearColorImage_impl(image, imageLayout, pColor, rangeCount, pRanges); } @@ -488,10 +571,16 @@ bool IGPUCommandBuffer::clearDepthStencilImage(IGPUImage* const image, const IGP return false; const auto format = image->getCreationParameters().format; if (!asset::isDepthOrStencilFormat(format)) + { + NBL_LOG_ERROR("invalid format!"); return false; + } if (!m_cmdpool->m_commandListPool.emplace(m_commandList,core::smart_refctd_ptr(image))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return clearDepthStencilImage_impl(image, imageLayout, pDepthStencil, rangeCount, pRanges); } @@ -499,7 +588,11 @@ bool IGPUCommandBuffer::clearDepthStencilImage(IGPUImage* const image, const IGP bool IGPUCommandBuffer::copyBufferToImage(const IGPUBuffer* const srcBuffer, IGPUImage* const dstImage, const IGPUImage::LAYOUT dstImageLayout, const uint32_t regionCount, const IGPUImage::SBufferCopy* const pRegions) { if (regionCount==0u) + { + NBL_LOG_ERROR("regionCount must be larger than 0!"); return false; + } + if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::GRAPHICS_BIT|queue_flags_t::TRANSFER_BIT,RENDERPASS_SCOPE::OUTSIDE)) return false; @@ -511,7 +604,10 @@ bool IGPUCommandBuffer::copyBufferToImage(const IGPUBuffer* const srcBuffer, IGP // pRegions is too expensive to validate if 
(!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(srcBuffer), core::smart_refctd_ptr(dstImage))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return copyBufferToImage_impl(srcBuffer, dstImage, dstImageLayout, regionCount, pRegions); @@ -520,7 +616,10 @@ bool IGPUCommandBuffer::copyBufferToImage(const IGPUBuffer* const srcBuffer, IGP bool IGPUCommandBuffer::copyImageToBuffer(const IGPUImage* const srcImage, const IGPUImage::LAYOUT srcImageLayout, const IGPUBuffer* const dstBuffer, const uint32_t regionCount, const IGPUImage::SBufferCopy* const pRegions) { if (regionCount==0u) + { + NBL_LOG_ERROR("regionCount must be larger than 0!"); return false; + } if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::GRAPHICS_BIT|queue_flags_t::TRANSFER_BIT,RENDERPASS_SCOPE::OUTSIDE)) return false; @@ -532,7 +631,10 @@ bool IGPUCommandBuffer::copyImageToBuffer(const IGPUImage* const srcImage, const // pRegions is too expensive to validate if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(srcImage), core::smart_refctd_ptr(dstBuffer))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return copyImageToBuffer_impl(srcImage, srcImageLayout, dstBuffer, regionCount, pRegions); @@ -541,7 +643,10 @@ bool IGPUCommandBuffer::copyImageToBuffer(const IGPUImage* const srcImage, const bool IGPUCommandBuffer::copyImage(const IGPUImage* const srcImage, const IGPUImage::LAYOUT srcImageLayout, IGPUImage* const dstImage, const IGPUImage::LAYOUT dstImageLayout, const uint32_t regionCount, const IGPUImage::SImageCopy* const pRegions) { if (regionCount==0u) + { + NBL_LOG_ERROR("regionCount must be larger than 0!"); return false; + } if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::GRAPHICS_BIT|queue_flags_t::TRANSFER_BIT,RENDERPASS_SCOPE::OUTSIDE)) return false; @@ -553,16 +658,25 @@ bool IGPUCommandBuffer::copyImage(const IGPUImage* const srcImage, const IGPUIma const auto& srcParams = srcImage->getCreationParameters(); const auto& dstParams = dstImage->getCreationParameters(); if (srcParams.samples!=dstParams.samples) + { + NBL_LOG_ERROR("source and destination have unequal sample count!"); return false; + } if (asset::getBytesPerPixel(srcParams.format)!=asset::getBytesPerPixel(dstParams.format)) + { + NBL_LOG_ERROR("source and destination have unequal pixel strides!"); return false; + } // pRegions is too expensive to validate if (!dstImage->validateCopies(pRegions,pRegions+regionCount,srcImage)) return false; if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(srcImage), core::smart_refctd_ptr(dstImage))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return copyImage_impl(srcImage, srcImageLayout, dstImage, dstImageLayout, regionCount, pRegions); @@ -577,7 +691,10 @@ uint32_t IGPUCommandBuffer::buildAccelerationStructures_common(const std::spangetEnabledFeatures(); if (!features.accelerationStructure) + { + NBL_LOG_ERROR("'accelerationStructure' feature not enabled!"); return false; + } uint32_t totalGeometries = 0u; uint32_t resourcesToTrack = 0u; @@ -587,13 +704,13 @@ uint32_t IGPUCommandBuffer::buildAccelerationStructures_common(const std::spangetObjectDebugName():"nullptr" ); return false; @@ -604,7 +721,7 @@ uint32_t IGPUCommandBuffer::buildAccelerationStructures_common(const std::spanm_commandListPool.emplace(m_commandList,resourcesToTrack); if (!cmd) + { + NBL_LOG_ERROR("out of 
host memory!"); return false; + } auto oit = cmd->getVariableCountResources(); if (indirectBuffer) @@ -650,12 +770,21 @@ bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUAccelerationStructur return false; if (!copyInfo.src || !this->isCompatibleDevicewise(copyInfo.src)) + { + NBL_LOG_ERROR("invalid source copy info!"); return false; + } if (!copyInfo.dst || !this->isCompatibleDevicewise(copyInfo.dst)) + { + NBL_LOG_ERROR("invalid destination copy info!"); return false; + } if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(copyInfo.src), core::smart_refctd_ptr(copyInfo.dst))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return copyAccelerationStructure_impl(copyInfo); @@ -667,12 +796,18 @@ bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUAcceleration return false; if (!copyInfo.src || !this->isCompatibleDevicewise(copyInfo.src)) + { + NBL_LOG_ERROR("invalid source copy info!"); return false; + } if (invalidBufferBinding(copyInfo.dst,256u,IGPUBuffer::EUF_TRANSFER_DST_BIT)) return false; if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(copyInfo.src), core::smart_refctd_ptr(copyInfo.dst.buffer))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return copyAccelerationStructureToMemory_impl(copyInfo); @@ -686,10 +821,16 @@ bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUAccelerati if (invalidBufferBinding(copyInfo.src,256u,IGPUBuffer::EUF_TRANSFER_SRC_BIT)) return false; if (!copyInfo.dst || !this->isCompatibleDevicewise(copyInfo.dst)) + { + NBL_LOG_ERROR("invalid destination copy info!"); return false; + } if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(copyInfo.dst), core::smart_refctd_ptr(copyInfo.src.buffer))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return copyAccelerationStructureFromMemory_impl(copyInfo); @@ -702,10 +843,16 @@ bool IGPUCommandBuffer::bindComputePipeline(const IGPUComputePipeline* const pip return false; if (!this->isCompatibleDevicewise(pipeline)) + { + NBL_LOG_ERROR("incompatible pipeline device!"); return false; + } if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(pipeline))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; bindComputePipeline_impl(pipeline); @@ -722,10 +869,16 @@ bool IGPUCommandBuffer::bindGraphicsPipeline(const IGPUGraphicsPipeline* const p return false; if (!pipeline || !this->isCompatibleDevicewise(pipeline)) + { + NBL_LOG_ERROR("incompatible pipeline device!"); return false; + } if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(pipeline))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return bindGraphicsPipeline_impl(pipeline); @@ -740,25 +893,31 @@ bool IGPUCommandBuffer::bindDescriptorSets( return false; if (!layout ||!this->isCompatibleDevicewise(layout)) + { + NBL_LOG_ERROR("invalid layout!"); return false; + } for (uint32_t i=0u; iisCompatibleDevicewise(pDescriptorSets[i])) { - m_logger.log("IGPUCommandBuffer::bindDescriptorSets failed, pDescriptorSets[%d] was not created by the same ILogicalDevice as the commandbuffer!", system::ILogger::ELL_ERROR, i); + NBL_LOG_ERROR("pDescriptorSets[%d] was not created by the same ILogicalDevice as the commandbuffer!", i); return false; } - if 
(!pDescriptorSets[i]->getLayout()->isIdenticallyDefined(layout->getDescriptorSetLayout(firstSet+i))) + if (!pDescriptorSets[i]->getLayout()->isIdenticallyDefined(layout->getDescriptorSetLayout(firstSet + i))) { - m_logger.log("IGPUCommandBuffer::bindDescriptorSets failed, pDescriptorSets[%d] not identically defined as layout's %dth descriptor layout!", system::ILogger::ELL_ERROR, i, firstSet+i); + NBL_LOG_ERROR("pDescriptorSets[%d] not identically defined as layout's %dth descriptor layout!", i, firstSet+i); return false; } } if (!m_cmdpool->m_commandListPool.emplace(m_commandList,core::smart_refctd_ptr(layout),descriptorSetCount,pDescriptorSets)) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } for (uint32_t i=0u; igetLayout()->versionChangeInvalidatesCommandBuffer()) @@ -773,9 +932,9 @@ bool IGPUCommandBuffer::bindDescriptorSets( { const char* debugName = pDescriptorSets[i]->getDebugName(); if (debugName) - m_logger.log("Descriptor set (%s, %p) was modified between two recorded bind commands since the last command buffer's beginning.", system::ILogger::ELL_ERROR, debugName, pDescriptorSets[i]); + NBL_LOG_ERROR("Descriptor set (%s, %p) was modified between two recorded bind commands since the last command buffer's beginning!", debugName, pDescriptorSets[i]) else - m_logger.log("Descriptor set (%p) was modified between two recorded bind commands since the last command buffer's beginning.", system::ILogger::ELL_ERROR, pDescriptorSets[i]); + NBL_LOG_ERROR("Descriptor set (%p) was modified between two recorded bind commands since the last command buffer's beginning!", pDescriptorSets[i]); m_state = STATE::INVALID; return false; @@ -795,10 +954,16 @@ bool IGPUCommandBuffer::pushConstants(const IGPUPipelineLayout* const layout, co return false; if (!layout || !this->isCompatibleDevicewise(layout)) + { + NBL_LOG_ERROR("invalid layout!"); return false; + } if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(layout))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return pushConstants_impl(layout, stageFlags, offset, size, pValues); @@ -807,7 +972,11 @@ bool IGPUCommandBuffer::pushConstants(const IGPUPipelineLayout* const layout, co bool IGPUCommandBuffer::bindVertexBuffers(const uint32_t firstBinding, const uint32_t bindingCount, const asset::SBufferBinding* const pBindings) { if (firstBinding+bindingCount>asset::SVertexInputParams::MAX_ATTR_BUF_BINDING_COUNT) + { + NBL_LOG_ERROR("bindings count exceeded the maximum allowed bindings!"); return false; + } + if (!checkStateBeforeRecording(queue_flags_t::GRAPHICS_BIT)) return false; @@ -816,7 +985,10 @@ bool IGPUCommandBuffer::bindVertexBuffers(const uint32_t firstBinding, const uin return false; if (!m_cmdpool->m_commandListPool.emplace(m_commandList,bindingCount,pBindings)) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return bindVertexBuffers_impl(firstBinding, bindingCount, pBindings); @@ -846,7 +1018,10 @@ bool IGPUCommandBuffer::bindIndexBuffer(const asset::SBufferBindingm_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(binding.buffer))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return bindIndexBuffer_impl(binding,indexType); @@ -877,10 +1052,16 @@ bool IGPUCommandBuffer::setLineWidth(const float width) { const auto& limits = device->getPhysicalDevice()->getLimits(); if (widthlimits.lineWidthRange[1]) + { + NBL_LOG_ERROR("width(%d) is out of the allowable range [%d, %d]!", 
width, limits.lineWidthRange[0], limits.lineWidthRange[1]); return false; + } } else if (width!=1.f) + { + NBL_LOG_ERROR("invalid width(%d). only 1.0 is supported!", width); return false; + } m_noCommands = false; return setLineWidth_impl(width); @@ -892,10 +1073,16 @@ bool IGPUCommandBuffer::setDepthBounds(const float minDepthBounds, const float m return false; if (!getOriginDevice()->getEnabledFeatures().depthBounds) + { + NBL_LOG_ERROR("feature not enabled!"); return false; + } // TODO: implement and handle VK_EXT_depth_range_unrestrices if (minDepthBounds<0.f || maxDepthBounds>1.f) + { + NBL_LOG_ERROR("invalid bounds [%d, %d]!", minDepthBounds, maxDepthBounds); return false; + } m_noCommands = false; return setDepthBounds_impl(minDepthBounds,maxDepthBounds); @@ -909,10 +1096,16 @@ bool IGPUCommandBuffer::resetQueryPool(IQueryPool* const queryPool, const uint32 return false; if (!queryPool || !this->isCompatibleDevicewise(queryPool)) + { + NBL_LOG_ERROR("invalid parameter 'queryPool'!"); return false; + } if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(queryPool))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return resetQueryPool_impl(queryPool, firstQuery, queryCount); @@ -925,10 +1118,16 @@ bool IGPUCommandBuffer::beginQuery(IQueryPool* const queryPool, const uint32_t q return false; if (!queryPool || !this->isCompatibleDevicewise(queryPool)) + { + NBL_LOG_ERROR("invalid parameter 'queryPool'!"); return false; + } if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(queryPool))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return beginQuery_impl(queryPool, query, flags); @@ -941,10 +1140,16 @@ bool IGPUCommandBuffer::endQuery(IQueryPool* const queryPool, const uint32_t que return false; if (!queryPool || !this->isCompatibleDevicewise(queryPool)) + { + NBL_LOG_ERROR("invalid parameter 'queryPool'!"); return false; + } if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(queryPool))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return endQuery_impl(queryPool, query); @@ -956,16 +1161,30 @@ bool IGPUCommandBuffer::writeTimestamp(const stage_flags_t pipelineStage, IQuery return false; const auto qFamIx = m_cmdpool->getQueueFamilyIndex(); - if (!getOriginDevice()->getSupportedStageMask(qFamIx).hasFlags(pipelineStage) || getOriginDevice()->getPhysicalDevice()->getQueueFamilyProperties()[qFamIx].timestampValidBits==0u) + if (!getOriginDevice()->getSupportedStageMask(qFamIx).hasFlags(pipelineStage)) + { + NBL_LOG_ERROR("incompatible parameter 'pipelineStage'!"); return false; + } + if (getOriginDevice()->getPhysicalDevice()->getQueueFamilyProperties()[qFamIx].timestampValidBits == 0u) + { + NBL_LOG_ERROR("timestamps not supported for this queue family index (%d)!", qFamIx); + return false; + } if (!queryPool || !this->isCompatibleDevicewise(queryPool) || queryPool->getCreationParameters().queryType!=IQueryPool::TYPE::TIMESTAMP || query>=queryPool->getCreationParameters().queryCount) + { + NBL_LOG_ERROR("invalid parameter 'queryPool'!"); return false; + } assert(core::isPoT(static_cast(pipelineStage))); // should only be 1 stage (1 bit set) if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(queryPool))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return writeTimestamp_impl(pipelineStage, queryPool, query); @@ -976,16 +1195,33 
@@ bool IGPUCommandBuffer::writeAccelerationStructureProperties(const std::spanisCompatibleDevicewise(queryPool) || pAccelerationStructures.empty()) + if (!queryPool || !this->isCompatibleDevicewise(queryPool)) + { + NBL_LOG_ERROR("invalid parameter 'queryPool'!"); return false; + } - for (auto& as : pAccelerationStructures) - if (!isCompatibleDevicewise(as)) + if (pAccelerationStructures.empty()) + { + NBL_LOG_ERROR("parameter 'pAccelerationStructures' is empty!"); return false; + } + + for (auto& as : pAccelerationStructures) + { + if (!isCompatibleDevicewise(as)) + { + NBL_LOG_ERROR("incompatible device!"); + return false; + } + } auto cmd = m_cmdpool->m_commandListPool.emplace(m_commandList, queryPool, pAccelerationStructures.size()); if (!cmd) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } auto oit = cmd->getVariableCountResources(); for (auto& as : pAccelerationStructures) @@ -1002,15 +1238,26 @@ bool IGPUCommandBuffer::copyQueryPoolResults( return false; // TODO: rest of validation - if (!queryPool || !this->isCompatibleDevicewise(queryPool) || queryCount==0u || firstQuery+queryCount>=queryPool->getCreationParameters().queryCount) + if (!queryPool || !this->isCompatibleDevicewise(queryPool)) + { + NBL_LOG_ERROR("invalid parameter 'queryPool'!"); + return false; + } + if (queryCount==0u || firstQuery+queryCount>=queryPool->getCreationParameters().queryCount) + { + NBL_LOG_ERROR("parameter 'queryCount' exceeded the valid range [1, %d]!", queryPool->getCreationParameters().queryCount - firstQuery); return false; + } const size_t alignment = flags.hasFlags(IQueryPool::RESULTS_FLAGS::_64_BIT) ? alignof(uint64_t):alignof(uint32_t); if (invalidBufferRange({dstBuffer.offset,queryCount*stride,dstBuffer.buffer},alignment,IGPUBuffer::EUF_TRANSFER_DST_BIT)) return false; if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(queryPool), core::smart_refctd_ptr(dstBuffer.buffer))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return copyQueryPoolResults_impl(queryPool, firstQuery, queryCount, dstBuffer, stride, flags); @@ -1023,11 +1270,17 @@ bool IGPUCommandBuffer::dispatch(const uint32_t groupCountX, const uint32_t grou return false; if (groupCountX==0 || groupCountY==0 || groupCountZ==0) + { + NBL_LOG_ERROR("invalid group counts (%d, %d, %d)!", groupCountX, groupCountY, groupCountZ); return false; + } const auto& limits = getOriginDevice()->getPhysicalDevice()->getLimits(); if (groupCountX>limits.maxComputeWorkGroupCount[0] || groupCountY>limits.maxComputeWorkGroupCount[1] || groupCountZ>limits.maxComputeWorkGroupCount[2]) + { + NBL_LOG_ERROR("group counts (%d, %d, %d) exceeds maximum counts (%d, %d, %d)!", groupCountX, groupCountY, groupCountZ, limits.maxComputeWorkGroupCount[0], limits.maxComputeWorkGroupCount[1], limits.maxComputeWorkGroupCount[2]); return false; + } m_noCommands = false; return dispatch_impl(groupCountX,groupCountY,groupCountZ); @@ -1042,7 +1295,10 @@ bool IGPUCommandBuffer::dispatchIndirect(const asset::SBufferBindingm_commandListPool.emplace(m_commandList,core::smart_refctd_ptr(binding.buffer))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return dispatchIndirect_impl(binding); @@ -1052,37 +1308,63 @@ bool IGPUCommandBuffer::dispatchIndirect(const asset::SBufferBindingisCompatibleDevicewise(info.framebuffer)) + { + NBL_LOG_ERROR("invalid framebuffer!"); return false; + } const auto& renderArea = info.renderArea; if (renderArea.extent.width==0u || 
renderArea.extent.height==0u) + { + NBL_LOG_ERROR("invalid extent size [%d, %d]!", renderArea.extent.width, renderArea.extent.height); return false; + } const auto& framebufferParams = info.framebuffer->getCreationParameters(); if (renderArea.offset.x+renderArea.extent.width>framebufferParams.width || renderArea.offset.y+renderArea.extent.height>framebufferParams.height) + { + NBL_LOG_ERROR("render area [%d, %d] exceeds valid range [%d, %d]!", + renderArea.offset.x + renderArea.extent.width, renderArea.offset.y + renderArea.extent.height, + framebufferParams.width, framebufferParams.height); return false; + } if (info.renderpass) { if (!framebufferParams.renderpass->compatible(info.renderpass)) + { + NBL_LOG_ERROR("renderpass is incompatible with the framebuffer!"); return false; + } } else info.renderpass = framebufferParams.renderpass.get(); if (info.renderpass->getDepthStencilLoadOpAttachmentEnd()!=0u && !info.depthStencilClearValues) + { + NBL_LOG_ERROR("depthStencilClearValues must be greater than the largest attachment index specifying a load Op of CLEAR!"); return false; + } if (info.renderpass->getColorLoadOpAttachmentEnd()!=0u && !info.colorClearValues) + { + NBL_LOG_ERROR("colorClearValues must be greater than the largest attachment index specifying a load Op of CLEAR!"); return false; + } if (!m_cmdpool->m_commandListPool.emplace(m_commandList,core::smart_refctd_ptr(info.renderpass),core::smart_refctd_ptr(info.framebuffer))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; if (!beginRenderPass_impl(info,contents)) @@ -1096,10 +1378,16 @@ bool IGPUCommandBuffer::beginRenderPass(SRenderpassBeginInfo info, const SUBPASS bool IGPUCommandBuffer::nextSubpass(const SUBPASS_CONTENTS contents) { if (m_recordingFlags.hasFlags(USAGE::RENDER_PASS_CONTINUE_BIT)) + { + NBL_LOG_ERROR("primary command buffer must not include the RENDER_PASS_CONTINUE_BIT flag!"); return false; + } // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdNextSubpass2KHR.html#VUID-vkCmdNextSubpass2-None-03102 if (m_cachedInheritanceInfo.subpass+1>=m_cachedInheritanceInfo.renderpass->getSubpassCount()) + { + NBL_LOG_ERROR("no more subpasses to transit!"); return false; + } m_cachedInheritanceInfo.subpass++; m_noCommands = false; @@ -1112,7 +1400,10 @@ bool IGPUCommandBuffer::endRenderPass() return false; // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdEndRenderPass2KHR.html#VUID-vkCmdEndRenderPass2-None-03103 if (m_cachedInheritanceInfo.subpass+1!=m_cachedInheritanceInfo.renderpass->getSubpassCount()) + { + NBL_LOG_ERROR("the amount of transited (%d) sub-passes must be equal to total sub-pass count!", m_cachedInheritanceInfo.subpass + 1, m_cachedInheritanceInfo.renderpass->getSubpassCount()); return false; + } m_cachedInheritanceInfo.subpass = SInheritanceInfo{}.subpass; m_noCommands = false; @@ -1126,27 +1417,40 @@ bool IGPUCommandBuffer::clearAttachments(const SClearAttachments& info) return false; if (!info.valid()) + { + NBL_LOG_ERROR("invalid parameter 'info'!"); return false; + } const auto& rpassParams = m_cachedInheritanceInfo.renderpass->getCreationParameters(); const auto& subpass = rpassParams.subpasses[m_cachedInheritanceInfo.subpass]; if (info.clearDepth||info.clearStencil) { if (!subpass.depthStencilAttachment.render.used()) + { + NBL_LOG_ERROR("current subpass attachment and the clear format doesn't match!"); return false; + } const auto& depthStencilAttachment = 
rpassParams.depthStencilAttachments[subpass.depthStencilAttachment.render.attachmentIndex]; // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdClearAttachments.html#VUID-vkCmdClearAttachments-aspectMask-07884 if (info.clearDepth && asset::isStencilOnlyFormat(depthStencilAttachment.format)) + { + NBL_LOG_ERROR("stencil only asset can't be cleared with the 'clearDepth' parameter!"); return false; + } // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdClearAttachments.html#VUID-vkCmdClearAttachments-aspectMask-07885 if (info.clearStencil && asset::isDepthOnlyFormat(depthStencilAttachment.format)) - return false; + NBL_LOG_ERROR("depth only asset can't be cleared with the 'clearStencil' parameter!"); + return false; } for (auto i=0; im_cachedInheritanceInfo.framebuffer->getCreationParameters().layers) + { + NBL_LOG_ERROR("region layers (%d) exceeds the valid amount (%d)!", region.baseArrayLayer + region.layerCount, m_cachedInheritanceInfo.framebuffer->getCreationParameters().layers); return false; + } } m_noCommands = false; @@ -1170,7 +1477,10 @@ bool IGPUCommandBuffer::draw(const uint32_t vertexCount, const uint32_t instance return false; if (vertexCount==0u || instanceCount == 0u) + { + NBL_LOG_ERROR("invalid 'vertexCount' (%d) or 'instanceCount' (%d)!", vertexCount, instanceCount); return false; + } m_noCommands = false; return draw_impl(vertexCount,instanceCount,firstVertex,firstInstance); @@ -1182,7 +1492,10 @@ bool IGPUCommandBuffer::drawIndexed(const uint32_t indexCount, const uint32_t in return false; if (indexCount==0u || instanceCount == 0u) + { + NBL_LOG_ERROR("invalid 'indexCount' (%d) or 'instanceCount' (%d)!", indexCount, instanceCount); return false; + } m_noCommands = false; return drawIndexed_impl(indexCount,instanceCount,firstIndex,vertexOffset,firstInstance); @@ -1199,10 +1512,16 @@ bool IGPUCommandBuffer::invalidDrawIndirect(const asset::SBufferBindinggetOriginDevice()->getPhysicalDevice()->getLimits().maxDrawIndirectCount) + } + if (drawCount > getOriginDevice()->getPhysicalDevice()->getLimits().maxDrawIndirectCount) + { + NBL_LOG_ERROR("draw count (%d) exceeds maximum allowed amount (%d)!", drawCount, getOriginDevice()->getPhysicalDevice()->getLimits().maxDrawIndirectCount); return true; - if (invalidBufferRange({binding.offset,stride*(drawCount-1u)+sizeof(IndirectCommand),binding.buffer},alignof(uint32_t),IGPUBuffer::EUF_INDIRECT_BUFFER_BIT)) + } + if (invalidBufferRange({ binding.offset,stride * (drawCount - 1u) + sizeof(IndirectCommand),binding.buffer }, alignof(uint32_t), IGPUBuffer::EUF_INDIRECT_BUFFER_BIT)) return true; } return false; @@ -1214,7 +1533,10 @@ template requires nbl::is_any_of_v& indirectBinding, const asset::SBufferBinding& countBinding, const uint32_t maxDrawCount, const uint32_t stride) { if (!getOriginDevice()->getPhysicalDevice()->getLimits().drawIndirectCount) + { + NBL_LOG_ERROR("indirect draws with draw call count are not supported!"); return true; + } if (invalidDrawIndirect(indirectBinding,maxDrawCount,stride)) return true; @@ -1232,7 +1554,10 @@ bool IGPUCommandBuffer::drawIndirect(const asset::SBufferBindingm_commandListPool.emplace(m_commandList,core::smart_refctd_ptr(binding.buffer))) - return false; + { + NBL_LOG_ERROR("out of host memory!"); + return true; + } m_noCommands = false; return drawIndirect_impl(binding, drawCount, stride); @@ -1244,7 +1569,10 @@ bool IGPUCommandBuffer::drawIndexedIndirect(const asset::SBufferBindingm_commandListPool.emplace(m_commandList, 
core::smart_refctd_ptr(binding.buffer))) - return false; + { + NBL_LOG_ERROR("out of host memory!"); + return true; + } m_noCommands = false; return drawIndexedIndirect_impl(binding, drawCount, stride); @@ -1256,7 +1584,10 @@ bool IGPUCommandBuffer::drawIndirectCount(const asset::SBufferBindingm_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(indirectBinding.buffer), core::smart_refctd_ptr(countBinding.buffer))) - return false; + { + NBL_LOG_ERROR("out of host memory!"); + return true; + } m_noCommands = false; return drawIndirectCount_impl(indirectBinding, countBinding, maxDrawCount, stride); @@ -1268,7 +1599,10 @@ bool IGPUCommandBuffer::drawIndexedIndirectCount(const asset::SBufferBindingm_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(indirectBinding.buffer), core::smart_refctd_ptr(countBinding.buffer))) - return false; + { + NBL_LOG_ERROR("out of host memory!"); + return true; + } m_noCommands = false; return drawIndexedIndirectCount_impl(indirectBinding, countBinding, maxDrawCount, stride); @@ -1341,28 +1675,43 @@ bool IGPUCommandBuffer::blitImage(const IGPUImage* const srcImage, const IGPUIma return false; if (regions.empty() || disallowedLayoutForBlitAndResolve(srcImageLayout) || disallowedLayoutForBlitAndResolve(dstImageLayout)) + { + NBL_LOG_ERROR("invalid parameters!"); return false; + } const auto* physDev = getOriginDevice()->getPhysicalDevice(); const auto& srcParams = srcImage->getCreationParameters(); if (!srcImage || !this->isCompatibleDevicewise(srcImage) || !srcParams.usage.hasFlags(IGPUImage::EUF_TRANSFER_SRC_BIT) || !physDev->getImageFormatUsages(srcImage->getTiling())[srcParams.format].blitSrc) + { + NBL_LOG_ERROR("invalid source image!"); return false; + } const auto& dstParams = dstImage->getCreationParameters(); if (!dstImage || !this->isCompatibleDevicewise(dstImage) || !dstParams.usage.hasFlags(IGPUImage::EUF_TRANSFER_DST_BIT) || !physDev->getImageFormatUsages(dstImage->getTiling())[dstParams.format].blitDst) + { + NBL_LOG_ERROR("invalid destination image!"); return false; + } // TODO rest of: https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdBlitImage.html#VUID-vkCmdBlitImage-srcImage-00229 for (auto region : regions) { if (region.layerCount==0 || !region.aspectMask) + { + NBL_LOG_ERROR("invalid region layerCount (%d) or aspectMask (%d)!", (uint32_t)region.layerCount, (uint32_t)region.aspectMask); return false; + } // probably validate the offsets, and extents } if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(srcImage), core::smart_refctd_ptr(dstImage))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return blitImage_impl(srcImage, srcImageLayout, dstImage, dstImageLayout, regions, filter); @@ -1374,29 +1723,50 @@ bool IGPUCommandBuffer::resolveImage(const IGPUImage* const srcImage, const IGPU return false; if (regionCount==0u || !pRegions || disallowedLayoutForBlitAndResolve(srcImageLayout) || disallowedLayoutForBlitAndResolve(dstImageLayout)) + { + NBL_LOG_ERROR("invalid parameters!"); return false; + } const auto* physDev = getOriginDevice()->getPhysicalDevice(); const auto& srcParams = srcImage->getCreationParameters(); if (!srcImage || !this->isCompatibleDevicewise(srcImage) || !srcParams.usage.hasFlags(IGPUImage::EUF_TRANSFER_SRC_BIT)) + { + NBL_LOG_ERROR("invalid source image!"); return false; + } // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdResolveImage.html#VUID-vkCmdResolveImage-srcImage-00258 - if 
(srcParams.samples==IGPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT) + if (srcParams.samples == IGPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT) + { + NBL_LOG_ERROR("source image sample count must be 1!"); return false; + } const auto& dstParams = dstImage->getCreationParameters(); if (!dstImage || !this->isCompatibleDevicewise(dstImage) || !dstParams.usage.hasFlags(IGPUImage::EUF_TRANSFER_SRC_BIT) || !physDev->getImageFormatUsages(dstImage->getTiling())[dstParams.format].attachment) + { + NBL_LOG_ERROR("invalid destination image!"); return false; + } // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdResolveImage.html#VUID-vkCmdResolveImage-dstImage-00259 if (dstParams.samples!=IGPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT) + { + NBL_LOG_ERROR("destination image sample count must be 1!"); return false; + } // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdResolveImage.html#VUID-vkCmdResolveImage-srcImage-01386 if (srcParams.format!=dstParams.format) + { + NBL_LOG_ERROR("source and destination image formats doesn't match!"); return false; + } if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(srcImage), core::smart_refctd_ptr(dstImage))) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } m_noCommands = false; return resolveImage_impl(srcImage, srcImageLayout, dstImage, dstImageLayout, regionCount, pRegions); @@ -1410,15 +1780,24 @@ bool IGPUCommandBuffer::executeCommands(const uint32_t count, IGPUCommandBuffer* for (uint32_t i=0u; igetLevel()!=IGPUCommandPool::BUFFER_LEVEL::SECONDARY) + { + NBL_LOG_ERROR("cmdbufs[%d] level is not SECONDARY!", i); return false; + } if (!this->isCompatibleDevicewise(cmdbufs[i])) + { + NBL_LOG_ERROR("cmdbufs[%d] has incompatible device!", i); return false; + } } - auto cmd = m_cmdpool->m_commandListPool.emplace(m_commandList,count); + auto cmd = m_cmdpool->m_commandListPool.emplace(m_commandList, count); if (!cmd) + { + NBL_LOG_ERROR("out of host memory!"); return false; + } for (auto i=0u; igetVariableCountResources()[i] = core::smart_refctd_ptr(cmdbufs[i]); m_noCommands = false; From 0cc3a187fdc8780564af897ccf0b0dd52f5263b6 Mon Sep 17 00:00:00 2001 From: Mateusz Kielan Date: Fri, 8 Nov 2024 07:01:13 +0100 Subject: [PATCH 156/358] Update ISPIRVOptimizer.h spans always need to be const --- include/nbl/asset/utils/ISPIRVOptimizer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/utils/ISPIRVOptimizer.h b/include/nbl/asset/utils/ISPIRVOptimizer.h index da14a3c917..9d50cdbc0f 100644 --- a/include/nbl/asset/utils/ISPIRVOptimizer.h +++ b/include/nbl/asset/utils/ISPIRVOptimizer.h @@ -39,7 +39,7 @@ class ISPIRVOptimizer final : public core::IReferenceCounted EOP_COUNT }; - ISPIRVOptimizer(std::span _passes) : m_passes(_passes.begin(), _passes.end()) {} + ISPIRVOptimizer(std::span _passes) : m_passes(_passes.begin(), _passes.end()) {} core::smart_refctd_ptr optimize(const uint32_t* _spirv, uint32_t _dwordCount, system::logger_opt_ptr logger) const; core::smart_refctd_ptr optimize(const ICPUBuffer* _spirv, system::logger_opt_ptr logger) const; @@ -51,4 +51,4 @@ class ISPIRVOptimizer final : public core::IReferenceCounted } -#endif \ No newline at end of file +#endif From e8db5fed744a3586a183504fe11ef15b924b17c9 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 8 Nov 2024 15:35:52 +0700 Subject: [PATCH 157/358] latest example still not working --- examples_tests | 2 +- include/nbl/application_templates/MonoDeviceApplication.hpp | 5 ----- 2 files 
changed, 1 insertion(+), 6 deletions(-) diff --git a/examples_tests b/examples_tests index 7e3b373a96..d7efbc4a73 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 7e3b373a96a6ef512f82404f9b60545fe103a036 +Subproject commit d7efbc4a73381c9dc62afe373fa1c945f5fcb76b diff --git a/include/nbl/application_templates/MonoDeviceApplication.hpp b/include/nbl/application_templates/MonoDeviceApplication.hpp index 60bdae0619..8006ff36a5 100644 --- a/include/nbl/application_templates/MonoDeviceApplication.hpp +++ b/include/nbl/application_templates/MonoDeviceApplication.hpp @@ -203,16 +203,11 @@ class MonoDeviceApplication : public virtual MonoSystemMonoLoggerApplication } // These features are features you'll enable if present but won't interfere with your choice of device - // There's no intersection operator (yet) on the features, so its not used yet! // virtual function so you can override as needed for some example father down the line virtual video::SPhysicalDeviceFeatures getPreferredDeviceFeatures() const { video::SPhysicalDeviceFeatures retval = {}; - /*retval.shaderFloat64 = true; - retval.shaderDrawParameters = true; - retval.drawIndirectCount = true;*/ - return retval; } From e890b4716bda199f70b5311de3059a4e52e8d137 Mon Sep 17 00:00:00 2001 From: Erfan Ahmadi Date: Fri, 8 Nov 2024 12:41:00 +0400 Subject: [PATCH 158/358] ISwapchain and SimpleManagedSurface, fix previous wrong assumptions around frames in flight and cleanup the interface --- examples_tests | 2 +- include/nbl/video/CVulkanSwapchain.h | 12 +++- include/nbl/video/ISwapchain.h | 58 ++++++++++--------- .../video/utilities/CSmoothResizeSurface.h | 4 +- .../video/utilities/ISimpleManagedSurface.h | 37 ++++++++---- src/nbl/video/CVulkanSwapchain.cpp | 27 ++++++--- 6 files changed, 88 insertions(+), 52 deletions(-) diff --git a/examples_tests b/examples_tests index e77ed5d468..8766b00ddd 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit e77ed5d468f929ac5e7f1909f728895c923eb2c4 +Subproject commit 8766b00ddd1fc0d6e35ff662fd7a12fbd5051a45 diff --git a/include/nbl/video/CVulkanSwapchain.h b/include/nbl/video/CVulkanSwapchain.h index 8854a8a79e..5a16ef0a5e 100644 --- a/include/nbl/video/CVulkanSwapchain.h +++ b/include/nbl/video/CVulkanSwapchain.h @@ -20,8 +20,14 @@ class CVulkanSwapchain final : public ISwapchain void setObjectDebugName(const char* label) const override; - inline const void* getNativeHandle() const {return &m_vkSwapchainKHR;} + inline const void* getNativeHandle() const override {return &m_vkSwapchainKHR;} inline VkSwapchainKHR getInternalObject() const {return m_vkSwapchainKHR;} + + // returns the maximum number of time acquire can be called without releasing the image index through present. + inline uint8_t getMaxBlockingAcquiresBeforePresent() const override { return static_cast(m_maxBlockingAcquiresBeforePresent); } + + // returns the maximum number of acquires you can request without waiting for previous acquire semaphores to signal. 
+ uint8_t getMaxAcquiresInFlight() const override { return getImageCount(); } private: CVulkanSwapchain( @@ -32,7 +38,8 @@ class CVulkanSwapchain final : public ISwapchain const VkSwapchainKHR swapchain, const VkSemaphore* const _acquireAdaptorSemaphores, const VkSemaphore* const _prePresentSemaphores, - const VkSemaphore* const _presentAdaptorSemaphores + const VkSemaphore* const _presentAdaptorSemaphores, + const uint32_t maxAcquiresBeforePresent ); ~CVulkanSwapchain(); @@ -51,6 +58,7 @@ class CVulkanSwapchain final : public ISwapchain uint64_t m_perImageAcquireCount[ISwapchain::MaxImages] = { 0 }; // nasty way to fight UB of the Vulkan spec bool m_needToWaitIdle = true; + uint32_t m_maxBlockingAcquiresBeforePresent = 0u; }; } diff --git a/include/nbl/video/ISwapchain.h b/include/nbl/video/ISwapchain.h index fdebd01438..d052a819bd 100644 --- a/include/nbl/video/ISwapchain.h +++ b/include/nbl/video/ISwapchain.h @@ -464,15 +464,21 @@ class ISwapchain : public IBackendObject // Vulkan: const VkSwapchainKHR* virtual const void* getNativeHandle() const = 0; - // only public because MultiTimelineEventHandlerST needs to know about it - class DeferredFrameSemaphoreDrop final - { + // returns the maximum number of time acquires with infinite timeout which can be called before releasing the image index through present. + virtual uint8_t getMaxBlockingAcquiresBeforePresent() const = 0u; + + // returns the maximum number of acquires you can request without waiting for previous acquire semaphores to signal. + virtual uint8_t getMaxAcquiresInFlight() const = 0u; + + // only public because MultiTimelineEventHandlerST needs to know about it + class DeferredFrameSemaphoreDrop final + { using sema_refctd_ptr = core::smart_refctd_ptr; - core::smart_refctd_dynamic_array m_otherSemaphores = nullptr; + core::smart_refctd_dynamic_array m_otherSemaphores = nullptr; - public: - inline DeferredFrameSemaphoreDrop(const std::span _semaphores) - { + public: + inline DeferredFrameSemaphoreDrop(const std::span _semaphores) + { const auto otherCount = _semaphores.size()-1; // first semaphore always serves as the timeline and will be refcounted in the event handler if (otherCount==0) @@ -481,34 +487,34 @@ class ISwapchain : public IBackendObject for (auto i=0ull; ioperator[](i) = sema_refctd_ptr(_semaphores[i].semaphore); - } + } DeferredFrameSemaphoreDrop(const DeferredFrameSemaphoreDrop& other) = delete; - inline DeferredFrameSemaphoreDrop(DeferredFrameSemaphoreDrop&& other) : m_otherSemaphores(nullptr) - { - this->operator=(std::move(other)); - } + inline DeferredFrameSemaphoreDrop(DeferredFrameSemaphoreDrop&& other) : m_otherSemaphores(nullptr) + { + this->operator=(std::move(other)); + } DeferredFrameSemaphoreDrop& operator=(const DeferredFrameSemaphoreDrop& other) = delete; - inline DeferredFrameSemaphoreDrop& operator=(DeferredFrameSemaphoreDrop&& other) - { + inline DeferredFrameSemaphoreDrop& operator=(DeferredFrameSemaphoreDrop&& other) + { m_otherSemaphores = std::move(other.m_otherSemaphores); - other.m_otherSemaphores = nullptr; - return *this; - } - - struct single_poll_t {}; - static inline single_poll_t single_poll; - inline bool operator()(single_poll_t _single_poll) - { - operator()(); - return true; - } + other.m_otherSemaphores = nullptr; + return *this; + } + + struct single_poll_t {}; + static inline single_poll_t single_poll; + inline bool operator()(single_poll_t _single_poll) + { + operator()(); + return true; + } inline void operator()() { m_otherSemaphores = nullptr; } - }; + }; protected: 
ISwapchain(core::smart_refctd_ptr&& dev, SCreationParams&& params, const uint8_t imageCount, core::smart_refctd_ptr&& oldSwapchain); diff --git a/include/nbl/video/utilities/CSmoothResizeSurface.h b/include/nbl/video/utilities/CSmoothResizeSurface.h index fc75445c1f..4d3a243b90 100644 --- a/include/nbl/video/utilities/CSmoothResizeSurface.h +++ b/include/nbl/video/utilities/CSmoothResizeSurface.h @@ -336,7 +336,7 @@ class NBL_API2 ISmoothResizeSurface : public ISimpleManagedSurface auto device = const_cast(queue->getOriginDevice()); // need to wait before resetting a commandbuffer - const auto maxFramesInFlight = getMaxFramesInFlight(); + const auto maxFramesInFlight = getMaxAcquiresInFlight(); if (acquireCount>maxFramesInFlight) { const ISemaphore::SWaitInfo cmdbufDonePending[1] = { @@ -454,7 +454,7 @@ class CSmoothResizeSurface final : public ISmoothResizeSurface // transient commandbuffer and pool to perform the blits or other copies to SC images auto pool = device->createCommandPool(queue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!pool || !pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{m_cmdbufs.data(),getMaxFramesInFlight()})) + if (!pool || !pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{m_cmdbufs.data(),getMaxAcquiresInFlightUpperLimit()})) return init_fail(); m_swapchainResources = std::move(scResources); diff --git a/include/nbl/video/utilities/ISimpleManagedSurface.h b/include/nbl/video/utilities/ISimpleManagedSurface.h index 5f0b01eefc..c01a5e267f 100644 --- a/include/nbl/video/utilities/ISimpleManagedSurface.h +++ b/include/nbl/video/utilities/ISimpleManagedSurface.h @@ -39,9 +39,22 @@ class NBL_API2 ISimpleManagedSurface : public core::IReferenceCounted // inline ISurface* getSurface() {return m_surface.get();} inline const ISurface* getSurface() const {return m_surface.get();} + - // - inline uint8_t getMaxFramesInFlight() const {return m_maxFramesInFlight;} + // returns the maximum number of acquires you can request without waiting for previous acquire semaphores to signal. + // this value is dynamic, so don't think about caching this value, this might change on swapchain recreation. + inline uint8_t getMaxAcquiresInFlight() + { + auto swapchainResources = getSwapchainResources(); + if (swapchainResources && swapchainResources->swapchain) + return core::min(swapchainResources->swapchain->getMaxAcquiresInFlight(), m_maxImageCount); + return m_maxImageCount; + } + + // returns upper limit of maximum acquires in flight returned in `getMaxAcquiresInFlight()`. + // use this to allocate your command buffers or per frame in flight resources + // call this function after initilization. + inline uint8_t getMaxAcquiresInFlightUpperLimit() const { assert(m_maxImageCount > 0); return m_maxImageCount; } // A small utility for the boilerplate inline uint8_t pickQueueFamily(ILogicalDevice* device) const @@ -166,7 +179,7 @@ class NBL_API2 ISimpleManagedSurface : public core::IReferenceCounted m_queue->waitIdle(); m_queue = nullptr; - m_maxFramesInFlight = 0; + m_maxImageCount = 0; m_acquireCount = 0; std::fill(m_acquireSemaphores.begin(),m_acquireSemaphores.end(),nullptr); } @@ -179,7 +192,7 @@ class NBL_API2 ISimpleManagedSurface : public core::IReferenceCounted // An interesting solution to the "Frames In Flight", our tiny wrapper class will have its own Timeline Semaphore incrementing with each acquire, and thats it. 
inline uint64_t getAcquireCount() {return m_acquireCount;} - inline ISemaphore* getAcquireSemaphore(const uint8_t ix) {return ixgetSurfaceCapabilitiesForPhysicalDevice(physDev,caps); - // vkAcquireNextImageKHR should not be called if the number of images that the application has currently acquired is greater than SwapchainImages-MinimumImages - m_maxFramesInFlight = core::min(caps.maxImageCount+1-caps.minImageCount,ISwapchain::MaxImages); + m_maxImageCount = core::min(caps.maxImageCount,ISwapchain::MaxImages); } - for (uint8_t i=0u; icreateSemaphore(0u); if (!m_acquireSemaphores[i]) @@ -251,7 +263,7 @@ class NBL_API2 ISimpleManagedSurface : public core::IReferenceCounted const auto nextAcquireSignal = m_acquireCount+1; SAcquireResult retval = { - .semaphore = m_acquireSemaphores[nextAcquireSignal%m_maxFramesInFlight].get(), + .semaphore = m_acquireSemaphores[nextAcquireSignal%m_maxImageCount].get(), .acquireCount = nextAcquireSignal }; const IQueue::SSubmitInfo::SSemaphoreInfo signalInfos[1] = { @@ -344,12 +356,13 @@ class NBL_API2 ISimpleManagedSurface : public core::IReferenceCounted // Use a Threadsafe queue to make sure we can do smooth resize in derived class, might get re-assigned CThreadSafeQueueAdapter* m_queue = nullptr; - // - uint8_t m_maxFramesInFlight = 0; + + uint8_t m_maxImageCount = 0; + // Created and persistent after first initialization, Note that we need one swapchain per Frame In Fligth because Acquires can't wait or synchronize with anything - // The only rule is that you can only have `m_maxFramesInFlight` pending acquires to wait with an infinte timeout, so thats about as far as they synchronize. std::array,ISwapchain::MaxImages> m_acquireSemaphores; - // You don't want to use `m_swapchainResources.swapchain->getAcquireCount()` because it resets when swapchain gets recreated + + // We shouldn't use `m_swapchainResources.swapchain->getAcquireCount()` because it resets when swapchain gets recreated uint64_t m_acquireCount = 0; }; diff --git a/src/nbl/video/CVulkanSwapchain.cpp b/src/nbl/video/CVulkanSwapchain.cpp index eeb4d67224..9b5fba8b93 100644 --- a/src/nbl/video/CVulkanSwapchain.cpp +++ b/src/nbl/video/CVulkanSwapchain.cpp @@ -13,11 +13,11 @@ core::smart_refctd_ptr CVulkanSwapchain::create(core::smart_re return nullptr; if (!params.surface) return nullptr; + + auto physDev = logicalDevice->getPhysicalDevice(); // now check if any queue family of the physical device supports this surface { - auto physDev = logicalDevice->getPhysicalDevice(); - bool noSupport = true; for (auto familyIx=0u; familyIxgetQueueCount(familyIx) && params.surface->isSupportedForPhysicalDevice(physDev,familyIx)) @@ -148,7 +148,12 @@ core::smart_refctd_ptr CVulkanSwapchain::create(core::smart_re return nullptr; } - return core::smart_refctd_ptr(new CVulkanSwapchain(std::move(device),std::move(params),imageCount,std::move(oldSwapchain),vk_swapchain,semaphores+imageCount,semaphores,semaphores+2*imageCount),core::dont_grab); + ISurface::SCapabilities caps = {}; + params.surface->getSurfaceCapabilitiesForPhysicalDevice(physDev, caps); + // read https://registry.khronos.org/vulkan/specs/1.3-extensions/html/chap34.html#swapchain-acquire-forward-progress + const uint32_t maxAcquiresWithoutPresent = imageCount - caps.minImageCount + 1u; + + return core::smart_refctd_ptr(new CVulkanSwapchain(std::move(device),std::move(params),imageCount,std::move(oldSwapchain),vk_swapchain,semaphores+imageCount,semaphores,semaphores+2*imageCount, maxAcquiresWithoutPresent),core::dont_grab); } 
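[editor note] The forward-progress formula computed just above (imageCount - caps.minImageCount + 1) is what getMaxBlockingAcquiresBeforePresent() reports, while getMaxAcquiresInFlight() bounds the per-frame resources a user of the managed-surface wrappers should allocate. A rough usage sketch follows, mirroring the CSmoothResizeSurface change earlier in this patch; it assumes the usual `using namespace nbl;`, that `surface` is an already-initialized managed surface wrapper, and that `device`/`queue` come from the application bootstrap - those names and the failure handling are illustrative, not part of the patch:

    // one-time allocation right after init: the upper limit is static, unlike the per-swapchain value
    const uint8_t maxAcquires = surface->getMaxAcquiresInFlightUpperLimit();
    std::array<core::smart_refctd_ptr<video::IGPUCommandBuffer>,video::ISwapchain::MaxImages> cmdbufs;
    auto pool = device->createCommandPool(queue->getFamilyIndex(),video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
    if (!pool || !pool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{cmdbufs.data(),maxAcquires}))
        return false; // or however the application reports init failure
    // every frame: re-query, because swapchain recreation may change the dynamic limit
    const uint8_t acquiresInFlight = surface->getMaxAcquiresInFlight();
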
CVulkanSwapchain::CVulkanSwapchain( @@ -159,9 +164,10 @@ CVulkanSwapchain::CVulkanSwapchain( const VkSwapchainKHR swapchain, const VkSemaphore* const _acquireAdaptorSemaphores, const VkSemaphore* const _prePresentSemaphores, - const VkSemaphore* const _presentAdaptorSemaphores -) : ISwapchain(std::move(logicalDevice),std::move(params),imageCount,std::move(oldSwapchain)), - m_imgMemRequirements{.size=0,.memoryTypeBits=0x0u,.alignmentLog2=63,.prefersDedicatedAllocation=true,.requiresDedicatedAllocation=true}, m_vkSwapchainKHR(swapchain) + const VkSemaphore* const _presentAdaptorSemaphores, + const uint32_t maxAcquiresBeforePresent) + : ISwapchain(std::move(logicalDevice),std::move(params),imageCount,std::move(oldSwapchain)), + m_imgMemRequirements{.size=0,.memoryTypeBits=0x0u,.alignmentLog2=63,.prefersDedicatedAllocation=true,.requiresDedicatedAllocation=true}, m_vkSwapchainKHR(swapchain), m_maxBlockingAcquiresBeforePresent(maxAcquiresBeforePresent) { // we've got it from here! if (m_oldSwapchain) @@ -173,10 +179,13 @@ CVulkanSwapchain::CVulkanSwapchain( auto retval = vulkanDevice->getFunctionTable()->vk.vkGetSwapchainImagesKHR(vulkanDevice->getInternalObject(),m_vkSwapchainKHR,&dummy,m_images); assert(retval==VK_SUCCESS && dummy==getImageCount()); } + const uint8_t maxAcquiresInFlight = getMaxAcquiresInFlight(); + assert(maxAcquiresInFlight == imageCount); + assert(getMaxBlockingAcquiresBeforePresent() <= imageCount); - std::copy_n(_acquireAdaptorSemaphores,imageCount,m_acquireAdaptorSemaphores); - std::copy_n(_prePresentSemaphores,imageCount,m_prePresentSemaphores); - std::copy_n(_presentAdaptorSemaphores,imageCount,m_presentAdaptorSemaphores); + std::copy_n(_acquireAdaptorSemaphores,maxAcquiresInFlight,m_acquireAdaptorSemaphores); + std::copy_n(_prePresentSemaphores,maxAcquiresInFlight,m_prePresentSemaphores); + std::copy_n(_presentAdaptorSemaphores,maxAcquiresInFlight,m_presentAdaptorSemaphores); } CVulkanSwapchain::~CVulkanSwapchain() From 2675ce268bb7c7558ed35ba9d0822a84b8212ca9 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Fri, 8 Nov 2024 13:17:42 +0330 Subject: [PATCH 159/358] video: use NBL_LOG_ERROR in IGPUCommandBuffer.h Signed-off-by: Ali Cheraghi --- include/nbl/video/IGPUCommandBuffer.h | 21 +++++++++++++-------- src/nbl/video/IGPUCommandBuffer.cpp | 12 +++++++----- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/include/nbl/video/IGPUCommandBuffer.h b/include/nbl/video/IGPUCommandBuffer.h index 932c02a272..17983660c2 100644 --- a/include/nbl/video/IGPUCommandBuffer.h +++ b/include/nbl/video/IGPUCommandBuffer.h @@ -11,6 +11,10 @@ #include "nbl/video/IGPUCommandPool.h" #include "nbl/video/IQueue.h" +#include "git_info.h" +#define NBL_LOG_FUNCTION m_logger.log +#include "nbl/logging_macros.h" + // TODO: remove #define VK_NO_PROTOTYPES #include @@ -716,7 +720,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject return true; if (!buffer->getCreationParams().usage.hasFlags(usages)) { - m_logger.log("Incorrect `IGPUBuffer` usage flags for the command!", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("Incorrect `IGPUBuffer` usage flags for the command!"); return true; } return false; @@ -727,7 +731,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject return true; if (binding.offset&(alignment-1)) { - m_logger.log("Offset %d not aligned to %d for the command!", system::ILogger::ELL_ERROR, binding.offset, alignment); + NBL_LOG_ERROR("Offset %d not aligned to %d for the command!", binding.offset, alignment); return true; } return false; @@ -738,7 
+742,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject return true; if ((range.size&(alignment-1)) && range.size!=asset::SBufferRange::WholeBuffer) { - m_logger.log("Size %d not aligned to %d for the command!", system::ILogger::ELL_ERROR, range.size, alignment); + NBL_LOG_ERROR("Size %d not aligned to %d for the command!", range.size, alignment); return true; } return false; @@ -748,12 +752,12 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject { if (!image || !this->isCompatibleDevicewise(image)) { - m_logger.log("invalid image!", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("invalid image!"); return true; } if (!image->getCreationParameters().usage.hasFlags(usages)) { - m_logger.log("Incorrect `IGPUImage` usage flags for the command!", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("Incorrect `IGPUImage` usage flags for the command!"); return true; } return false; @@ -768,7 +772,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject case IGPUImage::LAYOUT::SHARED_PRESENT: break; default: - m_logger.log("invalid destination image layout!", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("invalid destination image layout!"); return true; } if (invalidImage(image,IGPUImage::EUF_TRANSFER_DST_BIT)) @@ -777,7 +781,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject { if (image->getCreationParameters().samples!=IGPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT) { - m_logger.log("destination image sample count must be 1!", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("destination image sample count must be 1!"); return true; } } @@ -792,7 +796,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject case IGPUImage::LAYOUT::SHARED_PRESENT: break; default: - m_logger.log("invalid source image layout!", system::ILogger::ELL_ERROR); + NBL_LOG_ERROR("invalid source image layout!"); return true; } return invalidImage(image,IGPUImage::EUF_TRANSFER_SRC_BIT); @@ -853,4 +857,5 @@ extern template bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo< } +#include "nbl/undef_logging_macros.h" #endif diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index 51453311f5..43fc709b77 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -1440,8 +1440,10 @@ bool IGPUCommandBuffer::clearAttachments(const SClearAttachments& info) } // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdClearAttachments.html#VUID-vkCmdClearAttachments-aspectMask-07885 if (info.clearStencil && asset::isDepthOnlyFormat(depthStencilAttachment.format)) + { NBL_LOG_ERROR("depth only asset can't be cleared with the 'clearStencil' parameter!"); - return false; + return false; + } } for (auto i=0; im_commandListPool.emplace(m_commandList,core::smart_refctd_ptr(binding.buffer))) { NBL_LOG_ERROR("out of host memory!"); - return true; + return false; } m_noCommands = false; @@ -1571,7 +1573,7 @@ bool IGPUCommandBuffer::drawIndexedIndirect(const asset::SBufferBindingm_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(binding.buffer))) { NBL_LOG_ERROR("out of host memory!"); - return true; + return false; } m_noCommands = false; @@ -1586,7 +1588,7 @@ bool IGPUCommandBuffer::drawIndirectCount(const asset::SBufferBindingm_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(indirectBinding.buffer), core::smart_refctd_ptr(countBinding.buffer))) { NBL_LOG_ERROR("out of host memory!"); - return true; + return false; } m_noCommands = false; @@ -1601,7 +1603,7 @@ bool 
IGPUCommandBuffer::drawIndexedIndirectCount(const asset::SBufferBindingm_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(indirectBinding.buffer), core::smart_refctd_ptr(countBinding.buffer))) { NBL_LOG_ERROR("out of host memory!"); - return true; + return false; } m_noCommands = false; From 6d743f3055d4333851e824cd543449959b1e4c69 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 8 Nov 2024 15:09:07 +0100 Subject: [PATCH 160/358] start computing the push constants --- include/nbl/builtin/hlsl/blit/common.hlsl | 2 + .../nbl/builtin/hlsl/blit/compute_blit.hlsl | 1 + include/nbl/builtin/hlsl/blit/parameters.hlsl | 99 +++++-- .../nbl/builtin/hlsl/cpp_compat/intrinsics.h | 8 + include/nbl/builtin/hlsl/type_traits.hlsl | 2 + include/nbl/video/utilities/CComputeBlit.h | 250 +++--------------- src/nbl/video/utilities/CComputeBlit.cpp | 80 +++++- 7 files changed, 191 insertions(+), 251 deletions(-) diff --git a/include/nbl/builtin/hlsl/blit/common.hlsl b/include/nbl/builtin/hlsl/blit/common.hlsl index fe1019134b..f3708098db 100644 --- a/include/nbl/builtin/hlsl/blit/common.hlsl +++ b/include/nbl/builtin/hlsl/blit/common.hlsl @@ -44,6 +44,8 @@ RWTexture3D outAs3D[ConstevalParameters::output_binding_t::Count]; groupshared uint32_t sMem[ConstevalParameters::SharedMemoryDWORDs]; +[[vk::push_constant]] const nbl::hlsl::blit::SPerWorkgroup pc; + #include /* diff --git a/include/nbl/builtin/hlsl/blit/compute_blit.hlsl b/include/nbl/builtin/hlsl/blit/compute_blit.hlsl index 791924ccc3..84e62913e5 100644 --- a/include/nbl/builtin/hlsl/blit/compute_blit.hlsl +++ b/include/nbl/builtin/hlsl/blit/compute_blit.hlsl @@ -77,6 +77,7 @@ struct compute_blit_t uint16_t localInvocationIndex) { const float3 halfScale = scale * float3(0.5f, 0.5f, 0.5f); + // bottom of the input tile const uint32_t3 minOutputPixel = workGroupID * outputTexelsPerWG; const float3 minOutputPixelCenterOfWG = float3(minOutputPixel)*scale + halfScale; // this can be negative, in which case HW sampler takes care of wrapping for us diff --git a/include/nbl/builtin/hlsl/blit/parameters.hlsl b/include/nbl/builtin/hlsl/blit/parameters.hlsl index 3992fcd689..24b868a7e4 100644 --- a/include/nbl/builtin/hlsl/blit/parameters.hlsl +++ b/include/nbl/builtin/hlsl/blit/parameters.hlsl @@ -14,7 +14,7 @@ namespace blit struct parameters_t { - float32_t3 fScale; + float32_t3 fScale; // float32_t3 negativeSupport; float32_t referenceAlpha; uint32_t kernelWeightsOffsetY; @@ -24,17 +24,15 @@ struct parameters_t uint16_t3 inputDims; uint16_t3 outputDims; - uint16_t3 windowDims; + uint16_t3 windowDims; // uint16_t3 phaseCount; - uint16_t3 preloadRegion; + uint16_t3 preloadRegion; // uint16_t3 iterationRegionXPrefixProducts; uint16_t3 iterationRegionYPrefixProducts; uint16_t3 iterationRegionZPrefixProducts; - //! Offset into the shared memory array which tells us from where the second buffer of shared memory begins - //! 
Given by max(memory_for_preload_region, memory_for_result_of_y_pass) - uint16_t secondScratchOffset; - uint16_t outputTexelsPerWGZ; + uint16_t secondScratchOffset; // + uint16_t outputTexelsPerWGZ; // uint32_t3 getOutputTexelsPerWG() { @@ -44,36 +42,79 @@ struct parameters_t } }; -struct parameters2_t +// We do some dumb things with bitfields here like not using `vector`, because AMD doesn't support them in push constants +struct SPerWorkgroup { - float32_t3 fScale; - float32_t3 negativeSupportMinusHalf; - float32_t referenceAlpha; - uint32_t kernelWeightsOffsetY; - uint32_t kernelWeightsOffsetZ; - uint32_t inPixelCount; - uint32_t outPixelCount; + static inline SPerWorkgroup create(const float32_t3 _scale, const uint16_t3 output, const uint16_t3 preload, const uint16_t _otherPreloadOffset) + { + SPerWorkgroup retval; + retval.scale = _scale; + retval.preloadWidth = preload[0]; + retval.preloadHeight = preload[1]; + retval.preloadDepth = preload[2]; + retval.outputWidth = output[0]; + retval.outputHeight = output[1]; + retval.outputDepth = output[2]; + retval.otherPreloadOffset = _otherPreloadOffset; + return retval; + } - uint16_t3 inputDims; - uint16_t3 outputDims; - uint16_t3 windowDims; - uint16_t3 phaseCount; - uint16_t3 preloadRegion; - uint16_t3 iterationRegionXPrefixProducts; - uint16_t3 iterationRegionYPrefixProducts; - uint16_t3 iterationRegionZPrefixProducts; + inline uint16_t3 getOutput() NBL_CONST_MEMBER_FUNC + { + return uint16_t3(outputWidth,outputHeight,outputDepth); + } + + inline uint16_t3 getWorkgroupCount(const uint16_t3 outExtent, const uint16_t layersToBlit=0) NBL_CONST_MEMBER_FUNC + { + uint16_t3 retval = uint16_t3(1,1,1); + retval += (outExtent-uint16_t3(1,1,1))/getOutput(); + if (layersToBlit) + retval[3] = layersToBlit; + return retval; + } +#ifndef __HLSL_VERSION + inline operator bool() const + { + return outputWidth && outputHeight && outputDepth && preloadWidth && preloadHeight && preloadDepth; + } +#endif + + // ratio of input pixels to output + float32_t3 scale; + // 16bit in each dimension because some GPUs actually have enough shared memory for 32k pixels + uint32_t outputWidth : 16; + uint32_t outputHeight : 16; + uint32_t outputDepth : 16; + uint32_t unused0 : 16; // channel, image type, iterationRegionPrefixSums ? + uint32_t preloadWidth : 16; + uint32_t preloadHeight : 16; + uint32_t preloadDepth : 16; //! Offset into the shared memory array which tells us from where the second buffer of shared memory begins //! Given by max(memory_for_preload_region, memory_for_result_of_y_pass) - uint16_t secondScratchOffset; - uint16_t outputTexelsPerWGZ; + uint32_t otherPreloadOffset : 16; +}; - uint32_t3 getOutputTexelsPerWG() +struct Parameters +{ + static Parameters create( + const SPerWorkgroup perWG, + const uint16_t3 inImageExtent, const uint16_t3 outImageExtent + ) { - //! `outputTexelsPerWG.xy` just happens to be in the first components of `iterationRegionsXPrefixProducts` and `iterationRegionYPrefixProducts` --this is - //! the result of how we choose to iterate, i.e. if, in the future, we decide to iterate differently, this needs to change. - return uint32_t3(iterationRegionXPrefixProducts.x, iterationRegionYPrefixProducts.x, outputTexelsPerWGZ); + Parameters retval; + retval.perWG = perWG; + return retval; } + + SPerWorkgroup perWG; + // general settings + uint32_t lastChannel : 2; + uint32_t coverage : 1; + uint32_t unused : 29; + //! 
coverage settings + // required to compare the atomic count of passing pixels against, so we can get original coverage + uint32_t inPixelCount; }; diff --git a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h index 9cb37dfe3f..e83ced0bc5 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h +++ b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h @@ -66,6 +66,14 @@ NBL_BIT_OP_GLM_PASSTHROUGH(findLSB,findLSB) NBL_BIT_OP_GLM_PASSTHROUGH(findMSB,findMSB) +// TODO: some of the functions in this header should move to `tgmath` +template requires ::nbl::hlsl::is_floating_point_v +inline T floor(const T& v) +{ + return glm::floor(v); +} + + // inverse not defined cause its implemented via hidden friend template inline matrix inverse(const matrix& m) diff --git a/include/nbl/builtin/hlsl/type_traits.hlsl b/include/nbl/builtin/hlsl/type_traits.hlsl index 2e02737339..58cedd81dd 100644 --- a/include/nbl/builtin/hlsl/type_traits.hlsl +++ b/include/nbl/builtin/hlsl/type_traits.hlsl @@ -606,6 +606,8 @@ NBL_CONSTEXPR bool is_unsigned_v = is_unsigned::value; template NBL_CONSTEXPR bool is_integral_v = is_integral::value; template +NBL_CONSTEXPR bool is_floating_point_v = is_floating_point::value; +template NBL_CONSTEXPR bool is_signed_v = is_signed::value; template NBL_CONSTEXPR bool is_scalar_v = is_scalar::value; diff --git a/include/nbl/video/utilities/CComputeBlit.h b/include/nbl/video/utilities/CComputeBlit.h index f7775ef3d6..6288c6080c 100644 --- a/include/nbl/video/utilities/CComputeBlit.h +++ b/include/nbl/video/utilities/CComputeBlit.h @@ -14,7 +14,7 @@ class CComputeBlit : public core::IReferenceCounted constexpr static inline asset::SPushConstantRange DefaultPushConstantRange = { .stageFlags = IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0ull, - .size = sizeof(hlsl::blit::parameters2_t) + .size = sizeof(hlsl::blit::Parameters) }; constexpr static inline std::span DefaultPushConstantRanges = {&DefaultPushConstantRange,1}; @@ -60,6 +60,7 @@ class CComputeBlit : public core::IReferenceCounted core::smart_refctd_ptr blit; core::smart_refctd_ptr coverage; uint16_t workgroupSize; + uint16_t sharedMemorySize; }; struct SPipelinesCreateInfo { @@ -108,7 +109,7 @@ class CComputeBlit : public core::IReferenceCounted } // Use the return values of `getOutputViewFormat` and `getCoverageAdjustmentIntermediateFormat` for this - static inline uint32_t getAlphaBinCount(const uint16_t workgroupSize, const asset::E_FORMAT intermediateAlpha, const uint32_t layersToBlit) + static inline uint32_t getAlphaBinCount(const SPipelines& pipelines, const asset::E_FORMAT intermediateAlpha, const uint32_t layersToBlit) { uint16_t baseBucketCount; using format_t = nbl::asset::E_FORMAT; @@ -131,76 +132,38 @@ class CComputeBlit : public core::IReferenceCounted default: return 0; } - // the absolute minimum needed to store a single pixel of a worst case format (precise, all 4 channels) - constexpr auto singlePixelStorage = 4*sizeof(hlsl::float32_t); - constexpr auto ratio = singlePixelStorage/sizeof(uint16_t); // atomicAdd gets performed on MSB or LSB of a single DWORD - const auto paddedAlphaBinCount = core::min(core::roundUp(baseBucketCount,workgroupSize*2),workgroupSize*ratio); + const auto paddedAlphaBinCount = core::min( + core::roundUp(baseBucketCount,pipelines.workgroupSize*2), + (pipelines.sharedMemorySize-sizeof(uint32_t)*2)/sizeof(uint16_t) + ); return paddedAlphaBinCount*layersToBlit; } - static inline uint32_t getNormalizationByteSize(const uint16_t 
workgroupSize, const asset::E_FORMAT intermediateAlpha, const uint32_t layersToBlit) + static inline uint32_t getNormalizationByteSize(const SPipelines& pipelines, const asset::E_FORMAT intermediateAlpha, const uint32_t layersToBlit) { - return getAlphaBinCount(workgroupSize,intermediateAlpha,layersToBlit)*sizeof(uint16_t)+sizeof(uint32_t)+sizeof(uint32_t); + return getAlphaBinCount(pipelines,intermediateAlpha,layersToBlit)*sizeof(uint16_t)+sizeof(uint32_t)+sizeof(uint32_t); } -#if 0 + - //! Returns the number of output texels produced by one workgroup, deciding factor is `m_availableSharedMemory`. - //! @param outImageFormat is the format of output (of the blit step) image. - //! If a normalization step is involved then this will be the same as the format of normalization step's input image --which may differ from the - //! final output format, because we blit to a higher precision format for normalization. + //! Returns the number of output texels produced by one workgroup, deciding factor is `pipelines.sharedmemorySize`. + //! @param outFormat is the format of output (of the blit step) image. template - bool getOutputTexelsPerWorkGroup( - core::vectorSIMDu32& outputTexelsPerWG, - const core::vectorSIMDu32& inExtent, - const core::vectorSIMDu32& outExtent, - const asset::E_FORMAT outImageFormat, - const asset::IImage::E_TYPE imageType, - const typename BlitUtilities::convolution_kernels_t& kernels) + static inline hlsl::blit::SPerWorkgroup computePerWorkGroup( + const uint16_t sharedMemorySize, const typename BlitUtilities::convolution_kernels_t& kernels, const IGPUImage::E_TYPE type, + const bool halfPrecision, const hlsl::uint16_t3 inExtent, const hlsl::uint16_t3 outExtent + ) { - core::vectorSIMDf scale = static_cast(inExtent).preciseDivision(static_cast(outExtent)); - - const core::vectorSIMDf minSupport(std::get<0>(kernels).getMinSupport(), std::get<1>(kernels).getMinSupport(), std::get<2>(kernels).getMinSupport()); - const core::vectorSIMDf maxSupport(std::get<0>(kernels).getMaxSupport(), std::get<1>(kernels).getMaxSupport(), std::get<2>(kernels).getMaxSupport()); - - outputTexelsPerWG = core::vectorSIMDu32(1, 1, 1, 1); - size_t requiredSmem = getRequiredSharedMemorySize(outputTexelsPerWG, outExtent, imageType, minSupport, maxSupport, scale, asset::getFormatChannelCount(outImageFormat)); - bool failed = true; - asset::IImage::E_TYPE minDimAxes[3] = { asset::IImage::ET_1D, asset::IImage::ET_2D, asset::IImage::ET_3D }; - while (requiredSmem < m_availableSharedMemory) - { - failed = false; - - std::sort(minDimAxes, minDimAxes + imageType + 1, [&outputTexelsPerWG](const asset::IImage::E_TYPE a, const asset::IImage::E_TYPE b) -> bool { return outputTexelsPerWG[a] < outputTexelsPerWG[b]; }); - - int i = 0; - for (; i < imageType + 1; ++i) - { - const auto axis = minDimAxes[i]; - - core::vectorSIMDu32 delta(0, 0, 0, 0); - delta[axis] = 1; - - if (outputTexelsPerWG[axis] < outExtent[axis]) - { - // Note: we use outImageFormat's channel count as opposed to its image view's format because, even in the event that they are different, we blit - // as if we will be writing to a storage image of out - requiredSmem = getRequiredSharedMemorySize(outputTexelsPerWG + delta, outExtent, imageType, minSupport, maxSupport, scale, asset::getFormatChannelCount(outImageFormat)); - - if (requiredSmem <= m_availableSharedMemory) - { - outputTexelsPerWG += delta; - break; - } - } - } - if (i == imageType + 1) // If we cannot find any axis to increment outputTexelsPerWG along, then break - break; - } - - return 
!failed; + const hlsl::float32_t3 minSupport(std::get<0>(kernels).getMinSupport(), std::get<1>(kernels).getMinSupport(), std::get<2>(kernels).getMinSupport()); + const hlsl::float32_t3 maxSupport(std::get<0>(kernels).getMaxSupport(), std::get<1>(kernels).getMaxSupport(), std::get<2>(kernels).getMaxSupport()); + return computePerWorkGroup(sharedMemorySize,minSupport,maxSupport,type,halfPrecision); } + static hlsl::blit::SPerWorkgroup computePerWorkGroup( + const uint16_t sharedMemorySize, const hlsl::float32_t3 minSupportInOutput, const hlsl::float32_t3 maxSupportInOutput, const IGPUImage::E_TYPE type, + const bool halfPrecision, const hlsl::uint16_t3 inExtent, const hlsl::uint16_t3 outExtent + ); +#if 0 template inline void buildParameters( nbl::hlsl::blit::parameters_t& outPC, @@ -212,37 +175,20 @@ class CComputeBlit : public core::IReferenceCounted const uint32_t layersToBlit = 1, const float referenceAlpha = 0.f) { - nbl::hlsl::uint16_t3 inDim(inImageExtent.x, inImageExtent.y, inImageExtent.z); - nbl::hlsl::uint16_t3 outDim(outImageExtent.x, outImageExtent.y, outImageExtent.z); - - if (imageType < asset::IImage::ET_3D) - { - inDim.z = layersToBlit; - outDim.z = layersToBlit; - } - - constexpr auto MaxImageDim = 1 << 16; - const auto maxImageDims = core::vectorSIMDu32(MaxImageDim, MaxImageDim, MaxImageDim, MaxImageDim); - assert((inDim.x < maxImageDims.x) && (inDim.y < maxImageDims.y) && (inDim.z < maxImageDims.z)); - assert((outDim.x < maxImageDims.x) && (outDim.y < maxImageDims.y) && (outDim.z < maxImageDims.z)); - - outPC.inputDims = inDim; - outPC.outputDims = outDim; +core::vectorSIMDf scale = static_cast(inImageExtent).preciseDivision(static_cast(outImageExtent)); - core::vectorSIMDf scale = static_cast(inImageExtent).preciseDivision(static_cast(outImageExtent)); +const core::vectorSIMDf minSupport(std::get<0>(kernels).getMinSupport(), std::get<1>(kernels).getMinSupport(), std::get<2>(kernels).getMinSupport()); +const core::vectorSIMDf maxSupport(std::get<0>(kernels).getMaxSupport(), std::get<1>(kernels).getMaxSupport(), std::get<2>(kernels).getMaxSupport()); - const core::vectorSIMDf minSupport(std::get<0>(kernels).getMinSupport(), std::get<1>(kernels).getMinSupport(), std::get<2>(kernels).getMinSupport()); - const core::vectorSIMDf maxSupport(std::get<0>(kernels).getMaxSupport(), std::get<1>(kernels).getMaxSupport(), std::get<2>(kernels).getMaxSupport()); +core::vectorSIMDu32 outputTexelsPerWG; +getOutputTexelsPerWorkGroup(outputTexelsPerWG, inImageExtent, outImageExtent, inImageFormat, imageType, kernels); +const auto preloadRegion = getPreloadRegion(outputTexelsPerWG, imageType, minSupport, maxSupport, scale); - core::vectorSIMDu32 outputTexelsPerWG; - getOutputTexelsPerWorkGroup(outputTexelsPerWG, inImageExtent, outImageExtent, inImageFormat, imageType, kernels); - const auto preloadRegion = getPreloadRegion(outputTexelsPerWG, imageType, minSupport, maxSupport, scale); - - outPC.secondScratchOffset = core::max(preloadRegion.x * preloadRegion.y * preloadRegion.z, outputTexelsPerWG.x * outputTexelsPerWG.y * preloadRegion.z); +outPC.secondScratchOffset = core::max(preloadRegion.x * preloadRegion.y * preloadRegion.z, outputTexelsPerWG.x * outputTexelsPerWG.y * preloadRegion.z); outPC.iterationRegionXPrefixProducts = { outputTexelsPerWG.x, outputTexelsPerWG.x * preloadRegion.y, outputTexelsPerWG.x * preloadRegion.y * preloadRegion.z }; outPC.referenceAlpha = referenceAlpha; - outPC.fScale = { scale.x, scale.y, scale.z }; - outPC.inPixelCount = inImageExtent.x * inImageExtent.y * 
inImageExtent.z; +outPC.fScale = { scale.x, scale.y, scale.z }; +outPC.inPixelCount = inImageExtent.x * inImageExtent.y * inImageExtent.z; outPC.negativeSupport.x = minSupport.x; outPC.negativeSupport.y = minSupport.y; outPC.negativeSupport.z = minSupport.z; outPC.outPixelCount = outImageExtent.x * outImageExtent.y * outImageExtent.z; @@ -267,101 +213,6 @@ class CComputeBlit : public core::IReferenceCounted outPC.outputTexelsPerWGZ = outputTexelsPerWG.z; outPC.preloadRegion = { preloadRegion.x, preloadRegion.y, preloadRegion.z }; } - - static inline void buildAlphaTestDispatchInfo(dispatch_info_t& outDispatchInfo, const core::vectorSIMDu32& inImageExtent, const asset::IImage::E_TYPE imageType, const uint32_t layersToBlit = 1) - { - const core::vectorSIMDu32 workgroupDims = getDefaultWorkgroupDims(imageType); - const core::vectorSIMDu32 workgroupCount = (inImageExtent + workgroupDims - core::vectorSIMDu32(1u, 1u, 1u, 1u)) / workgroupDims; - - outDispatchInfo.wgCount[0] = workgroupCount.x; - outDispatchInfo.wgCount[1] = workgroupCount.y; - if (imageType < asset::IImage::ET_3D) - outDispatchInfo.wgCount[2] = layersToBlit; - else - outDispatchInfo.wgCount[2] = workgroupCount[2]; - } - - template - inline void buildBlitDispatchInfo( - dispatch_info_t& dispatchInfo, - const core::vectorSIMDu32& inImageExtent, - const core::vectorSIMDu32& outImageExtent, - const asset::E_FORMAT inImageFormat, - const asset::IImage::E_TYPE imageType, - const typename BlitUtilities::convolution_kernels_t& kernels, - const uint32_t workgroupSize = 256, - const uint32_t layersToBlit = 1) - { - core::vectorSIMDu32 outputTexelsPerWG; - getOutputTexelsPerWorkGroup(outputTexelsPerWG, inImageExtent, outImageExtent, inImageFormat, imageType, kernels); - const auto wgCount = (outImageExtent + outputTexelsPerWG - core::vectorSIMDu32(1, 1, 1)) / core::vectorSIMDu32(outputTexelsPerWG.x, outputTexelsPerWG.y, outputTexelsPerWG.z, 1); - - dispatchInfo.wgCount[0] = wgCount[0]; - dispatchInfo.wgCount[1] = wgCount[1]; - if (imageType < asset::IImage::ET_3D) - dispatchInfo.wgCount[2] = layersToBlit; - else - dispatchInfo.wgCount[2] = wgCount[2]; - } - - - static inline void buildNormalizationDispatchInfo(dispatch_info_t& outDispatchInfo, const core::vectorSIMDu32& outImageExtent, const asset::IImage::E_TYPE imageType, const uint32_t layersToBlit = 1) - { - const core::vectorSIMDu32 workgroupDims = getDefaultWorkgroupDims(imageType); - const core::vectorSIMDu32 workgroupCount = (outImageExtent + workgroupDims - core::vectorSIMDu32(1u, 1u, 1u, 1u)) / workgroupDims; - - outDispatchInfo.wgCount[0] = workgroupCount.x; - outDispatchInfo.wgCount[1] = workgroupCount.y; - if (imageType < asset::IImage::ET_3D) - outDispatchInfo.wgCount[2] = layersToBlit; - else - outDispatchInfo.wgCount[2] = workgroupCount[2]; - } - - //! User is responsible for the memory barriers between previous writes and the first - //! dispatch on the input image, and future reads of output image and the last dispatch. 
- template - inline void blit( - const core::vectorSIMDu32& inImageExtent, - const asset::IImage::E_TYPE inImageType, - const asset::E_FORMAT inImageFormat, - core::smart_refctd_ptr normalizationInImage, - const typename BlitUtilities::convolution_kernels_t& kernels, - const uint32_t layersToBlit = 1, - core::smart_refctd_ptr coverageAdjustmentScratchBuffer = nullptr, - const float referenceAlpha = 0.f, - const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount, - const uint32_t workgroupSize = 256) - { - const core::vectorSIMDu32 outImageExtent(normalizationInImage->getCreationParameters().extent.width, normalizationInImage->getCreationParameters().extent.height, normalizationInImage->getCreationParameters().extent.depth, 1u); - - nbl::hlsl::blit::parameters_t pushConstants; - buildParameters(pushConstants, inImageExtent, outImageExtent, inImageType, inImageFormat, kernels, layersToBlit, referenceAlpha); - - if (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) - { - dispatch_info_t dispatchInfo; - buildAlphaTestDispatchInfo(dispatchInfo, inImageExtent, inImageType, layersToBlit); -// bind omitted - dispatchHelper(cmdbuf, alphaTestPipeline->getLayout(), pushConstants, dispatchInfo); - } - - { - dispatch_info_t dispatchInfo; - buildBlitDispatchInfo(dispatchInfo, inImageExtent, outImageExtent, inImageFormat, inImageType, kernels, workgroupSize, layersToBlit); -// bind omitted - dispatchHelper(cmdbuf, blitPipeline->getLayout(), pushConstants, dispatchInfo); - } - - // After this dispatch ends and finishes writing to outImage, normalize outImage - if (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) - { - dispatch_info_t dispatchInfo; - buildNormalizationDispatchInfo(dispatchInfo, outImageExtent, inImageType, layersToBlit); - - dispatchHelper(cmdbuf, normalizationPipeline->getLayout(), pushConstants, dispatchInfo); - } - } #endif @@ -376,41 +227,6 @@ class CComputeBlit : public core::IReferenceCounted core::smart_refctd_ptr m_device; system::logger_opt_smart_ptr m_logger; core::smart_refctd_ptr m_shaderCache; - - //! This calculates the inclusive upper bound on the preload region i.e. it will be reachable for some cases. For the rest it will be bigger - //! by a pixel in each dimension. - //! Used to size the shared memory. - inline core::vectorSIMDu32 getPreloadRegion( - const core::vectorSIMDu32& outputTexelsPerWG, - const asset::IImage::E_TYPE imageType, - const core::vectorSIMDf& scaledMinSupport, - const core::vectorSIMDf& scaledMaxSupport, - const core::vectorSIMDf& scale) - { - core::vectorSIMDu32 preloadRegion = core::vectorSIMDu32(core::floor((scaledMaxSupport - scaledMinSupport) + core::vectorSIMDf(outputTexelsPerWG - core::vectorSIMDu32(1, 1, 1, 1)) * scale)) + core::vectorSIMDu32(1, 1, 1, 1); - - // Set the unused dimensions to 1 to avoid weird behaviours with scaled kernels - for (auto axis = imageType + 1; axis < 3u; ++axis) - preloadRegion[axis] = 1; - - return preloadRegion; - } - - //! Query shared memory size for a given `outputTexelsPerWG`. 
- inline size_t getRequiredSharedMemorySize( - const core::vectorSIMDu32& outputTexelsPerWG, - const core::vectorSIMDu32& outExtent, - const asset::IImage::E_TYPE imageType, - const core::vectorSIMDf& scaledMinSupport, - const core::vectorSIMDf& scaledMaxSupport, - const core::vectorSIMDf& scale, - const uint32_t channelCount) - { - const auto preloadRegion = getPreloadRegion(outputTexelsPerWG, imageType, scaledMinSupport, scaledMaxSupport, scale); - - const size_t requiredSmem = (core::max(preloadRegion.x * preloadRegion.y * preloadRegion.z, outputTexelsPerWG.x * outputTexelsPerWG.y * preloadRegion.z) + outputTexelsPerWG.x * preloadRegion.y * preloadRegion.z) * channelCount * sizeof(float); - return requiredSmem; - }; }; } diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp index 38b9479350..58d43963e3 100644 --- a/src/nbl/video/utilities/CComputeBlit.cpp +++ b/src/nbl/video/utilities/CComputeBlit.cpp @@ -2,6 +2,8 @@ #include "nbl/builtin/hlsl/binding_info.hlsl" using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::hlsl::blit; using namespace nbl::system; using namespace nbl::asset; using namespace nbl::video; @@ -31,9 +33,10 @@ auto CComputeBlit::createAndCachePipelines(const SPipelinesCreateInfo& info) -> if (retval.workgroupSize getBindingInfoForHLSL({.binding=info.samplers,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(; using input_image_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.inputs,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(; using output_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.outputs,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(; - NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedMemoryDWORDs = )===" << (sharedMemoryPerInvocation* retval.workgroupSize)/sizeof(uint32_t) << R"===(; + NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedMemoryDWORDs = )===" << retval.sharedMemorySize/sizeof(uint32_t) << R"===(; }; )==="; return tmp.str(); @@ -78,7 +81,7 @@ struct ConstevalParameters params.layout = layout; params.shader.entryPoint = "main"; params.shader.shader = shader.get(); - params.shader.requiredSubgroupSize = static_cast(hlsl::findMSB(limits.maxSubgroupSize)); + params.shader.requiredSubgroupSize = static_cast(findMSB(limits.maxSubgroupSize)); // needed for the prefix and reductions to work params.shader.requireFullSubgroups = true; return ICPUComputePipeline::create(params); @@ -113,6 +116,73 @@ struct ConstevalParameters return retval; } +SPerWorkgroup CComputeBlit::computePerWorkGroup( + const uint16_t sharedMemorySize, const float32_t3 minSupportInOutput, const float32_t3 maxSupportInOutput, + const IGPUImage::E_TYPE type, const bool halfPrecision, const uint16_t3 inExtent, const uint16_t3 outExtent +) +{ + SPerWorkgroup retval; + memset(&retval,0,sizeof(retval)); + + const auto Dims = static_cast(type)+1; + const auto scale = float32_t3(inExtent)/float32_t3(outExtent); + const auto supportWidthInOutput = maxSupportInOutput-minSupportInOutput; + + IGPUImage::E_TYPE minDimAxes[3] = { IGPUImage::ET_1D, IGPUImage::ET_2D, IGPUImage::ET_3D }; + using namespace nbl::hlsl; + for (uint16_t3 output(1,1,1); true;) + { + // now try and grow our support + const auto combinedSupportInOutput = supportWidthInOutput+float32_t3(output-uint16_t3(1,1,1)); + // note that its not ceil on purpose + uint32_t3 preload = uint32_t3(hlsl::floor(combinedSupportInOutput*scale))+uint32_t3(1,1,1); + // Set the unused dimensions to 1 to avoid 
weird behaviours with scaled kernels + for (auto a=Dims; a<3; a++) + preload[a] = 1; + // TODO: the blits should probably be implemented with separate preload per channel + { + // think in terms of inputs (for now we have a fixed, unoptimized ordering of XYZ) + const uint16_t firstPass = preload.x*preload.y*preload.z; + const uint16_t secondPass = output.x*preload.y*preload.z; + const uint16_t thirdPass = output.x*output.y*preload.z; + // + uint32_t otherPreloadOffset = firstPass; + // third pass aliases first pass input storage + if (Dims==3 && otherPreloadOffset1 ? secondPass:0u); + const auto requiredSharedMemory = totalPixels*sizeof(float);//(halfPrecision ? sizeof(float16_t):sizeof(float32_t)); TODO: impl the blit in 16bits + // too much + if (requiredSharedMemory>size_t(sharedMemorySize)) + break; + // still fits, update return value + retval = SPerWorkgroup::create(scale,output,preload,otherPreloadOffset); + } + + // we want to fix the dimension that's the smallest, so that we increase the volume of the support by a smallest increment and stay close to a cube shape + { + std::sort(minDimAxes,minDimAxes+Dims,[output](const IGPUImage::E_TYPE a, const IGPUImage::E_TYPE b)->bool + { + return output[a] From 7a58da4ececd04685fd5fa9826a24b75bd854b38 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 8 Nov 2024 16:54:43 +0100 Subject: [PATCH 161/358] get the blit to compile and write output image --- examples_tests | 2 +- include/nbl/builtin/hlsl/blit/common.hlsl | 3 +- .../builtin/hlsl/blit/default_blit.comp.hlsl | 18 +++++++ include/nbl/builtin/hlsl/blit/parameters.hlsl | 49 +++++++++++-------- include/nbl/video/IGPUCommandBuffer.h | 5 ++ include/nbl/video/utilities/CComputeBlit.h | 8 +-- src/nbl/video/utilities/CComputeBlit.cpp | 6 +-- 7 files changed, 62 insertions(+), 29 deletions(-) diff --git a/examples_tests b/examples_tests index 54e0ab15f3..ddfc8d0995 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 54e0ab15f3ee5734be10d73ca236322013797a2e +Subproject commit ddfc8d0995e117bfdb1924d493cd4fdb40cb478c diff --git a/include/nbl/builtin/hlsl/blit/common.hlsl b/include/nbl/builtin/hlsl/blit/common.hlsl index f3708098db..739c17b652 100644 --- a/include/nbl/builtin/hlsl/blit/common.hlsl +++ b/include/nbl/builtin/hlsl/blit/common.hlsl @@ -6,6 +6,7 @@ #include +#include namespace nbl { namespace hlsl @@ -44,7 +45,7 @@ RWTexture3D outAs3D[ConstevalParameters::output_binding_t::Count]; groupshared uint32_t sMem[ConstevalParameters::SharedMemoryDWORDs]; -[[vk::push_constant]] const nbl::hlsl::blit::SPerWorkgroup pc; +[[vk::push_constant]] const nbl::hlsl::blit::Parameters pc; #include diff --git a/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl b/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl index 1a519b8730..1407d7fc77 100644 --- a/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl +++ b/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl @@ -62,7 +62,25 @@ using namespace nbl::hlsl::blit; void main() { InImgAccessor inImgA; + OutImgAccessor outImgA; + outImgA.descIx = pc.outputDescIx; + + const uint16_t3 wgID = _static_cast(glsl::gl_WorkGroupID()); + const uint16_t3 baseCoord = pc.perWG.getOutputBaseCoord(wgID); + // TODO: If and when someone can be bothered, change the blit api to compile a pipeline per image dimension, maybe it will be faster + switch (pc.perWG.imageDim) + { + case 1: + outImgA.set(uint16_t1(baseCoord.x),wgID.z,float32_t4(1,0,1,1)); + break; + case 2: + outImgA.set(baseCoord.xy,wgID.z,float32_t4(1,0,1,1)); + break; + case 3: + 
outImgA.set(baseCoord,0xdeadu,float32_t4(1,0,1,1)); + break; + } /* blit::compute_blit_t blit = blit::compute_blit_t::create(params); InCSAccessor inCSA; diff --git a/include/nbl/builtin/hlsl/blit/parameters.hlsl b/include/nbl/builtin/hlsl/blit/parameters.hlsl index 24b868a7e4..163a24fac3 100644 --- a/include/nbl/builtin/hlsl/blit/parameters.hlsl +++ b/include/nbl/builtin/hlsl/blit/parameters.hlsl @@ -45,10 +45,11 @@ struct parameters_t // We do some dumb things with bitfields here like not using `vector`, because AMD doesn't support them in push constants struct SPerWorkgroup { - static inline SPerWorkgroup create(const float32_t3 _scale, const uint16_t3 output, const uint16_t3 preload, const uint16_t _otherPreloadOffset) + static inline SPerWorkgroup create(const float32_t3 _scale, const uint16_t _imageDim, const uint16_t3 output, const uint16_t3 preload, const uint16_t _otherPreloadOffset) { SPerWorkgroup retval; retval.scale = _scale; + retval.imageDim = _imageDim; retval.preloadWidth = preload[0]; retval.preloadHeight = preload[1]; retval.preloadDepth = preload[2]; @@ -59,22 +60,23 @@ struct SPerWorkgroup return retval; } - inline uint16_t3 getOutput() NBL_CONST_MEMBER_FUNC + inline uint16_t3 getOutputBaseCoord(const uint16_t3 workgroup) NBL_CONST_MEMBER_FUNC { - return uint16_t3(outputWidth,outputHeight,outputDepth); + return workgroup*uint16_t3(outputWidth,outputHeight,outputDepth); } inline uint16_t3 getWorkgroupCount(const uint16_t3 outExtent, const uint16_t layersToBlit=0) NBL_CONST_MEMBER_FUNC { - uint16_t3 retval = uint16_t3(1,1,1); - retval += (outExtent-uint16_t3(1,1,1))/getOutput(); + const uint16_t3 unit = uint16_t3(1,1,1); + uint16_t3 retval = unit; + retval += (outExtent-unit)/getOutputBaseCoord(unit); if (layersToBlit) - retval[3] = layersToBlit; + retval[2] = layersToBlit; return retval; } #ifndef __HLSL_VERSION - inline operator bool() const + explicit inline operator bool() const { return outputWidth && outputHeight && outputDepth && preloadWidth && preloadHeight && preloadDepth; } @@ -82,11 +84,13 @@ struct SPerWorkgroup // ratio of input pixels to output float32_t3 scale; + // whether its an image1D, image2D or image3D + uint32_t imageDim : 2; + uint32_t unused0 : 14; // channel, iterationRegionPrefixSums ? // 16bit in each dimension because some GPUs actually have enough shared memory for 32k pixels uint32_t outputWidth : 16; uint32_t outputHeight : 16; uint32_t outputDepth : 16; - uint32_t unused0 : 16; // channel, image type, iterationRegionPrefixSums ? uint32_t preloadWidth : 16; uint32_t preloadHeight : 16; uint32_t preloadDepth : 16; @@ -97,22 +101,27 @@ struct SPerWorkgroup struct Parameters { - static Parameters create( - const SPerWorkgroup perWG, - const uint16_t3 inImageExtent, const uint16_t3 outImageExtent - ) +#ifndef __HLSL_VERSION + explicit inline operator bool() const { - Parameters retval; - retval.perWG = perWG; - return retval; + return bool(perWG); } +#endif - SPerWorkgroup perWG; - // general settings - uint32_t lastChannel : 2; - uint32_t coverage : 1; - uint32_t unused : 29; + SPerWorkgroup perWG; // rename to perBlitWG? + //! general settings + uint32_t inputDescIx : 19; + uint32_t samplerDescIx : 11; + uint32_t unused0 : 2; + // + uint32_t outputDescIx : 19; + uint32_t channelCount : 3; + uint32_t unused1 : 10; + // + uint32_t unused2 : 12; //! 
coverage settings + uint32_t intermAlphaDescIx : 19; + uint32_t coverage : 1; // required to compare the atomic count of passing pixels against, so we can get original coverage uint32_t inPixelCount; }; diff --git a/include/nbl/video/IGPUCommandBuffer.h b/include/nbl/video/IGPUCommandBuffer.h index 620206349a..185ba365ef 100644 --- a/include/nbl/video/IGPUCommandBuffer.h +++ b/include/nbl/video/IGPUCommandBuffer.h @@ -416,6 +416,11 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject //! dispatches bool dispatch(const uint32_t groupCountX, const uint32_t groupCountY=1, const uint32_t groupCountZ=1); + template requires std::is_integral_v + bool dispatch(const hlsl::vector groupCount) + { + return dispatch(groupCount.x,groupCount.y,groupCount.z); + } bool dispatchIndirect(const asset::SBufferBinding& binding); //! Begin/End RenderPasses diff --git a/include/nbl/video/utilities/CComputeBlit.h b/include/nbl/video/utilities/CComputeBlit.h index 6288c6080c..033d0774f1 100644 --- a/include/nbl/video/utilities/CComputeBlit.h +++ b/include/nbl/video/utilities/CComputeBlit.h @@ -151,16 +151,16 @@ class CComputeBlit : public core::IReferenceCounted template static inline hlsl::blit::SPerWorkgroup computePerWorkGroup( const uint16_t sharedMemorySize, const typename BlitUtilities::convolution_kernels_t& kernels, const IGPUImage::E_TYPE type, - const bool halfPrecision, const hlsl::uint16_t3 inExtent, const hlsl::uint16_t3 outExtent + const hlsl::uint16_t3 inExtent, const hlsl::uint16_t3 outExtent, const bool halfPrecision=false ) { const hlsl::float32_t3 minSupport(std::get<0>(kernels).getMinSupport(), std::get<1>(kernels).getMinSupport(), std::get<2>(kernels).getMinSupport()); const hlsl::float32_t3 maxSupport(std::get<0>(kernels).getMaxSupport(), std::get<1>(kernels).getMaxSupport(), std::get<2>(kernels).getMaxSupport()); - return computePerWorkGroup(sharedMemorySize,minSupport,maxSupport,type,halfPrecision); + return computePerWorkGroup(sharedMemorySize,minSupport,maxSupport,type,inExtent,outExtent,halfPrecision); } - static hlsl::blit::SPerWorkgroup computePerWorkGroup( + NBL_API2 static hlsl::blit::SPerWorkgroup computePerWorkGroup( const uint16_t sharedMemorySize, const hlsl::float32_t3 minSupportInOutput, const hlsl::float32_t3 maxSupportInOutput, const IGPUImage::E_TYPE type, - const bool halfPrecision, const hlsl::uint16_t3 inExtent, const hlsl::uint16_t3 outExtent + const hlsl::uint16_t3 inExtent, const hlsl::uint16_t3 outExtent, const bool halfPrecision=false ); #if 0 diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp index 58d43963e3..90dbf1a273 100644 --- a/src/nbl/video/utilities/CComputeBlit.cpp +++ b/src/nbl/video/utilities/CComputeBlit.cpp @@ -117,8 +117,8 @@ struct ConstevalParameters } SPerWorkgroup CComputeBlit::computePerWorkGroup( - const uint16_t sharedMemorySize, const float32_t3 minSupportInOutput, const float32_t3 maxSupportInOutput, - const IGPUImage::E_TYPE type, const bool halfPrecision, const uint16_t3 inExtent, const uint16_t3 outExtent + const uint16_t sharedMemorySize, const float32_t3 minSupportInOutput, const float32_t3 maxSupportInOutput, const IGPUImage::E_TYPE type, + const uint16_t3 inExtent, const uint16_t3 outExtent, const bool halfPrecision ) { SPerWorkgroup retval; @@ -157,7 +157,7 @@ SPerWorkgroup CComputeBlit::computePerWorkGroup( if (requiredSharedMemory>size_t(sharedMemorySize)) break; // still fits, update return value - retval = SPerWorkgroup::create(scale,output,preload,otherPreloadOffset); + retval 
= SPerWorkgroup::create(scale,Dims,output,preload,otherPreloadOffset); } // we want to fix the dimension that's the smallest, so that we increase the volume of the support by a smallest increment and stay close to a cube shape From 2bca4ab30403748e9899790779b519dfe8b7c7cd Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Fri, 8 Nov 2024 17:22:16 +0100 Subject: [PATCH 162/358] add NBL_LLVM_ENABLE_ASSERTIONS --- 3rdparty/dxc/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/3rdparty/dxc/CMakeLists.txt b/3rdparty/dxc/CMakeLists.txt index d17b2d8908..8b34c76f88 100644 --- a/3rdparty/dxc/CMakeLists.txt +++ b/3rdparty/dxc/CMakeLists.txt @@ -1,3 +1,6 @@ +option(NBL_LLVM_ENABLE_ASSERTIONS "LLVM_ENABLE_ASSERTIONS" ON) + +list(APPEND NBL_DXC_CMAKE_OPTIONS "-DLLVM_ENABLE_ASSERTIONS:BOOL=${NBL_LLVM_ENABLE_ASSERTIONS}") list(APPEND NBL_DXC_CMAKE_OPTIONS "-DHLSL_OPTIONAL_PROJS_IN_DEFAULT:BOOL=OFF") list(APPEND NBL_DXC_CMAKE_OPTIONS "-DHLSL_ENABLE_ANALYZE:BOOL=OFF") list(APPEND NBL_DXC_CMAKE_OPTIONS "-DHLSL_OFFICIAL_BUILD:BOOL=OFF") From 316d23f052ba3468def7d6e23dbaf258e02bf7de Mon Sep 17 00:00:00 2001 From: Fletterio Date: Fri, 8 Nov 2024 16:51:28 -0300 Subject: [PATCH 163/358] Small fix on merge conflict --- src/nbl/asset/utils/ISPIRVOptimizer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/nbl/asset/utils/ISPIRVOptimizer.cpp b/src/nbl/asset/utils/ISPIRVOptimizer.cpp index f3c4589e4a..2716bcd70f 100644 --- a/src/nbl/asset/utils/ISPIRVOptimizer.cpp +++ b/src/nbl/asset/utils/ISPIRVOptimizer.cpp @@ -43,7 +43,6 @@ nbl::core::smart_refctd_ptr ISPIRVOptimizer::optimize(const uint32_t &spvtools::CreateIfConversionPass, &spvtools::CreateStripDebugInfoPass, //&spvtools::CreateAggressiveDCEPass ->>>>>>> master }; auto msgConsumer = [&logger](spv_message_level_t level, const char* src, const spv_position_t& pos, const char* msg) From f59e68e15f3d3095aeb8badd625f554aa9e925ae Mon Sep 17 00:00:00 2001 From: Fletterio Date: Fri, 8 Nov 2024 16:53:36 -0300 Subject: [PATCH 164/358] Examples submodule pointer update --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 54e0ab15f3..ddfc8d0995 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 54e0ab15f3ee5734be10d73ca236322013797a2e +Subproject commit ddfc8d0995e117bfdb1924d493cd4fdb40cb478c From dd66a09f40c093fe0590ad20ea1a8d1079076116 Mon Sep 17 00:00:00 2001 From: Erfan Ahmadi Date: Sat, 9 Nov 2024 11:09:51 +0400 Subject: [PATCH 165/358] apply fixes to uint8_t clamping of swapchain values --- examples_tests | 2 +- include/nbl/video/CVulkanSwapchain.h | 6 +++--- src/nbl/video/CVulkanSwapchain.cpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples_tests b/examples_tests index 8766b00ddd..15dbab0d57 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 8766b00ddd1fc0d6e35ff662fd7a12fbd5051a45 +Subproject commit 15dbab0d5782422e8346b58eeceec36082d75f11 diff --git a/include/nbl/video/CVulkanSwapchain.h b/include/nbl/video/CVulkanSwapchain.h index 5a16ef0a5e..20843a3fbb 100644 --- a/include/nbl/video/CVulkanSwapchain.h +++ b/include/nbl/video/CVulkanSwapchain.h @@ -24,7 +24,7 @@ class CVulkanSwapchain final : public ISwapchain inline VkSwapchainKHR getInternalObject() const {return m_vkSwapchainKHR;} // returns the maximum number of time acquire can be called without releasing the image index through present. 
- inline uint8_t getMaxBlockingAcquiresBeforePresent() const override { return static_cast(m_maxBlockingAcquiresBeforePresent); } + inline uint8_t getMaxBlockingAcquiresBeforePresent() const override { return m_maxBlockingAcquiresBeforePresent; } // returns the maximum number of acquires you can request without waiting for previous acquire semaphores to signal. uint8_t getMaxAcquiresInFlight() const override { return getImageCount(); } @@ -39,7 +39,7 @@ class CVulkanSwapchain final : public ISwapchain const VkSemaphore* const _acquireAdaptorSemaphores, const VkSemaphore* const _prePresentSemaphores, const VkSemaphore* const _presentAdaptorSemaphores, - const uint32_t maxAcquiresBeforePresent + const uint8_t maxAcquiresBeforePresent ); ~CVulkanSwapchain(); @@ -58,7 +58,7 @@ class CVulkanSwapchain final : public ISwapchain uint64_t m_perImageAcquireCount[ISwapchain::MaxImages] = { 0 }; // nasty way to fight UB of the Vulkan spec bool m_needToWaitIdle = true; - uint32_t m_maxBlockingAcquiresBeforePresent = 0u; + uint8_t m_maxBlockingAcquiresBeforePresent = 0u; }; } diff --git a/src/nbl/video/CVulkanSwapchain.cpp b/src/nbl/video/CVulkanSwapchain.cpp index 9b5fba8b93..c1c248d4a0 100644 --- a/src/nbl/video/CVulkanSwapchain.cpp +++ b/src/nbl/video/CVulkanSwapchain.cpp @@ -151,7 +151,7 @@ core::smart_refctd_ptr CVulkanSwapchain::create(core::smart_re ISurface::SCapabilities caps = {}; params.surface->getSurfaceCapabilitiesForPhysicalDevice(physDev, caps); // read https://registry.khronos.org/vulkan/specs/1.3-extensions/html/chap34.html#swapchain-acquire-forward-progress - const uint32_t maxAcquiresWithoutPresent = imageCount - caps.minImageCount + 1u; + const uint8_t maxAcquiresWithoutPresent = core::min(imageCount,ISwapchain::MaxImages) - caps.minImageCount + 1u; return core::smart_refctd_ptr(new CVulkanSwapchain(std::move(device),std::move(params),imageCount,std::move(oldSwapchain),vk_swapchain,semaphores+imageCount,semaphores,semaphores+2*imageCount, maxAcquiresWithoutPresent),core::dont_grab); } @@ -165,7 +165,7 @@ CVulkanSwapchain::CVulkanSwapchain( const VkSemaphore* const _acquireAdaptorSemaphores, const VkSemaphore* const _prePresentSemaphores, const VkSemaphore* const _presentAdaptorSemaphores, - const uint32_t maxAcquiresBeforePresent) + const uint8_t maxAcquiresBeforePresent) : ISwapchain(std::move(logicalDevice),std::move(params),imageCount,std::move(oldSwapchain)), m_imgMemRequirements{.size=0,.memoryTypeBits=0x0u,.alignmentLog2=63,.prefersDedicatedAllocation=true,.requiresDedicatedAllocation=true}, m_vkSwapchainKHR(swapchain), m_maxBlockingAcquiresBeforePresent(maxAcquiresBeforePresent) { From bd61877908bab98d95c6b046a09ae2bb78812060 Mon Sep 17 00:00:00 2001 From: Erfan Ahmadi Date: Sat, 9 Nov 2024 12:08:18 +0400 Subject: [PATCH 166/358] update examples --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 15dbab0d57..36e5d21a71 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 15dbab0d5782422e8346b58eeceec36082d75f11 +Subproject commit 36e5d21a71d0e2dda798d0fc78e85421000b51c6 From 1e84ef0d0639fd5a28560007e09f93f34269eb67 Mon Sep 17 00:00:00 2001 From: Erfan Ahmadi Date: Sat, 9 Nov 2024 12:14:20 +0400 Subject: [PATCH 167/358] update examples --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 54e0ab15f3..0df6008c92 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 
54e0ab15f3ee5734be10d73ca236322013797a2e +Subproject commit 0df6008c923712ed729b47cf7711c6301622fdb3 From aae1dd8131170e7abed72886e4811dbc7208fe45 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Sat, 9 Nov 2024 21:06:30 -0300 Subject: [PATCH 168/358] PR review refactor --- include/nbl/builtin/hlsl/workgroup/fft.hlsl | 192 ++++++++++---------- 1 file changed, 100 insertions(+), 92 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup/fft.hlsl b/include/nbl/builtin/hlsl/workgroup/fft.hlsl index 8ec1df2e21..7605bf8a0e 100644 --- a/include/nbl/builtin/hlsl/workgroup/fft.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/fft.hlsl @@ -23,102 +23,76 @@ namespace fft { // ---------------------------------- Utils ----------------------------------------------- -template -struct exchangeValues + +// No need to expose these +namespace impl { - static void __call(NBL_REF_ARG(complex_t) lo, NBL_REF_ARG(complex_t) hi, uint32_t threadID, uint32_t stride, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor) + template + struct exchangeValues { - const bool topHalf = bool(threadID & stride); - // Pack into float vector because ternary operator does not support structs - vector exchanged = topHalf ? vector(lo.real(), lo.imag()) : vector(hi.real(), hi.imag()); - shuffleXor >(exchanged, stride, sharedmemAdaptor); - if (topHalf) - { - lo.real(exchanged.x); - lo.imag(exchanged.y); - } - else + static void __call(NBL_REF_ARG(complex_t) lo, NBL_REF_ARG(complex_t) hi, uint32_t threadID, uint32_t stride, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor) { - hi.real(exchanged.x); - hi.imag(exchanged.y); + const bool topHalf = bool(threadID & stride); + // Pack into float vector because ternary operator does not support structs + vector exchanged = topHalf ? vector(lo.real(), lo.imag()) : vector(hi.real(), hi.imag()); + shuffleXor >(exchanged, stride, sharedmemAdaptor); + if (topHalf) + { + lo.real(exchanged.x); + lo.imag(exchanged.y); + } + else + { + hi.real(exchanged.x); + hi.imag(exchanged.y); + } } - } -}; - -// Get the required size (in number of uint32_t elements) of the workgroup shared memory array needed for the FFT -template -NBL_CONSTEXPR uint32_t SharedMemoryDWORDs = (sizeof(complex_t) / sizeof(uint32_t)) * WorkgroupSize; - + }; -template -enable_if_t bitShiftRightHigher(uint32_t i) -{ - // Highest H bits are numbered N-1 through N - H - // N - H is then the middle bit - // Lowest bits numbered from 0 through N - H - 1 - uint32_t low = i & ((1 << (N - H)) - 1); - uint32_t mid = i & (1 << (N - H)); - uint32_t high = i & ~((1 << (N - H + 1)) - 1); - - high >>= 1; - mid <<= H - 1; - - return mid | high | low; -} + template + enable_if_t<(H <= N) && (N < 32), uint32_t> circularBitShiftRightHigher(uint32_t i) + { + // Highest H bits are numbered N-1 through N - H + // N - H is then the middle bit + // Lowest bits numbered from 0 through N - H - 1 + NBL_CONSTEXPR_STATIC_INLINE uint32_t lowMask = (1 << (N - H)) - 1; + NBL_CONSTEXPR_STATIC_INLINE uint32_t midMask = 1 << (N - H); + NBL_CONSTEXPR_STATIC_INLINE uint32_t highMask = ~((1 << (N - H + 1)) - 1); -template -enable_if_t bitShiftLeftHigher(uint32_t i) -{ - // Highest H bits are numbered N-1 through N - H - // N - 1 is then the highest bit, and N - 2 through N - H are the middle bits - // Lowest bits numbered from 0 through N - H - 1 - uint32_t low = i & ((1 << (N - H)) - 1); - uint32_t mid = i & (~((1 << (N - H)) - 1) | ~(1 << (N - 1))); - uint32_t high = i & (1 << (N - 1)); + uint32_t low = i & lowMask; + uint32_t mid = i & midMask; + uint32_t high = i & 
highMask; - mid <<= 1; - high >>= H - 1; + high >>= 1; + mid <<= H - 1; - return mid | high | low; -} + return mid | high | low; + } -// This function maps the index `idx` in the output array of a Forward FFT to the index `freqIdx` in the DFT such that `DFT[freqIdx] = output[idx]` -// This is because Cooley-Tukey + subgroup operations end up spewing out the outputs in a weird order -template -uint32_t getFrequencyIndex(uint32_t outputIdx) -{ - NBL_CONSTEXPR_STATIC_INLINE uint32_t ELEMENTS_PER_INVOCATION_LOG_2 = uint32_t(mpl::log2::value); - NBL_CONSTEXPR_STATIC_INLINE uint32_t FFT_SIZE_LOG_2 = ELEMENTS_PER_INVOCATION_LOG_2 + uint32_t(mpl::log2::value); + template + enable_if_t<(H <= N) && (N < 32), uint32_t> circularBitShiftLeftHigher(uint32_t i) + { + // Highest H bits are numbered N-1 through N - H + // N - 1 is then the highest bit, and N - 2 through N - H are the middle bits + // Lowest bits numbered from 0 through N - H - 1 + NBL_CONSTEXPR_STATIC_INLINE uint32_t lowMask = (1 << (N - H)) - 1; + NBL_CONSTEXPR_STATIC_INLINE uint32_t midMask = ~((1 << (N - H)) - 1) | ~(1 << (N - 1)); + NBL_CONSTEXPR_STATIC_INLINE uint32_t highMask = 1 << (N - 1); - return bitShiftRightHigher(glsl::bitfieldReverse(outputIdx) >> (32 - FFT_SIZE_LOG_2)); -} + uint32_t low = i & lowMask; + uint32_t mid = i & midMask; + uint32_t high = i & highMask; -// This function maps the index `freqIdx` in the DFT to the index `idx` in the output array of a Forward FFT such that `DFT[freqIdx] = output[idx]` -// It is essentially the inverse of `getFrequencyIndex` -template -uint32_t getOutputIndex(uint32_t freqIdx) -{ - NBL_CONSTEXPR_STATIC_INLINE uint32_t ELEMENTS_PER_INVOCATION_LOG_2 = uint32_t(mpl::log2::value); - NBL_CONSTEXPR_STATIC_INLINE uint32_t FFT_SIZE_LOG_2 = ELEMENTS_PER_INVOCATION_LOG_2 + uint32_t(mpl::log2::value); + mid <<= 1; + high >>= H - 1; - return glsl::bitfieldReverse(bitShiftLeftHigher(freqIdx)) >> (32 - FFT_SIZE_LOG_2); -} - -// Mirrors an index about the Nyquist frequency -template -uint32_t mirror(uint32_t idx) -{ - NBL_CONSTEXPR_STATIC_INLINE uint32_t FFT_SIZE = WorkgroupSize * uint32_t(ElementsPerInvocation); - return (FFT_SIZE - idx) & (FFT_SIZE - 1); -} + return mid | high | low; + } +} //namespace impl -// When packing real FFTs a common operation is to get `DFT[T]` and `DFT[-T]` to unpack the result of a packed real FFT. 
-// Given an index `idx` into the Nabla-ordered DFT such that `output[idx] = DFT[T]`, this function is such that `output[getNegativeIndex(idx)] = DFT[-T]` -template -uint32_t getNegativeIndex(uint32_t idx) -{ - return getOutputIndex(mirror(getFrequencyIndex(idx))); -} +// Get the required size (in number of uint32_t elements) of the workgroup shared memory array needed for the FFT +template +NBL_CONSTEXPR uint32_t SharedMemoryDWORDs = (sizeof(complex_t) / sizeof(uint32_t)) * WorkgroupSize; // Util to unpack two values from the packed FFT X + iY - get outputs in the same input arguments, storing x to lo and y to hi template @@ -129,11 +103,45 @@ void unpack(NBL_REF_ARG(complex_t) lo, NBL_REF_ARG(complex_t) hi lo = x; } +template +struct FFTIndexingUtils +{ + // This function maps the index `idx` in the output array of a Nabla FFT to the index `freqIdx` in the DFT such that `DFT[freqIdx] = NablaFFT[idx]` + // This is because Cooley-Tukey + subgroup operations end up spewing out the outputs in a weird order + static uint32_t getDFTIndex(uint32_t outputIdx) + { + return impl::circularBitShiftRightHigher(glsl::bitfieldReverse(outputIdx) >> (32 - FFTSizeLog2)); + } + + // This function maps the index `freqIdx` in the DFT to the index `idx` in the output array of a Nabla FFT such that `DFT[freqIdx] = NablaFFT[idx]` + // It is essentially the inverse of `getDFTIndex` + static uint32_t getNablaIndex(uint32_t freqIdx) + { + return glsl::bitfieldReverse(impl::circularBitShiftLeftHigher(freqIdx)) >> (32 - FFTSizeLog2); + } + + // Mirrors an index about the Nyquist frequency in the DFT order + static uint32_t getDFTMirrorIndex(uint32_t idx) + { + return (FFTSize - idx) & (FFTSize - 1); + } + + // Given an index `idx` of an element into the Nabla FFT, get the index into the Nabla FFT of the element corresponding to its negative frequency + static uint32_t getNablaMirrorIndex(uint32_t idx) + { + return getNablaIndex(getDFTMirrorIndex(getDFTIndex(idx))); + } + + NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocationLog2 = mpl::log2::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t FFTSizeLog2 = ElementsPerInvocationLog2 + mpl::log2::value; + NBL_CONSTEXPR_STATIC_INLINE uint32_t FFTSize = uint32_t(WorkgroupSize) * uint32_t(ElementsPerInvocation); +}; + } //namespace fft // ----------------------------------- End Utils ----------------------------------------------- -template +template struct FFT; // For the FFT methods below, we assume: @@ -153,13 +161,13 @@ struct FFT; // * void workgroupExecutionAndMemoryBarrier(); // 2 items per invocation forward specialization -template +template struct FFT<2,false, WorkgroupSize, Scalar, device_capabilities> { template static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t) lo, NBL_REF_ARG(complex_t) hi, uint32_t threadID, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor) { - fft::exchangeValues::__call(lo, hi, threadID, stride, sharedmemAdaptor); + fft::impl::exchangeValues::__call(lo, hi, threadID, stride, sharedmemAdaptor); // Get twiddle with k = threadID mod stride, halfN = stride hlsl::fft::DIF::radix2(hlsl::fft::twiddle(threadID & (stride - 1), stride), lo, hi); @@ -199,7 +207,7 @@ struct FFT<2,false, WorkgroupSize, Scalar, device_capabilities> } // special last workgroup-shuffle - fft::exchangeValues::__call(lo, hi, threadID, glsl::gl_SubgroupSize(), sharedmemAdaptor); + fft::impl::exchangeValues::__call(lo, hi, threadID, glsl::gl_SubgroupSize(), sharedmemAdaptor); // Remember to update the accessor's state sharedmemAccessor = 
sharedmemAdaptor.accessor; @@ -217,7 +225,7 @@ struct FFT<2,false, WorkgroupSize, Scalar, device_capabilities> // 2 items per invocation inverse specialization -template +template struct FFT<2,true, WorkgroupSize, Scalar, device_capabilities> { template @@ -226,7 +234,7 @@ struct FFT<2,true, WorkgroupSize, Scalar, device_capabilities> // Get twiddle with k = threadID mod stride, halfN = stride hlsl::fft::DIT::radix2(hlsl::fft::twiddle(threadID & (stride - 1), stride), lo, hi); - fft::exchangeValues::__call(lo, hi, threadID, stride, sharedmemAdaptor); + fft::impl::exchangeValues::__call(lo, hi, threadID, stride, sharedmemAdaptor); } @@ -255,7 +263,7 @@ struct FFT<2,true, WorkgroupSize, Scalar, device_capabilities> sharedmemAdaptor.accessor = sharedmemAccessor; // special first workgroup-shuffle - fft::exchangeValues::__call(lo, hi, threadID, glsl::gl_SubgroupSize(), sharedmemAdaptor); + fft::impl::exchangeValues::__call(lo, hi, threadID, glsl::gl_SubgroupSize(), sharedmemAdaptor); // The bigger steps [unroll] @@ -283,7 +291,7 @@ struct FFT<2,true, WorkgroupSize, Scalar, device_capabilities> }; // Forward FFT -template +template struct FFT { template @@ -326,7 +334,7 @@ struct FFT }; // Inverse FFT -template +template struct FFT { template From 0365e3075a6adaf82bae17ba75b4f0dfc143a462 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Sat, 9 Nov 2024 21:11:25 -0300 Subject: [PATCH 169/358] Bit shift masks change --- include/nbl/builtin/hlsl/workgroup/fft.hlsl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup/fft.hlsl b/include/nbl/builtin/hlsl/workgroup/fft.hlsl index 7605bf8a0e..7d43caa63a 100644 --- a/include/nbl/builtin/hlsl/workgroup/fft.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/fft.hlsl @@ -57,7 +57,7 @@ namespace impl // Lowest bits numbered from 0 through N - H - 1 NBL_CONSTEXPR_STATIC_INLINE uint32_t lowMask = (1 << (N - H)) - 1; NBL_CONSTEXPR_STATIC_INLINE uint32_t midMask = 1 << (N - H); - NBL_CONSTEXPR_STATIC_INLINE uint32_t highMask = ~((1 << (N - H + 1)) - 1); + NBL_CONSTEXPR_STATIC_INLINE uint32_t highMask = ~(lowMask | midMask); uint32_t low = i & lowMask; uint32_t mid = i & midMask; @@ -76,8 +76,8 @@ namespace impl // N - 1 is then the highest bit, and N - 2 through N - H are the middle bits // Lowest bits numbered from 0 through N - H - 1 NBL_CONSTEXPR_STATIC_INLINE uint32_t lowMask = (1 << (N - H)) - 1; - NBL_CONSTEXPR_STATIC_INLINE uint32_t midMask = ~((1 << (N - H)) - 1) | ~(1 << (N - 1)); NBL_CONSTEXPR_STATIC_INLINE uint32_t highMask = 1 << (N - 1); + NBL_CONSTEXPR_STATIC_INLINE uint32_t midMask = ~(lowMask | highMask); uint32_t low = i & lowMask; uint32_t mid = i & midMask; From 7df72fee46fa04fa6be3bbbd3aecb028c2096560 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 11 Nov 2024 10:21:48 +0700 Subject: [PATCH 170/358] ngfx integration --- include/nbl/video/IAPIConnection.h | 14 ++++- include/nbl/video/utilities/ngfx.h | 79 +++++++++++++++++++++++++++++ src/nbl/video/CVulkanConnection.cpp | 15 ++++-- src/nbl/video/IAPIConnection.cpp | 3 ++ 4 files changed, 105 insertions(+), 6 deletions(-) create mode 100644 include/nbl/video/utilities/ngfx.h diff --git a/include/nbl/video/IAPIConnection.h b/include/nbl/video/IAPIConnection.h index b20a3573d1..840944bab2 100644 --- a/include/nbl/video/IAPIConnection.h +++ b/include/nbl/video/IAPIConnection.h @@ -11,7 +11,7 @@ #include "nbl/video/debug/IDebugCallback.h" #include "nbl/video/utilities/renderdoc.h" - +#include "nbl/video/utilities/ngfx.h" namespace nbl::video { @@ 
-61,7 +61,16 @@ class NBL_API2 IAPIConnection : public core::IReferenceCounted const SFeatures& getEnabledFeatures() const { return m_enabledFeatures; } - const bool isRunningInRenderdoc() const { return m_rdoc_api; } + enum DebuggerType + { + EDT_NONE, + EDT_RENDERDOC, + EDT_NGFX + }; + const DebuggerType isRunningInGraphicsDebugger() const { + return m_ngfx_api.useNGFX ? EDT_NGFX : // ngfx takes priority? + m_rdoc_api ? EDT_RENDERDOC : EDT_NONE; + } virtual bool startCapture() = 0; virtual bool endCapture() = 0; @@ -70,6 +79,7 @@ class NBL_API2 IAPIConnection : public core::IReferenceCounted std::vector> m_physicalDevices; renderdoc_api_t* m_rdoc_api; + ngfx_api_t m_ngfx_api; SFeatures m_enabledFeatures = {}; }; diff --git a/include/nbl/video/utilities/ngfx.h b/include/nbl/video/utilities/ngfx.h new file mode 100644 index 0000000000..af260b72e2 --- /dev/null +++ b/include/nbl/video/utilities/ngfx.h @@ -0,0 +1,79 @@ +#ifndef _NBL_VIDEO_UTILITIES_NGFX_H_INCLUDED_ +#define _NBL_VIDEO_UTILITIES_NGFX_H_INCLUDED_ + +#include "C:\Program Files\NVIDIA Corporation\Nsight Graphics 2024.1.0\SDKs\NsightGraphicsSDK\0.8.0\include\NGFX_Injection.h" + +namespace nbl::video +{ + struct SNGFXIntegration + { + bool useNGFX; + NGFX_Injection_InstallationInfo versionInfo; + }; + + bool injectNGFXToProcess(SNGFXIntegration& api) + { + uint32_t numInstallations = 0; + auto result = NGFX_Injection_EnumerateInstallations(&numInstallations, nullptr); + if (numInstallations == 0 || NGFX_INJECTION_RESULT_OK != result) + { + api.useNGFX = false; + return false; + } + + std::vector installations(numInstallations); + result = NGFX_Injection_EnumerateInstallations(&numInstallations, installations.data()); + if (numInstallations == 0 || NGFX_INJECTION_RESULT_OK != result) + { + api.useNGFX = false; + return false; + } + + // get latest installation + api.versionInfo = installations.back(); + + uint32_t numActivities = 0; + result = NGFX_Injection_EnumerateActivities(&api.versionInfo, &numActivities, nullptr); + if (numActivities == 0 || NGFX_INJECTION_RESULT_OK != result) + { + api.useNGFX = false; + return false; + } + + std::vector activities(numActivities); + result = NGFX_Injection_EnumerateActivities(&api.versionInfo, &numActivities, activities.data()); + if (NGFX_INJECTION_RESULT_OK != result) + { + api.useNGFX = false; + return false; + } + + const NGFX_Injection_Activity* pActivityToInject = nullptr; + for (const NGFX_Injection_Activity& activity : activities) + { + if (activity.type == NGFX_INJECTION_ACTIVITY_FRAME_DEBUGGER) // only want frame debugger + { + pActivityToInject = &activity; + break; + } + } + + if (!pActivityToInject) { + api.useNGFX = false; + return false; + } + + result = NGFX_Injection_InjectToProcess(&api.versionInfo, pActivityToInject); + if (NGFX_INJECTION_RESULT_OK != result) + { + api.useNGFX = false; + return false; + } + + return true; + } + + using ngfx_api_t = SNGFXIntegration; +} + +#endif //_NBL_VIDEO_UTILITIES_NGFX_H_INCLUDED_ \ No newline at end of file diff --git a/src/nbl/video/CVulkanConnection.cpp b/src/nbl/video/CVulkanConnection.cpp index 737059d947..c0b5fe1e7f 100644 --- a/src/nbl/video/CVulkanConnection.cpp +++ b/src/nbl/video/CVulkanConnection.cpp @@ -323,7 +323,8 @@ CVulkanConnection::~CVulkanConnection() bool CVulkanConnection::startCapture() { - if (!isRunningInRenderdoc()) + auto debugType = isRunningInGraphicsDebugger(); + if (debugType == EDT_NONE) return false; if (flag.test()) { @@ -335,13 +336,17 @@ bool CVulkanConnection::startCapture() } flag.test_and_set(); - 
m_rdoc_api->StartFrameCapture(RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(m_vkInstance), NULL); + if (debugType == EDT_RENDERDOC) + m_rdoc_api->StartFrameCapture(RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(m_vkInstance), NULL); + else + NGFX_Injection_ExecuteActivityCommand(); return true; } bool CVulkanConnection::endCapture() { - if (!isRunningInRenderdoc()) + auto debugType = isRunningInGraphicsDebugger(); + if (debugType == EDT_NONE) return false; if (!flag.test()) { @@ -352,7 +357,9 @@ bool CVulkanConnection::endCapture() return false; } - m_rdoc_api->EndFrameCapture(RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(m_vkInstance), NULL); + if (debugType == EDT_RENDERDOC) + m_rdoc_api->EndFrameCapture(RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(m_vkInstance), NULL); + // no equivalent end frame capture for ngfx, ends captures on next frame delimiter flag.clear(); return true; } diff --git a/src/nbl/video/IAPIConnection.cpp b/src/nbl/video/IAPIConnection.cpp index 5b3d72760f..750fd02f5d 100644 --- a/src/nbl/video/IAPIConnection.cpp +++ b/src/nbl/video/IAPIConnection.cpp @@ -43,6 +43,9 @@ IAPIConnection::IAPIConnection(const SFeatures& enabledFeatures) int ret = RENDERDOC_GetAPI(MinRenderdocVersion, (void**)&m_rdoc_api); assert(ret == 1); #endif + + // probably is platform agnostic, for now + injectNGFXToProcess(m_ngfx_api); } } From ecb1d09bb82ac5f843d74b4d08009b50da9f7988 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 11 Nov 2024 11:19:57 +0700 Subject: [PATCH 171/358] fixed some linker issues --- include/nbl/video/utilities/ngfx.h | 8 +++++++- src/nbl/video/CVulkanConnection.cpp | 2 +- src/nbl/video/IAPIConnection.cpp | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/utilities/ngfx.h b/include/nbl/video/utilities/ngfx.h index af260b72e2..91eabfa2b3 100644 --- a/include/nbl/video/utilities/ngfx.h +++ b/include/nbl/video/utilities/ngfx.h @@ -1,6 +1,7 @@ #ifndef _NBL_VIDEO_UTILITIES_NGFX_H_INCLUDED_ #define _NBL_VIDEO_UTILITIES_NGFX_H_INCLUDED_ +// TODO: hopefully this is temporary #include "C:\Program Files\NVIDIA Corporation\Nsight Graphics 2024.1.0\SDKs\NsightGraphicsSDK\0.8.0\include\NGFX_Injection.h" namespace nbl::video @@ -11,7 +12,7 @@ namespace nbl::video NGFX_Injection_InstallationInfo versionInfo; }; - bool injectNGFXToProcess(SNGFXIntegration& api) + inline bool injectNGFXToProcess(SNGFXIntegration& api) { uint32_t numInstallations = 0; auto result = NGFX_Injection_EnumerateInstallations(&numInstallations, nullptr); @@ -73,6 +74,11 @@ namespace nbl::video return true; } + inline void executeNGFXCommand() + { + NGFX_Injection_ExecuteActivityCommand(); + } + using ngfx_api_t = SNGFXIntegration; } diff --git a/src/nbl/video/CVulkanConnection.cpp b/src/nbl/video/CVulkanConnection.cpp index c0b5fe1e7f..e1a33a1418 100644 --- a/src/nbl/video/CVulkanConnection.cpp +++ b/src/nbl/video/CVulkanConnection.cpp @@ -339,7 +339,7 @@ bool CVulkanConnection::startCapture() if (debugType == EDT_RENDERDOC) m_rdoc_api->StartFrameCapture(RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(m_vkInstance), NULL); else - NGFX_Injection_ExecuteActivityCommand(); + executeNGFXCommand(); return true; } diff --git a/src/nbl/video/IAPIConnection.cpp b/src/nbl/video/IAPIConnection.cpp index 750fd02f5d..8dc156bb94 100644 --- a/src/nbl/video/IAPIConnection.cpp +++ b/src/nbl/video/IAPIConnection.cpp @@ -2,6 +2,7 @@ #include "nbl/video/IPhysicalDevice.h" #include "nbl/video/utilities/renderdoc.h" +#include "nbl/video/utilities/ngfx.h" #if defined(_NBL_POSIX_API_) #include From 
51cbd88629622cd071c098499c2ddf68d98110c4 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Tue, 12 Nov 2024 00:58:02 +0100 Subject: [PATCH 172/358] @alichraghi small homework - read https://en.cppreference.com/w/cpp/language/inline and afterwards https://gudok.xyz/inline/ please --- src/nbl/asset/utils/IShaderCompiler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/asset/utils/IShaderCompiler.cpp b/src/nbl/asset/utils/IShaderCompiler.cpp index bde89557ec..01cd360389 100644 --- a/src/nbl/asset/utils/IShaderCompiler.cpp +++ b/src/nbl/asset/utils/IShaderCompiler.cpp @@ -24,7 +24,7 @@ IShaderCompiler::IShaderCompiler(core::smart_refctd_ptr&& syste m_defaultIncludeFinder = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_system)); } -inline core::smart_refctd_ptr nbl::asset::IShaderCompiler::compileToSPIRV(const std::string_view code, const SCompilerOptions& options) const +core::smart_refctd_ptr nbl::asset::IShaderCompiler::compileToSPIRV(const std::string_view code, const SCompilerOptions& options) const { CCache::SEntry entry; std::vector dependencies; From 6c0b3cb73e62ba709920c4beb916eaf53b6571f8 Mon Sep 17 00:00:00 2001 From: Erfan Ahmadi Date: Tue, 12 Nov 2024 10:55:58 +0400 Subject: [PATCH 173/358] CVulkanSwapchain allow 2 more acquiresInFlight --- include/nbl/video/CVulkanSwapchain.h | 6 ++++-- src/nbl/video/CVulkanSwapchain.cpp | 14 ++++++++------ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/include/nbl/video/CVulkanSwapchain.h b/include/nbl/video/CVulkanSwapchain.h index 20843a3fbb..5a5c7de492 100644 --- a/include/nbl/video/CVulkanSwapchain.h +++ b/include/nbl/video/CVulkanSwapchain.h @@ -27,7 +27,7 @@ class CVulkanSwapchain final : public ISwapchain inline uint8_t getMaxBlockingAcquiresBeforePresent() const override { return m_maxBlockingAcquiresBeforePresent; } // returns the maximum number of acquires you can request without waiting for previous acquire semaphores to signal. - uint8_t getMaxAcquiresInFlight() const override { return getImageCount(); } + uint8_t getMaxAcquiresInFlight() const override { return m_maxAcquiresInFlight; } private: CVulkanSwapchain( @@ -39,7 +39,8 @@ class CVulkanSwapchain final : public ISwapchain const VkSemaphore* const _acquireAdaptorSemaphores, const VkSemaphore* const _prePresentSemaphores, const VkSemaphore* const _presentAdaptorSemaphores, - const uint8_t maxAcquiresBeforePresent + const uint8_t maxAcquiresBeforePresent, + const uint8_t maxAcquiresInFlight ); ~CVulkanSwapchain(); @@ -59,6 +60,7 @@ class CVulkanSwapchain final : public ISwapchain // nasty way to fight UB of the Vulkan spec bool m_needToWaitIdle = true; uint8_t m_maxBlockingAcquiresBeforePresent = 0u; + uint8_t m_maxAcquiresInFlight = 0u; }; } diff --git a/src/nbl/video/CVulkanSwapchain.cpp b/src/nbl/video/CVulkanSwapchain.cpp index c1c248d4a0..69757110df 100644 --- a/src/nbl/video/CVulkanSwapchain.cpp +++ b/src/nbl/video/CVulkanSwapchain.cpp @@ -138,9 +138,12 @@ core::smart_refctd_ptr CVulkanSwapchain::create(core::smart_re .pNext = nullptr, .flags = 0 }; + + // imageCount + 2u, because we allow max 2 pending submissions to wait on image availability while others are being presented or rendered into. 
+ const uint8_t maxAcquiresinFlight = core::min(imageCount + 2u, ISwapchain::MaxImages); VkSemaphore semaphores[ISwapchain::MaxImages]; - for (auto i=0u; i<3*imageCount; i++) - if (vk.vkCreateSemaphore(vk_device,i CVulkanSwapchain::create(core::smart_re // read https://registry.khronos.org/vulkan/specs/1.3-extensions/html/chap34.html#swapchain-acquire-forward-progress const uint8_t maxAcquiresWithoutPresent = core::min(imageCount,ISwapchain::MaxImages) - caps.minImageCount + 1u; - return core::smart_refctd_ptr(new CVulkanSwapchain(std::move(device),std::move(params),imageCount,std::move(oldSwapchain),vk_swapchain,semaphores+imageCount,semaphores,semaphores+2*imageCount, maxAcquiresWithoutPresent),core::dont_grab); + return core::smart_refctd_ptr(new CVulkanSwapchain(std::move(device),std::move(params),imageCount,std::move(oldSwapchain),vk_swapchain,semaphores+maxAcquiresinFlight,semaphores,semaphores+2*maxAcquiresinFlight, maxAcquiresWithoutPresent, maxAcquiresinFlight),core::dont_grab); } CVulkanSwapchain::CVulkanSwapchain( @@ -165,7 +168,8 @@ CVulkanSwapchain::CVulkanSwapchain( const VkSemaphore* const _acquireAdaptorSemaphores, const VkSemaphore* const _prePresentSemaphores, const VkSemaphore* const _presentAdaptorSemaphores, - const uint8_t maxAcquiresBeforePresent) + const uint8_t maxAcquiresBeforePresent, + const uint8_t maxAcquiresInFlight) : ISwapchain(std::move(logicalDevice),std::move(params),imageCount,std::move(oldSwapchain)), m_imgMemRequirements{.size=0,.memoryTypeBits=0x0u,.alignmentLog2=63,.prefersDedicatedAllocation=true,.requiresDedicatedAllocation=true}, m_vkSwapchainKHR(swapchain), m_maxBlockingAcquiresBeforePresent(maxAcquiresBeforePresent) { @@ -179,8 +183,6 @@ CVulkanSwapchain::CVulkanSwapchain( auto retval = vulkanDevice->getFunctionTable()->vk.vkGetSwapchainImagesKHR(vulkanDevice->getInternalObject(),m_vkSwapchainKHR,&dummy,m_images); assert(retval==VK_SUCCESS && dummy==getImageCount()); } - const uint8_t maxAcquiresInFlight = getMaxAcquiresInFlight(); - assert(maxAcquiresInFlight == imageCount); assert(getMaxBlockingAcquiresBeforePresent() <= imageCount); std::copy_n(_acquireAdaptorSemaphores,maxAcquiresInFlight,m_acquireAdaptorSemaphores); From d37770f6dfab7f19de6c798ddd2c5a93256ee7ae Mon Sep 17 00:00:00 2001 From: Erfan Ahmadi Date: Tue, 12 Nov 2024 11:04:00 +0400 Subject: [PATCH 174/358] fix and remove assumption that imageCount == acquiresInFlight --- src/nbl/video/CVulkanSwapchain.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/nbl/video/CVulkanSwapchain.cpp b/src/nbl/video/CVulkanSwapchain.cpp index 69757110df..af551fd28d 100644 --- a/src/nbl/video/CVulkanSwapchain.cpp +++ b/src/nbl/video/CVulkanSwapchain.cpp @@ -171,7 +171,10 @@ CVulkanSwapchain::CVulkanSwapchain( const uint8_t maxAcquiresBeforePresent, const uint8_t maxAcquiresInFlight) : ISwapchain(std::move(logicalDevice),std::move(params),imageCount,std::move(oldSwapchain)), - m_imgMemRequirements{.size=0,.memoryTypeBits=0x0u,.alignmentLog2=63,.prefersDedicatedAllocation=true,.requiresDedicatedAllocation=true}, m_vkSwapchainKHR(swapchain), m_maxBlockingAcquiresBeforePresent(maxAcquiresBeforePresent) + m_imgMemRequirements{.size=0,.memoryTypeBits=0x0u,.alignmentLog2=63,.prefersDedicatedAllocation=true,.requiresDedicatedAllocation=true}, + m_vkSwapchainKHR(swapchain), + m_maxBlockingAcquiresBeforePresent(maxAcquiresBeforePresent), + m_maxAcquiresInFlight(maxAcquiresInFlight) { // we've got it from here! 
if (m_oldSwapchain) @@ -205,7 +208,7 @@ CVulkanSwapchain::~CVulkanSwapchain() .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO, .pNext = nullptr, .flags = 0, - .semaphoreCount = getImageCount(), + .semaphoreCount = getMaxAcquiresInFlight(), .pSemaphores = m_prePresentSemaphores, .pValues = m_perImageAcquireCount }; @@ -213,7 +216,7 @@ CVulkanSwapchain::~CVulkanSwapchain() if (vk.vkWaitSemaphores(vk_device,&info,~0ull)!=VK_TIMEOUT) break; - for (auto i=0u; i Date: Tue, 12 Nov 2024 14:56:38 +0100 Subject: [PATCH 175/358] looking in empty cache was borken --- src/nbl/asset/utils/IShaderCompiler.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nbl/asset/utils/IShaderCompiler.cpp b/src/nbl/asset/utils/IShaderCompiler.cpp index bde89557ec..095ed717d8 100644 --- a/src/nbl/asset/utils/IShaderCompiler.cpp +++ b/src/nbl/asset/utils/IShaderCompiler.cpp @@ -272,6 +272,7 @@ IShaderCompiler::CCache::EntrySet::const_iterator IShaderCompiler::CCache::find_ { auto found = m_container.find(mainFile); // go through all dependencies + if (found!=m_container.end()) for (auto i = 0; i < found->dependencies.size(); i++) { const auto& dependency = found->dependencies[i]; From 9a233c723f9896820627f8798ad35b2e5df3428a Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 12 Nov 2024 16:04:10 +0100 Subject: [PATCH 176/358] fix preload sizes --- include/nbl/video/utilities/CComputeBlit.h | 2 +- src/nbl/video/utilities/CComputeBlit.cpp | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/nbl/video/utilities/CComputeBlit.h b/include/nbl/video/utilities/CComputeBlit.h index 033d0774f1..a06f8914c4 100644 --- a/include/nbl/video/utilities/CComputeBlit.h +++ b/include/nbl/video/utilities/CComputeBlit.h @@ -159,7 +159,7 @@ class CComputeBlit : public core::IReferenceCounted return computePerWorkGroup(sharedMemorySize,minSupport,maxSupport,type,inExtent,outExtent,halfPrecision); } NBL_API2 static hlsl::blit::SPerWorkgroup computePerWorkGroup( - const uint16_t sharedMemorySize, const hlsl::float32_t3 minSupportInOutput, const hlsl::float32_t3 maxSupportInOutput, const IGPUImage::E_TYPE type, + const uint16_t sharedMemorySize, const hlsl::float32_t3 minSupportInInput, const hlsl::float32_t3 maxSupportInInput, const IGPUImage::E_TYPE type, const hlsl::uint16_t3 inExtent, const hlsl::uint16_t3 outExtent, const bool halfPrecision=false ); diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp index 90dbf1a273..3d6c85cd74 100644 --- a/src/nbl/video/utilities/CComputeBlit.cpp +++ b/src/nbl/video/utilities/CComputeBlit.cpp @@ -117,7 +117,7 @@ struct ConstevalParameters } SPerWorkgroup CComputeBlit::computePerWorkGroup( - const uint16_t sharedMemorySize, const float32_t3 minSupportInOutput, const float32_t3 maxSupportInOutput, const IGPUImage::E_TYPE type, + const uint16_t sharedMemorySize, const float32_t3 minSupportInInput, const float32_t3 maxSupportInInput, const IGPUImage::E_TYPE type, const uint16_t3 inExtent, const uint16_t3 outExtent, const bool halfPrecision ) { @@ -126,16 +126,16 @@ SPerWorkgroup CComputeBlit::computePerWorkGroup( const auto Dims = static_cast(type)+1; const auto scale = float32_t3(inExtent)/float32_t3(outExtent); - const auto supportWidthInOutput = maxSupportInOutput-minSupportInOutput; + const auto supportWidthInInput = maxSupportInInput-minSupportInInput; IGPUImage::E_TYPE minDimAxes[3] = { IGPUImage::ET_1D, IGPUImage::ET_2D, IGPUImage::ET_3D }; using namespace nbl::hlsl; for (uint16_t3 output(1,1,1); true;) { // now try and 
grow our support - const auto combinedSupportInOutput = supportWidthInOutput+float32_t3(output-uint16_t3(1,1,1)); + const auto combinedSupportInInput = supportWidthInInput+float32_t3(output-uint16_t3(1,1,1))*scale; // note that its not ceil on purpose - uint32_t3 preload = uint32_t3(hlsl::floor(combinedSupportInOutput*scale))+uint32_t3(1,1,1); + uint32_t3 preload = uint32_t3(hlsl::floor(combinedSupportInInput))+uint32_t3(1,1,1); // Set the unused dimensions to 1 to avoid weird behaviours with scaled kernels for (auto a=Dims; a<3; a++) preload[a] = 1; @@ -162,9 +162,9 @@ SPerWorkgroup CComputeBlit::computePerWorkGroup( // we want to fix the dimension that's the smallest, so that we increase the volume of the support by a smallest increment and stay close to a cube shape { - std::sort(minDimAxes,minDimAxes+Dims,[output](const IGPUImage::E_TYPE a, const IGPUImage::E_TYPE b)->bool + std::sort(minDimAxes,minDimAxes+Dims,[preload](const IGPUImage::E_TYPE a, const IGPUImage::E_TYPE b)->bool { - return output[a] Date: Tue, 12 Nov 2024 16:22:13 +0100 Subject: [PATCH 177/358] move acccessor concepts to their own folder and namespace --- .../concepts/{ => accessors}/anisotropically_sampled.hlsl | 7 +++++-- .../hlsl/concepts/{ => accessors}/loadable_image.hlsl | 7 +++++-- .../builtin/hlsl/concepts/{ => accessors}/mip_mapped.hlsl | 7 +++++-- .../hlsl/concepts/{ => accessors}/storable_image.hlsl | 7 +++++-- src/nbl/builtin/CMakeLists.txt | 8 ++++---- 5 files changed, 24 insertions(+), 12 deletions(-) rename include/nbl/builtin/hlsl/concepts/{ => accessors}/anisotropically_sampled.hlsl (87%) rename include/nbl/builtin/hlsl/concepts/{ => accessors}/loadable_image.hlsl (92%) rename include/nbl/builtin/hlsl/concepts/{ => accessors}/mip_mapped.hlsl (88%) rename include/nbl/builtin/hlsl/concepts/{ => accessors}/storable_image.hlsl (87%) diff --git a/include/nbl/builtin/hlsl/concepts/anisotropically_sampled.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/anisotropically_sampled.hlsl similarity index 87% rename from include/nbl/builtin/hlsl/concepts/anisotropically_sampled.hlsl rename to include/nbl/builtin/hlsl/concepts/accessors/anisotropically_sampled.hlsl index b04d1786d8..e6019d056c 100644 --- a/include/nbl/builtin/hlsl/concepts/anisotropically_sampled.hlsl +++ b/include/nbl/builtin/hlsl/concepts/accessors/anisotropically_sampled.hlsl @@ -1,8 +1,8 @@ // Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ANISOTROPICALLY_SAMPLED_INCLUDED_ -#define _NBL_BUILTIN_HLSL_CONCEPTS_ANISOTROPICALLY_SAMPLED_INCLUDED_ +#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_ANISOTROPICALLY_SAMPLED_INCLUDED_ +#define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_ANISOTROPICALLY_SAMPLED_INCLUDED_ #include @@ -14,6 +14,8 @@ namespace hlsl { namespace concepts { +namespace accessors +{ // declare concept #define NBL_CONCEPT_NAME AnisotropicallySampled #define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(int32_t) @@ -44,4 +46,5 @@ NBL_CONCEPT_END( } } } +} #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/concepts/loadable_image.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/loadable_image.hlsl similarity index 92% rename from include/nbl/builtin/hlsl/concepts/loadable_image.hlsl rename to include/nbl/builtin/hlsl/concepts/accessors/loadable_image.hlsl index 2f32ba48d1..c272eeb1ab 100644 --- a/include/nbl/builtin/hlsl/concepts/loadable_image.hlsl +++ b/include/nbl/builtin/hlsl/concepts/accessors/loadable_image.hlsl @@ -1,8 +1,8 @@ // Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_LOADABLE_IMAGE_INCLUDED_ -#define _NBL_BUILTIN_HLSL_CONCEPTS_LOADABLE_IMAGE_INCLUDED_ +#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_LOADABLE_IMAGE_INCLUDED_ +#define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_LOADABLE_IMAGE_INCLUDED_ #include @@ -14,6 +14,8 @@ namespace hlsl { namespace concepts { +namespace accessors +{ // declare concept #define NBL_CONCEPT_NAME StorableImage #define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(int32_t) @@ -63,4 +65,5 @@ NBL_CONCEPT_END( } } } +} #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/concepts/mip_mapped.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/mip_mapped.hlsl similarity index 88% rename from include/nbl/builtin/hlsl/concepts/mip_mapped.hlsl rename to include/nbl/builtin/hlsl/concepts/accessors/mip_mapped.hlsl index 5aa9a7da88..c49e66617b 100644 --- a/include/nbl/builtin/hlsl/concepts/mip_mapped.hlsl +++ b/include/nbl/builtin/hlsl/concepts/accessors/mip_mapped.hlsl @@ -1,8 +1,8 @@ // Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_MIP_MAPPED_INCLUDED_ -#define _NBL_BUILTIN_HLSL_CONCEPTS_MIP_MAPPED_INCLUDED_ +#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_MIP_MAPPED_INCLUDED_ +#define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_MIP_MAPPED_INCLUDED_ #include @@ -14,6 +14,8 @@ namespace hlsl { namespace concepts { +namespace accessors +{ // declare concept #define NBL_CONCEPT_NAME MipMapped #define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(int32_t) @@ -41,4 +43,5 @@ NBL_CONCEPT_END( } } } +} #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/concepts/storable_image.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/storable_image.hlsl similarity index 87% rename from include/nbl/builtin/hlsl/concepts/storable_image.hlsl rename to include/nbl/builtin/hlsl/concepts/accessors/storable_image.hlsl index a46d211b82..7eda9b9303 100644 --- a/include/nbl/builtin/hlsl/concepts/storable_image.hlsl +++ b/include/nbl/builtin/hlsl/concepts/accessors/storable_image.hlsl @@ -1,8 +1,8 @@ // Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_STORABLE_IMAGE_INCLUDED_ -#define _NBL_BUILTIN_HLSL_CONCEPTS_STORABLE_IMAGE_INCLUDED_ +#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_STORABLE_IMAGE_INCLUDED_ +#define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_STORABLE_IMAGE_INCLUDED_ #include @@ -14,6 +14,8 @@ namespace hlsl { namespace concepts { +namespace accessors +{ // declare concept #define NBL_CONCEPT_NAME StorableImage #define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(int32_t) @@ -41,4 +43,5 @@ NBL_CONCEPT_END( } } } +} #endif \ No newline at end of file diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index 1ec29c4e8b..2818b3f64c 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -325,9 +325,9 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/enums.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/binding_info.hlsl") # LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/__end.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/anisotropically_sampled.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/loadable_image.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/mip_mapped.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/storable_image.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/anisotropically_sampled.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/loadable_image.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/mip_mapped.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/storable_image.hlsl") ADD_CUSTOM_BUILTIN_RESOURCES(nblBuiltinResourceData NBL_RESOURCES_TO_EMBED "${NBL_ROOT_PATH}/include" "nbl/builtin" "nbl::builtin" "${NBL_ROOT_PATH_BINARY}/include" "${NBL_ROOT_PATH_BINARY}/src" "STATIC" "INTERNAL") From 60b3a8dbdb21da8359cadf5e1714ff277b15dbdc Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 13 Nov 2024 10:57:45 +0700 Subject: [PATCH 178/358] fix typo --- include/nbl/video/ILogicalDevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 7c4c7cd264..276ed74501 100644 --- 
a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -941,7 +941,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe NBL_LOG_ERROR("The queryPool was not created by this device"); return false; } - if (firstQuery + queryCount >= queryPool->getCreationParameters().queryCount) + if (firstQuery + queryCount > queryPool->getCreationParameters().queryCount) { NBL_LOG_ERROR("Query index out of bounds"); return false; From c5f2fcdcbb56cac16d95ba7fe67c125db53b67e6 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 13 Nov 2024 10:58:36 +0700 Subject: [PATCH 179/358] get to latest working example --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 79cc5532b6..e603c07fa1 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 79cc5532b61caa0837e8b0ed098c07d22896b3d0 +Subproject commit e603c07fa171de61d0344612d2c1cf4c15e5bd1a From 8025337f5ea7d1370f75127f81453ad05d84657b Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 13 Nov 2024 14:24:57 +0700 Subject: [PATCH 180/358] add new spirv intrinsic unpack --- examples_tests | 2 +- include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index e603c07fa1..e1992f9dc6 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit e603c07fa171de61d0344612d2c1cf4c15e5bd1a +Subproject commit e1992f9dc6a5fe2a5331019a2a207e4fb5452d65 diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl index 2a16e29133..e3e59b466b 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl @@ -197,6 +197,9 @@ SquareMatrix matrixInverse(NBL_CONST_REF_ARG(SquareMatrix) mat); [[vk::ext_instruction(GLSLstd450UnpackSnorm2x16, "GLSL.std.450")]] float32_t2 unpackSnorm2x16(uint32_t p); +[[vk::ext_instruction(GLSLstd450UnpackUnorm4x8, "GLSL.std.450")]] +float32_t4 unpackUnorm4x8(uint32_t p); + // Memory Semantics link here: https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id- // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_memory_semantics_id From 9f25ce511c49fe4faf1e4588355b26b6b538202b Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Wed, 13 Nov 2024 18:49:15 +0100 Subject: [PATCH 181/358] make Nabla docker build "exec" based with git cache volume --- CMakeLists.txt | 1 + docker/.env | 2 + docker/Dockerfile | 116 +++++++++++++ docker/compose.yml | 28 +++ docker/compose/Dockerfile | 162 ------------------ .../ci/stages/.env/platform/windows/.env | 8 - .../stages/dev/axes/dynamic/debug/compose.yml | 81 --------- .../dev/axes/dynamic/release/compose.yml | 79 --------- .../axes/dynamic/relwithdebinfo/compose.yml | 81 --------- .../stages/dev/axes/static/debug/compose.yml | 81 --------- .../dev/axes/static/release/compose.yml | 79 --------- .../axes/static/relwithdebinfo/compose.yml | 81 --------- docker/compose/ci/stages/dev/compose.yml | 39 ----- docker/compose/scripts/os/javaHome.py | 23 --- docker/compose/scripts/os/resources/zoo.cfg | 6 - docker/dev.py | 62 ------- docker/scripts/__init__.py | 0 docker/scripts/nbl/ci/dev/.vscode/launch.json | 117 ------------- docker/scripts/nbl/ci/dev/__init__.py | 0 docker/scripts/nbl/ci/dev/build.py | 84 --------- docker/scripts/nbl/ci/dev/cmake.py | 47 ----- docker/scripts/nbl/ci/dev/cpack.py | 72 -------- 
docker/scripts/nbl/ci/dev/lib/kazoo.py | 105 ------------ docker/scripts/ncpfmp.bat | 13 -- 24 files changed, 147 insertions(+), 1220 deletions(-) create mode 100644 docker/.env create mode 100644 docker/Dockerfile create mode 100644 docker/compose.yml delete mode 100644 docker/compose/Dockerfile delete mode 100644 docker/compose/ci/stages/.env/platform/windows/.env delete mode 100644 docker/compose/ci/stages/dev/axes/dynamic/debug/compose.yml delete mode 100644 docker/compose/ci/stages/dev/axes/dynamic/release/compose.yml delete mode 100644 docker/compose/ci/stages/dev/axes/dynamic/relwithdebinfo/compose.yml delete mode 100644 docker/compose/ci/stages/dev/axes/static/debug/compose.yml delete mode 100644 docker/compose/ci/stages/dev/axes/static/release/compose.yml delete mode 100644 docker/compose/ci/stages/dev/axes/static/relwithdebinfo/compose.yml delete mode 100644 docker/compose/ci/stages/dev/compose.yml delete mode 100644 docker/compose/scripts/os/javaHome.py delete mode 100644 docker/compose/scripts/os/resources/zoo.cfg delete mode 100644 docker/dev.py delete mode 100644 docker/scripts/__init__.py delete mode 100644 docker/scripts/nbl/ci/dev/.vscode/launch.json delete mode 100644 docker/scripts/nbl/ci/dev/__init__.py delete mode 100644 docker/scripts/nbl/ci/dev/build.py delete mode 100644 docker/scripts/nbl/ci/dev/cmake.py delete mode 100644 docker/scripts/nbl/ci/dev/cpack.py delete mode 100644 docker/scripts/nbl/ci/dev/lib/kazoo.py delete mode 100644 docker/scripts/ncpfmp.bat diff --git a/CMakeLists.txt b/CMakeLists.txt index 0976e00b52..4c0994c44b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,7 @@ enable_language(C CXX ASM ASM_NASM) if(MSVC) enable_language(ASM_MASM) + link_libraries(delayimp) endif() option(NBL_STATIC_BUILD "" OFF) # ON for static builds, OFF for shared diff --git a/docker/.env b/docker/.env new file mode 100644 index 0000000000..623184f422 --- /dev/null +++ b/docker/.env @@ -0,0 +1,2 @@ +THIS_PROJECT_WORKING_DIRECTORY=C:\docker +THIS_PROJECT_NABLA_DIRECTORY=C:/Users/ContainerAdministrator/Nabla/bind \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000000..a0427ccfeb --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,116 @@ +# escape=` + +ARG BASE_IMAGE=mcr.microsoft.com/windows/servercore:ltsc2022-amd64 + +FROM ${BASE_IMAGE} + +SHELL ["cmd", "/S", "/C"] + +USER ContainerAdministrator + +ENV THIS_PROJECT_WORKING_DIRECTORY="C:\docker" +ENV THIS_PROJECT_NABLA_DIRECTORY="C:/Users/ContainerAdministrator/Nabla/bind" +ENV VULKAN_SDK_INSTALL_DIRECTORY="${THIS_PROJECT_WORKING_DIRECTORY}\dependencies\VulkanSDK" +ENV VS_INSTALL_DIRECTORY="${THIS_PROJECT_WORKING_DIRECTORY}\dependencies\VS\BuildTools" + +RUN ` + # Download the Build Tools (17.11.5 October 8, 2024 version) bootstrapper. https://learn.microsoft.com/en-us/visualstudio/releases/2022/release-history + ` + curl -SL --output vs_buildtools.exe https://download.visualstudio.microsoft.com/download/pr/69e24482-3b48-44d3-af65-51f866a08313/471c9a89fa8ba27d356748ae0cf25eb1f362184992dc0bb6e9ccf10178c43c27/vs_BuildTools.exe ` + ` + # Install Build Tools with the Microsoft.VisualStudio.Workload.VCTools recommended workload and ATL & ATLMFC, excluding some Windows SDKs. 
+ ` + && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache ` + --installPath "%VS_INSTALL_DIRECTORY%" ` + --add Microsoft.VisualStudio.Workload.VCTools --includeRecommended ` + --add Microsoft.VisualStudio.Component.VC.ATL ` + --add Microsoft.VisualStudio.Component.VC.ATLMFC ` + --remove Microsoft.VisualStudio.Component.Windows10SDK.10240 ` + --remove Microsoft.VisualStudio.Component.Windows10SDK.10586 ` + --remove Microsoft.VisualStudio.Component.Windows10SDK.14393 ` + --remove Microsoft.VisualStudio.Component.Windows81SDK ` + || IF "%ERRORLEVEL%"=="3010" EXIT 0) ` + ` + # Add VS's CMake to the system PATH and cleanup + ` + && setx PATH "%PATH%;%VS_INSTALL_DIRECTORY%\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin" /M ` + ` + # Cleanup + ` + && del /q vs_buildtools.exe + +ENV VS_DEV_CMD_DIRECTORY="${VS_INSTALL_DIRECTORY}\Common7\Tools" + +RUN ` + # Add VS_DEV_CMD_DIRECTORY to the system PATH + ` + setx PATH "%PATH%;%VS_DEV_CMD_DIRECTORY%" /M + +RUN ` + # Download VulkanSDK + ` + curl -SL --output VulkanSDK-Installer.exe https://sdk.lunarg.com/sdk/download/1.3.268.0/windows/VulkanSDK-1.3.268.0-Installer.exe ` + ` + # Install VulkanSDK + ` + && VulkanSDK-Installer.exe install --root "%VULKAN_SDK_INSTALL_DIRECTORY%" --default-answer --accept-licenses --confirm-command ` + ` + # Cleanup + ` + && del /q VulkanSDK-Installer.exe + +RUN ` + # Download & install choco packet manager + ` + powershell Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1')) + +RUN ` + # Download & install executable Strawberry Perl + ` + choco install -y strawberryperl --version 5.28.2.1 + +RUN ` + # Download & install Python + ` + choco install -y python --version 3.11.6 + +RUN ` + # Download & install git + ` + choco install -y git --version 2.43.0 + +RUN ` + # Download & install nasm + ` + choco install -y nasm --version 2.16.1 + +RUN ` + # Download & install ninja + ` + choco install -y ninja --version 1.12.1 + +RUN ` + # Enable Long Paths feature + ` + reg add "HKLM\SYSTEM\CurrentControlSet\Control\FileSystem" /v "LongPathsEnabled" /t REG_DWORD /d 1 /f + +RUN ` + # Force git to use HTTPS protocol & trust containers + ` + git config --system protocol.*.allow always ` + ` + && git config --system url."https://github.com/".insteadOf "git@github.com:" ` + ` + && git config --system --add safe.directory * + +RUN ` + # Post environment setup + ` + setx VS_INSTALL_DIRECTORY "%VS_INSTALL_DIRECTORY%" /M ` + ` + && setx PATH "%PATH%;%VS_INSTALL_DIRECTORY%\VC\Auxiliary\Build" /M ` + ` + && setx NBL_CI_MODE "ON" + +WORKDIR ${THIS_PROJECT_NABLA_DIRECTORY} +ENTRYPOINT ["cmd.exe", "/K"] \ No newline at end of file diff --git a/docker/compose.yml b/docker/compose.yml new file mode 100644 index 0000000000..04c1013081 --- /dev/null +++ b/docker/compose.yml @@ -0,0 +1,28 @@ +services: + nabla: + build: + context: . 
+ dockerfile: Dockerfile + image: dcr.devsh.eu/nabla/source + container_name: dev.nabla.build + env_file: + - .env + environment: + - THIS_PROJECT_WORKING_DIRECTORY=${THIS_PROJECT_WORKING_DIRECTORY} + - THIS_PROJECT_NABLA_DIRECTORY=${THIS_PROJECT_NABLA_DIRECTORY} + volumes: + - nabla-cache-git:${THIS_PROJECT_NABLA_DIRECTORY}/.git + networks: + docker_default: + deploy: + resources: + limits: + cpus: '6' + memory: 12G + +volumes: + nabla-cache-git: + +networks: + docker_default: + external: true \ No newline at end of file diff --git a/docker/compose/Dockerfile b/docker/compose/Dockerfile deleted file mode 100644 index 959c5de7b3..0000000000 --- a/docker/compose/Dockerfile +++ /dev/null @@ -1,162 +0,0 @@ -# escape=` - -ARG BASE_IMAGE=mcr.microsoft.com/windows/servercore:ltsc2022 - -ARG THIS_PROJECT_WORKING_DIRECTORY="C:\docker" -ARG THIS_PROJECT_NABLA_DIRECTORY="C:/Users/ContainerAdministrator/Nabla/bind" -ARG VULKAN_SDK_INSTALL_DIRECTORY="${THIS_PROJECT_WORKING_DIRECTORY}\dependencies\VulkanSDK" -ARG VS_INSTALL_DIRECTORY="${THIS_PROJECT_WORKING_DIRECTORY}\dependencies\VS\BuildTools" -ARG APACHE_ZOOKEEPER_INSTALL_DIRECTORY="${THIS_PROJECT_WORKING_DIRECTORY}\dependencies\ApacheZooKeeper" -ARG JAVA_HOME_SCRIPT="${THIS_PROJECT_WORKING_DIRECTORY}\dependencies\scripts\java" -ARG VS_DEV_CMD_DIRECTORY="${VS_INSTALL_DIRECTORY}\Common7\Tools" - -FROM ${BASE_IMAGE} - -SHELL ["cmd", "/S", "/C"] - -ARG VS_INSTALL_DIRECTORY - -RUN ` - # Download the Build Tools bootstrapper. - ` - curl -SL --output vs_buildtools.exe https://aka.ms/vs/17/release/vs_buildtools.exe ` - ` - # Install Build Tools with the Microsoft.VisualStudio.Workload.VCTools recommended workload and ATL & ATLMFC, excluding some Windows SDKs. - ` - && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache ` - --installPath "%VS_INSTALL_DIRECTORY%" ` - --add Microsoft.VisualStudio.Workload.VCTools --includeRecommended ` - --add Microsoft.VisualStudio.Component.VC.ATL ` - --add Microsoft.VisualStudio.Component.VC.ATLMFC ` - --remove Microsoft.VisualStudio.Component.Windows10SDK.10240 ` - --remove Microsoft.VisualStudio.Component.Windows10SDK.10586 ` - --remove Microsoft.VisualStudio.Component.Windows10SDK.14393 ` - --remove Microsoft.VisualStudio.Component.Windows81SDK ` - || IF "%ERRORLEVEL%"=="3010" EXIT 0) ` - ` - # add VS's CMake to the system PATH and cleanup - ` - && setx PATH "%PATH%;%VS_INSTALL_DIRECTORY%\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin" /M ` - ` - # Cleanup - ` - && del /q vs_buildtools.exe - -ARG VULKAN_SDK_INSTALL_DIRECTORY - -RUN ` - # Download VulkanSDK - ` - curl -SL --output VulkanSDK-Installer.exe https://sdk.lunarg.com/sdk/download/1.3.268.0/windows/VulkanSDK-1.3.268.0-Installer.exe ` - ` - # Install VulkanSDK - ` - && VulkanSDK-Installer.exe install --root "%VULKAN_SDK_INSTALL_DIRECTORY%" --default-answer --accept-licenses --confirm-command ` - ` - # Cleanup - ` - && del /q VulkanSDK-Installer.exe - -RUN ` - # Download & install choco packet manager - ` - powershell Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1')) - -RUN ` - # Download & install executable Strawberry Perl 5.28.2.1 - ` - choco install -y strawberryperl --version 5.28.2.1 - -RUN ` - # Download & install Python 3.11.6 - ` - choco install -y python --version 3.11.6 - -RUN ` - # Donwload debugpy Python module - ` - python -m 
pip install --upgrade debugpy - -RUN ` - # Download & install git 2.43.0 - ` - choco install -y git --version 2.43.0 - -ARG APACHE_ZOOKEEPER_INSTALL_DIRECTORY - -RUN ` - # Download Apache ZooKeeper - ` - curl -SL --output zookeeper.zip https://dlcdn.apache.org/zookeeper/stable/apache-zookeeper-3.8.4-bin.tar.gz ` - ` - # Create install directory - ` - && mkdir "%APACHE_ZOOKEEPER_INSTALL_DIRECTORY%" ` - ` - # Unpack - ` - && tar -xf zookeeper.zip -C "%APACHE_ZOOKEEPER_INSTALL_DIRECTORY%" ` - ` - # Cleanup - ` - && del /q zookeeper.zip ` - ` - && setx PATH "%PATH%;%APACHE_ZOOKEEPER_INSTALL_DIRECTORY%\apache-zookeeper-3.8.4-bin\bin" /M - -RUN ` - # Download kazoo 2.8.0 Python (more recent versions doesn't work well with Windows) module - ` - python -m pip install kazoo==2.8.0 - -RUN ` - # Download psutil Python module - ` - python -m pip install psutil - -RUN ` - # Download OpenJDK 11 LTS - ` - choco install -y openjdk11 - -RUN ` - # Download & install nasm 2.16.1 - ` - choco install -y nasm --version 2.16.1 - -RUN ` - # Download & install nano 7.2.36 - ` - choco install -y nano --version 7.2.36 - -ARG THIS_PROJECT_WORKING_DIRECTORY - -RUN ` - setx THIS_PROJECT_WORKING_DIRECTORY "%THIS_PROJECT_WORKING_DIRECTORY%" /M - -ARG THIS_PROJECT_NABLA_DIRECTORY - -RUN ` - setx THIS_PROJECT_NABLA_DIRECTORY "%THIS_PROJECT_NABLA_DIRECTORY%" /M ` - && setx PATH "%PATH%;%THIS_PROJECT_NABLA_DIRECTORY%/docker/scripts" /M - -RUN ` - git config --system --add safe.directory * - -ARG JAVA_HOME_SCRIPT - -COPY scripts\os\javaHome.py ${JAVA_HOME_SCRIPT}\javaHome.py - -RUN ` - py "%JAVA_HOME_SCRIPT%\javaHome.py" - -COPY scripts\os\resources\zoo.cfg ${APACHE_ZOOKEEPER_INSTALL_DIRECTORY}\apache-zookeeper-3.8.3-bin\conf\zoo.cfg - -ARG VS_DEV_CMD_DIRECTORY - -RUN ` - setx PATH "%PATH%;%VS_DEV_CMD_DIRECTORY%" /M - -RUN ` - reg add "HKLM\SYSTEM\CurrentControlSet\Control\FileSystem" /v "LongPathsEnabled" /t REG_DWORD /d 1 /f - -ENTRYPOINT ["powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"] \ No newline at end of file diff --git a/docker/compose/ci/stages/.env/platform/windows/.env b/docker/compose/ci/stages/.env/platform/windows/.env deleted file mode 100644 index b0b0f41a5c..0000000000 --- a/docker/compose/ci/stages/.env/platform/windows/.env +++ /dev/null @@ -1,8 +0,0 @@ -THIS_PROJECT_PLATFORM=windows -THIS_PROJECT_BASE_IMAGE=artifactory.devsh.eu/nabla/${THIS_PROJECT_PLATFORM}/base:latest -THIS_PROJECT_ARCH=x86_64 -THIS_PROJECT_WORKING_DIRECTORY="C:\docker" -THIS_PROJECT_ARTIFACTORY_NABLA_DIRECTORY="C:/Users/ContainerAdministrator/Nabla/artifactory" -THIS_PROJECT_NABLA_DIRECTORY="C:/Users/ContainerAdministrator/Nabla/bind" -NABLA_TARGET_REVISION="docker" -THIS_PROJECT_DEBUG="" \ No newline at end of file diff --git a/docker/compose/ci/stages/dev/axes/dynamic/debug/compose.yml b/docker/compose/ci/stages/dev/axes/dynamic/debug/compose.yml deleted file mode 100644 index 1ffbc08ae0..0000000000 --- a/docker/compose/ci/stages/dev/axes/dynamic/debug/compose.yml +++ /dev/null @@ -1,81 +0,0 @@ -version: '3' - -services: - nabla.kazoo.server.dynamic.debug: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.kazoo.server.dynamic.debug.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.kazoo.server.dynamic.debug.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "dynamic", "dev.dynamic", "dev.dynamic.debug"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - entrypoint: ["zkServer.cmd"] - volumes: - - type: bind - source: 
../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - healthcheck: - test: ["CMD", "ncpfmp.bat", "nbl.ci.dev.lib.kazoo", "--host", "localhost"] - interval: 30s - timeout: 10s - retries: 3 - - nabla.cmake.dynamic.debug: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.cmake.dynamic.debug.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.cmake.dynamic.debug.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "dynamic", "dev.dynamic", "dev.dynamic.debug"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - environment: - - NBL_BUILD_DIR=${THIS_PROJECT_NABLA_DIRECTORY}/build/.docker/${THIS_PROJECT_PLATFORM}/${THIS_PROJECT_ARCH}/dynamic/debug - networks: - nabla.network: - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - entrypoint: ["ncpfmp.bat", "nbl.ci.dev.cmake", "--libType", "dynamic", "--config", "debug"] - - nabla.build.dynamic.debug: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.build.dynamic.debug.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.build.dynamic.debug.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "dynamic", "dev.dynamic", "dev.dynamic.debug"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - depends_on: - nabla.cmake.dynamic.debug: - condition: service_completed_successfully - nabla.kazoo.server.dynamic.debug: - condition: service_healthy - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - entrypoint: ["ncpfmp.bat", "nbl.ci.dev.build", "--config", "Debug", "--libType", "dynamic"] - - nabla.cpack.dynamic.debug: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.cpack.dynamic.debug.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.cpack.dynamic.debug.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "dynamic", "dev.dynamic", "dev.dynamic.debug"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - depends_on: - nabla.build.dynamic.debug: - condition: service_completed_successfully - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - entrypoint: ["ncpfmp.bat", "nbl.ci.dev.cpack", "--libType", "dynamic", "--config", "Debug"] - -networks: - nabla.network: - external: true \ No newline at end of file diff --git a/docker/compose/ci/stages/dev/axes/dynamic/release/compose.yml b/docker/compose/ci/stages/dev/axes/dynamic/release/compose.yml deleted file mode 100644 index cb902eb970..0000000000 --- a/docker/compose/ci/stages/dev/axes/dynamic/release/compose.yml +++ /dev/null @@ -1,79 +0,0 @@ -version: '3' - -services: - nabla.kazoo.server.dynamic.release: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.kazoo.server.dynamic.release.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.kazoo.server.dynamic.release.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "dynamic", "dev.dynamic", "dev.dynamic.release"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - entrypoint: ["zkServer.cmd"] - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - healthcheck: - test: ["CMD", "ncpfmp.bat", "nbl.ci.dev.lib.kazoo", "--host", "localhost"] - interval: 30s - timeout: 10s - retries: 3 - - 
nabla.cmake.dynamic.release: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.cmake.dynamic.release.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.cmake.dynamic.release.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "dynamic", "dev.dynamic", "dev.dynamic.release"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - entrypoint: ["ncpfmp.bat", "nbl.ci.dev.cmake", "--libType", "dynamic", "--config", "release"] - - nabla.build.dynamic.release: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.build.dynamic.release.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.build.dynamic.release.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "dynamic", "dev.dynamic", "dev.dynamic.release"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - depends_on: - nabla.cmake.dynamic.release: - condition: service_completed_successfully - nabla.kazoo.server.dynamic.release: - condition: service_healthy - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - entrypoint: ["ncpfmp.bat", "nbl.ci.dev.build", "--config", "Release", "--libType", "dynamic"] - - nabla.cpack.dynamic.release: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.cpack.dynamic.release.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.cpack.dynamic.release.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "dynamic", "dev.dynamic", "dev.dynamic.release"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - depends_on: - nabla.build.dynamic.release: - condition: service_completed_successfully - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - entrypoint: ["ncpfmp.bat", "nbl.ci.dev.cpack", "--libType", "dynamic", "--config", "Release"] - -networks: - nabla.network: - external: true \ No newline at end of file diff --git a/docker/compose/ci/stages/dev/axes/dynamic/relwithdebinfo/compose.yml b/docker/compose/ci/stages/dev/axes/dynamic/relwithdebinfo/compose.yml deleted file mode 100644 index f56329507e..0000000000 --- a/docker/compose/ci/stages/dev/axes/dynamic/relwithdebinfo/compose.yml +++ /dev/null @@ -1,81 +0,0 @@ -version: '3' - -services: - nabla.kazoo.server.dynamic.relwithdebinfo: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.kazoo.server.dynamic.relwithdebinfo.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.kazoo.server.dynamic.relwithdebinfo.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "dynamic", "dev.dynamic", "dev.dynamic.relwithdebinfo"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - entrypoint: ["zkServer.cmd"] - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - healthcheck: - test: ["CMD", "ncpfmp.bat", "nbl.ci.dev.lib.kazoo", "--host", "localhost"] - interval: 30s - timeout: 10s - retries: 3 - - nabla.cmake.dynamic.relwithdebinfo: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.cmake.dynamic.relwithdebinfo.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.cmake.dynamic.relwithdebinfo.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - 
profiles: ["dev", "dynamic", "dev.dynamic", "dev.dynamic.relwithdebinfo"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - environment: - - NBL_BUILD_DIR=${THIS_PROJECT_NABLA_DIRECTORY}/build/.docker/${THIS_PROJECT_PLATFORM}/${THIS_PROJECT_ARCH}/dynamic/relwithdebinfo - networks: - nabla.network: - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - entrypoint: ["ncpfmp.bat", "nbl.ci.dev.cmake", "--libType", "dynamic", "--config", "relwithdebinfo"] - - nabla.build.dynamic.relwithdebinfo: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.build.dynamic.relwithdebinfo.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.build.dynamic.relwithdebinfo.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "dynamic", "dev.dynamic", "dev.dynamic.relwithdebinfo"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - depends_on: - nabla.cmake.dynamic.relwithdebinfo: - condition: service_completed_successfully - nabla.kazoo.server.dynamic.relwithdebinfo: - condition: service_healthy - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - entrypoint: ["ncpfmp.bat", "nbl.ci.dev.build", "--config", "RelWithDebInfo", "--libType", "dynamic"] - - nabla.cpack.dynamic.relwithdebinfo: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.cpack.dynamic.relwithdebinfo.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.cpack.dynamic.relwithdebinfo.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "dynamic", "dev.dynamic", "dev.dynamic.relwithdebinfo"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - depends_on: - nabla.build.dynamic.relwithdebinfo: - condition: service_completed_successfully - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - entrypoint: ["ncpfmp.bat", "nbl.ci.dev.cpack", "--libType", "dynamic", "--config", "RelWithDebInfo"] - -networks: - nabla.network: - external: true \ No newline at end of file diff --git a/docker/compose/ci/stages/dev/axes/static/debug/compose.yml b/docker/compose/ci/stages/dev/axes/static/debug/compose.yml deleted file mode 100644 index 48f2fd3800..0000000000 --- a/docker/compose/ci/stages/dev/axes/static/debug/compose.yml +++ /dev/null @@ -1,81 +0,0 @@ -version: '3' - -services: - nabla.kazoo.server.static.debug: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.kazoo.server.static.debug.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.kazoo.server.static.debug.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "static", "dev.static", "dev.static.debug"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - entrypoint: ["zkServer.cmd"] - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - healthcheck: - test: ["CMD", "ncpfmp.bat", "nbl.ci.dev.lib.kazoo", "--host", "localhost"] - interval: 30s - timeout: 10s - retries: 3 - - nabla.cmake.static.debug: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.cmake.static.debug.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.cmake.static.debug.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "static", "dev.static", "dev.static.debug"] - env_file: - - 
../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - environment: - - NBL_BUILD_DIR=${THIS_PROJECT_NABLA_DIRECTORY}/build/.docker/${THIS_PROJECT_PLATFORM}/${THIS_PROJECT_ARCH}/static/debug - networks: - nabla.network: - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - entrypoint: ["ncpfmp.bat", "nbl.ci.dev.cmake", "--libType", "static", "--config", "debug"] - - nabla.build.static.debug: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.build.static.debug.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.build.static.debug.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "static", "dev.static", "dev.static.debug"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - depends_on: - nabla.cmake.static.debug: - condition: service_completed_successfully - nabla.kazoo.server.static.debug: - condition: service_healthy - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - entrypoint: ["ncpfmp.bat", "nbl.ci.dev.build", "--config", "Debug", "--libType", "static"] - - nabla.cpack.static.debug: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.cpack.static.debug.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.cpack.static.debug.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "static", "dev.static", "dev.static.debug"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - depends_on: - nabla.build.static.debug: - condition: service_completed_successfully - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - entrypoint: ["ncpfmp.bat", "nbl.ci.dev.cpack", "--libType", "static", "--config", "Debug"] - -networks: - nabla.network: - external: true \ No newline at end of file diff --git a/docker/compose/ci/stages/dev/axes/static/release/compose.yml b/docker/compose/ci/stages/dev/axes/static/release/compose.yml deleted file mode 100644 index 0856eb9e43..0000000000 --- a/docker/compose/ci/stages/dev/axes/static/release/compose.yml +++ /dev/null @@ -1,79 +0,0 @@ -version: '3' - -services: - nabla.kazoo.server.static.release: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.kazoo.server.static.release.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.kazoo.server.static.release.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "static", "dev.static", "dev.static.release"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - entrypoint: ["zkServer.cmd"] - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - healthcheck: - test: ["CMD", "ncpfmp.bat", "nbl.ci.dev.lib.kazoo", "--host", "localhost"] - interval: 30s - timeout: 10s - retries: 3 - - nabla.cmake.static.release: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.cmake.static.release.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.cmake.static.release.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "static", "dev.static", "dev.static.release"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - entrypoint: ["ncpfmp.bat", "nbl.ci.dev.cmake", "--libType", 
"static", "--config", "release"] - - nabla.build.static.release: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.build.static.release.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.build.static.release.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "static", "dev.static", "dev.static.release"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - depends_on: - nabla.cmake.static.release: - condition: service_completed_successfully - nabla.kazoo.server.static.release: - condition: service_healthy - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - entrypoint: ["ncpfmp.bat", "nbl.ci.dev.build", "--config", "Release", "--libType", "static"] - - nabla.cpack.static.release: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.cpack.static.release.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.cpack.static.release.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "static", "dev.static", "dev.static.release"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - depends_on: - nabla.build.static.release: - condition: service_completed_successfully - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - entrypoint: ["ncpfmp.bat", "nbl.ci.dev.cpack", "--libType", "static", "--config", "Release"] - -networks: - nabla.network: - external: true \ No newline at end of file diff --git a/docker/compose/ci/stages/dev/axes/static/relwithdebinfo/compose.yml b/docker/compose/ci/stages/dev/axes/static/relwithdebinfo/compose.yml deleted file mode 100644 index 792ffb8789..0000000000 --- a/docker/compose/ci/stages/dev/axes/static/relwithdebinfo/compose.yml +++ /dev/null @@ -1,81 +0,0 @@ -version: '3' - -services: - nabla.kazoo.server.static.relwithdebinfo: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.kazoo.server.static.relwithdebinfo.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.kazoo.server.static.relwithdebinfo.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "static", "dev.static", "dev.static.relwithdebinfo"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - entrypoint: ["zkServer.cmd"] - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - healthcheck: - test: ["CMD", "ncpfmp.bat", "nbl.ci.dev.lib.kazoo", "--host", "localhost"] - interval: 30s - timeout: 10s - retries: 3 - - nabla.cmake.static.relwithdebinfo: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.cmake.static.relwithdebinfo.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.cmake.static.relwithdebinfo.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "static", "dev.static", "dev.static.relwithdebinfo"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - environment: - - NBL_BUILD_DIR=${THIS_PROJECT_NABLA_DIRECTORY}/build/.docker/${THIS_PROJECT_PLATFORM}/${THIS_PROJECT_ARCH}/static/relwithdebinfo - networks: - nabla.network: - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - entrypoint: ["ncpfmp.bat", "nbl.ci.dev.cmake", "--libType", "static", "--config", "relwithdebinfo"] - - nabla.build.static.relwithdebinfo: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: 
dev.nabla.build.static.relwithdebinfo.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.build.static.relwithdebinfo.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "static", "dev.static", "dev.static.relwithdebinfo"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - depends_on: - nabla.cmake.static.relwithdebinfo: - condition: service_completed_successfully - nabla.kazoo.server.static.relwithdebinfo: - condition: service_healthy - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - entrypoint: ["ncpfmp.bat", "nbl.ci.dev.build", "--config", "RelWithDebInfo", "--libType", "static"] - - nabla.cpack.static.relwithdebinfo: - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.cpack.static.relwithdebinfo.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.cpack.static.relwithdebinfo.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - profiles: ["dev", "static", "dev.static", "dev.static.relwithdebinfo"] - env_file: - - ../../../../.env/platform/${THIS_PROJECT_PLATFORM}/.env - networks: - nabla.network: - depends_on: - nabla.build.static.relwithdebinfo: - condition: service_completed_successfully - volumes: - - type: bind - source: ../../../../../../../../ - target: ${THIS_PROJECT_NABLA_DIRECTORY} - entrypoint: ["ncpfmp.bat", "nbl.ci.dev.cpack", "--libType", "static", "--config", "RelWithDebInfo"] - -networks: - nabla.network: - external: true \ No newline at end of file diff --git a/docker/compose/ci/stages/dev/compose.yml b/docker/compose/ci/stages/dev/compose.yml deleted file mode 100644 index f1d0b64a3d..0000000000 --- a/docker/compose/ci/stages/dev/compose.yml +++ /dev/null @@ -1,39 +0,0 @@ -include: - - path: - - axes/static/release/compose.yml - project_directory: axes/static/release - env_file: ../.env/platform/${THIS_PROJECT_PLATFORM}/.env - - path: - - axes/static/relwithdebinfo/compose.yml - project_directory: axes/static/relwithdebinfo - env_file: ../.env/platform/${THIS_PROJECT_PLATFORM}/.env - - path: - - axes/static/debug/compose.yml - project_directory: axes/static/debug - env_file: ../.env/platform/${THIS_PROJECT_PLATFORM}/.env - - path: - - axes/dynamic/release/compose.yml - project_directory: axes/dynamic/release - env_file: ../.env/platform/${THIS_PROJECT_PLATFORM}/.env - - path: - - axes/dynamic/relwithdebinfo/compose.yml - project_directory: axes/dynamic/relwithdebinfo - env_file: ../.env/platform/${THIS_PROJECT_PLATFORM}/.env - - path: - - axes/dynamic/debug/compose.yml - project_directory: axes/dynamic/debug - env_file: ../.env/platform/${THIS_PROJECT_PLATFORM}/.env - -services: - nabla.init: - build: - context: ../../../ - args: - - THIS_PROJECT_NABLA_DIRECTORY=${THIS_PROJECT_NABLA_DIRECTORY} - env_file: - - ../.env/platform/${THIS_PROJECT_PLATFORM}/.env - image: ${THIS_PROJECT_BASE_IMAGE} - container_name: dev.nabla.init.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - hostname: dev.nabla.init.${THIS_PROJECT_ARCH}.${THIS_PROJECT_PLATFORM} - networks: - nabla.network: \ No newline at end of file diff --git a/docker/compose/scripts/os/javaHome.py b/docker/compose/scripts/os/javaHome.py deleted file mode 100644 index 347a72def5..0000000000 --- a/docker/compose/scripts/os/javaHome.py +++ /dev/null @@ -1,23 +0,0 @@ -import subprocess, os, re - -completedProcess = subprocess.run( - "java -XshowSettings:properties", - shell=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - 
check=False -) - -output = completedProcess.stderr.strip() -regexMatch = re.search(r'java\.home = (.+)', output) -if regexMatch: - JAVA_HOME = regexMatch.group(1).strip() -else: - JAVA_HOME = "" - -if JAVA_HOME: - os.system(f'setx JAVA_HOME "{JAVA_HOME}" /M') - print(f'JAVA_HOME has been set to: {JAVA_HOME}') -else: - print("Error: Unable to retrieve or set JAVA_HOME.") \ No newline at end of file diff --git a/docker/compose/scripts/os/resources/zoo.cfg b/docker/compose/scripts/os/resources/zoo.cfg deleted file mode 100644 index 9dfefa90e1..0000000000 --- a/docker/compose/scripts/os/resources/zoo.cfg +++ /dev/null @@ -1,6 +0,0 @@ -tickTime=2000 -initLimit=10 -syncLimit=5 -dataDir=C:/tmp/zookeeper -clientPort=2181 -maxClientCnxns=500 \ No newline at end of file diff --git a/docker/dev.py b/docker/dev.py deleted file mode 100644 index d0ef2187e1..0000000000 --- a/docker/dev.py +++ /dev/null @@ -1,62 +0,0 @@ -import os, subprocess, sys, argparse - - -def parseInputArguments(): - parser = argparse.ArgumentParser(description="Nabla CI Pipeline compose Framework script") - - parser.add_argument("--platform", help="Target platform", type=str, default="windows") - parser.add_argument("--arch", help="Target arch", type=str, default="x86_64") - parser.add_argument('--profiles', nargs='*', default=["dev.dynamic.debug"], help='Target list of profiles to apply') - - args = parser.parse_args() - - return args - - -def updateSubmodules(root): - updateSubmoduleScript = os.path.normpath(os.path.join(root, "cmake/submodules/update.cmake")) - return subprocess.run(f"cmake -P \"{updateSubmoduleScript}\"", check=True) - - -def main(): - try: - args = parseInputArguments() - - root = os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../")) - - updateSubmodules(root) - - os.chdir(os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "compose/ci/stages/dev"))) - - platform = args.platform - arch = args.arch - - if subprocess.call(["docker", "network", "inspect", "nabla.network"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) != 0: - subprocess.run(["docker", "network", "create", "--driver", "nat", "--subnet", "172.28.0.0/16", "--gateway", "172.28.5.1", "nabla.network"], check=True) # create nabla.network network if not present - - envFile = os.path.abspath(f"../.env/platform/{platform}/.env") - profiles = (lambda profiles: [item for profile in profiles for item in ["--profile", profile]])(args.profiles) - - compose = [ - "docker", "compose", - "-f", f"./compose.yml", - "--env-file", envFile - ] + profiles - - subprocess.run(compose + ["build"], check=True) - subprocess.run(compose + ["config"], check=True) - subprocess.run(compose + ["create", "--force-recreate"], check=True) - subprocess.run(compose + ["up"], check=True) - subprocess.run(compose + ["down"], check=True) - - except subprocess.CalledProcessError as e: - print(f"Subprocess failed with exit code {e.returncode}") - sys.exit(e.returncode) - - except Exception as e: - print(f"Unexpected error: {e}") - sys.exit(-1) - - -if __name__ == "__main__": - main() diff --git a/docker/scripts/__init__.py b/docker/scripts/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docker/scripts/nbl/ci/dev/.vscode/launch.json b/docker/scripts/nbl/ci/dev/.vscode/launch.json deleted file mode 100644 index e664be35d1..0000000000 --- a/docker/scripts/nbl/ci/dev/.vscode/launch.json +++ /dev/null @@ -1,117 +0,0 @@ -{ - "version": "0.2.0", - "configurations": [ - { - "name": 
"dev.nabla.base.x86_64.windows Container Remote Debug", - "type": "python", - "request": "attach", - "connect": { - "host": "172.28.5.2", - "port": 5678 - }, - "pathMappings": [ - { - "localRoot": "${workspaceFolder}", - "remoteRoot": "." - } - ], - "justMyCode": true - }, - { - "name": "dev.nabla.static.release.x86_64.windows Container Remote Debug", - "type": "python", - "request": "attach", - "connect": { - "host": "172.28.5.3", - "port": 5679 - }, - "pathMappings": [ - { - "localRoot": "${workspaceFolder}", - "remoteRoot": "." - } - ], - "justMyCode": true - }, - { - "name": "dev.nabla.static.relwithdebinfo.x86_64.windows Container Remote Debug", - "type": "python", - "request": "attach", - "connect": { - "host": "172.28.5.4", - "port": 5680 - }, - "pathMappings": [ - { - "localRoot": "${workspaceFolder}", - "remoteRoot": "." - } - ], - "justMyCode": true - }, - { - "name": "dev.nabla.static.debug.x86_64.windows Container Remote Debug", - "type": "python", - "request": "attach", - "connect": { - "host": "172.28.5.5", - "port": 5681 - }, - "pathMappings": [ - { - "localRoot": "${workspaceFolder}", - "remoteRoot": "." - } - ], - "justMyCode": true - }, - { - "name": "dev.nabla.dynamic.release.x86_64.windows Container Remote Debug", - "type": "python", - "request": "attach", - "connect": { - "host": "172.28.5.6", - "port": 5682 - }, - "pathMappings": [ - { - "localRoot": "${workspaceFolder}", - "remoteRoot": "." - } - ], - "justMyCode": true - }, - { - "name": "dev.nabla.dynamic.relwithdebinfo.x86_64.windows Container Remote Debug", - "type": "python", - "request": "attach", - "connect": { - "host": "172.28.5.7", - "port": 5683 - }, - "pathMappings": [ - { - "localRoot": "${workspaceFolder}", - "remoteRoot": "." - } - ], - "justMyCode": true - }, - { - "name": "dev.nabla.dynamic.debug.x86_64.windows Container Remote Debug", - "type": "python", - "request": "attach", - "connect": { - "host": "172.28.5.8", - "port": 5684 - }, - "pathMappings": [ - { - "localRoot": "${workspaceFolder}", - "remoteRoot": "." 
- } - ], - "justMyCode": true - } - ] -} \ No newline at end of file diff --git a/docker/scripts/nbl/ci/dev/__init__.py b/docker/scripts/nbl/ci/dev/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docker/scripts/nbl/ci/dev/build.py b/docker/scripts/nbl/ci/dev/build.py deleted file mode 100644 index b16f9d3a8b..0000000000 --- a/docker/scripts/nbl/ci/dev/build.py +++ /dev/null @@ -1,84 +0,0 @@ -import os, subprocess, sys, argparse -from .lib.kazoo import * - -def parseInputArguments(): - parser = argparse.ArgumentParser(description="Nabla CI Pipeline nbl.ci.dev.build Framework Module") - - parser.add_argument("--config", help="Target CMake configuration", type=str, default="Release") - parser.add_argument("--libType", help="Target library type", type=str, default="dynamic") - - args = parser.parse_args() - - return args - -def buildNabla(libType, config): - return subprocess.run(f"cmake --build --preset ci-build-{libType}-msvc-{config}", check=False) - - -def buildProject(libType, config, buildDirectory): - return subprocess.run(f"cmake --build \"{buildDirectory}\" --config {config}", check=False) - - -def getCPackBundleHash(buildDirectory, target, component = "ALL", relativeDirectory = "/"): - return f"{buildDirectory};{target};{component};{relativeDirectory};" - - -def main(): - try: - THIS_PROJECT_NABLA_DIRECTORY = os.environ.get('THIS_PROJECT_NABLA_DIRECTORY', '') - - if not THIS_PROJECT_NABLA_DIRECTORY: - raise ValueError("THIS_PROJECT_NABLA_DIRECTORY environment variables doesn't exist!") - - THIS_PROJECT_PLATFORM = os.environ.get('THIS_PROJECT_PLATFORM', '') - - if not THIS_PROJECT_PLATFORM: - raise ValueError("THIS_PROJECT_PLATFORM environment variables doesn't exist!") - - THIS_PROJECT_ARCH = os.environ.get('THIS_PROJECT_ARCH', '') - - if not THIS_PROJECT_ARCH: - raise ValueError("THIS_PROJECT_ARCH environment variables doesn't exist!") - - THIS_SERVICE_BINARY_PROJECT_PATH = os.environ.get('THIS_SERVICE_BINARY_PROJECT_PATH', '') - - os.chdir(THIS_PROJECT_NABLA_DIRECTORY) - - args = parseInputArguments() - - config = args.config - lowerCaseConfig = config.lower() - libType = args.libType - - topBuildDirectory = os.path.normpath(os.path.join(THIS_PROJECT_NABLA_DIRECTORY, f"build/.docker/{THIS_PROJECT_PLATFORM}/{THIS_PROJECT_ARCH}/{libType}/{lowerCaseConfig}")) - targetBuildDirectory = os.path.normpath(os.path.join(topBuildDirectory, THIS_SERVICE_BINARY_PROJECT_PATH)) - - if topBuildDirectory == targetBuildDirectory: - buildNabla(libType, lowerCaseConfig) - cpackBundleHash = getCPackBundleHash(topBuildDirectory, "Libraries") + getCPackBundleHash(topBuildDirectory, "Runtime") - else: - buildProject(libType, config, targetBuildDirectory) - cpackBundleHash += getCPackBundleHash(targetBuildDirectory, "Executables") + getCPackBundleHash(targetBuildDirectory, "Media") - - kazooConnector = KazooConnector(f"dev.nabla.kazoo.server.{libType}.{lowerCaseConfig}.x86_64.{THIS_PROJECT_PLATFORM}") # DNS record as compose service name - kazooConnector.connect() - - zNodePath = f"/CPACK_INSTALL_CMAKE_PROJECTS" - kazooConnector.createKazooAtomic(zNodePath) - kazooConnector.appendKazooAtomic(zNodePath, cpackBundleHash) - print(f"Atomic update performed on {zNodePath} zNode path") - print(f"cpackBundleHash = {cpackBundleHash}") - - kazooConnector.disconnect() - - except subprocess.CalledProcessError as e: - print(f"Subprocess failed with exit code {e.returncode}") - sys.exit(e.returncode) - - except Exception as e: - print(f"Unexpected error: {e}") - sys.exit(-1) - - -if 
__name__ == "__main__": - main() diff --git a/docker/scripts/nbl/ci/dev/cmake.py b/docker/scripts/nbl/ci/dev/cmake.py deleted file mode 100644 index b0ca2b86ed..0000000000 --- a/docker/scripts/nbl/ci/dev/cmake.py +++ /dev/null @@ -1,47 +0,0 @@ -import os, subprocess, sys, argparse - -def parseInputArguments(): - parser = argparse.ArgumentParser(description="Nabla CI Pipeline nbl.ci.dev.cmake Framework Module") - - parser.add_argument("--libType", help="Target library type", type=str, default="dynamic") - parser.add_argument("--config", help="Target library type", type=str, default="release") - - args = parser.parse_args() - - return args - -def configure(libType, config): - subprocess.run(f"cmake . --preset ci-configure-{libType}-msvc-{config}", check=True) - -def main(): - try: - THIS_PROJECT_NABLA_DIRECTORY = os.environ.get('THIS_PROJECT_NABLA_DIRECTORY', '') - - if not THIS_PROJECT_NABLA_DIRECTORY: - raise ValueError("THIS_PROJECT_NABLA_DIRECTORY environment variables doesn't exist!") - - THIS_PROJECT_ARCH = os.environ.get('THIS_PROJECT_ARCH', '') - - if not THIS_PROJECT_ARCH: - raise ValueError("THIS_PROJECT_ARCH environment variables doesn't exist!") - - os.chdir(THIS_PROJECT_NABLA_DIRECTORY) - - args = parseInputArguments() - - libType = args.libType - config = args.config - - configure(libType, config) - - except subprocess.CalledProcessError as e: - print(f"Subprocess failed with exit code {e.returncode}") - sys.exit(e.returncode) - - except Exception as e: - print(f"Unexpected error: {e}") - sys.exit(-1) - - -if __name__ == "__main__": - main() diff --git a/docker/scripts/nbl/ci/dev/cpack.py b/docker/scripts/nbl/ci/dev/cpack.py deleted file mode 100644 index 48a0adf790..0000000000 --- a/docker/scripts/nbl/ci/dev/cpack.py +++ /dev/null @@ -1,72 +0,0 @@ -import os, subprocess, argparse -from .lib.kazoo import * - -def parseInputArguments(): - parser = argparse.ArgumentParser(description="Nabla CI Pipeline nbl.ci.dev.cpack Framework Module") - - parser.add_argument("--config", help="Target CMake configuration", type=str, default="Release") - parser.add_argument("--libType", help="Target library type", type=str, default="dynamic") - - args = parser.parse_args() - - return args - -def cpack(libType, config, CPACK_INSTALL_CMAKE_PROJECTS, packageDirectory): - if not packageDirectory: - packageDirectory = f"./package/{config}/{libType}" - - return subprocess.run(f"cpack --preset ci-package-{libType}-msvc-{config} -B \"{packageDirectory}\" -D CPACK_INSTALL_CMAKE_PROJECTS=\"{CPACK_INSTALL_CMAKE_PROJECTS}\"", check=True) - - -def main(): - try: - THIS_PROJECT_NABLA_DIRECTORY = os.environ.get('THIS_PROJECT_NABLA_DIRECTORY', '') - - if not THIS_PROJECT_NABLA_DIRECTORY: - raise ValueError("THIS_PROJECT_NABLA_DIRECTORY environment variables doesn't exist!") - - THIS_PROJECT_PLATFORM = os.environ.get('THIS_PROJECT_PLATFORM', '') - - if not THIS_PROJECT_PLATFORM: - raise ValueError("THIS_PROJECT_PLATFORM environment variables doesn't exist!") - - THIS_PROJECT_ARCH = os.environ.get('THIS_PROJECT_ARCH', '') - - if not THIS_PROJECT_ARCH: - raise ValueError("THIS_PROJECT_ARCH environment variables doesn't exist!") - - os.chdir(THIS_PROJECT_NABLA_DIRECTORY) - - args = parseInputArguments() - - config = args.config - lowerCaseConfig = config.lower() - libType = args.libType - - kazooConnector = KazooConnector(f"dev.nabla.kazoo.server.{libType}.{lowerCaseConfig}.x86_64.{THIS_PROJECT_PLATFORM}") # DNS record as compose service name - kazooConnector.connect() - - zNodePath = 
f"/CPACK_INSTALL_CMAKE_PROJECTS" - cpackBundleHash = kazooConnector.getKazooAtomic(zNodePath) - print(f"Atomic read performed on {zNodePath} zNode path") - - kazooConnector.requestServerShutdown() - kazooConnector.disconnect() - - if cpackBundleHash: - print(f"CPACK_INSTALL_CMAKE_PROJECTS = {cpackBundleHash}") - cpack(libType, lowerCaseConfig, cpackBundleHash, f"{THIS_PROJECT_NABLA_DIRECTORY}/build/artifacts/{THIS_PROJECT_PLATFORM}/{THIS_PROJECT_ARCH}/{config}/{libType}") - else: - print("CPACK_INSTALL_CMAKE_PROJECTS is empty, skipping cpack...") - - except subprocess.CalledProcessError as e: - print(f"Subprocess failed with exit code {e.returncode}") - sys.exit(e.returncode) - - except Exception as e: - print(f"Unexpected error: {e}") - sys.exit(-1) - - -if __name__ == "__main__": - main() diff --git a/docker/scripts/nbl/ci/dev/lib/kazoo.py b/docker/scripts/nbl/ci/dev/lib/kazoo.py deleted file mode 100644 index 2e67b4edf4..0000000000 --- a/docker/scripts/nbl/ci/dev/lib/kazoo.py +++ /dev/null @@ -1,105 +0,0 @@ -import os, subprocess, sys, argparse, kazoo.exceptions, kazoo.client, socket - -def getLocalIPV4(): - try: - s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - s.connect(('8.8.8.8', 80)) - local_ipv4 = s.getsockname()[0] # Get the local IPv4 address - s.close() - - return local_ipv4 - except socket.error: - return None - - -def resolveServiceToIPv4(serviceName): - try: - ipv4Address = socket.gethostbyname(serviceName) - return ipv4Address - except socket.gaierror as e: - print(f"Error while resolving {serviceName} to an IPv4 address: {e}") - return None - - -class KazooConnector: - def __init__(self, dnsServiceName): - self.dnsServiceName = dnsServiceName - self.host = resolveServiceToIPv4(self.dnsServiceName) - self.zk = kazoo.client.KazooClient(hosts=self.dnsServiceName) - - def connect(self): - self.zk.start() - print(f"Connected to {self.dnsServiceName} kazoo host") - - def disconnect(self): - self.zk.stop() - self.zk.close() - print(f"Disconnected from {self.dnsServiceName} kazoo host") - - def requestServerShutdown(self): - self.createKazooAtomic("/sdRequest") - print(f"Requested shutdown of {self.dnsServiceName} kazoo host") - - def createKazooAtomic(self, zNodePath): - if not self.zk.exists(zNodePath): - self.zk.create(zNodePath, b"") - - def getKazooAtomic(self, zNodePath): - if self.zk.exists(zNodePath): - data, _ = self.zk.get(zNodePath) - return data.decode() - else: - return "" - - def appendKazooAtomic(self, zNodePath, data): - while True: - try: - currentData, stat = self.zk.get(zNodePath) - newData = currentData.decode() + data - self.zk.set(zNodePath, newData.encode(), version=stat.version) - break - except kazoo.exceptions.BadVersionException: - pass - - -def healthyCheck(host): - try: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - ipv4 = host - - if host == "localhost" or host == "127.0.0.1": - ipv4 = getLocalIPV4() - - s.settimeout(5) - s.connect((ipv4, 2181)) - - print(f"Connected to {ipv4} kazoo host") - - # TODO: find lib which does nice shutdown cross platform - sdProcess = subprocess.run("zkCli.cmd get /sdRequest", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - shutdown = not sdProcess.stderr.strip() - - if shutdown: - print("Requested shutdown...") - - try: - subprocess.run(f"shutdown /s /f", check=True) - except subprocess.CalledProcessError as e: - print(f"Could not shutdown container because of: {e.stderr}") - - return True - except (socket.error, socket.timeout): - print(f"Excpetion caught while trying to connect to 
kazoo host: \"{socket.error}\"") - return False - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Nabla CI Pipeline nbl.ci.dev.kazoo Framework Module") - - parser.add_argument("--host", help="Kazoo Server host", type=str, default="localhost") - - args = parser.parse_args() - - if healthyCheck(args.host): - sys.exit(0) # healthy - else: - sys.exit(1) # not healthy \ No newline at end of file diff --git a/docker/scripts/ncpfmp.bat b/docker/scripts/ncpfmp.bat deleted file mode 100644 index ce13fa7ebd..0000000000 --- a/docker/scripts/ncpfmp.bat +++ /dev/null @@ -1,13 +0,0 @@ -:: Nabla CI Pipeline Framework Module proxy - -@echo off -@setlocal - -:: Get the directory where this platform proxy script is located -set scriptDirectory=%~dp0 - -:: Change the current working directory to the cross-platform Python build script -cd /d "%scriptDirectory%" - -:: Execute implementation of the pipeline build script in Python -python -m %* \ No newline at end of file From dcc537e3e018e3e1bbfb6fec888ff2213edb1498 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Wed, 13 Nov 2024 16:30:53 -0300 Subject: [PATCH 182/358] More changes following Bloom PR review --- examples_tests | 2 +- include/nbl/builtin/hlsl/workgroup/fft.hlsl | 103 +++++++++++++------- 2 files changed, 70 insertions(+), 35 deletions(-) diff --git a/examples_tests b/examples_tests index 0df6008c92..e946590f68 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 0df6008c923712ed729b47cf7711c6301622fdb3 +Subproject commit e946590f686875e4bf0262b738e776fc668b58da diff --git a/include/nbl/builtin/hlsl/workgroup/fft.hlsl b/include/nbl/builtin/hlsl/workgroup/fft.hlsl index 7d43caa63a..44a0dfc1d7 100644 --- a/include/nbl/builtin/hlsl/workgroup/fft.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/fft.hlsl @@ -8,6 +8,7 @@ #include "nbl/builtin/hlsl/mpl.hlsl" #include "nbl/builtin/hlsl/memory_accessor.hlsl" #include "nbl/builtin/hlsl/bit.hlsl" +#include "nbl/builtin/hlsl/concepts.hlsl" // Caveats // - Sin and Cos in HLSL take 32-bit floats. 
Using this library with 64-bit floats works perfectly fine, but DXC will emit warnings @@ -90,10 +91,6 @@ namespace impl } } //namespace impl -// Get the required size (in number of uint32_t elements) of the workgroup shared memory array needed for the FFT -template -NBL_CONSTEXPR uint32_t SharedMemoryDWORDs = (sizeof(complex_t) / sizeof(uint32_t)) * WorkgroupSize; - // Util to unpack two values from the packed FFT X + iY - get outputs in the same input arguments, storing x to lo and y to hi template void unpack(NBL_REF_ARG(complex_t) lo, NBL_REF_ARG(complex_t) hi) @@ -103,7 +100,7 @@ void unpack(NBL_REF_ARG(complex_t) lo, NBL_REF_ARG(complex_t) hi lo = x; } -template +template struct FFTIndexingUtils { // This function maps the index `idx` in the output array of a Nabla FFT to the index `freqIdx` in the DFT such that `DFT[freqIdx] = NablaFFT[idx]` @@ -132,16 +129,36 @@ struct FFTIndexingUtils return getNablaIndex(getDFTMirrorIndex(getDFTIndex(idx))); } - NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocationLog2 = mpl::log2::value; - NBL_CONSTEXPR_STATIC_INLINE uint16_t FFTSizeLog2 = ElementsPerInvocationLog2 + mpl::log2::value; - NBL_CONSTEXPR_STATIC_INLINE uint32_t FFTSize = uint32_t(WorkgroupSize) * uint32_t(ElementsPerInvocation); + NBL_CONSTEXPR_STATIC_INLINE uint16_t FFTSizeLog2 = ElementsPerInvocationLog2 + WorkgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint32_t FFTSize = uint32_t(1) << FFTSizeLog2; }; } //namespace fft -// ----------------------------------- End Utils ----------------------------------------------- +// ----------------------------------- End Utils -------------------------------------------------------------- + +namespace fft +{ + +template 0 && _WorkgroupSizeLog2 >= 5) +struct ConstevalParameters +{ + using scalar_t = _Scalar; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocationLog2 = _ElementsPerInvocationLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint32_t TotalSize = uint32_t(1) << (ElementsPerInvocationLog2 + WorkgroupSizeLog2); -template + NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocation = uint16_t(1) << ElementsPerInvocationLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(1) << WorkgroupSizeLog2; + + // Required size (in number of uint32_t elements) of the workgroup shared memory array needed for the FFT + NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedMemoryDWORDs = (sizeof(complex_t) / sizeof(uint32_t)) << WorkgroupSizeLog2; +}; + +} //namespace fft + +template struct FFT; // For the FFT methods below, we assume: @@ -161,9 +178,11 @@ struct FFT; // * void workgroupExecutionAndMemoryBarrier(); // 2 items per invocation forward specialization -template -struct FFT<2,false, WorkgroupSize, Scalar, device_capabilities> +template +struct FFT, device_capabilities> { + using consteval_params_t = fft::ConstevalParameters<1, WorkgroupSizeLog2, Scalar>; + template static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t) lo, NBL_REF_ARG(complex_t) hi, uint32_t threadID, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor) { @@ -177,6 +196,8 @@ struct FFT<2,false, WorkgroupSize, Scalar, device_capabilities> template static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor) { + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = consteval_params_t::WorkgroupSize; + // Compute the indices only once const uint32_t threadID = uint32_t(SubgroupContiguousIndex()); const uint32_t loIx = threadID; @@ -222,12 +243,12 
@@ struct FFT<2,false, WorkgroupSize, Scalar, device_capabilities> } }; - - // 2 items per invocation inverse specialization -template -struct FFT<2,true, WorkgroupSize, Scalar, device_capabilities> +template +struct FFT, device_capabilities> { + using consteval_params_t = fft::ConstevalParameters<1, WorkgroupSizeLog2, Scalar>; + template static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t) lo, NBL_REF_ARG(complex_t) hi, uint32_t threadID, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor) { @@ -241,6 +262,8 @@ struct FFT<2,true, WorkgroupSize, Scalar, device_capabilities> template static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor) { + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = consteval_params_t::WorkgroupSize; + // Compute the indices only once const uint32_t threadID = uint32_t(SubgroupContiguousIndex()); const uint32_t loIx = threadID; @@ -291,17 +314,23 @@ struct FFT<2,true, WorkgroupSize, Scalar, device_capabilities> }; // Forward FFT -template -struct FFT +template +struct FFT, device_capabilities> { + using consteval_params_t = fft::ConstevalParameters; + using small_fft_consteval_params_t = fft::ConstevalParameters<1, WorkgroupSizeLog2, Scalar>; + template - static enable_if_t< (mpl::is_pot_v && K > 2), void > __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor) + static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor) { + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = consteval_params_t::WorkgroupSize; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocation = consteval_params_t::ElementsPerInvocation; + [unroll] - for (uint32_t stride = (K / 2) * WorkgroupSize; stride > WorkgroupSize; stride >>= 1) + for (uint32_t stride = (ElementsPerInvocation / 2) * WorkgroupSize; stride > WorkgroupSize; stride >>= 1) { [unroll] - for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (K / 2) * WorkgroupSize; virtualThreadID += WorkgroupSize) + for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (ElementsPerInvocation / 2) * WorkgroupSize; virtualThreadID += WorkgroupSize) { const uint32_t loIx = ((virtualThreadID & (~(stride - 1))) << 1) | (virtualThreadID & (stride - 1)); const uint32_t hiIx = loIx | stride; @@ -318,47 +347,53 @@ struct FFT accessor.memoryBarrier(); // no execution barrier just making sure writes propagate to accessor } - // do K/2 small workgroup FFTs + // do ElementsPerInvocation/2 small workgroup FFTs accessor_adaptors::Offset offsetAccessor; offsetAccessor.accessor = accessor; [unroll] - for (uint32_t k = 0; k < K; k += 2) + for (uint32_t k = 0; k < ElementsPerInvocation; k += 2) { if (k) sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); offsetAccessor.offset = WorkgroupSize*k; - FFT<2,false, WorkgroupSize, Scalar, device_capabilities>::template __call(offsetAccessor,sharedmemAccessor); + FFT::template __call(offsetAccessor,sharedmemAccessor); } accessor = offsetAccessor.accessor; } }; // Inverse FFT -template -struct FFT +template +struct FFT, device_capabilities> { + using consteval_params_t = fft::ConstevalParameters; + using small_fft_consteval_params_t = fft::ConstevalParameters<1, WorkgroupSizeLog2, Scalar>; + template - static enable_if_t< (mpl::is_pot_v && K > 2), void > __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor) + static void __call(NBL_REF_ARG(Accessor) accessor, 
NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor) { + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = consteval_params_t::WorkgroupSize; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocation = consteval_params_t::ElementsPerInvocation; + // do K/2 small workgroup FFTs accessor_adaptors::Offset offsetAccessor; offsetAccessor.accessor = accessor; [unroll] - for (uint32_t k = 0; k < K; k += 2) + for (uint32_t k = 0; k < ElementsPerInvocation; k += 2) { if (k) sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); offsetAccessor.offset = WorkgroupSize*k; - FFT<2,true, WorkgroupSize, Scalar, device_capabilities>::template __call(offsetAccessor,sharedmemAccessor); + FFT::template __call(offsetAccessor,sharedmemAccessor); } accessor = offsetAccessor.accessor; [unroll] - for (uint32_t stride = 2 * WorkgroupSize; stride < K * WorkgroupSize; stride <<= 1) + for (uint32_t stride = 2 * WorkgroupSize; stride < ElementsPerInvocation * WorkgroupSize; stride <<= 1) { accessor.memoryBarrier(); // no execution barrier just making sure writes propagate to accessor [unroll] - for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (K / 2) * WorkgroupSize; virtualThreadID += WorkgroupSize) + for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (ElementsPerInvocation / 2) * WorkgroupSize; virtualThreadID += WorkgroupSize) { const uint32_t loIx = ((virtualThreadID & (~(stride - 1))) << 1) | (virtualThreadID & (stride - 1)); const uint32_t hiIx = loIx | stride; @@ -370,11 +405,11 @@ struct FFT hlsl::fft::DIT::radix2(hlsl::fft::twiddle(virtualThreadID & (stride - 1), stride), lo,hi); // Divide by special factor at the end - if ( (K / 2) * WorkgroupSize == stride) + if ( (ElementsPerInvocation / 2) * WorkgroupSize == stride) { divides_assign< complex_t > divAss; - divAss(lo, K / 2); - divAss(hi, K / 2); + divAss(lo, ElementsPerInvocation / 2); + divAss(hi, ElementsPerInvocation / 2); } accessor.set(loIx, lo); From 253ffafed0de434ef3eaa8cd2af12dff0fe284d9 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Wed, 13 Nov 2024 20:40:40 +0100 Subject: [PATCH 183/358] Lets commit entire git history into an image and update incrementally - create `git-cache-updater` service & `git-cache/update-git-cache.cmd` updating dcr.devsh.eu/nabla/source/git-cache:latest with entire Nabla history locally (currently full master only, but I'm going to track all branches there) --- docker/git-cache/Dockerfile | 47 ++++++++++++++++++++ docker/git-cache/compose.yml | 7 +++ docker/git-cache/update-git-cache.cmd | 64 +++++++++++++++++++++++++++ 3 files changed, 118 insertions(+) create mode 100644 docker/git-cache/Dockerfile create mode 100644 docker/git-cache/compose.yml create mode 100644 docker/git-cache/update-git-cache.cmd diff --git a/docker/git-cache/Dockerfile b/docker/git-cache/Dockerfile new file mode 100644 index 0000000000..fca4e232ad --- /dev/null +++ b/docker/git-cache/Dockerfile @@ -0,0 +1,47 @@ +# escape=` + +ARG BASE_IMAGE=mcr.microsoft.com/windows/servercore:ltsc2022-amd64 + +FROM ${BASE_IMAGE} + +SHELL ["cmd", "/S", "/C"] + +RUN ` + # Install Chocolatey + ` + powershell -Command "Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))" + +RUN ` + # Install Git + ` + choco install -y git --version 2.43.0 + +RUN ` + # Install CMake + ` + curl -SL --output cmake.zip 
https://github.com/Kitware/CMake/releases/download/v3.31.0/cmake-3.31.0-windows-x86_64.zip ` + ` + && mkdir "./cmake" ` + ` + && tar -xf cmake.zip -C "./cmake" ` + ` + && del /q cmake.zip + +WORKDIR C:\gitcache + +RUN ` + # Post environment setup + ` + git config --system protocol.*.allow always ` + ` + && git config --system url."https://github.com/".insteadOf "git@github.com:" ` + ` + && git config --system --add safe.directory * ` + ` + && setx THIS_PROJECT_GIT_CACHE "C:\gitcache" ` + ` + && git init ` + ` + && git remote add origin https://github.com/Devsh-Graphics-Programming/Nabla.git + +ENTRYPOINT ["cmd.exe", "/K"] \ No newline at end of file diff --git a/docker/git-cache/compose.yml b/docker/git-cache/compose.yml new file mode 100644 index 0000000000..d4d89c7aa9 --- /dev/null +++ b/docker/git-cache/compose.yml @@ -0,0 +1,7 @@ +services: + git-cache-updater: + build: + context: . + dockerfile: Dockerfile + image: dcr.devsh.eu/nabla/source/git-cache:latest + container_name: git.cache.update \ No newline at end of file diff --git a/docker/git-cache/update-git-cache.cmd b/docker/git-cache/update-git-cache.cmd new file mode 100644 index 0000000000..19f30faa39 --- /dev/null +++ b/docker/git-cache/update-git-cache.cmd @@ -0,0 +1,64 @@ +@echo off +REM Set cache image reference +set IMAGE_NAME=dcr.devsh.eu/nabla/source/git-cache:latest + +REM Start the git-cache-updater container in detached mode and capture the container ID +set CONTAINER_ID= +for /f "delims=" %%i in ('docker-compose run --remove-orphans -d git-cache-updater') do set CONTAINER_ID=%%i + +REM Check if the container started successfully +if "%CONTAINER_ID%"=="" ( + echo Failed to start the git-cache-updater container. + exit /b 1 +) + +echo Started container with ID %CONTAINER_ID% + +REM Fetch master commits +docker exec -i -t %CONTAINER_ID% git fetch origin master +if %errorlevel% neq 0 ( + echo "Error: git fetch failed" + docker stop %CONTAINER_ID% + exit /b %errorlevel% +) + +REM Checkout master. TODO: since it happens at runtime I could loop over /remotes' CURRENT branches and track all history +docker exec -i -t %CONTAINER_ID% git checkout master -f +if %errorlevel% neq 0 ( + echo "Error: git checkout failed" + docker stop %CONTAINER_ID% + exit /b %errorlevel% +) + +REM Update & checkout submodules with CMake +docker exec -i -t %CONTAINER_ID% "C:\cmake\cmake-3.31.0-windows-x86_64\bin\cmake" -P cmake\submodules\update.cmake +if %errorlevel% neq 0 ( + echo "Error: CMake submodule update failed" + docker stop %CONTAINER_ID% + exit /b %errorlevel% +) + +REM Stop the container before committing +docker stop %CONTAINER_ID% +if %errorlevel% neq 0 ( + echo "Error: failed to stop container" + exit /b %errorlevel% +) + +REM Commit the updated container as a new image +docker commit %CONTAINER_ID% %IMAGE_NAME% +if %errorlevel% neq 0 ( + echo "Error: failed to commit the container" + exit /b %errorlevel% +) + +echo Git cache updated and committed as %IMAGE_NAME%. + +REM Remove the update container +docker rm %CONTAINER_ID% +if %errorlevel% neq 0 ( + echo "Error: failed to remove the update container" + exit /b %errorlevel% +) + +echo Removed %CONTAINER_ID% update container. 
\ No newline at end of file From cfc8dc9c9a386e65f78bd3f2998040fca5a5992c Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Wed, 13 Nov 2024 15:06:36 -0800 Subject: [PATCH 184/358] Saving work --- examples_tests | 2 +- .../hlsl/matrix_utils/transformation_matrix_utils.hlsl | 7 +------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/examples_tests b/examples_tests index 2f26a62337..7cb41f7f6b 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 2f26a6233740f0c7cd8128373a14ccdb2a9bb0ab +Subproject commit 7cb41f7f6b6eec116305dffe748d70d9ca07a1c3 diff --git a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl index add1471bb5..d3d7d7fa04 100644 --- a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl +++ b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl @@ -12,12 +12,6 @@ namespace nbl namespace hlsl { -// TODO: if `IdentityFloat32_t3x4` and `IdentityFloat32_t3x4` constexprs are ok, then I can expand them into templated struct, not doing it untill the concept is approved -//template -//struct IdentityMatrix -//{ -// -//}; NBL_CONSTEXPR hlsl::float32_t3x4 IdentityFloat32_t3x4 = hlsl::float32_t3x4(hlsl::float32_t4(1, 0, 0, 0), hlsl::float32_t4(0, 0, 1, 0), hlsl::float32_t4(0, 0, 1, 0)); NBL_CONSTEXPR hlsl::float32_t4x4 IdentityFloat32_t4x4 = @@ -75,6 +69,7 @@ namespace transformation_matrix_utils_impl } } +//! returs adjugate of the cofactor (sub 3x3) matrix template inline matrix getSub3x3TransposeCofactors(const matrix& mat) { From 93e983e031e1cb2dc1528b46297927a1623ff5b3 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 14 Nov 2024 16:00:02 +0700 Subject: [PATCH 185/358] fix mapping index type with vk --- examples_tests | 2 +- src/nbl/video/CVulkanAccelerationStructure.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples_tests b/examples_tests index e1992f9dc6..ec7cfa853a 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit e1992f9dc6a5fe2a5331019a2a207e4fb5452d65 +Subproject commit ec7cfa853a06beaad504e4c31ee32c79a6160172 diff --git a/src/nbl/video/CVulkanAccelerationStructure.h b/src/nbl/video/CVulkanAccelerationStructure.h index 69df8b6e3b..c524199e53 100644 --- a/src/nbl/video/CVulkanAccelerationStructure.h +++ b/src/nbl/video/CVulkanAccelerationStructure.h @@ -128,7 +128,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles(triangles.vertexData[0]); outBase.geometry.triangles.vertexStride = triangles.vertexStride; outBase.geometry.triangles.maxVertex = triangles.maxVertex; - outBase.geometry.triangles.indexType = static_cast(triangles.indexType); + outBase.geometry.triangles.indexType = (triangles.indexType == asset::E_INDEX_TYPE::EIT_UNKNOWN) ? VK_INDEX_TYPE_NONE_KHR : static_cast(triangles.indexType); outBase.geometry.triangles.indexData = QueryOnly ? NullAddress:getVkDeviceOrHostAddress(triangles.indexData); // except that the hostAddress member of VkAccelerationStructureGeometryTrianglesDataKHR::transformData will be examined to check if it is NULL. 
if (!triangles.hasTransform()) From 8d044aa91d7b59cd9deb5f6c2030479586aa0454 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Thu, 14 Nov 2024 12:26:08 +0100 Subject: [PATCH 186/358] make the git cache work, put all into single compose file with small update script --- docker/Dockerfile | 3 +++ docker/compose.yml | 15 ++++++++++----- docker/git-cache/Dockerfile | 2 +- docker/git-cache/compose.yml | 7 ------- .../update-git-cache.cmd => update.cmd} | 16 ++++++++++++++++ 5 files changed, 30 insertions(+), 13 deletions(-) delete mode 100644 docker/git-cache/compose.yml rename docker/{git-cache/update-git-cache.cmd => update.cmd} (81%) diff --git a/docker/Dockerfile b/docker/Dockerfile index a0427ccfeb..cd85b933c3 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -113,4 +113,7 @@ RUN ` && setx NBL_CI_MODE "ON" WORKDIR ${THIS_PROJECT_NABLA_DIRECTORY} + +COPY --from=dcr.devsh.eu/nabla/source/git-cache:latest /gitcache/.git ./.git + ENTRYPOINT ["cmd.exe", "/K"] \ No newline at end of file diff --git a/docker/compose.yml b/docker/compose.yml index 04c1013081..b7bbf59f37 100644 --- a/docker/compose.yml +++ b/docker/compose.yml @@ -1,4 +1,12 @@ services: + git-cache-updater: + build: + context: ./git-cache + dockerfile: Dockerfile + image: dcr.devsh.eu/nabla/source/git-cache:latest + container_name: git.cache.update + networks: + docker_default: nabla: build: context: . @@ -10,8 +18,6 @@ services: environment: - THIS_PROJECT_WORKING_DIRECTORY=${THIS_PROJECT_WORKING_DIRECTORY} - THIS_PROJECT_NABLA_DIRECTORY=${THIS_PROJECT_NABLA_DIRECTORY} - volumes: - - nabla-cache-git:${THIS_PROJECT_NABLA_DIRECTORY}/.git networks: docker_default: deploy: @@ -19,9 +25,8 @@ services: limits: cpus: '6' memory: 12G - -volumes: - nabla-cache-git: + depends_on: + - git-cache-updater networks: docker_default: diff --git a/docker/git-cache/Dockerfile b/docker/git-cache/Dockerfile index fca4e232ad..6704869b2a 100644 --- a/docker/git-cache/Dockerfile +++ b/docker/git-cache/Dockerfile @@ -42,6 +42,6 @@ RUN ` ` && git init ` ` - && git remote add origin https://github.com/Devsh-Graphics-Programming/Nabla.git + && git remote add origin https://github.com/Devsh-Graphics-Programming/Nabla.git ENTRYPOINT ["cmd.exe", "/K"] \ No newline at end of file diff --git a/docker/git-cache/compose.yml b/docker/git-cache/compose.yml deleted file mode 100644 index d4d89c7aa9..0000000000 --- a/docker/git-cache/compose.yml +++ /dev/null @@ -1,7 +0,0 @@ -services: - git-cache-updater: - build: - context: . 
- dockerfile: Dockerfile - image: dcr.devsh.eu/nabla/source/git-cache:latest - container_name: git.cache.update \ No newline at end of file diff --git a/docker/git-cache/update-git-cache.cmd b/docker/update.cmd similarity index 81% rename from docker/git-cache/update-git-cache.cmd rename to docker/update.cmd index 19f30faa39..6d08e0455b 100644 --- a/docker/git-cache/update-git-cache.cmd +++ b/docker/update.cmd @@ -38,6 +38,22 @@ if %errorlevel% neq 0 ( exit /b %errorlevel% ) +docker exec -i -t %CONTAINER_ID% cmd /C "for /d %%i in (*) do if /i not %%i==.git rmdir /s /q %%i" + +if %errorlevel% neq 0 ( + echo "Error: failed to clean up files" + docker stop %CONTAINER_ID% + exit /b %errorlevel% +) + +docker exec -i -t %CONTAINER_ID% cmd /C "for %%i in (*) do if /i not %%i==.git del /q %%i" + +if %errorlevel% neq 0 ( + echo "Error: failed to clean up files" + docker stop %CONTAINER_ID% + exit /b %errorlevel% +) + REM Stop the container before committing docker stop %CONTAINER_ID% if %errorlevel% neq 0 ( From a06c270e693327491c4acce23b905a207181d9ec Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Wed, 13 Nov 2024 23:53:43 +0330 Subject: [PATCH 187/358] ICPUBuffer v2.0 - Replaced all wrappers usec-ases with new constructors and polymorphic memory resources - Allocation failures can now be properly checked - Don't re-allocate for shader cache writes Signed-off-by: Ali Cheraghi --- include/nbl/asset/CVectorCPUBuffer.h | 34 ---- include/nbl/asset/ICPUBuffer.h | 187 +++++++----------- include/nbl/asset/ICPUShader.h | 2 +- .../filters/CFlattenRegionsImageFilter.h | 2 +- .../filters/dithering/CPrecomputedDither.h | 2 +- .../interchange/IImageAssetHandlerBase.h | 2 +- include/nbl/asset/utils/CCPUMeshPackerV1.h | 8 +- include/nbl/asset/utils/CCPUMeshPackerV2.h | 6 +- include/nbl/asset/utils/CDirQuantCacheBase.h | 4 +- include/nbl/asset/utils/IMeshManipulator.h | 6 +- include/nbl/asset/utils/IMeshPacker.h | 2 +- include/nbl/asset/utils/IShaderCompiler.h | 3 +- .../nbl/core/alloc/resource_owning_vector.h | 50 +++++ include/nbl/ext/ScreenShot/ScreenShot.h | 2 +- .../nbl/scene/ICullingLoDSelectionSystem.h | 2 +- include/nbl/scene/ISkinInstanceCacheManager.h | 2 +- include/nbl/scene/ITransformTreeManager.h | 2 +- src/nbl/asset/IAssetManager.cpp | 4 +- src/nbl/asset/ICPUImage.cpp | 2 +- src/nbl/asset/interchange/CBufferLoaderBIN.h | 2 +- src/nbl/asset/interchange/CGLILoader.cpp | 2 +- src/nbl/asset/interchange/CGLTFLoader.cpp | 26 +-- .../CGraphicsPipelineLoaderMTL.cpp | 4 +- src/nbl/asset/interchange/CHLSLLoader.cpp | 2 +- src/nbl/asset/interchange/CImageLoaderJPG.cpp | 2 +- .../asset/interchange/CImageLoaderOpenEXR.cpp | 2 +- src/nbl/asset/interchange/CImageLoaderPNG.cpp | 2 +- src/nbl/asset/interchange/CImageLoaderTGA.cpp | 8 +- src/nbl/asset/interchange/CImageWriterJPG.cpp | 2 +- src/nbl/asset/interchange/CImageWriterTGA.cpp | 2 +- .../asset/interchange/COBJMeshFileLoader.cpp | 4 +- .../asset/interchange/CPLYMeshFileLoader.cpp | 10 +- src/nbl/asset/interchange/CSPVLoader.cpp | 2 +- .../asset/interchange/CSTLMeshFileLoader.cpp | 2 +- src/nbl/asset/utils/CDerivativeMapCreator.cpp | 4 +- src/nbl/asset/utils/CGLSLCompiler.cpp | 2 +- src/nbl/asset/utils/CGeometryCreator.cpp | 30 +-- src/nbl/asset/utils/CHLSLCompiler.cpp | 2 +- src/nbl/asset/utils/CMeshManipulator.cpp | 20 +- src/nbl/asset/utils/CMeshManipulator.h | 8 +- src/nbl/asset/utils/ISPIRVOptimizer.cpp | 2 +- src/nbl/asset/utils/IShaderCompiler.cpp | 21 +- .../DepthPyramidGenerator.cpp | 2 +- src/nbl/ext/FFT/FFT.cpp | 2 +- 
src/nbl/ext/LumaMeter/CLumaMeter.cpp | 2 +- src/nbl/ext/MitsubaLoader/CMitsubaLoader.cpp | 14 +- .../ext/MitsubaLoader/CSerializedLoader.cpp | 10 +- src/nbl/ext/RadixSort/RadixSort.cpp | 4 +- src/nbl/ext/TextRendering/TextRendering.cpp | 2 +- src/nbl/ext/ToneMapper/CToneMapper.cpp | 2 +- src/nbl/video/CCUDAHandler.cpp | 2 +- .../video/utilities/CPropertyPoolHandler.cpp | 2 +- src/nbl/video/utilities/CScanner.cpp | 2 +- .../video/utilities/ImageRegionIterator.cpp | 4 +- 54 files changed, 251 insertions(+), 280 deletions(-) delete mode 100644 include/nbl/asset/CVectorCPUBuffer.h create mode 100644 include/nbl/core/alloc/resource_owning_vector.h diff --git a/include/nbl/asset/CVectorCPUBuffer.h b/include/nbl/asset/CVectorCPUBuffer.h deleted file mode 100644 index 494a279d4d..0000000000 --- a/include/nbl/asset/CVectorCPUBuffer.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef _NBL_ASSET_C_VECTOR_CPU_BUFFER_H_INCLUDED_ -#define _NBL_ASSET_C_VECTOR_CPU_BUFFER_H_INCLUDED_ - -#include "nbl/asset/ICPUBuffer.h" - -namespace nbl::asset { - -template -class CVectorCPUBuffer final : public ICPUBuffer -{ -public: - inline CVectorCPUBuffer(core::vector&& _vec) : ICPUBuffer(_vec.size(), _vec.data()), m_vec(std::move(_vec)) - { - } - -protected: - virtual inline ~CVectorCPUBuffer() - { - freeData(); - } - inline void freeData() override - { - m_vec.clear(); - ICPUBuffer::data = nullptr; - } - - core::vector m_vec; -}; - -} - - - -#endif \ No newline at end of file diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h index e992fc8683..84af10769d 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -6,8 +6,6 @@ #include -#include "nbl/core/alloc/null_allocator.h" - #include "nbl/asset/IBuffer.h" #include "nbl/asset/IAsset.h" #include "nbl/asset/IPreHashed.h" @@ -15,36 +13,56 @@ namespace nbl::asset { +struct SICPUBufferCreationParams +{ + size_t size; + void* data = nullptr; + size_t alignment = _NBL_SIMD_ALIGNMENT; + std::pmr::memory_resource* memoryResource = nullptr; +}; + //! One of CPU class-object representing an Asset +//! TODO: remove, alloc can fail, should be a static create method instead! /** One of Assets used for storage of large arrays, so that storage can be decoupled from other objects such as meshbuffers, images, animations and shader source/bytecode. @see IAsset */ -class ICPUBuffer : public asset::IBuffer, public IPreHashed +class ICPUBuffer final : public asset::IBuffer, public IPreHashed { - protected: - //! Non-allocating constructor for CCustormAllocatorCPUBuffer derivative - ICPUBuffer(size_t sizeInBytes, void* dat) : asset::IBuffer({ dat ? sizeInBytes : 0,EUF_TRANSFER_DST_BIT }), data(dat) {} - public: - //! Constructor. TODO: remove, alloc can fail, should be a static create method instead! - /** @param sizeInBytes Size in bytes. If `dat` argument is present, it denotes size of data pointed by `dat`, otherwise - size of data to be allocated. - */ - ICPUBuffer(size_t sizeInBytes) : asset::IBuffer({0,EUF_TRANSFER_DST_BIT}) - { - data = _NBL_ALIGNED_MALLOC(sizeInBytes,_NBL_SIMD_ALIGNMENT); - if (!data) // FIXME: cannot fail like that, need factory `create` methods - return; + ICPUBuffer(size_t size, void* data, std::pmr::memory_resource* memoryResource, size_t alignment, bool adopt_memory) : + asset::IBuffer({ size, EUF_TRANSFER_DST_BIT }), m_data(data), m_mem_resource(memoryResource), m_alignment(alignment), m_adopt_memory(adopt_memory) {} - m_creationParams.size = sizeInBytes; + //! 
allocates uninitialized memory, copies `data` into allocation if `!data` not nullptr + core::smart_refctd_ptr static create(const SICPUBufferCreationParams& params) { + std::pmr::memory_resource* memoryResource = params.memoryResource; + if (!params.memoryResource) + memoryResource = std::pmr::get_default_resource(); + + auto data = memoryResource->allocate(params.size, params.alignment); + if (!data) + return nullptr; + + if (params.data) + memcpy(data, params.data, params.size); + + return core::make_smart_refctd_ptr(params.size, data, memoryResource, params.alignment, false); + } + + //! does not allocate memory, adopts the `data` pointer, no copies done + core::smart_refctd_ptr static create(const SICPUBufferCreationParams& params, core::adopt_memory_t) { + std::pmr::memory_resource* memoryResource; + if (!params.memoryResource) + memoryResource = std::pmr::get_default_resource(); + return core::make_smart_refctd_ptr(params.size, params.data, memoryResource, params.alignment, true); } core::smart_refctd_ptr clone(uint32_t = ~0u) const override final { - auto cp = core::make_smart_refctd_ptr(m_creationParams.size); - memcpy(cp->getPointer(), data, m_creationParams.size); + auto cp = create({ m_creationParams.size, m_data }); + memcpy(cp->getPointer(), m_data, m_creationParams.size); cp->setContentHash(getContentHash()); return cp; } @@ -52,27 +70,27 @@ class ICPUBuffer : public asset::IBuffer, public IPreHashed constexpr static inline auto AssetType = ET_BUFFER; inline IAsset::E_TYPE getAssetType() const override final { return AssetType; } - inline size_t getDependantCount() const override {return 0;} + inline size_t getDependantCount() const override { return 0; } // inline core::blake3_hash_t computeContentHash() const override { - core::blake3_hasher hasher; - if (data) - hasher.update(data,m_creationParams.size); - return static_cast(hasher); + core::blake3_hasher hasher; + if (m_data) + hasher.update(m_data, m_creationParams.size); + return static_cast(hasher); } - inline bool missingContent() const override {return !data;} + inline bool missingContent() const override { return !m_data; } //! Returns pointer to data. - const void* getPointer() const {return data;} - void* getPointer() - { + const void* getPointer() const { return m_data; } + void* getPointer() + { assert(isMutable()); - return data; + return m_data; } - + inline core::bitflag getUsageFlags() const { return m_creationParams.usage; @@ -90,93 +108,30 @@ class ICPUBuffer : public asset::IBuffer, public IPreHashed return true; } - protected: - inline IAsset* getDependant_impl(const size_t ix) override - { - return nullptr; - } - - inline void discardContent_impl() override - { - return freeData(); - } - - // REMEMBER TO CALL FROM DTOR! - // TODO: idea, make the `ICPUBuffer` an ADT, and use the default allocator CCPUBuffer instead for consistency - // TODO: idea make a macro for overriding all `delete` operators of a class to enforce a finalizer that runs in reverse order to destructors (to allow polymorphic cleanups) - virtual inline void freeData() - { - if (data) - _NBL_ALIGNED_FREE(data); - data = nullptr; - m_creationParams.size = 0ull; - } - - void* data; -}; - - -template< - typename Allocator = _NBL_DEFAULT_ALLOCATOR_METATYPE, - bool = std::is_same >::value -> -class CCustomAllocatorCPUBuffer; - -using CDummyCPUBuffer = CCustomAllocatorCPUBuffer, true>; - -//! 
Specialization of ICPUBuffer capable of taking custom allocators -/* - Take a look that with this usage you have to specify custom alloctor - passing an object type for allocation and a pointer to allocated - data for it's storage by ICPUBuffer. - - So the need for the class existence is for common following tricks - among others creating an - \bICPUBuffer\b over an already existing \bvoid*\b array without any \imemcpy\i or \itaking over the memory ownership\i. - You can use it with a \bnull_allocator\b that adopts memory (it is a bit counter intuitive because \badopt = take\b ownership, - but a \inull allocator\i doesn't do anything, even free the memory, so you're all good). - */ - -template -class CCustomAllocatorCPUBuffer : public ICPUBuffer -{ - static_assert(sizeof(typename Allocator::value_type) == 1u, "Allocator::value_type must be of size 1"); - protected: - Allocator m_allocator; - - virtual ~CCustomAllocatorCPUBuffer() final - { - freeData(); - } - inline void freeData() override - { - if (ICPUBuffer::data) - m_allocator.deallocate(reinterpret_cast(ICPUBuffer::data), ICPUBuffer::m_creationParams.size); - ICPUBuffer::data = nullptr; // so that ICPUBuffer won't try deallocating - } - - public: - CCustomAllocatorCPUBuffer(size_t sizeInBytes, void* dat, core::adopt_memory_t, Allocator&& alctr = Allocator()) : ICPUBuffer(sizeInBytes,dat), m_allocator(std::move(alctr)) - { - } -}; - -template -class CCustomAllocatorCPUBuffer : public CCustomAllocatorCPUBuffer -{ - using Base = CCustomAllocatorCPUBuffer; - - protected: - virtual ~CCustomAllocatorCPUBuffer() = default; - inline void freeData() override {} - - public: - using Base::Base; - - // TODO: remove, alloc can fail, should be a static create method instead! - CCustomAllocatorCPUBuffer(size_t sizeInBytes, const void* dat, Allocator&& alctr = Allocator()) : Base(sizeInBytes, alctr.allocate(sizeInBytes), core::adopt_memory, std::move(alctr)) - { - memcpy(Base::data,dat,sizeInBytes); - } +protected: + inline IAsset* getDependant_impl(const size_t ix) override + { + return nullptr; + } + + inline void discardContent_impl() override + { + return freeData(); + } + + // REMEMBER TO CALL FROM DTOR! 
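As a quick illustration of the factory interface this patch introduces (a minimal sketch, not part of the diff: `exampleUsage`, `srcData` and `srcSize` are placeholder names, and the include path is the one shown above):

#include "nbl/asset/ICPUBuffer.h"
using namespace nbl;

void exampleUsage(void* srcData, size_t srcSize)
{
    // Allocate 256 uninitialized bytes; an allocation failure is now observable as a null return.
    auto fresh = asset::ICPUBuffer::create({ 256ull });
    if (!fresh)
        return;

    // Allocate and copy `srcSize` bytes from `srcData` (data pointer given, no adopt tag).
    auto copied = asset::ICPUBuffer::create({ srcSize, srcData });

    // Adopt `srcData` without copying; per the new freeData() above, adopted memory is never deallocated by the buffer.
    auto adopted = asset::ICPUBuffer::create({ srcSize, srcData }, core::adopt_memory);
}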
+ virtual inline void freeData() + { + if (!m_adopt_memory && m_data) + m_mem_resource->deallocate(m_data, m_creationParams.size, m_alignment); + m_data = nullptr; + m_creationParams.size = 0ull; + } + + void* m_data; + std::pmr::memory_resource* m_mem_resource; + size_t m_alignment; + bool m_adopt_memory; }; } // end namespace nbl::asset diff --git a/include/nbl/asset/ICPUShader.h b/include/nbl/asset/ICPUShader.h index 86db467d3d..ae8758d44f 100644 --- a/include/nbl/asset/ICPUShader.h +++ b/include/nbl/asset/ICPUShader.h @@ -33,7 +33,7 @@ class ICPUShader : public IAsset, public IShader : IShader(stage, std::move(filepathHint)), m_code(std::move(code)), m_contentType(contentType) {} ICPUShader(const char* code, const E_SHADER_STAGE stage, const E_CONTENT_TYPE contentType, std::string&& filepathHint) - : ICPUShader(core::make_smart_refctd_ptr(strlen(code) + 1u), stage, contentType, std::move(filepathHint)) + : ICPUShader(ICPUBuffer::create({ strlen(code) + 1u }), stage, contentType, std::move(filepathHint)) { assert(contentType != E_CONTENT_TYPE::ECT_SPIRV); // because using strlen needs `code` to be null-terminated memcpy(m_code->getPointer(), code, m_code->getSize()); diff --git a/include/nbl/asset/filters/CFlattenRegionsImageFilter.h b/include/nbl/asset/filters/CFlattenRegionsImageFilter.h index 3bc22f94c0..0eeba246e2 100644 --- a/include/nbl/asset/filters/CFlattenRegionsImageFilter.h +++ b/include/nbl/asset/filters/CFlattenRegionsImageFilter.h @@ -89,7 +89,7 @@ class CFlattenRegionsImageFilter : public CImageFilter(bufferSize); + auto buffer = ICPUBuffer::create({ bufferSize }); state->outImage->setBufferAndRegions(std::move(buffer),std::move(regions)); }; diff --git a/include/nbl/asset/filters/dithering/CPrecomputedDither.h b/include/nbl/asset/filters/dithering/CPrecomputedDither.h index 33dd20a190..eb1c4627ca 100644 --- a/include/nbl/asset/filters/dithering/CPrecomputedDither.h +++ b/include/nbl/asset/filters/dithering/CPrecomputedDither.h @@ -58,7 +58,7 @@ namespace nbl const size_t newDecodeBufferSize = extent.x * extent.y * extent.z * creationParams.arrayLayers * decodeTexelByteSize; const core::vector3du32_SIMD decodeBufferByteStrides = TexelBlockInfo(decodeFormat).convert3DTexelStridesTo1DByteStrides(core::vector3du32_SIMD(extent.x, extent.y, extent.z)); - auto decodeFlattenBuffer = core::make_smart_refctd_ptr(newDecodeBufferSize); + auto decodeFlattenBuffer = ICPUBuffer::create({ newDecodeBufferSize }); decodeFlattenBuffer->setContentHash(IPreHashed::INVALID_HASH); auto* inData = reinterpret_cast(flattenDitheringImage->getBuffer()->getPointer()); diff --git a/include/nbl/asset/interchange/IImageAssetHandlerBase.h b/include/nbl/asset/interchange/IImageAssetHandlerBase.h index 8a0c0f58aa..b39ae2a8be 100644 --- a/include/nbl/asset/interchange/IImageAssetHandlerBase.h +++ b/include/nbl/asset/interchange/IImageAssetHandlerBase.h @@ -110,7 +110,7 @@ class IImageAssetHandlerBase : public virtual core::IReferenceCounted bufferSize += memsize.getIntegerApprox(); } - auto texelBuffer = core::make_smart_refctd_ptr(bufferSize); + auto texelBuffer = ICPUBuffer::create({ bufferSize }); newImage->setBufferAndRegions(std::move(texelBuffer), newRegions); newImage->setContentHash(IPreHashed::INVALID_HASH); } diff --git a/include/nbl/asset/utils/CCPUMeshPackerV1.h b/include/nbl/asset/utils/CCPUMeshPackerV1.h index 23b2da563a..45d84f81b3 100644 --- a/include/nbl/asset/utils/CCPUMeshPackerV1.h +++ b/include/nbl/asset/utils/CCPUMeshPackerV1.h @@ -296,11 +296,11 @@ void 
CCPUMeshPackerV1::instantiateDataStorage() const size_t vtxBuffSupportedByteSize = base_t::alctrTraits::get_total_size(base_t::m_vtxBuffAlctr); const size_t perInsBuffSupportedByteSize = base_t::alctrTraits::get_total_size(base_t::m_vtxBuffAlctr); - m_output.MDIDataBuffer = core::make_smart_refctd_ptr(MDIDataBuffSupportedByteSize); - m_output.indexBuffer.buffer = core::make_smart_refctd_ptr(idxBuffSupportedByteSize); + m_output.MDIDataBuffer = ICPUBuffer::create({ MDIDataBuffSupportedByteSize }); + m_output.indexBuffer.buffer = ICPUBuffer::create({ idxBuffSupportedByteSize }); - core::smart_refctd_ptr unifiedVtxBuff = core::make_smart_refctd_ptr(vtxBuffSupportedByteSize); - core::smart_refctd_ptr unifiedInsBuff = core::make_smart_refctd_ptr(perInsBuffSupportedByteSize); + core::smart_refctd_ptr unifiedVtxBuff = ICPUBuffer::create({ vtxBuffSupportedByteSize }); + core::smart_refctd_ptr unifiedInsBuff = ICPUBuffer::create({ perInsBuffSupportedByteSize }); //divide unified vtx buffers //proportions: sizeOfAttr1 : sizeOfAttr2 : ... : sizeOfAttrN diff --git a/include/nbl/asset/utils/CCPUMeshPackerV2.h b/include/nbl/asset/utils/CCPUMeshPackerV2.h index 83308ab82b..a41d65b067 100644 --- a/include/nbl/asset/utils/CCPUMeshPackerV2.h +++ b/include/nbl/asset/utils/CCPUMeshPackerV2.h @@ -61,9 +61,9 @@ void CCPUMeshPackerV2::instantiateDataStorage() const uint32_t idxBuffByteSize = base_t::m_idxBuffAlctr.get_total_size() * sizeof(uint16_t); const uint32_t vtxBuffByteSize = base_t::m_vtxBuffAlctr.get_total_size(); - base_t::m_packerDataStore.MDIDataBuffer = core::make_smart_refctd_ptr(MDIDataBuffByteSize); - base_t::m_packerDataStore.indexBuffer = core::make_smart_refctd_ptr(idxBuffByteSize); - base_t::m_packerDataStore.vertexBuffer = core::make_smart_refctd_ptr(vtxBuffByteSize); + base_t::m_packerDataStore.MDIDataBuffer = ICPUBuffer::create({ MDIDataBuffByteSize }); + base_t::m_packerDataStore.indexBuffer = ICPUBuffer::create({ idxBuffByteSize }); + base_t::m_packerDataStore.vertexBuffer = ICPUBuffer::create({ vtxBuffByteSize }); } /* diff --git a/include/nbl/asset/utils/CDirQuantCacheBase.h b/include/nbl/asset/utils/CDirQuantCacheBase.h index 4d82d920c4..4057e3e4d9 100644 --- a/include/nbl/asset/utils/CDirQuantCacheBase.h +++ b/include/nbl/asset/utils/CDirQuantCacheBase.h @@ -297,7 +297,7 @@ class CDirQuantCacheBase : public impl::CDirQuantCacheBase if (!file) return false; - auto buffer = core::make_smart_refctd_ptr(file->getSize()); + auto buffer = asset::ICPUBuffer::create({ file->getSize() }); system::IFile::success_t succ; file->read(succ, buffer->getPointer(), 0, file->getSize()); @@ -346,7 +346,7 @@ class CDirQuantCacheBase : public impl::CDirQuantCacheBase asset::SBufferRange bufferRange; bufferRange.offset = 0; bufferRange.size = getSerializedCacheSizeInBytes(); - bufferRange.buffer = core::make_smart_refctd_ptr(bufferRange.size); + bufferRange.buffer = asset::ICPUBuffer::create({ bufferRange.size }); saveCacheToBuffer(bufferRange); diff --git a/include/nbl/asset/utils/IMeshManipulator.h b/include/nbl/asset/utils/IMeshManipulator.h index dfdd44e475..f84d85c75d 100644 --- a/include/nbl/asset/utils/IMeshManipulator.h +++ b/include/nbl/asset/utils/IMeshManipulator.h @@ -557,7 +557,7 @@ class NBL_API2 IMeshManipulator : public virtual core::IReferenceCounted core::smart_refctd_ptr iotaUint32Buffer; if (iotaLength) { - iotaUint32Buffer = core::make_smart_refctd_ptr(sizeof(uint32_t)*iotaLength); + iotaUint32Buffer = ICPUBuffer::create({ sizeof(uint32_t)*iotaLength }); auto ptr = 
reinterpret_cast(iotaUint32Buffer->getPointer()); std::iota(ptr,ptr+iotaLength,0u); } @@ -599,13 +599,13 @@ class NBL_API2 IMeshManipulator : public virtual core::IReferenceCounted if (indexType==EIT_16BIT) { auto inPtr = reinterpret_cast(correctlyOffsetIndexBufferPtr); - newIndexBuffer = core::make_smart_refctd_ptr(sizeof(uint32_t)*indexCount); + newIndexBuffer = ICPUBuffer::create({ sizeof(uint32_t)*indexCount }); std::copy(inPtr,inPtr+indexCount,reinterpret_cast(newIndexBuffer->getPointer())); } else { auto inPtr = reinterpret_cast(correctlyOffsetIndexBufferPtr); - newIndexBuffer = core::make_smart_refctd_ptr(sizeof(uint16_t)*indexCount); + newIndexBuffer = ICPUBuffer::create({ sizeof(uint16_t)*indexCount }); std::copy(inPtr,inPtr+indexCount,reinterpret_cast(newIndexBuffer->getPointer())); } } diff --git a/include/nbl/asset/utils/IMeshPacker.h b/include/nbl/asset/utils/IMeshPacker.h index 3f09062b18..9daafd9ded 100644 --- a/include/nbl/asset/utils/IMeshPacker.h +++ b/include/nbl/asset/utils/IMeshPacker.h @@ -551,7 +551,7 @@ class IMeshPacker : public IMeshPackerBase core::smart_refctd_ptr idxBufferToProcess; if (iota) { - idxBufferToProcess = core::make_smart_refctd_ptr(sizeof(uint32_t) * idxCount); + idxBufferToProcess = ICPUBuffer::create({ sizeof(uint32_t) * idxCount }); auto ptr = reinterpret_cast(idxBufferToProcess->getPointer()); std::iota(ptr, ptr + idxCount, 0u); idxType = EIT_32BIT; diff --git a/include/nbl/asset/utils/IShaderCompiler.h b/include/nbl/asset/utils/IShaderCompiler.h index ac7ed5eb13..7acf61e0b0 100644 --- a/include/nbl/asset/utils/IShaderCompiler.h +++ b/include/nbl/asset/utils/IShaderCompiler.h @@ -219,6 +219,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted core::blake3_hash_t hash = {}; // If true, then `getIncludeStandard` was used to find, otherwise `getIncludeRelative` bool standardInclude = false; + }; struct SCompilerArgs; // Forward declaration for SPreprocessorArgs's friend declaration @@ -525,7 +526,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted constexpr size_t nullTerminatorSize = 1u; size_t outSize = origLen + formatArgsCharSize + formatSize + nullTerminatorSize - 2 * templateArgsCount; - nbl::core::smart_refctd_ptr outBuffer = nbl::core::make_smart_refctd_ptr(outSize); + nbl::core::smart_refctd_ptr outBuffer = ICPUBuffer::create({ outSize }); auto origCode = std::string_view(reinterpret_cast(original->getContent()->getPointer()), origLen); auto outCode = reinterpret_cast(outBuffer->getPointer()); diff --git a/include/nbl/core/alloc/resource_owning_vector.h b/include/nbl/core/alloc/resource_owning_vector.h new file mode 100644 index 0000000000..330c4fa819 --- /dev/null +++ b/include/nbl/core/alloc/resource_owning_vector.h @@ -0,0 +1,50 @@ +// Copyright (C) 2019-2024 - DevSH Graphics Programming Sp. z O.O. 
+// This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine" +// For conditions of distribution and use, see copyright notice in nabla.h +// See the original file in irrlicht source for authors + +#ifndef _NBL_CORE_ALLOC_RESOURCE_OWNING_VECTOR_INCLUDED_ +#define _NBL_CORE_ALLOC_RESOURCE_OWNING_VECTOR_INCLUDED_ + +#include + +using namespace nbl; + +namespace nbl::core +{ + +class resource_owning_vector : public std::pmr::memory_resource { +public: + // only create the resource from an already sized vector + resource_owning_vector(core::vector&& buffer) : buffer(buffer), offset(0) + { + assert(buffer.size()); + }; + + void* data() { + return buffer.data(); + } + +protected: + void* do_allocate(size_t bytes, size_t alignment) override { + auto space = buffer.size() - offset; + auto ptr = buffer.data() + offset; + assert(bytes <= space); + offset += bytes; + return ptr; + } + + void do_deallocate(void* p, size_t bytes, size_t alignment) override {} + + bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override { + return this == &other; + } + +private: + core::vector buffer; + size_t offset; +}; + +} + +#endif \ No newline at end of file diff --git a/include/nbl/ext/ScreenShot/ScreenShot.h b/include/nbl/ext/ScreenShot/ScreenShot.h index fd0c2723b4..4e71749cd7 100644 --- a/include/nbl/ext/ScreenShot/ScreenShot.h +++ b/include/nbl/ext/ScreenShot/ScreenShot.h @@ -154,7 +154,7 @@ inline core::smart_refctd_ptr createScreenShot( region.imageOffset = { 0u, 0u, 0u }; region.imageExtent = cpuNewImage->getCreationParameters().extent; - auto cpuNewTexelBuffer = core::make_smart_refctd_ptr(gpuTexelBufferSize); + auto cpuNewTexelBuffer = ICPUBuffer::create({ gpuTexelBufferSize }); { memcpy(cpuNewTexelBuffer->getPointer(), gpuTexelBuffer->getBoundMemory().memory->getMappedPointer(), gpuTexelBuffer->getSize()); } diff --git a/include/nbl/scene/ICullingLoDSelectionSystem.h b/include/nbl/scene/ICullingLoDSelectionSystem.h index 0e4c40b771..dee356b20f 100644 --- a/include/nbl/scene/ICullingLoDSelectionSystem.h +++ b/include/nbl/scene/ICullingLoDSelectionSystem.h @@ -499,7 +499,7 @@ class ICullingLoDSelectionSystem : public virtual core::IReferenceCounted auto glslFile = loadBuiltinData(Path.value); core::smart_refctd_ptr glsl; { - glsl = core::make_smart_refctd_ptr(glslFile->getSize()); + glsl = asset::ICPUBuffer::create({ glslFile->getSize() }); memcpy(glsl->getPointer(), glslFile->getMappedPointer(), glsl->getSize()); } return {core::make_smart_refctd_ptr(std::move(glsl), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_GLSL, Path.value), Path.value}; diff --git a/include/nbl/scene/ISkinInstanceCacheManager.h b/include/nbl/scene/ISkinInstanceCacheManager.h index 0d7f7a3391..5a5e3f5881 100644 --- a/include/nbl/scene/ISkinInstanceCacheManager.h +++ b/include/nbl/scene/ISkinInstanceCacheManager.h @@ -38,7 +38,7 @@ class ISkinInstanceCacheManager : public virtual core::IReferenceCounted auto glslFile = loadBuiltinData(uniqueString); core::smart_refctd_ptr glsl; { - glsl = core::make_smart_refctd_ptr(glslFile->getSize()); + glsl = asset::ICPUBuffer::create({ glslFile->getSize() }); memcpy(glsl->getPointer(), glslFile->getMappedPointer(), glsl->getSize()); } auto shader = device->createShader(core::make_smart_refctd_ptr(core::smart_refctd_ptr(glsl), type, asset::IShader::E_CONTENT_TYPE::ECT_GLSL, "")); diff --git a/include/nbl/scene/ITransformTreeManager.h b/include/nbl/scene/ITransformTreeManager.h index 6152c32481..35d29b3ce5 100644 --- 
a/include/nbl/scene/ITransformTreeManager.h +++ b/include/nbl/scene/ITransformTreeManager.h @@ -121,7 +121,7 @@ class ITransformTreeManager : public virtual core::IReferenceCounted auto glslFile = loadBuiltinData(Path.value); core::smart_refctd_ptr glsl; { - glsl = core::make_smart_refctd_ptr(glslFile->getSize()); + glsl = asset::ICPUBuffer::create({ glslFile->getSize() }); memcpy(glsl->getPointer(), glslFile->getMappedPointer(), glsl->getSize()); } auto shader = device->createShader(core::make_smart_refctd_ptr(core::smart_refctd_ptr(glsl), type, asset::IShader::E_CONTENT_TYPE::ECT_GLSL, "????")); diff --git a/src/nbl/asset/IAssetManager.cpp b/src/nbl/asset/IAssetManager.cpp index e6a23fdb44..275a4b75fb 100644 --- a/src/nbl/asset/IAssetManager.cpp +++ b/src/nbl/asset/IAssetManager.cpp @@ -343,7 +343,7 @@ void IAssetManager::insertBuiltinAssets() info.samples = asset::ICPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT; info.flags = static_cast(0u); info.usage = asset::IImage::EUF_INPUT_ATTACHMENT_BIT|asset::IImage::EUF_SAMPLED_BIT; - auto buf = core::make_smart_refctd_ptr(info.extent.width*info.extent.height*asset::getTexelOrBlockBytesize(info.format)); + auto buf = asset::ICPUBuffer::create({ info.extent.width*info.extent.height*asset::getTexelOrBlockBytesize(info.format) }); memcpy(buf->getPointer(), //magenta-grey 2x2 chessboard std::array{{255, 0, 255, 255, 128, 128, 128, 255, 128, 128, 128, 255, 255, 0, 255, 255}}.data(), @@ -403,7 +403,7 @@ void IAssetManager::insertBuiltinAssets() auto ds1 = core::make_smart_refctd_ptr(core::smart_refctd_ptr(defaultDs1Layout.get())); { constexpr size_t UBO_SZ = sizeof(asset::SBasicViewParameters); - auto ubo = core::make_smart_refctd_ptr(UBO_SZ); + auto ubo = asset::ICPUBuffer::create({ UBO_SZ }); //for filling this UBO with actual data, one can use asset::SBasicViewParameters struct defined in nbl/asset/asset_utils.h asset::fillBufferWithDeadBeef(ubo.get()); diff --git a/src/nbl/asset/ICPUImage.cpp b/src/nbl/asset/ICPUImage.cpp index aac6291e9f..27a4f0834b 100644 --- a/src/nbl/asset/ICPUImage.cpp +++ b/src/nbl/asset/ICPUImage.cpp @@ -122,7 +122,7 @@ class CFlattenRegionsStreamHashImageFilter : public CMatchedSizeInOutImageFilter auto getScratchAsBuffer = [&memory = state->scratch.memory](size_t size, size_t offset = 0ull) { - return core::make_smart_refctd_ptr, true> >(size, (uint8_t*)memory + offset, core::adopt_memory); // adopt memory & don't free it on exit + return ICPUBuffer::create({ size, (uint8_t*)memory + offset }, core::adopt_memory); // adopt memory & don't free it on exit }; /* diff --git a/src/nbl/asset/interchange/CBufferLoaderBIN.h b/src/nbl/asset/interchange/CBufferLoaderBIN.h index 8434a01637..2d0ea5f765 100644 --- a/src/nbl/asset/interchange/CBufferLoaderBIN.h +++ b/src/nbl/asset/interchange/CBufferLoaderBIN.h @@ -36,7 +36,7 @@ class CBufferLoaderBIN final : public asset::IAssetLoader private: struct SContext { - SContext(const size_t& sizeInBytes) : sourceCodeBuffer(core::make_smart_refctd_ptr(sizeInBytes)) {} + SContext(const size_t& sizeInBytes) : sourceCodeBuffer(ICPUBuffer::create({ sizeInBytes })) {} system::IFile* file; core::smart_refctd_ptr sourceCodeBuffer; }; diff --git a/src/nbl/asset/interchange/CGLILoader.cpp b/src/nbl/asset/interchange/CGLILoader.cpp index aa75fb52ae..1599b765b0 100644 --- a/src/nbl/asset/interchange/CGLILoader.cpp +++ b/src/nbl/asset/interchange/CGLILoader.cpp @@ -116,7 +116,7 @@ namespace nbl const auto texelBlockDimension = asset::getBlockDimensions(format.first); const auto texelBlockByteSize = 
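The CFlattenRegionsStreamHashImageFilter change above uses the core::adopt_memory overload of ICPUBuffer::create rather than the owning one (the ImageRegionIterator.cpp hunks near the end of this patch do the same). A minimal sketch of the two call shapes, assuming the positional SICPUBufferCreationParams aggregate these hunks rely on and placeholder sizes:

#include "nbl/asset/ICPUBuffer.h"

using namespace nbl;

static void createBuffers()
{
    // Owning form used by the loader/creator hunks: allocates `size` bytes and
    // releases them when the buffer is destroyed.
    auto owned = asset::ICPUBuffer::create({ 256ull });

    // Adopting form used by getScratchAsBuffer above: wraps caller-managed memory,
    // performs no allocation and never frees the pointer, so `scratch` must
    // outlive `view`.
    static uint8_t scratch[128];
    auto view = asset::ICPUBuffer::create({ sizeof(scratch), scratch }, core::adopt_memory);
}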
asset::getTexelOrBlockBytesize(format.first); - auto texelBuffer = core::make_smart_refctd_ptr(texture.size()); + auto texelBuffer = ICPUBuffer::create({ texture.size() }); auto data = reinterpret_cast(texelBuffer->getPointer()); ICPUImage::SCreationParams imageInfo = {}; diff --git a/src/nbl/asset/interchange/CGLTFLoader.cpp b/src/nbl/asset/interchange/CGLTFLoader.cpp index 7382bd2f08..413bf6fd22 100644 --- a/src/nbl/asset/interchange/CGLTFLoader.cpp +++ b/src/nbl/asset/interchange/CGLTFLoader.cpp @@ -59,7 +59,7 @@ using namespace nbl::asset; }; core::smart_refctd_ptr glslFile = loadBuiltinData(decltype(constexprStringType)::value); - auto glsl = core::make_smart_refctd_ptr(glslFile->getSize()); + auto glsl = asset::ICPUBuffer::create({ glslFile->getSize() }); memcpy(glsl->getPointer(),glslFile->getMappedPointer(),glsl->getSize()); auto unspecializedShader = core::make_smart_refctd_ptr(std::move(glsl), stage, asset::ICPUShader::E_CONTENT_TYPE::ECT_GLSL, stage != ICPUShader::ESS_VERTEX ? "?IrrlichtBAW glTFLoader FragmentShader?" : "?IrrlichtBAW glTFLoader VertexShader?"); @@ -115,7 +115,7 @@ using namespace nbl::asset; { simdjson::dom::parser parser; - auto jsonBuffer = core::make_smart_refctd_ptr(_file->getSize()); + auto jsonBuffer = ICPUBuffer::create({ _file->getSize() }); { system::IFile::success_t success; _file->read(success, jsonBuffer->getPointer(), 0u, jsonBuffer->getSize()); @@ -529,8 +529,8 @@ using namespace nbl::asset; core::vector skeletonJointCountPrefixSum; skeletonJointCountPrefixSum.resize(skeletonJointCount.size()); // now create buffer for skeletons - SBufferBinding parentJointID = {0ull,core::make_smart_refctd_ptr(sizeof(ICPUSkeleton::joint_id_t)*nodeCount)}; - SBufferBinding defaultTransforms = {0ull,core::make_smart_refctd_ptr(sizeof(core::matrix3x4SIMD)*nodeCount)}; + SBufferBinding parentJointID = {0ull,ICPUBuffer::create({ sizeof(ICPUSkeleton::joint_id_t)*nodeCount })}; + SBufferBinding defaultTransforms = {0ull,ICPUBuffer::create({ sizeof(core::matrix3x4SIMD)*nodeCount })}; core::vector names(nodeCount); // and fill them { @@ -564,8 +564,8 @@ using namespace nbl::asset; uint32_t totalSkinJointRefs = 0u; for (const auto& skin : glTF.skins) totalSkinJointRefs += skin.joints.size(); - vertexJointToSkeletonJoint = core::make_smart_refctd_ptr(sizeof(ICPUSkeleton::joint_id_t)*totalSkinJointRefs); - inverseBindPose = core::make_smart_refctd_ptr(sizeof(core::matrix3x4SIMD)*totalSkinJointRefs); + vertexJointToSkeletonJoint = ICPUBuffer::create({ sizeof(ICPUSkeleton::joint_id_t)*totalSkinJointRefs }); + inverseBindPose = ICPUBuffer::create({ sizeof(core::matrix3x4SIMD)*totalSkinJointRefs }); } // then go over skins uint32_t skinJointRefCount = 0u; @@ -1079,8 +1079,8 @@ using namespace nbl::asset; constexpr bool isValidWeighComponentT = std::is_same::value || std::is_same::value || std::is_same::value; static_assert(isValidJointComponentT && isValidWeighComponentT); - vOverrideJointsBuffer = core::make_smart_refctd_ptr(vCommonOverrideAttributesCount * vJointsTexelByteSize); - vOverrideWeightsBuffer = core::make_smart_refctd_ptr(vCommonOverrideAttributesCount * vWeightsTexelByteSize); + vOverrideJointsBuffer = asset::ICPUBuffer::create({ vCommonOverrideAttributesCount * vJointsTexelByteSize }); + vOverrideWeightsBuffer = asset::ICPUBuffer::create({ vCommonOverrideAttributesCount * vWeightsTexelByteSize }); for (size_t vAttributeIx = 0; vAttributeIx < vCommonOverrideAttributesCount; ++vAttributeIx) { @@ -1231,8 +1231,8 @@ using namespace nbl::asset; const size_t 
repackJointsTexelByteSize = asset::getTexelOrBlockBytesize(repackJointsFormat); const size_t repackWeightsTexelByteSize = asset::getTexelOrBlockBytesize(repackWeightsFormat); - auto vOverrideRepackedJointsBuffer = core::make_smart_refctd_ptr(vCommonOverrideAttributesCount * repackJointsTexelByteSize); - auto vOverrideRepackedWeightsBuffer = core::make_smart_refctd_ptr(vCommonOverrideAttributesCount * repackWeightsTexelByteSize); + auto vOverrideRepackedJointsBuffer = asset::ICPUBuffer::create({ vCommonOverrideAttributesCount * repackJointsTexelByteSize }); + auto vOverrideRepackedWeightsBuffer = asset::ICPUBuffer::create({ vCommonOverrideAttributesCount * repackWeightsTexelByteSize }); memset(vOverrideRepackedJointsBuffer->getPointer(), 0, vOverrideRepackedJointsBuffer->getSize()); memset(vOverrideRepackedWeightsBuffer->getPointer(), 0, vOverrideRepackedWeightsBuffer->getSize()); @@ -1413,7 +1413,7 @@ using namespace nbl::asset; assert(weightsQuantizeFormat != EF_UNKNOWN); { vOverrideWeightsBuffer = std::move(core::smart_refctd_ptr()); //! free memory - auto vOverrideQuantizedWeightsBuffer = core::make_smart_refctd_ptr(weightComponentsByteStride * vCommonOverrideAttributesCount); + auto vOverrideQuantizedWeightsBuffer = asset::ICPUBuffer::create({ weightComponentsByteStride * vCommonOverrideAttributesCount }); { for (size_t vAttributeIx = 0; vAttributeIx < vCommonOverrideAttributesCount; ++vAttributeIx) { @@ -1561,7 +1561,7 @@ using namespace nbl::asset; inverseBindPoseBinding.offset = skin.inverseBindPose.offset; SBufferBinding jointAABBBufferBinding; - jointAABBBufferBinding.buffer = core::make_smart_refctd_ptr(jointCount*sizeof(core::aabbox3df)); + jointAABBBufferBinding.buffer = asset::ICPUBuffer::create({ jointCount*sizeof(core::aabbox3df) }); jointAABBBufferBinding.offset = 0u; auto* aabbPtr = reinterpret_cast(jointAABBBufferBinding.buffer->getPointer()); @@ -1658,7 +1658,7 @@ using namespace nbl::asset; simdjson::dom::parser parser; auto* _file = context.loadContext.mainFile; - auto jsonBuffer = core::make_smart_refctd_ptr(_file->getSize()); + auto jsonBuffer = ICPUBuffer::create({ _file->getSize() }); { system::IFile::success_t success; _file->read(success, jsonBuffer->getPointer(), 0u, jsonBuffer->getSize()); diff --git a/src/nbl/asset/interchange/CGraphicsPipelineLoaderMTL.cpp b/src/nbl/asset/interchange/CGraphicsPipelineLoaderMTL.cpp index 082f6c2dbc..3522b793d1 100644 --- a/src/nbl/asset/interchange/CGraphicsPipelineLoaderMTL.cpp +++ b/src/nbl/asset/interchange/CGraphicsPipelineLoaderMTL.cpp @@ -43,7 +43,7 @@ CGraphicsPipelineLoaderMTL::CGraphicsPipelineLoaderMTL(IAssetManager* _am, core: }; core::smart_refctd_ptr data = loadBuiltinData(Path.value); - auto buffer = core::make_smart_refctd_ptr(data->getSize()+1u); + auto buffer = asset::ICPUBuffer::create({ data->getSize()+1u }); char* bufferPtr = reinterpret_cast(buffer->getPointer()); memcpy(bufferPtr, data->getMappedPointer(), data->getSize()); bufferPtr[data->getSize()] = '\0'; @@ -627,7 +627,7 @@ CGraphicsPipelineLoaderMTL::image_views_set_t CGraphicsPipelineLoaderMTL::loadIm } #endif } - auto imgDataBuf = core::make_smart_refctd_ptr(bufSz); + auto imgDataBuf = ICPUBuffer::create({ bufSz }); for (uint32_t i = CMTLMetadata::CRenderpassIndependentPipeline::EMP_REFL_POSX, j = 0u; i < CMTLMetadata::CRenderpassIndependentPipeline::EMP_REFL_POSX + 6u; ++i) { #ifndef _NBL_DEBUG diff --git a/src/nbl/asset/interchange/CHLSLLoader.cpp b/src/nbl/asset/interchange/CHLSLLoader.cpp index 2ebea205d0..a6924b132a 100644 --- 
a/src/nbl/asset/interchange/CHLSLLoader.cpp +++ b/src/nbl/asset/interchange/CHLSLLoader.cpp @@ -15,7 +15,7 @@ SAssetBundle CHLSLLoader::loadAsset(system::IFile* _file, const IAssetLoader::SA return {}; const auto len = _file->getSize(); - auto source = core::make_smart_refctd_ptr(len+1); + auto source = ICPUBuffer::create({ len+1 }); system::IFile::success_t success; _file->read(success, source->getPointer(), 0, len); diff --git a/src/nbl/asset/interchange/CImageLoaderJPG.cpp b/src/nbl/asset/interchange/CImageLoaderJPG.cpp index 2c0687b18d..45677ff5cf 100644 --- a/src/nbl/asset/interchange/CImageLoaderJPG.cpp +++ b/src/nbl/asset/interchange/CImageLoaderJPG.cpp @@ -310,7 +310,7 @@ asset::SAssetBundle CImageLoaderJPG::loadAsset(system::IFile* _file, const asset uint32_t rowspan = region.bufferRowLength * cinfo.out_color_components; // Allocate memory for buffer - auto buffer = core::make_smart_refctd_ptr(rowspan*height); + auto buffer = asset::ICPUBuffer::create({ rowspan*height }); // Here we use the library's state variable cinfo.output_scanline as the // loop counter, so that we don't have to keep track ourselves. diff --git a/src/nbl/asset/interchange/CImageLoaderOpenEXR.cpp b/src/nbl/asset/interchange/CImageLoaderOpenEXR.cpp index 0bd9fce739..da4e00070f 100644 --- a/src/nbl/asset/interchange/CImageLoaderOpenEXR.cpp +++ b/src/nbl/asset/interchange/CImageLoaderOpenEXR.cpp @@ -359,7 +359,7 @@ SAssetBundle CImageLoaderOpenEXR::loadAsset(system::IFile* _file, const asset::I auto image = ICPUImage::create(std::move(params)); { // create image and buffer that backs it const uint32_t texelFormatByteSize = getTexelOrBlockBytesize(image->getCreationParameters().format); - auto texelBuffer = core::make_smart_refctd_ptr(image->getImageDataSizeInBytes()); + auto texelBuffer = ICPUBuffer::create({ image->getImageDataSizeInBytes() }); auto regions = core::make_refctd_dynamic_array>(1u); ICPUImage::SBufferCopy& region = regions->front(); region.imageSubresource.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; diff --git a/src/nbl/asset/interchange/CImageLoaderPNG.cpp b/src/nbl/asset/interchange/CImageLoaderPNG.cpp index 84a451c9d7..d7548146c3 100644 --- a/src/nbl/asset/interchange/CImageLoaderPNG.cpp +++ b/src/nbl/asset/interchange/CImageLoaderPNG.cpp @@ -286,7 +286,7 @@ asset::SAssetBundle CImageLoaderPng::loadAsset(system::IFile* _file, const asset region.imageOffset = { 0u, 0u, 0u }; region.imageExtent = imgInfo.extent; - auto texelBuffer = core::make_smart_refctd_ptr(region.bufferRowLength * region.imageExtent.height * texelFormatBytesize); + auto texelBuffer = ICPUBuffer::create({ region.bufferRowLength * region.imageExtent.height * texelFormatBytesize }); // Fill array of pointers to rows in image data const uint32_t pitch = region.bufferRowLength*texelFormatBytesize; diff --git a/src/nbl/asset/interchange/CImageLoaderTGA.cpp b/src/nbl/asset/interchange/CImageLoaderTGA.cpp index e87a263675..5177cb2d19 100644 --- a/src/nbl/asset/interchange/CImageLoaderTGA.cpp +++ b/src/nbl/asset/interchange/CImageLoaderTGA.cpp @@ -30,7 +30,7 @@ namespace asset auto inputTexelOrBlockByteSize = inputBuffer->getSize(); const uint32_t texelOrBlockLength = inputTexelOrBlockByteSize / asset::getTexelOrBlockBytesize(inputFormat); - auto outputBuffer = core::make_smart_refctd_ptr(texelOrBlockLength * asset::getTexelOrBlockBytesize(outputFormat)); + auto outputBuffer = asset::ICPUBuffer::create({ texelOrBlockLength * asset::getTexelOrBlockBytesize(outputFormat) }); auto outputTexelOrBlockByteSize = 
outputBuffer->getSize(); for (auto i = 0ull; i < texelOrBlockLength; ++i) @@ -49,7 +49,7 @@ void CImageLoaderTGA::loadCompressedImage(system::IFile *file, const STGAHeader& // I only changed the formatting a little bit. int32_t bytesPerPixel = header.PixelDepth/8; int32_t imageSizeInBytes = header.ImageHeight * header.ImageWidth * bytesPerPixel; - bufferData = core::make_smart_refctd_ptr(wholeSizeWithPitchInBytes); + bufferData = ICPUBuffer::create({ wholeSizeWithPitchInBytes }); auto data = reinterpret_cast(bufferData->getPointer()); int32_t currentByte = 0; @@ -202,7 +202,7 @@ asset::SAssetBundle CImageLoaderTGA::loadAsset(system::IFile* _file, const asset if (header.ColorMapType) { auto colorMapEntryByteSize = header.ColorMapEntrySize / 8 * header.ColorMapLength; - auto colorMapEntryBuffer = core::make_smart_refctd_ptr(colorMapEntryByteSize); + auto colorMapEntryBuffer = asset::ICPUBuffer::create({ (size_t)colorMapEntryByteSize }); { system::IFile::success_t success; _file->read(success, colorMapEntryBuffer->getPointer(), offset, header.ColorMapEntrySize / 8 * header.ColorMapLength); @@ -251,7 +251,7 @@ asset::SAssetBundle CImageLoaderTGA::loadAsset(system::IFile* _file, const asset { region.bufferRowLength = calcPitchInBlocks(region.imageExtent.width, getTexelOrBlockBytesize(EF_R8G8B8_SRGB)); const int32_t imageSize = endBufferSize = region.imageExtent.height * region.bufferRowLength * bytesPerTexel; - texelBuffer = core::make_smart_refctd_ptr(imageSize); + texelBuffer = ICPUBuffer::create({ (size_t)imageSize }); { system::IFile::success_t success; _file->read(success, texelBuffer->getPointer(), offset, imageSize); diff --git a/src/nbl/asset/interchange/CImageWriterJPG.cpp b/src/nbl/asset/interchange/CImageWriterJPG.cpp index 8af22d58c7..dea43ba696 100644 --- a/src/nbl/asset/interchange/CImageWriterJPG.cpp +++ b/src/nbl/asset/interchange/CImageWriterJPG.cpp @@ -151,7 +151,7 @@ static bool writeJPEGFile(system::IFile* file, system::ISystem* sys, const asset jpeg_start_compress(&cinfo, TRUE); const auto JPG_BYTE_PITCH = rowByteSize; - auto destBuffer = core::make_smart_refctd_ptr(JPG_BYTE_PITCH); + auto destBuffer = asset::ICPUBuffer::create({ JPG_BYTE_PITCH }); auto dest = reinterpret_cast(destBuffer->getPointer()); if (dest) diff --git a/src/nbl/asset/interchange/CImageWriterTGA.cpp b/src/nbl/asset/interchange/CImageWriterTGA.cpp index 9c14cc8292..a5b5e676b9 100644 --- a/src/nbl/asset/interchange/CImageWriterTGA.cpp +++ b/src/nbl/asset/interchange/CImageWriterTGA.cpp @@ -134,7 +134,7 @@ bool CImageWriterTGA::writeAsset(system::IFile* _file, const SAssetWriteParams& int32_t row_size = ((imageHeader.PixelDepth / 8) * imageHeader.ImageWidth); // allocate a row do translate data into - auto rowPointerBuffer = core::make_smart_refctd_ptr(row_size); + auto rowPointerBuffer = ICPUBuffer::create({ (size_t)row_size }); auto row_pointer = reinterpret_cast(rowPointerBuffer->getPointer()); uint32_t y; diff --git a/src/nbl/asset/interchange/COBJMeshFileLoader.cpp b/src/nbl/asset/interchange/COBJMeshFileLoader.cpp index adaa786f5f..61d9eab6ec 100644 --- a/src/nbl/asset/interchange/COBJMeshFileLoader.cpp +++ b/src/nbl/asset/interchange/COBJMeshFileLoader.cpp @@ -465,10 +465,10 @@ asset::SAssetBundle COBJMeshFileLoader::loadAsset(system::IFile* _file, const as submeshes[i]->setPipeline(std::move(pipeline.first)); } - core::smart_refctd_ptr vtxBuf = core::make_smart_refctd_ptr(vertices.size() * sizeof(SObjVertex)); + core::smart_refctd_ptr vtxBuf = ICPUBuffer::create({ vertices.size() * 
sizeof(SObjVertex) }); memcpy(vtxBuf->getPointer(), vertices.data(), vtxBuf->getSize()); - auto ixBuf = core::make_smart_refctd_ptr(ixBufOffset); + auto ixBuf = ICPUBuffer::create({ ixBufOffset }); for (size_t i = 0ull; i < submeshes.size(); ++i) { if (submeshWasLoadedFromCache[i]) diff --git a/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp b/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp index 4ad0710dbf..3cdc261408 100644 --- a/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp +++ b/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp @@ -373,7 +373,7 @@ asset::SAssetBundle CPLYMeshFileLoader::loadAsset(system::IFile* _file, const as if (!attributes[ET_POS].buffer) { attributes[ET_POS].offset = 0u; - attributes[ET_POS].buffer = core::make_smart_refctd_ptr(asset::getTexelOrBlockBytesize(EF_R32G32B32_SFLOAT) * plyVertexElement.Count); + attributes[ET_POS].buffer = asset::ICPUBuffer::create({ asset::getTexelOrBlockBytesize(EF_R32G32B32_SFLOAT) * plyVertexElement.Count }); } } else if(propertyName == "nx" || propertyName == "ny" || propertyName == "nz") @@ -381,7 +381,7 @@ asset::SAssetBundle CPLYMeshFileLoader::loadAsset(system::IFile* _file, const as if (!attributes[ET_NORM].buffer) { attributes[ET_NORM].offset = 0u; - attributes[ET_NORM].buffer = core::make_smart_refctd_ptr(asset::getTexelOrBlockBytesize(EF_R32G32B32_SFLOAT) * plyVertexElement.Count); + attributes[ET_NORM].buffer = asset::ICPUBuffer::create({ asset::getTexelOrBlockBytesize(EF_R32G32B32_SFLOAT) * plyVertexElement.Count }); } } else if (propertyName == "u" || propertyName == "s" || propertyName == "v" || propertyName == "t") @@ -389,7 +389,7 @@ asset::SAssetBundle CPLYMeshFileLoader::loadAsset(system::IFile* _file, const as if (!attributes[ET_UV].buffer) { attributes[ET_UV].offset = 0u; - attributes[ET_UV].buffer = core::make_smart_refctd_ptr(asset::getTexelOrBlockBytesize(EF_R32G32_SFLOAT) * plyVertexElement.Count); + attributes[ET_UV].buffer = asset::ICPUBuffer::create({ asset::getTexelOrBlockBytesize(EF_R32G32_SFLOAT) * plyVertexElement.Count }); } } else if (propertyName == "red" || propertyName == "green" || propertyName == "blue" || propertyName == "alpha") @@ -397,7 +397,7 @@ asset::SAssetBundle CPLYMeshFileLoader::loadAsset(system::IFile* _file, const as if (!attributes[ET_COL].buffer) { attributes[ET_COL].offset = 0u; - attributes[ET_COL].buffer = core::make_smart_refctd_ptr(asset::getTexelOrBlockBytesize(EF_R32G32B32A32_SFLOAT) * plyVertexElement.Count); + attributes[ET_COL].buffer = asset::ICPUBuffer::create({ asset::getTexelOrBlockBytesize(EF_R32G32B32A32_SFLOAT) * plyVertexElement.Count }); } } } @@ -426,7 +426,7 @@ asset::SAssetBundle CPLYMeshFileLoader::loadAsset(system::IFile* _file, const as if (indices.size()) { - asset::SBufferBinding indexBinding = { 0, core::make_smart_refctd_ptr(indices.size() * sizeof(uint32_t)) }; + asset::SBufferBinding indexBinding = { 0, asset::ICPUBuffer::create({ indices.size() * sizeof(uint32_t) }) }; memcpy(indexBinding.buffer->getPointer(), indices.data(), indexBinding.buffer->getSize()); mb->setIndexCount(indices.size()); diff --git a/src/nbl/asset/interchange/CSPVLoader.cpp b/src/nbl/asset/interchange/CSPVLoader.cpp index baadc5ba24..6787912cde 100644 --- a/src/nbl/asset/interchange/CSPVLoader.cpp +++ b/src/nbl/asset/interchange/CSPVLoader.cpp @@ -65,7 +65,7 @@ SAssetBundle CSPVLoader::loadAsset(system::IFile* _file, const IAssetLoader::SAs if (!_file) return {}; - auto buffer = core::make_smart_refctd_ptr(_file->getSize()); + auto buffer = ICPUBuffer::create({ 
_file->getSize() }); system::IFile::success_t success; _file->read(success, buffer->getPointer(), 0, _file->getSize()); diff --git a/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp b/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp index c080857c63..defce74b7b 100644 --- a/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp +++ b/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp @@ -274,7 +274,7 @@ SAssetBundle CSTLMeshFileLoader::loadAsset(system::IFile* _file, const IAssetLoa } // end while (_file->getPos() < filesize) const size_t vtxSize = hasColor ? (3 * sizeof(float) + 4 + 4) : (3 * sizeof(float) + 4); - auto vertexBuf = core::make_smart_refctd_ptr(vtxSize * positions.size()); + auto vertexBuf = asset::ICPUBuffer::create({ vtxSize * positions.size() }); using quant_normal_t = CQuantNormalCache::value_type_t; diff --git a/src/nbl/asset/utils/CDerivativeMapCreator.cpp b/src/nbl/asset/utils/CDerivativeMapCreator.cpp index 91051113b8..ee288d9aa7 100644 --- a/src/nbl/asset/utils/CDerivativeMapCreator.cpp +++ b/src/nbl/asset/utils/CDerivativeMapCreator.cpp @@ -80,7 +80,7 @@ core::smart_refctd_ptr CDerivativeMapCreator::createDerivativeMapFrom auto outParams = inParams; outParams.format = getRGformat(outParams.format); const uint32_t pitch = IImageAssetHandlerBase::calcPitchInBlocks(outParams.extent.width, getTexelOrBlockBytesize(outParams.format)); - auto buffer = core::make_smart_refctd_ptr(getTexelOrBlockBytesize(outParams.format) * pitch * outParams.extent.height); + auto buffer = ICPUBuffer::create({ getTexelOrBlockBytesize(outParams.format) * pitch * outParams.extent.height }); ICPUImage::SBufferCopy region; region.imageOffset = { 0,0,0 }; region.imageExtent = outParams.extent; @@ -194,7 +194,7 @@ core::smart_refctd_ptr CDerivativeMapCreator::createDerivativeMapFrom core::smart_refctd_ptr newDerivativeNormalMapImage; { const uint32_t pitch = IImageAssetHandlerBase::calcPitchInBlocks(newImageParams.extent.width,getTexelOrBlockBytesize(newImageParams.format)); - core::smart_refctd_ptr newCpuBuffer = core::make_smart_refctd_ptr(getTexelOrBlockBytesize(newImageParams.format) * pitch * newImageParams.extent.height); + core::smart_refctd_ptr newCpuBuffer = ICPUBuffer::create({ getTexelOrBlockBytesize(newImageParams.format) * pitch * newImageParams.extent.height }); ICPUImage::SBufferCopy region; region.imageOffset = { 0,0,0 }; diff --git a/src/nbl/asset/utils/CGLSLCompiler.cpp b/src/nbl/asset/utils/CGLSLCompiler.cpp index f90be9f27a..763468b79f 100644 --- a/src/nbl/asset/utils/CGLSLCompiler.cpp +++ b/src/nbl/asset/utils/CGLSLCompiler.cpp @@ -277,7 +277,7 @@ core::smart_refctd_ptr CGLSLCompiler::compileToSPIRV_impl(const std: if (bin_res.GetCompilationStatus() == shaderc_compilation_status_success) { - auto outSpirv = core::make_smart_refctd_ptr(std::distance(bin_res.cbegin(), bin_res.cend()) * sizeof(uint32_t)); + auto outSpirv = ICPUBuffer::create({ std::distance(bin_res.cbegin(), bin_res.cend()) * sizeof(uint32_t) }); memcpy(outSpirv->getPointer(), bin_res.cbegin(), outSpirv->getSize()); if (glslOptions.spirvOptimizer) diff --git a/src/nbl/asset/utils/CGeometryCreator.cpp b/src/nbl/asset/utils/CGeometryCreator.cpp index 50898be56f..ea18b4bf2b 100644 --- a/src/nbl/asset/utils/CGeometryCreator.cpp +++ b/src/nbl/asset/utils/CGeometryCreator.cpp @@ -37,7 +37,7 @@ CGeometryCreator::return_type CGeometryCreator::createCubeMesh(const core::vecto // Create indices { retval.indexCount = 36u; - auto indices = core::make_smart_refctd_ptr(sizeof(uint16_t)*retval.indexCount); + auto indices = 
asset::ICPUBuffer::create({ sizeof(uint16_t)*retval.indexCount }); indices->addUsageFlags(asset::IBuffer::EUF_INDEX_BUFFER_BIT); auto u = reinterpret_cast(indices->getPointer()); for (uint32_t i=0u; i<6u; ++i) @@ -53,7 +53,7 @@ CGeometryCreator::return_type CGeometryCreator::createCubeMesh(const core::vecto } // Create vertices - auto vertices = core::make_smart_refctd_ptr(24u*vertexSize); + auto vertices = asset::ICPUBuffer::create({ 24u*vertexSize }); vertices->addUsageFlags(IBuffer::EUF_VERTEX_BUFFER_BIT); CubeVertex* ptr = (CubeVertex*)vertices->getPointer(); @@ -190,9 +190,9 @@ CGeometryCreator::return_type CGeometryCreator::createArrowMesh(const uint32_t t coneVertices[i].pos[c] = newPos[c]; } - auto newArrowVertexBuffer = core::make_smart_refctd_ptr(newArrowVertexCount * sizeof(ArrowVertex)); + auto newArrowVertexBuffer = asset::ICPUBuffer::create({ newArrowVertexCount * sizeof(ArrowVertex) }); newArrowVertexBuffer->setUsageFlags(newArrowVertexBuffer->getUsageFlags() | asset::IBuffer::EUF_VERTEX_BUFFER_BIT); - auto newArrowIndexBuffer = core::make_smart_refctd_ptr(newArrowIndexCount * sizeof(uint16_t)); + auto newArrowIndexBuffer = asset::ICPUBuffer::create({ newArrowIndexCount * sizeof(uint16_t) }); newArrowIndexBuffer->setUsageFlags(newArrowIndexBuffer->getUsageFlags() | asset::IBuffer::EUF_INDEX_BUFFER_BIT); for (auto z = 0ull; z < newArrowVertexCount; ++z) @@ -269,7 +269,7 @@ CGeometryCreator::return_type CGeometryCreator::createSphereMesh(float radius, u const uint32_t polyCountXPitch = polyCountX + 1; // get to same vertex on next level retval.indexCount = (polyCountX * polyCountY) * 6; - auto indices = core::make_smart_refctd_ptr(sizeof(uint32_t) * retval.indexCount); + auto indices = asset::ICPUBuffer::create({ sizeof(uint32_t) * retval.indexCount }); // Create indices { @@ -339,7 +339,7 @@ CGeometryCreator::return_type CGeometryCreator::createSphereMesh(float radius, u { size_t vertexSize = 3 * 4 + 4 + 2 * 4 + 4; size_t vertexCount = (polyCountXPitch * polyCountY) + 2; - auto vtxBuf = core::make_smart_refctd_ptr(vertexCount * vertexSize); + auto vtxBuf = asset::ICPUBuffer::create({ vertexCount * vertexSize }); auto* tmpMem = reinterpret_cast(vtxBuf->getPointer()); for (size_t i = 0; i < vertexCount; i++) { @@ -463,7 +463,7 @@ CGeometryCreator::return_type CGeometryCreator::createCylinderMesh(float radius, },{vertexSize,SVertexInputBindingParams::EVIR_PER_VERTEX} }; const size_t vtxCnt = 2u*tesselation; - auto vtxBuf = core::make_smart_refctd_ptr(vtxCnt*sizeof(CylinderVertex)); + auto vtxBuf = asset::ICPUBuffer::create({ vtxCnt*sizeof(CylinderVertex) }); CylinderVertex* vertices = reinterpret_cast(vtxBuf->getPointer()); for (auto i=0ull; i(retval.indexCount *sizeof(uint16_t)); + auto idxBuf = asset::ICPUBuffer::create({ retval.indexCount *sizeof(uint16_t) }); uint16_t* indices = (uint16_t*)idxBuf->getPointer(); for (uint32_t i = 0u, j = 0u; i < halfIx; ++i) @@ -526,7 +526,7 @@ CGeometryCreator::return_type CGeometryCreator::createConeMesh( float radius, fl IMeshManipulator* const meshManipulatorOverride) const { const size_t vtxCnt = tesselation * 2; - auto vtxBuf = core::make_smart_refctd_ptr(vtxCnt * sizeof(ConeVertex)); + auto vtxBuf = asset::ICPUBuffer::create({ vtxCnt * sizeof(ConeVertex) }); ConeVertex* vertices = reinterpret_cast(vtxBuf->getPointer()); ConeVertex* baseVertices = vertices; @@ -570,7 +570,7 @@ CGeometryCreator::return_type CGeometryCreator::createConeMesh( float radius, fl apexVertices[i].normal = quantNormalCache->quantize(core::normalize(u1)); } - 
auto idxBuf = core::make_smart_refctd_ptr(3u * tesselation * sizeof(uint16_t)); + auto idxBuf = asset::ICPUBuffer::create({ 3u * tesselation * sizeof(uint16_t) }); uint16_t* indices = (uint16_t*)idxBuf->getPointer(); const uint32_t firstIndexOfBaseVertices = 0; @@ -633,13 +633,13 @@ CGeometryCreator::return_type CGeometryCreator::createRectangleMesh(const core:: u[4] = 3; u[5] = 2; - auto indices = core::make_smart_refctd_ptr(sizeof(u)); + auto indices = asset::ICPUBuffer::create({ sizeof(u) }); memcpy(indices->getPointer(), u, sizeof(u)); indices->addUsageFlags(asset::IBuffer::EUF_INDEX_BUFFER_BIT); retval.indexBuffer = { 0ull, std::move(indices) }; // Create vertices - auto vertices = core::make_smart_refctd_ptr(4 * vertexSize); + auto vertices = asset::ICPUBuffer::create({ 4 * vertexSize }); RectangleVertex* ptr = (RectangleVertex*)vertices->getPointer(); ptr[0] = RectangleVertex(core::vector3df_SIMD(-1.0f, 1.0f, 0.0f) * _size, video::SColor(0xFFFFFFFFu), @@ -676,7 +676,7 @@ CGeometryCreator::return_type CGeometryCreator::createDiskMesh(float radius, uin const float angle = 360.0f / static_cast(tesselation); - auto vertices = core::make_smart_refctd_ptr(vertexCount * vertexSize); + auto vertices = asset::ICPUBuffer::create({ vertexCount * vertexSize }); DiskVertex* ptr = (DiskVertex*)vertices->getPointer(); const core::vectorSIMDf v0(0.0f, radius, 0.0f, 1.0f); @@ -1693,8 +1693,8 @@ CGeometryCreator::return_type CGeometryCreator::createIcoSphere(float radius, ui {vertexSize,SVertexInputBindingParams::EVIR_PER_VERTEX} }; - auto vertexBuffer = core::make_smart_refctd_ptr(IcosphereData.getInterleavedVertexSize()); - auto indexBuffer = core::make_smart_refctd_ptr(IcosphereData.getIndexSize()); + auto vertexBuffer = asset::ICPUBuffer::create({ IcosphereData.getInterleavedVertexSize() }); + auto indexBuffer = asset::ICPUBuffer::create({ IcosphereData.getIndexSize() }); memcpy(vertexBuffer->getPointer(), IcosphereData.getInterleavedVertices(), vertexBuffer->getSize()); memcpy(indexBuffer->getPointer(), IcosphereData.getIndices(), indexBuffer->getSize()); diff --git a/src/nbl/asset/utils/CHLSLCompiler.cpp b/src/nbl/asset/utils/CHLSLCompiler.cpp index ab7bbd4996..2827361ff4 100644 --- a/src/nbl/asset/utils/CHLSLCompiler.cpp +++ b/src/nbl/asset/utils/CHLSLCompiler.cpp @@ -485,7 +485,7 @@ core::smart_refctd_ptr CHLSLCompiler::compileToSPIRV_impl(const std: return nullptr; } - auto outSpirv = core::make_smart_refctd_ptr(compileResult.objectBlob->GetBufferSize()); + auto outSpirv = ICPUBuffer::create({ compileResult.objectBlob->GetBufferSize() }); memcpy(outSpirv->getPointer(), compileResult.objectBlob->GetBufferPointer(), compileResult.objectBlob->GetBufferSize()); // Optimizer step diff --git a/src/nbl/asset/utils/CMeshManipulator.cpp b/src/nbl/asset/utils/CMeshManipulator.cpp index 8996972a98..29a096071e 100644 --- a/src/nbl/asset/utils/CMeshManipulator.cpp +++ b/src/nbl/asset/utils/CMeshManipulator.cpp @@ -59,7 +59,7 @@ void IMeshManipulator::flipSurfaces(ICPUMeshBuffer* inbuffer) } else //even { - auto newIndexBuffer = core::make_smart_refctd_ptr((idxcnt + 1u) * sizeof(uint16_t)); + auto newIndexBuffer = ICPUBuffer::create({ (idxcnt + 1u) * sizeof(uint16_t) }); auto* destPtr = reinterpret_cast(newIndexBuffer->getPointer()); destPtr[0] = idx[0]; memcpy(destPtr + 1u, idx, sizeof(uint16_t) * idxcnt); @@ -104,7 +104,7 @@ void IMeshManipulator::flipSurfaces(ICPUMeshBuffer* inbuffer) } else //even { - auto newIndexBuffer = core::make_smart_refctd_ptr((idxcnt + 1u) * sizeof(uint32_t)); + auto 
newIndexBuffer = ICPUBuffer::create({ (idxcnt + 1u) * sizeof(uint32_t) }); auto* destPtr = reinterpret_cast(newIndexBuffer->getPointer()); destPtr[0] = idx[0]; memcpy(destPtr + 1u, idx, sizeof(uint32_t) * idxcnt); @@ -188,7 +188,7 @@ core::smart_refctd_ptr CMeshManipulator::createMeshBufferFetchOp vtxParams.bindings[NEW_VTX_BUF_BINDING].stride = vertexSize; vtxParams.bindings[NEW_VTX_BUF_BINDING].inputRate = SVertexInputBindingParams::EVIR_PER_VERTEX; - auto newVertBuffer = core::make_smart_refctd_ptr(vertexCount*vertexSize); + auto newVertBuffer = ICPUBuffer::create({ vertexCount*vertexSize }); outbuffer->setVertexBufferBinding({ 0u, core::smart_refctd_ptr(newVertBuffer) }, NEW_VTX_BUF_BINDING); for (size_t i = 0; i < MAX_ATTRIBS; ++i) { @@ -311,7 +311,7 @@ core::smart_refctd_ptr IMeshManipulator::createMeshBufferUniqueP vtxParams.bindings[NEW_VTX_BUF_BINDING].inputRate = SVertexInputBindingParams::EVIR_PER_VERTEX; vtxParams.bindings[NEW_VTX_BUF_BINDING].stride = stride; - auto vertexBuffer = core::make_smart_refctd_ptr(stride*idxCnt); + auto vertexBuffer = ICPUBuffer::create({ stride*idxCnt }); clone->setVertexBufferBinding({0u, vertexBuffer}, 0u); for (size_t i=0; i IMeshManipulator::createMeshBufferUniqueP if (_makeIndexBuf) { - auto idxbuf = core::make_smart_refctd_ptr(idxCnt*(idxCnt<0x10000 ? 2u : 4u)); + auto idxbuf = ICPUBuffer::create({ idxCnt*(idxCnt<0x10000 ? 2u : 4u) }); if (idxCnt<0x10000u) { for (uint32_t i = 0u; i < idxCnt; ++i) @@ -419,7 +419,7 @@ core::smart_refctd_ptr IMeshManipulator::calculateSmoothNormals( } const auto normalFormatBytesize = asset::getTexelOrBlockBytesize(inbuffer->getAttribFormat(normalAttr)); - auto normalBuf = core::make_smart_refctd_ptr(normalFormatBytesize*IMeshManipulator::upperBoundVertexID(inbuffer)); + auto normalBuf = ICPUBuffer::create({ normalFormatBytesize*IMeshManipulator::upperBoundVertexID(inbuffer) }); outbuffer->setVertexBufferBinding({0ull,std::move(normalBuf)},normalBinding); auto pipeline = core::move_and_static_cast(oldPipeline->clone(0u)); @@ -561,7 +561,7 @@ core::smart_refctd_ptr IMeshManipulator::createMeshBufferWelded( { if (!oldIndices) { - inbuffer->setIndexBufferBinding({ 0u, core::make_smart_refctd_ptr((maxRedirect >= 0x10000u ? sizeof(uint32_t) : sizeof(uint16_t)) * inbuffer->getIndexCount()) }); + inbuffer->setIndexBufferBinding({ 0u, ICPUBuffer::create({ (maxRedirect >= 0x10000u ? sizeof(uint32_t) : sizeof(uint16_t)) * inbuffer->getIndexCount() }) }); inbuffer->setIndexType(maxRedirect>=0x10000u ? 
EIT_32BIT:EIT_16BIT); } } @@ -724,7 +724,7 @@ core::smart_refctd_ptr IMeshManipulator::createOptimizedMeshBuff if (newIdxType == EIT_16BIT) { - newIdxBuffer = core::make_smart_refctd_ptr(sizeof(uint16_t)*outbuffer->getIndexCount()); + newIdxBuffer = ICPUBuffer::create({ sizeof(uint16_t)*outbuffer->getIndexCount() }); // no need to change index buffer offset because it's always 0 (after duplicating original mesh) for (size_t i = 0; i < outbuffer->getIndexCount(); ++i) reinterpret_cast(newIdxBuffer->getPointer())[i] = reinterpret_cast(indices)[i] - minIdx; @@ -823,7 +823,7 @@ void IMeshManipulator::requantizeMeshBuffer(ICPUMeshBuffer* _meshbuffer, const S const size_t vertexSize = newAttribs[activeAttributeCount - 1].offset + newAttribs[activeAttributeCount - 1].size; - auto newVertexBuffer = core::make_smart_refctd_ptr(vertexCnt * vertexSize); + auto newVertexBuffer = ICPUBuffer::create({ vertexCnt * vertexSize }); constexpr uint32_t VTX_BUF_BINDING = 0u; assert(_meshbuffer->getVertexBufferBindings()[0].buffer); @@ -907,7 +907,7 @@ void CMeshManipulator::_filterInvalidTriangles(ICPUMeshBuffer* _input) }); const size_t newSize = std::distance(begin, newEnd) * sizeof(Triangle); - auto newBuf = core::make_smart_refctd_ptr(newSize); + auto newBuf = ICPUBuffer::create({ newSize }); memcpy(newBuf->getPointer(), copy, newSize); _NBL_ALIGNED_FREE(copy); diff --git a/src/nbl/asset/utils/CMeshManipulator.h b/src/nbl/asset/utils/CMeshManipulator.h index bc8cdc91d6..7a5b895665 100644 --- a/src/nbl/asset/utils/CMeshManipulator.h +++ b/src/nbl/asset/utils/CMeshManipulator.h @@ -55,7 +55,7 @@ class CMeshManipulator : public IMeshManipulator if (!_in) return nullptr; - auto out = core::make_smart_refctd_ptr(sizeof(uint32_t) * _idxCount); + auto out = ICPUBuffer::create({ sizeof(uint32_t) * _idxCount }); auto* outPtr = reinterpret_cast(out->getPointer()); @@ -86,7 +86,7 @@ class CMeshManipulator : public IMeshManipulator { const auto outputSize = _idxCount = (_idxCount - 1) * 2; - auto output = core::make_smart_refctd_ptr(sizeof(OutType)*outputSize); + auto output = ICPUBuffer::create({ sizeof(OutType)*outputSize }); const auto* iptr = reinterpret_cast(_input); auto* optr = reinterpret_cast(output->getPointer()); for (uint32_t i = 0, j = 0; i < outputSize;) @@ -102,7 +102,7 @@ class CMeshManipulator : public IMeshManipulator { const auto outputSize = _idxCount = (_idxCount - 2) * 3; - auto output = core::make_smart_refctd_ptr(sizeof(OutType)*outputSize); + auto output = ICPUBuffer::create({ sizeof(OutType)*outputSize }); const auto* iptr = reinterpret_cast(_input); auto* optr = reinterpret_cast(output->getPointer()); for (uint32_t i = 0, j = 0; i < outputSize; j += 2) @@ -124,7 +124,7 @@ class CMeshManipulator : public IMeshManipulator { const auto outputSize = _idxCount = (_idxCount - 2) * 3; - auto output = core::make_smart_refctd_ptr(sizeof(OutType)*outputSize); + auto output = ICPUBuffer::create({ sizeof(OutType)*outputSize }); const auto* iptr = reinterpret_cast(_input); auto* optr = reinterpret_cast(output->getPointer()); for (uint32_t i = 0, j = 1; i < outputSize;) diff --git a/src/nbl/asset/utils/ISPIRVOptimizer.cpp b/src/nbl/asset/utils/ISPIRVOptimizer.cpp index 2716bcd70f..92cbedd143 100644 --- a/src/nbl/asset/utils/ISPIRVOptimizer.cpp +++ b/src/nbl/asset/utils/ISPIRVOptimizer.cpp @@ -82,7 +82,7 @@ nbl::core::smart_refctd_ptr ISPIRVOptimizer::optimize(const uint32_t if (!resultBytesize) return nullptr; - auto result = core::make_smart_refctd_ptr(resultBytesize); + auto result = 
ICPUBuffer::create({ resultBytesize }); memcpy(result->getPointer(), optimized.data(), resultBytesize); return result; diff --git a/src/nbl/asset/utils/IShaderCompiler.cpp b/src/nbl/asset/utils/IShaderCompiler.cpp index 786110c44c..927f07e96a 100644 --- a/src/nbl/asset/utils/IShaderCompiler.cpp +++ b/src/nbl/asset/utils/IShaderCompiler.cpp @@ -3,10 +3,9 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/asset/utils/IShaderCompiler.h" #include "nbl/asset/utils/shadercUtils.h" - #include "nbl/asset/utils/shaderCompiler_serialization.h" -#include "nbl/asset/CVectorCPUBuffer.h" +#include "nbl/core/alloc/resource_owning_vector.h" #include #include @@ -343,7 +342,7 @@ core::smart_refctd_ptr IShaderCompiler::CCache::serialize() const // Might as well memcpy everything memcpy(retVal.data() + SHADER_BUFFER_SIZE_BYTES + shaderBufferSize, dumpedContainerJson.data(), dumpedContainerJsonLength); - return core::make_smart_refctd_ptr>>(std::move(retVal)); + return ICPUBuffer::create({ .size = retVal.size(), .data = retVal.data(), .memoryResource = new core::resource_owning_vector(std::move(retVal)) }); } core::smart_refctd_ptr IShaderCompiler::CCache::deserialize(const std::span serializedCache) @@ -376,7 +375,7 @@ core::smart_refctd_ptr IShaderCompiler::CCache::deseria // We must now recreate the shaders, add them to each entry, then move the entry into the multiset for (auto i = 0u; i < entries.size(); i++) { // Create buffer to hold the code - auto code = core::make_smart_refctd_ptr(shaderCreationParams[i].codeByteSize); + auto code = ICPUBuffer::create({ shaderCreationParams[i].codeByteSize }); // Copy the shader bytecode into the buffer memcpy(code->getPointer(), serializedCache.data() + SHADER_BUFFER_SIZE_BYTES + shaderCreationParams[i].offset, shaderCreationParams[i].codeByteSize); @@ -400,32 +399,32 @@ bool nbl::asset::IShaderCompiler::CCache::SEntry::setContent(const asset::ICPUBu size_t propsSize = LZMA_PROPS_SIZE; size_t destLen = uncompressedSpirvBuffer->getSize() + uncompressedSpirvBuffer->getSize() / 3 + 128; - std::vector compressedSpirv = {}; - compressedSpirv.resize(propsSize + destLen); + core::vector compressedSpirv(propsSize + destLen); CLzmaEncProps props; LzmaEncProps_Init(&props); props.dictSize = 1 << 16; // 64KB props.writeEndMark = 1; - ISzAlloc alloc = { SzAlloc, SzFree }; + ISzAlloc sz_alloc = { SzAlloc, SzFree }; int res = LzmaEncode( compressedSpirv.data() + LZMA_PROPS_SIZE, &destLen, reinterpret_cast(uncompressedSpirvBuffer->getPointer()), uncompressedSpirvBuffer->getSize(), &props, compressedSpirv.data(), &propsSize, props.writeEndMark, - nullptr, &alloc, &alloc); + nullptr, &sz_alloc, &sz_alloc); if (res != SZ_OK || propsSize != LZMA_PROPS_SIZE) return false; + compressedSpirv.resize(propsSize + destLen); - spirv = core::make_smart_refctd_ptr(propsSize + destLen); - memcpy(spirv->getPointer(), compressedSpirv.data(), spirv->getSize()); + auto memResource = new core::resource_owning_vector(std::move(compressedSpirv)); + spirv = ICPUBuffer::create({ .size = propsSize + destLen, .data = memResource->data(), .memoryResource = std::move(memResource)}); return true; } core::smart_refctd_ptr nbl::asset::IShaderCompiler::CCache::SEntry::decompressShader() const { - auto uncompressedBuf = core::make_smart_refctd_ptr(uncompressedSize); + auto uncompressedBuf = ICPUBuffer::create({ uncompressedSize }); uncompressedBuf->setContentHash(uncompressedContentHash); size_t dstSize = uncompressedBuf->getSize(); diff --git 
a/src/nbl/ext/DepthPyramidGenerator/DepthPyramidGenerator.cpp b/src/nbl/ext/DepthPyramidGenerator/DepthPyramidGenerator.cpp index 0e3727df0b..0a960691be 100644 --- a/src/nbl/ext/DepthPyramidGenerator/DepthPyramidGenerator.cpp +++ b/src/nbl/ext/DepthPyramidGenerator/DepthPyramidGenerator.cpp @@ -90,7 +90,7 @@ layout(local_size_x = WORKGROUP_X_AND_Y_SIZE, local_size_y = WORKGROUP_X_AND_Y_S const uint32_t perPassMipCnt = static_cast(config.workGroupSize) == 32u ? 6u : 5u; constexpr size_t extraSize = 32u; - auto shaderCode = core::make_smart_refctd_ptr(strlen(source) + extraSize + 1u); + auto shaderCode = ICPUBuffer::create({ strlen(source) + extraSize + 1u }); snprintf(reinterpret_cast(shaderCode->getPointer()), shaderCode->getSize(), source, static_cast(m_config.workGroupSize), format, redOp, mipScaling); auto cpuSpecializedShader = core::make_smart_refctd_ptr( diff --git a/src/nbl/ext/FFT/FFT.cpp b/src/nbl/ext/FFT/FFT.cpp index b5714e839e..746fc1801f 100644 --- a/src/nbl/ext/FFT/FFT.cpp +++ b/src/nbl/ext/FFT/FFT.cpp @@ -54,7 +54,7 @@ layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=1, local_size_z=1) i constexpr size_t extraSize = 8u*2u+1u; - auto source = core::make_smart_refctd_ptr(strlen(sourceFmt)+extraSize+1u); + auto source = ICPUBuffer::create({ strlen(sourceFmt)+extraSize+1u }); snprintf( reinterpret_cast(source->getPointer()),source->getSize(), sourceFmt, DEFAULT_WORK_GROUP_SIZE, diff --git a/src/nbl/ext/LumaMeter/CLumaMeter.cpp b/src/nbl/ext/LumaMeter/CLumaMeter.cpp index 7218441a34..dcfbdc2838 100644 --- a/src/nbl/ext/LumaMeter/CLumaMeter.cpp +++ b/src/nbl/ext/LumaMeter/CLumaMeter.cpp @@ -178,7 +178,7 @@ void main() const size_t xyzMatrixChars = strlen(xyzMatrix); const size_t extraSize = lumaChars+meterModeChars+eotfChars+xyzMatrixChars; - auto shader = core::make_smart_refctd_ptr(strlen(sourceFmt)+extraSize+1u); + auto shader = ICPUBuffer::create({ strlen(sourceFmt)+extraSize+1u }); snprintf( reinterpret_cast(shader->getPointer()),shader->getSize(),sourceFmt, DEFAULT_BIN_COUNT,DEFAULT_BIN_GLOBAL_REPLICATION, diff --git a/src/nbl/ext/MitsubaLoader/CMitsubaLoader.cpp b/src/nbl/ext/MitsubaLoader/CMitsubaLoader.cpp index 17d4dc37fd..6d2edd58b2 100644 --- a/src/nbl/ext/MitsubaLoader/CMitsubaLoader.cpp +++ b/src/nbl/ext/MitsubaLoader/CMitsubaLoader.cpp @@ -320,7 +320,7 @@ static core::smart_refctd_ptr createSingleChannelImage(const a } const size_t texelBytesz = asset::getTexelOrBlockBytesize(outParams.format); region.bufferRowLength = asset::IImageAssetHandlerBase::calcPitchInBlocks(outParams.extent.width, texelBytesz); - auto buffer = core::make_smart_refctd_ptr(texelBytesz * region.bufferRowLength * outParams.extent.height); + auto buffer = asset::ICPUBuffer::create({ texelBytesz * region.bufferRowLength * outParams.extent.height }); region.imageOffset = { 0,0,0 }; region.imageExtent = outParams.extent; region.imageSubresource.baseArrayLayer = 0u; @@ -848,7 +848,7 @@ SContext::shape_ass_type CMitsubaLoader::loadBasicShape(SContext& ctx, uint32_t if (totalVertexCount) { constexpr uint32_t hidefRGBSize = 4u; - auto newRGBbuff = core::make_smart_refctd_ptr(hidefRGBSize*totalVertexCount); + auto newRGBbuff = asset::ICPUBuffer::create({ hidefRGBSize*totalVertexCount }); newMesh = core::smart_refctd_ptr_static_cast(mesh->clone(1u)); constexpr uint32_t COLOR_ATTR = 1u; constexpr uint32_t COLOR_BUF_BINDING = 15u; @@ -1196,7 +1196,7 @@ inline core::smart_refctd_ptr CMitsubaLoader::createDS auto d = ds0->getDescriptors(PRECOMPUTED_VT_DATA_BINDING).begin(); { - auto 
precompDataBuf = core::make_smart_refctd_ptr(sizeof(asset::ICPUVirtualTexture::SPrecomputedData)); + auto precompDataBuf = ICPUBuffer::create({ sizeof(asset::ICPUVirtualTexture::SPrecomputedData) }); memcpy(precompDataBuf->getPointer(), &_ctx.backend_ctx.vt.vt->getPrecomputedData(), precompDataBuf->getSize()); d->buffer.offset = 0u; @@ -1205,7 +1205,7 @@ inline core::smart_refctd_ptr CMitsubaLoader::createDS } d = ds0->getDescriptors(INSTR_BUF_BINDING).begin(); { - auto instrbuf = core::make_smart_refctd_ptr(_compResult.instructions.size()*sizeof(decltype(_compResult.instructions)::value_type)); + auto instrbuf = ICPUBuffer::create({ _compResult.instructions.size()*sizeof(decltype(_compResult.instructions)::value_type) }); memcpy(instrbuf->getPointer(), _compResult.instructions.data(), instrbuf->getSize()); d->buffer.offset = 0u; @@ -1214,7 +1214,7 @@ inline core::smart_refctd_ptr CMitsubaLoader::createDS } d = ds0->getDescriptors(BSDF_BUF_BINDING).begin(); { - auto bsdfbuf = core::make_smart_refctd_ptr(_compResult.bsdfData.size()*sizeof(decltype(_compResult.bsdfData)::value_type)); + auto bsdfbuf = ICPUBuffer::create({ _compResult.bsdfData.size()*sizeof(decltype(_compResult.bsdfData)::value_type) }); memcpy(bsdfbuf->getPointer(), _compResult.bsdfData.data(), bsdfbuf->getSize()); d->buffer.offset = 0u; @@ -1226,7 +1226,7 @@ inline core::smart_refctd_ptr CMitsubaLoader::createDS const size_t sz = _compResult.prefetch_stream.size()*sizeof(decltype(_compResult.prefetch_stream)::value_type); constexpr size_t MIN_SSBO_SZ = 128ull; //prefetch stream won't be generated if no textures are used, so make sure we're not creating 0-size buffer - auto prefetch_instr_buf = core::make_smart_refctd_ptr(std::max(MIN_SSBO_SZ, sz)); + auto prefetch_instr_buf = ICPUBuffer::create({ std::max(MIN_SSBO_SZ, sz) }); memcpy(prefetch_instr_buf->getPointer(), _compResult.prefetch_stream.data(), sz); d->buffer.offset = 0u; @@ -1295,7 +1295,7 @@ inline core::smart_refctd_ptr CMitsubaLoader::createDS #endif d = ds0->getDescriptors(INSTANCE_DATA_BINDING).begin(); { - auto instDataBuf = core::make_smart_refctd_ptr(instanceData.size()*sizeof(nbl_glsl_ext_Mitsuba_Loader_instance_data_t)); + auto instDataBuf = ICPUBuffer::create({ instanceData.size()*sizeof(nbl_glsl_ext_Mitsuba_Loader_instance_data_t) }); memcpy(instDataBuf->getPointer(), instanceData.data(), instDataBuf->getSize()); d->buffer.offset = 0u; diff --git a/src/nbl/ext/MitsubaLoader/CSerializedLoader.cpp b/src/nbl/ext/MitsubaLoader/CSerializedLoader.cpp index 1d3ff8bc96..c19395397c 100644 --- a/src/nbl/ext/MitsubaLoader/CSerializedLoader.cpp +++ b/src/nbl/ext/MitsubaLoader/CSerializedLoader.cpp @@ -227,18 +227,18 @@ asset::SAssetBundle CSerializedLoader::loadAsset(system::IFile* _file, const ass continue; } - auto indexbuf = core::make_smart_refctd_ptr(indexDataSize); + auto indexbuf = asset::ICPUBuffer::create({ indexDataSize }); const uint32_t posAttrSize = typeSize*3u; - auto posbuf = core::make_smart_refctd_ptr(vertexCount*posAttrSize); + auto posbuf = asset::ICPUBuffer::create({ vertexCount*posAttrSize }); core::smart_refctd_ptr normalbuf,uvbuf,colorbuf; if (requiresNormals) - normalbuf = core::make_smart_refctd_ptr(sizeof(uint32_t)*vertexCount); + normalbuf = asset::ICPUBuffer::create({ sizeof(uint32_t)*vertexCount }); // TODO: UV quantization and optimization (maybe lets just always use half floats?) 
constexpr size_t uvAttrSize = sizeof(float)*2u; if (hasUVs) - uvbuf = core::make_smart_refctd_ptr(uvAttrSize*vertexCount); + uvbuf = asset::ICPUBuffer::create({ uvAttrSize*vertexCount }); if (hasColors) - colorbuf = core::make_smart_refctd_ptr(sizeof(uint32_t)*vertexCount); + colorbuf = asset::ICPUBuffer::create({ sizeof(uint32_t)*vertexCount }); void* posPtr = posbuf->getPointer(); CQuantNormalCache::value_type_t* normalPtr = !normalbuf ? nullptr:reinterpret_cast*>(normalbuf->getPointer()); diff --git a/src/nbl/ext/RadixSort/RadixSort.cpp b/src/nbl/ext/RadixSort/RadixSort.cpp index 568ec5a301..2d67a1131e 100644 --- a/src/nbl/ext/RadixSort/RadixSort.cpp +++ b/src/nbl/ext/RadixSort/RadixSort.cpp @@ -92,7 +92,7 @@ layout (local_size_x = _NBL_GLSL_WORKGROUP_SIZE_) in; // Todo: This just the value I took from FFT example, don't know how it is being computed. const size_t extraSize = 4u + 8u + 8u + 128u; - auto shader = core::make_smart_refctd_ptr(strlen(source_fmt) + extraSize + 1u); + auto shader = asset::ICPUBuffer::create({ strlen(source_fmt) + extraSize + 1u }); snprintf(reinterpret_cast(shader->getPointer()), shader->getSize(), source_fmt, m_wg_size, BUCKETS_COUNT, shader_file_path); auto cpu_specialized_shader = core::make_smart_refctd_ptr( @@ -123,7 +123,7 @@ layout (local_size_x = _NBL_GLSL_WORKGROUP_SIZE_) in; // Todo: This just the value I took from FFT example, don't know how it is being computed. const size_t extraSize = 4u + 8u + 8u + 128u; - auto shader = core::make_smart_refctd_ptr(strlen(source_fmt) + extraSize + 1u); + auto shader = asset::ICPUBuffer::create({ strlen(source_fmt) + extraSize + 1u }); snprintf(reinterpret_cast(shader->getPointer()), shader->getSize(), source_fmt, m_wg_size, shader_file_path); auto cpu_specialized_shader = core::make_smart_refctd_ptr( diff --git a/src/nbl/ext/TextRendering/TextRendering.cpp b/src/nbl/ext/TextRendering/TextRendering.cpp index 6715eb3404..ea0a04e6cb 100644 --- a/src/nbl/ext/TextRendering/TextRendering.cpp +++ b/src/nbl/ext/TextRendering/TextRendering.cpp @@ -109,7 +109,7 @@ core::smart_refctd_ptr FontFace::generateGlyphMSDF(uint32_t baseMSDFP } auto image = ICPUImage::create(std::move(imgParams)); - auto buffer = core::make_smart_refctd_ptr(bufferSize); + auto buffer = ICPUBuffer::create({ bufferSize }); auto regions = core::make_refctd_dynamic_array>(mipLevels); size_t bufferOffset = 0ull; diff --git a/src/nbl/ext/ToneMapper/CToneMapper.cpp b/src/nbl/ext/ToneMapper/CToneMapper.cpp index 5dbce53b83..c97d05b2df 100644 --- a/src/nbl/ext/ToneMapper/CToneMapper.cpp +++ b/src/nbl/ext/ToneMapper/CToneMapper.cpp @@ -498,7 +498,7 @@ void main() usingTemporalAdaptationDefineChars+ outViewFormatQualifierChars+eotfChars+inXYZMatrixChars+outXYZMatrixChars+oetfChars+quantizationChars; - auto shader = core::make_smart_refctd_ptr(strlen(sourceFmt)+extraSize+1u); + auto shader = ICPUBuffer::create({ strlen(sourceFmt)+extraSize+1u }); snprintf( reinterpret_cast(shader->getPointer()),shader->getSize(),sourceFmt, DEFAULT_WORKGROUP_DIM,DEFAULT_WORKGROUP_DIM,_operator, diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 183afe6b43..7fb60d79bf 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -513,7 +513,7 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) if (_size==0ull) return {nullptr,NVRTC_ERROR_INVALID_INPUT}; - auto ptx = core::make_smart_refctd_ptr(_size); + auto ptx = asset::ICPUBuffer::create({ _size }); return 
{std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,reinterpret_cast(ptx->getPointer()))}; } diff --git a/src/nbl/video/utilities/CPropertyPoolHandler.cpp b/src/nbl/video/utilities/CPropertyPoolHandler.cpp index 40f17e4e75..b9ebe8b1b1 100644 --- a/src/nbl/video/utilities/CPropertyPoolHandler.cpp +++ b/src/nbl/video/utilities/CPropertyPoolHandler.cpp @@ -28,7 +28,7 @@ CPropertyPoolHandler::CPropertyPoolHandler(core::smart_refctd_ptr(glslFile->getSize()); + glsl = asset::ICPUBuffer::create({ glslFile->getSize() }); memcpy(glsl->getPointer(), glslFile->getMappedPointer(), glsl->getSize()); } diff --git a/src/nbl/video/utilities/CScanner.cpp b/src/nbl/video/utilities/CScanner.cpp index ec31272358..0f4399b00d 100644 --- a/src/nbl/video/utilities/CScanner.cpp +++ b/src/nbl/video/utilities/CScanner.cpp @@ -23,7 +23,7 @@ core::smart_refctd_ptr CScanner::createShader(const bool indi else glsl = loadBuiltinData("nbl/builtin/glsl/scan/direct.comp"); } - auto buffer = core::make_smart_refctd_ptr(glsl->getSize()); + auto buffer = asset::ICPUBuffer::create({ glsl->getSize() }); memcpy(buffer->getPointer(), glsl->getMappedPointer(), glsl->getSize()); auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_GLSL, "????"); const char* storageType = nullptr; diff --git a/src/nbl/video/utilities/ImageRegionIterator.cpp b/src/nbl/video/utilities/ImageRegionIterator.cpp index 8285f23846..50b990aef2 100644 --- a/src/nbl/video/utilities/ImageRegionIterator.cpp +++ b/src/nbl/video/utilities/ImageRegionIterator.cpp @@ -92,7 +92,7 @@ ImageRegionIterator::ImageRegionIterator( uint64_t offsetInCPUBuffer = region.bufferOffset; uint8_t* inCpuBufferPointer = const_cast(reinterpret_cast(srcData) + offsetInCPUBuffer); - core::smart_refctd_ptr inCPUBuffer = core::make_smart_refctd_ptr< asset::CCustomAllocatorCPUBuffer, true> >(0xdeadbeefBADC0FFEull, inCpuBufferPointer, core::adopt_memory); + core::smart_refctd_ptr inCPUBuffer = asset::ICPUBuffer::create({ 0xdeadbeefBADC0FFEull, inCpuBufferPointer }, core::adopt_memory); inCPUImage->setBufferAndRegions(std::move(inCPUBuffer), inCPUImageRegions); assert(inCPUImage->getBuffer()); assert(inCPUImage->getRegions().size() > 0u); @@ -392,7 +392,7 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo outCpuImageRegion.imageSubresource.layerCount = core::max(regionToCopyNext.imageSubresource.layerCount, 1u); uint8_t* outCpuBufferPointer = reinterpret_cast(stagingBufferPointer) + stagingBufferOffset; - core::smart_refctd_ptr outCPUBuffer = core::make_smart_refctd_ptr>>(outCPUBufferSize, outCpuBufferPointer, core::adopt_memory); + core::smart_refctd_ptr outCPUBuffer = asset::ICPUBuffer::create({ outCPUBufferSize, outCpuBufferPointer }, core::adopt_memory); outCPUImage->setBufferAndRegions(std::move(outCPUBuffer), outCPUImageRegions); assert(outCPUImage->getBuffer()); assert(outCPUImage->getRegions().size() > 0u); From f25b884984bc6a609b55a21f89233017679f8551 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Thu, 14 Nov 2024 15:18:15 +0330 Subject: [PATCH 188/358] asset: several fixes - move `SICPUBufferCreationParams` into `ICPUBuffer::SCreationParams` - designated initializers for ICPUBuffer::create - revert changes to any code thats surrounded by `#if 0` - correct coding convention - `resource_owning_vector` -> `VectorViewNullMemoryResource` (and don't manage anything) - `(size_t)x` -> `static_cast(x)` - pass alignment in `ICPUBuffer::clone` Signed-off-by: Ali Cheraghi --- 
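The API-facing points above land in include/nbl/asset/ICPUBuffer.h (first diff below). A minimal sketch of the call-site shape they imply, with the byte count as a placeholder; the designated-initializer form is taken from the ICPUShader.h and IShaderCompiler.cpp hunks of this patch.

#include "nbl/asset/ICPUBuffer.h"

using namespace nbl;

static void createAndClone()
{
    // Earlier patches in this series initialize SICPUBufferCreationParams positionally:
    //     ICPUBuffer::create({ 64ull });
    // After this patch the params struct is nested as ICPUBuffer::SCreationParams and
    // call sites use designated initializers; unnamed fields (alignment, memoryResource)
    // keep their defaults.
    auto buffer = asset::ICPUBuffer::create({ .size = 64ull });

    // clone() now forwards the original alignment alongside size and data.
    auto copy = buffer->clone();
}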
include/nbl/asset/ICPUBuffer.h | 29 ++++++----- include/nbl/asset/ICPUShader.h | 2 +- .../filters/CFlattenRegionsImageFilter.h | 2 +- .../filters/dithering/CPrecomputedDither.h | 2 +- .../interchange/IImageAssetHandlerBase.h | 2 +- include/nbl/asset/utils/CCPUMeshPackerV1.h | 8 +-- include/nbl/asset/utils/CCPUMeshPackerV2.h | 6 +-- include/nbl/asset/utils/CDirQuantCacheBase.h | 4 +- include/nbl/asset/utils/IMeshManipulator.h | 6 +-- include/nbl/asset/utils/IMeshPacker.h | 2 +- include/nbl/asset/utils/IShaderCompiler.h | 2 +- .../core/alloc/VectorViewNullMemoryResource.h | 47 +++++++++++++++++ .../nbl/core/alloc/resource_owning_vector.h | 50 ------------------- include/nbl/ext/ScreenShot/ScreenShot.h | 2 +- .../nbl/scene/ICullingLoDSelectionSystem.h | 2 +- include/nbl/scene/ISkinInstanceCacheManager.h | 2 +- include/nbl/scene/ITransformTreeManager.h | 2 +- src/nbl/asset/IAssetManager.cpp | 4 +- src/nbl/asset/ICPUImage.cpp | 2 +- src/nbl/asset/interchange/CBufferLoaderBIN.h | 2 +- src/nbl/asset/interchange/CGLILoader.cpp | 2 +- src/nbl/asset/interchange/CGLTFLoader.cpp | 26 +++++----- .../CGraphicsPipelineLoaderMTL.cpp | 4 +- src/nbl/asset/interchange/CHLSLLoader.cpp | 2 +- src/nbl/asset/interchange/CImageLoaderJPG.cpp | 2 +- .../asset/interchange/CImageLoaderOpenEXR.cpp | 2 +- src/nbl/asset/interchange/CImageLoaderPNG.cpp | 2 +- src/nbl/asset/interchange/CImageLoaderTGA.cpp | 8 +-- src/nbl/asset/interchange/CImageWriterJPG.cpp | 2 +- src/nbl/asset/interchange/CImageWriterTGA.cpp | 2 +- .../asset/interchange/COBJMeshFileLoader.cpp | 4 +- .../asset/interchange/CPLYMeshFileLoader.cpp | 10 ++-- src/nbl/asset/interchange/CSPVLoader.cpp | 2 +- .../asset/interchange/CSTLMeshFileLoader.cpp | 2 +- src/nbl/asset/utils/CDerivativeMapCreator.cpp | 4 +- src/nbl/asset/utils/CGLSLCompiler.cpp | 2 +- src/nbl/asset/utils/CGeometryCreator.cpp | 30 +++++------ src/nbl/asset/utils/CHLSLCompiler.cpp | 2 +- src/nbl/asset/utils/CMeshManipulator.cpp | 20 ++++---- src/nbl/asset/utils/CMeshManipulator.h | 8 +-- src/nbl/asset/utils/ISPIRVOptimizer.cpp | 2 +- src/nbl/asset/utils/IShaderCompiler.cpp | 10 ++-- .../DepthPyramidGenerator.cpp | 2 +- src/nbl/ext/FFT/FFT.cpp | 2 +- src/nbl/ext/LumaMeter/CLumaMeter.cpp | 2 +- src/nbl/ext/MitsubaLoader/CMitsubaLoader.cpp | 14 +++--- .../ext/MitsubaLoader/CSerializedLoader.cpp | 10 ++-- src/nbl/ext/RadixSort/RadixSort.cpp | 4 +- src/nbl/ext/TextRendering/TextRendering.cpp | 2 +- src/nbl/ext/ToneMapper/CToneMapper.cpp | 2 +- src/nbl/video/CCUDAHandler.cpp | 2 +- .../video/utilities/CPropertyPoolHandler.cpp | 2 +- src/nbl/video/utilities/CScanner.cpp | 2 +- .../video/utilities/ImageRegionIterator.cpp | 4 +- 54 files changed, 188 insertions(+), 186 deletions(-) create mode 100644 include/nbl/core/alloc/VectorViewNullMemoryResource.h delete mode 100644 include/nbl/core/alloc/resource_owning_vector.h diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h index 84af10769d..37ce5f222d 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -13,16 +13,7 @@ namespace nbl::asset { -struct SICPUBufferCreationParams -{ - size_t size; - void* data = nullptr; - size_t alignment = _NBL_SIMD_ALIGNMENT; - std::pmr::memory_resource* memoryResource = nullptr; -}; - //! One of CPU class-object representing an Asset -//! TODO: remove, alloc can fail, should be a static create method instead! 
/** One of Assets used for storage of large arrays, so that storage can be decoupled from other objects such as meshbuffers, images, animations and shader source/bytecode. @@ -32,11 +23,25 @@ struct SICPUBufferCreationParams class ICPUBuffer final : public asset::IBuffer, public IPreHashed { public: + struct SCreationParams : asset::IBuffer::SCreationParams + { + size_t size; + void* data = nullptr; + size_t alignment = _NBL_SIMD_ALIGNMENT; + std::pmr::memory_resource* memoryResource = nullptr; + + SCreationParams& operator =(const asset::IBuffer::SCreationParams& rhs) + { + static_cast(*this) = rhs; + return *this; + } + }; + ICPUBuffer(size_t size, void* data, std::pmr::memory_resource* memoryResource, size_t alignment, bool adopt_memory) : asset::IBuffer({ size, EUF_TRANSFER_DST_BIT }), m_data(data), m_mem_resource(memoryResource), m_alignment(alignment), m_adopt_memory(adopt_memory) {} //! allocates uninitialized memory, copies `data` into allocation if `!data` not nullptr - core::smart_refctd_ptr static create(const SICPUBufferCreationParams& params) { + core::smart_refctd_ptr static create(const SCreationParams& params) { std::pmr::memory_resource* memoryResource = params.memoryResource; if (!params.memoryResource) memoryResource = std::pmr::get_default_resource(); @@ -52,7 +57,7 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed } //! does not allocate memory, adopts the `data` pointer, no copies done - core::smart_refctd_ptr static create(const SICPUBufferCreationParams& params, core::adopt_memory_t) { + core::smart_refctd_ptr static create(const SCreationParams& params, core::adopt_memory_t) { std::pmr::memory_resource* memoryResource; if (!params.memoryResource) memoryResource = std::pmr::get_default_resource(); @@ -61,7 +66,7 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed core::smart_refctd_ptr clone(uint32_t = ~0u) const override final { - auto cp = create({ m_creationParams.size, m_data }); + auto cp = create({ .size = m_creationParams.size, .data = m_data, .alignment = m_alignment }); memcpy(cp->getPointer(), m_data, m_creationParams.size); cp->setContentHash(getContentHash()); return cp; diff --git a/include/nbl/asset/ICPUShader.h b/include/nbl/asset/ICPUShader.h index ae8758d44f..b304155b0c 100644 --- a/include/nbl/asset/ICPUShader.h +++ b/include/nbl/asset/ICPUShader.h @@ -33,7 +33,7 @@ class ICPUShader : public IAsset, public IShader : IShader(stage, std::move(filepathHint)), m_code(std::move(code)), m_contentType(contentType) {} ICPUShader(const char* code, const E_SHADER_STAGE stage, const E_CONTENT_TYPE contentType, std::string&& filepathHint) - : ICPUShader(ICPUBuffer::create({ strlen(code) + 1u }), stage, contentType, std::move(filepathHint)) + : ICPUShader(ICPUBuffer::create({ .size = strlen(code) + 1u }), stage, contentType, std::move(filepathHint)) { assert(contentType != E_CONTENT_TYPE::ECT_SPIRV); // because using strlen needs `code` to be null-terminated memcpy(m_code->getPointer(), code, m_code->getSize()); diff --git a/include/nbl/asset/filters/CFlattenRegionsImageFilter.h b/include/nbl/asset/filters/CFlattenRegionsImageFilter.h index 0eeba246e2..832c5629c5 100644 --- a/include/nbl/asset/filters/CFlattenRegionsImageFilter.h +++ b/include/nbl/asset/filters/CFlattenRegionsImageFilter.h @@ -89,7 +89,7 @@ class CFlattenRegionsImageFilter : public CImageFilteroutImage->setBufferAndRegions(std::move(buffer),std::move(regions)); }; diff --git a/include/nbl/asset/filters/dithering/CPrecomputedDither.h 
b/include/nbl/asset/filters/dithering/CPrecomputedDither.h index eb1c4627ca..36ff4f0b39 100644 --- a/include/nbl/asset/filters/dithering/CPrecomputedDither.h +++ b/include/nbl/asset/filters/dithering/CPrecomputedDither.h @@ -58,7 +58,7 @@ namespace nbl const size_t newDecodeBufferSize = extent.x * extent.y * extent.z * creationParams.arrayLayers * decodeTexelByteSize; const core::vector3du32_SIMD decodeBufferByteStrides = TexelBlockInfo(decodeFormat).convert3DTexelStridesTo1DByteStrides(core::vector3du32_SIMD(extent.x, extent.y, extent.z)); - auto decodeFlattenBuffer = ICPUBuffer::create({ newDecodeBufferSize }); + auto decodeFlattenBuffer = ICPUBuffer::create({ .size = newDecodeBufferSize }); decodeFlattenBuffer->setContentHash(IPreHashed::INVALID_HASH); auto* inData = reinterpret_cast(flattenDitheringImage->getBuffer()->getPointer()); diff --git a/include/nbl/asset/interchange/IImageAssetHandlerBase.h b/include/nbl/asset/interchange/IImageAssetHandlerBase.h index b39ae2a8be..bbbf373908 100644 --- a/include/nbl/asset/interchange/IImageAssetHandlerBase.h +++ b/include/nbl/asset/interchange/IImageAssetHandlerBase.h @@ -110,7 +110,7 @@ class IImageAssetHandlerBase : public virtual core::IReferenceCounted bufferSize += memsize.getIntegerApprox(); } - auto texelBuffer = ICPUBuffer::create({ bufferSize }); + auto texelBuffer = ICPUBuffer::create({ .size = bufferSize }); newImage->setBufferAndRegions(std::move(texelBuffer), newRegions); newImage->setContentHash(IPreHashed::INVALID_HASH); } diff --git a/include/nbl/asset/utils/CCPUMeshPackerV1.h b/include/nbl/asset/utils/CCPUMeshPackerV1.h index 45d84f81b3..23b2da563a 100644 --- a/include/nbl/asset/utils/CCPUMeshPackerV1.h +++ b/include/nbl/asset/utils/CCPUMeshPackerV1.h @@ -296,11 +296,11 @@ void CCPUMeshPackerV1::instantiateDataStorage() const size_t vtxBuffSupportedByteSize = base_t::alctrTraits::get_total_size(base_t::m_vtxBuffAlctr); const size_t perInsBuffSupportedByteSize = base_t::alctrTraits::get_total_size(base_t::m_vtxBuffAlctr); - m_output.MDIDataBuffer = ICPUBuffer::create({ MDIDataBuffSupportedByteSize }); - m_output.indexBuffer.buffer = ICPUBuffer::create({ idxBuffSupportedByteSize }); + m_output.MDIDataBuffer = core::make_smart_refctd_ptr(MDIDataBuffSupportedByteSize); + m_output.indexBuffer.buffer = core::make_smart_refctd_ptr(idxBuffSupportedByteSize); - core::smart_refctd_ptr unifiedVtxBuff = ICPUBuffer::create({ vtxBuffSupportedByteSize }); - core::smart_refctd_ptr unifiedInsBuff = ICPUBuffer::create({ perInsBuffSupportedByteSize }); + core::smart_refctd_ptr unifiedVtxBuff = core::make_smart_refctd_ptr(vtxBuffSupportedByteSize); + core::smart_refctd_ptr unifiedInsBuff = core::make_smart_refctd_ptr(perInsBuffSupportedByteSize); //divide unified vtx buffers //proportions: sizeOfAttr1 : sizeOfAttr2 : ... 
: sizeOfAttrN diff --git a/include/nbl/asset/utils/CCPUMeshPackerV2.h b/include/nbl/asset/utils/CCPUMeshPackerV2.h index a41d65b067..83308ab82b 100644 --- a/include/nbl/asset/utils/CCPUMeshPackerV2.h +++ b/include/nbl/asset/utils/CCPUMeshPackerV2.h @@ -61,9 +61,9 @@ void CCPUMeshPackerV2::instantiateDataStorage() const uint32_t idxBuffByteSize = base_t::m_idxBuffAlctr.get_total_size() * sizeof(uint16_t); const uint32_t vtxBuffByteSize = base_t::m_vtxBuffAlctr.get_total_size(); - base_t::m_packerDataStore.MDIDataBuffer = ICPUBuffer::create({ MDIDataBuffByteSize }); - base_t::m_packerDataStore.indexBuffer = ICPUBuffer::create({ idxBuffByteSize }); - base_t::m_packerDataStore.vertexBuffer = ICPUBuffer::create({ vtxBuffByteSize }); + base_t::m_packerDataStore.MDIDataBuffer = core::make_smart_refctd_ptr(MDIDataBuffByteSize); + base_t::m_packerDataStore.indexBuffer = core::make_smart_refctd_ptr(idxBuffByteSize); + base_t::m_packerDataStore.vertexBuffer = core::make_smart_refctd_ptr(vtxBuffByteSize); } /* diff --git a/include/nbl/asset/utils/CDirQuantCacheBase.h b/include/nbl/asset/utils/CDirQuantCacheBase.h index 4057e3e4d9..45cb920dc8 100644 --- a/include/nbl/asset/utils/CDirQuantCacheBase.h +++ b/include/nbl/asset/utils/CDirQuantCacheBase.h @@ -297,7 +297,7 @@ class CDirQuantCacheBase : public impl::CDirQuantCacheBase if (!file) return false; - auto buffer = asset::ICPUBuffer::create({ file->getSize() }); + auto buffer = asset::ICPUBuffer::create({ .size = file->getSize() }); system::IFile::success_t succ; file->read(succ, buffer->getPointer(), 0, file->getSize()); @@ -346,7 +346,7 @@ class CDirQuantCacheBase : public impl::CDirQuantCacheBase asset::SBufferRange bufferRange; bufferRange.offset = 0; bufferRange.size = getSerializedCacheSizeInBytes(); - bufferRange.buffer = asset::ICPUBuffer::create({ bufferRange.size }); + bufferRange.buffer = asset::ICPUBuffer::create({ .size = bufferRange.size }); saveCacheToBuffer(bufferRange); diff --git a/include/nbl/asset/utils/IMeshManipulator.h b/include/nbl/asset/utils/IMeshManipulator.h index f84d85c75d..c4c3d1ef59 100644 --- a/include/nbl/asset/utils/IMeshManipulator.h +++ b/include/nbl/asset/utils/IMeshManipulator.h @@ -557,7 +557,7 @@ class NBL_API2 IMeshManipulator : public virtual core::IReferenceCounted core::smart_refctd_ptr iotaUint32Buffer; if (iotaLength) { - iotaUint32Buffer = ICPUBuffer::create({ sizeof(uint32_t)*iotaLength }); + iotaUint32Buffer = ICPUBuffer::create({ .size = sizeof(uint32_t)*iotaLength }); auto ptr = reinterpret_cast(iotaUint32Buffer->getPointer()); std::iota(ptr,ptr+iotaLength,0u); } @@ -599,13 +599,13 @@ class NBL_API2 IMeshManipulator : public virtual core::IReferenceCounted if (indexType==EIT_16BIT) { auto inPtr = reinterpret_cast(correctlyOffsetIndexBufferPtr); - newIndexBuffer = ICPUBuffer::create({ sizeof(uint32_t)*indexCount }); + newIndexBuffer = ICPUBuffer::create({ .size = sizeof(uint32_t)*indexCount }); std::copy(inPtr,inPtr+indexCount,reinterpret_cast(newIndexBuffer->getPointer())); } else { auto inPtr = reinterpret_cast(correctlyOffsetIndexBufferPtr); - newIndexBuffer = ICPUBuffer::create({ sizeof(uint16_t)*indexCount }); + newIndexBuffer = ICPUBuffer::create({ .size = sizeof(uint16_t)*indexCount }); std::copy(inPtr,inPtr+indexCount,reinterpret_cast(newIndexBuffer->getPointer())); } } diff --git a/include/nbl/asset/utils/IMeshPacker.h b/include/nbl/asset/utils/IMeshPacker.h index 9daafd9ded..3f09062b18 100644 --- a/include/nbl/asset/utils/IMeshPacker.h +++ b/include/nbl/asset/utils/IMeshPacker.h @@ -551,7 
+551,7 @@ class IMeshPacker : public IMeshPackerBase core::smart_refctd_ptr idxBufferToProcess; if (iota) { - idxBufferToProcess = ICPUBuffer::create({ sizeof(uint32_t) * idxCount }); + idxBufferToProcess = core::make_smart_refctd_ptr(sizeof(uint32_t) * idxCount); auto ptr = reinterpret_cast(idxBufferToProcess->getPointer()); std::iota(ptr, ptr + idxCount, 0u); idxType = EIT_32BIT; diff --git a/include/nbl/asset/utils/IShaderCompiler.h b/include/nbl/asset/utils/IShaderCompiler.h index 7acf61e0b0..f9c94569cc 100644 --- a/include/nbl/asset/utils/IShaderCompiler.h +++ b/include/nbl/asset/utils/IShaderCompiler.h @@ -526,7 +526,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted constexpr size_t nullTerminatorSize = 1u; size_t outSize = origLen + formatArgsCharSize + formatSize + nullTerminatorSize - 2 * templateArgsCount; - nbl::core::smart_refctd_ptr outBuffer = ICPUBuffer::create({ outSize }); + nbl::core::smart_refctd_ptr outBuffer = ICPUBuffer::create({ .size = outSize }); auto origCode = std::string_view(reinterpret_cast(original->getContent()->getPointer()), origLen); auto outCode = reinterpret_cast(outBuffer->getPointer()); diff --git a/include/nbl/core/alloc/VectorViewNullMemoryResource.h b/include/nbl/core/alloc/VectorViewNullMemoryResource.h new file mode 100644 index 0000000000..ce61b9b08d --- /dev/null +++ b/include/nbl/core/alloc/VectorViewNullMemoryResource.h @@ -0,0 +1,47 @@ +// Copyright (C) 2019-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine" +// For conditions of distribution and use, see copyright notice in nabla.h +// See the original file in irrlicht source for authors + +#ifndef _NBL_CORE_ALLOC_VECTOR_VIEW_NULL_MEMORY_RESOURCE_INCLUDED_ +#define _NBL_CORE_ALLOC_VECTOR_VIEW_NULL_MEMORY_RESOURCE_INCLUDED_ + +#include + +using namespace nbl; + +namespace nbl::core +{ + +class VectorViewNullMemoryResource : public std::pmr::memory_resource +{ + public: + // only create the resource from an already sized vector + VectorViewNullMemoryResource(core::vector&& buffer) : buffer(buffer) + { + assert(buffer.size()); + }; + + void* data() { + return buffer.data(); + } + + protected: + void* do_allocate(size_t bytes, size_t alignment) override { + assert(bytes <= buffer.size()); + return buffer.data(); + } + + void do_deallocate(void* p, size_t bytes, size_t alignment) override {} + + bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override { + return this == &other; + } + + private: + core::vector buffer; +}; + +} + +#endif \ No newline at end of file diff --git a/include/nbl/core/alloc/resource_owning_vector.h b/include/nbl/core/alloc/resource_owning_vector.h deleted file mode 100644 index 330c4fa819..0000000000 --- a/include/nbl/core/alloc/resource_owning_vector.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (C) 2019-2024 - DevSH Graphics Programming Sp. z O.O. 
-// This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine" -// For conditions of distribution and use, see copyright notice in nabla.h -// See the original file in irrlicht source for authors - -#ifndef _NBL_CORE_ALLOC_RESOURCE_OWNING_VECTOR_INCLUDED_ -#define _NBL_CORE_ALLOC_RESOURCE_OWNING_VECTOR_INCLUDED_ - -#include - -using namespace nbl; - -namespace nbl::core -{ - -class resource_owning_vector : public std::pmr::memory_resource { -public: - // only create the resource from an already sized vector - resource_owning_vector(core::vector&& buffer) : buffer(buffer), offset(0) - { - assert(buffer.size()); - }; - - void* data() { - return buffer.data(); - } - -protected: - void* do_allocate(size_t bytes, size_t alignment) override { - auto space = buffer.size() - offset; - auto ptr = buffer.data() + offset; - assert(bytes <= space); - offset += bytes; - return ptr; - } - - void do_deallocate(void* p, size_t bytes, size_t alignment) override {} - - bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override { - return this == &other; - } - -private: - core::vector buffer; - size_t offset; -}; - -} - -#endif \ No newline at end of file diff --git a/include/nbl/ext/ScreenShot/ScreenShot.h b/include/nbl/ext/ScreenShot/ScreenShot.h index 4e71749cd7..18ddc13ee4 100644 --- a/include/nbl/ext/ScreenShot/ScreenShot.h +++ b/include/nbl/ext/ScreenShot/ScreenShot.h @@ -154,7 +154,7 @@ inline core::smart_refctd_ptr createScreenShot( region.imageOffset = { 0u, 0u, 0u }; region.imageExtent = cpuNewImage->getCreationParameters().extent; - auto cpuNewTexelBuffer = ICPUBuffer::create({ gpuTexelBufferSize }); + auto cpuNewTexelBuffer = ICPUBuffer::create({ .size = gpuTexelBufferSize }); { memcpy(cpuNewTexelBuffer->getPointer(), gpuTexelBuffer->getBoundMemory().memory->getMappedPointer(), gpuTexelBuffer->getSize()); } diff --git a/include/nbl/scene/ICullingLoDSelectionSystem.h b/include/nbl/scene/ICullingLoDSelectionSystem.h index dee356b20f..0e4c40b771 100644 --- a/include/nbl/scene/ICullingLoDSelectionSystem.h +++ b/include/nbl/scene/ICullingLoDSelectionSystem.h @@ -499,7 +499,7 @@ class ICullingLoDSelectionSystem : public virtual core::IReferenceCounted auto glslFile = loadBuiltinData(Path.value); core::smart_refctd_ptr glsl; { - glsl = asset::ICPUBuffer::create({ glslFile->getSize() }); + glsl = core::make_smart_refctd_ptr(glslFile->getSize()); memcpy(glsl->getPointer(), glslFile->getMappedPointer(), glsl->getSize()); } return {core::make_smart_refctd_ptr(std::move(glsl), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_GLSL, Path.value), Path.value}; diff --git a/include/nbl/scene/ISkinInstanceCacheManager.h b/include/nbl/scene/ISkinInstanceCacheManager.h index 5a5e3f5881..671ed2387f 100644 --- a/include/nbl/scene/ISkinInstanceCacheManager.h +++ b/include/nbl/scene/ISkinInstanceCacheManager.h @@ -38,7 +38,7 @@ class ISkinInstanceCacheManager : public virtual core::IReferenceCounted auto glslFile = loadBuiltinData(uniqueString); core::smart_refctd_ptr glsl; { - glsl = asset::ICPUBuffer::create({ glslFile->getSize() }); + glsl = asset::ICPUBuffer::create({ .size = glslFile->getSize() }); memcpy(glsl->getPointer(), glslFile->getMappedPointer(), glsl->getSize()); } auto shader = device->createShader(core::make_smart_refctd_ptr(core::smart_refctd_ptr(glsl), type, asset::IShader::E_CONTENT_TYPE::ECT_GLSL, "")); diff --git a/include/nbl/scene/ITransformTreeManager.h b/include/nbl/scene/ITransformTreeManager.h index 35d29b3ce5..6152c32481 
100644 --- a/include/nbl/scene/ITransformTreeManager.h +++ b/include/nbl/scene/ITransformTreeManager.h @@ -121,7 +121,7 @@ class ITransformTreeManager : public virtual core::IReferenceCounted auto glslFile = loadBuiltinData(Path.value); core::smart_refctd_ptr glsl; { - glsl = asset::ICPUBuffer::create({ glslFile->getSize() }); + glsl = core::make_smart_refctd_ptr(glslFile->getSize()); memcpy(glsl->getPointer(), glslFile->getMappedPointer(), glsl->getSize()); } auto shader = device->createShader(core::make_smart_refctd_ptr(core::smart_refctd_ptr(glsl), type, asset::IShader::E_CONTENT_TYPE::ECT_GLSL, "????")); diff --git a/src/nbl/asset/IAssetManager.cpp b/src/nbl/asset/IAssetManager.cpp index 275a4b75fb..c728773764 100644 --- a/src/nbl/asset/IAssetManager.cpp +++ b/src/nbl/asset/IAssetManager.cpp @@ -343,7 +343,7 @@ void IAssetManager::insertBuiltinAssets() info.samples = asset::ICPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT; info.flags = static_cast(0u); info.usage = asset::IImage::EUF_INPUT_ATTACHMENT_BIT|asset::IImage::EUF_SAMPLED_BIT; - auto buf = asset::ICPUBuffer::create({ info.extent.width*info.extent.height*asset::getTexelOrBlockBytesize(info.format) }); + auto buf = asset::ICPUBuffer::create({ .size = info.extent.width*info.extent.height*asset::getTexelOrBlockBytesize(info.format) }); memcpy(buf->getPointer(), //magenta-grey 2x2 chessboard std::array{{255, 0, 255, 255, 128, 128, 128, 255, 128, 128, 128, 255, 255, 0, 255, 255}}.data(), @@ -403,7 +403,7 @@ void IAssetManager::insertBuiltinAssets() auto ds1 = core::make_smart_refctd_ptr(core::smart_refctd_ptr(defaultDs1Layout.get())); { constexpr size_t UBO_SZ = sizeof(asset::SBasicViewParameters); - auto ubo = asset::ICPUBuffer::create({ UBO_SZ }); + auto ubo = asset::ICPUBuffer::create({ .size = UBO_SZ }); //for filling this UBO with actual data, one can use asset::SBasicViewParameters struct defined in nbl/asset/asset_utils.h asset::fillBufferWithDeadBeef(ubo.get()); diff --git a/src/nbl/asset/ICPUImage.cpp b/src/nbl/asset/ICPUImage.cpp index 27a4f0834b..28c893fa86 100644 --- a/src/nbl/asset/ICPUImage.cpp +++ b/src/nbl/asset/ICPUImage.cpp @@ -122,7 +122,7 @@ class CFlattenRegionsStreamHashImageFilter : public CMatchedSizeInOutImageFilter auto getScratchAsBuffer = [&memory = state->scratch.memory](size_t size, size_t offset = 0ull) { - return ICPUBuffer::create({ size, (uint8_t*)memory + offset }, core::adopt_memory); // adopt memory & don't free it on exit + return ICPUBuffer::create({ .size = size, .data = (uint8_t*)memory + offset }, core::adopt_memory); // adopt memory & don't free it on exit }; /* diff --git a/src/nbl/asset/interchange/CBufferLoaderBIN.h b/src/nbl/asset/interchange/CBufferLoaderBIN.h index 2d0ea5f765..8c3e50bad9 100644 --- a/src/nbl/asset/interchange/CBufferLoaderBIN.h +++ b/src/nbl/asset/interchange/CBufferLoaderBIN.h @@ -36,7 +36,7 @@ class CBufferLoaderBIN final : public asset::IAssetLoader private: struct SContext { - SContext(const size_t& sizeInBytes) : sourceCodeBuffer(ICPUBuffer::create({ sizeInBytes })) {} + SContext(const size_t& sizeInBytes) : sourceCodeBuffer(ICPUBuffer::create({ .size = sizeInBytes })) {} system::IFile* file; core::smart_refctd_ptr sourceCodeBuffer; }; diff --git a/src/nbl/asset/interchange/CGLILoader.cpp b/src/nbl/asset/interchange/CGLILoader.cpp index 1599b765b0..3f8f9f72d7 100644 --- a/src/nbl/asset/interchange/CGLILoader.cpp +++ b/src/nbl/asset/interchange/CGLILoader.cpp @@ -116,7 +116,7 @@ namespace nbl const auto texelBlockDimension = asset::getBlockDimensions(format.first); 
const auto texelBlockByteSize = asset::getTexelOrBlockBytesize(format.first); - auto texelBuffer = ICPUBuffer::create({ texture.size() }); + auto texelBuffer = ICPUBuffer::create({ .size = texture.size() }); auto data = reinterpret_cast(texelBuffer->getPointer()); ICPUImage::SCreationParams imageInfo = {}; diff --git a/src/nbl/asset/interchange/CGLTFLoader.cpp b/src/nbl/asset/interchange/CGLTFLoader.cpp index 413bf6fd22..67f6d86a77 100644 --- a/src/nbl/asset/interchange/CGLTFLoader.cpp +++ b/src/nbl/asset/interchange/CGLTFLoader.cpp @@ -59,7 +59,7 @@ using namespace nbl::asset; }; core::smart_refctd_ptr glslFile = loadBuiltinData(decltype(constexprStringType)::value); - auto glsl = asset::ICPUBuffer::create({ glslFile->getSize() }); + auto glsl = asset::ICPUBuffer::create({ .size = glslFile->getSize() }); memcpy(glsl->getPointer(),glslFile->getMappedPointer(),glsl->getSize()); auto unspecializedShader = core::make_smart_refctd_ptr(std::move(glsl), stage, asset::ICPUShader::E_CONTENT_TYPE::ECT_GLSL, stage != ICPUShader::ESS_VERTEX ? "?IrrlichtBAW glTFLoader FragmentShader?" : "?IrrlichtBAW glTFLoader VertexShader?"); @@ -115,7 +115,7 @@ using namespace nbl::asset; { simdjson::dom::parser parser; - auto jsonBuffer = ICPUBuffer::create({ _file->getSize() }); + auto jsonBuffer = ICPUBuffer::create({ .size = _file->getSize() }); { system::IFile::success_t success; _file->read(success, jsonBuffer->getPointer(), 0u, jsonBuffer->getSize()); @@ -529,8 +529,8 @@ using namespace nbl::asset; core::vector skeletonJointCountPrefixSum; skeletonJointCountPrefixSum.resize(skeletonJointCount.size()); // now create buffer for skeletons - SBufferBinding parentJointID = {0ull,ICPUBuffer::create({ sizeof(ICPUSkeleton::joint_id_t)*nodeCount })}; - SBufferBinding defaultTransforms = {0ull,ICPUBuffer::create({ sizeof(core::matrix3x4SIMD)*nodeCount })}; + SBufferBinding parentJointID = {0ull,ICPUBuffer::create({ .size = sizeof(ICPUSkeleton::joint_id_t)*nodeCount })}; + SBufferBinding defaultTransforms = {0ull,ICPUBuffer::create({ .size = sizeof(core::matrix3x4SIMD)*nodeCount })}; core::vector names(nodeCount); // and fill them { @@ -564,8 +564,8 @@ using namespace nbl::asset; uint32_t totalSkinJointRefs = 0u; for (const auto& skin : glTF.skins) totalSkinJointRefs += skin.joints.size(); - vertexJointToSkeletonJoint = ICPUBuffer::create({ sizeof(ICPUSkeleton::joint_id_t)*totalSkinJointRefs }); - inverseBindPose = ICPUBuffer::create({ sizeof(core::matrix3x4SIMD)*totalSkinJointRefs }); + vertexJointToSkeletonJoint = ICPUBuffer::create({ .size = sizeof(ICPUSkeleton::joint_id_t)*totalSkinJointRefs }); + inverseBindPose = ICPUBuffer::create({ .size = sizeof(core::matrix3x4SIMD)*totalSkinJointRefs }); } // then go over skins uint32_t skinJointRefCount = 0u; @@ -1079,8 +1079,8 @@ using namespace nbl::asset; constexpr bool isValidWeighComponentT = std::is_same::value || std::is_same::value || std::is_same::value; static_assert(isValidJointComponentT && isValidWeighComponentT); - vOverrideJointsBuffer = asset::ICPUBuffer::create({ vCommonOverrideAttributesCount * vJointsTexelByteSize }); - vOverrideWeightsBuffer = asset::ICPUBuffer::create({ vCommonOverrideAttributesCount * vWeightsTexelByteSize }); + vOverrideJointsBuffer = asset::ICPUBuffer::create({ .size = vCommonOverrideAttributesCount * vJointsTexelByteSize }); + vOverrideWeightsBuffer = asset::ICPUBuffer::create({ .size = vCommonOverrideAttributesCount * vWeightsTexelByteSize }); for (size_t vAttributeIx = 0; vAttributeIx < vCommonOverrideAttributesCount; 
++vAttributeIx) { @@ -1231,8 +1231,8 @@ using namespace nbl::asset; const size_t repackJointsTexelByteSize = asset::getTexelOrBlockBytesize(repackJointsFormat); const size_t repackWeightsTexelByteSize = asset::getTexelOrBlockBytesize(repackWeightsFormat); - auto vOverrideRepackedJointsBuffer = asset::ICPUBuffer::create({ vCommonOverrideAttributesCount * repackJointsTexelByteSize }); - auto vOverrideRepackedWeightsBuffer = asset::ICPUBuffer::create({ vCommonOverrideAttributesCount * repackWeightsTexelByteSize }); + auto vOverrideRepackedJointsBuffer = asset::ICPUBuffer::create({ .size = vCommonOverrideAttributesCount * repackJointsTexelByteSize }); + auto vOverrideRepackedWeightsBuffer = asset::ICPUBuffer::create({ .size = vCommonOverrideAttributesCount * repackWeightsTexelByteSize }); memset(vOverrideRepackedJointsBuffer->getPointer(), 0, vOverrideRepackedJointsBuffer->getSize()); memset(vOverrideRepackedWeightsBuffer->getPointer(), 0, vOverrideRepackedWeightsBuffer->getSize()); @@ -1413,7 +1413,7 @@ using namespace nbl::asset; assert(weightsQuantizeFormat != EF_UNKNOWN); { vOverrideWeightsBuffer = std::move(core::smart_refctd_ptr()); //! free memory - auto vOverrideQuantizedWeightsBuffer = asset::ICPUBuffer::create({ weightComponentsByteStride * vCommonOverrideAttributesCount }); + auto vOverrideQuantizedWeightsBuffer = asset::ICPUBuffer::create({ .size = weightComponentsByteStride * vCommonOverrideAttributesCount }); { for (size_t vAttributeIx = 0; vAttributeIx < vCommonOverrideAttributesCount; ++vAttributeIx) { @@ -1561,7 +1561,7 @@ using namespace nbl::asset; inverseBindPoseBinding.offset = skin.inverseBindPose.offset; SBufferBinding jointAABBBufferBinding; - jointAABBBufferBinding.buffer = asset::ICPUBuffer::create({ jointCount*sizeof(core::aabbox3df) }); + jointAABBBufferBinding.buffer = asset::ICPUBuffer::create({ .size = jointCount*sizeof(core::aabbox3df) }); jointAABBBufferBinding.offset = 0u; auto* aabbPtr = reinterpret_cast(jointAABBBufferBinding.buffer->getPointer()); @@ -1658,7 +1658,7 @@ using namespace nbl::asset; simdjson::dom::parser parser; auto* _file = context.loadContext.mainFile; - auto jsonBuffer = ICPUBuffer::create({ _file->getSize() }); + auto jsonBuffer = ICPUBuffer::create({ .size = _file->getSize() }); { system::IFile::success_t success; _file->read(success, jsonBuffer->getPointer(), 0u, jsonBuffer->getSize()); diff --git a/src/nbl/asset/interchange/CGraphicsPipelineLoaderMTL.cpp b/src/nbl/asset/interchange/CGraphicsPipelineLoaderMTL.cpp index 3522b793d1..d1b95846a0 100644 --- a/src/nbl/asset/interchange/CGraphicsPipelineLoaderMTL.cpp +++ b/src/nbl/asset/interchange/CGraphicsPipelineLoaderMTL.cpp @@ -43,7 +43,7 @@ CGraphicsPipelineLoaderMTL::CGraphicsPipelineLoaderMTL(IAssetManager* _am, core: }; core::smart_refctd_ptr data = loadBuiltinData(Path.value); - auto buffer = asset::ICPUBuffer::create({ data->getSize()+1u }); + auto buffer = core::make_smart_refctd_ptr(data->getSize()+1u); char* bufferPtr = reinterpret_cast(buffer->getPointer()); memcpy(bufferPtr, data->getMappedPointer(), data->getSize()); bufferPtr[data->getSize()] = '\0'; @@ -627,7 +627,7 @@ CGraphicsPipelineLoaderMTL::image_views_set_t CGraphicsPipelineLoaderMTL::loadIm } #endif } - auto imgDataBuf = ICPUBuffer::create({ bufSz }); + auto imgDataBuf = ICPUBuffer::create({ .size = bufSz }); for (uint32_t i = CMTLMetadata::CRenderpassIndependentPipeline::EMP_REFL_POSX, j = 0u; i < CMTLMetadata::CRenderpassIndependentPipeline::EMP_REFL_POSX + 6u; ++i) { #ifndef _NBL_DEBUG diff --git 
a/src/nbl/asset/interchange/CHLSLLoader.cpp b/src/nbl/asset/interchange/CHLSLLoader.cpp index a6924b132a..f14096e473 100644 --- a/src/nbl/asset/interchange/CHLSLLoader.cpp +++ b/src/nbl/asset/interchange/CHLSLLoader.cpp @@ -15,7 +15,7 @@ SAssetBundle CHLSLLoader::loadAsset(system::IFile* _file, const IAssetLoader::SA return {}; const auto len = _file->getSize(); - auto source = ICPUBuffer::create({ len+1 }); + auto source = ICPUBuffer::create({ .size = len+1 }); system::IFile::success_t success; _file->read(success, source->getPointer(), 0, len); diff --git a/src/nbl/asset/interchange/CImageLoaderJPG.cpp b/src/nbl/asset/interchange/CImageLoaderJPG.cpp index 45677ff5cf..54131bd9e4 100644 --- a/src/nbl/asset/interchange/CImageLoaderJPG.cpp +++ b/src/nbl/asset/interchange/CImageLoaderJPG.cpp @@ -310,7 +310,7 @@ asset::SAssetBundle CImageLoaderJPG::loadAsset(system::IFile* _file, const asset uint32_t rowspan = region.bufferRowLength * cinfo.out_color_components; // Allocate memory for buffer - auto buffer = asset::ICPUBuffer::create({ rowspan*height }); + auto buffer = asset::ICPUBuffer::create({ .size = rowspan*height }); // Here we use the library's state variable cinfo.output_scanline as the // loop counter, so that we don't have to keep track ourselves. diff --git a/src/nbl/asset/interchange/CImageLoaderOpenEXR.cpp b/src/nbl/asset/interchange/CImageLoaderOpenEXR.cpp index da4e00070f..e78452e5a2 100644 --- a/src/nbl/asset/interchange/CImageLoaderOpenEXR.cpp +++ b/src/nbl/asset/interchange/CImageLoaderOpenEXR.cpp @@ -359,7 +359,7 @@ SAssetBundle CImageLoaderOpenEXR::loadAsset(system::IFile* _file, const asset::I auto image = ICPUImage::create(std::move(params)); { // create image and buffer that backs it const uint32_t texelFormatByteSize = getTexelOrBlockBytesize(image->getCreationParameters().format); - auto texelBuffer = ICPUBuffer::create({ image->getImageDataSizeInBytes() }); + auto texelBuffer = ICPUBuffer::create({ .size = image->getImageDataSizeInBytes() }); auto regions = core::make_refctd_dynamic_array>(1u); ICPUImage::SBufferCopy& region = regions->front(); region.imageSubresource.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; diff --git a/src/nbl/asset/interchange/CImageLoaderPNG.cpp b/src/nbl/asset/interchange/CImageLoaderPNG.cpp index d7548146c3..43739cd844 100644 --- a/src/nbl/asset/interchange/CImageLoaderPNG.cpp +++ b/src/nbl/asset/interchange/CImageLoaderPNG.cpp @@ -286,7 +286,7 @@ asset::SAssetBundle CImageLoaderPng::loadAsset(system::IFile* _file, const asset region.imageOffset = { 0u, 0u, 0u }; region.imageExtent = imgInfo.extent; - auto texelBuffer = ICPUBuffer::create({ region.bufferRowLength * region.imageExtent.height * texelFormatBytesize }); + auto texelBuffer = ICPUBuffer::create({ .size = region.bufferRowLength * region.imageExtent.height * texelFormatBytesize }); // Fill array of pointers to rows in image data const uint32_t pitch = region.bufferRowLength*texelFormatBytesize; diff --git a/src/nbl/asset/interchange/CImageLoaderTGA.cpp b/src/nbl/asset/interchange/CImageLoaderTGA.cpp index 5177cb2d19..faf848e23b 100644 --- a/src/nbl/asset/interchange/CImageLoaderTGA.cpp +++ b/src/nbl/asset/interchange/CImageLoaderTGA.cpp @@ -30,7 +30,7 @@ namespace asset auto inputTexelOrBlockByteSize = inputBuffer->getSize(); const uint32_t texelOrBlockLength = inputTexelOrBlockByteSize / asset::getTexelOrBlockBytesize(inputFormat); - auto outputBuffer = asset::ICPUBuffer::create({ texelOrBlockLength * asset::getTexelOrBlockBytesize(outputFormat) }); + auto outputBuffer = 
asset::ICPUBuffer::create({ .size = texelOrBlockLength * asset::getTexelOrBlockBytesize(outputFormat) }); auto outputTexelOrBlockByteSize = outputBuffer->getSize(); for (auto i = 0ull; i < texelOrBlockLength; ++i) @@ -49,7 +49,7 @@ void CImageLoaderTGA::loadCompressedImage(system::IFile *file, const STGAHeader& // I only changed the formatting a little bit. int32_t bytesPerPixel = header.PixelDepth/8; int32_t imageSizeInBytes = header.ImageHeight * header.ImageWidth * bytesPerPixel; - bufferData = ICPUBuffer::create({ wholeSizeWithPitchInBytes }); + bufferData = ICPUBuffer::create({ .size = wholeSizeWithPitchInBytes }); auto data = reinterpret_cast(bufferData->getPointer()); int32_t currentByte = 0; @@ -202,7 +202,7 @@ asset::SAssetBundle CImageLoaderTGA::loadAsset(system::IFile* _file, const asset if (header.ColorMapType) { auto colorMapEntryByteSize = header.ColorMapEntrySize / 8 * header.ColorMapLength; - auto colorMapEntryBuffer = asset::ICPUBuffer::create({ (size_t)colorMapEntryByteSize }); + auto colorMapEntryBuffer = asset::ICPUBuffer::create({ .size = static_cast(colorMapEntryByteSize) }); { system::IFile::success_t success; _file->read(success, colorMapEntryBuffer->getPointer(), offset, header.ColorMapEntrySize / 8 * header.ColorMapLength); @@ -251,7 +251,7 @@ asset::SAssetBundle CImageLoaderTGA::loadAsset(system::IFile* _file, const asset { region.bufferRowLength = calcPitchInBlocks(region.imageExtent.width, getTexelOrBlockBytesize(EF_R8G8B8_SRGB)); const int32_t imageSize = endBufferSize = region.imageExtent.height * region.bufferRowLength * bytesPerTexel; - texelBuffer = ICPUBuffer::create({ (size_t)imageSize }); + texelBuffer = ICPUBuffer::create({ .size = static_cast(imageSize) }); { system::IFile::success_t success; _file->read(success, texelBuffer->getPointer(), offset, imageSize); diff --git a/src/nbl/asset/interchange/CImageWriterJPG.cpp b/src/nbl/asset/interchange/CImageWriterJPG.cpp index dea43ba696..1386fb93e3 100644 --- a/src/nbl/asset/interchange/CImageWriterJPG.cpp +++ b/src/nbl/asset/interchange/CImageWriterJPG.cpp @@ -151,7 +151,7 @@ static bool writeJPEGFile(system::IFile* file, system::ISystem* sys, const asset jpeg_start_compress(&cinfo, TRUE); const auto JPG_BYTE_PITCH = rowByteSize; - auto destBuffer = asset::ICPUBuffer::create({ JPG_BYTE_PITCH }); + auto destBuffer = asset::ICPUBuffer::create({ .size = JPG_BYTE_PITCH }); auto dest = reinterpret_cast(destBuffer->getPointer()); if (dest) diff --git a/src/nbl/asset/interchange/CImageWriterTGA.cpp b/src/nbl/asset/interchange/CImageWriterTGA.cpp index a5b5e676b9..e35a3bceab 100644 --- a/src/nbl/asset/interchange/CImageWriterTGA.cpp +++ b/src/nbl/asset/interchange/CImageWriterTGA.cpp @@ -134,7 +134,7 @@ bool CImageWriterTGA::writeAsset(system::IFile* _file, const SAssetWriteParams& int32_t row_size = ((imageHeader.PixelDepth / 8) * imageHeader.ImageWidth); // allocate a row do translate data into - auto rowPointerBuffer = ICPUBuffer::create({ (size_t)row_size }); + auto rowPointerBuffer = ICPUBuffer::create({ .size = static_cast(row_size) }); auto row_pointer = reinterpret_cast(rowPointerBuffer->getPointer()); uint32_t y; diff --git a/src/nbl/asset/interchange/COBJMeshFileLoader.cpp b/src/nbl/asset/interchange/COBJMeshFileLoader.cpp index 61d9eab6ec..f87858c5fd 100644 --- a/src/nbl/asset/interchange/COBJMeshFileLoader.cpp +++ b/src/nbl/asset/interchange/COBJMeshFileLoader.cpp @@ -465,10 +465,10 @@ asset::SAssetBundle COBJMeshFileLoader::loadAsset(system::IFile* _file, const as 
submeshes[i]->setPipeline(std::move(pipeline.first)); } - core::smart_refctd_ptr vtxBuf = ICPUBuffer::create({ vertices.size() * sizeof(SObjVertex) }); + core::smart_refctd_ptr vtxBuf = ICPUBuffer::create({ .size = vertices.size() * sizeof(SObjVertex) }); memcpy(vtxBuf->getPointer(), vertices.data(), vtxBuf->getSize()); - auto ixBuf = ICPUBuffer::create({ ixBufOffset }); + auto ixBuf = ICPUBuffer::create({ .size = ixBufOffset }); for (size_t i = 0ull; i < submeshes.size(); ++i) { if (submeshWasLoadedFromCache[i]) diff --git a/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp b/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp index 3cdc261408..1d22ffc293 100644 --- a/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp +++ b/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp @@ -373,7 +373,7 @@ asset::SAssetBundle CPLYMeshFileLoader::loadAsset(system::IFile* _file, const as if (!attributes[ET_POS].buffer) { attributes[ET_POS].offset = 0u; - attributes[ET_POS].buffer = asset::ICPUBuffer::create({ asset::getTexelOrBlockBytesize(EF_R32G32B32_SFLOAT) * plyVertexElement.Count }); + attributes[ET_POS].buffer = asset::ICPUBuffer::create({ .size = asset::getTexelOrBlockBytesize(EF_R32G32B32_SFLOAT) * plyVertexElement.Count }); } } else if(propertyName == "nx" || propertyName == "ny" || propertyName == "nz") @@ -381,7 +381,7 @@ asset::SAssetBundle CPLYMeshFileLoader::loadAsset(system::IFile* _file, const as if (!attributes[ET_NORM].buffer) { attributes[ET_NORM].offset = 0u; - attributes[ET_NORM].buffer = asset::ICPUBuffer::create({ asset::getTexelOrBlockBytesize(EF_R32G32B32_SFLOAT) * plyVertexElement.Count }); + attributes[ET_NORM].buffer = asset::ICPUBuffer::create({ .size = asset::getTexelOrBlockBytesize(EF_R32G32B32_SFLOAT) * plyVertexElement.Count }); } } else if (propertyName == "u" || propertyName == "s" || propertyName == "v" || propertyName == "t") @@ -389,7 +389,7 @@ asset::SAssetBundle CPLYMeshFileLoader::loadAsset(system::IFile* _file, const as if (!attributes[ET_UV].buffer) { attributes[ET_UV].offset = 0u; - attributes[ET_UV].buffer = asset::ICPUBuffer::create({ asset::getTexelOrBlockBytesize(EF_R32G32_SFLOAT) * plyVertexElement.Count }); + attributes[ET_UV].buffer = asset::ICPUBuffer::create({ .size = asset::getTexelOrBlockBytesize(EF_R32G32_SFLOAT) * plyVertexElement.Count }); } } else if (propertyName == "red" || propertyName == "green" || propertyName == "blue" || propertyName == "alpha") @@ -397,7 +397,7 @@ asset::SAssetBundle CPLYMeshFileLoader::loadAsset(system::IFile* _file, const as if (!attributes[ET_COL].buffer) { attributes[ET_COL].offset = 0u; - attributes[ET_COL].buffer = asset::ICPUBuffer::create({ asset::getTexelOrBlockBytesize(EF_R32G32B32A32_SFLOAT) * plyVertexElement.Count }); + attributes[ET_COL].buffer = asset::ICPUBuffer::create({ .size = asset::getTexelOrBlockBytesize(EF_R32G32B32A32_SFLOAT) * plyVertexElement.Count }); } } } @@ -426,7 +426,7 @@ asset::SAssetBundle CPLYMeshFileLoader::loadAsset(system::IFile* _file, const as if (indices.size()) { - asset::SBufferBinding indexBinding = { 0, asset::ICPUBuffer::create({ indices.size() * sizeof(uint32_t) }) }; + asset::SBufferBinding indexBinding = { 0, asset::ICPUBuffer::create({ .size = indices.size() * sizeof(uint32_t) }) }; memcpy(indexBinding.buffer->getPointer(), indices.data(), indexBinding.buffer->getSize()); mb->setIndexCount(indices.size()); diff --git a/src/nbl/asset/interchange/CSPVLoader.cpp b/src/nbl/asset/interchange/CSPVLoader.cpp index 6787912cde..aa5cc0d20e 100644 --- 
a/src/nbl/asset/interchange/CSPVLoader.cpp +++ b/src/nbl/asset/interchange/CSPVLoader.cpp @@ -65,7 +65,7 @@ SAssetBundle CSPVLoader::loadAsset(system::IFile* _file, const IAssetLoader::SAs if (!_file) return {}; - auto buffer = ICPUBuffer::create({ _file->getSize() }); + auto buffer = ICPUBuffer::create({ .size = _file->getSize() }); system::IFile::success_t success; _file->read(success, buffer->getPointer(), 0, _file->getSize()); diff --git a/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp b/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp index defce74b7b..226db5a0dc 100644 --- a/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp +++ b/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp @@ -274,7 +274,7 @@ SAssetBundle CSTLMeshFileLoader::loadAsset(system::IFile* _file, const IAssetLoa } // end while (_file->getPos() < filesize) const size_t vtxSize = hasColor ? (3 * sizeof(float) + 4 + 4) : (3 * sizeof(float) + 4); - auto vertexBuf = asset::ICPUBuffer::create({ vtxSize * positions.size() }); + auto vertexBuf = asset::ICPUBuffer::create({ .size = vtxSize * positions.size() }); using quant_normal_t = CQuantNormalCache::value_type_t; diff --git a/src/nbl/asset/utils/CDerivativeMapCreator.cpp b/src/nbl/asset/utils/CDerivativeMapCreator.cpp index ee288d9aa7..b19b14b364 100644 --- a/src/nbl/asset/utils/CDerivativeMapCreator.cpp +++ b/src/nbl/asset/utils/CDerivativeMapCreator.cpp @@ -80,7 +80,7 @@ core::smart_refctd_ptr CDerivativeMapCreator::createDerivativeMapFrom auto outParams = inParams; outParams.format = getRGformat(outParams.format); const uint32_t pitch = IImageAssetHandlerBase::calcPitchInBlocks(outParams.extent.width, getTexelOrBlockBytesize(outParams.format)); - auto buffer = ICPUBuffer::create({ getTexelOrBlockBytesize(outParams.format) * pitch * outParams.extent.height }); + auto buffer = ICPUBuffer::create({ .size = getTexelOrBlockBytesize(outParams.format) * pitch * outParams.extent.height }); ICPUImage::SBufferCopy region; region.imageOffset = { 0,0,0 }; region.imageExtent = outParams.extent; @@ -194,7 +194,7 @@ core::smart_refctd_ptr CDerivativeMapCreator::createDerivativeMapFrom core::smart_refctd_ptr newDerivativeNormalMapImage; { const uint32_t pitch = IImageAssetHandlerBase::calcPitchInBlocks(newImageParams.extent.width,getTexelOrBlockBytesize(newImageParams.format)); - core::smart_refctd_ptr newCpuBuffer = ICPUBuffer::create({ getTexelOrBlockBytesize(newImageParams.format) * pitch * newImageParams.extent.height }); + core::smart_refctd_ptr newCpuBuffer = ICPUBuffer::create({ .size = getTexelOrBlockBytesize(newImageParams.format) * pitch * newImageParams.extent.height }); ICPUImage::SBufferCopy region; region.imageOffset = { 0,0,0 }; diff --git a/src/nbl/asset/utils/CGLSLCompiler.cpp b/src/nbl/asset/utils/CGLSLCompiler.cpp index 763468b79f..ec1e5cd809 100644 --- a/src/nbl/asset/utils/CGLSLCompiler.cpp +++ b/src/nbl/asset/utils/CGLSLCompiler.cpp @@ -277,7 +277,7 @@ core::smart_refctd_ptr CGLSLCompiler::compileToSPIRV_impl(const std: if (bin_res.GetCompilationStatus() == shaderc_compilation_status_success) { - auto outSpirv = ICPUBuffer::create({ std::distance(bin_res.cbegin(), bin_res.cend()) * sizeof(uint32_t) }); + auto outSpirv = ICPUBuffer::create({ .size = std::distance(bin_res.cbegin(), bin_res.cend()) * sizeof(uint32_t) }); memcpy(outSpirv->getPointer(), bin_res.cbegin(), outSpirv->getSize()); if (glslOptions.spirvOptimizer) diff --git a/src/nbl/asset/utils/CGeometryCreator.cpp b/src/nbl/asset/utils/CGeometryCreator.cpp index ea18b4bf2b..335cb326e8 100644 --- 
a/src/nbl/asset/utils/CGeometryCreator.cpp +++ b/src/nbl/asset/utils/CGeometryCreator.cpp @@ -37,7 +37,7 @@ CGeometryCreator::return_type CGeometryCreator::createCubeMesh(const core::vecto // Create indices { retval.indexCount = 36u; - auto indices = asset::ICPUBuffer::create({ sizeof(uint16_t)*retval.indexCount }); + auto indices = asset::ICPUBuffer::create({ .size = sizeof(uint16_t)*retval.indexCount }); indices->addUsageFlags(asset::IBuffer::EUF_INDEX_BUFFER_BIT); auto u = reinterpret_cast(indices->getPointer()); for (uint32_t i=0u; i<6u; ++i) @@ -53,7 +53,7 @@ CGeometryCreator::return_type CGeometryCreator::createCubeMesh(const core::vecto } // Create vertices - auto vertices = asset::ICPUBuffer::create({ 24u*vertexSize }); + auto vertices = asset::ICPUBuffer::create({ .size = 24u*vertexSize }); vertices->addUsageFlags(IBuffer::EUF_VERTEX_BUFFER_BIT); CubeVertex* ptr = (CubeVertex*)vertices->getPointer(); @@ -190,9 +190,9 @@ CGeometryCreator::return_type CGeometryCreator::createArrowMesh(const uint32_t t coneVertices[i].pos[c] = newPos[c]; } - auto newArrowVertexBuffer = asset::ICPUBuffer::create({ newArrowVertexCount * sizeof(ArrowVertex) }); + auto newArrowVertexBuffer = asset::ICPUBuffer::create({ .size = newArrowVertexCount * sizeof(ArrowVertex) }); newArrowVertexBuffer->setUsageFlags(newArrowVertexBuffer->getUsageFlags() | asset::IBuffer::EUF_VERTEX_BUFFER_BIT); - auto newArrowIndexBuffer = asset::ICPUBuffer::create({ newArrowIndexCount * sizeof(uint16_t) }); + auto newArrowIndexBuffer = asset::ICPUBuffer::create({ .size = newArrowIndexCount * sizeof(uint16_t) }); newArrowIndexBuffer->setUsageFlags(newArrowIndexBuffer->getUsageFlags() | asset::IBuffer::EUF_INDEX_BUFFER_BIT); for (auto z = 0ull; z < newArrowVertexCount; ++z) @@ -269,7 +269,7 @@ CGeometryCreator::return_type CGeometryCreator::createSphereMesh(float radius, u const uint32_t polyCountXPitch = polyCountX + 1; // get to same vertex on next level retval.indexCount = (polyCountX * polyCountY) * 6; - auto indices = asset::ICPUBuffer::create({ sizeof(uint32_t) * retval.indexCount }); + auto indices = asset::ICPUBuffer::create({ .size = sizeof(uint32_t) * retval.indexCount }); // Create indices { @@ -339,7 +339,7 @@ CGeometryCreator::return_type CGeometryCreator::createSphereMesh(float radius, u { size_t vertexSize = 3 * 4 + 4 + 2 * 4 + 4; size_t vertexCount = (polyCountXPitch * polyCountY) + 2; - auto vtxBuf = asset::ICPUBuffer::create({ vertexCount * vertexSize }); + auto vtxBuf = asset::ICPUBuffer::create({ .size = vertexCount * vertexSize }); auto* tmpMem = reinterpret_cast(vtxBuf->getPointer()); for (size_t i = 0; i < vertexCount; i++) { @@ -463,7 +463,7 @@ CGeometryCreator::return_type CGeometryCreator::createCylinderMesh(float radius, },{vertexSize,SVertexInputBindingParams::EVIR_PER_VERTEX} }; const size_t vtxCnt = 2u*tesselation; - auto vtxBuf = asset::ICPUBuffer::create({ vtxCnt*sizeof(CylinderVertex) }); + auto vtxBuf = asset::ICPUBuffer::create({ .size = vtxCnt*sizeof(CylinderVertex) }); CylinderVertex* vertices = reinterpret_cast(vtxBuf->getPointer()); for (auto i=0ull; igetPointer(); for (uint32_t i = 0u, j = 0u; i < halfIx; ++i) @@ -526,7 +526,7 @@ CGeometryCreator::return_type CGeometryCreator::createConeMesh( float radius, fl IMeshManipulator* const meshManipulatorOverride) const { const size_t vtxCnt = tesselation * 2; - auto vtxBuf = asset::ICPUBuffer::create({ vtxCnt * sizeof(ConeVertex) }); + auto vtxBuf = asset::ICPUBuffer::create({ .size = vtxCnt * sizeof(ConeVertex) }); ConeVertex* vertices = 
reinterpret_cast(vtxBuf->getPointer()); ConeVertex* baseVertices = vertices; @@ -570,7 +570,7 @@ CGeometryCreator::return_type CGeometryCreator::createConeMesh( float radius, fl apexVertices[i].normal = quantNormalCache->quantize(core::normalize(u1)); } - auto idxBuf = asset::ICPUBuffer::create({ 3u * tesselation * sizeof(uint16_t) }); + auto idxBuf = asset::ICPUBuffer::create({ .size = 3u * tesselation * sizeof(uint16_t) }); uint16_t* indices = (uint16_t*)idxBuf->getPointer(); const uint32_t firstIndexOfBaseVertices = 0; @@ -633,13 +633,13 @@ CGeometryCreator::return_type CGeometryCreator::createRectangleMesh(const core:: u[4] = 3; u[5] = 2; - auto indices = asset::ICPUBuffer::create({ sizeof(u) }); + auto indices = asset::ICPUBuffer::create({ .size = sizeof(u) }); memcpy(indices->getPointer(), u, sizeof(u)); indices->addUsageFlags(asset::IBuffer::EUF_INDEX_BUFFER_BIT); retval.indexBuffer = { 0ull, std::move(indices) }; // Create vertices - auto vertices = asset::ICPUBuffer::create({ 4 * vertexSize }); + auto vertices = asset::ICPUBuffer::create({ .size = 4 * vertexSize }); RectangleVertex* ptr = (RectangleVertex*)vertices->getPointer(); ptr[0] = RectangleVertex(core::vector3df_SIMD(-1.0f, 1.0f, 0.0f) * _size, video::SColor(0xFFFFFFFFu), @@ -676,7 +676,7 @@ CGeometryCreator::return_type CGeometryCreator::createDiskMesh(float radius, uin const float angle = 360.0f / static_cast(tesselation); - auto vertices = asset::ICPUBuffer::create({ vertexCount * vertexSize }); + auto vertices = asset::ICPUBuffer::create({ .size = vertexCount * vertexSize }); DiskVertex* ptr = (DiskVertex*)vertices->getPointer(); const core::vectorSIMDf v0(0.0f, radius, 0.0f, 1.0f); @@ -1693,8 +1693,8 @@ CGeometryCreator::return_type CGeometryCreator::createIcoSphere(float radius, ui {vertexSize,SVertexInputBindingParams::EVIR_PER_VERTEX} }; - auto vertexBuffer = asset::ICPUBuffer::create({ IcosphereData.getInterleavedVertexSize() }); - auto indexBuffer = asset::ICPUBuffer::create({ IcosphereData.getIndexSize() }); + auto vertexBuffer = asset::ICPUBuffer::create({ .size = IcosphereData.getInterleavedVertexSize() }); + auto indexBuffer = asset::ICPUBuffer::create({ .size = IcosphereData.getIndexSize() }); memcpy(vertexBuffer->getPointer(), IcosphereData.getInterleavedVertices(), vertexBuffer->getSize()); memcpy(indexBuffer->getPointer(), IcosphereData.getIndices(), indexBuffer->getSize()); diff --git a/src/nbl/asset/utils/CHLSLCompiler.cpp b/src/nbl/asset/utils/CHLSLCompiler.cpp index 2827361ff4..3559fdb07b 100644 --- a/src/nbl/asset/utils/CHLSLCompiler.cpp +++ b/src/nbl/asset/utils/CHLSLCompiler.cpp @@ -485,7 +485,7 @@ core::smart_refctd_ptr CHLSLCompiler::compileToSPIRV_impl(const std: return nullptr; } - auto outSpirv = ICPUBuffer::create({ compileResult.objectBlob->GetBufferSize() }); + auto outSpirv = ICPUBuffer::create({ .size = compileResult.objectBlob->GetBufferSize() }); memcpy(outSpirv->getPointer(), compileResult.objectBlob->GetBufferPointer(), compileResult.objectBlob->GetBufferSize()); // Optimizer step diff --git a/src/nbl/asset/utils/CMeshManipulator.cpp b/src/nbl/asset/utils/CMeshManipulator.cpp index 29a096071e..aef3e74e9c 100644 --- a/src/nbl/asset/utils/CMeshManipulator.cpp +++ b/src/nbl/asset/utils/CMeshManipulator.cpp @@ -59,7 +59,7 @@ void IMeshManipulator::flipSurfaces(ICPUMeshBuffer* inbuffer) } else //even { - auto newIndexBuffer = ICPUBuffer::create({ (idxcnt + 1u) * sizeof(uint16_t) }); + auto newIndexBuffer = ICPUBuffer::create({ .size = (idxcnt + 1u) * sizeof(uint16_t) }); auto* destPtr = 
reinterpret_cast(newIndexBuffer->getPointer()); destPtr[0] = idx[0]; memcpy(destPtr + 1u, idx, sizeof(uint16_t) * idxcnt); @@ -104,7 +104,7 @@ void IMeshManipulator::flipSurfaces(ICPUMeshBuffer* inbuffer) } else //even { - auto newIndexBuffer = ICPUBuffer::create({ (idxcnt + 1u) * sizeof(uint32_t) }); + auto newIndexBuffer = ICPUBuffer::create({ .size = (idxcnt + 1u) * sizeof(uint32_t) }); auto* destPtr = reinterpret_cast(newIndexBuffer->getPointer()); destPtr[0] = idx[0]; memcpy(destPtr + 1u, idx, sizeof(uint32_t) * idxcnt); @@ -188,7 +188,7 @@ core::smart_refctd_ptr CMeshManipulator::createMeshBufferFetchOp vtxParams.bindings[NEW_VTX_BUF_BINDING].stride = vertexSize; vtxParams.bindings[NEW_VTX_BUF_BINDING].inputRate = SVertexInputBindingParams::EVIR_PER_VERTEX; - auto newVertBuffer = ICPUBuffer::create({ vertexCount*vertexSize }); + auto newVertBuffer = ICPUBuffer::create({ .size = vertexCount*vertexSize }); outbuffer->setVertexBufferBinding({ 0u, core::smart_refctd_ptr(newVertBuffer) }, NEW_VTX_BUF_BINDING); for (size_t i = 0; i < MAX_ATTRIBS; ++i) { @@ -311,7 +311,7 @@ core::smart_refctd_ptr IMeshManipulator::createMeshBufferUniqueP vtxParams.bindings[NEW_VTX_BUF_BINDING].inputRate = SVertexInputBindingParams::EVIR_PER_VERTEX; vtxParams.bindings[NEW_VTX_BUF_BINDING].stride = stride; - auto vertexBuffer = ICPUBuffer::create({ stride*idxCnt }); + auto vertexBuffer = ICPUBuffer::create({ .size = stride*idxCnt }); clone->setVertexBufferBinding({0u, vertexBuffer}, 0u); for (size_t i=0; i IMeshManipulator::createMeshBufferUniqueP if (_makeIndexBuf) { - auto idxbuf = ICPUBuffer::create({ idxCnt*(idxCnt<0x10000 ? 2u : 4u) }); + auto idxbuf = ICPUBuffer::create({ .size = idxCnt*(idxCnt<0x10000 ? 2u : 4u) }); if (idxCnt<0x10000u) { for (uint32_t i = 0u; i < idxCnt; ++i) @@ -419,7 +419,7 @@ core::smart_refctd_ptr IMeshManipulator::calculateSmoothNormals( } const auto normalFormatBytesize = asset::getTexelOrBlockBytesize(inbuffer->getAttribFormat(normalAttr)); - auto normalBuf = ICPUBuffer::create({ normalFormatBytesize*IMeshManipulator::upperBoundVertexID(inbuffer) }); + auto normalBuf = ICPUBuffer::create({ .size = normalFormatBytesize*IMeshManipulator::upperBoundVertexID(inbuffer) }); outbuffer->setVertexBufferBinding({0ull,std::move(normalBuf)},normalBinding); auto pipeline = core::move_and_static_cast(oldPipeline->clone(0u)); @@ -561,7 +561,7 @@ core::smart_refctd_ptr IMeshManipulator::createMeshBufferWelded( { if (!oldIndices) { - inbuffer->setIndexBufferBinding({ 0u, ICPUBuffer::create({ (maxRedirect >= 0x10000u ? sizeof(uint32_t) : sizeof(uint16_t)) * inbuffer->getIndexCount() }) }); + inbuffer->setIndexBufferBinding({ 0u, ICPUBuffer::create({ .size = (maxRedirect >= 0x10000u ? sizeof(uint32_t) : sizeof(uint16_t)) * inbuffer->getIndexCount() }) }); inbuffer->setIndexType(maxRedirect>=0x10000u ? 
EIT_32BIT:EIT_16BIT); } } @@ -724,7 +724,7 @@ core::smart_refctd_ptr IMeshManipulator::createOptimizedMeshBuff if (newIdxType == EIT_16BIT) { - newIdxBuffer = ICPUBuffer::create({ sizeof(uint16_t)*outbuffer->getIndexCount() }); + newIdxBuffer = ICPUBuffer::create({ .size = sizeof(uint16_t)*outbuffer->getIndexCount() }); // no need to change index buffer offset because it's always 0 (after duplicating original mesh) for (size_t i = 0; i < outbuffer->getIndexCount(); ++i) reinterpret_cast(newIdxBuffer->getPointer())[i] = reinterpret_cast(indices)[i] - minIdx; @@ -823,7 +823,7 @@ void IMeshManipulator::requantizeMeshBuffer(ICPUMeshBuffer* _meshbuffer, const S const size_t vertexSize = newAttribs[activeAttributeCount - 1].offset + newAttribs[activeAttributeCount - 1].size; - auto newVertexBuffer = ICPUBuffer::create({ vertexCnt * vertexSize }); + auto newVertexBuffer = ICPUBuffer::create({ .size = vertexCnt * vertexSize }); constexpr uint32_t VTX_BUF_BINDING = 0u; assert(_meshbuffer->getVertexBufferBindings()[0].buffer); @@ -907,7 +907,7 @@ void CMeshManipulator::_filterInvalidTriangles(ICPUMeshBuffer* _input) }); const size_t newSize = std::distance(begin, newEnd) * sizeof(Triangle); - auto newBuf = ICPUBuffer::create({ newSize }); + auto newBuf = ICPUBuffer::create({ .size = newSize }); memcpy(newBuf->getPointer(), copy, newSize); _NBL_ALIGNED_FREE(copy); diff --git a/src/nbl/asset/utils/CMeshManipulator.h b/src/nbl/asset/utils/CMeshManipulator.h index 7a5b895665..bb216e6a61 100644 --- a/src/nbl/asset/utils/CMeshManipulator.h +++ b/src/nbl/asset/utils/CMeshManipulator.h @@ -55,7 +55,7 @@ class CMeshManipulator : public IMeshManipulator if (!_in) return nullptr; - auto out = ICPUBuffer::create({ sizeof(uint32_t) * _idxCount }); + auto out = ICPUBuffer::create({ .size = sizeof(uint32_t) * _idxCount }); auto* outPtr = reinterpret_cast(out->getPointer()); @@ -86,7 +86,7 @@ class CMeshManipulator : public IMeshManipulator { const auto outputSize = _idxCount = (_idxCount - 1) * 2; - auto output = ICPUBuffer::create({ sizeof(OutType)*outputSize }); + auto output = ICPUBuffer::create({ .size = sizeof(OutType)*outputSize }); const auto* iptr = reinterpret_cast(_input); auto* optr = reinterpret_cast(output->getPointer()); for (uint32_t i = 0, j = 0; i < outputSize;) @@ -102,7 +102,7 @@ class CMeshManipulator : public IMeshManipulator { const auto outputSize = _idxCount = (_idxCount - 2) * 3; - auto output = ICPUBuffer::create({ sizeof(OutType)*outputSize }); + auto output = ICPUBuffer::create({ .size = sizeof(OutType)*outputSize }); const auto* iptr = reinterpret_cast(_input); auto* optr = reinterpret_cast(output->getPointer()); for (uint32_t i = 0, j = 0; i < outputSize; j += 2) @@ -124,7 +124,7 @@ class CMeshManipulator : public IMeshManipulator { const auto outputSize = _idxCount = (_idxCount - 2) * 3; - auto output = ICPUBuffer::create({ sizeof(OutType)*outputSize }); + auto output = ICPUBuffer::create({ .size = sizeof(OutType)*outputSize }); const auto* iptr = reinterpret_cast(_input); auto* optr = reinterpret_cast(output->getPointer()); for (uint32_t i = 0, j = 1; i < outputSize;) diff --git a/src/nbl/asset/utils/ISPIRVOptimizer.cpp b/src/nbl/asset/utils/ISPIRVOptimizer.cpp index 92cbedd143..f9894293db 100644 --- a/src/nbl/asset/utils/ISPIRVOptimizer.cpp +++ b/src/nbl/asset/utils/ISPIRVOptimizer.cpp @@ -82,7 +82,7 @@ nbl::core::smart_refctd_ptr ISPIRVOptimizer::optimize(const uint32_t if (!resultBytesize) return nullptr; - auto result = ICPUBuffer::create({ resultBytesize }); + auto result = 
ICPUBuffer::create({ .size = resultBytesize }); memcpy(result->getPointer(), optimized.data(), resultBytesize); return result; diff --git a/src/nbl/asset/utils/IShaderCompiler.cpp b/src/nbl/asset/utils/IShaderCompiler.cpp index 927f07e96a..161c9d3ba2 100644 --- a/src/nbl/asset/utils/IShaderCompiler.cpp +++ b/src/nbl/asset/utils/IShaderCompiler.cpp @@ -5,7 +5,7 @@ #include "nbl/asset/utils/shadercUtils.h" #include "nbl/asset/utils/shaderCompiler_serialization.h" -#include "nbl/core/alloc/resource_owning_vector.h" +#include "nbl/core/alloc/VectorViewNullMemoryResource.h" #include #include @@ -342,7 +342,7 @@ core::smart_refctd_ptr IShaderCompiler::CCache::serialize() const // Might as well memcpy everything memcpy(retVal.data() + SHADER_BUFFER_SIZE_BYTES + shaderBufferSize, dumpedContainerJson.data(), dumpedContainerJsonLength); - return ICPUBuffer::create({ .size = retVal.size(), .data = retVal.data(), .memoryResource = new core::resource_owning_vector(std::move(retVal)) }); + return ICPUBuffer::create({ .size = retVal.size(), .data = retVal.data(), .memoryResource = new core::VectorViewNullMemoryResource(std::move(retVal)) }); } core::smart_refctd_ptr IShaderCompiler::CCache::deserialize(const std::span serializedCache) @@ -375,7 +375,7 @@ core::smart_refctd_ptr IShaderCompiler::CCache::deseria // We must now recreate the shaders, add them to each entry, then move the entry into the multiset for (auto i = 0u; i < entries.size(); i++) { // Create buffer to hold the code - auto code = ICPUBuffer::create({ shaderCreationParams[i].codeByteSize }); + auto code = ICPUBuffer::create({ .size = shaderCreationParams[i].codeByteSize }); // Copy the shader bytecode into the buffer memcpy(code->getPointer(), serializedCache.data() + SHADER_BUFFER_SIZE_BYTES + shaderCreationParams[i].offset, shaderCreationParams[i].codeByteSize); @@ -416,7 +416,7 @@ bool nbl::asset::IShaderCompiler::CCache::SEntry::setContent(const asset::ICPUBu if (res != SZ_OK || propsSize != LZMA_PROPS_SIZE) return false; compressedSpirv.resize(propsSize + destLen); - auto memResource = new core::resource_owning_vector(std::move(compressedSpirv)); + auto memResource = new core::VectorViewNullMemoryResource(std::move(compressedSpirv)); spirv = ICPUBuffer::create({ .size = propsSize + destLen, .data = memResource->data(), .memoryResource = std::move(memResource)}); return true; @@ -424,7 +424,7 @@ bool nbl::asset::IShaderCompiler::CCache::SEntry::setContent(const asset::ICPUBu core::smart_refctd_ptr nbl::asset::IShaderCompiler::CCache::SEntry::decompressShader() const { - auto uncompressedBuf = ICPUBuffer::create({ uncompressedSize }); + auto uncompressedBuf = ICPUBuffer::create({ .size = uncompressedSize }); uncompressedBuf->setContentHash(uncompressedContentHash); size_t dstSize = uncompressedBuf->getSize(); diff --git a/src/nbl/ext/DepthPyramidGenerator/DepthPyramidGenerator.cpp b/src/nbl/ext/DepthPyramidGenerator/DepthPyramidGenerator.cpp index 0a960691be..6a2ba21b58 100644 --- a/src/nbl/ext/DepthPyramidGenerator/DepthPyramidGenerator.cpp +++ b/src/nbl/ext/DepthPyramidGenerator/DepthPyramidGenerator.cpp @@ -90,7 +90,7 @@ layout(local_size_x = WORKGROUP_X_AND_Y_SIZE, local_size_y = WORKGROUP_X_AND_Y_S const uint32_t perPassMipCnt = static_cast(config.workGroupSize) == 32u ? 
6u : 5u; constexpr size_t extraSize = 32u; - auto shaderCode = ICPUBuffer::create({ strlen(source) + extraSize + 1u }); + auto shaderCode = ICPUBuffer::create({ .size = strlen(source) + extraSize + 1u }); snprintf(reinterpret_cast(shaderCode->getPointer()), shaderCode->getSize(), source, static_cast(m_config.workGroupSize), format, redOp, mipScaling); auto cpuSpecializedShader = core::make_smart_refctd_ptr( diff --git a/src/nbl/ext/FFT/FFT.cpp b/src/nbl/ext/FFT/FFT.cpp index 746fc1801f..978833c1cc 100644 --- a/src/nbl/ext/FFT/FFT.cpp +++ b/src/nbl/ext/FFT/FFT.cpp @@ -54,7 +54,7 @@ layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=1, local_size_z=1) i constexpr size_t extraSize = 8u*2u+1u; - auto source = ICPUBuffer::create({ strlen(sourceFmt)+extraSize+1u }); + auto source = ICPUBuffer::create({ .size = strlen(sourceFmt)+extraSize+1u }); snprintf( reinterpret_cast(source->getPointer()),source->getSize(), sourceFmt, DEFAULT_WORK_GROUP_SIZE, diff --git a/src/nbl/ext/LumaMeter/CLumaMeter.cpp b/src/nbl/ext/LumaMeter/CLumaMeter.cpp index dcfbdc2838..79f9926e6b 100644 --- a/src/nbl/ext/LumaMeter/CLumaMeter.cpp +++ b/src/nbl/ext/LumaMeter/CLumaMeter.cpp @@ -178,7 +178,7 @@ void main() const size_t xyzMatrixChars = strlen(xyzMatrix); const size_t extraSize = lumaChars+meterModeChars+eotfChars+xyzMatrixChars; - auto shader = ICPUBuffer::create({ strlen(sourceFmt)+extraSize+1u }); + auto shader = ICPUBuffer::create({ .size = strlen(sourceFmt)+extraSize+1u }); snprintf( reinterpret_cast(shader->getPointer()),shader->getSize(),sourceFmt, DEFAULT_BIN_COUNT,DEFAULT_BIN_GLOBAL_REPLICATION, diff --git a/src/nbl/ext/MitsubaLoader/CMitsubaLoader.cpp b/src/nbl/ext/MitsubaLoader/CMitsubaLoader.cpp index 6d2edd58b2..3bc564925a 100644 --- a/src/nbl/ext/MitsubaLoader/CMitsubaLoader.cpp +++ b/src/nbl/ext/MitsubaLoader/CMitsubaLoader.cpp @@ -320,7 +320,7 @@ static core::smart_refctd_ptr createSingleChannelImage(const a } const size_t texelBytesz = asset::getTexelOrBlockBytesize(outParams.format); region.bufferRowLength = asset::IImageAssetHandlerBase::calcPitchInBlocks(outParams.extent.width, texelBytesz); - auto buffer = asset::ICPUBuffer::create({ texelBytesz * region.bufferRowLength * outParams.extent.height }); + auto buffer = asset::ICPUBuffer::create({ .size = texelBytesz * region.bufferRowLength * outParams.extent.height }); region.imageOffset = { 0,0,0 }; region.imageExtent = outParams.extent; region.imageSubresource.baseArrayLayer = 0u; @@ -848,7 +848,7 @@ SContext::shape_ass_type CMitsubaLoader::loadBasicShape(SContext& ctx, uint32_t if (totalVertexCount) { constexpr uint32_t hidefRGBSize = 4u; - auto newRGBbuff = asset::ICPUBuffer::create({ hidefRGBSize*totalVertexCount }); + auto newRGBbuff = asset::ICPUBuffer::create({ .size = hidefRGBSize*totalVertexCount }); newMesh = core::smart_refctd_ptr_static_cast(mesh->clone(1u)); constexpr uint32_t COLOR_ATTR = 1u; constexpr uint32_t COLOR_BUF_BINDING = 15u; @@ -1196,7 +1196,7 @@ inline core::smart_refctd_ptr CMitsubaLoader::createDS auto d = ds0->getDescriptors(PRECOMPUTED_VT_DATA_BINDING).begin(); { - auto precompDataBuf = ICPUBuffer::create({ sizeof(asset::ICPUVirtualTexture::SPrecomputedData) }); + auto precompDataBuf = ICPUBuffer::create({ .size = sizeof(asset::ICPUVirtualTexture::SPrecomputedData) }); memcpy(precompDataBuf->getPointer(), &_ctx.backend_ctx.vt.vt->getPrecomputedData(), precompDataBuf->getSize()); d->buffer.offset = 0u; @@ -1205,7 +1205,7 @@ inline core::smart_refctd_ptr CMitsubaLoader::createDS } d = 
ds0->getDescriptors(INSTR_BUF_BINDING).begin(); { - auto instrbuf = ICPUBuffer::create({ _compResult.instructions.size()*sizeof(decltype(_compResult.instructions)::value_type) }); + auto instrbuf = ICPUBuffer::create({ .size = _compResult.instructions.size()*sizeof(decltype(_compResult.instructions)::value_type) }); memcpy(instrbuf->getPointer(), _compResult.instructions.data(), instrbuf->getSize()); d->buffer.offset = 0u; @@ -1214,7 +1214,7 @@ inline core::smart_refctd_ptr CMitsubaLoader::createDS } d = ds0->getDescriptors(BSDF_BUF_BINDING).begin(); { - auto bsdfbuf = ICPUBuffer::create({ _compResult.bsdfData.size()*sizeof(decltype(_compResult.bsdfData)::value_type) }); + auto bsdfbuf = ICPUBuffer::create({ .size = _compResult.bsdfData.size()*sizeof(decltype(_compResult.bsdfData)::value_type) }); memcpy(bsdfbuf->getPointer(), _compResult.bsdfData.data(), bsdfbuf->getSize()); d->buffer.offset = 0u; @@ -1226,7 +1226,7 @@ inline core::smart_refctd_ptr CMitsubaLoader::createDS const size_t sz = _compResult.prefetch_stream.size()*sizeof(decltype(_compResult.prefetch_stream)::value_type); constexpr size_t MIN_SSBO_SZ = 128ull; //prefetch stream won't be generated if no textures are used, so make sure we're not creating 0-size buffer - auto prefetch_instr_buf = ICPUBuffer::create({ std::max(MIN_SSBO_SZ, sz) }); + auto prefetch_instr_buf = ICPUBuffer::create({ .size = std::max(MIN_SSBO_SZ, sz) }); memcpy(prefetch_instr_buf->getPointer(), _compResult.prefetch_stream.data(), sz); d->buffer.offset = 0u; @@ -1295,7 +1295,7 @@ inline core::smart_refctd_ptr CMitsubaLoader::createDS #endif d = ds0->getDescriptors(INSTANCE_DATA_BINDING).begin(); { - auto instDataBuf = ICPUBuffer::create({ instanceData.size()*sizeof(nbl_glsl_ext_Mitsuba_Loader_instance_data_t) }); + auto instDataBuf = ICPUBuffer::create({ .size = instanceData.size()*sizeof(nbl_glsl_ext_Mitsuba_Loader_instance_data_t) }); memcpy(instDataBuf->getPointer(), instanceData.data(), instDataBuf->getSize()); d->buffer.offset = 0u; diff --git a/src/nbl/ext/MitsubaLoader/CSerializedLoader.cpp b/src/nbl/ext/MitsubaLoader/CSerializedLoader.cpp index c19395397c..61d209bfe4 100644 --- a/src/nbl/ext/MitsubaLoader/CSerializedLoader.cpp +++ b/src/nbl/ext/MitsubaLoader/CSerializedLoader.cpp @@ -227,18 +227,18 @@ asset::SAssetBundle CSerializedLoader::loadAsset(system::IFile* _file, const ass continue; } - auto indexbuf = asset::ICPUBuffer::create({ indexDataSize }); + auto indexbuf = asset::ICPUBuffer::create({ .size = indexDataSize }); const uint32_t posAttrSize = typeSize*3u; - auto posbuf = asset::ICPUBuffer::create({ vertexCount*posAttrSize }); + auto posbuf = asset::ICPUBuffer::create({ .size = vertexCount*posAttrSize }); core::smart_refctd_ptr normalbuf,uvbuf,colorbuf; if (requiresNormals) - normalbuf = asset::ICPUBuffer::create({ sizeof(uint32_t)*vertexCount }); + normalbuf = asset::ICPUBuffer::create({ .size = sizeof(uint32_t)*vertexCount }); // TODO: UV quantization and optimization (maybe lets just always use half floats?) constexpr size_t uvAttrSize = sizeof(float)*2u; if (hasUVs) - uvbuf = asset::ICPUBuffer::create({ uvAttrSize*vertexCount }); + uvbuf = asset::ICPUBuffer::create({ .size = uvAttrSize*vertexCount }); if (hasColors) - colorbuf = asset::ICPUBuffer::create({ sizeof(uint32_t)*vertexCount }); + colorbuf = asset::ICPUBuffer::create({ .size = sizeof(uint32_t)*vertexCount }); void* posPtr = posbuf->getPointer(); CQuantNormalCache::value_type_t* normalPtr = !normalbuf ? 
nullptr:reinterpret_cast*>(normalbuf->getPointer()); diff --git a/src/nbl/ext/RadixSort/RadixSort.cpp b/src/nbl/ext/RadixSort/RadixSort.cpp index 2d67a1131e..e41280aeeb 100644 --- a/src/nbl/ext/RadixSort/RadixSort.cpp +++ b/src/nbl/ext/RadixSort/RadixSort.cpp @@ -92,7 +92,7 @@ layout (local_size_x = _NBL_GLSL_WORKGROUP_SIZE_) in; // Todo: This just the value I took from FFT example, don't know how it is being computed. const size_t extraSize = 4u + 8u + 8u + 128u; - auto shader = asset::ICPUBuffer::create({ strlen(source_fmt) + extraSize + 1u }); + auto shader = asset::ICPUBuffer::create({ .size = strlen(source_fmt) + extraSize + 1u }); snprintf(reinterpret_cast(shader->getPointer()), shader->getSize(), source_fmt, m_wg_size, BUCKETS_COUNT, shader_file_path); auto cpu_specialized_shader = core::make_smart_refctd_ptr( @@ -123,7 +123,7 @@ layout (local_size_x = _NBL_GLSL_WORKGROUP_SIZE_) in; // Todo: This just the value I took from FFT example, don't know how it is being computed. const size_t extraSize = 4u + 8u + 8u + 128u; - auto shader = asset::ICPUBuffer::create({ strlen(source_fmt) + extraSize + 1u }); + auto shader = asset::ICPUBuffer::create({ .size = strlen(source_fmt) + extraSize + 1u }); snprintf(reinterpret_cast(shader->getPointer()), shader->getSize(), source_fmt, m_wg_size, shader_file_path); auto cpu_specialized_shader = core::make_smart_refctd_ptr( diff --git a/src/nbl/ext/TextRendering/TextRendering.cpp b/src/nbl/ext/TextRendering/TextRendering.cpp index ea0a04e6cb..75a07222ac 100644 --- a/src/nbl/ext/TextRendering/TextRendering.cpp +++ b/src/nbl/ext/TextRendering/TextRendering.cpp @@ -109,7 +109,7 @@ core::smart_refctd_ptr FontFace::generateGlyphMSDF(uint32_t baseMSDFP } auto image = ICPUImage::create(std::move(imgParams)); - auto buffer = ICPUBuffer::create({ bufferSize }); + auto buffer = ICPUBuffer::create({ .size = bufferSize }); auto regions = core::make_refctd_dynamic_array>(mipLevels); size_t bufferOffset = 0ull; diff --git a/src/nbl/ext/ToneMapper/CToneMapper.cpp b/src/nbl/ext/ToneMapper/CToneMapper.cpp index c97d05b2df..733d9ce206 100644 --- a/src/nbl/ext/ToneMapper/CToneMapper.cpp +++ b/src/nbl/ext/ToneMapper/CToneMapper.cpp @@ -498,7 +498,7 @@ void main() usingTemporalAdaptationDefineChars+ outViewFormatQualifierChars+eotfChars+inXYZMatrixChars+outXYZMatrixChars+oetfChars+quantizationChars; - auto shader = ICPUBuffer::create({ strlen(sourceFmt)+extraSize+1u }); + auto shader = ICPUBuffer::create({ .size = strlen(sourceFmt)+extraSize+1u }); snprintf( reinterpret_cast(shader->getPointer()),shader->getSize(),sourceFmt, DEFAULT_WORKGROUP_DIM,DEFAULT_WORKGROUP_DIM,_operator, diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 7fb60d79bf..b39371db16 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -513,7 +513,7 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) if (_size==0ull) return {nullptr,NVRTC_ERROR_INVALID_INPUT}; - auto ptx = asset::ICPUBuffer::create({ _size }); + auto ptx = asset::ICPUBuffer::create({ .size = _size }); return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,reinterpret_cast(ptx->getPointer()))}; } diff --git a/src/nbl/video/utilities/CPropertyPoolHandler.cpp b/src/nbl/video/utilities/CPropertyPoolHandler.cpp index b9ebe8b1b1..40f17e4e75 100644 --- a/src/nbl/video/utilities/CPropertyPoolHandler.cpp +++ b/src/nbl/video/utilities/CPropertyPoolHandler.cpp @@ -28,7 +28,7 @@ CPropertyPoolHandler::CPropertyPoolHandler(core::smart_refctd_ptrgetSize() }); + glsl = 
core::make_smart_refctd_ptr(glslFile->getSize()); memcpy(glsl->getPointer(), glslFile->getMappedPointer(), glsl->getSize()); } diff --git a/src/nbl/video/utilities/CScanner.cpp b/src/nbl/video/utilities/CScanner.cpp index 0f4399b00d..ec31272358 100644 --- a/src/nbl/video/utilities/CScanner.cpp +++ b/src/nbl/video/utilities/CScanner.cpp @@ -23,7 +23,7 @@ core::smart_refctd_ptr CScanner::createShader(const bool indi else glsl = loadBuiltinData("nbl/builtin/glsl/scan/direct.comp"); } - auto buffer = asset::ICPUBuffer::create({ glsl->getSize() }); + auto buffer = core::make_smart_refctd_ptr(glsl->getSize()); memcpy(buffer->getPointer(), glsl->getMappedPointer(), glsl->getSize()); auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_GLSL, "????"); const char* storageType = nullptr; diff --git a/src/nbl/video/utilities/ImageRegionIterator.cpp b/src/nbl/video/utilities/ImageRegionIterator.cpp index 50b990aef2..66799f635b 100644 --- a/src/nbl/video/utilities/ImageRegionIterator.cpp +++ b/src/nbl/video/utilities/ImageRegionIterator.cpp @@ -92,7 +92,7 @@ ImageRegionIterator::ImageRegionIterator( uint64_t offsetInCPUBuffer = region.bufferOffset; uint8_t* inCpuBufferPointer = const_cast(reinterpret_cast(srcData) + offsetInCPUBuffer); - core::smart_refctd_ptr inCPUBuffer = asset::ICPUBuffer::create({ 0xdeadbeefBADC0FFEull, inCpuBufferPointer }, core::adopt_memory); + core::smart_refctd_ptr inCPUBuffer = asset::ICPUBuffer::create({ .size = 0xdeadbeefBADC0FFEull, .data = inCpuBufferPointer }, core::adopt_memory); inCPUImage->setBufferAndRegions(std::move(inCPUBuffer), inCPUImageRegions); assert(inCPUImage->getBuffer()); assert(inCPUImage->getRegions().size() > 0u); @@ -392,7 +392,7 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo outCpuImageRegion.imageSubresource.layerCount = core::max(regionToCopyNext.imageSubresource.layerCount, 1u); uint8_t* outCpuBufferPointer = reinterpret_cast(stagingBufferPointer) + stagingBufferOffset; - core::smart_refctd_ptr outCPUBuffer = asset::ICPUBuffer::create({ outCPUBufferSize, outCpuBufferPointer }, core::adopt_memory); + core::smart_refctd_ptr outCPUBuffer = asset::ICPUBuffer::create({ .size = outCPUBufferSize, .data = outCpuBufferPointer }, core::adopt_memory); outCPUImage->setBufferAndRegions(std::move(outCPUBuffer), outCPUImageRegions); assert(outCPUImage->getBuffer()); assert(outCPUImage->getRegions().size() > 0u); From 3398802492a8bb389927620f9081373567cef03e Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Thu, 14 Nov 2024 19:42:22 +0330 Subject: [PATCH 189/358] asset: reference counted memory_resource Signed-off-by: Ali Cheraghi --- include/nbl/asset/ICPUBuffer.h | 44 +++++++++---------- .../nbl/core/alloc/refctd_memory_resource.h | 42 ++++++++++++++++++ src/nbl/CMakeLists.txt | 1 + src/nbl/asset/utils/IShaderCompiler.cpp | 5 ++- src/nbl/core/alloc/refctd_memory_resource.cpp | 30 +++++++++++++ .../video/utilities/ImageRegionIterator.cpp | 4 +- 6 files changed, 98 insertions(+), 28 deletions(-) create mode 100644 include/nbl/core/alloc/refctd_memory_resource.h create mode 100644 src/nbl/core/alloc/refctd_memory_resource.cpp diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h index 37ce5f222d..a466013dc4 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -10,6 +10,8 @@ #include "nbl/asset/IAsset.h" #include "nbl/asset/IPreHashed.h" +#include "nbl/core/alloc/refctd_memory_resource.h" + namespace 
nbl::asset { @@ -28,7 +30,7 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed size_t size; void* data = nullptr; size_t alignment = _NBL_SIMD_ALIGNMENT; - std::pmr::memory_resource* memoryResource = nullptr; + core::smart_refctd_ptr memoryResource = nullptr; SCreationParams& operator =(const asset::IBuffer::SCreationParams& rhs) { @@ -37,31 +39,28 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed } }; - ICPUBuffer(size_t size, void* data, std::pmr::memory_resource* memoryResource, size_t alignment, bool adopt_memory) : - asset::IBuffer({ size, EUF_TRANSFER_DST_BIT }), m_data(data), m_mem_resource(memoryResource), m_alignment(alignment), m_adopt_memory(adopt_memory) {} - //! allocates uninitialized memory, copies `data` into allocation if `!data` not nullptr - core::smart_refctd_ptr static create(const SCreationParams& params) { - std::pmr::memory_resource* memoryResource = params.memoryResource; + core::smart_refctd_ptr static create(SCreationParams&& params) { if (!params.memoryResource) - memoryResource = std::pmr::get_default_resource(); + params.memoryResource = core::getDefaultMemoryResource(); - auto data = memoryResource->allocate(params.size, params.alignment); + auto data = params.memoryResource->allocate(params.size, params.alignment); if (!data) return nullptr; - if (params.data) memcpy(data, params.data, params.size); + params.data = data; - return core::make_smart_refctd_ptr(params.size, data, memoryResource, params.alignment, false); + return core::smart_refctd_ptr(new ICPUBuffer(std::move(params)), core::dont_grab); } //! does not allocate memory, adopts the `data` pointer, no copies done - core::smart_refctd_ptr static create(const SCreationParams& params, core::adopt_memory_t) { - std::pmr::memory_resource* memoryResource; + core::smart_refctd_ptr static create(SCreationParams&& params, core::adopt_memory_t) { + if (!params.data) + return nullptr; if (!params.memoryResource) - memoryResource = std::pmr::get_default_resource(); - return core::make_smart_refctd_ptr(params.size, params.data, memoryResource, params.alignment, true); + params.memoryResource = core::getDefaultMemoryResource(); + return core::smart_refctd_ptr(new ICPUBuffer(std::move(params)), core::dont_grab); } core::smart_refctd_ptr clone(uint32_t = ~0u) const override final @@ -77,7 +76,6 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed inline size_t getDependantCount() const override { return 0; } - // inline core::blake3_hash_t computeContentHash() const override { core::blake3_hasher hasher; @@ -121,22 +119,20 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed inline void discardContent_impl() override { - return freeData(); - } - - // REMEMBER TO CALL FROM DTOR! 
- virtual inline void freeData() - { - if (!m_adopt_memory && m_data) + if (m_data) m_mem_resource->deallocate(m_data, m_creationParams.size, m_alignment); m_data = nullptr; m_creationParams.size = 0ull; } +private: + ICPUBuffer(SCreationParams&& params) : + asset::IBuffer({ params.size, EUF_TRANSFER_DST_BIT }), m_data(params.data), + m_mem_resource(params.memoryResource), m_alignment(params.alignment) {} + void* m_data; - std::pmr::memory_resource* m_mem_resource; + core::smart_refctd_ptr m_mem_resource; size_t m_alignment; - bool m_adopt_memory; }; } // end namespace nbl::asset diff --git a/include/nbl/core/alloc/refctd_memory_resource.h b/include/nbl/core/alloc/refctd_memory_resource.h new file mode 100644 index 0000000000..e9bc599825 --- /dev/null +++ b/include/nbl/core/alloc/refctd_memory_resource.h @@ -0,0 +1,42 @@ +// Copyright (C) 2019-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine" +// For conditions of distribution and use, see copyright notice in nabla.h +// See the original file in irrlicht source for authors + +#ifndef _NBL_CORE_ALLOC_REFCTD_MEMORY_RESOURCE_INCLUDED_ +#define _NBL_CORE_ALLOC_REFCTD_MEMORY_RESOURCE_INCLUDED_ + +#include "BuildConfigOptions.h" +#include "nbl/core/IReferenceCounted.h" + +#include + +using namespace nbl; + +namespace nbl::core +{ + +class refctd_memory_resource : public core::IReferenceCounted +{ + public: + refctd_memory_resource(std::pmr::memory_resource* pmr) : m_pmr(pmr) {}; + + void* allocate(size_t bytes, size_t alignment) { + return m_pmr->allocate(bytes, alignment); + } + + void deallocate(void* p, size_t bytes, size_t alignment) { + return m_pmr->deallocate(p, bytes, alignment); + } + + private: + std::pmr::memory_resource* m_pmr; +}; + +NBL_API2 smart_refctd_ptr getNullMemoryResource(); +NBL_API2 smart_refctd_ptr getDefaultMemoryResource(); +NBL_API2 void setDefaultMemoryResource(smart_refctd_ptr memoryResource); + +} + +#endif \ No newline at end of file diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index fdf032548f..83845b9c84 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -119,6 +119,7 @@ unset(NABLA_HEADERS_PUBLIC2 ${NBL_TMP_FULL_PATHS}) set(NBL_CORE_SOURCES ${NBL_ROOT_PATH}/src/nbl/core/IReferenceCounted.cpp + ${NBL_ROOT_PATH}/src/nbl/core/alloc/refctd_memory_resource.cpp ${NBL_ROOT_PATH}/src/nbl/core/hash/blake.cpp ) set(NBL_SYSTEM_SOURCES diff --git a/src/nbl/asset/utils/IShaderCompiler.cpp b/src/nbl/asset/utils/IShaderCompiler.cpp index 161c9d3ba2..1d7d193eaf 100644 --- a/src/nbl/asset/utils/IShaderCompiler.cpp +++ b/src/nbl/asset/utils/IShaderCompiler.cpp @@ -342,7 +342,8 @@ core::smart_refctd_ptr IShaderCompiler::CCache::serialize() const // Might as well memcpy everything memcpy(retVal.data() + SHADER_BUFFER_SIZE_BYTES + shaderBufferSize, dumpedContainerJson.data(), dumpedContainerJsonLength); - return ICPUBuffer::create({ .size = retVal.size(), .data = retVal.data(), .memoryResource = new core::VectorViewNullMemoryResource(std::move(retVal)) }); + auto memoryResource = new core::VectorViewNullMemoryResource(std::move(retVal)); + return ICPUBuffer::create({ .size = retVal.size(), .data = retVal.data(), .memoryResource = core::make_smart_refctd_ptr(memoryResource) }); } core::smart_refctd_ptr IShaderCompiler::CCache::deserialize(const std::span serializedCache) @@ -417,7 +418,7 @@ bool nbl::asset::IShaderCompiler::CCache::SEntry::setContent(const asset::ICPUBu compressedSpirv.resize(propsSize + destLen); 
auto memResource = new core::VectorViewNullMemoryResource(std::move(compressedSpirv)); - spirv = ICPUBuffer::create({ .size = propsSize + destLen, .data = memResource->data(), .memoryResource = std::move(memResource)}); + spirv = ICPUBuffer::create({ .size = propsSize + destLen, .data = memResource->data(), .memoryResource = core::make_smart_refctd_ptr(memResource) }); return true; } diff --git a/src/nbl/core/alloc/refctd_memory_resource.cpp b/src/nbl/core/alloc/refctd_memory_resource.cpp new file mode 100644 index 0000000000..1035b335fb --- /dev/null +++ b/src/nbl/core/alloc/refctd_memory_resource.cpp @@ -0,0 +1,30 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/core/alloc/refctd_memory_resource.h" + +using namespace nbl; +using namespace core; + +static smart_refctd_ptr null_memory_resource = nullptr; +static smart_refctd_ptr default_memory_resource = nullptr; + +smart_refctd_ptr core::getNullMemoryResource() +{ + if (!null_memory_resource) + null_memory_resource = make_smart_refctd_ptr(std::pmr::null_memory_resource()); + return null_memory_resource; +} + +smart_refctd_ptr core::getDefaultMemoryResource() +{ + if (!default_memory_resource) + default_memory_resource = make_smart_refctd_ptr(std::pmr::get_default_resource()); + return default_memory_resource; +} + +void core::setDefaultMemoryResource(smart_refctd_ptr memoryResource) +{ + default_memory_resource = memoryResource; +} diff --git a/src/nbl/video/utilities/ImageRegionIterator.cpp b/src/nbl/video/utilities/ImageRegionIterator.cpp index 66799f635b..6f5b6c7778 100644 --- a/src/nbl/video/utilities/ImageRegionIterator.cpp +++ b/src/nbl/video/utilities/ImageRegionIterator.cpp @@ -92,7 +92,7 @@ ImageRegionIterator::ImageRegionIterator( uint64_t offsetInCPUBuffer = region.bufferOffset; uint8_t* inCpuBufferPointer = const_cast(reinterpret_cast(srcData) + offsetInCPUBuffer); - core::smart_refctd_ptr inCPUBuffer = asset::ICPUBuffer::create({ .size = 0xdeadbeefBADC0FFEull, .data = inCpuBufferPointer }, core::adopt_memory); + core::smart_refctd_ptr inCPUBuffer = asset::ICPUBuffer::create({ .size = 0xdeadbeefBADC0FFEull, .data = inCpuBufferPointer, .memoryResource = core::getNullMemoryResource() }, core::adopt_memory); inCPUImage->setBufferAndRegions(std::move(inCPUBuffer), inCPUImageRegions); assert(inCPUImage->getBuffer()); assert(inCPUImage->getRegions().size() > 0u); @@ -392,7 +392,7 @@ bool ImageRegionIterator::advanceAndCopyToStagingBuffer(asset::IImage::SBufferCo outCpuImageRegion.imageSubresource.layerCount = core::max(regionToCopyNext.imageSubresource.layerCount, 1u); uint8_t* outCpuBufferPointer = reinterpret_cast(stagingBufferPointer) + stagingBufferOffset; - core::smart_refctd_ptr outCPUBuffer = asset::ICPUBuffer::create({ .size = outCPUBufferSize, .data = outCpuBufferPointer }, core::adopt_memory); + core::smart_refctd_ptr outCPUBuffer = asset::ICPUBuffer::create({ .size = outCPUBufferSize, .data = outCpuBufferPointer, .memoryResource = core::getNullMemoryResource() }, core::adopt_memory); outCPUImage->setBufferAndRegions(std::move(outCPUBuffer), outCPUImageRegions); assert(outCPUImage->getBuffer()); assert(outCPUImage->getRegions().size() > 0u); From eed29fa5aa78458edfc74712770eeffca1130d47 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Thu, 14 Nov 2024 21:29:56 +0330 Subject: [PATCH 190/358] more ICPUBuffer fixes Signed-off-by: Ali Cheraghi --- 
include/nbl/asset/ICPUBuffer.h | 5 +++++ include/nbl/core/alloc/refctd_memory_resource.h | 2 +- src/nbl/asset/ICPUImage.cpp | 2 +- src/nbl/core/alloc/refctd_memory_resource.cpp | 4 ++-- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h index a466013dc4..d3c5c2250b 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -130,6 +130,11 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed asset::IBuffer({ params.size, EUF_TRANSFER_DST_BIT }), m_data(params.data), m_mem_resource(params.memoryResource), m_alignment(params.alignment) {} + ~ICPUBuffer() override { + if (m_data) + m_mem_resource->deallocate(m_data, m_creationParams.size, m_alignment); + } + void* m_data; core::smart_refctd_ptr m_mem_resource; size_t m_alignment; diff --git a/include/nbl/core/alloc/refctd_memory_resource.h b/include/nbl/core/alloc/refctd_memory_resource.h index e9bc599825..627f3b97b9 100644 --- a/include/nbl/core/alloc/refctd_memory_resource.h +++ b/include/nbl/core/alloc/refctd_memory_resource.h @@ -35,7 +35,7 @@ class refctd_memory_resource : public core::IReferenceCounted NBL_API2 smart_refctd_ptr getNullMemoryResource(); NBL_API2 smart_refctd_ptr getDefaultMemoryResource(); -NBL_API2 void setDefaultMemoryResource(smart_refctd_ptr memoryResource); +NBL_API2 void setDefaultMemoryResource(refctd_memory_resource* memoryResource); } diff --git a/src/nbl/asset/ICPUImage.cpp b/src/nbl/asset/ICPUImage.cpp index 28c893fa86..7be1c35954 100644 --- a/src/nbl/asset/ICPUImage.cpp +++ b/src/nbl/asset/ICPUImage.cpp @@ -122,7 +122,7 @@ class CFlattenRegionsStreamHashImageFilter : public CMatchedSizeInOutImageFilter auto getScratchAsBuffer = [&memory = state->scratch.memory](size_t size, size_t offset = 0ull) { - return ICPUBuffer::create({ .size = size, .data = (uint8_t*)memory + offset }, core::adopt_memory); // adopt memory & don't free it on exit + return ICPUBuffer::create({ .size = size, .data = (uint8_t*)memory + offset, .memoryResource = core::getNullMemoryResource() }, core::adopt_memory); // adopt memory & don't free it on exit }; /* diff --git a/src/nbl/core/alloc/refctd_memory_resource.cpp b/src/nbl/core/alloc/refctd_memory_resource.cpp index 1035b335fb..124465e645 100644 --- a/src/nbl/core/alloc/refctd_memory_resource.cpp +++ b/src/nbl/core/alloc/refctd_memory_resource.cpp @@ -24,7 +24,7 @@ smart_refctd_ptr core::getDefaultMemoryResource() return default_memory_resource; } -void core::setDefaultMemoryResource(smart_refctd_ptr memoryResource) +void core::setDefaultMemoryResource(refctd_memory_resource* memoryResource) { - default_memory_resource = memoryResource; + default_memory_resource = smart_refctd_ptr(memoryResource, dont_grab); } From cf9a866623311036f3dec3ece4f477c1ce198d63 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Fri, 15 Nov 2024 00:53:50 +0330 Subject: [PATCH 191/358] asset: fix several bugs Signed-off-by: Ali Cheraghi --- include/nbl/asset/ICPUBuffer.h | 10 ++++++---- include/nbl/asset/ICPUMeshBuffer.h | 2 +- .../nbl/core/alloc/VectorViewNullMemoryResource.h | 13 +++++++++---- include/nbl/core/alloc/refctd_memory_resource.h | 4 +--- src/nbl/asset/ICPUImage.cpp | 4 ++-- src/nbl/asset/interchange/CImageWriterJPG.cpp | 2 +- src/nbl/asset/interchange/CImageWriterPNG.cpp | 2 +- src/nbl/asset/interchange/CImageWriterTGA.cpp | 2 +- src/nbl/asset/interchange/CSTLMeshFileLoader.cpp | 2 +- src/nbl/asset/utils/CMeshManipulator.cpp | 10 +++++----- src/nbl/asset/utils/IShaderCompiler.cpp | 9 
+++++---- src/nbl/core/alloc/refctd_memory_resource.cpp | 2 +- src/nbl/system/CArchiveLoaderTar.cpp | 4 ++-- 13 files changed, 36 insertions(+), 30 deletions(-) diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h index d3c5c2250b..6206127b6c 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -40,7 +40,8 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed }; //! allocates uninitialized memory, copies `data` into allocation if `!data` not nullptr - core::smart_refctd_ptr static create(SCreationParams&& params) { + core::smart_refctd_ptr static create(SCreationParams&& params) + { if (!params.memoryResource) params.memoryResource = core::getDefaultMemoryResource(); @@ -55,7 +56,8 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed } //! does not allocate memory, adopts the `data` pointer, no copies done - core::smart_refctd_ptr static create(SCreationParams&& params, core::adopt_memory_t) { + core::smart_refctd_ptr static create(SCreationParams&& params, core::adopt_memory_t) + { if (!params.data) return nullptr; if (!params.memoryResource) @@ -122,6 +124,7 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed if (m_data) m_mem_resource->deallocate(m_data, m_creationParams.size, m_alignment); m_data = nullptr; + m_mem_resource = nullptr; m_creationParams.size = 0ull; } @@ -131,8 +134,7 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed m_mem_resource(params.memoryResource), m_alignment(params.alignment) {} ~ICPUBuffer() override { - if (m_data) - m_mem_resource->deallocate(m_data, m_creationParams.size, m_alignment); + discardContent_impl(); } void* m_data; diff --git a/include/nbl/asset/ICPUMeshBuffer.h b/include/nbl/asset/ICPUMeshBuffer.h index 4ecc0f35ba..532b622090 100644 --- a/include/nbl/asset/ICPUMeshBuffer.h +++ b/include/nbl/asset/ICPUMeshBuffer.h @@ -544,7 +544,7 @@ class ICPUMeshBuffer final : public IMeshBuffer(dst); if (isSignedFormat(format)) { diff --git a/include/nbl/core/alloc/VectorViewNullMemoryResource.h b/include/nbl/core/alloc/VectorViewNullMemoryResource.h index ce61b9b08d..082986886d 100644 --- a/include/nbl/core/alloc/VectorViewNullMemoryResource.h +++ b/include/nbl/core/alloc/VectorViewNullMemoryResource.h @@ -17,10 +17,10 @@ class VectorViewNullMemoryResource : public std::pmr::memory_resource { public: // only create the resource from an already sized vector - VectorViewNullMemoryResource(core::vector&& buffer) : buffer(buffer) + VectorViewNullMemoryResource(core::vector&& buffer) : buffer(std::move(buffer)), already_called(false) { assert(buffer.size()); - }; + } void* data() { return buffer.data(); @@ -28,11 +28,15 @@ class VectorViewNullMemoryResource : public std::pmr::memory_resource protected: void* do_allocate(size_t bytes, size_t alignment) override { - assert(bytes <= buffer.size()); + if (already_called || bytes > buffer.size() || !core::is_aligned_to(bytes, alignment)) + return nullptr; + already_called = true; return buffer.data(); } - void do_deallocate(void* p, size_t bytes, size_t alignment) override {} + void do_deallocate(void* p, size_t bytes, size_t alignment) override { + assert(p == buffer.data()); + } bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override { return this == &other; @@ -40,6 +44,7 @@ class VectorViewNullMemoryResource : public std::pmr::memory_resource private: core::vector buffer; + bool already_called; }; } diff --git a/include/nbl/core/alloc/refctd_memory_resource.h 
b/include/nbl/core/alloc/refctd_memory_resource.h index 627f3b97b9..b1b13179e1 100644 --- a/include/nbl/core/alloc/refctd_memory_resource.h +++ b/include/nbl/core/alloc/refctd_memory_resource.h @@ -11,12 +11,10 @@ #include -using namespace nbl; - namespace nbl::core { -class refctd_memory_resource : public core::IReferenceCounted +class refctd_memory_resource : public IReferenceCounted { public: refctd_memory_resource(std::pmr::memory_resource* pmr) : m_pmr(pmr) {}; diff --git a/src/nbl/asset/ICPUImage.cpp b/src/nbl/asset/ICPUImage.cpp index 7be1c35954..c20404a567 100644 --- a/src/nbl/asset/ICPUImage.cpp +++ b/src/nbl/asset/ICPUImage.cpp @@ -122,7 +122,7 @@ class CFlattenRegionsStreamHashImageFilter : public CMatchedSizeInOutImageFilter auto getScratchAsBuffer = [&memory = state->scratch.memory](size_t size, size_t offset = 0ull) { - return ICPUBuffer::create({ .size = size, .data = (uint8_t*)memory + offset, .memoryResource = core::getNullMemoryResource() }, core::adopt_memory); // adopt memory & don't free it on exit + return ICPUBuffer::create({ .size = size, .data = reinterpret_cast(memory) + offset, .memoryResource = core::getNullMemoryResource() }, core::adopt_memory); // adopt memory & don't free it on exit }; /* @@ -303,7 +303,7 @@ core::blake3_hash_t ICPUImage::computeContentHash() const // TODO, Arek: Matt if you want this templated then assign me to update instead of hardcoding or change it yourself otherwise delete this comment const bool passed = filter.execute(std::execution::par, &state); - _NBL_DELETE_ARRAY((uint8_t*)state.scratch.memory, state.scratch.size); + _NBL_DELETE_ARRAY(reinterpret_cast(state.scratch.memory), state.scratch.size); assert(passed); // actually this should never fail, leaving in case return state.outHash; diff --git a/src/nbl/asset/interchange/CImageWriterJPG.cpp b/src/nbl/asset/interchange/CImageWriterJPG.cpp index 1386fb93e3..f80c01d2d8 100644 --- a/src/nbl/asset/interchange/CImageWriterJPG.cpp +++ b/src/nbl/asset/interchange/CImageWriterJPG.cpp @@ -160,7 +160,7 @@ static bool writeJPEGFile(system::IFile* file, system::ISystem* sys, const asset JSAMPROW row_pointer[1]; /* pointer to JSAMPLE row[s] */ row_pointer[0] = dest; - uint8_t* src = (uint8_t*)convertedImage->getBuffer()->getPointer(); + uint8_t* src = reinterpret_cast(convertedImage->getBuffer()->getPointer()); /* Switch up, write from bottom -> top because the texture is flipped from OpenGL side */ uint32_t eof = cinfo.image_height * cinfo.image_width * cinfo.input_components; diff --git a/src/nbl/asset/interchange/CImageWriterPNG.cpp b/src/nbl/asset/interchange/CImageWriterPNG.cpp index b56862e185..ddb9cc1933 100644 --- a/src/nbl/asset/interchange/CImageWriterPNG.cpp +++ b/src/nbl/asset/interchange/CImageWriterPNG.cpp @@ -177,7 +177,7 @@ bool CImageWriterPNG::writeAsset(system::IFile* _file, const SAssetWriteParams& } } - uint8_t* data = (uint8_t*)convertedImage->getBuffer()->getPointer(); + uint8_t* data = reinterpret_cast(convertedImage->getBuffer()->getPointer()); constexpr uint32_t maxPNGFileHeight = 16u * 1024u; // arbitrary limit if (trueExtent.Y>maxPNGFileHeight) diff --git a/src/nbl/asset/interchange/CImageWriterTGA.cpp b/src/nbl/asset/interchange/CImageWriterTGA.cpp index e35a3bceab..88c723cb58 100644 --- a/src/nbl/asset/interchange/CImageWriterTGA.cpp +++ b/src/nbl/asset/interchange/CImageWriterTGA.cpp @@ -120,7 +120,7 @@ bool CImageWriterTGA::writeAsset(system::IFile* _file, const SAssetWriteParams& return false; } - uint8_t* scan_lines = 
(uint8_t*)convertedImage->getBuffer()->getPointer(); + uint8_t* scan_lines = reinterpret_cast(convertedImage->getBuffer()->getPointer()); if (!scan_lines) return false; diff --git a/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp b/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp index 226db5a0dc..841bd3f2a6 100644 --- a/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp +++ b/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp @@ -283,7 +283,7 @@ SAssetBundle CSTLMeshFileLoader::loadAsset(system::IFile* _file, const IAssetLoa { if (i % 3 == 0) normal = quantNormalCache->quantize(normals[i / 3]); - uint8_t* ptr = ((uint8_t*)(vertexBuf->getPointer())) + i * vtxSize; + uint8_t* ptr = (reinterpret_cast(vertexBuf->getPointer())) + i * vtxSize; memcpy(ptr, positions[i].pointer, 3 * 4); *reinterpret_cast(ptr + 12) = normal; diff --git a/src/nbl/asset/utils/CMeshManipulator.cpp b/src/nbl/asset/utils/CMeshManipulator.cpp index aef3e74e9c..798fbea287 100644 --- a/src/nbl/asset/utils/CMeshManipulator.cpp +++ b/src/nbl/asset/utils/CMeshManipulator.cpp @@ -444,7 +444,7 @@ static bool cmpVertices(ICPUMeshBuffer* _inbuf, const void* _va, const void* _vb constexpr uint32_t MAX_ATTRIBS = ICPUMeshBuffer::MAX_VERTEX_ATTRIB_COUNT; - const uint8_t* va = (uint8_t*)_va, *vb = (uint8_t*)_vb; + const uint8_t* va = reinterpret_cast(_va), *vb = reinterpret_cast(_vb); for (size_t i = 0u; i < MAX_ATTRIBS; ++i) { if (!_inbuf->isAttributeEnabled(i)) @@ -517,7 +517,7 @@ core::smart_refctd_ptr IMeshManipulator::createMeshBufferWelded( uint32_t maxRedirect = 0; - uint8_t* epicData = (uint8_t*)_NBL_ALIGNED_MALLOC(vertexSize*vertexCount,_NBL_SIMD_ALIGNMENT); + uint8_t* epicData = reinterpret_cast(_NBL_ALIGNED_MALLOC(vertexSize*vertexCount,_NBL_SIMD_ALIGNMENT)); for (auto i=0u; i IMeshManipulator::createOptimizedMeshBuff const size_t bufsz = outbuffer->getAttribBoundBuffer(posId).buffer->getSize(); const size_t vertexSize = pipeline->getCachedCreationParams().vertexInput.bindings[0].stride; - uint8_t* const v = (uint8_t*)(outbuffer->getAttribBoundBuffer(posId).buffer->getPointer()); // after prefetch optim. we have guarantee of single vertex buffer so we can do like this - uint8_t* const vCopy = (uint8_t*)_NBL_ALIGNED_MALLOC(bufsz, _NBL_SIMD_ALIGNMENT); + uint8_t* const v = reinterpret_cast(outbuffer->getAttribBoundBuffer(posId).buffer->getPointer()); // after prefetch optim. 
we have guarantee of single vertex buffer so we can do like this + uint8_t* const vCopy = reinterpret_cast(_NBL_ALIGNED_MALLOC(bufsz, _NBL_SIMD_ALIGNMENT)); memcpy(vCopy, v, bufsz); size_t baseVtx = outbuffer->getBaseVertex(); @@ -894,7 +894,7 @@ void CMeshManipulator::_filterInvalidTriangles(ICPUMeshBuffer* _input) struct Triangle { IdxT i[3]; - } *const begin = (Triangle*)copy, *const end = (Triangle*)((uint8_t*)copy + size); + } *const begin = (Triangle*)copy, *const end = (Triangle*)(reinterpret_cast(copy) + size); Triangle* const newEnd = std::remove_if(begin, end, [&_input](const Triangle& _t) { diff --git a/src/nbl/asset/utils/IShaderCompiler.cpp b/src/nbl/asset/utils/IShaderCompiler.cpp index 1d7d193eaf..cf146cfdcc 100644 --- a/src/nbl/asset/utils/IShaderCompiler.cpp +++ b/src/nbl/asset/utils/IShaderCompiler.cpp @@ -155,7 +155,7 @@ auto IShaderCompiler::CIncludeFinder::getIncludeStandard(const system::path& req core::blake3_hasher hasher; - hasher.update((uint8_t*)(retVal.contents.data()), retVal.contents.size() * (sizeof(char) / sizeof(uint8_t))); + hasher.update(reinterpret_cast(retVal.contents.data()), retVal.contents.size() * (sizeof(char) / sizeof(uint8_t))); retVal.hash = static_cast(hasher); return retVal; } @@ -171,7 +171,7 @@ auto IShaderCompiler::CIncludeFinder::getIncludeRelative(const system::path& req else retVal = std::move(trySearchPaths(includeName)); core::blake3_hasher hasher; - hasher.update((uint8_t*)(retVal.contents.data()), retVal.contents.size() * (sizeof(char) / sizeof(uint8_t))); + hasher.update(reinterpret_cast(retVal.contents.data()), retVal.contents.size() * (sizeof(char) / sizeof(uint8_t))); retVal.hash = static_cast(hasher); return retVal; } @@ -327,7 +327,8 @@ core::smart_refctd_ptr IShaderCompiler::CCache::serialize() const uint64_t dumpedContainerJsonLength = dumpedContainerJson.size(); // Create a buffer able to hold all shaders + the containerJson - core::vector retVal(shaderBufferSize + SHADER_BUFFER_SIZE_BYTES + dumpedContainerJsonLength); + size_t retValSize = shaderBufferSize + SHADER_BUFFER_SIZE_BYTES + dumpedContainerJsonLength; + core::vector retVal(retValSize); // first SHADER_BUFFER_SIZE_BYTES (8) in the buffer are the size of the shader buffer memcpy(retVal.data(), &shaderBufferSize, SHADER_BUFFER_SIZE_BYTES); @@ -343,7 +344,7 @@ core::smart_refctd_ptr IShaderCompiler::CCache::serialize() const memcpy(retVal.data() + SHADER_BUFFER_SIZE_BYTES + shaderBufferSize, dumpedContainerJson.data(), dumpedContainerJsonLength); auto memoryResource = new core::VectorViewNullMemoryResource(std::move(retVal)); - return ICPUBuffer::create({ .size = retVal.size(), .data = retVal.data(), .memoryResource = core::make_smart_refctd_ptr(memoryResource) }); + return ICPUBuffer::create({ .size = retValSize, .data = memoryResource->data(), .memoryResource = core::make_smart_refctd_ptr(memoryResource) }); } core::smart_refctd_ptr IShaderCompiler::CCache::deserialize(const std::span serializedCache) diff --git a/src/nbl/core/alloc/refctd_memory_resource.cpp b/src/nbl/core/alloc/refctd_memory_resource.cpp index 124465e645..ecf7549957 100644 --- a/src/nbl/core/alloc/refctd_memory_resource.cpp +++ b/src/nbl/core/alloc/refctd_memory_resource.cpp @@ -7,11 +7,11 @@ using namespace nbl; using namespace core; -static smart_refctd_ptr null_memory_resource = nullptr; static smart_refctd_ptr default_memory_resource = nullptr; smart_refctd_ptr core::getNullMemoryResource() { + static smart_refctd_ptr null_memory_resource = nullptr; if (!null_memory_resource) 
null_memory_resource = make_smart_refctd_ptr(std::pmr::null_memory_resource()); return null_memory_resource; diff --git a/src/nbl/system/CArchiveLoaderTar.cpp b/src/nbl/system/CArchiveLoaderTar.cpp index 6ec9831717..25a3bfd6df 100644 --- a/src/nbl/system/CArchiveLoaderTar.cpp +++ b/src/nbl/system/CArchiveLoaderTar.cpp @@ -76,7 +76,7 @@ bool CArchiveLoaderTar::isALoadableFileFormat(IFile* file) const memset(fHead.Checksum, ' ', 8); // old header - for (uint8_t* p = (uint8_t*)(&fHead); p < (uint8_t*)(&fHead.Magic[0]); ++p) + for (uint8_t* p = reinterpret_cast(&fHead); p < reinterpret_cast(&fHead.Magic[0]); ++p) { checksum1 += *p; checksum2 += char(*p); @@ -84,7 +84,7 @@ bool CArchiveLoaderTar::isALoadableFileFormat(IFile* file) const if (!strncmp(fHead.Magic, "ustar", 5)) { - for (uint8_t* p = (uint8_t*)(&fHead.Magic[0]); p < (uint8_t*)(&fHead) + sizeof(fHead); ++p) + for (uint8_t* p = reinterpret_cast(&fHead.Magic[0]); p < reinterpret_cast(&fHead) + sizeof(fHead); ++p) { checksum1 += *p; checksum2 += char(*p); From 22630d0ba1a8a36a1a3265b9b27b9c15b0c83718 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Thu, 14 Nov 2024 15:06:50 -0800 Subject: [PATCH 192/358] Identity matrices refactor --- include/nbl/asset/IAccelerationStructure.h | 4 +- .../hlsl/matrix_utils/matrix_traits.hlsl | 30 +++++++++ .../transformation_matrix_utils.hlsl | 66 +++++++++++++++++-- 3 files changed, 94 insertions(+), 6 deletions(-) create mode 100644 include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl diff --git a/include/nbl/asset/IAccelerationStructure.h b/include/nbl/asset/IAccelerationStructure.h index 9dc07b2675..b3fb583796 100644 --- a/include/nbl/asset/IAccelerationStructure.h +++ b/include/nbl/asset/IAccelerationStructure.h @@ -198,13 +198,13 @@ class ITopLevelAccelerationStructure : public AccelerationStructure template struct StaticInstance final { - hlsl::float32_t3x4 transform = hlsl::IdentityFloat32_t3x4; + hlsl::float32_t3x4 transform = hlsl::float32_t3x4_identity; Instance base = {}; }; template struct MatrixMotionInstance final { - hlsl::float32_t3x4 transform[2] = { hlsl::IdentityFloat32_t3x4, hlsl::IdentityFloat32_t3x4 }; + hlsl::float32_t3x4 transform[2] = { hlsl::float32_t3x4_identity, hlsl::float32_t3x4_identity }; Instance base = {}; }; struct SRT diff --git a/include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl b/include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl new file mode 100644 index 0000000000..5a40089c86 --- /dev/null +++ b/include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl @@ -0,0 +1,30 @@ +#ifndef _NBL_BUILTIN_HLSL_MATRIX_UTILS_MATRIX_TRAITS_INCLUDED_ +#define _NBL_BUILTIN_HLSL_MATRIX_UTILS_MATRIX_TRAITS_INCLUDED_ + +#include +#include +// TODO: remove this header when deleting vectorSIMDf.hlsl +#include +#include "vectorSIMD.h" +#include + +namespace nbl +{ +namespace hlsl +{ + +template +struct matrix_traits; +// partial spec for matrices +template +struct matrix_traits > +{ + NBL_CONSTEXPR_STATIC_INLINE uint32_t RowCount = N; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ColumnCount = M; + NBL_CONSTEXPR_STATIC_INLINE bool Square = N == M; +}; + +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl index d3d7d7fa04..8858ada6d2 100644 --- a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl +++ b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl @@ -6,16 +6,74 @@ // TODO: remove this header 
when deleting vectorSIMDf.hlsl #include #include "vectorSIMD.h" +#include +#include +#include namespace nbl { namespace hlsl { + // goal: identity::value + // plan: diagonal + // identity + // + // partial spec: + // template + // identity + +namespace transfromation_matrix_utils_impl +{ +template +matrix diagonal(float diagonal) +{ + using MatT = matrix; + MatT output; + + for (int i = 0; i < matrix_traits::RowCount; ++i) + for (int j = 0; j < matrix_traits::ColumnCount; ++j) + output[i][j] = 0; + + auto a = matrix_traits::RowCount; + auto b = matrix_traits::ColumnCount; + + for (int diag = 0; diag < matrix_traits::RowCount; ++diag) + output[diag][diag] = diagonal; + + return output; +}; +} -NBL_CONSTEXPR hlsl::float32_t3x4 IdentityFloat32_t3x4 = - hlsl::float32_t3x4(hlsl::float32_t4(1, 0, 0, 0), hlsl::float32_t4(0, 0, 1, 0), hlsl::float32_t4(0, 0, 1, 0)); -NBL_CONSTEXPR hlsl::float32_t4x4 IdentityFloat32_t4x4 = - hlsl::float32_t4x4(hlsl::float32_t4(1, 0, 0, 0), hlsl::float32_t4(0, 0, 1, 0), hlsl::float32_t4(0, 0, 1, 0), hlsl::float32_t4(0, 0, 0, 1)); +template +struct identity; + +template +struct identity > +{ + static matrix get() + { + return transfromation_matrix_utils_impl::diagonal(1); + } +}; + +#define IDENTITY_MATRIX(TYPE, N, M)\ +const matrix TYPE ## N ## x ## M ## _identity = identity >::get(); + +#define DEFINE_IDENTITY_MATRICES(TYPE)\ +IDENTITY_MATRIX(TYPE, 2, 2)\ +IDENTITY_MATRIX(TYPE, 3, 3)\ +IDENTITY_MATRIX(TYPE, 4, 4)\ +IDENTITY_MATRIX(TYPE, 3, 4) + +DEFINE_IDENTITY_MATRICES(float32_t) +DEFINE_IDENTITY_MATRICES(float64_t) +DEFINE_IDENTITY_MATRICES(int32_t) +DEFINE_IDENTITY_MATRICES(int64_t) +DEFINE_IDENTITY_MATRICES(uint32_t) +DEFINE_IDENTITY_MATRICES(uint64_t) + +#undef DEFINE_IDENTITY_MATRICES +#undef IDENTITY_MATRIX // TODO: this is temporary function, delete when removing vectorSIMD template From 09833d9b02790ea5985eadd344dc9a15e93f639d Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Thu, 14 Nov 2024 20:19:27 -0300 Subject: [PATCH 193/358] Fix build --- src/nbl/video/utilities/CAssetConverter.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index bb6e0037d8..3f11842826 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2979,7 +2979,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } // wipe gpu item in staging cache (this may drop it as well if it was made for only a root asset == no users) - auto findInStaging = [&reservations](const asset_traits::video_t* gpuObj)->core::blake3_hash_t* + auto findInStaging = [&reservations](const typename asset_traits::video_t* gpuObj)->core::blake3_hash_t* { auto& stagingCache = std::get>(reservations.m_stagingCaches); const auto found = stagingCache.find(const_cast::video_t*>(gpuObj)); @@ -3628,7 +3628,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // want to check if deps successfully exist - auto missingDependent = [&reservations](const asset_traits::video_t* dep)->bool + auto missingDependent = [&reservations](const typename asset_traits::video_t* dep)->bool { auto& stagingCache = std::get>(reservations.m_stagingCaches); auto found = stagingCache.find(const_cast::video_t*>(dep)); From afe18dd88bc41621be5cf3d8535ea46e426f186a Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Fri, 15 Nov 2024 11:47:02 +0330 Subject: [PATCH 194/358] asset: pass correct arg to is_aligned Signed-off-by: Ali Cheraghi --- 
.../nbl/core/alloc/VectorViewNullMemoryResource.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/nbl/core/alloc/VectorViewNullMemoryResource.h b/include/nbl/core/alloc/VectorViewNullMemoryResource.h index 082986886d..936534f18d 100644 --- a/include/nbl/core/alloc/VectorViewNullMemoryResource.h +++ b/include/nbl/core/alloc/VectorViewNullMemoryResource.h @@ -27,18 +27,22 @@ class VectorViewNullMemoryResource : public std::pmr::memory_resource } protected: - void* do_allocate(size_t bytes, size_t alignment) override { - if (already_called || bytes > buffer.size() || !core::is_aligned_to(bytes, alignment)) + void* do_allocate(size_t bytes, size_t alignment) override + { + if (already_called || bytes > buffer.size() || !core::is_aligned_to(buffer.data(), alignment)) return nullptr; already_called = true; return buffer.data(); } - void do_deallocate(void* p, size_t bytes, size_t alignment) override { + void do_deallocate(void* p, size_t bytes, size_t alignment) override + { assert(p == buffer.data()); + already_called = false; } - bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override { + bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override + { return this == &other; } From 80e94fb6677fe049a531a00088e07e3235c97d06 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 15 Nov 2024 16:19:57 +0700 Subject: [PATCH 195/358] latest ray query geom creator example --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index ec7cfa853a..71a603f7f5 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit ec7cfa853a06beaad504e4c31ee32c79a6160172 +Subproject commit 71a603f7f5fa6f29e0eaeda23c047483996c3a69 From 8736be121dbe0f85a497ffd181f451279fc821a2 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 15 Nov 2024 16:46:38 +0700 Subject: [PATCH 196/358] ray query geom creator initial complete --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 71a603f7f5..9a160b3188 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 71a603f7f5fa6f29e0eaeda23c047483996c3a69 +Subproject commit 9a160b318820487a94e0ec64e17ec687ea4434d3 From 72f847d0aa649f67d453f9c63cf93185b6c91d9e Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 15 Nov 2024 16:43:36 +0100 Subject: [PATCH 197/358] post merge examples_tests pointer correction --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 0df6008c92..643883c334 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 0df6008c923712ed729b47cf7711c6301622fdb3 +Subproject commit 643883c334bb4d95df3b9736e24d70bfdbd14d78 From 22e86a763d587c964fea868b77171d4d7aea4188 Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 16 Nov 2024 01:15:18 +0100 Subject: [PATCH 198/358] fix for MSVC 17.12.0 regression in handling partial specs constrained by `requires` and concepts --- examples_tests | 2 +- src/nbl/video/utilities/CAssetConverter.cpp | 69 +++++++++++++++++---- 2 files changed, 58 insertions(+), 13 deletions(-) diff --git a/examples_tests b/examples_tests index 643883c334..22f9294f9a 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 643883c334bb4d95df3b9736e24d70bfdbd14d78 +Subproject commit 22f9294f9ad1c35f5909411248dd08c7e1ebbbe2 diff --git a/src/nbl/video/utilities/CAssetConverter.cpp 
b/src/nbl/video/utilities/CAssetConverter.cpp index 3f11842826..cbbfe09a7c 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -1500,13 +1500,59 @@ class GetDependantVisit : public GetDependantVisitBase requires nbl::is_any_of_v -class GetDependantVisit : public GetDependantVisitBase +template<> +class GetDependantVisit : public GetDependantVisitBase { - using base_t = GetDependantVisitBase; + public: +// using AssetType = ICPUComputePipeline; + + inline auto& getSpecInfo(const IShader::E_SHADER_STAGE stage) + { + assert(hlsl::bitCount(stage)==1); + return specInfo[hlsl::findLSB(stage)]; + } + + // ok to do non owning since some cache owns anyway + IGPUPipelineLayout* layout = nullptr; + // has to be public to allow for initializer list constructor + std::array::value*/sizeof(IShader::E_SHADER_STAGE)*8> specInfo = {}; + protected: + bool descend_impl( + const instance_t& user, const CAssetConverter::patch_t& userPatch, + const instance_t& dep, const CAssetConverter::patch_t& soloPatch + ) + { + auto depObj = getDependant(dep,soloPatch); + if (!depObj) + return false; + layout = depObj.get(); + return true; + } + bool descend_impl( + const instance_t& user, const CAssetConverter::patch_t& userPatch, + const instance_t& dep, const CAssetConverter::patch_t& soloPatch, + const IShader::E_SHADER_STAGE stage, const IShader::SSpecInfo& inSpecInfo + ) + { + auto depObj = getDependant(dep,soloPatch); + if (!depObj) + return false; + getSpecInfo(stage) = { + .entryPoint = inSpecInfo.entryPoint, + .shader = depObj.get(), + .entries = inSpecInfo.entries, + .requiredSubgroupSize = inSpecInfo.requiredSubgroupSize, + .requireFullSubgroups = inSpecInfo.requireFullSubgroups + }; + return true; + } +}; +template<> +class GetDependantVisit : public GetDependantVisitBase +{ public: - using AssetType = AssetT; +// using AssetType = ICPUGraphicsPipeline; inline auto& getSpecInfo(const IShader::E_SHADER_STAGE stage) { @@ -1523,23 +1569,23 @@ class GetDependantVisit : public GetDependantVisitBase protected: bool descend_impl( - const instance_t& user, const CAssetConverter::patch_t& userPatch, + const instance_t& user, const CAssetConverter::patch_t& userPatch, const instance_t& dep, const CAssetConverter::patch_t& soloPatch ) { - auto depObj = base_t::getDependant(dep,soloPatch); + auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; layout = depObj.get(); return true; } bool descend_impl( - const instance_t& user, const CAssetConverter::patch_t& userPatch, + const instance_t& user, const CAssetConverter::patch_t& userPatch, const instance_t& dep, const CAssetConverter::patch_t& soloPatch, const IShader::E_SHADER_STAGE stage, const IShader::SSpecInfo& inSpecInfo ) { - auto depObj = base_t::getDependant(dep,soloPatch); + auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; getSpecInfo(stage) = { @@ -1547,17 +1593,16 @@ class GetDependantVisit : public GetDependantVisitBase .shader = depObj.get(), .entries = inSpecInfo.entries, .requiredSubgroupSize = inSpecInfo.requiredSubgroupSize, - .requireFullSubgroups = stage==IShader::E_SHADER_STAGE::ESS_COMPUTE ? 
inSpecInfo.requireFullSubgroups:uint8_t(0) + .requireFullSubgroups = 0 }; return true; } - template requires (std::is_same_v&& std::is_same_v) bool descend_impl( - const instance_t& user, const CAssetConverter::patch_t& userPatch, + const instance_t& user, const CAssetConverter::patch_t& userPatch, const instance_t& dep, const CAssetConverter::patch_t& soloPatch ) { - auto depObj = base_t::getDependant(dep,soloPatch); + auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; renderpass = depObj.get(); From 6789fa74e4029e9e030fc9ed9af189b369db9bad Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 18 Nov 2024 15:45:34 +0700 Subject: [PATCH 199/358] use snorm unpack instead --- examples_tests | 2 +- include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples_tests b/examples_tests index 9a160b3188..f779e6912d 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 9a160b318820487a94e0ec64e17ec687ea4434d3 +Subproject commit f779e6912d70b8d87cc718d2b03832d2b556fcfe diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl index e3e59b466b..a6df85e4fa 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl @@ -197,8 +197,8 @@ SquareMatrix matrixInverse(NBL_CONST_REF_ARG(SquareMatrix) mat); [[vk::ext_instruction(GLSLstd450UnpackSnorm2x16, "GLSL.std.450")]] float32_t2 unpackSnorm2x16(uint32_t p); -[[vk::ext_instruction(GLSLstd450UnpackUnorm4x8, "GLSL.std.450")]] -float32_t4 unpackUnorm4x8(uint32_t p); +[[vk::ext_instruction(GLSLstd450UnpackSnorm4x8, "GLSL.std.450")]] +float32_t4 unpackSnorm4x8(uint32_t p); // Memory Semantics link here: https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id- From f36574a8614e4c2ced37bc9752d9fa9eb2cb91f5 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 18 Nov 2024 17:46:18 +0700 Subject: [PATCH 200/358] added allocation flag check for device address --- examples_tests | 2 +- include/nbl/video/utilities/IUtilities.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples_tests b/examples_tests index f779e6912d..502ddac630 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit f779e6912d70b8d87cc718d2b03832d2b556fcfe +Subproject commit 502ddac630b91819d07cd835bb6ffc12c9e7519d diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 3cd24a6d44..8ef38fd080 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -390,7 +390,9 @@ class NBL_API2 IUtilities : public core::IReferenceCounted auto buffer = m_device->createBuffer(std::move(params)); auto mreqs = buffer->getMemoryReqs(); mreqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto mem = m_device->allocate(mreqs,buffer.get()); + auto allocFlags = (params.usage & asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) ? 
+ IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT : IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE; + auto mem = m_device->allocate(mreqs,buffer.get(), allocFlags); auto submitSuccess = autoSubmit( submit, From 932a258f33a7ae622fcbb46adc69446346183ce0 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 18 Nov 2024 14:53:44 +0100 Subject: [PATCH 201/358] fix IMGUI for new ICPUBuffer API --- src/nbl/ext/ImGui/ImGui.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/ext/ImGui/ImGui.cpp b/src/nbl/ext/ImGui/ImGui.cpp index 9e9f9f2e55..50a57f85fa 100644 --- a/src/nbl/ext/ImGui/ImGui.cpp +++ b/src/nbl/ext/ImGui/ImGui.cpp @@ -434,7 +434,7 @@ smart_refctd_ptr UI::createFontAtlasTexture(const SCreationParame // set its contents { const size_t image_size = getTexelOrBlockBytesize(NBL_FORMAT_FONT)*width*height; - auto buffer = make_smart_refctd_ptr>>(image_size,pixels,adopt_memory); + auto buffer = ICPUBuffer::create({.size=image_size,.data=pixels,.alignment=alignof(uint32_t),.memoryResource=core::getNullMemoryResource()},adopt_memory); auto regions = make_refctd_dynamic_array>(1ull); { auto region = regions->begin(); From ec1db6823351d9e72172247aa2b05e5a180a29a3 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Mon, 18 Nov 2024 15:29:01 +0100 Subject: [PATCH 202/358] update imguizmo with its upstream --- 3rdparty/CMakeLists.txt | 13 +++++++++++-- 3rdparty/imguizmo | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 7b9b6da784..1d0b3fee0f 100755 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -383,8 +383,17 @@ if(NBL_BUILD_IMGUI) target_link_libraries(imtestengine PUBLIC imtestsuite) - set(IMGUIZMO_BUILD_EXAMPLE OFF) - add_subdirectory(imguizmo EXCLUDE_FROM_ALL) + # imguizmo + add_library(imguizmo + "${CMAKE_CURRENT_SOURCE_DIR}/imguizmo/GraphEditor.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/imguizmo/ImCurveEdit.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/imguizmo/ImGradient.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/imguizmo/ImGuizmo.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/imguizmo/ImSequencer.cpp" + ) + + target_include_directories(imguizmo PUBLIC $) + target_link_libraries(imguizmo PUBLIC imgui) # note we override imgui config with our own set(NBL_IMGUI_USER_CONFIG_FILEPATH "${NBL_IMGUI_ROOT}/nabla_imconfig.h") diff --git a/3rdparty/imguizmo b/3rdparty/imguizmo index 6f4b2197ef..b10e91756d 160000 --- a/3rdparty/imguizmo +++ b/3rdparty/imguizmo @@ -1 +1 @@ -Subproject commit 6f4b2197efd715d16b19775b00f36c6c6f5aacb6 +Subproject commit b10e91756d32395f5c1fefd417899b657ed7cb88 From b9905c29d46cdd97acd5c71e06f7dc8cf2d43383 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 19 Nov 2024 10:50:58 +0700 Subject: [PATCH 203/358] latest example and kept in UnpackUnorm4x8 --- examples_tests | 2 +- include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 502ddac630..41937881f9 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 502ddac630b91819d07cd835bb6ffc12c9e7519d +Subproject commit 41937881f94d7fe89e2f20cf54a9169e9968224c diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl index a6df85e4fa..5c7d730094 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl @@ -200,6 +200,9 @@ float32_t2 unpackSnorm2x16(uint32_t p); 
[[vk::ext_instruction(GLSLstd450UnpackSnorm4x8, "GLSL.std.450")]] float32_t4 unpackSnorm4x8(uint32_t p); +[[vk::ext_instruction(GLSLstd450UnpackUnorm4x8, "GLSL.std.450")]] +float32_t4 unpackUnorm4x8(uint32_t p); + // Memory Semantics link here: https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id- // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_memory_semantics_id From 69cd5d2116fee8e04cf8cd91d7aeb59c9ad9d29d Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 19 Nov 2024 15:39:23 +0700 Subject: [PATCH 204/358] restructured integration --- include/nbl/video/IAPIConnection.h | 18 ++++-- include/nbl/video/utilities/ngfx.h | 85 ----------------------------- src/nbl/video/CVulkanConnection.cpp | 3 +- src/nbl/video/IAPIConnection.cpp | 77 +++++++++++++++++++++++++- 4 files changed, 90 insertions(+), 93 deletions(-) delete mode 100644 include/nbl/video/utilities/ngfx.h diff --git a/include/nbl/video/IAPIConnection.h b/include/nbl/video/IAPIConnection.h index 840944bab2..1ec862e05b 100644 --- a/include/nbl/video/IAPIConnection.h +++ b/include/nbl/video/IAPIConnection.h @@ -11,7 +11,6 @@ #include "nbl/video/debug/IDebugCallback.h" #include "nbl/video/utilities/renderdoc.h" -#include "nbl/video/utilities/ngfx.h" namespace nbl::video { @@ -61,16 +60,13 @@ class NBL_API2 IAPIConnection : public core::IReferenceCounted const SFeatures& getEnabledFeatures() const { return m_enabledFeatures; } - enum DebuggerType + enum SDebuggerType { EDT_NONE, EDT_RENDERDOC, EDT_NGFX }; - const DebuggerType isRunningInGraphicsDebugger() const { - return m_ngfx_api.useNGFX ? EDT_NGFX : // ngfx takes priority? - m_rdoc_api ? EDT_RENDERDOC : EDT_NONE; - } + const SDebuggerType isRunningInGraphicsDebugger() const { return m_debuggerType; } virtual bool startCapture() = 0; virtual bool endCapture() = 0; @@ -78,8 +74,18 @@ class NBL_API2 IAPIConnection : public core::IReferenceCounted IAPIConnection(const SFeatures& enabledFeatures); std::vector> m_physicalDevices; + SDebuggerType m_debuggerType; renderdoc_api_t* m_rdoc_api; + + struct SNGFXIntegration { + bool useNGFX; + + bool injectNGFXToProcess(); + bool executeNGFXCommand(); + }; + using ngfx_api_t = SNGFXIntegration; ngfx_api_t m_ngfx_api; + SFeatures m_enabledFeatures = {}; }; diff --git a/include/nbl/video/utilities/ngfx.h b/include/nbl/video/utilities/ngfx.h deleted file mode 100644 index 91eabfa2b3..0000000000 --- a/include/nbl/video/utilities/ngfx.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef _NBL_VIDEO_UTILITIES_NGFX_H_INCLUDED_ -#define _NBL_VIDEO_UTILITIES_NGFX_H_INCLUDED_ - -// TODO: hopefully this is temporary -#include "C:\Program Files\NVIDIA Corporation\Nsight Graphics 2024.1.0\SDKs\NsightGraphicsSDK\0.8.0\include\NGFX_Injection.h" - -namespace nbl::video -{ - struct SNGFXIntegration - { - bool useNGFX; - NGFX_Injection_InstallationInfo versionInfo; - }; - - inline bool injectNGFXToProcess(SNGFXIntegration& api) - { - uint32_t numInstallations = 0; - auto result = NGFX_Injection_EnumerateInstallations(&numInstallations, nullptr); - if (numInstallations == 0 || NGFX_INJECTION_RESULT_OK != result) - { - api.useNGFX = false; - return false; - } - - std::vector installations(numInstallations); - result = NGFX_Injection_EnumerateInstallations(&numInstallations, installations.data()); - if (numInstallations == 0 || NGFX_INJECTION_RESULT_OK != result) - { - api.useNGFX = false; - return false; - } - - // get latest installation - api.versionInfo = installations.back(); - - uint32_t numActivities = 0; - result = 
NGFX_Injection_EnumerateActivities(&api.versionInfo, &numActivities, nullptr); - if (numActivities == 0 || NGFX_INJECTION_RESULT_OK != result) - { - api.useNGFX = false; - return false; - } - - std::vector activities(numActivities); - result = NGFX_Injection_EnumerateActivities(&api.versionInfo, &numActivities, activities.data()); - if (NGFX_INJECTION_RESULT_OK != result) - { - api.useNGFX = false; - return false; - } - - const NGFX_Injection_Activity* pActivityToInject = nullptr; - for (const NGFX_Injection_Activity& activity : activities) - { - if (activity.type == NGFX_INJECTION_ACTIVITY_FRAME_DEBUGGER) // only want frame debugger - { - pActivityToInject = &activity; - break; - } - } - - if (!pActivityToInject) { - api.useNGFX = false; - return false; - } - - result = NGFX_Injection_InjectToProcess(&api.versionInfo, pActivityToInject); - if (NGFX_INJECTION_RESULT_OK != result) - { - api.useNGFX = false; - return false; - } - - return true; - } - - inline void executeNGFXCommand() - { - NGFX_Injection_ExecuteActivityCommand(); - } - - using ngfx_api_t = SNGFXIntegration; -} - -#endif //_NBL_VIDEO_UTILITIES_NGFX_H_INCLUDED_ \ No newline at end of file diff --git a/src/nbl/video/CVulkanConnection.cpp b/src/nbl/video/CVulkanConnection.cpp index e1a33a1418..da2e08b6bc 100644 --- a/src/nbl/video/CVulkanConnection.cpp +++ b/src/nbl/video/CVulkanConnection.cpp @@ -339,7 +339,7 @@ bool CVulkanConnection::startCapture() if (debugType == EDT_RENDERDOC) m_rdoc_api->StartFrameCapture(RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(m_vkInstance), NULL); else - executeNGFXCommand(); + m_ngfx_api.executeNGFXCommand(); return true; } @@ -360,6 +360,7 @@ bool CVulkanConnection::endCapture() if (debugType == EDT_RENDERDOC) m_rdoc_api->EndFrameCapture(RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(m_vkInstance), NULL); // no equivalent end frame capture for ngfx, ends captures on next frame delimiter + // see https://www.reddit.com/r/GraphicsProgramming/comments/w0hl9o/graphics_debugger_record_before_first_frame/ flag.clear(); return true; } diff --git a/src/nbl/video/IAPIConnection.cpp b/src/nbl/video/IAPIConnection.cpp index 8dc156bb94..e679363d53 100644 --- a/src/nbl/video/IAPIConnection.cpp +++ b/src/nbl/video/IAPIConnection.cpp @@ -4,6 +4,9 @@ #include "nbl/video/utilities/renderdoc.h" #include "nbl/video/utilities/ngfx.h" +// TODO: temporary hopefully +#include "C:\Program Files\NVIDIA Corporation\Nsight Graphics 2024.1.0\SDKs\NsightGraphicsSDK\0.8.0\include\NGFX_Injection.h" + #if defined(_NBL_POSIX_API_) #include #endif @@ -11,6 +14,7 @@ namespace nbl::video { + std::span IAPIConnection::getPhysicalDevices() const { static_assert(sizeof(std::unique_ptr) == sizeof(void*)); @@ -46,8 +50,79 @@ IAPIConnection::IAPIConnection(const SFeatures& enabledFeatures) #endif // probably is platform agnostic, for now - injectNGFXToProcess(m_ngfx_api); + m_ngfx_api.injectNGFXToProcess(); + + m_debuggerType = m_ngfx_api.useNGFX ? EDT_NGFX : // ngfx takes priority? + m_rdoc_api ? 
EDT_RENDERDOC : EDT_NONE; + } +} + +bool IAPIConnection::SNGFXIntegration::injectNGFXToProcess() +{ + uint32_t numInstallations = 0; + auto result = NGFX_Injection_EnumerateInstallations(&numInstallations, nullptr); + if (numInstallations == 0 || NGFX_INJECTION_RESULT_OK != result) + { + useNGFX = false; + return false; + } + + std::vector installations(numInstallations); + result = NGFX_Injection_EnumerateInstallations(&numInstallations, installations.data()); + if (numInstallations == 0 || NGFX_INJECTION_RESULT_OK != result) + { + useNGFX = false; + return false; + } + + // get latest installation + NGFX_Injection_InstallationInfo versionInfo = installations.back(); + + uint32_t numActivities = 0; + result = NGFX_Injection_EnumerateActivities(&versionInfo, &numActivities, nullptr); + if (numActivities == 0 || NGFX_INJECTION_RESULT_OK != result) + { + useNGFX = false; + return false; } + + std::vector activities(numActivities); + result = NGFX_Injection_EnumerateActivities(&versionInfo, &numActivities, activities.data()); + if (NGFX_INJECTION_RESULT_OK != result) + { + useNGFX = false; + return false; + } + + const NGFX_Injection_Activity* pActivityToInject = nullptr; + for (const NGFX_Injection_Activity& activity : activities) + { + if (activity.type == NGFX_INJECTION_ACTIVITY_FRAME_DEBUGGER) // only want frame debugger + { + pActivityToInject = &activity; + break; + } + } + + if (!pActivityToInject) { + useNGFX = false; + return false; + } + + result = NGFX_Injection_InjectToProcess(&versionInfo, pActivityToInject); + if (NGFX_INJECTION_RESULT_OK != result) + { + useNGFX = false; + return false; + } + + useNGFX = true; + return true; +} + +bool IAPIConnection::SNGFXIntegration::executeNGFXCommand() +{ + return NGFX_Injection_ExecuteActivityCommand() == NGFX_INJECTION_RESULT_OK; } } \ No newline at end of file From 65bbad8adc7d9866381f7ea604df85ab4c81283d Mon Sep 17 00:00:00 2001 From: Fletterio Date: Tue, 19 Nov 2024 14:51:30 -0300 Subject: [PATCH 205/358] Adds ternary op for complex numbers --- include/nbl/builtin/hlsl/complex.hlsl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/nbl/builtin/hlsl/complex.hlsl b/include/nbl/builtin/hlsl/complex.hlsl index 249013b1b9..69d16bf850 100644 --- a/include/nbl/builtin/hlsl/complex.hlsl +++ b/include/nbl/builtin/hlsl/complex.hlsl @@ -379,6 +379,19 @@ complex_t rotateRight(NBL_CONST_REF_ARG(complex_t) value) return retVal; } +// Annoyed at having to write a lot of boilerplate to do a select +// Essentially returns what you'd expect from doing `condition ? a : b` +template +complex_t ternaryOperator(bool condition, NBL_CONST_REF_ARG(complex_t) a, NBL_CONST_REF_ARG(complex_t) b) +{ + const vector aVector = vector(a.real(), a.imag()); + const vector bVector = vector(b.real(), b.imag()); + const vector resultVector = condition ? 
aVector : bVector; + const complex_t result = { resultVector.x, resultVector.y }; + return result; +} + + } } From 13dd52d1c1e696450df2cda89e1f137847b58cc4 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Tue, 19 Nov 2024 17:01:40 -0300 Subject: [PATCH 206/358] Restore submodule pointer --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 22f9294f9a..2cd9e91a4b 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 22f9294f9ad1c35f5909411248dd08c7e1ebbbe2 +Subproject commit 2cd9e91a4b2cb96d22fda18d1d8ab10d1c423328 From f1b65e9da97ce8b35a1d80f0cf4b5076035da47e Mon Sep 17 00:00:00 2001 From: Fletterio Date: Tue, 19 Nov 2024 17:25:15 -0300 Subject: [PATCH 207/358] VectorViewNullMemoryResource fix --- include/nbl/core/alloc/VectorViewNullMemoryResource.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/core/alloc/VectorViewNullMemoryResource.h b/include/nbl/core/alloc/VectorViewNullMemoryResource.h index 936534f18d..44b7da152b 100644 --- a/include/nbl/core/alloc/VectorViewNullMemoryResource.h +++ b/include/nbl/core/alloc/VectorViewNullMemoryResource.h @@ -17,7 +17,7 @@ class VectorViewNullMemoryResource : public std::pmr::memory_resource { public: // only create the resource from an already sized vector - VectorViewNullMemoryResource(core::vector&& buffer) : buffer(std::move(buffer)), already_called(false) + VectorViewNullMemoryResource(core::vector&& _buffer) : buffer(std::move(_buffer)), already_called(false) { assert(buffer.size()); } From 2617c164ad5ea6c948496aa573282e9ff6ce2018 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Tue, 19 Nov 2024 14:25:46 -0800 Subject: [PATCH 208/358] Fixed templated functions --- examples_tests | 2 +- include/nbl/asset/IAccelerationStructure.h | 4 +- include/nbl/asset/utils/IMeshManipulator.h | 3 +- include/nbl/builtin/hlsl/concepts.hlsl | 1 - .../nbl/builtin/hlsl/cpp_compat/matrix.hlsl | 1 + .../nbl/builtin/hlsl/cpp_compat/unroll.hlsl | 12 ++ .../hlsl/matrix_utils/matrix_traits.hlsl | 36 +++-- .../transformation_matrix_utils.hlsl | 131 +++++++----------- include/nbl/core/math/plane3dSIMD.h | 2 +- src/nbl/asset/utils/CGeometryCreator.cpp | 4 +- src/nbl/asset/utils/CMeshManipulator.cpp | 18 ++- 11 files changed, 106 insertions(+), 108 deletions(-) create mode 100644 include/nbl/builtin/hlsl/cpp_compat/unroll.hlsl diff --git a/examples_tests b/examples_tests index f56c121641..2ef03b660f 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit f56c121641cae50eb7eb555e42549b6736a390c3 +Subproject commit 2ef03b660f2a08a1b196a1e11ffdc520091b43b0 diff --git a/include/nbl/asset/IAccelerationStructure.h b/include/nbl/asset/IAccelerationStructure.h index b3fb583796..889a5ce626 100644 --- a/include/nbl/asset/IAccelerationStructure.h +++ b/include/nbl/asset/IAccelerationStructure.h @@ -198,13 +198,13 @@ class ITopLevelAccelerationStructure : public AccelerationStructure template struct StaticInstance final { - hlsl::float32_t3x4 transform = hlsl::float32_t3x4_identity; + hlsl::float32_t3x4 transform = hlsl::diagonal(1.0f); Instance base = {}; }; template struct MatrixMotionInstance final { - hlsl::float32_t3x4 transform[2] = { hlsl::float32_t3x4_identity, hlsl::float32_t3x4_identity }; + hlsl::float32_t3x4 transform[2] = { hlsl::diagonal(1.0f), hlsl::diagonal(1.0f) }; Instance base = {}; }; struct SRT diff --git a/include/nbl/asset/utils/IMeshManipulator.h b/include/nbl/asset/utils/IMeshManipulator.h index 0a5bf12313..3c2fc6ccf4 
100644 --- a/include/nbl/asset/utils/IMeshManipulator.h +++ b/include/nbl/asset/utils/IMeshManipulator.h @@ -411,7 +411,8 @@ class NBL_API2 IMeshManipulator : public virtual core::IReferenceCounted if (jointIDFLT_MIN) { - core::vectorSIMDf boneSpacePos = hlsl::transformVector(hlsl::getMatrix3x4As4x4(inverseBindPoses[jointID]), pos); + const hlsl::float32_t4x4 transformationMatrix = hlsl::getMatrix3x4As4x4(inverseBindPoses[jointID]); + core::vectorSIMDf boneSpacePos = hlsl::transformVector(transformationMatrix, pos); jointAABBs[jointID].addInternalPoint(boneSpacePos.getAsVector3df()); noJointInfluence = false; } diff --git a/include/nbl/builtin/hlsl/concepts.hlsl b/include/nbl/builtin/hlsl/concepts.hlsl index 29660b8b45..943004045c 100644 --- a/include/nbl/builtin/hlsl/concepts.hlsl +++ b/include/nbl/builtin/hlsl/concepts.hlsl @@ -4,7 +4,6 @@ #ifndef _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_ #define _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_ - #include #include #include diff --git a/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl b/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl index cc89f9d003..b04bd6c7e0 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl +++ b/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl @@ -14,6 +14,7 @@ struct matrix final : private glm::mat using Base = glm::mat; using Base::Base; using Base::operator[]; + //using type = matrix; // For assigning to same dimension use implicit ctor, and even then only allow for dimension truncation template requires ((X!=N || Y!=M) && X>=N && Y>=M) diff --git a/include/nbl/builtin/hlsl/cpp_compat/unroll.hlsl b/include/nbl/builtin/hlsl/cpp_compat/unroll.hlsl new file mode 100644 index 0000000000..36bcd944c6 --- /dev/null +++ b/include/nbl/builtin/hlsl/cpp_compat/unroll.hlsl @@ -0,0 +1,12 @@ +#ifndef _NBL_BUILTIN_HLSL_CPP_COMPAT_UNROLL_INCLUDED_ +#define _NBL_BUILTIN_HLSL_CPP_COMPAT_UNROLL_INCLUDED_ + +#ifdef __HLSL_VERSION +#define NBL_UNROLL [unroll] +#define NBL_UNROLL_LIMITED(LIMIT) [unroll(LIMIT)] +#else +#define NBL_UNROLL // can't be bothered / TODO +#define NBL_UNROLL_LIMITED(LIMIT) +#endif + +#endif diff --git a/include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl b/include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl index 5a40089c86..28142af3ac 100644 --- a/include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl +++ b/include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl @@ -2,11 +2,6 @@ #define _NBL_BUILTIN_HLSL_MATRIX_UTILS_MATRIX_TRAITS_INCLUDED_ #include -#include -// TODO: remove this header when deleting vectorSIMDf.hlsl -#include -#include "vectorSIMD.h" -#include namespace nbl { @@ -15,14 +10,33 @@ namespace hlsl template struct matrix_traits; -// partial spec for matrices -template -struct matrix_traits > + +// i choose to implement it this way because of this DXC bug: https://github.com/microsoft/DirectXShaderCompiler/issues/7007 +#define DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(ROW_COUNT, COLUMN_COUNT) \ +template \ +struct matrix_traits > \ +{ \ + using ComponentType = T; \ + NBL_CONSTEXPR_STATIC_INLINE uint32_t RowCount = ROW_COUNT; \ + NBL_CONSTEXPR_STATIC_INLINE uint32_t ColumnCount = COLUMN_COUNT; \ + NBL_CONSTEXPR_STATIC_INLINE bool Square = RowCount == ColumnCount; \ +}; + +DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(2, 2) +DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(3, 3) +DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(4, 4) +DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(3, 4) + +// TODO: when this bug: https://github.com/microsoft/DirectXShaderCompiler/issues/7007 is fixed, uncomment and 
delete template specializations +/*template +struct matrix_traits > { - NBL_CONSTEXPR_STATIC_INLINE uint32_t RowCount = N; - NBL_CONSTEXPR_STATIC_INLINE uint32_t ColumnCount = M; - NBL_CONSTEXPR_STATIC_INLINE bool Square = N == M; + using ComponentType = T; + NBL_CONSTEXPR_STATIC_INLINE uint32_t RowCount = ROW_COUNT; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ColumnCount = COLUMN_COUNT; + NBL_CONSTEXPR_STATIC_INLINE bool Square = RowCount == ColumnCount; }; +*/ } } diff --git a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl index 8858ada6d2..d1a628ccc0 100644 --- a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl +++ b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl @@ -1,83 +1,49 @@ #ifndef _NBL_BUILTIN_HLSL_MATRIX_UTILS_TRANSFORMATION_MATRIX_UTILS_INCLUDED_ #define _NBL_BUILTIN_HLSL_MATRIX_UTILS_TRANSFORMATION_MATRIX_UTILS_INCLUDED_ - -#include #include // TODO: remove this header when deleting vectorSIMDf.hlsl +#ifndef __HLSL_VERSION #include #include "vectorSIMD.h" -#include +#endif #include -#include +#include "nbl/builtin/hlsl/cpp_compat/unroll.hlsl" namespace nbl { namespace hlsl { - // goal: identity::value - // plan: diagonal - // identity - // - // partial spec: - // template - // identity - -namespace transfromation_matrix_utils_impl -{ -template -matrix diagonal(float diagonal) + +template +MatT diagonal(float diagonal = 1) { - using MatT = matrix; MatT output; - for (int i = 0; i < matrix_traits::RowCount; ++i) - for (int j = 0; j < matrix_traits::ColumnCount; ++j) + NBL_UNROLL_LIMITED(4) + for (uint32_t i = 0; i < matrix_traits::RowCount; ++i) + NBL_UNROLL_LIMITED(4) + for (uint32_t j = 0; j < matrix_traits::ColumnCount; ++j) output[i][j] = 0; - auto a = matrix_traits::RowCount; - auto b = matrix_traits::ColumnCount; - - for (int diag = 0; diag < matrix_traits::RowCount; ++diag) + NBL_UNROLL_LIMITED(4) + for (uint32_t diag = 0; diag < matrix_traits::RowCount; ++diag) output[diag][diag] = diagonal; return output; -}; } template -struct identity; - -template -struct identity > +MatT identity() { - static matrix get() - { - return transfromation_matrix_utils_impl::diagonal(1); - } -}; - -#define IDENTITY_MATRIX(TYPE, N, M)\ -const matrix TYPE ## N ## x ## M ## _identity = identity >::get(); - -#define DEFINE_IDENTITY_MATRICES(TYPE)\ -IDENTITY_MATRIX(TYPE, 2, 2)\ -IDENTITY_MATRIX(TYPE, 3, 3)\ -IDENTITY_MATRIX(TYPE, 4, 4)\ -IDENTITY_MATRIX(TYPE, 3, 4) - -DEFINE_IDENTITY_MATRICES(float32_t) -DEFINE_IDENTITY_MATRICES(float64_t) -DEFINE_IDENTITY_MATRICES(int32_t) -DEFINE_IDENTITY_MATRICES(int64_t) -DEFINE_IDENTITY_MATRICES(uint32_t) -DEFINE_IDENTITY_MATRICES(uint64_t) - -#undef DEFINE_IDENTITY_MATRICES -#undef IDENTITY_MATRIX + // TODO + // static_assert(MatT::Square); + return diagonal(1); +} // TODO: this is temporary function, delete when removing vectorSIMD +#ifndef __HLSL_VERSION template -inline core::vectorSIMDf transformVector(const matrix& mat, const core::vectorSIMDf& vec) +inline core::vectorSIMDf transformVector(NBL_CONST_REF_ARG(matrix) mat, NBL_CONST_REF_ARG(core::vectorSIMDf) vec) { core::vectorSIMDf output; float32_t4 tmp; @@ -89,9 +55,9 @@ inline core::vectorSIMDf transformVector(const matrix& mat, const core: return output; } - +#endif template -inline matrix getMatrix3x4As4x4(const matrix& mat) +inline matrix getMatrix3x4As4x4(NBL_CONST_REF_ARG(matrix) mat) { matrix output; for (int i = 0; i < 3; ++i) @@ -101,14 +67,14 @@ inline 
matrix getMatrix3x4As4x4(const matrix& mat) return output; } -template -inline matrix getSub3x3(const matrix& mat) +template +inline matrix getSub3x3(NBL_CONST_REF_ARG(matrix) mat) { return matrix(mat); } -template -inline matrix getAs64BitPrecisionMatrix(const matrix& mat) +template +inline matrix getAs64BitPrecisionMatrix(NBL_CONST_REF_ARG(matrix) mat) { matrix output; for (int i = 0; i < N; ++i) @@ -119,8 +85,9 @@ inline matrix getAs64BitPrecisionMatrix(const matrix - inline T determinant_helper(const matrix& mat, vector& r1crossr2) + inline T determinant_helper(NBL_CONST_REF_ARG(matrix) mat, NBL_REF_ARG(vector) r1crossr2) { r1crossr2 = hlsl::cross(mat[1], mat[2]); return hlsl::dot(mat[0], r1crossr2); @@ -128,8 +95,8 @@ namespace transformation_matrix_utils_impl } //! returs adjugate of the cofactor (sub 3x3) matrix -template -inline matrix getSub3x3TransposeCofactors(const matrix& mat) +template +inline matrix getSub3x3TransposeCofactors(NBL_CONST_REF_ARG(matrix) mat) { static_assert(N >= 3 && M >= 3); @@ -146,15 +113,15 @@ inline matrix getSub3x3TransposeCofactors(const matrix& mat) return output; } -template -inline bool getSub3x3InverseTranspose(const matrix& matIn, matrix& matOut) +template +inline bool getSub3x3InverseTranspose(NBL_CONST_REF_ARG(matrix) matIn, NBL_CONST_REF_ARG(matrix) matOut) { matrix matIn3x3 = getSub3x3(matIn); vector r1crossr2; T d = transformation_matrix_utils_impl::determinant_helper(matIn3x3, r1crossr2); - if (core::iszero(d, FLT_MIN)) + if (abs(d) <= FLT_MIN) return false; - auto rcp = core::reciprocal(d); + auto rcp = T(1.0f)/d; // matrix of cofactors * 1/det matOut = getSub3x3TransposeCofactors(matIn3x3); @@ -168,28 +135,33 @@ inline bool getSub3x3InverseTranspose(const matrix& matIn, matrix -inline matrix concatenateBFollowedByA(const matrix& a, const matrix& b) +inline matrix concatenateBFollowedByA(NBL_CONST_REF_ARG(matrix) a, NBL_CONST_REF_ARG(const matrix) b) { - const matrix a4x4 = getMatrix3x4As4x4(a); - const matrix b4x4 = getMatrix3x4As4x4(b); + // TODO + // static_assert(N == 3 || N == 4); + + const matrix a4x4 = getMatrix3x4As4x4(a); + const matrix b4x4 = getMatrix3x4As4x4(b); return matrix(mul(a4x4, b4x4)); } -// TODO: why NBL_REF_ARG(MatType) doesn't work????? - -template -inline void setScale(matrix& outMat, NBL_CONST_REF_ARG(vector) scale) +template +inline void setScale(NBL_REF_ARG(matrix) outMat, NBL_CONST_REF_ARG(vector) scale) { + // TODO + // static_assert(N == 3 || N == 4); + outMat[0][0] = scale[0]; outMat[1][1] = scale[1]; outMat[2][2] = scale[2]; } //! 
Replaces curent rocation and scale by rotation represented by quaternion `quat`, leaves 4th row and 4th colum unchanged -template -inline void setRotation(matrix& outMat, NBL_CONST_REF_ARG(nbl::hlsl::quaternion) quat) +template +inline void setRotation(NBL_REF_ARG(matrix) outMat, NBL_CONST_REF_ARG(nbl::hlsl::quaternion) quat) { - static_assert(N == 3 || N == 4); + // TODO + //static_assert(N == 3 || N == 4); outMat[0] = vector( 1 - 2 * (quat.data.y * quat.data.y + quat.data.z * quat.data.z), @@ -214,10 +186,11 @@ inline void setRotation(matrix& outMat, NBL_CONST_REF_ARG(nbl::hlsl::qu ); } -template -inline void setTranslation(matrix& outMat, NBL_CONST_REF_ARG(vector) translation) +template +inline void setTranslation(NBL_REF_ARG(matrix) outMat, NBL_CONST_REF_ARG(vector) translation) { - static_assert(N == 3 || N == 4); + // TODO + // static_assert(N == 3 || N == 4); outMat[0].w = translation.x; outMat[1].w = translation.y; diff --git a/include/nbl/core/math/plane3dSIMD.h b/include/nbl/core/math/plane3dSIMD.h index 2686c0a821..0a9f163208 100644 --- a/include/nbl/core/math/plane3dSIMD.h +++ b/include/nbl/core/math/plane3dSIMD.h @@ -104,7 +104,7 @@ class plane3dSIMDf : private vectorSIMDf static inline plane3dSIMDf transform(const plane3dSIMDf& in, const hlsl::float32_t3x4& mat) { hlsl::float32_t3x4 a = mat; - hlsl::float32_t4x4 inv = hlsl::getMatrix3x4As4x4(a); + hlsl::float32_t4x4 inv = hlsl::getMatrix3x4As4x4(a); hlsl::inverse(inv); vectorSIMDf normal(in.getNormal()); diff --git a/src/nbl/asset/utils/CGeometryCreator.cpp b/src/nbl/asset/utils/CGeometryCreator.cpp index ac065a8c97..a15b52cfca 100644 --- a/src/nbl/asset/utils/CGeometryCreator.cpp +++ b/src/nbl/asset/utils/CGeometryCreator.cpp @@ -697,8 +697,8 @@ CGeometryCreator::return_type CGeometryCreator::createDiskMesh(float radius, uin for (int i = 2; i < vertexCount-1; i++) { hlsl::float32_t3x4 rotMatrix; - hlsl::setRotation(rotMatrix, hlsl::quaternion::create(0.0f, 0.0f, core::radians((i - 1) * angle))); - core::vectorSIMDf vn = hlsl::transformVector(hlsl::getMatrix3x4As4x4(rotMatrix), v0); + hlsl::setRotation(rotMatrix, hlsl::quaternion::create(0.0f, 0.0f, core::radians((i - 1) * angle))); + core::vectorSIMDf vn = hlsl::transformVector(hlsl::getMatrix3x4As4x4(rotMatrix), v0); ptr[i] = DiskVertex(vn, video::SColor(0xFFFFFFFFu), core::vector2du32_SIMD(0u, 1u), core::vector3df_SIMD(0.0f, 0.0f, 1.0f)); diff --git a/src/nbl/asset/utils/CMeshManipulator.cpp b/src/nbl/asset/utils/CMeshManipulator.cpp index a76b1ed9d8..ed1e37cbdd 100644 --- a/src/nbl/asset/utils/CMeshManipulator.cpp +++ b/src/nbl/asset/utils/CMeshManipulator.cpp @@ -9,9 +9,7 @@ #include #include - #include "nbl/asset/asset.h" -#include "nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl" #include "nbl/asset/IRenderpassIndependentPipeline.h" #include "nbl/asset/utils/CMeshManipulator.h" #include "nbl/asset/utils/CSmoothNormalGenerator.h" @@ -1708,20 +1706,20 @@ hlsl::float32_t3x4 IMeshManipulator::calculateOBB(const nbl::asset::ICPUMeshBuff float ABBQuality = ABBDiff.x * ABBDiff.y + ABBDiff.y * ABBDiff.z + ABBDiff.z * ABBDiff.x; hlsl::float32_t3x4 scaleMat; hlsl::float32_t3x4 translationMat; - hlsl::setTranslation(translationMat, core::getAsVec3(-(MinPoint) / OBBDiff)); - hlsl::setScale(scaleMat, core::getAsVec3(OBBDiff)); - TransMat = hlsl::concatenateBFollowedByA(TransMat, scaleMat); - TransMat = hlsl::concatenateBFollowedByA(TransMat, translationMat); + hlsl::setTranslation(translationMat, core::getAsVec3(-(MinPoint) / OBBDiff)); + hlsl::setScale(scaleMat, 
core::getAsVec3(OBBDiff)); + TransMat = hlsl::concatenateBFollowedByA(TransMat, scaleMat); + TransMat = hlsl::concatenateBFollowedByA(TransMat, translationMat); if (ABBQuality < OBBQuality) { - hlsl::setTranslation(translationMat, core::getAsVec3(-(AABBMin) / ABBDiff)); - hlsl::setScale(scaleMat, core::getAsVec3(ABBDiff)); + hlsl::setTranslation(translationMat, core::getAsVec3(-(AABBMin) / ABBDiff)); + hlsl::setScale(scaleMat, core::getAsVec3(ABBDiff)); TransMat = hlsl::float32_t3x4( 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0); - TransMat = hlsl::concatenateBFollowedByA(TransMat, scaleMat); - TransMat = hlsl::concatenateBFollowedByA(TransMat, translationMat); + TransMat = hlsl::concatenateBFollowedByA(TransMat, scaleMat); + TransMat = hlsl::concatenateBFollowedByA(TransMat, translationMat); } return TransMat; From 7100daca081e9e466ec3bcfb2f7758f6bd2efe0d Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 20 Nov 2024 09:12:14 +0700 Subject: [PATCH 209/358] add position fetch extension feature --- src/nbl/video/CVulkanPhysicalDevice.cpp | 5 +++++ .../video/device_capabilities/device_features.json | 11 +++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index b7bfe94974..595393649b 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -1062,6 +1062,8 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart features.rayQuery = isExtensionSupported(VK_KHR_RAY_QUERY_EXTENSION_NAME) && isExtensionSupported(VK_KHR_RAY_TRACING_MAINTENANCE_1_EXTENSION_NAME); + features.rayTracingPositionFetch = isExtensionSupported(VK_KHR_RAY_TRACING_POSITION_FETCH_EXTENSION_NAME); + if (isExtensionSupported(VK_NV_REPRESENTATIVE_FRAGMENT_TEST_EXTENSION_NAME)) features.representativeFragmentTest = representativeFragmentTestFeatures.representativeFragmentTest; @@ -1463,6 +1465,9 @@ core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevic VkPhysicalDeviceRayQueryFeaturesKHR rayQueryFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR,nullptr }; REQUIRE_EXTENSION_IF(enabledFeatures.rayQuery,VK_KHR_RAY_QUERY_EXTENSION_NAME,&rayQueryFeatures); // feature dependency taken care of + VkPhysicalDeviceRayTracingPositionFetchFeaturesKHR rayTracingPositionFetchFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_TRACING_POSITION_FETCH_FEATURES_KHR,nullptr }; + REQUIRE_EXTENSION_IF(enabledFeatures.rayTracingPositionFetch, VK_KHR_RAY_TRACING_POSITION_FETCH_EXTENSION_NAME, &rayTracingPositionFetchFeatures); // feature dependency taken care of + VkPhysicalDeviceShaderSMBuiltinsFeaturesNV shaderSMBuiltinsFeaturesNV = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SM_BUILTINS_FEATURES_NV,nullptr }; enableExtensionIfAvailable(VK_NV_SHADER_SM_BUILTINS_EXTENSION_NAME,&shaderSMBuiltinsFeaturesNV); diff --git a/src/nbl/video/device_capabilities/device_features.json b/src/nbl/video/device_capabilities/device_features.json index db1383968e..fa1128b075 100644 --- a/src/nbl/video/device_capabilities/device_features.json +++ b/src/nbl/video/device_capabilities/device_features.json @@ -4023,8 +4023,15 @@ "VK_AMD_extension_478", "VK_AMD_extension_479", "VK_EXT_extension_480", - "VK_EXT_extension_481", - "TODO: implement", + "VK_EXT_extension_481" + ] + }, + { + "type": "bool", + "name": "rayTracingPositionFetch", + "value": false, + "comment": [ + "RayTracingPositionFetchFeaturesKHR", "VK_KHR_ray_tracing_position_fetch" ] }, From 38491b8407dc1ff49d133be319f599cf60e59342 Mon Sep 
17 00:00:00 2001 From: devsh Date: Wed, 20 Nov 2024 11:00:20 +0100 Subject: [PATCH 210/358] post merge examples_tests submodule pointer set --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 41937881f9..2af7f38067 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 41937881f94d7fe89e2f20cf54a9169e9968224c +Subproject commit 2af7f380674697637c2fc1384c098bb9e14bf572 From 8e5ab1a529e60a9d16d956e926b8feb09a39a54a Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 20 Nov 2024 17:10:45 +0700 Subject: [PATCH 211/358] moved position fetch to limits --- src/nbl/video/CVulkanPhysicalDevice.cpp | 11 ++++++----- .../video/device_capabilities/device_features.json | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index 595393649b..d9740bc813 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -705,6 +705,7 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart VkPhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR workgroupMemoryExplicitLayout = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_FEATURES_KHR }; VkPhysicalDeviceRayTracingMaintenance1FeaturesKHR raytracingMaintenance1Features = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_TRACING_MAINTENANCE_1_FEATURES_KHR }; VkPhysicalDeviceRasterizationOrderAttachmentAccessFeaturesARM rasterizationOrderAttachmentAccessFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_FEATURES_ARM }; + VkPhysicalDeviceRayTracingPositionFetchFeaturesKHR rayTracingPositionFetchFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_TRACING_POSITION_FETCH_FEATURES_KHR }; VkPhysicalDeviceColorWriteEnableFeaturesEXT colorWriteEnableFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COLOR_WRITE_ENABLE_FEATURES_EXT }; #if 0 VkPhysicalDeviceCooperativeMatrixFeaturesKHR cooperativeMatrixFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR }; @@ -768,6 +769,8 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart addToPNextChain(&workgroupMemoryExplicitLayout); if (isExtensionSupported(VK_ARM_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_EXTENSION_NAME)) addToPNextChain(&rasterizationOrderAttachmentAccessFeatures); + if (isExtensionSupported(VK_KHR_RAY_TRACING_POSITION_FETCH_EXTENSION_NAME)) + addToPNextChain(&rayTracingPositionFetchFeatures); if (isExtensionSupported(VK_EXT_COLOR_WRITE_ENABLE_EXTENSION_NAME)) addToPNextChain(&colorWriteEnableFeatures); // call @@ -1062,8 +1065,6 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart features.rayQuery = isExtensionSupported(VK_KHR_RAY_QUERY_EXTENSION_NAME) && isExtensionSupported(VK_KHR_RAY_TRACING_MAINTENANCE_1_EXTENSION_NAME); - features.rayTracingPositionFetch = isExtensionSupported(VK_KHR_RAY_TRACING_POSITION_FETCH_EXTENSION_NAME); - if (isExtensionSupported(VK_NV_REPRESENTATIVE_FRAGMENT_TEST_EXTENSION_NAME)) features.representativeFragmentTest = representativeFragmentTestFeatures.representativeFragmentTest; @@ -1198,6 +1199,9 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart properties.limits.workgroupMemoryExplicitLayout16BitAccess = workgroupMemoryExplicitLayout.workgroupMemoryExplicitLayout16BitAccess; } + if (isExtensionSupported(VK_KHR_RAY_TRACING_POSITION_FETCH_EXTENSION_NAME) && isExtensionSupported(VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME)) + 
properties.limits.rayTracingPositionFetch = rayTracingPositionFetchFeatures.rayTracingPositionFetch; + if (isExtensionSupported(VK_EXT_COLOR_WRITE_ENABLE_EXTENSION_NAME)) properties.limits.colorWriteEnable = colorWriteEnableFeatures.colorWriteEnable; #if 0 //TODO @@ -1465,9 +1469,6 @@ core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevic VkPhysicalDeviceRayQueryFeaturesKHR rayQueryFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR,nullptr }; REQUIRE_EXTENSION_IF(enabledFeatures.rayQuery,VK_KHR_RAY_QUERY_EXTENSION_NAME,&rayQueryFeatures); // feature dependency taken care of - VkPhysicalDeviceRayTracingPositionFetchFeaturesKHR rayTracingPositionFetchFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_TRACING_POSITION_FETCH_FEATURES_KHR,nullptr }; - REQUIRE_EXTENSION_IF(enabledFeatures.rayTracingPositionFetch, VK_KHR_RAY_TRACING_POSITION_FETCH_EXTENSION_NAME, &rayTracingPositionFetchFeatures); // feature dependency taken care of - VkPhysicalDeviceShaderSMBuiltinsFeaturesNV shaderSMBuiltinsFeaturesNV = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SM_BUILTINS_FEATURES_NV,nullptr }; enableExtensionIfAvailable(VK_NV_SHADER_SM_BUILTINS_EXTENSION_NAME,&shaderSMBuiltinsFeaturesNV); diff --git a/src/nbl/video/device_capabilities/device_features.json b/src/nbl/video/device_capabilities/device_features.json index fa1128b075..3c09088a2b 100644 --- a/src/nbl/video/device_capabilities/device_features.json +++ b/src/nbl/video/device_capabilities/device_features.json @@ -4030,6 +4030,7 @@ "type": "bool", "name": "rayTracingPositionFetch", "value": false, + "expose": "MOVE_TO_LIMIT", "comment": [ "RayTracingPositionFetchFeaturesKHR", "VK_KHR_ray_tracing_position_fetch" From b43d4dfd50c050f08aac8620a5fac76e047f04a1 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 20 Nov 2024 17:16:14 +0700 Subject: [PATCH 212/358] spv functions outline --- .../hlsl/spirv_intrinsics/raytracing.hlsl | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl new file mode 100644 index 0000000000..8d7ac0a151 --- /dev/null +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl @@ -0,0 +1,48 @@ +// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_SPIRV_INTRINSICS_RAYTRACING_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SPIRV_INTRINSICS_RAYTRACING_INCLUDED_ + +// #include "nbl/builtin/hlsl/spirv_intrinsics/core.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace spirv +{ + +[[vk::ext_capability(spv::CapabilityRayQueryKHR)]] +[[vk::ext_instruction(spv::OpRayQueryInitializeKHR)]] +void rayQueryInitializeEXT(/* spv::OpTypeRayQueryKHR function_ptr query, spv::OpTypeAccelerationStructureKHR AS, uint32_t flags, uint32_t cull mask, float3 origin, float32_t tmin, float3 direction, float32_t tmax*/); + +[[vk::ext_capability(spv::CapabilityRayQueryKHR)]] +[[vk::ext_instruction(spv::OpRayQueryProceedKHR)]] +bool rayQueryProceedEXT(/* spv::OpTypeRayQueryKHR function_ptr query */); + +[[vk::ext_capability(spv::CapabilityRayQueryKHR)]] +[[vk::ext_instruction(spv::OpRayQueryGetIntersectionTypeKHR)]] +int rayQueryGetIntersectionTypeEXT(/* spv::OpTypeRayQueryKHR function_ptr query, uint32_t intersection (candidate 0, commited 1) */); + +[[vk::ext_capability(spv::CapabilityRayQueryKHR)]] +[[vk::ext_instruction(spv::OpRayQueryGetIntersectionInstanceIdKHR)]] +int rayQueryGetIntersectionInstanceIdEXT(/* spv::OpTypeRayQueryKHR function_ptr query, uint32_t intersection (candidate 0, commited 1) */); + +[[vk::ext_capability(spv::CapabilityRayQueryKHR)]] +[[vk::ext_instruction(spv::OpRayQueryGetIntersectionPrimitiveIndexKHR)]] +int rayQueryGetIntersectionPrimitiveIndexEXT(/* spv::OpTypeRayQueryKHR function_ptr query, uint32_t intersection (candidate 0, commited 1) */); + +// position fetch for ray tracing uses gl_HitTriangleVertexPositionsEXT -> HitTriangleVertexPositionsKHR decorated OpVariable + +[[vk::ext_capability(spv::CapabilityRayQueryPositionFetchKHR)]] +[[vk::ext_instruction(spv::OpRayQueryGetIntersectionTriangleVertexPositionsKHR)]] // ray query version +// get intersection triangle vertex position - gl_HitTriangleVertexPositionsEXT +// returns 3-array float3 vertex positions +void rayQueryGetIntersectionTriangleVertexPositionsEXT(/* spv::OpTypeRayQueryKHR function_ptr query, uint32_t intersection (candidate 0, commited 1), out float3 positions[3] */); + +} +} +} + +#endif // _NBL_BUILTIN_HLSL_SPIRV_INTRINSICS_RAYTRACING_INCLUDED_ \ No newline at end of file From 639cf78243ae56f89f176c5e37f638f5d1896645 Mon Sep 17 00:00:00 2001 From: AnastaZIuk Date: Wed, 20 Nov 2024 11:17:46 +0100 Subject: [PATCH 213/358] add 3rdparty/ngfx/ngfx.cmake capable of finding the NSight GFX SDK & creating interface `ngfx` interface target on success, allow for picking found versions --- 3rdparty/CMakeLists.txt | 3 ++ 3rdparty/ngfx/ngfx.cmake | 63 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 3rdparty/ngfx/ngfx.cmake diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 1d0b3fee0f..1aa44383e3 100755 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -296,6 +296,9 @@ NBL_ADD_GIT_TRACKING_META_LIBRARY(nabla "${NBL_ROOT_PATH}") NBL_ADD_GIT_TRACKING_META_LIBRARY(dxc "${CMAKE_CURRENT_SOURCE_DIR}/dxc/dxc") NBL_GENERATE_GIT_TRACKING_META() +# NGFX +include(ngfx/ngfx.cmake) + if(NBL_BUILD_IMGUI) set(NBL_IMGUI_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/imgui") set(NBL_IMGUI_TEST_ENGINE_PROJECT_ROOT "${THIRD_PARTY_SOURCE_DIR}/imgui_test_engine") diff --git a/3rdparty/ngfx/ngfx.cmake b/3rdparty/ngfx/ngfx.cmake new file mode 100644 index 0000000000..6505f5d8ac --- /dev/null +++ b/3rdparty/ngfx/ngfx.cmake @@ -0,0 +1,63 @@ 
+option(NBL_BUILD_WITH_NGFX "Enable NGFX build" OFF)
+
+# NOTE: on windows default installation path is:
+# "C:/Program Files/NVIDIA Corporation/Nsight Graphics /SDKs/NsightGraphicsSDK" <- define as "NGFX_SDK" environment variable
+# then you can pick SDK version with "NGFX_SDK_VERSION" cache variable (CMake GUI list supported)
+
+if(NBL_BUILD_WITH_NGFX)
+    if(NOT DEFINED ENV{NGFX_SDK})
+        message(FATAL_ERROR "\"NGFX_SDK\" environment variable must be defined to build with NBL_BUILD_WITH_NGFX enabled!")
+    endif()
+
+    set(NGFX_SDK "$ENV{NGFX_SDK}")
+    cmake_path(NORMAL_PATH NGFX_SDK OUTPUT_VARIABLE NGFX_SDK)
+
+    if(NOT EXISTS "${NGFX_SDK}")
+        message(FATAL_ERROR "Found \"NGFX_SDK\" environment variable but it is invalid, env:NGFX_SDK=\"${NGFX_SDK}\" doesn't exist!")
+    endif()
+
+    file(GLOB ENTRIES "${NGFX_SDK}/*")
+
+    set(NGFX_VERSIONS "")
+    foreach(ENTRY ${ENTRIES})
+        if(IS_DIRECTORY ${ENTRY})
+            list(APPEND NGFX_VERSIONS ${ENTRY})
+        endif()
+    endforeach()
+
+    if(NOT NGFX_VERSIONS)
+        message(FATAL_ERROR "Could not find any NGFX SDK Version!")
+    endif()
+
+    list(TRANSFORM NGFX_VERSIONS REPLACE "${NGFX_SDK}/" "")
+    list(SORT NGFX_VERSIONS)
+    list(GET NGFX_VERSIONS -1 LATEST_NGFX_VERSION)
+
+    # on the cache variable init pick the latest version, then let user pick from list
+    set(NGFX_SDK_VERSION "${LATEST_NGFX_VERSION}" CACHE STRING "NGFX SDK Version")
+    set_property(CACHE NGFX_SDK_VERSION PROPERTY STRINGS ${NGFX_VERSIONS})
+
+    set(NGFX_SDK_BASE "${NGFX_SDK}/$CACHE{NGFX_SDK_VERSION}")
+
+    # TODO: wanna support more *host* platforms? (*)
+    # NOTE: also I'm hardcoding windows x64 library requests till I know the answer for (*)
+    find_file(NBL_NGFX_INJECTION_HEADER NGFX_Injection.h PATHS ${NGFX_SDK_BASE}/include)
+    find_file(NBL_NGFX_INJECTION_DLL NGFX_Injection.dll PATHS ${NGFX_SDK_BASE}/lib/x64)
+    find_file(NBL_NGFX_INJECTION_IMPORT_LIBRARY NGFX_Injection.lib PATHS ${NGFX_SDK_BASE}/lib/x64)
+
+    if(NBL_NGFX_INJECTION_HEADER AND NBL_NGFX_INJECTION_DLL AND NBL_NGFX_INJECTION_IMPORT_LIBRARY)
+        message(STATUS "Enabled build with NVIDIA Nsight Graphics SDK $CACHE{NGFX_SDK_VERSION}\nlocated in: \"${NGFX_SDK_BASE}\"")
+    else()
+        message(STATUS "Could not enable build with NVIDIA Nsight Graphics SDK $CACHE{NGFX_SDK_VERSION} - invalid components!")
+        message(STATUS "Located in: \"${NGFX_SDK_BASE}\"")
+        message(STATUS "NBL_NGFX_INJECTION_HEADER=\"${NBL_NGFX_INJECTION_HEADER}\"")
+        message(STATUS "NBL_NGFX_INJECTION_DLL=\"${NBL_NGFX_INJECTION_DLL}\"")
+        message(STATUS "NBL_NGFX_INJECTION_IMPORT_LIBRARY=\"${NBL_NGFX_INJECTION_IMPORT_LIBRARY}\"")
+        message(FATAL_ERROR "Your installation may be corrupted, please fix it and re-run CMake or disable NBL_BUILD_WITH_NGFX!")
+    endif()
+
+    add_library(ngfx INTERFACE)
+    target_sources(ngfx INTERFACE "${NBL_NGFX_INJECTION_HEADER}")
+    target_include_directories(ngfx INTERFACE "${NGFX_SDK_BASE}/include")
+    target_link_libraries(ngfx INTERFACE "${NBL_NGFX_INJECTION_IMPORT_LIBRARY}")
+endif()
\ No newline at end of file
From 048d093f56546b09d6387626ff2af280168fb2e1 Mon Sep 17 00:00:00 2001
From: AnastaZIuk
Date: Wed, 20 Nov 2024 15:25:43 +0100
Subject: [PATCH 214/358] integrate NGFX with Nabla build system, delay load the DLL on windows target platforms

---
 3rdparty/ngfx/ngfx.cmake           |  10 +-
 include/nbl/video/IAPIConnection.h |  10 +-
 src/nbl/CMakeLists.txt             |  15 +++
 src/nbl/video/IAPIConnection.cpp   | 171 ++++++++++++++++++++---------
 4 files changed, 148 insertions(+), 58 deletions(-)

diff --git a/3rdparty/ngfx/ngfx.cmake b/3rdparty/ngfx/ngfx.cmake
index 6505f5d8ac..f69e05ac9d 
100644 --- a/3rdparty/ngfx/ngfx.cmake +++ b/3rdparty/ngfx/ngfx.cmake @@ -37,7 +37,8 @@ if(NBL_BUILD_WITH_NGFX) set(NGFX_SDK_VERSION "${LATEST_NGFX_VERSION}" CACHE STRING "NGFX SDK Version") set_property(CACHE NGFX_SDK_VERSION PROPERTY STRINGS ${NGFX_VERSIONS}) - set(NGFX_SDK_BASE "${NGFX_SDK}/$CACHE{NGFX_SDK_VERSION}") + set(NGFX_SDK_VERSION "$CACHE{NGFX_SDK_VERSION}") + set(NGFX_SDK_BASE "${NGFX_SDK}/${NGFX_SDK_VERSION}") # TODO: wanna support more *host* platforms? (*) # NOTE: also I'm hardcoding windows x64 library requests till I know the answer for (*) @@ -46,9 +47,9 @@ if(NBL_BUILD_WITH_NGFX) find_file(NBL_NGFX_INJECTION_IMPORT_LIBRARY NGFX_Injection.lib PATHS ${NGFX_SDK_BASE}/lib/x64) if(NBL_NGFX_INJECTION_HEADER AND NBL_NGFX_INJECTION_DLL AND NBL_NGFX_INJECTION_IMPORT_LIBRARY) - message(STATUS "Enabled build with NVIDIA Nsight Graphics SDK $CACHE{NGFX_SDK_VERSION}\nlocated in: \"${NGFX_SDK_BASE}\"") + message(STATUS "Enabled build with NVIDIA Nsight Graphics SDK ${NGFX_SDK_VERSION}\nlocated in: \"${NGFX_SDK_BASE}\"") else() - message(STATUS "Could not enable build with NVIDIA Nsight Graphics SDK $CACHE{NGFX_SDK_VERSION} - invalid components!") + message(STATUS "Could not enable build with NVIDIA Nsight Graphics SDK ${NGFX_SDK_VERSION} - invalid components!") message(STATUS "Located in: \"${NGFX_SDK_BASE}\"") message(STATUS "NBL_NGFX_INJECTION_HEADER=\"${NBL_NGFX_INJECTION_HEADER}\"") message(STATUS "NBL_NGFX_INJECTION_DLL=\"${NBL_NGFX_INJECTION_DLL}\"") @@ -60,4 +61,7 @@ if(NBL_BUILD_WITH_NGFX) target_sources(ngfx INTERFACE "${NBL_NGFX_INJECTION_HEADER}") target_include_directories(ngfx INTERFACE "${NGFX_SDK_BASE}/include") target_link_libraries(ngfx INTERFACE "${NBL_NGFX_INJECTION_IMPORT_LIBRARY}") + target_link_options(ngfx INTERFACE "/DELAYLOAD:NGFX_Injection.dll") + target_compile_definitions(ngfx INTERFACE NGFX_INJECTION_DLL_DIR="${NGFX_SDK_BASE}/lib/x64") + target_compile_definitions(ngfx INTERFACE NGFX_VERSION="${NGFX_SDK_VERSION}") endif() \ No newline at end of file diff --git a/include/nbl/video/IAPIConnection.h b/include/nbl/video/IAPIConnection.h index 1ec862e05b..ccb7965822 100644 --- a/include/nbl/video/IAPIConnection.h +++ b/include/nbl/video/IAPIConnection.h @@ -77,11 +77,17 @@ class NBL_API2 IAPIConnection : public core::IReferenceCounted SDebuggerType m_debuggerType; renderdoc_api_t* m_rdoc_api; - struct SNGFXIntegration { - bool useNGFX; + struct SNGFXIntegration + { + SNGFXIntegration(); + + bool useNGFX = false; bool injectNGFXToProcess(); bool executeNGFXCommand(); + inline bool isAPILoaded() { return m_loaded; } + private: + const bool m_loaded; }; using ngfx_api_t = SNGFXIntegration; ngfx_api_t m_ngfx_api; diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 83845b9c84..75f3dd3bec 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -659,6 +659,21 @@ write_source_definitions("${_NBL_DEFINE_FILE_WRAPPER_}" "${_NBL_SOURCE_DEFINITIO # git version tracking target_link_libraries(Nabla PUBLIC gtml) +# NGFX +if(TARGET ngfx) + if(NBL_STATIC_BUILD) + target_link_libraries(Nabla INTERFACE ngfx) + else() + target_link_libraries(Nabla PRIVATE ngfx) + endif() + + target_include_directories(Nabla PRIVATE $) + target_compile_definitions(Nabla + PRIVATE NBL_BUILD_WITH_NGFX + PRIVATE $ + ) +endif() + #on MSVC it won't compile without this option! 
if (MSVC) target_compile_options(Nabla PUBLIC /bigobj) diff --git a/src/nbl/video/IAPIConnection.cpp b/src/nbl/video/IAPIConnection.cpp index e679363d53..b3d697f3a7 100644 --- a/src/nbl/video/IAPIConnection.cpp +++ b/src/nbl/video/IAPIConnection.cpp @@ -2,10 +2,12 @@ #include "nbl/video/IPhysicalDevice.h" #include "nbl/video/utilities/renderdoc.h" -#include "nbl/video/utilities/ngfx.h" -// TODO: temporary hopefully -#include "C:\Program Files\NVIDIA Corporation\Nsight Graphics 2024.1.0\SDKs\NsightGraphicsSDK\0.8.0\include\NGFX_Injection.h" +#include "nbl/system/CSystemWin32.h" + +#ifdef NBL_BUILD_WITH_NGFX +#include "NGFX_Injection.h" +#endif #if defined(_NBL_POSIX_API_) #include @@ -14,7 +16,6 @@ namespace nbl::video { - std::span IAPIConnection::getPhysicalDevices() const { static_assert(sizeof(std::unique_ptr) == sizeof(void*)); @@ -59,70 +60,134 @@ IAPIConnection::IAPIConnection(const SFeatures& enabledFeatures) bool IAPIConnection::SNGFXIntegration::injectNGFXToProcess() { - uint32_t numInstallations = 0; - auto result = NGFX_Injection_EnumerateInstallations(&numInstallations, nullptr); - if (numInstallations == 0 || NGFX_INJECTION_RESULT_OK != result) + #ifdef NBL_BUILD_WITH_NGFX + if (m_loaded) //! this check is mandatory! { - useNGFX = false; - return false; - } + uint32_t numInstallations = 0; + auto result = NGFX_Injection_EnumerateInstallations(&numInstallations, nullptr); + if (numInstallations == 0 || NGFX_INJECTION_RESULT_OK != result) + { + useNGFX = false; + return false; + } - std::vector installations(numInstallations); - result = NGFX_Injection_EnumerateInstallations(&numInstallations, installations.data()); - if (numInstallations == 0 || NGFX_INJECTION_RESULT_OK != result) - { - useNGFX = false; - return false; - } + std::vector installations(numInstallations); + result = NGFX_Injection_EnumerateInstallations(&numInstallations, installations.data()); + if (numInstallations == 0 || NGFX_INJECTION_RESULT_OK != result) + { + useNGFX = false; + return false; + } - // get latest installation - NGFX_Injection_InstallationInfo versionInfo = installations.back(); + // get latest installation + NGFX_Injection_InstallationInfo versionInfo = installations.back(); - uint32_t numActivities = 0; - result = NGFX_Injection_EnumerateActivities(&versionInfo, &numActivities, nullptr); - if (numActivities == 0 || NGFX_INJECTION_RESULT_OK != result) - { - useNGFX = false; - return false; - } + uint32_t numActivities = 0; + result = NGFX_Injection_EnumerateActivities(&versionInfo, &numActivities, nullptr); + if (numActivities == 0 || NGFX_INJECTION_RESULT_OK != result) + { + useNGFX = false; + return false; + } - std::vector activities(numActivities); - result = NGFX_Injection_EnumerateActivities(&versionInfo, &numActivities, activities.data()); - if (NGFX_INJECTION_RESULT_OK != result) - { - useNGFX = false; - return false; - } + std::vector activities(numActivities); + result = NGFX_Injection_EnumerateActivities(&versionInfo, &numActivities, activities.data()); + if (NGFX_INJECTION_RESULT_OK != result) + { + useNGFX = false; + return false; + } - const NGFX_Injection_Activity* pActivityToInject = nullptr; - for (const NGFX_Injection_Activity& activity : activities) - { - if (activity.type == NGFX_INJECTION_ACTIVITY_FRAME_DEBUGGER) // only want frame debugger + const NGFX_Injection_Activity* pActivityToInject = nullptr; + for (const NGFX_Injection_Activity& activity : activities) { - pActivityToInject = &activity; - break; + if (activity.type == NGFX_INJECTION_ACTIVITY_FRAME_DEBUGGER) // 
only want frame debugger
+            {
+                pActivityToInject = &activity;
+                break;
+            }
         }
-    }
 
-    if (!pActivityToInject) {
-        useNGFX = false;
-        return false;
-    }
+        if (!pActivityToInject) {
+            useNGFX = false;
+            return false;
+        }
 
-    result = NGFX_Injection_InjectToProcess(&versionInfo, pActivityToInject);
-    if (NGFX_INJECTION_RESULT_OK != result)
-    {
-        useNGFX = false;
-        return false;
-    }
+        result = NGFX_Injection_InjectToProcess(&versionInfo, pActivityToInject);
+        if (NGFX_INJECTION_RESULT_OK != result)
+        {
+            useNGFX = false;
+            return false;
+        }
+
+        useNGFX = true;
+
+        return true;
+    } // optional TODO: could log on "else"
+    #endif // NBL_BUILD_WITH_NGFX
 
-    useNGFX = true;
-    return true;
+    return false;
 }
 
 bool IAPIConnection::SNGFXIntegration::executeNGFXCommand()
 {
-    return NGFX_Injection_ExecuteActivityCommand() == NGFX_INJECTION_RESULT_OK;
+    #ifdef NBL_BUILD_WITH_NGFX
+    if(m_loaded) //! this check is mandatory!
+        return NGFX_Injection_ExecuteActivityCommand() == NGFX_INJECTION_RESULT_OK; // optional TODO: could log on "else"
+    #endif // NBL_BUILD_WITH_NGFX
+
+    return false;
 }
 
+IAPIConnection::SNGFXIntegration::SNGFXIntegration()
+    : useNGFX(false /*??*/), m_loaded([]() -> bool
+    {
+#ifdef NBL_BUILD_WITH_NGFX
+    //! absolute path to official install NGFX SDK runtime directory
+    auto getOfficialRuntimeDirectory = []()
+    {
+        const char* sdk = std::getenv("NGFX_SDK");
+        const char* version = std::getenv("NGFX_VERSION");
+        const bool composed = sdk && version;
+
+        if (composed)
+        {
+            const auto directory = system::path(sdk) / system::path(version) / "lib" / "x64";
+
+            if (std::filesystem::exists(directory))
+                return directory;
+        }
+
+        return system::path("");
+    };
+
+    //! batch request with priority order & custom Nabla runtime search, I'm assuming we are loading the runtime from official SDK not custom location
+    //! one question is if we should have any constraints for min/max version, maybe force the "version"
+    //! to match the "NGFX_VERSION" define so to "what we built with", or don't have any - just like now
+
+    #if defined(_NBL_PLATFORM_WINDOWS_)
+    static constexpr std::string_view NGFXMODULE = "NGFX_Injection.dll";
+    HMODULE isAlreadyLoaded = GetModuleHandleA(NGFXMODULE.data());
+
+    if (!isAlreadyLoaded)
+    {
+        const auto dll = getOfficialRuntimeDirectory() / NGFXMODULE.data();
+        const HRESULT hook = system::CSystemWin32::delayLoadDLL(NGFXMODULE.data(), { NGFX_INJECTION_DLL_DIR, dll.parent_path() });
+
+        //! don't be scared if you see "No symbols loaded" - you will not hit "false" in this case, the DLL will get loaded if found,
+        //! proc addresses will be resolved correctly but status will scream "FAILED" because we don't have any PDB to load
+        if (FAILED(hook))
+            return false;
+    }
+    #else
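+    // [editor's note - illustrative sketch only, not part of the original patch]
+    // There is no delay-load on POSIX; a counterpart here would have to resolve the runtime by hand,
+    // e.g. (assuming the SDK even ships a shared object and uses the same NGFX_SDK/NGFX_VERSION layout):
+    //   void* module = dlopen((getOfficialRuntimeDirectory()/"libNGFX_Injection.so").c_str(), RTLD_NOW);
+    //   if (!module) return false; // individual entry points would then be pulled out with dlsym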
+ #endif + + return true; +#else + return false; // no NGFX build -> no API to load +#endif + }()) +{} + } \ No newline at end of file From 4f029c86e0406ce6c755ba96f7c0de5654af4748 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 21 Nov 2024 11:24:32 +0700 Subject: [PATCH 215/358] implemented ray query methods --- .../hlsl/spirv_intrinsics/raytracing.hlsl | 32 +++++++++++++------ 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl index 8d7ac0a151..86e9cc687b 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl @@ -4,7 +4,7 @@ #ifndef _NBL_BUILTIN_HLSL_SPIRV_INTRINSICS_RAYTRACING_INCLUDED_ #define _NBL_BUILTIN_HLSL_SPIRV_INTRINSICS_RAYTRACING_INCLUDED_ -// #include "nbl/builtin/hlsl/spirv_intrinsics/core.hlsl" +#include "nbl/builtin/hlsl/spirv_intrinsics/core.hlsl" namespace nbl { @@ -13,33 +13,47 @@ namespace hlsl namespace spirv { +using RayQueryKHR = vk::SpirvOpaqueType; +using AccelerationStructureKHR = vk::SpirvOpaqueType; + +// matching Ray Query Committed Intersection Type +static const uint32_t RayQueryCommittedIntersectionNoneKHR = 0; +static const uint32_t RayQueryCommittedIntersectionTriangleKHR = 1; +static const uint32_t RayQueryCommittedIntersectionGeneratedKHR = 2; + +// matching Ray Query Candidate Intersection Type +static const uint32_t RayQueryCandidateIntersectionTriangleKHR = 0; +static const uint32_t RayQueryCandidateIntersectionAABBKHR = 1; + +[[vk::ext_capability(spv::CapabilityRayQueryKHR)]] +[[vk::ext_instruction(spv::OpRayQueryInitializeKHR)]] +void rayQueryInitializeEXT([[vk::ext_reference]] RayQueryKHR query, [[vk::ext_reference]] AccelerationStructureKHR AS, uint32_t flags, uint32_t cull mask, float3 origin, float32_t tmin, float3 direction, float32_t tmax); + [[vk::ext_capability(spv::CapabilityRayQueryKHR)]] [[vk::ext_instruction(spv::OpRayQueryInitializeKHR)]] -void rayQueryInitializeEXT(/* spv::OpTypeRayQueryKHR function_ptr query, spv::OpTypeAccelerationStructureKHR AS, uint32_t flags, uint32_t cull mask, float3 origin, float32_t tmin, float3 direction, float32_t tmax*/); +void rayQueryInitializeEXT([[vk::ext_reference]] RayQueryKHR query, [[vk::ext_reference]] RaytracingAccelerationStructure AS, uint32_t flags, uint32_t cull mask, float3 origin, float32_t tmin, float3 direction, float32_t tmax); [[vk::ext_capability(spv::CapabilityRayQueryKHR)]] [[vk::ext_instruction(spv::OpRayQueryProceedKHR)]] -bool rayQueryProceedEXT(/* spv::OpTypeRayQueryKHR function_ptr query */); +bool rayQueryProceedEXT([[vk::ext_reference]] RayQueryKHR query); [[vk::ext_capability(spv::CapabilityRayQueryKHR)]] [[vk::ext_instruction(spv::OpRayQueryGetIntersectionTypeKHR)]] -int rayQueryGetIntersectionTypeEXT(/* spv::OpTypeRayQueryKHR function_ptr query, uint32_t intersection (candidate 0, commited 1) */); +int rayQueryGetIntersectionTypeEXT([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection); [[vk::ext_capability(spv::CapabilityRayQueryKHR)]] [[vk::ext_instruction(spv::OpRayQueryGetIntersectionInstanceIdKHR)]] -int rayQueryGetIntersectionInstanceIdEXT(/* spv::OpTypeRayQueryKHR function_ptr query, uint32_t intersection (candidate 0, commited 1) */); +int rayQueryGetIntersectionInstanceIdEXT([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection); [[vk::ext_capability(spv::CapabilityRayQueryKHR)]] 
[[vk::ext_instruction(spv::OpRayQueryGetIntersectionPrimitiveIndexKHR)]] -int rayQueryGetIntersectionPrimitiveIndexEXT(/* spv::OpTypeRayQueryKHR function_ptr query, uint32_t intersection (candidate 0, commited 1) */); +int rayQueryGetIntersectionPrimitiveIndexEXT([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection); // position fetch for ray tracing uses gl_HitTriangleVertexPositionsEXT -> HitTriangleVertexPositionsKHR decorated OpVariable [[vk::ext_capability(spv::CapabilityRayQueryPositionFetchKHR)]] [[vk::ext_instruction(spv::OpRayQueryGetIntersectionTriangleVertexPositionsKHR)]] // ray query version -// get intersection triangle vertex position - gl_HitTriangleVertexPositionsEXT -// returns 3-array float3 vertex positions -void rayQueryGetIntersectionTriangleVertexPositionsEXT(/* spv::OpTypeRayQueryKHR function_ptr query, uint32_t intersection (candidate 0, commited 1), out float3 positions[3] */); +void rayQueryGetIntersectionTriangleVertexPositionsEXT([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection, out float32_t3 pos[3]); } } From fedac6237bd2199d7256a1c07b10322dfdff15d5 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 21 Nov 2024 12:20:06 +0700 Subject: [PATCH 216/358] some corrections to init --- .../hlsl/spirv_intrinsics/raytracing.hlsl | 29 ++++++++++++++----- src/nbl/video/CVulkanPhysicalDevice.cpp | 5 ++++ 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl index 86e9cc687b..d953a92bef 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl @@ -13,7 +13,10 @@ namespace hlsl namespace spirv { +//[[vk::ext_capability(spv::CapabilityRayQueryKHR)]] https://github.com/microsoft/DirectXShaderCompiler/issues/6958 using RayQueryKHR = vk::SpirvOpaqueType; + +//[[vk::ext_capability(spv::CapabilityAccelerationStructureKHR)]] using AccelerationStructureKHR = vk::SpirvOpaqueType; // matching Ray Query Committed Intersection Type @@ -26,34 +29,46 @@ static const uint32_t RayQueryCandidateIntersectionTriangleKHR = 0; static const uint32_t RayQueryCandidateIntersectionAABBKHR = 1; [[vk::ext_capability(spv::CapabilityRayQueryKHR)]] +[[vk::ext_extension("SPV_KHR_ray_query")]] [[vk::ext_instruction(spv::OpRayQueryInitializeKHR)]] -void rayQueryInitializeEXT([[vk::ext_reference]] RayQueryKHR query, [[vk::ext_reference]] AccelerationStructureKHR AS, uint32_t flags, uint32_t cull mask, float3 origin, float32_t tmin, float3 direction, float32_t tmax); +void rayQueryInitializeKHR([[vk::ext_reference]] RayQueryKHR query, [[vk::ext_reference]] AccelerationStructureKHR AS, uint32_t flags, uint32_t cull mask, float3 origin, float32_t tmin, float3 direction, float32_t tmax); [[vk::ext_capability(spv::CapabilityRayQueryKHR)]] +[[vk::ext_extension("SPV_KHR_ray_query")]] [[vk::ext_instruction(spv::OpRayQueryInitializeKHR)]] -void rayQueryInitializeEXT([[vk::ext_reference]] RayQueryKHR query, [[vk::ext_reference]] RaytracingAccelerationStructure AS, uint32_t flags, uint32_t cull mask, float3 origin, float32_t tmin, float3 direction, float32_t tmax); +void rayQueryInitializeKHR([[vk::ext_reference]] RayQueryKHR query, [[vk::ext_reference]] RaytracingAccelerationStructure AS, uint32_t flags, uint32_t cull mask, float3 origin, float32_t tmin, float3 direction, float32_t tmax); [[vk::ext_capability(spv::CapabilityRayQueryKHR)]] +[[vk::ext_extension("SPV_KHR_ray_query")]] 
[[vk::ext_instruction(spv::OpRayQueryProceedKHR)]] -bool rayQueryProceedEXT([[vk::ext_reference]] RayQueryKHR query); +bool rayQueryProceedKHR([[vk::ext_reference]] RayQueryKHR query); [[vk::ext_capability(spv::CapabilityRayQueryKHR)]] +[[vk::ext_extension("SPV_KHR_ray_query")]] [[vk::ext_instruction(spv::OpRayQueryGetIntersectionTypeKHR)]] -int rayQueryGetIntersectionTypeEXT([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection); +int rayQueryGetIntersectionTypeKHR([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection); [[vk::ext_capability(spv::CapabilityRayQueryKHR)]] +[[vk::ext_extension("SPV_KHR_ray_query")]] [[vk::ext_instruction(spv::OpRayQueryGetIntersectionInstanceIdKHR)]] -int rayQueryGetIntersectionInstanceIdEXT([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection); +int rayQueryGetIntersectionInstanceIdKHR([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection); [[vk::ext_capability(spv::CapabilityRayQueryKHR)]] +[[vk::ext_extension("SPV_KHR_ray_query")]] [[vk::ext_instruction(spv::OpRayQueryGetIntersectionPrimitiveIndexKHR)]] -int rayQueryGetIntersectionPrimitiveIndexEXT([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection); +int rayQueryGetIntersectionPrimitiveIndexKHR([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection); // position fetch for ray tracing uses gl_HitTriangleVertexPositionsEXT -> HitTriangleVertexPositionsKHR decorated OpVariable +[[vk::ext_builtin_input(spv::BuiltInHitTriangleVertexPositionsKHR)]] +static const float32_t3 HitTriangleVertexPositionsKHR[3]; [[vk::ext_capability(spv::CapabilityRayQueryPositionFetchKHR)]] +[[vk::ext_extension("SPV_KHR_ray_tracing_position_fetch")]] [[vk::ext_instruction(spv::OpRayQueryGetIntersectionTriangleVertexPositionsKHR)]] // ray query version -void rayQueryGetIntersectionTriangleVertexPositionsEXT([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection, out float32_t3 pos[3]); +void rayQueryGetIntersectionTriangleVertexPositionsKHR([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection, out float32_t3 pos[3]); +// how else would you pass an array of float3s? 
+// attempting to compile this with usage on godbolt gives: fatal error: failed to legalize SPIR-V: Id 3346 is defined more than once +// but without usage is fine } } diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index d9740bc813..79ffc7306d 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -1572,6 +1572,9 @@ core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevic &rasterizationOrderAttachmentAccessFeatures ); + VkPhysicalDeviceRayTracingPositionFetchFeaturesKHR rayTracingPositionFetchFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_TRACING_POSITION_FETCH_FEATURES_KHR, nullptr }; + REQUIRE_EXTENSION_IF(enabledFeatures.accelerationStructure && m_initData.properties.limits.rayTracingPositionFetch, VK_KHR_RAY_TRACING_POSITION_FETCH_EXTENSION_NAME, &rayTracingPositionFetchFeatures); + VkPhysicalDeviceColorWriteEnableFeaturesEXT colorWriteEnableFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COLOR_WRITE_ENABLE_FEATURES_EXT,nullptr }; enableExtensionIfAvailable(VK_EXT_COLOR_WRITE_ENABLE_EXTENSION_NAME,&colorWriteEnableFeatures); #if 0 @@ -1757,6 +1760,8 @@ core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevic rayQueryFeatures.rayQuery = enabledFeatures.rayQuery; + rayTracingPositionFetchFeatures.rayTracingPositionFetch = limits.rayTracingPositionFetch; + //shaderSMBuiltinsFeaturesNV [LIMIT SO ENABLE EVERYTHING BY DEFAULT] representativeFragmentTestFeatures.representativeFragmentTest = enabledFeatures.representativeFragmentTest; From be03a55106b26082729ab11a57c5a2d747a2a02d Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 21 Nov 2024 16:34:13 +0700 Subject: [PATCH 217/358] ray query inline spirv working --- examples_tests | 2 +- .../builtin/hlsl/spirv_intrinsics/raytracing.hlsl | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/examples_tests b/examples_tests index 2af7f38067..31a565c93c 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 2af7f380674697637c2fc1384c098bb9e14bf572 +Subproject commit 31a565c93c788fa05d1cd9651cc00a0ae7bca5c7 diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl index d953a92bef..4a77535f30 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl @@ -28,15 +28,18 @@ static const uint32_t RayQueryCommittedIntersectionGeneratedKHR = 2; static const uint32_t RayQueryCandidateIntersectionTriangleKHR = 0; static const uint32_t RayQueryCandidateIntersectionAABBKHR = 1; +[[vk::ext_instruction(spv::OpConvertUToAccelerationStructureKHR)]] +AccelerationStructureKHR accelerationStructureKHR(uint64_t u); + [[vk::ext_capability(spv::CapabilityRayQueryKHR)]] [[vk::ext_extension("SPV_KHR_ray_query")]] [[vk::ext_instruction(spv::OpRayQueryInitializeKHR)]] -void rayQueryInitializeKHR([[vk::ext_reference]] RayQueryKHR query, [[vk::ext_reference]] AccelerationStructureKHR AS, uint32_t flags, uint32_t cull mask, float3 origin, float32_t tmin, float3 direction, float32_t tmax); +void rayQueryInitializeKHR([[vk::ext_reference]] RayQueryKHR query, AccelerationStructureKHR AS, uint32_t flags, uint32_t cullMask, float32_t3 origin, float32_t tmin, float32_t3 direction, float32_t tmax); [[vk::ext_capability(spv::CapabilityRayQueryKHR)]] [[vk::ext_extension("SPV_KHR_ray_query")]] [[vk::ext_instruction(spv::OpRayQueryInitializeKHR)]] -void 
rayQueryInitializeKHR([[vk::ext_reference]] RayQueryKHR query, [[vk::ext_reference]] RaytracingAccelerationStructure AS, uint32_t flags, uint32_t cull mask, float3 origin, float32_t tmin, float3 direction, float32_t tmax); +void rayQueryInitializeKHR([[vk::ext_reference]] RayQueryKHR query, RaytracingAccelerationStructure AS, uint32_t flags, uint32_t cullMask, float32_t3 origin, float32_t tmin, float32_t3 direction, float32_t tmax); [[vk::ext_capability(spv::CapabilityRayQueryKHR)]] [[vk::ext_extension("SPV_KHR_ray_query")]] @@ -58,6 +61,11 @@ int rayQueryGetIntersectionInstanceIdKHR([[vk::ext_reference]] RayQueryKHR query [[vk::ext_instruction(spv::OpRayQueryGetIntersectionPrimitiveIndexKHR)]] int rayQueryGetIntersectionPrimitiveIndexKHR([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection); +[[vk::ext_capability(spv::CapabilityRayQueryKHR)]] +[[vk::ext_extension("SPV_KHR_ray_query")]] +[[vk::ext_instruction(spv::OpRayQueryGetIntersectionBarycentricsKHR)]] +float2 rayQueryGetIntersectionBarycentricsKHR([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection); + // position fetch for ray tracing uses gl_HitTriangleVertexPositionsEXT -> HitTriangleVertexPositionsKHR decorated OpVariable [[vk::ext_builtin_input(spv::BuiltInHitTriangleVertexPositionsKHR)]] static const float32_t3 HitTriangleVertexPositionsKHR[3]; @@ -74,4 +82,4 @@ void rayQueryGetIntersectionTriangleVertexPositionsKHR([[vk::ext_reference]] Ray } } -#endif // _NBL_BUILTIN_HLSL_SPIRV_INTRINSICS_RAYTRACING_INCLUDED_ \ No newline at end of file +#endif // _NBL_BUILTIN_HLSL_SPIRV_INTRINSICS_RAYTRACING_INCLUDED_ From 264dd697ccf492572d82c89124c79402afc505a7 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 22 Nov 2024 10:15:59 +0700 Subject: [PATCH 218/358] fixed position fetch definition --- examples_tests | 2 +- include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/examples_tests b/examples_tests index 31a565c93c..b0ab7b560f 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 31a565c93c788fa05d1cd9651cc00a0ae7bca5c7 +Subproject commit b0ab7b560f9150192c25da88ed2c2729e5e31da2 diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl index 4a77535f30..63b519c12a 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl @@ -70,13 +70,12 @@ float2 rayQueryGetIntersectionBarycentricsKHR([[vk::ext_reference]] RayQueryKHR [[vk::ext_builtin_input(spv::BuiltInHitTriangleVertexPositionsKHR)]] static const float32_t3 HitTriangleVertexPositionsKHR[3]; +using __arr3_float3 = float32_t3[3]; + [[vk::ext_capability(spv::CapabilityRayQueryPositionFetchKHR)]] [[vk::ext_extension("SPV_KHR_ray_tracing_position_fetch")]] [[vk::ext_instruction(spv::OpRayQueryGetIntersectionTriangleVertexPositionsKHR)]] // ray query version -void rayQueryGetIntersectionTriangleVertexPositionsKHR([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection, out float32_t3 pos[3]); -// how else would you pass an array of float3s? 
-// attempting to compile this with usage on godbolt gives: fatal error: failed to legalize SPIR-V: Id 3346 is defined more than once -// but without usage is fine +__arr3_float3 rayQueryGetIntersectionTriangleVertexPositionsKHR([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection); } } From 076128f0e6c719ec9c920a81b28c2fd25a8f3a3a Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 22 Nov 2024 10:19:01 +0700 Subject: [PATCH 219/358] added new spirv intrinsic file to builtin --- src/nbl/builtin/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index 2818b3f64c..b70a5f119e 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -213,6 +213,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/macros.h") #spirv intrinsics LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/core.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/fragment_shader_pixel_interlock.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/raytracing.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_arithmetic.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_ballot.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_basic.hlsl") From 87bf160f983425bdcb2b89e1a813a6b488a9d411 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 22 Nov 2024 15:15:36 +0700 Subject: [PATCH 220/358] minor adjustments and example --- examples_tests | 2 +- .../hlsl/spirv_intrinsics/raytracing.hlsl | 17 ++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/examples_tests b/examples_tests index b0ab7b560f..225a480a52 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit b0ab7b560f9150192c25da88ed2c2729e5e31da2 +Subproject commit 225a480a52003e1c406c4a8d8391d2c22ebdbd94 diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl index 63b519c12a..87d6b83f52 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl @@ -49,36 +49,35 @@ bool rayQueryProceedKHR([[vk::ext_reference]] RayQueryKHR query); [[vk::ext_capability(spv::CapabilityRayQueryKHR)]] [[vk::ext_extension("SPV_KHR_ray_query")]] [[vk::ext_instruction(spv::OpRayQueryGetIntersectionTypeKHR)]] -int rayQueryGetIntersectionTypeKHR([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection); +int rayQueryGetIntersectionTypeKHR([[vk::ext_reference]] RayQueryKHR query, uint32_t committed); [[vk::ext_capability(spv::CapabilityRayQueryKHR)]] [[vk::ext_extension("SPV_KHR_ray_query")]] [[vk::ext_instruction(spv::OpRayQueryGetIntersectionInstanceIdKHR)]] -int rayQueryGetIntersectionInstanceIdKHR([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection); +int rayQueryGetIntersectionInstanceIdKHR([[vk::ext_reference]] RayQueryKHR query, uint32_t committed); [[vk::ext_capability(spv::CapabilityRayQueryKHR)]] [[vk::ext_extension("SPV_KHR_ray_query")]] [[vk::ext_instruction(spv::OpRayQueryGetIntersectionPrimitiveIndexKHR)]] -int rayQueryGetIntersectionPrimitiveIndexKHR([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection); +int rayQueryGetIntersectionPrimitiveIndexKHR([[vk::ext_reference]] RayQueryKHR query, uint32_t committed); [[vk::ext_capability(spv::CapabilityRayQueryKHR)]] 
[[vk::ext_extension("SPV_KHR_ray_query")]] [[vk::ext_instruction(spv::OpRayQueryGetIntersectionBarycentricsKHR)]] -float2 rayQueryGetIntersectionBarycentricsKHR([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection); +float2 rayQueryGetIntersectionBarycentricsKHR([[vk::ext_reference]] RayQueryKHR query, uint32_t committed); // position fetch for ray tracing uses gl_HitTriangleVertexPositionsEXT -> HitTriangleVertexPositionsKHR decorated OpVariable [[vk::ext_builtin_input(spv::BuiltInHitTriangleVertexPositionsKHR)]] static const float32_t3 HitTriangleVertexPositionsKHR[3]; -using __arr3_float3 = float32_t3[3]; - +// ray query version [[vk::ext_capability(spv::CapabilityRayQueryPositionFetchKHR)]] [[vk::ext_extension("SPV_KHR_ray_tracing_position_fetch")]] -[[vk::ext_instruction(spv::OpRayQueryGetIntersectionTriangleVertexPositionsKHR)]] // ray query version -__arr3_float3 rayQueryGetIntersectionTriangleVertexPositionsKHR([[vk::ext_reference]] RayQueryKHR query, uint32_t intersection); +[[vk::ext_instruction(spv::OpRayQueryGetIntersectionTriangleVertexPositionsKHR)]] +float3 rayQueryGetIntersectionTriangleVertexPositionsKHR([[vk::ext_reference]] RayQueryKHR query, uint32_t committed)[3]; } } } -#endif // _NBL_BUILTIN_HLSL_SPIRV_INTRINSICS_RAYTRACING_INCLUDED_ +#endif From 8562bcfac41411d1d9b70a1d725f66711e29cbc7 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 22 Nov 2024 16:18:36 +0700 Subject: [PATCH 221/358] removed redundant constants --- examples_tests | 2 +- .../nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/examples_tests b/examples_tests index 225a480a52..386e2a7ebc 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 225a480a52003e1c406c4a8d8391d2c22ebdbd94 +Subproject commit 386e2a7ebc957b7dffdb10ce27949f0c4e34a161 diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl index 87d6b83f52..81295e31e5 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl @@ -19,15 +19,6 @@ using RayQueryKHR = vk::SpirvOpaqueType; //[[vk::ext_capability(spv::CapabilityAccelerationStructureKHR)]] using AccelerationStructureKHR = vk::SpirvOpaqueType; -// matching Ray Query Committed Intersection Type -static const uint32_t RayQueryCommittedIntersectionNoneKHR = 0; -static const uint32_t RayQueryCommittedIntersectionTriangleKHR = 1; -static const uint32_t RayQueryCommittedIntersectionGeneratedKHR = 2; - -// matching Ray Query Candidate Intersection Type -static const uint32_t RayQueryCandidateIntersectionTriangleKHR = 0; -static const uint32_t RayQueryCandidateIntersectionAABBKHR = 1; - [[vk::ext_instruction(spv::OpConvertUToAccelerationStructureKHR)]] AccelerationStructureKHR accelerationStructureKHR(uint64_t u); From 1f174fe1194095bc230b1a608710946c701ed3ba Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 22 Nov 2024 16:25:41 +0700 Subject: [PATCH 222/358] latest example --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 386e2a7ebc..4b774eacb5 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 386e2a7ebc957b7dffdb10ce27949f0c4e34a161 +Subproject commit 4b774eacb513eff8823b3c2e4229897d11d5f1a8 From 48d4440f0567a9be16a694ab540f9df7ae45f0ed Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 22 Nov 2024 16:42:40 +0700 Subject: 
[PATCH 223/358] fast-forward Vulkan-Headers, example --- 3rdparty/Vulkan-Headers | 2 +- examples_tests | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/3rdparty/Vulkan-Headers b/3rdparty/Vulkan-Headers index 2c823b7f27..31aa7f634b 160000 --- a/3rdparty/Vulkan-Headers +++ b/3rdparty/Vulkan-Headers @@ -1 +1 @@ -Subproject commit 2c823b7f27590ec0a489f7fbe14b154e13fa5cfb +Subproject commit 31aa7f634b052d87ede4664053e85f3f4d1d50d3 diff --git a/examples_tests b/examples_tests index 4b774eacb5..96b1826cf0 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 4b774eacb513eff8823b3c2e4229897d11d5f1a8 +Subproject commit 96b1826cf037d24bcef0b0b494a58252e008327e From e618e58f284dcf68c86f62173d832732b53e7496 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Fri, 22 Nov 2024 21:19:40 -0300 Subject: [PATCH 224/358] Yet more utils, such as bitreversal --- include/nbl/builtin/hlsl/fft/common.hlsl | 61 ++++++++++++++++++++- include/nbl/builtin/hlsl/workgroup/fft.hlsl | 43 ++++++++++++--- 2 files changed, 94 insertions(+), 10 deletions(-) diff --git a/include/nbl/builtin/hlsl/fft/common.hlsl b/include/nbl/builtin/hlsl/fft/common.hlsl index fa54da5a3f..3456123a5f 100644 --- a/include/nbl/builtin/hlsl/fft/common.hlsl +++ b/include/nbl/builtin/hlsl/fft/common.hlsl @@ -1,9 +1,47 @@ #ifndef _NBL_BUILTIN_HLSL_FFT_COMMON_INCLUDED_ #define _NBL_BUILTIN_HLSL_FFT_COMMON_INCLUDED_ -#include "nbl/builtin/hlsl/complex.hlsl" #include "nbl/builtin/hlsl/cpp_compat.hlsl" + +#ifndef __HLSL_VERSION +#include + +namespace nbl +{ +namespace hlsl +{ +namespace fft +{ + +static inline uint32_t3 padDimensions(uint32_t3 dimensions, std::span axes, bool realFFT = false) +{ + uint16_t axisCount = 0; + for (auto i : axes) + { + dimensions[i] = core::roundUpToPoT(dimensions[i]); + if (realFFT && !axisCount++) + dimensions[i] /= 2; + } + return dimensions; +} + +static inline uint64_t getOutputBufferSize(const uint32_t3& inputDimensions, uint32_t numChannels, std::span axes, bool realFFT = false, bool halfFloats = false) +{ + auto paddedDims = padDimensions(inputDimensions, axes); + uint64_t numberOfComplexElements = paddedDims[0] * paddedDims[1] * paddedDims[2] * numChannels; + return 2 * numberOfComplexElements * (halfFloats ? 
sizeof(float16_t) : sizeof(float32_t)); +} + + +} +} +} + +#else + +#include "nbl/builtin/hlsl/complex.hlsl" #include "nbl/builtin/hlsl/numbers.hlsl" +#include "nbl/builtin/hlsl/concepts.hlsl" namespace nbl { @@ -53,8 +91,29 @@ using DIT = DIX; template using DIF = DIX; + +// ------------------------------------------------- Utils --------------------------------------------------------- +// +// Util to unpack two values from the packed FFT X + iY - get outputs in the same input arguments, storing x to lo and y to hi +template +void unpack(NBL_REF_ARG(complex_t) lo, NBL_REF_ARG(complex_t) hi) +{ + complex_t x = (lo + conj(hi)) * Scalar(0.5); + hi = rotateRight(lo - conj(hi)) * Scalar(0.5); + lo = x; } + +// Bit-reverses T as a binary string of length given by Bits +template && Bits <= sizeof(T) * 8) +T bitReverse(T value) +{ + return glsl::bitfieldReverse(value) >> (sizeof(T) * 8 - Bits); } + } +} +} + +#endif #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/workgroup/fft.hlsl b/include/nbl/builtin/hlsl/workgroup/fft.hlsl index 44a0dfc1d7..dc1aa0576e 100644 --- a/include/nbl/builtin/hlsl/workgroup/fft.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/fft.hlsl @@ -1,6 +1,37 @@ #ifndef _NBL_BUILTIN_HLSL_WORKGROUP_FFT_INCLUDED_ #define _NBL_BUILTIN_HLSL_WORKGROUP_FFT_INCLUDED_ +#include +#include + +#ifndef __HLSL_VERSION +#include + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup +{ +namespace fft +{ + +inline std::pair optimalFFTParameters(const video::ILogicalDevice* device, uint32_t inputArrayLength) +{ + uint32_t maxWorkgroupSize = *device->getPhysicalDevice()->getLimits().maxWorkgroupSize; + // This is the logic found in core::roundUpToPoT to get the log2 + uint16_t workgroupSizeLog2 = 1u + hlsl::findMSB(core::min(inputArrayLength / 2, maxWorkgroupSize) - 1u); + uint16_t elementPerInvocationLog2 = 1u + hlsl::findMSB(core::max((inputArrayLength >> workgroupSizeLog2) - 1u, 1u)); + return { elementPerInvocationLog2, workgroupSizeLog2 }; +} + +} +} +} +} + +#else + #include "nbl/builtin/hlsl/subgroup/fft.hlsl" #include "nbl/builtin/hlsl/workgroup/basic.hlsl" #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" @@ -91,15 +122,6 @@ namespace impl } } //namespace impl -// Util to unpack two values from the packed FFT X + iY - get outputs in the same input arguments, storing x to lo and y to hi -template -void unpack(NBL_REF_ARG(complex_t) lo, NBL_REF_ARG(complex_t) hi) -{ - complex_t x = (lo + conj(hi)) * Scalar(0.5); - hi = rotateRight(lo - conj(hi)) * Scalar(0.5); - lo = x; -} - template struct FFTIndexingUtils { @@ -425,4 +447,7 @@ struct FFT Date: Tue, 26 Nov 2024 09:58:13 -0800 Subject: [PATCH 225/358] Implemented nbl::hlsl::mul --- examples_tests | 2 +- .../nbl/builtin/hlsl/emulated/float64_t.hlsl | 47 +++++------ .../builtin/hlsl/emulated/float64_t_impl.hlsl | 23 +++--- .../nbl/builtin/hlsl/emulated/vector_t.hlsl | 80 ++++++++----------- include/nbl/builtin/hlsl/functional.hlsl | 18 +++++ include/nbl/builtin/hlsl/ieee754/impl.hlsl | 10 +-- .../gauss_legendre/gauss_legendre.hlsl | 1 - .../nbl/builtin/hlsl/portable/float64_t.hlsl | 8 +- .../nbl/builtin/hlsl/portable/matrix_t.hlsl | 68 ++++++++-------- include/nbl/builtin/hlsl/shapes/beziers.hlsl | 7 +- include/nbl/builtin/hlsl/type_traits.hlsl | 3 + 11 files changed, 133 insertions(+), 134 deletions(-) diff --git a/examples_tests b/examples_tests index 049772c9f3..ad0e24f799 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 049772c9f3a37a17a2332ed0d7c3f78375d218ec +Subproject commit 
ad0e24f799afe4b7bb7f1675b061e8408d95a0dd diff --git a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl index eec7a27c46..cf2a7e494f 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl @@ -38,22 +38,22 @@ namespace hlsl NBL_CONSTEXPR_STATIC_INLINE this_t create(int32_t val) { - return bit_cast(emulated_float64_t_impl::castToUint64WithFloat64BitPattern(int64_t(val))); + return bit_cast(emulated_float64_t_impl::reinterpretAsFloat64BitPattern(int64_t(val))); } NBL_CONSTEXPR_STATIC_INLINE this_t create(int64_t val) { - return bit_cast(emulated_float64_t_impl::castToUint64WithFloat64BitPattern(val)); + return bit_cast(emulated_float64_t_impl::reinterpretAsFloat64BitPattern(val)); } NBL_CONSTEXPR_STATIC_INLINE this_t create(uint32_t val) { - return bit_cast(emulated_float64_t_impl::castToUint64WithFloat64BitPattern(uint64_t(val))); + return bit_cast(emulated_float64_t_impl::reinterpretAsFloat64BitPattern(uint64_t(val))); } NBL_CONSTEXPR_STATIC_INLINE this_t create(uint64_t val) { - return bit_cast(emulated_float64_t_impl::castToUint64WithFloat64BitPattern(val)); + return bit_cast(emulated_float64_t_impl::reinterpretAsFloat64BitPattern(val)); } NBL_CONSTEXPR_STATIC_INLINE this_t create(float32_t val) @@ -92,24 +92,14 @@ namespace hlsl { if(!FastMath) { - if (tgmath::isNaN(data) || tgmath::isNaN(rhs.data)) - return bit_cast(ieee754::traits::quietNaN); - - if (emulated_float64_t_impl::areBothInfinity(data, rhs.data)) - { - uint64_t lhsSign = data & ieee754::traits::signMask; - uint64_t rhsSign = rhs.data & ieee754::traits::signMask; - - if (lhsSign == rhsSign) - return bit_cast(ieee754::traits::inf | lhsSign); - else if (lhsSign || rhsSign) - return bit_cast(ieee754::traits::quietNaN | ieee754::traits::signMask); - } - + const bool isRhsInf = tgmath::isInf(rhs.data); if (tgmath::isInf(data)) + { + if (isRhsInf && ((data ^ rhs.data) & ieee754::traits::signMask)) + return bit_cast(ieee754::traits::quietNaN); return bit_cast(data); - - if (tgmath::isInf(rhs.data)) + } + else if (isRhsInf) return bit_cast(rhs.data); } @@ -128,17 +118,18 @@ namespace hlsl return bit_cast(ieee754::traits::inf | ieee754::extractSignPreserveBitPattern(max(lhsData, rhsData))); } - if (emulated_float64_t_impl::areBothZero(lhsData, rhsData)) - { - if (lhsSign == rhsSign) - return bit_cast(lhsSign); - else - return bit_cast(0ull); - } + const bool isRhsZero = emulated_float64_t_impl::isZero(rhsData); if (emulated_float64_t_impl::isZero(lhsData)) + { + if(isRhsZero) + return bit_cast((uint64_t(lhsData == rhsData) << 63) & lhsSign); + return bit_cast(rhsData); - if (emulated_float64_t_impl::isZero(rhsData)) + } + else if (isRhsZero) + { return bit_cast(lhsData); + } uint64_t lhsNormMantissa = ieee754::extractNormalizeMantissa(lhsData); uint64_t rhsNormMantissa = ieee754::extractNormalizeMantissa(rhsData); diff --git a/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl index e9f7b098ae..708c7ae4a3 100644 --- a/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl +++ b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl @@ -103,7 +103,7 @@ inline int _findMSB(uint64_t val) #endif } -inline uint64_t castToUint64WithFloat64BitPattern(uint64_t val) +inline uint64_t reinterpretAsFloat64BitPattern(uint64_t val) { if (isZero(val)) return val; @@ -149,25 +149,20 @@ inline uint64_t castToUint64WithFloat64BitPattern(uint64_t val) return biasedExp | mantissa; }; 
-inline uint64_t castToUint64WithFloat64BitPattern(int64_t val) +inline uint64_t reinterpretAsFloat64BitPattern(int64_t val) { const uint64_t sign = val & ieee754::traits::signMask; const uint64_t absVal = uint64_t(abs(val)); - return sign | castToUint64WithFloat64BitPattern(absVal); + return sign | reinterpretAsFloat64BitPattern(absVal); }; -NBL_CONSTEXPR_INLINE_FUNC uint32_t2 umulExtended(uint32_t lhs, uint32_t rhs) +template +NBL_CONSTEXPR_INLINE_FUNC Uint flushDenormToZero(const int extractedBiasedExp, Uint value) { - uint64_t product = uint64_t(lhs) * uint64_t(rhs); - uint32_t2 output; - output.x = uint32_t((product & 0xFFFFFFFF00000000) >> 32); - output.y = uint32_t(product & 0x00000000FFFFFFFFull); - return output; -} - -NBL_CONSTEXPR_INLINE_FUNC uint64_t flushDenormToZero(uint64_t extractedBiasedExponent, uint64_t value) -{ - return extractedBiasedExponent ? value : ieee754::extractSignPreserveBitPattern(value); + // TODO: + // static_assert(is_unsigned::value); + using AsFloat = typename float_of_size::type; + return extractedBiasedExp ? value : (value & ieee754::traits::signMask); } NBL_CONSTEXPR_INLINE_FUNC uint64_t assembleFloat64(uint64_t signShifted, uint64_t expShifted, uint64_t mantissa) diff --git a/include/nbl/builtin/hlsl/emulated/vector_t.hlsl b/include/nbl/builtin/hlsl/emulated/vector_t.hlsl index 768c6b8c85..89c78936a6 100644 --- a/include/nbl/builtin/hlsl/emulated/vector_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/vector_t.hlsl @@ -2,6 +2,7 @@ #define _NBL_BUILTIN_HLSL_EMULATED_VECTOR_T_HLSL_INCLUDED_ #include +#include namespace nbl { @@ -320,6 +321,10 @@ struct emulated_vector : CRTP return output; } + //DEFINE_OPERATORS_FOR_TYPE(emulated_float64_t) + //DEFINE_OPERATORS_FOR_TYPE(emulated_float64_t) + //DEFINE_OPERATORS_FOR_TYPE(emulated_float64_t) + //DEFINE_OPERATORS_FOR_TYPE(emulated_float64_t) DEFINE_OPERATORS_FOR_TYPE(float32_t) DEFINE_OPERATORS_FOR_TYPE(float64_t) DEFINE_OPERATORS_FOR_TYPE(uint16_t) @@ -382,15 +387,6 @@ struct is_valid_emulated_vector is_same_v >; }; -template -struct array_get -{ - T operator()(NBL_REF_ARG(U) vec, const I ix) - { - return vec[ix]; - } -}; - template struct array_get, ComponentType, uint32_t> { @@ -400,43 +396,33 @@ struct array_get, ComponentType, uint32_t> } }; -template -struct array_get, ComponentType, uint32_t> -{ - ComponentType operator()(NBL_REF_ARG(emulated_vector_t3) vec, const uint32_t ix) - { - return vec.getComponent(ix); - } -}; - -template -struct array_get, ComponentType, uint32_t> -{ - ComponentType operator()(NBL_REF_ARG(emulated_vector_t4) vec, const uint32_t ix) - { - return vec.getComponent(ix); - } -}; - -#undef DEFINE_EMULATED_VECTOR_ARRAY_GET_SPECIALIZATION - -template -struct array_set -{ - void operator()(NBL_REF_ARG(U) arr, I index, T val) - { - arr[index] = val; - } -}; +//template +//struct array_set, ComponentType, uint32_t> +//{ +// void operator()(NBL_REF_ARG(emulated_vector_t2) vec, uint32_t index, ComponentType value) +// { +// vec.setComponent(index, value); +// } +//}; + +//template +//struct array_get, ComponentType, uint32_t> +//{ +// ComponentType operator()(typename emulated_vector_t vec, const uint32_t ix) +// { +// return vec.getComponent(ix); +// } +//}; + +//template +//struct array_get, ComponentType, uint32_t> +//{ +// ComponentType operator()(NBL_REF_ARG(emulated_vector_t4) vec, const uint32_t ix) +// { +// return vec.getComponent(ix); +// } +//}; -template -struct array_set, ComponentType, uint32_t> -{ - void operator()(NBL_REF_ARG(emulated_vector_t2) vec, uint32_t index, 
ComponentType value) - { - vec.setComponent(index, value); - } -}; template struct array_set, ComponentType, uint32_t> @@ -476,7 +462,7 @@ struct static_cast_helper, vector, void> { static inline emulated_vector_t3 cast(vector vec) { - emulated_vector_t2 output; + emulated_vector_t3 output; output.x = _static_cast(vec.x); output.y = _static_cast(vec.y); output.z = _static_cast(vec.z); @@ -490,7 +476,7 @@ struct static_cast_helper, vector, void> { static inline emulated_vector_t4 cast(vector vec) { - emulated_vector_t2 output; + emulated_vector_t4 output; output.x = _static_cast(vec.x); output.y = _static_cast(vec.y); output.z = _static_cast(vec.z); diff --git a/include/nbl/builtin/hlsl/functional.hlsl b/include/nbl/builtin/hlsl/functional.hlsl index c1b347c721..e1d7734bd6 100644 --- a/include/nbl/builtin/hlsl/functional.hlsl +++ b/include/nbl/builtin/hlsl/functional.hlsl @@ -195,6 +195,24 @@ struct maximum NBL_CONSTEXPR_STATIC_INLINE T identity = numeric_limits::lowest; // TODO: `all_components` }; +template +struct array_get +{ + ComponentType operator()(NBL_CONST_REF_ARG(ArrayType) arr, const I ix) + { + return arr[ix]; + } +}; + +template +struct array_set +{ + void operator()(NBL_REF_ARG(ArrayType) arr, I index, ComponentType val) + { + arr[index] = val; + } +}; + } } diff --git a/include/nbl/builtin/hlsl/ieee754/impl.hlsl b/include/nbl/builtin/hlsl/ieee754/impl.hlsl index e17eb9a8c7..b074d81f7c 100644 --- a/include/nbl/builtin/hlsl/ieee754/impl.hlsl +++ b/include/nbl/builtin/hlsl/ieee754/impl.hlsl @@ -26,15 +26,15 @@ NBL_CONSTEXPR_INLINE_FUNC bool isTypeAllowed() } template -NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size::type bitCastToUintType(T x) +NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size_t bitCastToUintType(T x) { - using AsUint = typename unsigned_integer_of_size::type; + using AsUint = unsigned_integer_of_size_t; return bit_cast(x); } // to avoid bit cast from uintN_t to uintN_t -template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size<2>::type bitCastToUintType(uint16_t x) { return x; } -template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size<4>::type bitCastToUintType(uint32_t x) { return x; } -template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size<8>::type bitCastToUintType(uint64_t x) { return x; } +template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size_t<2> bitCastToUintType(uint16_t x) { return x; } +template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size_t<4> bitCastToUintType(uint32_t x) { return x; } +template <> NBL_CONSTEXPR_INLINE_FUNC unsigned_integer_of_size_t<8> bitCastToUintType(uint64_t x) { return x; } template NBL_CONSTEXPR_INLINE_FUNC T castBackToFloatType(T x) diff --git a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl index 1eeba76546..890fcf174a 100644 --- a/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl +++ b/include/nbl/builtin/hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl @@ -5,7 +5,6 @@ #define _NBL_BUILTIN_HLSL_MATH_QUADRATURE_GAUSS_LEGENDRE_INCLUDED_ #include -// TODO: portable/float64_t.hlsl instead? 
#include namespace nbl diff --git a/include/nbl/builtin/hlsl/portable/float64_t.hlsl b/include/nbl/builtin/hlsl/portable/float64_t.hlsl index 131d37a87b..e269a4a43b 100644 --- a/include/nbl/builtin/hlsl/portable/float64_t.hlsl +++ b/include/nbl/builtin/hlsl/portable/float64_t.hlsl @@ -9,8 +9,12 @@ namespace hlsl { template #ifdef __HLSL_VERSION -//using portable_float64_t = typename conditional::shaderFloat64, float64_t, emulated_float64_t >::type; -using portable_float64_t = typename conditional::shaderFloat64, emulated_float64_t, emulated_float64_t >::type; +#ifdef NBL_FORCE_EMULATED_FLOAT_64 +using portable_float64_t = emulated_float64_t; +#else +using portable_float64_t = typename conditional::shaderFloat64, float64_t, emulated_float64_t >::type; +#endif + #else using portable_float64_t = float64_t; #endif diff --git a/include/nbl/builtin/hlsl/portable/matrix_t.hlsl b/include/nbl/builtin/hlsl/portable/matrix_t.hlsl index c0b2596cb5..46fc6836af 100644 --- a/include/nbl/builtin/hlsl/portable/matrix_t.hlsl +++ b/include/nbl/builtin/hlsl/portable/matrix_t.hlsl @@ -45,47 +45,51 @@ using portable_float64_t3x3 = portable_matrix_t3x3; namespace impl { -// TODO: move to emulated/matrix.hlsl -// TODO: make one template for all dimensions -template -struct PortableMul64Helper -{ - static inline V multiply(M mat, V vec) + template + struct mul_helper { - V output; + static inline RhsT multiply(LhsT lhs, RhsT rhs) + { + return mul(lhs, rhs); + } + }; - output.x = (mat.rows[0] * vec).calcComponentSum(); - output.y = (mat.rows[1] * vec).calcComponentSum(); - output.z = (mat.rows[2] * vec).calcComponentSum(); + template + struct mul_helper, emulated_vector_t > + { + using LhsT = emulated_matrix; + using RhsT = emulated_vector_t; - return output; - } -}; + static inline RhsT multiply(LhsT mat, RhsT vec) + { + emulated_vector_t output; + output.x = (mat.rows[0] * vec).calcComponentSum(); + output.y = (mat.rows[1] * vec).calcComponentSum(); + output.z = (mat.rows[2] * vec).calcComponentSum(); -template -struct PortableMul64Helper -{ - static inline V multiply(M mat, V vec) - { - return mul(mat, vec); - } -}; + return output; + } + }; } -#ifdef __HLSL_VERSION -template -V portableMul64(M mat, V vec) +// TODO: move to basic.hlsl? 
+// TODO: concepts, to ensure that LhsT is a matrix and RhsT is a vector type +template +RhsT mul(LhsT lhs, RhsT rhs) { - return impl::PortableMul64Helper >::multiply(mat, vec); + return impl::mul_helper::multiply(lhs, rhs); } -#else -template -V portableMul64(M mat, V vec) -{ - return impl::PortableMul64Helper::multiply(mat, vec); -} -#endif +//template +//emulated_vector_t mul(emulated_matrix mat, emulated_vector_t vec) +//{ +// emulated_vector_t output; +// output.x = (mat.rows[0] * vec).calcComponentSum(); +// output.y = (mat.rows[1] * vec).calcComponentSum(); +// output.z = (mat.rows[2] * vec).calcComponentSum(); +// +// return output; +//} } } diff --git a/include/nbl/builtin/hlsl/shapes/beziers.hlsl b/include/nbl/builtin/hlsl/shapes/beziers.hlsl index fa009074e0..d4ebe6d0d8 100644 --- a/include/nbl/builtin/hlsl/shapes/beziers.hlsl +++ b/include/nbl/builtin/hlsl/shapes/beziers.hlsl @@ -512,13 +512,12 @@ struct Quadratic }; // This function returns the analytic quartic equation to solve for lhs bezier's t value for intersection with another bezier curve -template +template static math::equations::Quartic getBezierBezierIntersectionEquation(NBL_CONST_REF_ARG(QuadraticBezier) lhs, NBL_CONST_REF_ARG(QuadraticBezier) rhs) { using float_t2 = portable_vector_t2; - using float64 = portable_float64_t; - using float64_vec2 = portable_float64_t2; - + using float64 = conditional_t || is_same_v, float64_t, emulated_float64_t >; + using float64_vec2 = conditional_t || is_same_v, float64_t2, emulated_vector_t2 > >; // Algorithm based on Computer Aided Geometric Design: // https://scholarsarchive.byu.edu/cgi/viewcontent.cgi?article=1000&context=facpub#page99 diff --git a/include/nbl/builtin/hlsl/type_traits.hlsl b/include/nbl/builtin/hlsl/type_traits.hlsl index d851620763..8fe39c0c40 100644 --- a/include/nbl/builtin/hlsl/type_traits.hlsl +++ b/include/nbl/builtin/hlsl/type_traits.hlsl @@ -723,6 +723,9 @@ struct float_of_size<8> using type = float64_t; }; +template +using float_of_size_t = typename float_of_size::type; + } } From 4359f91e627cc15dee7a79dec8a2004052f8475b Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Tue, 26 Nov 2024 13:55:29 -0800 Subject: [PATCH 226/358] Fixed array_get and array_set --- examples_tests | 2 +- include/nbl/builtin/hlsl/cpp_compat/basic.h | 4 +- .../nbl/builtin/hlsl/emulated/matrix_t.hlsl | 72 ++++++----------- .../nbl/builtin/hlsl/emulated/vector_t.hlsl | 78 ++++++------------- .../nbl/builtin/hlsl/portable/matrix_t.hlsl | 57 ++++++-------- 5 files changed, 78 insertions(+), 135 deletions(-) diff --git a/examples_tests b/examples_tests index ad0e24f799..d62f779d68 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit ad0e24f799afe4b7bb7f1675b061e8408d95a0dd +Subproject commit d62f779d6845e15599f10ce0ec90daf8a15c4e67 diff --git a/include/nbl/builtin/hlsl/cpp_compat/basic.h b/include/nbl/builtin/hlsl/cpp_compat/basic.h index 8788794ccf..08f446809a 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/basic.h +++ b/include/nbl/builtin/hlsl/cpp_compat/basic.h @@ -56,7 +56,9 @@ namespace nbl::hlsl // We need variadic macro in order to handle multi parameter templates because the // preprocessor parses the template parameters as different macro parameters. #define NBL_REF_ARG(...) typename nbl::hlsl::add_reference<__VA_ARGS__ >::type -#define NBL_CONST_REF_ARG(...) typename nbl::hlsl::add_reference>::type +//#define NBL_CONST_REF_ARG(...) typename nbl::hlsl::add_reference>::type + +#define NBL_CONST_REF_ARG(...) 
const __VA_ARGS__& #else diff --git a/include/nbl/builtin/hlsl/emulated/matrix_t.hlsl b/include/nbl/builtin/hlsl/emulated/matrix_t.hlsl index 2dde2bd90c..41219a6b7c 100644 --- a/include/nbl/builtin/hlsl/emulated/matrix_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/matrix_t.hlsl @@ -8,69 +8,47 @@ namespace nbl namespace hlsl { -template -struct emulated_matrix {}; - -template -struct emulated_matrix +template +struct emulated_matrix { - using vec_t = emulated_vector_t2; - using this_t = emulated_matrix; - - vec_t rows[2]; - - this_t getTransposed() NBL_CONST_MEMBER_FUNC - { - this_t output; - - output.rows[0].x = rows[0].x; - output.rows[1].x = rows[0].y; - - output.rows[0].y = rows[1].x; - output.rows[1].y = rows[1].y; - - return output; - } -}; + using vec_t = emulated_vector_t; + using this_t = emulated_matrix; + using transposed_t = emulated_matrix; -template -struct emulated_matrix -{ - using vec_t = emulated_vector_t3; - using this_t = emulated_matrix; + NBL_CONSTEXPR_STATIC_INLINE uint32_t getRowCount() { return RowCount; } + NBL_CONSTEXPR_STATIC_INLINE uint32_t getColumnCount() { return RowCount; } - vec_t rows[3]; + vec_t rows[RowCount]; - this_t getTransposed() NBL_CONST_MEMBER_FUNC + transposed_t getTransposed() { - this_t output; - - output.rows[0].x = rows[0].x; - output.rows[1].x = rows[0].y; - output.rows[2].x = rows[0].z; - - output.rows[0].y = rows[1].x; - output.rows[1].y = rows[1].y; - output.rows[2].y = rows[1].z; + static nbl::hlsl::array_get getter; + static nbl::hlsl::array_set setter; - output.rows[0].z = rows[2].x; - output.rows[1].z = rows[2].y; - output.rows[2].z = rows[2].z; + transposed_t output; + for (int i = 0; i < RowCount; ++i) + { + for (int j = 0; j < ColumnCount; ++j) + setter(output.rows[i], j, getter(rows[j], i)); + } return output; } - vec_t operator[](uint32_t rowIdx) - { - return rows[rowIdx]; - } + //vec_t operator[](uint32_t rowIdx) + //{ + // return rows[rowIdx]; + //} }; template using emulated_matrix_t2x2 = emulated_matrix; template using emulated_matrix_t3x3 = emulated_matrix; - +template +using emulated_matrix_t4x4 = emulated_matrix; +template +using emulated_matrix_t3x4 = emulated_matrix; } } #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/emulated/vector_t.hlsl b/include/nbl/builtin/hlsl/emulated/vector_t.hlsl index 89c78936a6..fdf78f65a7 100644 --- a/include/nbl/builtin/hlsl/emulated/vector_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/vector_t.hlsl @@ -379,6 +379,7 @@ template using emulated_vector_t4 = emulated_vector_impl::emulated_vector::type>; // TODO: better implementation +// TODO: do i even need that? 
template struct is_valid_emulated_vector { @@ -387,60 +388,29 @@ struct is_valid_emulated_vector is_same_v >; }; -template -struct array_get, ComponentType, uint32_t> -{ - ComponentType operator()(NBL_REF_ARG(emulated_vector_t2) vec, const uint32_t ix) - { - return vec.getComponent(ix); - } -}; - -//template -//struct array_set, ComponentType, uint32_t> -//{ -// void operator()(NBL_REF_ARG(emulated_vector_t2) vec, uint32_t index, ComponentType value) -// { -// vec.setComponent(index, value); -// } -//}; - -//template -//struct array_get, ComponentType, uint32_t> -//{ -// ComponentType operator()(typename emulated_vector_t vec, const uint32_t ix) -// { -// return vec.getComponent(ix); -// } -//}; - -//template -//struct array_get, ComponentType, uint32_t> -//{ -// ComponentType operator()(NBL_REF_ARG(emulated_vector_t4) vec, const uint32_t ix) -// { -// return vec.getComponent(ix); -// } -//}; - - -template -struct array_set, ComponentType, uint32_t> -{ - void operator()(NBL_REF_ARG(emulated_vector_t3) vec, uint32_t index, ComponentType value) - { - vec.setComponent(index, value); - } -}; - -template -struct array_set, ComponentType, uint32_t> -{ - void operator()(NBL_REF_ARG(emulated_vector_t4) vec, uint32_t index, ComponentType value) - { - vec.setComponent(index, value); - } -}; +// used this macro, because I can't make it work with templated array dimension +#define DEFINE_ARRAY_GET_SET_SPECIALIZATION(DIMENSION)\ +template\ +struct array_get, ComponentType, uint32_t>\ +{\ + inline ComponentType operator()(NBL_REF_ARG(emulated_vector_t##DIMENSION) vec, const uint32_t ix)\ + {\ + return vec.getComponent(ix);\ + }\ +};\ +template\ +struct array_set, ComponentType, uint32_t>\ +{\ + void operator()(NBL_REF_ARG(emulated_vector_t##DIMENSION) vec, uint32_t index, ComponentType value)\ + {\ + vec.setComponent(index, value);\ + }\ +};\ + +DEFINE_ARRAY_GET_SET_SPECIALIZATION(2) +DEFINE_ARRAY_GET_SET_SPECIALIZATION(3) +DEFINE_ARRAY_GET_SET_SPECIALIZATION(4) +#undef DEFINE_ARRAY_GET_SET_SPECIALIZATION namespace impl { diff --git a/include/nbl/builtin/hlsl/portable/matrix_t.hlsl b/include/nbl/builtin/hlsl/portable/matrix_t.hlsl index 46fc6836af..2a8769244e 100644 --- a/include/nbl/builtin/hlsl/portable/matrix_t.hlsl +++ b/include/nbl/builtin/hlsl/portable/matrix_t.hlsl @@ -43,33 +43,36 @@ template using portable_float64_t3x3 = portable_matrix_t3x3; #endif -namespace impl +namespace portable_matrix_impl { - template - struct mul_helper +template +struct mul_helper +{ + static inline RhsT multiply(LhsT lhs, RhsT rhs) { - static inline RhsT multiply(LhsT lhs, RhsT rhs) - { - return mul(lhs, rhs); - } - }; + return mul(lhs, rhs); + } +}; - template - struct mul_helper, emulated_vector_t > +// TODO: portable instead of emulated? so no need for partial spec? 
+template +struct mul_helper, emulated_vector_t > +{ + using LhsT = emulated_matrix; + using RhsT = emulated_vector_t; + + static inline RhsT multiply(LhsT mat, RhsT vec) { - using LhsT = emulated_matrix; - using RhsT = emulated_vector_t; + nbl::hlsl::array_get>, hlsl::emulated_float64_t> getter; + nbl::hlsl::array_set>, hlsl::emulated_float64_t> setter; - static inline RhsT multiply(LhsT mat, RhsT vec) - { - emulated_vector_t output; - output.x = (mat.rows[0] * vec).calcComponentSum(); - output.y = (mat.rows[1] * vec).calcComponentSum(); - output.z = (mat.rows[2] * vec).calcComponentSum(); + emulated_vector_t output; + for (int i = 0; i < RowCount; ++i) + setter(output, i, nbl::hlsl::dot(mat.rows[i], vec)); - return output; - } - }; + return output; + } +}; } // TODO: move to basic.hlsl? @@ -77,19 +80,9 @@ namespace impl template RhsT mul(LhsT lhs, RhsT rhs) { - return impl::mul_helper::multiply(lhs, rhs); + return portable_matrix_impl::mul_helper::multiply(lhs, rhs); } -//template -//emulated_vector_t mul(emulated_matrix mat, emulated_vector_t vec) -//{ -// emulated_vector_t output; -// output.x = (mat.rows[0] * vec).calcComponentSum(); -// output.y = (mat.rows[1] * vec).calcComponentSum(); -// output.z = (mat.rows[2] * vec).calcComponentSum(); -// -// return output; -//} } } From adb4262929e3daa74346a61022cfb35603cfbef2 Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Wed, 27 Nov 2024 16:52:22 -0800 Subject: [PATCH 227/358] Cherry picked matrix_traits.hlsl --- .../hlsl/matrix_utils/matrix_traits.hlsl | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl diff --git a/include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl b/include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl new file mode 100644 index 0000000000..28142af3ac --- /dev/null +++ b/include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl @@ -0,0 +1,44 @@ +#ifndef _NBL_BUILTIN_HLSL_MATRIX_UTILS_MATRIX_TRAITS_INCLUDED_ +#define _NBL_BUILTIN_HLSL_MATRIX_UTILS_MATRIX_TRAITS_INCLUDED_ + +#include + +namespace nbl +{ +namespace hlsl +{ + +template +struct matrix_traits; + +// i choose to implement it this way because of this DXC bug: https://github.com/microsoft/DirectXShaderCompiler/issues/7007 +#define DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(ROW_COUNT, COLUMN_COUNT) \ +template \ +struct matrix_traits > \ +{ \ + using ComponentType = T; \ + NBL_CONSTEXPR_STATIC_INLINE uint32_t RowCount = ROW_COUNT; \ + NBL_CONSTEXPR_STATIC_INLINE uint32_t ColumnCount = COLUMN_COUNT; \ + NBL_CONSTEXPR_STATIC_INLINE bool Square = RowCount == ColumnCount; \ +}; + +DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(2, 2) +DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(3, 3) +DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(4, 4) +DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(3, 4) + +// TODO: when this bug: https://github.com/microsoft/DirectXShaderCompiler/issues/7007 is fixed, uncomment and delete template specializations +/*template +struct matrix_traits > +{ + using ComponentType = T; + NBL_CONSTEXPR_STATIC_INLINE uint32_t RowCount = ROW_COUNT; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ColumnCount = COLUMN_COUNT; + NBL_CONSTEXPR_STATIC_INLINE bool Square = RowCount == ColumnCount; +}; +*/ + +} +} + +#endif \ No newline at end of file From 6bcbe565ed99571ee1fb45259813521cd155b58b Mon Sep 17 00:00:00 2001 From: Przemog1 Date: Thu, 28 Nov 2024 15:18:58 -0800 Subject: [PATCH 228/358] Extended nbl::hlsl::dot function so it can be used with emulated_vectors.h --- 
include/nbl/builtin/hlsl/array_accessors.hlsl | 30 +++++++++++ .../nbl/builtin/hlsl/cpp_compat/intrinsics.h | 13 ++--- include/nbl/builtin/hlsl/dot_product.hlsl | 25 ++++++++++ include/nbl/builtin/hlsl/functional.hlsl | 18 ------- .../builtin/hlsl/portable/dot_product.hlsl | 35 +++++++++++++ include/nbl/builtin/hlsl/scalar_of.hlsl | 23 +++++++++ .../builtin/hlsl/vector_utils/scalar_of.hlsl | 24 +++++++++ .../hlsl/vector_utils/vector_traits.hlsl | 50 +++++++++++++++++++ 8 files changed, 190 insertions(+), 28 deletions(-) create mode 100644 include/nbl/builtin/hlsl/array_accessors.hlsl create mode 100644 include/nbl/builtin/hlsl/dot_product.hlsl create mode 100644 include/nbl/builtin/hlsl/portable/dot_product.hlsl create mode 100644 include/nbl/builtin/hlsl/scalar_of.hlsl create mode 100644 include/nbl/builtin/hlsl/vector_utils/scalar_of.hlsl create mode 100644 include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl diff --git a/include/nbl/builtin/hlsl/array_accessors.hlsl b/include/nbl/builtin/hlsl/array_accessors.hlsl new file mode 100644 index 0000000000..11d685a150 --- /dev/null +++ b/include/nbl/builtin/hlsl/array_accessors.hlsl @@ -0,0 +1,30 @@ +#ifndef _NBL_BUILTIN_HLSL_ARRAY_ACCESSORS_HLSL_INCLUDED_ +#define _NBL_BUILTIN_HLSL_ARRAY_ACCESSORS_HLSL_INCLUDED_ + +#include + +namespace nbl +{ +namespace hlsl +{ +template +struct array_get +{ + ComponentType operator()(NBL_CONST_REF_ARG(ArrayType) arr, const I ix) + { + return arr[ix]; + } +}; + +template +struct array_set +{ + void operator()(NBL_REF_ARG(ArrayType) arr, I index, ComponentType val) + { + arr[index] = val; + } +}; +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h index e83ced0bc5..100e7bf2c3 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h +++ b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h @@ -10,6 +10,9 @@ #include #include #include "nbl/core/util/bitflag.h" +#include + +//#include namespace nbl::hlsl { @@ -44,16 +47,6 @@ NBL_BIT_OP_GLM_PASSTHROUGH(bitCount,bitCount) NBL_SIMPLE_GLM_PASSTHROUGH(cross,cross) NBL_SIMPLE_GLM_PASSTHROUGH(clamp,clamp) -template -inline scalar_type_t dot(const T& lhs, const T& rhs) -{ - scalar_type_t retval = lhs[0]*rhs[0]; - // whatever has a `scalar_type` specialization should be a pure vector - for (auto i=1; i