Skip to content

Commit 3a262ef

Browse files
Portable aoti (#15967)
Dynamically link against the CUDA runtime at the lib level, since the aoti blob dynamically links and it's probably bad if they end up being different versions. Also add the setting so that the aoti blob generates a fatbin. Not totally sure how to add CI for this one, since we really only expect portability of aoti blobs from a weaker GPU to a stronger one. Not sure what our access to machines looks like in CI; maybe I can figure it out as I add Windows CI tomorrow.
1 parent 063b61f commit 3a262ef

File tree

6 files changed

+102
-23
lines changed

6 files changed

+102
-23
lines changed

CMakePresets.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -155,9 +155,9 @@
155155
"EXECUTORCH_BUILD_CUDA": "ON"
156156
},
157157
"condition": {
158-
"lhs": "${hostSystemName}",
159-
"type": "equals",
160-
"rhs": "Linux"
158+
"type": "inList",
159+
"string": "${hostSystemName}",
160+
"list": ["Linux", "Windows"]
161161
}
162162
},
163163
{

backends/cuda/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ if(NOT EXECUTORCH_ROOT)
2828
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
2929
endif()
3030

31+
# Use dynamic linking for CUDA runtime
32+
set(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
33+
3134
find_package(CUDAToolkit REQUIRED)
3235

3336
# Use ExecutorTorch's standard way to find PyTorch libraries for AOTI

backends/cuda/cuda_backend.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7+
import os
78
import typing
89
from importlib import resources
910
from typing import Any, Dict, final, List
@@ -35,6 +36,43 @@ class CudaBackend(AotiBackend, BackendDetails):
3536
def get_device_name(cls) -> str:
3637
return "cuda"
3738

39+
@staticmethod
40+
def _setup_cuda_environment_for_fatbin() -> bool:
41+
"""
42+
Configure CUDA environment variables based on detected CUDA version and GPU architecture.
43+
These are needed to compile fatbin kernels for more portable binaries on older CUDA versions.
44+
Returns True if setup succeeded or if setup was skipped (CUDA >= 12.9), false otherwise.
45+
"""
46+
try:
47+
# Detect CUDA version from torch
48+
cuda_version = torch.version.cuda
49+
if cuda_version is None:
50+
return False
51+
52+
major, minor = map(int, cuda_version.split(".")[:2])
53+
54+
# Only set up environment variables for CUDA < 12.9
55+
if major > 12 or (major == 12 and minor >= 9):
56+
return True
57+
58+
# Set TRITON_PTXAS_PATH for CUDA 12.6+
59+
if major == 12 and minor >= 6:
60+
# Try versioned path first, fallback to symlinked path
61+
ptxas_path = f"/usr/local/cuda-{cuda_version}/bin/ptxas"
62+
if not os.path.exists(ptxas_path):
63+
ptxas_path = "/usr/local/cuda/bin/ptxas"
64+
if not os.path.exists(ptxas_path):
65+
return False
66+
os.environ["TRITON_PTXAS_PATH"] = ptxas_path
67+
68+
# Get compute capability of current CUDA device
69+
device = torch.cuda.current_device()
70+
capability = torch.cuda.get_device_capability(device)
71+
os.environ["TORCH_CUDA_ARCH_LIST"] = f"{capability[0]}.{capability[1]}"
72+
return True
73+
except Exception:
74+
return False
75+
3876
@classmethod
3977
def get_supported_fallback_kernels(cls) -> Dict[str, Any]:
4078
return {
@@ -78,6 +116,9 @@ def get_aoti_compile_options(
78116
Get AOTI compile options for CUDA backend.
79117
Options may vary based on platform (Linux vs Windows).
80118
"""
119+
120+
# Configure CUDA environment variables based on detected version
121+
emit_multi_arch_kernel = CudaBackend._setup_cuda_environment_for_fatbin()
81122
# Base options for all platforms
82123
options: Dict[str, typing.Any] = {
83124
# Disable this to support sdpa decomposition
@@ -100,6 +141,7 @@ def get_aoti_compile_options(
100141
"max_autotune_gemm_backends": "TRITON",
101142
# Use TRITON backend for convolution operations tuning only to avoid using operators in libtorch
102143
"max_autotune_conv_backends": "TRITON",
144+
"aot_inductor.emit_multi_arch_kernel": emit_multi_arch_kernel,
103145
}
104146

105147
# Parse compile_specs to check for platform

examples/models/voxtral/CMakePresets.json

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
"binaryDir": "${sourceDir}/../../../cmake-out/examples/models/voxtral",
88
"cacheVariables": {
99
"CMAKE_BUILD_TYPE": "Release",
10-
"CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out"
10+
"CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out",
11+
"CMAKE_PREFIX_PATH": "${sourceDir}/../../../cmake-out"
1112
}
1213
},
1314
{
@@ -23,9 +24,9 @@
2324
"EXECUTORCH_BUILD_CUDA": "ON"
2425
},
2526
"condition": {
26-
"lhs": "${hostSystemName}",
27-
"type": "equals",
28-
"rhs": "Linux"
27+
"type": "inList",
28+
"string": "${hostSystemName}",
29+
"list": ["Linux", "Windows"]
2930
}
3031
},
3132
{

examples/models/whisper/CMakeLists.txt

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,12 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
1313
set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../..")
1414
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
1515

16+
if(CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
17+
set(CMAKE_TOOLCHAIN_IOS ON)
18+
else()
19+
set(CMAKE_TOOLCHAIN_IOS OFF)
20+
endif()
21+
1622
# Let files say "include <executorch/path/to/header.h>"
1723
set(_common_include_directories ${EXECUTORCH_ROOT}/..)
1824

@@ -22,23 +28,30 @@ find_package(gflags REQUIRED)
2228

2329
list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
2430
find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
31+
executorch_target_link_options_shared_lib(executorch)
2532

2633
set(_link_libraries executorch gflags)
2734
set(_srcs multimodal.cpp)
2835

29-
list(
30-
APPEND
31-
_link_libraries
32-
optimized_native_cpu_ops_lib
33-
quantized_ops_lib
34-
custom_ops
35-
cpublas
36-
eigen_blas
37-
)
36+
# Common ops for all builds
37+
list(APPEND _link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
38+
executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
39+
40+
# CPU-only builds need quantized and custom ops
41+
if(NOT EXECUTORCH_BUILD_CUDA AND MSVC)
42+
list(APPEND _link_libraries quantized_ops_lib custom_ops)
43+
executorch_target_link_options_shared_lib(quantized_ops_lib)
44+
executorch_target_link_options_shared_lib(custom_ops)
45+
endif()
3846

3947
# XNNPACK
4048
if(TARGET xnnpack_backend)
41-
list(APPEND _link_libraries xnnpack_backend)
49+
set(xnnpack_backend_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
50+
if(TARGET kleidiai)
51+
list(APPEND xnnpack_backend_libs kleidiai)
52+
endif()
53+
list(APPEND _link_libraries ${xnnpack_backend_libs})
54+
executorch_target_link_options_shared_lib(xnnpack_backend)
4255
endif()
4356

4457
# Add LLM runner and extension module
@@ -70,7 +83,10 @@ list(
7083
if(EXECUTORCH_BUILD_CUDA)
7184
find_package(CUDAToolkit REQUIRED)
7285
list(APPEND _link_libraries aoti_cuda_backend)
73-
executorch_target_link_options_shared_lib(aoti_cuda_backend)
86+
if(NOT MSVC)
87+
# On non-MSVC, use shared lib options
88+
executorch_target_link_options_shared_lib(aoti_cuda_backend)
89+
endif()
7490
endif()
7591

7692
if(EXECUTORCH_BUILD_METAL)
@@ -82,8 +98,24 @@ endif()
8298
list(APPEND _link_libraries tokenizers::tokenizers)
8399

84100
add_executable(whisper_runner main.cpp)
101+
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
102+
target_link_options_gc_sections(whisper_runner)
103+
if(NOT APPLE AND NOT MSVC)
104+
target_link_options(whisper_runner PRIVATE "LINKER:-s")
105+
endif()
106+
endif()
85107

86108
target_include_directories(whisper_runner PUBLIC ${_common_include_directories})
87-
88109
target_link_libraries(whisper_runner PUBLIC ${_link_libraries})
89110
target_compile_options(whisper_runner PUBLIC ${_common_compile_options})
111+
112+
# On Windows, copy required DLLs to the executable directory
113+
if(MSVC AND EXECUTORCH_BUILD_CUDA)
114+
add_custom_command(
115+
TARGET whisper_runner
116+
POST_BUILD
117+
COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:aoti_cuda_shims>
118+
$<TARGET_FILE_DIR:whisper_runner>
119+
COMMENT "Copying aoti_cuda_shims.dll to whisper_runner directory"
120+
)
121+
endif()

examples/models/whisper/CMakePresets.json

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
"binaryDir": "${sourceDir}/../../../cmake-out/examples/models/whisper",
88
"cacheVariables": {
99
"CMAKE_BUILD_TYPE": "Release",
10-
"CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out"
10+
"CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out",
11+
"CMAKE_PREFIX_PATH": "${sourceDir}/../../../cmake-out"
1112
}
1213
},
1314
{
@@ -23,9 +24,9 @@
2324
"EXECUTORCH_BUILD_CUDA": "ON"
2425
},
2526
"condition": {
26-
"lhs": "${hostSystemName}",
27-
"type": "equals",
28-
"rhs": "Linux"
27+
"type": "inList",
28+
"string": "${hostSystemName}",
29+
"list": ["Linux", "Windows"]
2930
}
3031
},
3132
{

0 commit comments

Comments
 (0)