Skip to content

Commit 3a262ef

Browse files
Portable aoti (#15967)
Dynamically link against the CUDA runtime at the lib level, since the aoti blob dynamically links and it's probably bad if they end up being different versions. Also add the setting so that the aoti blob generates a fatbin. Not totally sure how to add CI for this one, since we really only expect portability of aoti blobs from a weaker GPU to a stronger one. Not sure what our access to machines looks like in CI; maybe I can figure it out as I add Windows CI tomorrow.
1 parent 063b61f commit 3a262ef

File tree

6 files changed

+102
-23
lines changed

6 files changed

+102
-23
lines changed

CMakePresets.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -155,9 +155,9 @@
155155
"EXECUTORCH_BUILD_CUDA": "ON"
156156
},
157157
"condition": {
158-
"lhs": "${hostSystemName}",
159-
"type": "equals",
160-
"rhs": "Linux"
158+
"type": "inList",
159+
"string": "${hostSystemName}",
160+
"list": ["Linux", "Windows"]
161161
}
162162
},
163163
{

backends/cuda/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ if(NOT EXECUTORCH_ROOT)
2828
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
2929
endif()
3030

31+
# Use dynamic linking for CUDA runtime
32+
set(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
33+
3134
find_package(CUDAToolkit REQUIRED)
3235

3336
# Use ExecutorTorch's standard way to find PyTorch libraries for AOTI

backends/cuda/cuda_backend.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7+
import os
78
import typing
89
from importlib import resources
910
from typing import Any, Dict, final, List
@@ -35,6 +36,43 @@ class CudaBackend(AotiBackend, BackendDetails):
3536
def get_device_name(cls) -> str:
3637
return "cuda"
3738

39+
@staticmethod
40+
def _setup_cuda_environment_for_fatbin() -> bool:
41+
"""
42+
Configure CUDA environment variables based on detected CUDA version and GPU architecture.
43+
These are needed to compile fatbin kernels for more portable binaries on older CUDA versions.
44+
Returns True if setup succeeded or if setup was skipped (CUDA >= 12.9), false otherwise.
45+
"""
46+
try:
47+
# Detect CUDA version from torch
48+
cuda_version = torch.version.cuda
49+
if cuda_version is None:
50+
return False
51+
52+
major, minor = map(int, cuda_version.split(".")[:2])
53+
54+
# Only set up environment variables for CUDA < 12.9
55+
if major > 12 or (major == 12 and minor >= 9):
56+
return True
57+
58+
# Set TRITON_PTXAS_PATH for CUDA 12.6+
59+
if major == 12 and minor >= 6:
60+
# Try versioned path first, fallback to symlinked path
61+
ptxas_path = f"/usr/local/cuda-{cuda_version}/bin/ptxas"
62+
if not os.path.exists(ptxas_path):
63+
ptxas_path = "/usr/local/cuda/bin/ptxas"
64+
if not os.path.exists(ptxas_path):
65+
return False
66+
os.environ["TRITON_PTXAS_PATH"] = ptxas_path
67+
68+
# Get compute capability of current CUDA device
69+
device = torch.cuda.current_device()
70+
capability = torch.cuda.get_device_capability(device)
71+
os.environ["TORCH_CUDA_ARCH_LIST"] = f"{capability[0]}.{capability[1]}"
72+
return True
73+
except Exception:
74+
return False
75+
3876
@classmethod
3977
def get_supported_fallback_kernels(cls) -> Dict[str, Any]:
4078
return {
@@ -78,6 +116,9 @@ def get_aoti_compile_options(
78116
Get AOTI compile options for CUDA backend.
79117
Options may vary based on platform (Linux vs Windows).
80118
"""
119+
120+
# Configure CUDA environment variables based on detected version
121+
emit_multi_arch_kernel = CudaBackend._setup_cuda_environment_for_fatbin()
81122
# Base options for all platforms
82123
options: Dict[str, typing.Any] = {
83124
# Disable this to support sdpa decomposition
@@ -100,6 +141,7 @@ def get_aoti_compile_options(
100141
"max_autotune_gemm_backends": "TRITON",
101142
# Use TRITON backend for convolution operations tuning only to avoid using operators in libtorch
102143
"max_autotune_conv_backends": "TRITON",
144+
"aot_inductor.emit_multi_arch_kernel": emit_multi_arch_kernel,
103145
}
104146

105147
# Parse compile_specs to check for platform

examples/models/voxtral/CMakePresets.json

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
"binaryDir": "${sourceDir}/../../../cmake-out/examples/models/voxtral",
88
"cacheVariables": {
99
"CMAKE_BUILD_TYPE": "Release",
10-
"CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out"
10+
"CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out",
11+
"CMAKE_PREFIX_PATH": "${sourceDir}/../../../cmake-out"
1112
}
1213
},
1314
{
@@ -23,9 +24,9 @@
2324
"EXECUTORCH_BUILD_CUDA": "ON"
2425
},
2526
"condition": {
26-
"lhs": "${hostSystemName}",
27-
"type": "equals",
28-
"rhs": "Linux"
27+
"type": "inList",
28+
"string": "${hostSystemName}",
29+
"list": ["Linux", "Windows"]
2930
}
3031
},
3132
{

examples/models/whisper/CMakeLists.txt

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,12 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
1313
set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../..")
1414
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
1515

16+
if(CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
17+
set(CMAKE_TOOLCHAIN_IOS ON)
18+
else()
19+
set(CMAKE_TOOLCHAIN_IOS OFF)
20+
endif()
21+
1622
# Let files say "include <executorch/path/to/header.h>"
1723
set(_common_include_directories ${EXECUTORCH_ROOT}/..)
1824

@@ -22,23 +28,30 @@ find_package(gflags REQUIRED)
2228

2329
list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
2430
find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
31+
executorch_target_link_options_shared_lib(executorch)
2532

2633
set(_link_libraries executorch gflags)
2734
set(_srcs multimodal.cpp)
2835

29-
list(
30-
APPEND
31-
_link_libraries
32-
optimized_native_cpu_ops_lib
33-
quantized_ops_lib
34-
custom_ops
35-
cpublas
36-
eigen_blas
37-
)
36+
# Common ops for all builds
37+
list(APPEND _link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
38+
executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
39+
40+
# CPU-only builds need quantized and custom ops
41+
if(NOT EXECUTORCH_BUILD_CUDA AND MSVC)
42+
list(APPEND _link_libraries quantized_ops_lib custom_ops)
43+
executorch_target_link_options_shared_lib(quantized_ops_lib)
44+
executorch_target_link_options_shared_lib(custom_ops)
45+
endif()
3846

3947
# XNNPACK
4048
if(TARGET xnnpack_backend)
41-
list(APPEND _link_libraries xnnpack_backend)
49+
set(xnnpack_backend_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
50+
if(TARGET kleidiai)
51+
list(APPEND xnnpack_backend_libs kleidiai)
52+
endif()
53+
list(APPEND _link_libraries ${xnnpack_backend_libs})
54+
executorch_target_link_options_shared_lib(xnnpack_backend)
4255
endif()
4356

4457
# Add LLM runner and extension module
@@ -70,7 +83,10 @@ list(
7083
if(EXECUTORCH_BUILD_CUDA)
7184
find_package(CUDAToolkit REQUIRED)
7285
list(APPEND _link_libraries aoti_cuda_backend)
73-
executorch_target_link_options_shared_lib(aoti_cuda_backend)
86+
if(NOT MSVC)
87+
# On non-MSVC, use shared lib options
88+
executorch_target_link_options_shared_lib(aoti_cuda_backend)
89+
endif()
7490
endif()
7591

7692
if(EXECUTORCH_BUILD_METAL)
@@ -82,8 +98,24 @@ endif()
8298
list(APPEND _link_libraries tokenizers::tokenizers)
8399

84100
add_executable(whisper_runner main.cpp)
101+
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
102+
target_link_options_gc_sections(whisper_runner)
103+
if(NOT APPLE AND NOT MSVC)
104+
target_link_options(whisper_runner PRIVATE "LINKER:-s")
105+
endif()
106+
endif()
85107

86108
target_include_directories(whisper_runner PUBLIC ${_common_include_directories})
87-
88109
target_link_libraries(whisper_runner PUBLIC ${_link_libraries})
89110
target_compile_options(whisper_runner PUBLIC ${_common_compile_options})
111+
112+
# On Windows, copy required DLLs to the executable directory
113+
if(MSVC AND EXECUTORCH_BUILD_CUDA)
114+
add_custom_command(
115+
TARGET whisper_runner
116+
POST_BUILD
117+
COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:aoti_cuda_shims>
118+
$<TARGET_FILE_DIR:whisper_runner>
119+
COMMENT "Copying aoti_cuda_shims.dll to whisper_runner directory"
120+
)
121+
endif()

examples/models/whisper/CMakePresets.json

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
"binaryDir": "${sourceDir}/../../../cmake-out/examples/models/whisper",
88
"cacheVariables": {
99
"CMAKE_BUILD_TYPE": "Release",
10-
"CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out"
10+
"CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out",
11+
"CMAKE_PREFIX_PATH": "${sourceDir}/../../../cmake-out"
1112
}
1213
},
1314
{
@@ -23,9 +24,9 @@
2324
"EXECUTORCH_BUILD_CUDA": "ON"
2425
},
2526
"condition": {
26-
"lhs": "${hostSystemName}",
27-
"type": "equals",
28-
"rhs": "Linux"
27+
"type": "inList",
28+
"string": "${hostSystemName}",
29+
"list": ["Linux", "Windows"]
2930
}
3031
},
3132
{

0 commit comments

Comments
 (0)