[build][cmake]: Bundle ACL dynlibs and torch libgomp for CPU extension builds

Radu2k · Radu2k · commit f721fed614f7 · 2025-11-04T17:57:23.000Z
Signed-off-by: Radu Salavat &lt;radu.salavat@arm.com&gt;
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
@@ -140,11 +140,11 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
         set(ENABLE_AVX512VNNI OFF)
         message(WARNING "Disable AVX512-VNNI ISA support, no avx512_vnni found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512VNNI=1.")
     endif()
-    
+
 elseif (AVX2_FOUND)
     list(APPEND CXX_COMPILE_FLAGS "-mavx2")
     message(WARNING "vLLM CPU backend using AVX2 ISA")
-    
+
 elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
     message(STATUS "PowerPC detected")
     if (POWER9_FOUND)
@@ -167,9 +167,9 @@ elseif (ASIMD_FOUND)
         add_compile_definitions(ARM_BF16_SUPPORT)
     else()
         message(WARNING "BF16 functionality is not available")
-        set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")  
+        set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")
     endif()
-    list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})     
+    list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
 elseif (S390_FOUND)
     message(STATUS "S390 detected")
     # Check for S390 VXE support
@@ -191,9 +191,33 @@ endif()
 
 # Build oneDNN for GEMM kernels (only for x86-AVX512 /ARM platforms)
 if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
+    # Set number of parallel build processes
+    include(ProcessorCount)
+    ProcessorCount(NPROC)
+    if(NOT NPROC)
+        set(NPROC 4)
+    endif()
+
+    # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
+    # and create a local shim dir with it
+    vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
+
+    find_library(OPEN_MP
+        NAMES gomp
+        PATHS ${VLLM_TORCH_GOMP_SHIM_DIR}
+        NO_DEFAULT_PATH
+        REQUIRED
+    )
+    # Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch
+    if (OPEN_MP)
+        set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}")
+    endif()
+
     # Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
     # TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
+    set(ONEDNN_AARCH64_USE_ACL OFF CACHE BOOL "")
     if(ASIMD_FOUND)
+        # Fetch and populate ACL
         if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
             message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}")
         else()
@@ -207,38 +231,37 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
                 GIT_PROGRESS   TRUE
             )
             set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}")
         endif()
+        set(ACL_LIB_DIR "$ENV{ACL_ROOT_DIR}/build")  # set for both user-provided and fetched ACL trees
 
         # Build ACL with scons
-        include(ProcessorCount)
-        ProcessorCount(_NPROC)
+        find_program(SCONS scons REQUIRED)
         set(_scons_cmd
-        scons -j${_NPROC}
-            Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
-            arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
-            multi_isa=1 openmp=1 cppthreads=0
+            ${SCONS} -j${NPROC}
+                Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
+                arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
+                multi_isa=1 openmp=1 cppthreads=0
         )
 
-        # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
-        # and create a local shim dir with it
-        include("${CMAKE_CURRENT_LIST_DIR}/utils.cmake")
-        vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
-
-        if(NOT VLLM_TORCH_GOMP_SHIM_DIR STREQUAL "")
-            list(APPEND _scons_cmd extra_link_flags=-L${VLLM_TORCH_GOMP_SHIM_DIR})
-        endif()
-
         execute_process(
             COMMAND ${_scons_cmd}
             WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
             RESULT_VARIABLE _acl_rc
         )
+
         if(NOT _acl_rc EQUAL 0)
             message(FATAL_ERROR "ACL SCons build failed (exit ${_acl_rc}).")
         endif()
 
-        set(ONEDNN_AARCH64_USE_ACL "ON")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
+        # Add ACL libraries to be linked as dynamic libraries
+        find_library(ACL_COMPUTE arm_compute PATHS "${ACL_LIB_DIR}" REQUIRED NO_DEFAULT_PATH)
+        find_library(ACL_COMPUTE_GRAPH arm_compute_graph PATHS "${ACL_LIB_DIR}" REQUIRED NO_DEFAULT_PATH)
+
+        list(APPEND DYNLIBS "${ACL_COMPUTE}" "${ACL_COMPUTE_GRAPH}")
+        list(APPEND DYNLIB_DIRS "${ACL_LIB_DIR}")
+
+        # VLLM/oneDNN settings for ACL
+        set(ONEDNN_AARCH64_USE_ACL ON CACHE BOOL "" FORCE)
         add_compile_definitions(VLLM_USE_ACL)
     endif()
 
@@ -349,9 +372,11 @@ define_gpu_extension_target(
     LANGUAGE CXX
     SOURCES ${VLLM_EXT_SRC}
     LIBRARIES ${LIBS}
+    DYNLIBS ${DYNLIBS}
+    DYNLIB_DIRS ${DYNLIB_DIRS}
     COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
     USE_SABI 3
     WITH_SOABI
 )
 
-message(STATUS "Enabling C extension.")
+message(STATUS "Enabling C extension.")
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
@@ -194,13 +194,13 @@ macro(clear_cuda_arches CUDA_ARCH_FLAGS)
 endmacro()
 
 #
-# Extract unique CUDA architectures from a list of compute capabilities codes in 
-# the form `<major><minor>[<letter>]`, convert them to the form sort 
-# `<major>.<minor>`, dedupes them and then sorts them in ascending order and 
+# Extract unique CUDA architectures from a list of compute capabilities codes in
+# the form `<major><minor>[<letter>]`, convert them to the form sort
+# `<major>.<minor>`, dedupes them and then sorts them in ascending order and
 # stores them in `OUT_ARCHES`.
 #
 # Example:
-#   CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a" 
+#   CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a"
 #   extract_unique_cuda_archs_ascending(OUT_ARCHES CUDA_ARCH_FLAGS)
 #   OUT_ARCHES="7.5;...;9.0"
 function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS)
@@ -221,15 +221,15 @@ function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS)
 endfunction()
 
 #
-# For a specific file set the `-gencode` flag in compile options conditionally 
-# for the CUDA language. 
+# For a specific file set the `-gencode` flag in compile options conditionally
+# for the CUDA language.
 #
 # Example:
 #   set_gencode_flag_for_srcs(
 #     SRCS "foo.cu"
 #     ARCH "compute_75"
 #     CODE "sm_75")
-#   adds: "-gencode arch=compute_75,code=sm_75" to the compile options for 
+#   adds: "-gencode arch=compute_75,code=sm_75" to the compile options for
 #    `foo.cu` (only for the CUDA language).
 #
 macro(set_gencode_flag_for_srcs)
@@ -249,14 +249,14 @@ macro(set_gencode_flag_for_srcs)
 endmacro(set_gencode_flag_for_srcs)
 
 #
-# For a list of source files set the `-gencode` flags in the files specific 
+# For a list of source files set the `-gencode` flags in the files specific
 #  compile options (specifically for the CUDA language).
 #
 # arguments are:
 #  SRCS: list of source files
 #  CUDA_ARCHS: list of CUDA architectures in the form `<major>.<minor>[letter]`
 #  BUILD_PTX_FOR_ARCH: if set to true, then the PTX code will be built
-#    for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS 
+#    for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS
 #    that is larger than BUILD_PTX_FOR_ARCH.
 #
 macro(set_gencode_flags_for_srcs)
@@ -410,7 +410,7 @@ endfunction()
 #
 # Override the GPU architectures detected by cmake/torch and filter them by
 # `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
-# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set 
+# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set
 # the architectures on a per file basis.
 #
 # Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.
@@ -473,6 +473,8 @@ endmacro()
 # COMPILE_FLAGS <flags>      - Extra compiler flags passed to NVCC/hip.
 # INCLUDE_DIRECTORIES <dirs> - Extra include directories.
 # LIBRARIES <libraries>      - Extra link libraries.
+# DYNLIBS <dyn_libs>         - Extra dynamic link libraries.
+# DYNLIB_DIRS <dyn_lib_dirs> - Extra dynamic link directories.
 # WITH_SOABI                 - Generate library with python SOABI suffix name.
 # USE_SABI <version>         - Use python stable api <version>
 #
@@ -483,7 +485,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
     GPU
     "WITH_SOABI"
     "DESTINATION;LANGUAGE;USE_SABI"
-    "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
+    "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES;DYNLIBS;DYNLIB_DIRS")
 
   # Add hipify preprocessing step when building with HIP/ROCm.
   if (GPU_LANGUAGE STREQUAL "HIP")
@@ -536,5 +538,75 @@ function (define_gpu_extension_target GPU_MOD_NAME)
     target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
   endif()
 
+  # Resolve, link and install any dynamic libraries
+  set(_dyn_link_items)
+  set(_dyn_install_paths)
+
+  foreach(_dyn_lib ${GPU_DYNLIBS})
+    if (IS_ABSOLUTE "${_dyn_lib}")
+      list(APPEND _dyn_link_items "${_dyn_lib}")
+      list(APPEND _dyn_install_paths "${_dyn_lib}")
+    else()
+      # find_library() caches its result; clear it so each library is searched.
+      unset(_found_lib)
+      unset(_found_lib CACHE)
+      if (GPU_DYNLIB_DIRS)
+        find_library(_found_lib NAMES "${_dyn_lib}" PATHS ${GPU_DYNLIB_DIRS} NO_DEFAULT_PATH)
+      endif()
+      if (NOT _found_lib)
+        # Not in DYNLIB_DIRS (or none given): use CMake's default search paths.
+        find_library(_found_lib NAMES "${_dyn_lib}")
+      endif()
+
+      if (_found_lib)
+        list(APPEND _dyn_link_items "${_found_lib}")
+        list(APPEND _dyn_install_paths "${_found_lib}")
+      else()
+        message(WARNING "DYNLIBS: could not resolve '${_dyn_lib}'
+          in DYNLIB_DIRS or system paths, we are
+          linking by name only, will not install.")
+        list(APPEND _dyn_link_items "${_dyn_lib}")
+      endif()
+    endif()
+  endforeach()
+
+  list(REMOVE_DUPLICATES _dyn_link_items)
+  list(REMOVE_DUPLICATES _dyn_install_paths)
+
+  if (_dyn_link_items)
+    if (UNIX AND NOT APPLE)
+      # Prevent linker from discarding unused dynamic libraries as
+      # they may be added through auditing with auditwheel or as part
+      # of the system's dynamic library search path.
+      #   (e.g.:/usr/lib)
+      target_link_options(${GPU_MOD_NAME} PRIVATE "LINKER:--no-as-needed")
+    endif()
+    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${_dyn_link_items})
+  endif()
+
+  # Add rpath settings to find dynamic libraries at runtime and after install
+  if(UNIX AND NOT APPLE)
+    set_target_properties(${GPU_MOD_NAME} PROPERTIES
+      BUILD_RPATH "\$ORIGIN;\$ORIGIN/.libs"
+      INSTALL_RPATH "\$ORIGIN;\$ORIGIN/.libs;"
+      INSTALL_RPATH_USE_LINK_PATH TRUE)
+  elseif(APPLE)
+    set_target_properties(${GPU_MOD_NAME} PROPERTIES
+      MACOSX_RPATH ON
+      BUILD_RPATH "@loader_path;@loader_path/.dylibs"
+      INSTALL_RPATH "@loader_path;@loader_path/.dylibs")
+  endif()
+
   install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME})
+
+  foreach(_dyn_lib_path ${_dyn_install_paths})
+    get_filename_component(_dir "${_dyn_lib_path}" DIRECTORY)
+    get_filename_component(_base "${_dyn_lib_path}" NAME)
+    file(GLOB _selected LIST_DIRECTORIES FALSE "${_dir}/${_base}*")
+    list(REMOVE_DUPLICATES _selected)
+    install(FILES "${_selected}"
+            DESTINATION "${GPU_DESTINATION}/.libs"
+            COMPONENT ${GPU_MOD_NAME})
+  endforeach()
+
 endfunction()