diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash
index 96305c056a..aa2dd81517 100644
--- a/.github/scripts/fbgemm_gpu_build.bash
+++ b/.github/scripts/fbgemm_gpu_build.bash
@@ -304,8 +304,10 @@ __configure_fbgemm_gpu_build_cuda () {
       local arch_list="7.5;8.0"
     fi
 
-  elif [[ $cuda_version_nvcc == *"V13.0"* ]] ||
-       [[ $cuda_version_nvcc == *"V12.9"* ]] ||
+  elif [[ $cuda_version_nvcc == *"V13.0"* ]]; then
+    local arch_list="8.0;9.0a;10.0a;12.0a"
+
+  elif [[ $cuda_version_nvcc == *"V12.9"* ]] ||
        [[ $cuda_version_nvcc == *"V12.8"* ]]; then
     local arch_list="7.5;8.0;9.0a;10.0a;12.0a"
 
diff --git a/.github/scripts/generate_ci_matrix.py b/.github/scripts/generate_ci_matrix.py
index b842433c8f..0d6cc47101 100644
--- a/.github/scripts/generate_ci_matrix.py
+++ b/.github/scripts/generate_ci_matrix.py
@@ -307,7 +307,7 @@ def cuda_versions(self) -> List[str]:
             return ["12.6.3", "12.8.1", "13.0.2"]
         else:
             # GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138
-            return ["12.6.3", "12.8.1"]
+            return ["12.6.3", "12.8.1", "13.0.2"]
 
     def rocm_versions(self) -> List[str]:
         if GitRepo.ref() == REFS_MAIN and GitRepo.event_name() == EVENT_NAME_PUSH:
diff --git a/cmake/modules/GpuCppLibrary.cmake b/cmake/modules/GpuCppLibrary.cmake
index 51c30df750..3985a633d3 100644
--- a/cmake/modules/GpuCppLibrary.cmake
+++ b/cmake/modules/GpuCppLibrary.cmake
@@ -87,15 +87,32 @@ function(prepare_target_sources)
         list(APPEND ${args_PREFIX}_sources_cu ${args_CUDA_SPECIFIC_SRCS})
     endif()
 
-    # Set source properties
-    set_source_files_properties(${${args_PREFIX}_sources_cu}
-        PROPERTIES COMPILE_OPTIONS
-        "${args_NVCC_FLAGS}")
-
+    # Set include directories
     set_source_files_properties(${${args_PREFIX}_sources_cu}
         PROPERTIES INCLUDE_DIRECTORIES
         "${args_INCLUDE_DIRS}")
 
+    # Starting with CUDA 13.0, nvcc changed the default visibility of
+    # __global__ functions to `hidden`, which causes symbol lookup errors
+    # during linking.  This can be worked around by setting -cudart=shared
+    # and --device-entity-has-hidden-visibility=false.
+    #
+    # https://developer.nvidia.com/blog/cuda-c-compiler-updates-impacting-elf-visibility-and-linkage/
+    if(    (FBGEMM_BUILD_VARIANT STREQUAL BUILD_VARIANT_CUDA) AND
+           (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") )
+        set(_nvcc_flags ${args_NVCC_FLAGS}
+            -cudart=shared
+            -static-global-template-stub=false
+            --device-entity-has-hidden-visibility=false)
+    else()
+        set(_nvcc_flags ${args_NVCC_FLAGS})
+    endif()
+
+    # Set compilation flags
+    set_source_files_properties(${${args_PREFIX}_sources_cu}
+        PROPERTIES COMPILE_OPTIONS
+        "${_nvcc_flags}")
+
     # Append to the full sources list
     list(APPEND ${args_PREFIX}_sources_combined ${${args_PREFIX}_sources_cu})
     endif()
diff --git a/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu b/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu
index 6574bda45e..b90ad76b34 100644
--- a/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu
@@ -57,7 +57,7 @@ template <
     typename index_t,
     size_t kThreadGroupSize >
-__launch_bounds__(kForwardMaxThreads) __global__ void
+__launch_bounds__(kForwardMaxThreads) __global__ __attribute__((visibility("default"))) void
 {%- if is_index_select %}
 batch_index_select_dim0_codegen_forward_small_kernel(
 {%- else %}
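Note on the two visibility hunks above: under nvcc 13.0, `__global__` entities default to hidden ELF visibility, so a kernel that must be resolvable across shared-object boundaries needs an explicit export. A minimal standalone sketch of the annotation pattern the template change applies (`scale_kernel` is a hypothetical kernel, not FBGEMM code):

```cuda
// Hypothetical kernel illustrating the annotation added in the template.
// With nvcc 13.0, __global__ functions default to hidden ELF visibility;
// marking the kernel with default visibility keeps its symbol exported so
// that launches resolved from other shared objects still find it.
__global__ __attribute__((visibility("default"))) void scale_kernel(
    float* data,
    float alpha,
    int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    data[idx] *= alpha;
  }
}
```
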
diff --git a/fbgemm_gpu/src/memory_utils/memory_utils.cu b/fbgemm_gpu/src/memory_utils/memory_utils.cu
index e7cba2bffe..98647f5a68 100644
--- a/fbgemm_gpu/src/memory_utils/memory_utils.cu
+++ b/fbgemm_gpu/src/memory_utils/memory_utils.cu
@@ -144,6 +144,13 @@ std::tuple<void*, size_t> adjust_to_page_boundaries(void* ptr, size_t size) {
   return std::make_tuple((void*)raw_ptr_adjusted, (size_t)size_adjusted);
 }
 
+cudaMemLocation new_mem_location_from_device(const int device_id) {
+  cudaMemLocation deviceLoc;
+  deviceLoc.type = cudaMemLocationTypeDevice;
+  deviceLoc.id = device_id;
+  return deviceLoc;
+}
+
 } // namespace
 
 Tensor new_managed_tensor(
@@ -158,11 +165,25 @@ Tensor new_managed_tensor(
 
   // Set preferred memory location to host memory
   AT_CUDA_CHECK(cudaMemAdvise(
-      ptr, size_bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
+      ptr, size_bytes, cudaMemAdviseSetPreferredLocation,
+#if CUDA_VERSION >= 13000
+      new_mem_location_from_device(cudaCpuDeviceId)
+#else
+      cudaCpuDeviceId
+#endif
+      ));
+
   // User hints with "accessed by": GPU will establish direct mapping of data
   // in CPU memory, no page faults will be generated
   AT_CUDA_CHECK(cudaMemAdvise(
-      ptr, size_bytes, cudaMemAdviseSetAccessedBy, at::cuda::current_device()));
+      ptr, size_bytes, cudaMemAdviseSetAccessedBy,
+#if CUDA_VERSION >= 13000
+      new_mem_location_from_device(at::cuda::current_device())
+#else
+      at::cuda::current_device()
+#endif
+      ));
+
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 
   // Work around fork issue - see uvm_mem_advice_dont_fork for details
@@ -353,7 +374,12 @@ void uvm_cuda_mem_advise(const Tensor& t, int64_t cuda_memory_advise) {
       ptr,
       size_bytes,
       static_cast<enum cudaMemoryAdvise>(cuda_memory_advise),
-      hint_device));
+#if CUDA_VERSION >= 13000
+      new_mem_location_from_device(hint_device)
+#else
+      hint_device
+#endif
+      ));
 
   return;
 }
@@ -379,7 +405,13 @@ void uvm_cuda_mem_prefetch_async(
 
   auto stream = at::cuda::getCurrentCUDAStream();
 
-  AT_CUDA_CHECK(cudaMemPrefetchAsync(ptr, size_bytes, prefetch_device, stream));
+  AT_CUDA_CHECK(cudaMemPrefetchAsync(ptr, size_bytes,
+#if CUDA_VERSION >= 13000
+      new_mem_location_from_device(prefetch_device), 0,
+#else
+      prefetch_device,
+#endif
+      stream));
 
   return;
 }
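A minimal sketch of the version-guarded call pattern the memory_utils.cu hunks introduce, assuming the CUDA 13 runtime signatures that take a cudaMemLocation instead of a bare device id. The helper mirrors new_mem_location_from_device from the diff; advise_preferred_location is an illustrative wrapper, not FBGEMM API:

```cuda
#include <cuda.h>          // defines CUDA_VERSION
#include <cuda_runtime.h>

// Mirrors the helper added in the diff: wrap a bare device id in the
// cudaMemLocation struct that the CUDA 13 advise/prefetch APIs expect.
cudaMemLocation new_mem_location_from_device(const int device_id) {
  cudaMemLocation deviceLoc;
  deviceLoc.type = cudaMemLocationTypeDevice;
  deviceLoc.id = device_id;
  return deviceLoc;
}

// Illustrative wrapper (hypothetical name): set the preferred location of
// a managed allocation under either toolkit generation.
cudaError_t advise_preferred_location(void* ptr, size_t size_bytes, int device) {
#if CUDA_VERSION >= 13000
  // CUDA 13: the advice target is a cudaMemLocation.
  return cudaMemAdvise(
      ptr, size_bytes, cudaMemAdviseSetPreferredLocation,
      new_mem_location_from_device(device));
#else
  // CUDA 12 and earlier: the advice target is a bare device id.
  return cudaMemAdvise(
      ptr, size_bytes, cudaMemAdviseSetPreferredLocation, device);
#endif
}
```

Note that the diff routes cudaCpuDeviceId through the same device-typed location rather than switching to cudaMemLocationTypeHost, which preserves the old integer-id behavior of the pre-13 API.
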
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cu b/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cu
index ac9b81cbcb..a281237cf1 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cu
+++ b/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cu
@@ -72,7 +72,11 @@ __global__ __launch_bounds__(kMaxThreads) void _batched_complete_cumsum_kernel(
       data = (val_t)values[blockIdx.x][i];
     }
     BlockScan(temp_storage).InclusiveSum(data, data, prefix_op);
-    cub::CTA_SYNC();
+#if CUDA_VERSION >= 13000
+    __syncthreads();
+#else
+    cub::CTA_SYNC();
+#endif
    if (i < len) {
      out[blockIdx.x][i + 1] = data;
    }
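The sparse_async_batched_cumsum.cu hunk works because cub::CTA_SYNC() was a thin alias for __syncthreads() that the CUB shipped with CUDA 13 no longer exposes. A simplified standalone sketch of the same scan-then-barrier pattern (block_cumsum is a hypothetical single-block kernel without the diff's prefix_op loop):

```cuda
#include <cub/cub.cuh>

// Hypothetical single-block inclusive cumsum, simplified from the kernel in
// the diff. The barrier between the scan and the write-out is the point of
// interest: __syncthreads() is the direct stand-in for cub::CTA_SYNC().
template <int kThreads>
__global__ void block_cumsum(const int* in, int* out, int n) {
  using BlockScan = cub::BlockScan<int, kThreads>;
  __shared__ typename BlockScan::TempStorage temp_storage;

  int data = (threadIdx.x < n) ? in[threadIdx.x] : 0;
  BlockScan(temp_storage).InclusiveSum(data, data);

  // Barrier before threads reuse temp_storage or publish results; on
  // CUDA < 13 this line could equivalently be spelled cub::CTA_SYNC().
  __syncthreads();

  if (threadIdx.x < n) {
    out[threadIdx.x] = data;
  }
}
```

A launch such as block_cumsum<256><<<1, 256>>>(in, out, n) exercises the sketch for a single block of up to 256 elements.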