diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash
index 96305c056a..aa2dd81517 100644
--- a/.github/scripts/fbgemm_gpu_build.bash
+++ b/.github/scripts/fbgemm_gpu_build.bash
@@ -304,8 +304,10 @@ __configure_fbgemm_gpu_build_cuda () {
       local arch_list="7.5;8.0"
     fi
 
-  elif [[ $cuda_version_nvcc == *"V13.0"* ]] ||
-       [[ $cuda_version_nvcc == *"V12.9"* ]] ||
+  elif [[ $cuda_version_nvcc == *"V13.0"* ]]; then
+    local arch_list="8.0;9.0a;10.0a;12.0a"
+
+  elif [[ $cuda_version_nvcc == *"V12.9"* ]] ||
        [[ $cuda_version_nvcc == *"V12.8"* ]]; then
     local arch_list="7.5;8.0;9.0a;10.0a;12.0a"
 
diff --git a/.github/scripts/generate_ci_matrix.py b/.github/scripts/generate_ci_matrix.py
index b842433c8f..0d6cc47101 100644
--- a/.github/scripts/generate_ci_matrix.py
+++ b/.github/scripts/generate_ci_matrix.py
@@ -307,7 +307,7 @@ def cuda_versions(self) -> List[str]:
             return ["12.6.3", "12.8.1", "13.0.2"]
         else:
             # GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138
-            return ["12.6.3", "12.8.1"]
+            return ["12.6.3", "12.8.1", "13.0.2"]
 
     def rocm_versions(self) -> List[str]:
         if GitRepo.ref() == REFS_MAIN and GitRepo.event_name() == EVENT_NAME_PUSH:
diff --git a/cmake/modules/GpuCppLibrary.cmake b/cmake/modules/GpuCppLibrary.cmake
index 51c30df750..3985a633d3 100644
--- a/cmake/modules/GpuCppLibrary.cmake
+++ b/cmake/modules/GpuCppLibrary.cmake
@@ -87,15 +87,32 @@ function(prepare_target_sources)
         list(APPEND ${args_PREFIX}_sources_cu ${args_CUDA_SPECIFIC_SRCS})
     endif()
 
-    # Set source properties
-    set_source_files_properties(${${args_PREFIX}_sources_cu}
-        PROPERTIES COMPILE_OPTIONS
-        "${args_NVCC_FLAGS}")
-
+    # Set include directories
     set_source_files_properties(${${args_PREFIX}_sources_cu}
         PROPERTIES INCLUDE_DIRECTORIES
         "${args_INCLUDE_DIRS}")
 
+    # Starting with CUDA 13.0, nvcc changed the default visibility of
+    # __global__ functions to `hidden`, which causes symbol lookup errors
+    # during linking.  This can be worked around by setting -cudart=shared
+    # and --device-entity-has-hidden-visibility=false.
+    #
+    # https://developer.nvidia.com/blog/cuda-c-compiler-updates-impacting-elf-visibility-and-linkage/
+    if(    (FBGEMM_BUILD_VARIANT STREQUAL BUILD_VARIANT_CUDA) AND
+           (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") )
+        set(_nvcc_flags ${args_NVCC_FLAGS}
+            -cudart=shared
+            -static-global-template-stub=false
+            --device-entity-has-hidden-visibility=false)
+    else()
+        set(_nvcc_flags ${args_NVCC_FLAGS})
+    endif()
+
+    # Set compilation flags
+    set_source_files_properties(${${args_PREFIX}_sources_cu}
+        PROPERTIES COMPILE_OPTIONS
+        "${_nvcc_flags}")
+
     # Append to the full sources list
     list(APPEND ${args_PREFIX}_sources_combined ${${args_PREFIX}_sources_cu})
     endif()
diff --git a/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu b/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu
index 6574bda45e..b90ad76b34 100644
--- a/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu
@@ -57,7 +57,7 @@ template <
     typename index_t,
     size_t kThreadGroupSize >
-__launch_bounds__(kForwardMaxThreads) __global__ void
+__launch_bounds__(kForwardMaxThreads) __global__ __attribute__((visibility("default"))) void
 {%- if is_index_select %}
 batch_index_select_dim0_codegen_forward_small_kernel(
 {%- else %}
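Note on the two visibility hunks above: under nvcc 13.0, `__global__` entities default to hidden ELF visibility, so a kernel that must be resolvable across shared-object boundaries needs an explicit export. A minimal standalone sketch of the annotation pattern the template change applies (`scale_kernel` is a hypothetical kernel, not FBGEMM code):

```cuda
// Hypothetical kernel illustrating the annotation added in the template.
// With nvcc 13.0, __global__ functions default to hidden ELF visibility;
// marking the kernel with default visibility keeps its symbol exported so
// that launches resolved from other shared objects still find it.
__global__ __attribute__((visibility("default"))) void scale_kernel(
    float* data,
    float alpha,
    int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    data[idx] *= alpha;
  }
}
```
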
diff --git a/fbgemm_gpu/src/memory_utils/memory_utils.cu b/fbgemm_gpu/src/memory_utils/memory_utils.cu
index e7cba2bffe..98647f5a68 100644
--- a/fbgemm_gpu/src/memory_utils/memory_utils.cu
+++ b/fbgemm_gpu/src/memory_utils/memory_utils.cu
@@ -144,6 +144,13 @@ std::tuple<void*, size_t> adjust_to_page_boundaries(void* ptr, size_t size) {
   return std::make_tuple((void*)raw_ptr_adjusted, (size_t)size_adjusted);
 }
 
+cudaMemLocation new_mem_location_from_device(const int device_id) {
+  cudaMemLocation deviceLoc;
+  deviceLoc.type = cudaMemLocationTypeDevice;
+  deviceLoc.id = device_id;
+  return deviceLoc;
+}
+
 } // namespace
 
 Tensor new_managed_tensor(
@@ -158,11 +165,25 @@ Tensor new_managed_tensor(
 
   // Set preferred memory location to host memory
   AT_CUDA_CHECK(cudaMemAdvise(
-      ptr, size_bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
+      ptr, size_bytes, cudaMemAdviseSetPreferredLocation,
+#if CUDA_VERSION >= 13000
+      new_mem_location_from_device(cudaCpuDeviceId)
+#else
+      cudaCpuDeviceId
+#endif
+      ));
+
   // User hints with "accessed by": GPU will establish direct mapping of data
   // in CPU memory, no page faults will be generated
   AT_CUDA_CHECK(cudaMemAdvise(
-      ptr, size_bytes, cudaMemAdviseSetAccessedBy, at::cuda::current_device()));
+      ptr, size_bytes, cudaMemAdviseSetAccessedBy,
+#if CUDA_VERSION >= 13000
+      new_mem_location_from_device(at::cuda::current_device())
+#else
+      at::cuda::current_device()
+#endif
+      ));
+
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 
   // Work around fork issue - see uvm_mem_advice_dont_fork for details
@@ -353,7 +374,12 @@ void uvm_cuda_mem_advise(const Tensor& t, int64_t cuda_memory_advise) {
       ptr,
       size_bytes,
       static_cast<enum cudaMemoryAdvise>(cuda_memory_advise),
-      hint_device));
+#if CUDA_VERSION >= 13000
+      new_mem_location_from_device(hint_device)
+#else
+      hint_device
+#endif
+      ));
 
   return;
 }
@@ -379,7 +405,13 @@ void uvm_cuda_mem_prefetch_async(
 
   auto stream = at::cuda::getCurrentCUDAStream();
 
-  AT_CUDA_CHECK(cudaMemPrefetchAsync(ptr, size_bytes, prefetch_device, stream));
+  AT_CUDA_CHECK(cudaMemPrefetchAsync(ptr, size_bytes,
+#if CUDA_VERSION >= 13000
+      new_mem_location_from_device(prefetch_device), 0,
+#else
+      prefetch_device,
+#endif
+      stream));
 
   return;
 }
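A minimal sketch of the version-guarded call pattern the memory_utils.cu hunks introduce, assuming the CUDA 13 runtime signatures that take a cudaMemLocation instead of a bare device id. The helper mirrors new_mem_location_from_device from the diff; advise_preferred_location is an illustrative wrapper, not FBGEMM API:

```cuda
#include <cuda.h>          // defines CUDA_VERSION
#include <cuda_runtime.h>

// Mirrors the helper added in the diff: wrap a bare device id in the
// cudaMemLocation struct that the CUDA 13 advise/prefetch APIs expect.
cudaMemLocation new_mem_location_from_device(const int device_id) {
  cudaMemLocation deviceLoc;
  deviceLoc.type = cudaMemLocationTypeDevice;
  deviceLoc.id = device_id;
  return deviceLoc;
}

// Illustrative wrapper (hypothetical name): set the preferred location of
// a managed allocation under either toolkit generation.
cudaError_t advise_preferred_location(void* ptr, size_t size_bytes, int device) {
#if CUDA_VERSION >= 13000
  // CUDA 13: the advice target is a cudaMemLocation.
  return cudaMemAdvise(
      ptr, size_bytes, cudaMemAdviseSetPreferredLocation,
      new_mem_location_from_device(device));
#else
  // CUDA 12 and earlier: the advice target is a bare device id.
  return cudaMemAdvise(
      ptr, size_bytes, cudaMemAdviseSetPreferredLocation, device);
#endif
}
```

Note that the diff routes cudaCpuDeviceId through the same device-typed location rather than switching to cudaMemLocationTypeHost, which preserves the old integer-id behavior of the pre-13 API.
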
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cu b/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cu
index ac9b81cbcb..a281237cf1 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cu
+++ b/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cu
@@ -72,7 +72,11 @@ __global__ __launch_bounds__(kMaxThreads) void _batched_complete_cumsum_kernel(
       data = (val_t)values[blockIdx.x][i];
     }
     BlockScan(temp_storage).InclusiveSum(data, data, prefix_op);
-    cub::CTA_SYNC();
+#if CUDA_VERSION >= 13000
+    __syncthreads();
+#else
+    cub::CTA_SYNC();
+#endif
    if (i < len) {
      out[blockIdx.x][i + 1] = data;
    }
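The sparse_async_batched_cumsum.cu hunk works because cub::CTA_SYNC() was a thin alias for __syncthreads() that the CUB shipped with CUDA 13 no longer exposes. A simplified standalone sketch of the same scan-then-barrier pattern (block_cumsum is a hypothetical single-block kernel without the diff's prefix_op loop):

```cuda
#include <cub/cub.cuh>

// Hypothetical single-block inclusive cumsum, simplified from the kernel in
// the diff. The barrier between the scan and the write-out is the point of
// interest: __syncthreads() is the direct stand-in for cub::CTA_SYNC().
template <int kThreads>
__global__ void block_cumsum(const int* in, int* out, int n) {
  using BlockScan = cub::BlockScan<int, kThreads>;
  __shared__ typename BlockScan::TempStorage temp_storage;

  int data = (threadIdx.x < n) ? in[threadIdx.x] : 0;
  BlockScan(temp_storage).InclusiveSum(data, data);

  // Barrier before threads reuse temp_storage or publish results; on
  // CUDA < 13 this line could equivalently be spelled cub::CTA_SYNC().
  __syncthreads();

  if (threadIdx.x < n) {
    out[threadIdx.x] = data;
  }
}
```

A launch such as block_cumsum<256><<<1, 256>>>(in, out, n) exercises the sketch for a single block of up to 256 elements.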