ROCm · dhernandez0 · Dec 2, 2025 · Dec 3, 2025 · Dec 4, 2025 · Dec 4, 2025
@@ -90,6 +90,26 @@ def RockAccelTuningParamAttrInterface : AttrInterface<"RockAccelTuningParamAttrI
         /*args=*/(ins),
         /*methodBody=*/"",
         /*defaultImplementation=*/""
+      >,
+    InterfaceMethod< // Accessor for the waves_per_eu tuning value; declared with empty body and default implementation.
+        /*desc=*/[{
+          Return waves_per_eu, this is a hint to the backend compiler to tune the number of wavefronts that are capable of fitting within the resources of an EU.
+        }],
+        /*retType=*/"int64_t",
+        /*methodName=*/"getWavesPerEU", // NOTE(review): 0 appears to mean "use the heuristic" (see AttnPerfConfigAttr::get and LowerRockOpsToGPUPass) - confirm
+        /*args=*/(ins),
+        /*methodBody=*/"",
+        /*defaultImplementation=*/""
+      >,
+    InterfaceMethod< // Accessor for the grid group size tuning value; declared with empty body and default implementation.
+        /*desc=*/[{
+          Group size for layout on the distribution of the workgroups.
+        }],
+        /*retType=*/"int64_t",
+        /*methodName=*/"getGridGroupSize", // NOTE(review): InitParamsAccel defaults gridGroupSize to 0; the meaning of 0 is not documented here - confirm
+        /*args=*/(ins),
+        /*methodBody=*/"",
+        /*defaultImplementation=*/""
       >
 
     // TODO: more methods here as needed

@@ -280,7 +280,7 @@ def Rock_AttnPerfConfig : Rock_Attr<"AttnPerfConfig", [RockTuningParamAttrInterf
       "int64_t":$nPerBlockG0, "int64_t":$kpackPerBlock, "int64_t":$mPerWave,
       "int64_t":$nPerWave, "int64_t":$mnPerXdl, "int64_t":$kpack,
       "int64_t":$splitKFactor, "int64_t":$scheduleVersion,
-      "int64_t":$outputSwizzle, "bool":$forceUnroll);
+      "int64_t":$outputSwizzle, "int64_t":$wavesPerEU, "bool":$forceUnroll);
 
   let extraClassDeclaration = [{
     void getPerfConfigStr(::llvm::SmallVectorImpl<char> &perfStr) {
@@ -296,13 +296,14 @@ def Rock_AttnPerfConfig : Rock_Attr<"AttnPerfConfig", [RockTuningParamAttrInterf
       + Twine(getSplitKFactor()) + ","
       + Twine(getScheduleVersion()) + ","
       + Twine(getOutputSwizzle()) + ","
+      + Twine(getWavesPerEU()) + ","
       + Twine(getForceUnroll())).toVector(perfStr);
     }
     AttnPerfConfigAttr withScheduleVersion(int64_t newScheduleVersion) const {
     return AttnPerfConfigAttr::get(
       getContext(), getMPerBlockG0(), getMPerBlockG1(), getNPerBlockG0(),
       getKpackPerBlock(), getMPerWave(), getNPerWave(), getMnPerXdl(), getKpack(),
-      getSplitKFactor(), newScheduleVersion, getOutputSwizzle(), getForceUnroll());
+      getSplitKFactor(), newScheduleVersion, getOutputSwizzle(), getWavesPerEU(), getForceUnroll());
   }
   }];
 
@@ -325,7 +326,7 @@ def Rock_MfmaGemmParamsAttr
       "int64_t":$nPerBlock, "int64_t":$kpack, "int64_t":$mPerWave,
       "int64_t":$nPerWave, "int64_t":$mnPerXdl, "int64_t":$splitKFactor,
       "int64_t":$scheduleVersion, "int64_t":$outputSwizzle,
-      "bool":$forceUnroll);
+      "int64_t":$wavesPerEU, "int64_t":$gridGroupSize, "bool":$forceUnroll);
 
   let extraClassDeclaration = [{
     void getPerfConfigStr(::llvm::SmallVectorImpl<char> &perfStr) {
@@ -339,6 +340,8 @@ def Rock_MfmaGemmParamsAttr
         + Twine(getSplitKFactor()) + ","
         + Twine(getScheduleVersion()) + ","
         + Twine(getOutputSwizzle()) + ","
+        + Twine(getWavesPerEU()) + ","
+        + Twine(getGridGroupSize()) + ","
         + Twine(getForceUnroll()) + ","
           + "1") /* *ThreadCopyMore* */
         .toVector(perfStr);
@@ -359,7 +362,7 @@ def Rock_WmmaGemmParamsAttr : Rock_Attr<"WmmaGemmParams", [RockTuningParamAttrIn
       "int64_t":$nPerBlock, "int64_t":$kpack, "int64_t":$mPerWave,
       "int64_t":$nPerWave, "int64_t":$mnPerXdl, "int64_t":$splitKFactor,
       "int64_t":$scheduleVersion, "int64_t":$outputSwizzle,
-      "bool":$forceUnroll);
+      "int64_t":$wavesPerEU, "int64_t":$gridGroupSize, "bool":$forceUnroll);
 
   let extraClassDeclaration = [{
     void getPerfConfigStr(SmallVectorImpl<char> &perfStr) {
@@ -373,6 +376,8 @@ def Rock_WmmaGemmParamsAttr : Rock_Attr<"WmmaGemmParams", [RockTuningParamAttrIn
         + Twine(getSplitKFactor()) + ","
         + Twine(getScheduleVersion()) + ","
         + Twine(getOutputSwizzle()) + ","
+        + Twine(getWavesPerEU()) + ","
+        + Twine(getGridGroupSize()) + ","
         + Twine(getForceUnroll()) + ","
           + "1") /* *ThreadCopyMore* */
         .toVector(perfStr);
@@ -459,9 +464,25 @@ def Rock_ScheduleVersionAttr : Rock_Attr<"ScheduleVersion"> {
   }];
 }
 
-// It is a temporary attribute
 def Rock_EnableSplitKForTuning : Rock_Attr<"EnableSplitKForTuning"> {
   let mnemonic = "enable_splitk_for_tuning";
+  let description = [{
+    Whether we tune for split-k. If unset, split-k=1.
+  }];
+}
+
+def Rock_WavesPerEU : Rock_Attr<"WavesPerEU"> { // no parameters are declared on this attribute
+  let mnemonic = "waves_per_eu"; // LowerRockOpsToGPUPass reads an IntegerAttr stored under this name; 0 there falls back to the heuristic
+  let description = [{
+    This is a hint to the backend compiler to tune the number of wavefronts that are capable of fitting within the resources of an EU.
+  }];
+}
+
+def Rock_OutputSwizzle : Rock_Attr<"OutputSwizzle"> { // no parameters are declared on this attribute
+  let mnemonic = "output_swizzle"; // NOTE(review): InitParamsAccel() and version-1 attention perf configs default outputSwizzle to 2; presumably the same 0/1/2 encoding - confirm
+  let description = [{
+    Whether we run the output swizzle pass. 0 -> disabled, 1 -> enabled, 2 -> heuristic.
+  }];
 }
 
 def Rock_PrefillAttr : Rock_Attr<"Prefill"> {

@@ -142,6 +142,7 @@ struct InitParamsNonAccel : InitParams, Serializable<InitParamsNonAccel> {
 };
 
 struct InitParamsAccel : InitParams, Serializable<InitParamsAccel> {
+  // TODO: remove once we generate new quick tuning list
   constexpr InitParamsAccel(int64_t mPerBlock, int64_t nPerBlock,
                             int64_t kPerBlock, int64_t mPerWave,
                             int64_t nPerWave, int64_t mnPerXdl, int64_t kPack,
@@ -152,12 +153,28 @@ struct InitParamsAccel : InitParams, Serializable<InitParamsAccel> {
         gemmNPerWave(nPerWave), gemmMnPerXdl(mnPerXdl),
         gemmNPerWaveOrMnPerXdl(0), gemmKPack(kPack), splitKFactor(splitKFactor),
         gemmScheduleVersion(scheduleVersion), outputSwizzle(outputSwizzle),
+        wavesPerEU(0), gridGroupSize(0),
+        gemmAThreadCopyMoreGemmK(aThreadCopyMoreGemmK),
+        gemmBThreadCopyMoreGemmKPack(bThreadCopyMoreGemmKPack) {}
+
+  constexpr InitParamsAccel(int64_t mPerBlock, int64_t nPerBlock, // overload that takes wavesPerEU and gridGroupSize explicitly
+                            int64_t kPerBlock, int64_t mPerWave,
+                            int64_t nPerWave, int64_t mnPerXdl, int64_t kPack,
+                            int64_t splitKFactor, int64_t scheduleVersion,
+                            int64_t outputSwizzle, int64_t wavesPerEU,
+                            int64_t gridGroupSize, bool aThreadCopyMoreGemmK,
+                            bool bThreadCopyMoreGemmKPack)
+      : InitParams{mPerBlock, nPerBlock, kPerBlock}, gemmMPerWave(mPerWave),
+        gemmNPerWave(nPerWave), gemmMnPerXdl(mnPerXdl),
+        gemmNPerWaveOrMnPerXdl(0), gemmKPack(kPack), splitKFactor(splitKFactor), // gemmNPerWaveOrMnPerXdl is always initialized to 0 here
+        gemmScheduleVersion(scheduleVersion), outputSwizzle(outputSwizzle),
+        wavesPerEU(wavesPerEU), gridGroupSize(gridGroupSize), // the older overload without these two parameters hard-codes both to 0
         gemmAThreadCopyMoreGemmK(aThreadCopyMoreGemmK),
         gemmBThreadCopyMoreGemmKPack(bThreadCopyMoreGemmKPack) {}
 
   constexpr InitParamsAccel()
-      : InitParamsAccel(0LL, 0LL, 0LL, 0LL, 0LL, 0LL, 0LL, 1LL, 1LL, 2LL, false,
-                        false) {}
+      : InitParamsAccel(0LL, 0LL, 0LL, 0LL, 0LL, 0LL, 0LL, 1LL, 1LL, 2LL, 0LL,
+                        0LL, false, false) {}
 
   InitParamsAccel(MfmaGemmParamsAttr attr)
       : InitParams{attr.getMPerBlock(), attr.getNPerBlock(),
@@ -167,6 +184,8 @@ struct InitParamsAccel : InitParams, Serializable<InitParamsAccel> {
         gemmKPack(attr.getKpack()), splitKFactor(attr.getSplitKFactor()),
         gemmScheduleVersion(attr.getScheduleVersion()),
         outputSwizzle(attr.getOutputSwizzle()),
+        wavesPerEU(attr.getWavesPerEU()),
+        gridGroupSize(attr.getGridGroupSize()),
         gemmAThreadCopyMoreGemmK(attr.getForceUnroll()),
         gemmBThreadCopyMoreGemmKPack(false) {};
 
@@ -178,6 +197,8 @@ struct InitParamsAccel : InitParams, Serializable<InitParamsAccel> {
         gemmKPack(attr.getKpack()), splitKFactor(attr.getSplitKFactor()),
         gemmScheduleVersion(attr.getScheduleVersion()),
         outputSwizzle(attr.getOutputSwizzle()),
+        wavesPerEU(attr.getWavesPerEU()),
+        gridGroupSize(attr.getGridGroupSize()),
         gemmAThreadCopyMoreGemmK(attr.getForceUnroll()),
         gemmBThreadCopyMoreGemmKPack(false) {};
 
@@ -191,6 +212,8 @@ struct InitParamsAccel : InitParams, Serializable<InitParamsAccel> {
   int64_t splitKFactor;
   int64_t gemmScheduleVersion;
   int64_t outputSwizzle;
+  int64_t wavesPerEU;
+  int64_t gridGroupSize;
   bool gemmAThreadCopyMoreGemmK;
   bool gemmBThreadCopyMoreGemmKPack;
 
@@ -214,6 +237,10 @@ struct InitParamsAccel : InitParams, Serializable<InitParamsAccel> {
       f(self.gemmScheduleVersion);
       f(self.outputSwizzle);
     }
+    if (self.version >= Version::V4) {
+      f(self.wavesPerEU);
+      f(self.gridGroupSize);
+    }
     f(self.gemmAThreadCopyMoreGemmK);
     f(self.gemmBThreadCopyMoreGemmKPack);
   }

@@ -71,7 +71,7 @@ struct Serializable {
   }
 
   bool checkVersionFormat(const std::string &s) {
-    const int32_t maxNumTokensArray[] = {0, 8, 9, 11, 12};
+    const int32_t maxNumTokensArray[] = {0, 8, 9, 11, 14};
     const int32_t versionIdx = static_cast<int32_t>(version);
     if (versionIdx < 1 || versionIdx >= static_cast<int32_t>(Version::Count)) {
       llvm_unreachable("Unknown version of the perfConfig");

@@ -136,6 +136,63 @@ struct WorkgroupIdRewritePattern
 };
 } // namespace
 
+/// Derives a waves-per-EU hint from the launch configuration and, when the
+/// result exceeds one, stores it on `gpuFunc` as "rocdl.waves_per_eu".
+///
+/// Leaves `gpuFunc` untouched when the "block_size" or "grid_size" integer
+/// attributes are missing or when the architecture cannot be resolved.
+/// A positive `ldsUsage` further limits the hint.
+static void runWavesPerEUHeuristic(OpBuilder b, gpu::GPUFuncOp gpuFunc,
+                                   int64_t ldsUsage) {
+  LLVM_DEBUG(llvm::dbgs() << "Using heuristic to set wavesPerEU...\n");
+
+  auto blockSizeAttr = gpuFunc->getAttrOfType<IntegerAttr>("block_size");
+  if (!blockSizeAttr) {
+    LLVM_DEBUG(llvm::dbgs() << "blockSize not found in gpuFunc.\n");
+    return;
+  }
+  auto gridSizeAttr = gpuFunc->getAttrOfType<IntegerAttr>("grid_size");
+  if (!gridSizeAttr) {
+    LLVM_DEBUG(llvm::dbgs() << "gridSize not found in gpuFunc.\n");
+    return;
+  }
+  FailureOr<StringAttr> archOrFailure = rock::getArch(gpuFunc);
+  if (failed(archOrFailure)) {
+    LLVM_DEBUG(llvm::dbgs() << "arch not found.\n");
+    return;
+  }
+  const int64_t threadsPerBlock = blockSizeAttr.getInt();
+  const int64_t numBlocks = gridSizeAttr.getInt();
+  StringAttr archName = archOrFailure.value();
+  rock::AmdArchInfo info = rock::lookupArchInfo(archName);
+  // Fall back to the architecture's minimum CU count when none is recorded.
+  FailureOr<int64_t> cuOrFailure = rock::getNumCU(gpuFunc);
+  const int64_t cuCount = cuOrFailure.value_or(info.minNumCU);
+  const int64_t euCount = info.numEUPerCU * cuCount;
+  const int64_t blockWaves = threadsPerBlock / info.waveSize;
+  const int64_t gridWaves = blockWaves * numBlocks;
+  // Two candidates: per-block waves divided by the EUs of one CU, and all
+  // waves of the grid divided (rounding up) by all EUs; the larger one wins.
+  const int64_t perBlockShare = blockWaves / info.numEUPerCU;
+  const int64_t perGridShare = (gridWaves + euCount - 1) / euCount;
+  int64_t wavesPerEU = std::max(perBlockShare, perGridShare);
+  LLVM_DEBUG(llvm::dbgs() << "wavesPerEU:" << wavesPerEU << "\n"
+                          << "  blockSize:" << threadsPerBlock << "\n"
+                          << "  waveSize:" << info.waveSize << "\n"
+                          << "  gridSize:" << numBlocks << "\n"
+                          << "  numCU:" << cuCount << "\n"
+                          << "  numEUPerCU:" << info.numEUPerCU << "\n"
+                          << "maxSharedMemPerWG:" << info.maxSharedMemPerWG
+                          << "\n"
+                          << "ldsUsage:" << ldsUsage << "\n");
+  // Restrict the hint to how many times ldsUsage fits into totalSharedMemPerCU.
+  if (ldsUsage > 0)
+    wavesPerEU = std::min(wavesPerEU, info.totalSharedMemPerCU / ldsUsage);
+  // For now the hint is capped at two; relaxing the cap is left for a future
+  // ticket, pending further analysis.
+  constexpr int64_t wavesPerEUUpperBound = 2;
+  wavesPerEU = std::min(wavesPerEU, wavesPerEUUpperBound);
+
+  if (wavesPerEU <= 1) {
+    LLVM_DEBUG(llvm::dbgs() << "waves_per_eu not set"
+                            << "\n");
+    return;
+  }
+  LLVM_DEBUG(llvm::dbgs() << "waves_per_eu:" << wavesPerEU << "\n");
+  gpuFunc->setAttr("rocdl.waves_per_eu", b.getI32IntegerAttr(wavesPerEU));
+}
+
 void LowerRockOpsToGPUPass::runOnOperation() {
   ModuleOp op = getOperation();
   MLIRContext *ctx = op.getContext();
@@ -204,6 +261,11 @@ void LowerRockOpsToGPUPass::runOnOperation() {
     gridSize = cast<IntegerAttr>(gridSizeAttr).getInt();
     gpuFunc.setKnownGridSizeAttr(b.getDenseI32ArrayAttr({gridSize, 1, 1}));
 
+    auto wavesPerEUAttr = theFunc->getAttr(rock::WavesPerEUAttr::getMnemonic());
+    if (wavesPerEUAttr) {
+      gpuFunc->setAttr(rock::WavesPerEUAttr::getMnemonic(), wavesPerEUAttr);
+    }
+
     FailureOr<StringAttr> maybeArch = rock::getArch(theFunc);
     if (succeeded(maybeArch)) {
       gpuFunc->setAttr("arch", maybeArch.value());
@@ -391,61 +453,23 @@ void LowerRockOpsToGPUPass::runOnOperation() {
       gpuFunc->setAttr("rock.shared_buffer_size",
                        b.getI32IntegerAttr(ldsUsage));
     }
-    LLVM_DEBUG(llvm::dbgs() << "Attempting to set wavesPerEU...\n");
-    if (!gpuFunc->hasAttrOfType<IntegerAttr>("block_size")) {
-      LLVM_DEBUG(llvm::dbgs() << "blockSize not found in gpuFunc.\n");
-      return;
-    }
-    int64_t blockSize =
-        gpuFunc->getAttrOfType<IntegerAttr>("block_size").getInt();
-    if (!gpuFunc->hasAttrOfType<IntegerAttr>("grid_size")) {
-      LLVM_DEBUG(llvm::dbgs() << "gridSize not found in gpuFunc.\n");
-      return;
-    }
-    int64_t gridSize =
-        gpuFunc->getAttrOfType<IntegerAttr>("grid_size").getInt();
-    FailureOr<StringAttr> maybeArch = rock::getArch(gpuFunc);
-    if (succeeded(maybeArch)) {
-      StringAttr arch = maybeArch.value();
-      rock::AmdArchInfo archInfo = rock::lookupArchInfo(arch);
-      FailureOr<int64_t> maybeNumCU = rock::getNumCU(gpuFunc);
-      int64_t numCU = maybeNumCU.value_or(archInfo.minNumCU);
-      int64_t totalEUs = archInfo.numEUPerCU * numCU;
-      int64_t wavesPerBlock = (blockSize / archInfo.waveSize);
-      int64_t totalWaves = wavesPerBlock * gridSize;
-      int64_t wavesPerEUPerBlock = wavesPerBlock / archInfo.numEUPerCU;
-      int64_t wavesPerEUPerGrid = (totalWaves + totalEUs - 1) / totalEUs;
-      int64_t wavesPerEU = std::max(wavesPerEUPerBlock, wavesPerEUPerGrid);
-      LLVM_DEBUG(llvm::dbgs() << "wavesPerEU:" << wavesPerEU << "\n");
-      LLVM_DEBUG(llvm::dbgs() << "  blockSize:" << blockSize << "\n");
-      LLVM_DEBUG(llvm::dbgs() << "  waveSize:" << archInfo.waveSize << "\n");
-      LLVM_DEBUG(llvm::dbgs() << "  gridSize:" << gridSize << "\n");
-      LLVM_DEBUG(llvm::dbgs() << "  numCU:" << numCU << "\n");
-      LLVM_DEBUG(llvm::dbgs()
-                 << "  numEUPerCU:" << archInfo.numEUPerCU << "\n");
-      LLVM_DEBUG(llvm::dbgs()
-                 << "maxSharedMemPerWG:" << archInfo.maxSharedMemPerWG << "\n");
-      LLVM_DEBUG(llvm::dbgs() << "ldsUsage:" << ldsUsage << "\n");
-      // limit wavesPerEU based on lds usage
-      if (ldsUsage > 0) {
-        wavesPerEU =
-            std::min(wavesPerEU, archInfo.totalSharedMemPerCU / ldsUsage);
-      }
-      // Currently limiting wavesPerEU to be two
-      // it is a future to ticket to remove this constraint with further
-      // analysis
-      constexpr int64_t wavesPerEUUpperBound = 2;
-      wavesPerEU = std::min(wavesPerEU, wavesPerEUUpperBound);
-      if (wavesPerEU > 1) {
-        LLVM_DEBUG(llvm::dbgs() << "waves_per_eu:" << wavesPerEU << "\n");
+    // if waves_per_eu is set, use it
+    if (gpuFunc->hasAttrOfType<IntegerAttr>(
+            rock::WavesPerEUAttr::getMnemonic())) {
+      int64_t wavesPerEU =
+          gpuFunc
+              ->getAttrOfType<IntegerAttr>(rock::WavesPerEUAttr::getMnemonic())
+              .getInt();
+      // zero means: use the heuristic
+      if (wavesPerEU != 0) {
         gpuFunc->setAttr("rocdl.waves_per_eu", b.getI32IntegerAttr(wavesPerEU));
-      } else {
-        LLVM_DEBUG(llvm::dbgs() << "waves_per_eu not set"
-                                << "\n");
+        LLVM_DEBUG(llvm::dbgs() << "Setting waves_per_eu using tuning param\n");
+        // we are done
+        return;
       }
-    } else {
-      LLVM_DEBUG(llvm::dbgs() << "arch not found.\n");
     }
+    // no "waves_per_eu" attribute, or it is zero: use heuristic
+    runWavesPerEUHeuristic(b, gpuFunc, ldsUsage);
   });
 
   if (gpuModCount == 0) {

@@ -3254,17 +3254,19 @@ AttnPerfConfigAttr AttnPerfConfigAttr::get(StringAttr perfConfigStrAttr,
     expectedNumTokens = 11;
     break;
   case 3:
-    expectedNumTokens = 12;
+    expectedNumTokens = 13;
     break;
   default:
     llvm_unreachable("Unknown version of the perfConfig");
   }
-  SmallVector<StringRef, 11> tokens;
+  SmallVector<StringRef> tokens;
+  SmallVector<int64_t> params;
+  tokens.reserve(expectedNumTokens);
+  params.reserve(expectedNumTokens);
   rest.split(tokens, ',');
   if (tokens.size() != expectedNumTokens) {
     return {};
   }
-  SmallVector<int64_t, 11> params;
   llvm::transform(tokens, std::back_inserter(params), [](StringRef s) {
     int param;
     llvm::to_integer(s, param);
@@ -3298,11 +3300,12 @@ AttnPerfConfigAttr AttnPerfConfigAttr::get(StringAttr perfConfigStrAttr,
   int64_t splitKFactor = version > 1 ? params[lastIdx++] : 1;
   int64_t scheduleVersion = version > 1 ? params[lastIdx++] : 1;
   int64_t outputSwizzle = version > 1 ? params[lastIdx++] : 2;
+  int64_t wavesPerEU = isV3 ? params[lastIdx++] : 0; // 0 -> use heuristic
   int64_t forceUnroll = params[expectedNumTokens - 1] == 1;
   return AttnPerfConfigAttr::get(
       perfConfigStrAttr.getContext(), mPerBlockG0, mPerBlockG1, nPerBlockG0,
       kpackPerBlock, mPerWave, nPerWave, mnPerXdl, kpack, splitKFactor,
-      scheduleVersion, outputSwizzle, forceUnroll);
+      scheduleVersion, outputSwizzle, wavesPerEU, forceUnroll);
 }
 
 //===-----------------------------------------------------===//

@@ -309,7 +309,7 @@ void AffixTuningParameters::affixTuningParametersImpl(
   Attribute params0 = op.getGemm0Params().value_or(nullptr);
   // set a default one if params is not provided
   StringAttr perfConfigStrAttr =
-      builder.getStringAttr("attn:v3:32,32,32,32,32,32,16,1,1,1,2,1");
+      builder.getStringAttr("attn:v3:32,32,32,32,32,32,16,1,1,1,2,0,1");
   if (!params0) {
     if (StringAttr mayBePerfConfigStrAttr =
             dyn_cast_or_null<StringAttr>(op->getAttr("perf_config"))) {