@@ -294,15 +294,14 @@ static Value applyMask(OpBuilder &builder, Location loc, AffineMap qkMap,
294294 return genericOp.getResult (0 );
295295}
296296
297- // Compute output = exp2(output - input)
298- static Value computeSubAndExp2 (OpBuilder &builder, Location loc,
299- AffineMap inputMap, AffineMap outputMap,
300- Value input, Value output) {
297+ // Compute output = exp2(output - input) when useExp2 is true, otherwise exp(output - input).
298+ static Value computeSubAndExp (OpBuilder &builder, Location loc,
299+ AffineMap inputMap, AffineMap outputMap,
300+ Value input, Value output, bool useExp2 ) {
301301 SmallVector<AffineMap> compressedMaps =
302302 compressUnusedDims (SmallVector<AffineMap>{inputMap, outputMap});
303303 inputMap = compressedMaps[0 ];
304304 outputMap = compressedMaps[1 ];
305-
306305 SmallVector<utils::IteratorType> iteratorTypes (inputMap.getNumDims (),
307306 utils::IteratorType::parallel);
308307 auto genericOp = linalg::GenericOp::create (
@@ -313,8 +312,9 @@ static Value computeSubAndExp2(OpBuilder &builder, Location loc,
313312 Value in = convertScalarToDtype (b, loc, args[0 ], args[1 ].getType (),
314313 /* isUnsignedCast=*/ false );
315314 Value diff = arith::SubFOp::create (b, loc, args[1 ], in);
316- Value weight = math::Exp2Op::create (b, loc, diff);
317- linalg::YieldOp::create (b, loc, weight);
315+ Operation *weight = useExp2 ? math::Exp2Op::create (b, loc, diff)
316+ : math::ExpOp::create (b, loc, diff);
317+ linalg::YieldOp::create (b, loc, weight->getResult (0 ));
318318 });
319319 return genericOp.getResult (0 );
320320}
@@ -350,15 +350,18 @@ Value computeQKAndElementwise(Location loc, OpBuilder &b, Value query,
350350 std::optional<AffineMap> maskMap,
351351 SmallVector<OpFoldResult> iterationDomain,
352352 Type sElementType , Region &elementwiseRegion,
353- DictionaryAttr qkAttrs, bool lowPrecision) {
353+ DictionaryAttr qkAttrs, bool lowPrecision,
354+ bool useExp2) {
354355 MLIRContext *ctx = b.getContext ();
355- // Since we use exp2 for attention instead of the original exp, we have to
356+ // If using exp2 for attention instead of the original exp, we have to
356357 // multiply the scale by log2(e). We use exp2 instead of exp as most platforms
357358 // have better support for exp2 (we verified that we gain some speedup on
358359 // some GPUs).
359- Value log2e = arith::ConstantOp::create (
360- b, loc, b.getFloatAttr (scale.getType (), M_LOG2E));
361- scale = arith::MulFOp::create (b, loc, scale, log2e);
360+ if (useExp2) {
361+ Value log2e = arith::ConstantOp::create (
362+ b, loc, b.getFloatAttr (scale.getType (), M_LOG2E));
363+ scale = arith::MulFOp::create (b, loc, scale, log2e);
364+ }
362365
363366 auto qETy = getElementTypeOrSelf (query.getType ());
364367
@@ -445,9 +448,12 @@ FailureOr<SmallVector<Value>> AttentionOp::decomposeOperation(OpBuilder &b) {
445448 DictionaryAttr config = getDecompositionConfigAttr ();
446449
447450 DictionaryAttr qkAttrs, pvAttrs;
451+ bool useExp2 = true ; // Default to exp2 for backward compatibility
448452 if (config) {
449453 qkAttrs = config.getAs <DictionaryAttr>(getQKAttrStr ());
450454 pvAttrs = config.getAs <DictionaryAttr>(getPVAttrStr ());
455+ if (auto useExp2Attr = config.getAs <BoolAttr>(getUseExp2AttrStr ()))
456+ useExp2 = useExp2Attr.getValue ();
451457 }
452458 Value output = getOutput ();
453459
@@ -470,9 +476,9 @@ FailureOr<SmallVector<Value>> AttentionOp::decomposeOperation(OpBuilder &b) {
470476 Type f32Type = b.getF32Type ();
471477
472478 // ---- QK Matmul + elementwise math ----
473- Value s = computeQKAndElementwise (loc, b, query, key, getScale (), mask, qMap,
474- kMap , sMap , getMaskMap (), sizes, f32Type ,
475- getRegion (), qkAttrs, lowPrecision);
479+ Value s = computeQKAndElementwise (
480+ loc, b, query, key, getScale (), mask, qMap, kMap , sMap , getMaskMap (),
481+ sizes, f32Type, getRegion (), qkAttrs, lowPrecision, useExp2 );
476482
477483 // ---- Softmax ----
478484
@@ -512,9 +518,9 @@ FailureOr<SmallVector<Value>> AttentionOp::decomposeOperation(OpBuilder &b) {
512518 // max = rowMax(S)
513519 Value max = reduce<arith::MaximumFOp>(b, loc, sMap , maxMap, s, maxFill);
514520
515- // P = exp2(S - max)
521+ // P = exp2(S - max) or exp(S - max) depending on useExp2 flag
516522 AffineMap pMap = sMap ;
517- Value p = computeSubAndExp2 (b, loc, maxMap, sMap , max, s);
523+ Value p = computeSubAndExp (b, loc, maxMap, sMap , max, s, useExp2 );
518524
519525 // sum = rowSum(P)
520526 Value sum = reduce<arith::AddFOp>(b, loc, pMap, sumMap, p, sumFill);
@@ -564,9 +570,12 @@ OnlineAttentionOp::decomposeOperation(OpBuilder &b) {
564570 DictionaryAttr config = getDecompositionConfigAttr ();
565571
566572 DictionaryAttr qkAttrs, pvAttrs;
573+ bool useExp2 = true ; // Default to exp2 for backward compatibility
567574 if (config) {
568575 qkAttrs = config.getAs <DictionaryAttr>(getQKAttrStr ());
569576 pvAttrs = config.getAs <DictionaryAttr>(getPVAttrStr ());
577+ if (auto useExp2Attr = config.getAs <BoolAttr>(getUseExp2AttrStr ()))
578+ useExp2 = useExp2Attr.getValue ();
570579 }
571580
572581 FailureOr<AttentionOpDetail> maybeOpInfo = AttentionOpDetail::get (
@@ -587,7 +596,7 @@ OnlineAttentionOp::decomposeOperation(OpBuilder &b) {
587596 // ---- QK Matmul + elementwise math ----
588597 Value s = computeQKAndElementwise (
589598 loc, b, query, key, getScale (), mask, qMap, kMap , sMap , getMaskMap (),
590- sizes, elementType, getRegion (), qkAttrs, lowPrecision);
599+ sizes, elementType, getRegion (), qkAttrs, lowPrecision, useExp2 );
591600
592601 // TODO: This decomposition should be in a separate op called
593602 // "online softmax".
@@ -597,20 +606,21 @@ OnlineAttentionOp::decomposeOperation(OpBuilder &b) {
597606 AffineMap maxMap = getMaxMap ();
598607 Value newMax = reduce<arith::MaximumFOp>(b, loc, sMap , maxMap, s, oldMax);
599608
600- // norm = exp2(oldMax - newMax)
609+ // norm = exp2(oldMax - newMax) or exp(oldMax - newMax) depending on useExp2
601610 // normMap = maxMap
602611 AffineMap normMap = getMaxMap ();
603- Value norm = computeSubAndExp2 (b, loc, maxMap, normMap, newMax, oldMax);
612+ Value norm =
613+ computeSubAndExp (b, loc, maxMap, normMap, newMax, oldMax, useExp2);
604614
605615 // normSum = norm * oldSum
606616 AffineMap sumMap = getSumMap ();
607617 Value normSum = elementwiseValueInPlace<arith::MulFOp>(b, loc, sumMap,
608618 normMap, oldSum, norm);
609619
610- // P = exp2(S - newMax)
620+ // P = exp2(S - newMax) or exp(S - newMax) depending on useExp2
611621 // PMap = SMap
612622 AffineMap pMap = sMap ;
613- Value p = computeSubAndExp2 (b, loc, maxMap, sMap , newMax, s);
623+ Value p = computeSubAndExp (b, loc, maxMap, sMap , newMax, s, useExp2 );
614624
615625 // newSum = normSum + rowSum(P)
616626 Value newSum = reduce<arith::AddFOp>(b, loc, pMap, sumMap, p, normSum);
0 commit comments