-
Notifications
You must be signed in to change notification settings - Fork 15.2k
DAG: Fold copysign with a known signmask to a disjoint or #167266
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
|
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesIf the sign bit is a computed sign mask (i.e., we know it's We also need to know the sign bit of the magnitude is 0 for This is intended to help avoid the regression which caused Full diff: https://github.com/llvm/llvm-project/pull/167266.diff 5 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 0dd4f23c6d85f..5b331e4444915 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -2072,6 +2072,10 @@ class SelectionDAG {
/// We use this predicate to simplify operations downstream.
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth = 0) const;
+ /// Return true if the sign bit of Op is known to be zero, for a
+ /// floating-point value.
+ LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth = 0) const;
+
/// Return true if 'Op & Mask' is known to be zero. We
/// use this predicate to simplify operations downstream. Op and Mask are
/// known to be the same type.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f144f17d5a8f2..4f2eb1e64dbe0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18863,6 +18863,26 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
+ if (VT != N1.getValueType())
+ return SDValue();
+
+ // If this is equivalent to a disjoint or, replace it with one. This can
+ // happen if the sign operand is a sign mask (i.e., x << sign_bit_position).
+ if (DAG.SignBitIsZeroFP(N0) &&
+ DAG.computeKnownBits(N1).Zero.isMaxSignedValue()) {
+ // TODO: Just directly match the shift pattern. computeKnownBits is heavy
+ // for a such a narrowly targeted case.
+ EVT IntVT = VT.changeTypeToInteger();
+ // TODO: It appears to be profitable in some situations to unconditionally
+ // emit a fabs(n0) to perform this combine.
+ SDValue CastSrc0 = DAG.getNode(ISD::BITCAST, DL, IntVT, N0);
+ SDValue CastSrc1 = DAG.getNode(ISD::BITCAST, DL, IntVT, N1);
+
+ SDValue SignOr = DAG.getNode(ISD::OR, DL, IntVT, CastSrc0, CastSrc1,
+ SDNodeFlags::Disjoint);
+ return DAG.getNode(ISD::BITCAST, DL, VT, SignOr);
+ }
+
return SDValue();
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 80bbfea7fb83c..d7b3b03deb757 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2920,6 +2920,34 @@ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
return MaskedValueIsZero(Op, APInt::getSignMask(BitWidth), Depth);
}
+bool SelectionDAG::SignBitIsZeroFP(SDValue Op, unsigned Depth) const {
+ if (Depth >= MaxRecursionDepth)
+ return false; // Limit search depth.
+
+ unsigned Opc = Op.getOpcode();
+ switch (Opc) {
+ case ISD::FABS:
+ return true;
+ case ISD::AssertNoFPClass: {
+ FPClassTest NoFPClass =
+ static_cast<FPClassTest>(Op.getConstantOperandVal(1));
+
+ const FPClassTest TestMask = fcNan | fcNegative;
+ return (NoFPClass & TestMask) == TestMask;
+ }
+ case ISD::ARITH_FENCE:
+ return SignBitIsZeroFP(Op, Depth + 1);
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FEXP10:
+ return Op->getFlags().hasNoNaNs();
+ default:
+ return false;
+ }
+
+ llvm_unreachable("covered opcode switch");
+}
+
/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use
/// this predicate to simplify operations downstream. Mask is known to be zero
/// for bits that V cannot have.
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index 0be2b5c70c93b..ef676ddc8070e 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -345,15 +345,13 @@ define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) {
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; GFX9-NEXT: v_fma_f32 v2, v2, v1, v3
-; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX9-NEXT: v_exp_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX9-NEXT: v_not_b32_e32 v3, 63
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; GFX9-NEXT: v_ldexp_f32 v2, v2, v3
-; GFX9-NEXT: v_and_b32_e32 v0, v1, v0
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v0, s4, v2, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%y = sitofp i32 %y.i to float
%y.fptosi = fptosi float %y to i32
@@ -379,7 +377,7 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-NEXT: v_writelane_b32 v43, s16, 15
+; GFX9-NEXT: v_writelane_b32 v43, s16, 14
; GFX9-NEXT: v_writelane_b32 v43, s30, 0
; GFX9-NEXT: v_writelane_b32 v43, s31, 1
; GFX9-NEXT: v_writelane_b32 v43, s34, 2
@@ -391,19 +389,18 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
; GFX9-NEXT: v_writelane_b32 v43, s48, 8
; GFX9-NEXT: v_writelane_b32 v43, s49, 9
; GFX9-NEXT: v_writelane_b32 v43, s50, 10
-; GFX9-NEXT: v_writelane_b32 v43, s51, 11
; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: v_writelane_b32 v43, s52, 12
+; GFX9-NEXT: v_writelane_b32 v43, s51, 11
; GFX9-NEXT: v_mov_b32_e32 v42, v1
-; GFX9-NEXT: v_writelane_b32 v43, s53, 13
+; GFX9-NEXT: v_writelane_b32 v43, s52, 12
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42
; GFX9-NEXT: s_getpc_b64 s[16:17]
; GFX9-NEXT: s_add_u32 s16, s16, _Z4log2d@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s17, s17, _Z4log2d@rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v43, s54, 14
+; GFX9-NEXT: v_writelane_b32 v43, s53, 13
; GFX9-NEXT: v_mov_b32_e32 v40, v31
; GFX9-NEXT: v_mov_b32_e32 v41, v2
; GFX9-NEXT: s_mov_b32 s50, s15
@@ -414,7 +411,6 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
; GFX9-NEXT: s_mov_b64 s[36:37], s[8:9]
; GFX9-NEXT: s_mov_b64 s[38:39], s[6:7]
; GFX9-NEXT: s_mov_b64 s[48:49], s[4:5]
-; GFX9-NEXT: s_brev_b32 s54, -2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT: v_cvt_f64_i32_e32 v[2:3], v41
; GFX9-NEXT: s_getpc_b64 s[16:17]
@@ -436,8 +432,7 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: v_bfi_b32 v1, s54, v1, v2
-; GFX9-NEXT: v_readlane_b32 s54, v43, 14
+; GFX9-NEXT: v_or_b32_e32 v1, v1, v2
; GFX9-NEXT: v_readlane_b32 s53, v43, 13
; GFX9-NEXT: v_readlane_b32 s52, v43, 12
; GFX9-NEXT: v_readlane_b32 s51, v43, 11
@@ -453,7 +448,7 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
; GFX9-NEXT: v_readlane_b32 s31, v43, 1
; GFX9-NEXT: v_readlane_b32 s30, v43, 0
; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: v_readlane_b32 s4, v43, 15
+; GFX9-NEXT: v_readlane_b32 s4, v43, 14
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll b/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll
index b99b64316b62f..afd610f4911c6 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll
@@ -81,8 +81,7 @@ define half @copysign_known_signmask_f16_known_positive_mag(half nofpclass(nan n
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 15, v1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%signmask = shl i16 %sign, 15
%signmask.bitcast = bitcast i16 %signmask to half
@@ -94,9 +93,7 @@ define float @copysign_known_signmask_f32_known_positive_mag(float nofpclass(nan
; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%signmask = shl i32 %sign, 31
%signmask.bitcast = bitcast i32 %signmask to float
@@ -109,8 +106,7 @@ define double @copysign_known_signmask_f64_known_positive_mag(double nofpclass(n
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 31, v2
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
+; GFX9-NEXT: v_or_b32_e32 v1, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%signmask = shl i64 %sign, 63
%signmask.bitcast = bitcast i64 %signmask to double
@@ -130,11 +126,9 @@ define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp(float %x,
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
-; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_mul_f32_e32 v2, 0x114b4ea4, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%signbit.known.zero = call nnan afn float @llvm.exp.f32(float %x)
%signmask = shl i32 %sign, 31
@@ -155,10 +149,8 @@ define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp2(float %x
; GFX9-NEXT: v_exp_f32_e32 v0, v0
; GFX9-NEXT: v_not_b32_e32 v2, 63
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x)
%signmask = shl i32 %sign, 31
@@ -179,10 +171,8 @@ define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp10(float %
; GFX9-NEXT: v_exp_f32_e32 v0, v0
; GFX9-NEXT: v_not_b32_e32 v2, 63
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x)
%signmask = shl i32 %sign, 31
|
This defends against regressions in future patches. This excludes the target intrinsic case for now; I'm worried introducing an intermediate AssertNoFPClass is likely to break combines.
If the sign bit is a computed sign mask (i.e., we know it's either +0 or -0), turn this into a disjoint or. This pattern appears in the pow implementations. We also need to know the sign bit of the magnitude is 0 for the or to be disjoint. Unfortunately the DAG's FP tracking is weak and we did not have a way to check if the sign bit is known 0, so add something for that. Ideally we would get a complete computeKnownFPClass implementation. This is intended to help avoid the regression which caused d3e7c4c to be reverted.
This reverts commit 49e2c3aa7a861fc8864c2d045b3804e31e1f13cc. One case is slighly more sophisticated
2d6fb0f to
ed01dc9
Compare
669fbce to
b86a3c5
Compare

If the sign bit is a computed sign mask (i.e., we know it's
either +0 or -0), turn this into a disjoint or. This pattern
appears in the pow implementations.
We also need to know the sign bit of the magnitude is 0 for
the or to be disjoint. Unfortunately the DAG's FP tracking is
weak and we did not have a way to check if the sign bit is known
0, so add something for that. Ideally we would get a complete
computeKnownFPClass implementation.
This is intended to help avoid the regression which caused
d3e7c4c to be reverted.