diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 0dd4f23c6d85f..5b331e4444915 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -2072,6 +2072,10 @@ class SelectionDAG {
   /// We use this predicate to simplify operations downstream.
   LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth = 0) const;
 
+  /// Return true if the sign bit of Op is known to be zero, for a
+  /// floating-point value.
+  LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth = 0) const;
+
   /// Return true if 'Op & Mask' is known to be zero. We
   /// use this predicate to simplify operations downstream. Op and Mask are
   /// known to be the same type.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f144f17d5a8f2..4f2eb1e64dbe0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18863,6 +18863,26 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
   if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
+  if (VT != N1.getValueType())
+    return SDValue();
+
+  // If this is equivalent to a disjoint or, replace it with one. This can
+  // happen if the sign operand is a sign mask (i.e., x << sign_bit_position).
+  if (DAG.SignBitIsZeroFP(N0) &&
+      DAG.computeKnownBits(N1).Zero.isMaxSignedValue()) {
+    // TODO: Just directly match the shift pattern. computeKnownBits is heavy
+    // for such a narrowly targeted case.
+    EVT IntVT = VT.changeTypeToInteger();
+    // TODO: It appears to be profitable in some situations to unconditionally
+    // emit a fabs(n0) to perform this combine.
+    SDValue CastSrc0 = DAG.getNode(ISD::BITCAST, DL, IntVT, N0);
+    SDValue CastSrc1 = DAG.getNode(ISD::BITCAST, DL, IntVT, N1);
+
+    SDValue SignOr = DAG.getNode(ISD::OR, DL, IntVT, CastSrc0, CastSrc1,
+                                 SDNodeFlags::Disjoint);
+    return DAG.getNode(ISD::BITCAST, DL, VT, SignOr);
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 80bbfea7fb83c..d7b3b03deb757 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2920,6 +2920,34 @@ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
   return MaskedValueIsZero(Op, APInt::getSignMask(BitWidth), Depth);
 }
 
+bool SelectionDAG::SignBitIsZeroFP(SDValue Op, unsigned Depth) const {
+  if (Depth >= MaxRecursionDepth)
+    return false; // Limit search depth.
+
+  unsigned Opc = Op.getOpcode();
+  switch (Opc) {
+  case ISD::FABS:
+    return true;
+  case ISD::AssertNoFPClass: {
+    FPClassTest NoFPClass =
+        static_cast<FPClassTest>(Op.getConstantOperandVal(1));
+
+    const FPClassTest TestMask = fcNan | fcNegative;
+    return (NoFPClass & TestMask) == TestMask;
+  }
+  case ISD::ARITH_FENCE:
+    return SignBitIsZeroFP(Op.getOperand(0), Depth + 1);
+  case ISD::FEXP:
+  case ISD::FEXP2:
+  case ISD::FEXP10:
+    return Op->getFlags().hasNoNaNs();
+  default:
+    return false;
+  }
+
+  llvm_unreachable("covered opcode switch");
+}
+
 /// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use
 /// this predicate to simplify operations downstream. Mask is known to be zero
 /// for bits that V cannot have.
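For reference, the IR shape this new combine targets looks like the following reduced example (hypothetical LLVM IR written in the spirit of the copysign-to-disjoint-or-combine.ll tests updated below; the function name is invented):

; %mag cannot be a NaN or a negative value (fcNan | fcNegative excluded), so
; its sign bit is known zero; %signmask.bitcast has every bit except the sign
; bit known zero. The FCOPYSIGN is therefore rewritten as
;   (bitcast (or disjoint (bitcast %mag to i32), %signmask) to float)
declare float @llvm.copysign.f32(float, float)

define float @copysign_known_signmask_example(float nofpclass(nan ninf nsub nnorm nzero) %mag, i32 %sign) {
  %signmask = shl i32 %sign, 31
  %signmask.bitcast = bitcast i32 %signmask to float
  %result = call float @llvm.copysign.f32(float %mag, float %signmask.bitcast)
  ret float %result
}
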
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 88b0809b767b5..6a9022dff41ad 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4638,6 +4638,12 @@ static std::optional<ConstantRange> getRange(const Instruction &I) {
   return std::nullopt;
 }
 
+static FPClassTest getNoFPClass(const Instruction &I) {
+  if (const auto *CB = dyn_cast<CallBase>(&I))
+    return CB->getRetNoFPClass();
+  return fcNone;
+}
+
 void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
   if (I.isAtomic())
     return visitAtomicLoad(I);
@@ -9132,6 +9138,7 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee,
 
   if (Result.first.getNode()) {
     Result.first = lowerRangeToAssertZExt(DAG, CB, Result.first);
+    Result.first = lowerNoFPClassToAssertNoFPClass(DAG, CB, Result.first);
     setValue(&CB, Result.first);
   }
 
@@ -10718,6 +10725,16 @@ SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG,
   return DAG.getMergeValues(Ops, SL);
 }
 
+SDValue SelectionDAGBuilder::lowerNoFPClassToAssertNoFPClass(
+    SelectionDAG &DAG, const Instruction &I, SDValue Op) {
+  FPClassTest Classes = getNoFPClass(I);
+  if (Classes == fcNone)
+    return Op;
+
+  return DAG.getNode(ISD::AssertNoFPClass, SDLoc(Op), Op.getValueType(), Op,
+                     DAG.getTargetConstant(Classes, SDLoc(), MVT::i32));
+}
+
 /// Populate a CallLowerinInfo (into \p CLI) based on the properties of
 /// the call being lowered.
 ///
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index ed63bee58c957..13e2daa783147 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -429,6 +429,10 @@ class SelectionDAGBuilder {
   SDValue lowerRangeToAssertZExt(SelectionDAG &DAG, const Instruction &I,
                                  SDValue Op);
 
+  // Lower nofpclass attributes to AssertNoFPClass
+  SDValue lowerNoFPClassToAssertNoFPClass(SelectionDAG &DAG,
+                                          const Instruction &I, SDValue Op);
+
   void populateCallLoweringInfo(TargetLowering::CallLoweringInfo &CLI,
                                 const CallBase *Call, unsigned ArgIdx,
                                 unsigned NumArgs, SDValue Callee,
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index 0be2b5c70c93b..ef676ddc8070e 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -345,15 +345,13 @@ define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) {
 ; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
 ; GFX9-NEXT:    v_fma_f32 v2, v2, v1, v3
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_not_b32_e32 v3, 63
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
 ; GFX9-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX9-NEXT:    v_and_b32_e32 v0, v1, v0
-; GFX9-NEXT:    s_brev_b32 s4, -2
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    v_and_or_b32 v0, v1, v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %y = sitofp i32 %y.i to float
   %y.fptosi = fptosi float %y to i32
@@ -379,7 +377,7 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-NEXT:    v_writelane_b32 v43, s16, 15
+; GFX9-NEXT:    v_writelane_b32 v43, s16, 14
 ; GFX9-NEXT:    v_writelane_b32 v43, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v43, s31, 1
 ; GFX9-NEXT:    v_writelane_b32 v43, s34, 2
@@ -391,19 +389,18 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v43, s48, 8
 ; GFX9-NEXT:    v_writelane_b32 v43, s49, 9
 ; GFX9-NEXT:    v_writelane_b32 v43, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v43, s51, 11
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_writelane_b32 v43, s52, 12
+; GFX9-NEXT:    v_writelane_b32 v43, s51, 11
 ; GFX9-NEXT:    v_mov_b32_e32 v42, v1
-; GFX9-NEXT:    v_writelane_b32 v43, s53, 13
+; GFX9-NEXT:    v_writelane_b32 v43, s52, 12
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
 ; GFX9-NEXT:    s_getpc_b64 s[16:17]
 ; GFX9-NEXT:    s_add_u32 s16, s16, _Z4log2d@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s17, s17, _Z4log2d@rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v43, s54, 14
+; GFX9-NEXT:    v_writelane_b32 v43, s53, 13
 ; GFX9-NEXT:    v_mov_b32_e32 v40, v31
 ; GFX9-NEXT:    v_mov_b32_e32 v41, v2
 ; GFX9-NEXT:    s_mov_b32 s50, s15
@@ -414,7 +411,6 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
 ; GFX9-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GFX9-NEXT:    s_mov_b64 s[38:39], s[6:7]
 ; GFX9-NEXT:    s_mov_b64 s[48:49], s[4:5]
-; GFX9-NEXT:    s_brev_b32 s54, -2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-NEXT:    v_cvt_f64_i32_e32 v[2:3], v41
 ; GFX9-NEXT:    s_getpc_b64 s[16:17]
@@ -436,8 +432,7 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_bfi_b32 v1, s54, v1, v2
-; GFX9-NEXT:    v_readlane_b32 s54, v43, 14
+; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_readlane_b32 s53, v43, 13
 ; GFX9-NEXT:    v_readlane_b32 s52, v43, 12
 ; GFX9-NEXT:    v_readlane_b32 s51, v43, 11
@@ -453,7 +448,7 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
 ; GFX9-NEXT:    v_readlane_b32 s31, v43, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v43, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
-; GFX9-NEXT:    v_readlane_b32 s4, v43, 15
+; GFX9-NEXT:    v_readlane_b32 s4, v43, 14
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll b/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll
index b99b64316b62f..afd610f4911c6 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll
@@ -81,8 +81,7 @@ define half @copysign_known_signmask_f16_known_positive_mag(half nofpclass(nan n
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 15, v1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %signmask = shl i16 %sign, 15
   %signmask.bitcast = bitcast i16 %signmask to half
@@ -94,9 +93,7 @@ define float @copysign_known_signmask_f32_known_positive_mag(float nofpclass(nan
 ; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
-; GFX9-NEXT:    s_brev_b32 s4, -2
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 31, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %signmask = shl i32 %sign, 31
   %signmask.bitcast = bitcast i32 %signmask to float
@@ -109,8 +106,7 @@ define double @copysign_known_signmask_f64_known_positive_mag(double nofpclass(n
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 31, v2
-; GFX9-NEXT:    s_brev_b32 s4, -2
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
+; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %signmask = shl i64 %sign, 63
   %signmask.bitcast = bitcast i64 %signmask to double
@@ -130,11 +126,9 @@ define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp(float %x,
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
-; GFX9-NEXT:    s_brev_b32 s4, -2
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0x114b4ea4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 31, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %signbit.known.zero = call nnan afn float @llvm.exp.f32(float %x)
   %signmask = shl i32 %sign, 31
@@ -155,10 +149,8 @@ define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp2(float %x
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_not_b32_e32 v2, 63
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT:    s_brev_b32 s4, -2
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 31, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x)
   %signmask = shl i32 %sign, 31
@@ -179,10 +171,8 @@ define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp10(float %
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_not_b32_e32 v2, 63
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT:    s_brev_b32 s4, -2
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 31, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x)
   %signmask = shl i32 %sign, 31
diff --git a/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
index 1861f02ec8b1c..5f303cc2a1eef 100644
--- a/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
@@ -35,9 +35,7 @@ define float @call_nofpclass_funcs_f32(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v2
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_max_f32_e32 v1, v3, v3
-; CHECK-NEXT:    v_max_f32_e32 v0, v0, v0
-; CHECK-NEXT:    v_min_f32_e32 v0, v1, v0
+; CHECK-NEXT:    v_min_f32_e32 v0, v3, v0
 ; CHECK-NEXT:    v_readlane_b32 s31, v4, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v4, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
@@ -87,12 +85,8 @@ define <2 x float> @call_nofpclass_funcs_v2f32(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v3
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v2
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_max_f32_e32 v2, v4, v4
-; CHECK-NEXT:    v_max_f32_e32 v0, v0, v0
-; CHECK-NEXT:    v_min_f32_e32 v0, v2, v0
-; CHECK-NEXT:    v_max_f32_e32 v2, v5, v5
-; CHECK-NEXT:    v_max_f32_e32 v1, v1, v1
-; CHECK-NEXT:    v_min_f32_e32 v1, v2, v1
+; CHECK-NEXT:    v_min_f32_e32 v0, v4, v0
+; CHECK-NEXT:    v_min_f32_e32 v1, v5, v1
 ; CHECK-NEXT:    v_readlane_b32 s31, v6, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v6, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
@@ -142,12 +136,10 @@ define double @call_nofpclass_funcs_f64(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v5
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v4
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; CHECK-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; CHECK-NEXT:    v_min_f64 v[0:1], v[2:3], v[0:1]
 ; CHECK-NEXT:    v_readlane_b32 s31, v6, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v6, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
-; CHECK-NEXT:    v_min_f64 v[0:1], v[2:3], v[0:1]
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
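
To illustrate the nofpclass-call.ll improvement, a reduced hypothetical example (function names invented): because each call result is now wrapped in AssertNoFPClass, the DAG knows neither minnum operand can be a NaN, so the AMDGPU lowering no longer emits the quieting v_max x, x on each operand before the v_min:

declare nofpclass(nan) float @nofpclass_ret_f32(float)
declare float @llvm.minnum.f32(float, float)

define float @minnum_of_nofpclass_calls(float %a, float %b) {
  %x = call nofpclass(nan) float @nofpclass_ret_f32(float %a)
  %y = call nofpclass(nan) float @nofpclass_ret_f32(float %b)
  %min = call float @llvm.minnum.f32(float %x, float %y)
  ret float %min
}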