Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions llvm/include/llvm/CodeGen/SelectionDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -2072,6 +2072,10 @@ class SelectionDAG {
/// We use this predicate to simplify operations downstream.
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth = 0) const;

/// Return true if the sign bit of Op is known to be zero, for a
/// floating-point value.
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth = 0) const;

/// Return true if 'Op & Mask' is known to be zero. We
/// use this predicate to simplify operations downstream. Op and Mask are
/// known to be the same type.
Expand Down
20 changes: 20 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18863,6 +18863,26 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);

if (VT != N1.getValueType())
return SDValue();

// If this is equivalent to a disjoint or, replace it with one. This can
// happen if the sign operand is a sign mask (i.e., x << sign_bit_position).
if (DAG.SignBitIsZeroFP(N0) &&
DAG.computeKnownBits(N1).Zero.isMaxSignedValue()) {
// TODO: Just directly match the shift pattern. computeKnownBits is heavy
// for a such a narrowly targeted case.
EVT IntVT = VT.changeTypeToInteger();
// TODO: It appears to be profitable in some situations to unconditionally
// emit a fabs(n0) to perform this combine.
SDValue CastSrc0 = DAG.getNode(ISD::BITCAST, DL, IntVT, N0);
SDValue CastSrc1 = DAG.getNode(ISD::BITCAST, DL, IntVT, N1);

SDValue SignOr = DAG.getNode(ISD::OR, DL, IntVT, CastSrc0, CastSrc1,
SDNodeFlags::Disjoint);
return DAG.getNode(ISD::BITCAST, DL, VT, SignOr);
}

return SDValue();
}

Expand Down
28 changes: 28 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2920,6 +2920,34 @@ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
return MaskedValueIsZero(Op, APInt::getSignMask(BitWidth), Depth);
}

bool SelectionDAG::SignBitIsZeroFP(SDValue Op, unsigned Depth) const {
if (Depth >= MaxRecursionDepth)
return false; // Limit search depth.

unsigned Opc = Op.getOpcode();
switch (Opc) {
case ISD::FABS:
return true;
case ISD::AssertNoFPClass: {
FPClassTest NoFPClass =
static_cast<FPClassTest>(Op.getConstantOperandVal(1));

const FPClassTest TestMask = fcNan | fcNegative;
return (NoFPClass & TestMask) == TestMask;
}
case ISD::ARITH_FENCE:
return SignBitIsZeroFP(Op, Depth + 1);
case ISD::FEXP:
case ISD::FEXP2:
case ISD::FEXP10:
return Op->getFlags().hasNoNaNs();
default:
return false;
}

llvm_unreachable("covered opcode switch");
}

/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use
/// this predicate to simplify operations downstream. Mask is known to be zero
/// for bits that V cannot have.
Expand Down
17 changes: 17 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4638,6 +4638,12 @@ static std::optional<ConstantRange> getRange(const Instruction &I) {
return std::nullopt;
}

static FPClassTest getNoFPClass(const Instruction &I) {
if (const auto *CB = dyn_cast<CallBase>(&I))
return CB->getRetNoFPClass();
return fcNone;
}

void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
if (I.isAtomic())
return visitAtomicLoad(I);
Expand Down Expand Up @@ -9132,6 +9138,7 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee,

if (Result.first.getNode()) {
Result.first = lowerRangeToAssertZExt(DAG, CB, Result.first);
Result.first = lowerNoFPClassToAssertNoFPClass(DAG, CB, Result.first);
setValue(&CB, Result.first);
}

Expand Down Expand Up @@ -10718,6 +10725,16 @@ SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG,
return DAG.getMergeValues(Ops, SL);
}

SDValue SelectionDAGBuilder::lowerNoFPClassToAssertNoFPClass(
SelectionDAG &DAG, const Instruction &I, SDValue Op) {
FPClassTest Classes = getNoFPClass(I);
if (Classes == fcNone)
return Op;

return DAG.getNode(ISD::AssertNoFPClass, SDLoc(Op), Op.getValueType(), Op,
DAG.getTargetConstant(Classes, SDLoc(), MVT::i32));
}

/// Populate a CallLowerinInfo (into \p CLI) based on the properties of
/// the call being lowered.
///
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,10 @@ class SelectionDAGBuilder {
SDValue lowerRangeToAssertZExt(SelectionDAG &DAG, const Instruction &I,
SDValue Op);

// Lower nofpclass attributes to AssertNoFPClass
SDValue lowerNoFPClassToAssertNoFPClass(SelectionDAG &DAG,
const Instruction &I, SDValue Op);

void populateCallLoweringInfo(TargetLowering::CallLoweringInfo &CLI,
const CallBase *Call, unsigned ArgIdx,
unsigned NumArgs, SDValue Callee,
Expand Down
23 changes: 9 additions & 14 deletions llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
Original file line number Diff line number Diff line change
Expand Up @@ -345,15 +345,13 @@ define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) {
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; GFX9-NEXT: v_fma_f32 v2, v2, v1, v3
; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX9-NEXT: v_exp_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX9-NEXT: v_not_b32_e32 v3, 63
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; GFX9-NEXT: v_ldexp_f32 v2, v2, v3
; GFX9-NEXT: v_and_b32_e32 v0, v1, v0
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_bfi_b32 v0, s4, v2, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%y = sitofp i32 %y.i to float
%y.fptosi = fptosi float %y to i32
Expand All @@ -379,7 +377,7 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-NEXT: v_writelane_b32 v43, s16, 15
; GFX9-NEXT: v_writelane_b32 v43, s16, 14
; GFX9-NEXT: v_writelane_b32 v43, s30, 0
; GFX9-NEXT: v_writelane_b32 v43, s31, 1
; GFX9-NEXT: v_writelane_b32 v43, s34, 2
Expand All @@ -391,19 +389,18 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
; GFX9-NEXT: v_writelane_b32 v43, s48, 8
; GFX9-NEXT: v_writelane_b32 v43, s49, 9
; GFX9-NEXT: v_writelane_b32 v43, s50, 10
; GFX9-NEXT: v_writelane_b32 v43, s51, 11
; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v43, s52, 12
; GFX9-NEXT: v_writelane_b32 v43, s51, 11
; GFX9-NEXT: v_mov_b32_e32 v42, v1
; GFX9-NEXT: v_writelane_b32 v43, s53, 13
; GFX9-NEXT: v_writelane_b32 v43, s52, 12
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42
; GFX9-NEXT: s_getpc_b64 s[16:17]
; GFX9-NEXT: s_add_u32 s16, s16, _Z4log2d@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s17, s17, _Z4log2d@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v43, s54, 14
; GFX9-NEXT: v_writelane_b32 v43, s53, 13
; GFX9-NEXT: v_mov_b32_e32 v40, v31
; GFX9-NEXT: v_mov_b32_e32 v41, v2
; GFX9-NEXT: s_mov_b32 s50, s15
Expand All @@ -414,7 +411,6 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
; GFX9-NEXT: s_mov_b64 s[36:37], s[8:9]
; GFX9-NEXT: s_mov_b64 s[38:39], s[6:7]
; GFX9-NEXT: s_mov_b64 s[48:49], s[4:5]
; GFX9-NEXT: s_brev_b32 s54, -2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT: v_cvt_f64_i32_e32 v[2:3], v41
; GFX9-NEXT: s_getpc_b64 s[16:17]
Expand All @@ -436,8 +432,7 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: v_bfi_b32 v1, s54, v1, v2
; GFX9-NEXT: v_readlane_b32 s54, v43, 14
; GFX9-NEXT: v_or_b32_e32 v1, v1, v2
; GFX9-NEXT: v_readlane_b32 s53, v43, 13
; GFX9-NEXT: v_readlane_b32 s52, v43, 12
; GFX9-NEXT: v_readlane_b32 s51, v43, 11
Expand All @@ -453,7 +448,7 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
; GFX9-NEXT: v_readlane_b32 s31, v43, 1
; GFX9-NEXT: v_readlane_b32 s30, v43, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v43, 15
; GFX9-NEXT: v_readlane_b32 s4, v43, 14
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
Expand Down
22 changes: 6 additions & 16 deletions llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,7 @@ define half @copysign_known_signmask_f16_known_positive_mag(half nofpclass(nan n
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 15, v1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%signmask = shl i16 %sign, 15
%signmask.bitcast = bitcast i16 %signmask to half
Expand All @@ -94,9 +93,7 @@ define float @copysign_known_signmask_f32_known_positive_mag(float nofpclass(nan
; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%signmask = shl i32 %sign, 31
%signmask.bitcast = bitcast i32 %signmask to float
Expand All @@ -109,8 +106,7 @@ define double @copysign_known_signmask_f64_known_positive_mag(double nofpclass(n
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 31, v2
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
; GFX9-NEXT: v_or_b32_e32 v1, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%signmask = shl i64 %sign, 63
%signmask.bitcast = bitcast i64 %signmask to double
Expand All @@ -130,11 +126,9 @@ define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp(float %x,
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; GFX9-NEXT: v_exp_f32_e32 v0, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_mul_f32_e32 v2, 0x114b4ea4, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%signbit.known.zero = call nnan afn float @llvm.exp.f32(float %x)
%signmask = shl i32 %sign, 31
Expand All @@ -155,10 +149,8 @@ define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp2(float %x
; GFX9-NEXT: v_exp_f32_e32 v0, v0
; GFX9-NEXT: v_not_b32_e32 v2, 63
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x)
%signmask = shl i32 %sign, 31
Expand All @@ -179,10 +171,8 @@ define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp10(float %
; GFX9-NEXT: v_exp_f32_e32 v0, v0
; GFX9-NEXT: v_not_b32_e32 v2, 63
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x)
%signmask = shl i32 %sign, 31
Expand Down
16 changes: 4 additions & 12 deletions llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,7 @@ define float @call_nofpclass_funcs_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: v_mov_b32_e32 v3, v0
; CHECK-NEXT: v_mov_b32_e32 v0, v2
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_max_f32_e32 v1, v3, v3
; CHECK-NEXT: v_max_f32_e32 v0, v0, v0
; CHECK-NEXT: v_min_f32_e32 v0, v1, v0
; CHECK-NEXT: v_min_f32_e32 v0, v3, v0
; CHECK-NEXT: v_readlane_b32 s31, v4, 1
; CHECK-NEXT: v_readlane_b32 s30, v4, 0
; CHECK-NEXT: s_mov_b32 s32, s33
Expand Down Expand Up @@ -87,12 +85,8 @@ define <2 x float> @call_nofpclass_funcs_v2f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: v_mov_b32_e32 v0, v3
; CHECK-NEXT: v_mov_b32_e32 v1, v2
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_max_f32_e32 v2, v4, v4
; CHECK-NEXT: v_max_f32_e32 v0, v0, v0
; CHECK-NEXT: v_min_f32_e32 v0, v2, v0
; CHECK-NEXT: v_max_f32_e32 v2, v5, v5
; CHECK-NEXT: v_max_f32_e32 v1, v1, v1
; CHECK-NEXT: v_min_f32_e32 v1, v2, v1
; CHECK-NEXT: v_min_f32_e32 v0, v4, v0
; CHECK-NEXT: v_min_f32_e32 v1, v5, v1
; CHECK-NEXT: v_readlane_b32 s31, v6, 1
; CHECK-NEXT: v_readlane_b32 s30, v6, 0
; CHECK-NEXT: s_mov_b32 s32, s33
Expand Down Expand Up @@ -142,12 +136,10 @@ define double @call_nofpclass_funcs_f64(ptr addrspace(1) %ptr) {
; CHECK-NEXT: v_mov_b32_e32 v0, v5
; CHECK-NEXT: v_mov_b32_e32 v1, v4
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; CHECK-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; CHECK-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
; CHECK-NEXT: v_readlane_b32 s31, v6, 1
; CHECK-NEXT: v_readlane_b32 s30, v6, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
Expand Down
Loading