Skip to content

Commit 249ea3c

Browse files
Mike PallBuristan
authored andcommitted
ARM64: Fix LDP/STP fusion (again).
Reported and analyzed by Zhongwei Yao. Fix by Peter Cawley. (cherry picked from commit b8c6ccd) Assume we have stores/loads from the pointer with offset +488 and -16. The lower bits of the offset are the same as for the offset (488 + 8). This leads to the incorrect fusion of these instructions: | str x20, [x21, 488] | stur x20, [x21, -16] to the following instruction: | stp x20, x20, [x21, 488] This patch prevents this fusion by more accurate offset comparison. Sergey Kaplun: * added the description and the test for the problem Part of tarantool/tarantool#11691 Reviewed-by: Sergey Bronnikov <[email protected]> Signed-off-by: Sergey Kaplun <[email protected]>
1 parent bb02057 commit 249ea3c

File tree

2 files changed

+142
-4
lines changed

2 files changed

+142
-4
lines changed

src/lj_emit_arm64.h

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,17 @@ static int emit_checkofs(A64Ins ai, int64_t ofs)
121121
}
122122
}
123123

124+
static LJ_AINLINE uint32_t emit_lso_pair_candidate(A64Ins ai, int ofs, int sc)
125+
{
126+
if (ofs >= 0) {
127+
return ai | A64F_U12(ofs>>sc); /* Subsequent lj_ror checks ofs. */
128+
} else if (ofs >= -256) {
129+
return (ai^A64I_LS_U) | A64F_S9(ofs & 0x1ff);
130+
} else {
131+
return A64F_D(31); /* Will mismatch prev. */
132+
}
133+
}
134+
124135
static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs)
125136
{
126137
int ot = emit_checkofs(ai, ofs), sc = (ai >> 30) & 3;
@@ -132,11 +143,9 @@ static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs)
132143
uint32_t prev = *as->mcp & ~A64F_D(31);
133144
int ofsm = ofs - (1<<sc), ofsp = ofs + (1<<sc);
134145
A64Ins aip;
135-
if (prev == (ai | A64F_N(rn) | A64F_U12(ofsm>>sc)) ||
136-
prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsm&0x1ff))) {
146+
if (prev == emit_lso_pair_candidate(ai | A64F_N(rn), ofsm, sc)) {
137147
aip = (A64F_A(rd) | A64F_D(*as->mcp & 31));
138-
} else if (prev == (ai | A64F_N(rn) | A64F_U12(ofsp>>sc)) ||
139-
prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsp&0x1ff))) {
148+
} else if (prev == emit_lso_pair_candidate(ai | A64F_N(rn), ofsp, sc)) {
140149
aip = (A64F_D(rd) | A64F_A(*as->mcp & 31));
141150
ofsm = ofs;
142151
} else {
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
local tap = require('tap')
2+
local ffi = require('ffi')
3+
4+
-- This test demonstrates LuaJIT's incorrect emitting of LDP/STP
5+
-- instruction fused from LDR/STR with negative offset and
6+
-- positive offset with the same lower bits on arm64.
7+
-- See also https://github.com/LuaJIT/LuaJIT/pull/1075.
8+
local test = tap.test('lj-1075-arm64-incorrect-ldp-stp-fusion'):skipcond({
9+
['Test requires JIT enabled'] = not jit.status(),
10+
})
11+
12+
test:plan(6)
13+
14+
-- Amount of iterations to compile and run the invariant part of
15+
-- the trace.
16+
local N_ITERATIONS = 4
17+
18+
local EXPECTED = 42
19+
20+
-- 4 slots of redzone for int64_t load/store.
21+
local REDZONE = 4
22+
local MASK_IMM7 = 0x7f
23+
local BUFLEN = (MASK_IMM7 + REDZONE) * 4
24+
local buf = ffi.new('unsigned char [' .. BUFLEN .. ']', 0)
25+
26+
local function clear_buf()
27+
ffi.fill(buf, ffi.sizeof(buf), 0)
28+
end
29+
30+
-- Initialize the buffer with simple values.
31+
local function init_buf()
32+
-- Limit to fill the buffer. 0 in the top part helps
33+
-- to detect the issue.
34+
local LIMIT = BUFLEN - 12
35+
for i = 0, LIMIT - 1 do
36+
buf[i] = i
37+
end
38+
for i = LIMIT, BUFLEN - 1 do
39+
buf[i] = 0
40+
end
41+
end
42+
43+
jit.opt.start('hotloop=1')
44+
45+
-- Assume we have stores/loads from the pointer with offset
46+
-- +488 and -16. The lower 7 bits of the offset (-16) >> 2 are
47+
-- 1111100. These bits are the same as for the offset (488 + 8).
48+
-- Thus, before the patch, these two instructions:
49+
-- | str x20, [x21, #488]
50+
-- | stur x20, [x21, #-16]
51+
-- are incorrectly fused to the:
52+
-- | stp x20, x20, [x21, #488]
53+
54+
-- Test stores.
55+
56+
local start = ffi.cast('unsigned char *', buf)
57+
-- Use constants to allow optimization to take place.
58+
local base_ptr = start + 16
59+
for _ = 1, N_ITERATIONS do
60+
-- Save the result only for the last iteration.
61+
clear_buf()
62+
-- These 2 accesses become `base_ptr + 488` and `base_ptr + 496`
63+
-- on the trace before the patch.
64+
ffi.cast('uint64_t *', base_ptr + 488)[0] = EXPECTED
65+
ffi.cast('uint64_t *', base_ptr - 16)[0] = EXPECTED
66+
end
67+
68+
test:is(buf[488 + 16], EXPECTED, 'correct store top value')
69+
test:is(buf[0], EXPECTED, 'correct store bottom value')
70+
71+
-- Test loads.
72+
73+
init_buf()
74+
75+
local top, bottom
76+
for _ = 1, N_ITERATIONS do
77+
-- These 2 accesses become `base_ptr + 488` and `base_ptr + 496`
78+
-- on the trace before the patch.
79+
top = ffi.cast('uint64_t *', base_ptr + 488)[0]
80+
bottom = ffi.cast('uint64_t *', base_ptr - 16)[0]
81+
end
82+
83+
test:is(top, 0xfffefdfcfbfaf9f8ULL, 'correct load top value')
84+
test:is(bottom, 0x706050403020100ULL, 'correct load bottom value')
85+
86+
-- Another reproducer that is based on the snapshot restoring.
87+
-- Its advantage is avoiding FFI usage.
88+
89+
-- Snapshot slots are restored in the reversed order.
90+
-- The recording order is the following (from the bottom of the
91+
-- trace to the top):
92+
-- - 0th (ofs == -16) -- `f64()` replaced the `tail64()` on the
93+
-- stack,
94+
-- - 63rd (ofs == 488) -- 1,
95+
-- - 64th (ofs == 496) -- 2.
96+
-- At recording, the instructions for the 0th and 63rd slots are
97+
-- merged like the following:
98+
-- | str x3, [x19, #496]
99+
-- | stp x2, x1, [x19, #488]
100+
-- The first store is dominated by the stp, so the restored value
101+
-- is incorrect.
102+
103+
-- Function with 63 slots on the stack.
104+
local function f63()
105+
-- 61 unused slots to avoid extra stores in between.
106+
-- luacheck: no unused
107+
local _, _, _, _, _, _, _, _, _, _
108+
local _, _, _, _, _, _, _, _, _, _
109+
local _, _, _, _, _, _, _, _, _, _
110+
local _, _, _, _, _, _, _, _, _, _
111+
local _, _, _, _, _, _, _, _, _, _
112+
local _, _, _, _, _, _, _, _, _, _
113+
local _
114+
return 1, 2
115+
end
116+
117+
local function tail63()
118+
return f63()
119+
end
120+
121+
-- Record the trace.
122+
tail63()
123+
tail63()
124+
-- Run the trace.
125+
local one, two = tail63()
126+
test:is(one, 1, 'correct 1st value on stack')
127+
test:is(two, 2, 'correct 2nd value on stack')
128+
129+
test:done(true)

0 commit comments

Comments
 (0)