|
local tap = require('tap')
local ffi = require('ffi')

-- This test demonstrates LuaJIT's incorrect emitting of LDP/STP
-- instruction fused from LDR/STR with negative offset and
-- positive offset with the same lower bits on arm64.
-- See also https://github.com/LuaJIT/LuaJIT/pull/1075.
local test = tap.test('lj-1075-arm64-incorrect-ldp-stp-fusion'):skipcond({
  ['Test requires JIT enabled'] = not jit.status(),
})

test:plan(6)

-- Amount of iterations to compile and run the invariant part of
-- the trace.
local N_ITERATIONS = 4

-- Sentinel value written through / read back from the buffer.
local EXPECTED = 42

-- 4 slots of redzone for int64_t load/store.
local REDZONE = 4
-- LDP/STP encode a signed, scaled 7-bit immediate offset.
local MASK_IMM7 = 0x7f
-- Buffer spanning the whole imm7 offset range plus the redzone:
-- (0x7f + 4) * 4 == 524 bytes, zero-initialized.
local BUFLEN = (MASK_IMM7 + REDZONE) * 4
local buf = ffi.new('unsigned char [' .. BUFLEN .. ']', 0)
| 25 | + |
-- Zero the whole scratch buffer (used before each store-test
-- iteration so only the last iteration's stores remain).
local function clear_buf()
  ffi.fill(buf, ffi.sizeof(buf), 0)
end
| 29 | + |
-- Initialize the buffer with simple values: each byte below the
-- limit holds its own index (truncated to 8 bits by the
-- `unsigned char` element type); bytes at and above the limit
-- hold 0.
local function init_buf()
  -- Limit to fill the buffer. 0 in the top part helps
  -- to detect the issue.
  local LIMIT = BUFLEN - 12
  for i = 0, BUFLEN - 1 do
    -- NB: `i` is always truthy in Lua (including 0), so the
    -- and/or ternary is safe here.
    buf[i] = i < LIMIT and i or 0
  end
end
| 42 | + |
-- Compile the loops below as early as possible.
jit.opt.start('hotloop=1')

-- Assume we have stores/loads from the pointer with offset
-- +488 and -16. The lower 7 bits of the offset (-16) >> 2 are
-- 1111100. These bits are the same as for the offset (488 + 8).
-- Thus, before the patch, these two instructions:
-- | str x20, [x21, #488]
-- | stur x20, [x21, #-16]
-- are incorrectly fused to the:
-- | stp x20, x20, [x21, #488]

-- Test stores.

local start = ffi.cast('unsigned char *', buf)
-- Use constants to allow optimization to take place.
local base_ptr = start + 16
for _ = 1, N_ITERATIONS do
  -- Save the result only for the last iteration.
  clear_buf()
  -- These 2 accesses become `base_ptr + 488` and `base_ptr + 496`
  -- on the trace before the patch.
  ffi.cast('uint64_t *', base_ptr + 488)[0] = EXPECTED
  ffi.cast('uint64_t *', base_ptr - 16)[0] = EXPECTED
end

-- `base_ptr + 488` is buf[504..511] and `base_ptr - 16` is
-- buf[0..7]; check the low byte of each little-endian value.
test:is(buf[488 + 16], EXPECTED, 'correct store top value')
test:is(buf[0], EXPECTED, 'correct store bottom value')
| 70 | + |
-- Test loads.

init_buf()

local top, bottom
for _ = 1, N_ITERATIONS do
  -- These 2 accesses become `base_ptr + 488` and `base_ptr + 496`
  -- on the trace before the patch.
  top = ffi.cast('uint64_t *', base_ptr + 488)[0]
  bottom = ffi.cast('uint64_t *', base_ptr - 16)[0]
end

-- Expected values are the little-endian uint64_t reads of the
-- pattern `init_buf()` wrote at buf[504..511] and buf[0..7].
test:is(top, 0xfffefdfcfbfaf9f8ULL, 'correct load top value')
test:is(bottom, 0x706050403020100ULL, 'correct load bottom value')
| 85 | + |
-- Another reproducer that is based on the snapshot restoring.
-- Its advantage is avoiding FFI usage.

-- Snapshot slots are restored in the reversed order.
-- The recording order is the following (from the bottom of the
-- trace to the top):
-- - 0th (ofs == -16) -- `f64()` replaced the `tail64()` on the
--   stack,
-- - 63rd (ofs == 488) -- 1,
-- - 64th (ofs == 496) -- 2.
-- At recording, the instructions for the 0th and 63rd slots are
-- merged like the following:
-- | str x3, [x19, #496]
-- | stp x2, x1, [x19, #488]
-- The first store is dominated by the stp, so the restored value
-- is incorrect.

-- Function with 63 slots on the stack.
-- NOTE: the exact number of locals fixes the stack slot offsets
-- (-16 / 488 / 496) needed to trigger the bad fusion; do not
-- restructure this function.
local function f63()
  -- 61 unused slots to avoid extra stores in between.
  -- luacheck: no unused
  local _, _, _, _, _, _, _, _, _, _
  local _, _, _, _, _, _, _, _, _, _
  local _, _, _, _, _, _, _, _, _, _
  local _, _, _, _, _, _, _, _, _, _
  local _, _, _, _, _, _, _, _, _, _
  local _, _, _, _, _, _, _, _, _, _
  local _
  return 1, 2
end
| 116 | + |
-- Tail-call wrapper: the tail call makes `f63()` replace
-- `tail63()` on the stack, producing the snapshot slot with the
-- negative (-16) frame offset.
local function tail63()
  return f63()
end
| 120 | + |
-- Record the trace (hotloop=1, so the second call compiles it).
tail63()
tail63()
-- Run the trace.
local one, two = tail63()
-- If the snapshot-slot stores were wrongly fused, the values
-- restored from the snapshot differ from what `f63()` returned.
test:is(one, 1, 'correct 1st value on stack')
test:is(two, 2, 'correct 2nd value on stack')

test:done(true)