Skip to content

Commit 90cba38

Browse files
authored
Correctly reuse storage with SubArrays (#33)
The definition of IntVector caused an implicit copy to convert SA to a Vector{Int}, negating the efforts to reuse memory. Replace it with @view to get reasonable memory consumption.

Suffix sorting a 100 MiB UInt8 array (an aarch64 executable):

before fix: 10.324155 seconds (75 allocations: 16.797 GiB, 1.12% gc time)
after fix: 8.059011 seconds (26 allocations: 400.007 MiB, 0.00% gc time)

Comparison with a few commonly-used suffix sorting libraries:

libsais: 7.563240 seconds (3 allocations: 400.000 MiB, 0.00% gc time)
divsufsort: 4.244779 seconds (3 allocations: 400.000 MiB, 0.00% gc time)
1 parent 4262f27 commit 90cba38

File tree

2 files changed

+27
-30
lines changed

2 files changed

+27
-30
lines changed

src/sais.jl

Lines changed: 18 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -24,19 +24,7 @@
2424
* OTHER DEALINGS IN THE SOFTWARE.
2525
=#
2626

27-
struct IntVector <: AbstractVector{Int}
28-
vec::Array{Int,1}
29-
off::Int
30-
end
31-
Base.size(v::IntVector) = (length(v.vec)-v.off,)
32-
Base.getindex(v::IntVector, key) = v.vec[v.off+Int(key)]
33-
Base.setindex!(v::IntVector, value, key) = v.vec[v.off+Int(key)] = value
34-
35-
# TODO:
36-
# - refactor code to simplify
37-
# - build user interface for string operations
38-
39-
function getcounts(T::AbstractVector{<:Integer}, C::IntVector, n::Int, k::Int)
27+
function getcounts(T::AbstractVector{<:Integer}, C::AbstractVector{<:Integer}, n::Int, k::Int)
4028
for i = 1:k
4129
C[i] = 0
4230
end
@@ -45,7 +33,7 @@ function getcounts(T::AbstractVector{<:Integer}, C::IntVector, n::Int, k::Int)
4533
end
4634
end
4735

48-
function getbuckets(C::IntVector, B::IntVector, k::Int, isend::Bool)
36+
function getbuckets(C::AbstractVector{<:Integer}, B::AbstractVector{<:Integer}, k::Int, isend::Bool)
4937
s = 0
5038
if isend != false
5139
for i = 1:k
@@ -71,28 +59,28 @@ function sais(
7159
pidx = 0
7260
flags = 0
7361
if k <= 256
74-
C = IntVector(zeros(Int, k), 0)
62+
C = zeros(Int, k)
7563
if k <= fs
76-
B = IntVector(SA, n + fs - k)
64+
B = @view SA[n+fs-k+1:end]
7765
flags = 1
7866
else
79-
B = IntVector(zeros(Int, k), 0)
67+
B = zeros(Int, k)
8068
flags = 3
8169
end
8270
elseif k <= fs
83-
C = IntVector(SA, n + fs - k)
71+
C = @view SA[n+fs-k+1:end]
8472
if k <= fs - k
85-
B = IntVector(SA, n + fs - 2k)
73+
B = @view SA[n+fs-2k+1:end]
8674
flags = 0
8775
elseif k <= 1024
88-
B = IntVector(zeros(Int, k), 0)
76+
B = zeros(Int, k)
8977
flags = 2
9078
else
9179
B = C
9280
flags = 8
9381
end
9482
else
95-
C = B = IntVector(zeros(Int, k), 0)
83+
C = B = zeros(Int, k)
9684
flags = 4 | 8
9785
end
9886
# stage 1
@@ -156,7 +144,7 @@ function sais(
156144
j -= 1
157145
end
158146
end
159-
RA = IntVector(SA, m + newfs)
147+
RA = @view SA[m+newfs+1:end]
160148
sais(RA, SA, newfs, m, name, false)
161149

162150
i = n
@@ -183,10 +171,10 @@ function sais(
183171
SA[i] = SA[m + SA[i] + 1]
184172
end
185173
if flags & 4 != 0
186-
C = B = IntVector(zeros(Int, k), 0)
174+
C = B = zeros(Int, k)
187175
end
188176
if flags & 2 != 0
189-
B = IntVector(zeros(Int, k), 0)
177+
B = zeros(Int, k)
190178
end
191179
end
192180
# stage 3
@@ -231,8 +219,8 @@ end
231219
function LMSsort(
232220
T::AbstractVector{<:Integer},
233221
SA::IndexVector,
234-
C::IntVector,
235-
B::IntVector,
222+
C::AbstractVector{<:Integer},
223+
B::AbstractVector{<:Integer},
236224
n::Int,
237225
k::Int,
238226
)
@@ -345,8 +333,8 @@ end
345333
function induceSA(
346334
T::AbstractVector{<:Integer},
347335
SA::IndexVector,
348-
C::IntVector,
349-
B::IntVector,
336+
C::AbstractVector{<:Integer},
337+
B::AbstractVector{<:Integer},
350338
n::Int,
351339
k::Int,
352340
)
@@ -395,8 +383,8 @@ end
395383
function computeBWT(
396384
T::AbstractVector{<:Integer},
397385
SA::IndexVector,
398-
C::IntVector,
399-
B::IntVector,
386+
C::AbstractVector{<:Integer},
387+
B::AbstractVector{<:Integer},
400388
n::Int,
401389
k::Int,
402390
)

test/runtests.jl

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,3 +184,12 @@ end
184184
suffixes = [utext[i:end] for i in sa]
185185
@test issorted(suffixes)
186186
end
187+
188+
@testset "Test memory consumption" begin
189+
# On a random UInt8 string, with UInt32 indices, SA-IS should allocate ~4n bytes.
190+
N = 10 * 1024^2 # 10 MiB
191+
s = rand(UInt8, N)
192+
# Avoid allocations from compilation
193+
suffixsort(s)
194+
@test (@allocated suffixsort(s)) < 4N * 1.05
195+
end

0 commit comments

Comments
 (0)