diff --git a/src/ZArray.jl b/src/ZArray.jl index 9d9d915..38715c0 100644 --- a/src/ZArray.jl +++ b/src/ZArray.jl @@ -1,6 +1,6 @@ import JSON import OffsetArrays: OffsetArray -import DiskArrays: AbstractDiskArray +import DiskArrays: AbstractDiskArray, findchunk, max_chunksize import DiskArrays """ @@ -122,8 +122,7 @@ trans_ind(r, bs) For a given index and blocksize determines which chunks of the Zarray will have to be accessed. """ -trans_ind(r::AbstractUnitRange, bs) = fld1(first(r),bs):fld1(last(r),bs) -trans_ind(r::Integer, bs) = fld1(r,bs) +trans_ind(r, bs) = findchunk(bs,r) function boundint(r1, s2, o2) r2 = range(o2+1,length=s2) @@ -134,14 +133,14 @@ end function getchunkarray(z::ZArray{>:Missing}) # temporary workaround to use strings as data values - inner = fill(z.metadata.fill_value, z.metadata.chunks) + inner = fill(z.metadata.fill_value, max_chunksize.(z.metadata.chunks)) a = SenMissArray(inner,z.metadata.fill_value) end _zero(T) = zero(T) _zero(T::Type{<:MaxLengthString}) = T("") _zero(T::Type{ASCIIChar}) = ASCIIChar(0) _zero(::Type{<:Vector{T}}) where T = T[] -getchunkarray(z::ZArray) = fill(_zero(eltype(z)), z.metadata.chunks) +getchunkarray(z::ZArray) = fill(_zero(eltype(z)), max_chunksize.(z.metadata.chunks)) maybeinner(a::Array) = a maybeinner(a::SenMissArray) = a.x @@ -247,7 +246,7 @@ end DiskArrays.readblock!(a::ZArray,aout,i::AbstractUnitRange...) = readblock!(aout,a,CartesianIndices(i)) DiskArrays.writeblock!(a::ZArray,v,i::AbstractUnitRange...) = writeblock!(v,a,CartesianIndices(i)) DiskArrays.haschunks(::ZArray) = DiskArrays.Chunked() -DiskArrays.eachchunk(a::ZArray) = DiskArrays.GridChunks(a,a.metadata.chunks) +DiskArrays.eachchunk(a::ZArray) = DiskArrays.GridChunks(a.metadata.chunks...) 
""" uncompress_raw!(a::DenseArray{T},z::ZArray{T,N},i::CartesianIndex{N}) @@ -278,7 +277,7 @@ function uncompress_to_output!(aout,output_base_offsets,z,chunk_compressed,curre end function compress_raw(a,z) - length(a) == prod(z.metadata.chunks) || throw(DimensionMismatch("Array size does not equal chunk size")) + #length(a) == prod(z.metadata.chunks) || throw(DimensionMismatch("Array size does not equal chunk size")) if !all(isequal(z.metadata.fill_value),a) dtemp = UInt8[] zcompress!(dtemp,a,z.metadata.compressor, z.metadata.filters) @@ -383,7 +382,7 @@ chunkindices(z::ZArray) Returns the Cartesian Indices of the chunks of a given ZArray """ -chunkindices(z::ZArray) = CartesianIndices(map((s, c) -> 1:ceil(Int, s/c), z.metadata.shape[], z.metadata.chunks)) +chunkindices(z::ZArray) = CartesianIndices(length.(z.metadata.chunks)) """ zzeros(T, dims...; kwargs... ) @@ -392,7 +391,7 @@ Creates a zarr array and initializes all values with zero. Accepts the same keyw """ function zzeros(T,dims...;kwargs...) z = zcreate(T,dims...;kwargs...) - as = zeros(T, z.metadata.chunks...) + as = zeros(T, max_chunksize.(z.metadata.chunks)...) data_encoded = compress_raw(as,z) p = z.path for i in chunkindices(z) @@ -459,9 +458,9 @@ end function prune_oob_chunks(s::AbstractStore,path,oldsize, newsize, chunks) dimstoshorten = findall(map(<,newsize, oldsize)) + allchunkranges = Base.OneTo.(length.(chunks)) for idim in dimstoshorten - delrange = (fld1(newsize[idim],chunks[idim])+1):(fld1(oldsize[idim],chunks[idim])) - allchunkranges = map(i->1:fld1(oldsize[i],chunks[i]),1:length(oldsize)) + delrange = (findchunk(chunks[idim],newsize[idim])+1):findchunk(chunks[idim],oldsize[idim]) r = (allchunkranges[1:idim-1]..., delrange, allchunkranges[idim+1:end]...) 
for cI in CartesianIndices(r) delete!(s,path,cI) diff --git a/src/metadata.jl b/src/metadata.jl index c7660e7..fa14c08 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -1,5 +1,6 @@ import Dates: Date, DateTime - +using DiskArrays: RegularChunks, IrregularChunks +const SomeChunk = Union{RegularChunks, IrregularChunks} """NumPy array protocol type string (typestr) format A string providing the basic type of the homogeneous array. The basic string format @@ -114,6 +115,9 @@ function typestr(s::AbstractString, filterlist=nothing) end end +arraysize_from_chunksize(cs::RegularChunks)=cs.s +arraysize_from_chunksize(cs::IrregularChunks)=last(cs.offsets) + """Metadata configuration of the stored array Each array requires essential configuration metadata to be stored, enabling correct @@ -125,7 +129,7 @@ https://zarr.readthedocs.io/en/stable/spec/v2.html#metadata struct Metadata{T, N, C, F} zarr_format::Int shape::Base.RefValue{NTuple{N, Int}} - chunks::NTuple{N, Int} + chunks::NTuple{N, SomeChunk} dtype::String # structured data types not yet supported compressor::C fill_value::Union{T, Nothing} @@ -136,9 +140,17 @@ struct Metadata{T, N, C, F} zarr_format == 2 || throw(ArgumentError("Zarr.jl currently only support v2 of the protocol")) #Do some sanity checks to make sure we have a sane array any(<(0), shape) && throw(ArgumentError("Size must be positive")) - any(<(1), chunks) && throw(ArgumentError("Chunk size must be >= 1 along each dimension")) + chunks = map(shape,chunks) do s,c + if isa(c,Integer) + c=RegularChunks(c,0,s) + elseif isa(c,AbstractVector{<:Integer}) + c=IrregularChunks(chunksizes=c) + end + arraysize_from_chunksize(c) < s && throw(ArgumentError("Size of chunks must be larger or equal the size of the array")) + c + end order === 'C' || throw(ArgumentError("Currently only 'C' storage order is supported")) - new{T2, N, C, F}(zarr_format, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor,fill_value, order, filters) + new{T2, N, C, F}(zarr_format, 
Base.RefValue{NTuple{N,Int}}(shape), (chunks...,), dtype, compressor,fill_value, order, filters) end end @@ -157,7 +169,7 @@ end "Construct Metadata based on your data" -function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int}; +function Metadata(A::AbstractArray{T, N}, chunks::Tuple; zarr_format::Integer=2, compressor::C=BloscCompressor(), fill_value::Union{T, Nothing}=nothing, @@ -196,12 +208,15 @@ function Metadata(d::AbstractDict, fill_as_missing) fv = fill_value_decoding(d["fill_value"], T) + chunks = map(d["chunks"],d["shape"]) do c,s + isa(c,Integer) ? RegularChunks(c,0,s) : IrregularChunks(chunksizes=c) + end TU = (fv === nothing || !fill_as_missing) ? T : Union{T,Missing} Metadata{TU, N, C, F}( d["zarr_format"], NTuple{N, Int}(d["shape"]) |> reverse, - NTuple{N, Int}(d["chunks"]) |> reverse, + chunks |> reverse, d["dtype"], compressor, fv, @@ -210,12 +225,15 @@ function Metadata(d::AbstractDict, fill_as_missing) ) end +chunk_encoding(c::RegularChunks) = c.cs +chunk_encoding(c::IrregularChunks) = length.(c) + "Describes how to lower Metadata to JSON, used in json(::Metadata)" function JSON.lower(md::Metadata) Dict{String, Any}( "zarr_format" => md.zarr_format, "shape" => md.shape[] |> reverse, - "chunks" => md.chunks |> reverse, + "chunks" => chunk_encoding.(md.chunks) |> reverse, "dtype" => md.dtype, "compressor" => md.compressor, "fill_value" => fill_value_encoding(md.fill_value), diff --git a/test/runtests.jl b/test/runtests.jl index 4c996e3..32071c2 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,6 +4,7 @@ using JSON using Pkg using PyCall using Dates +using Zarr: RegularChunks, IrregularChunks macro test_py(ex) quote @@ -25,7 +26,7 @@ end @test eltype(z.storage.a["0.0"]) === UInt8 @test z.metadata.shape[] === (2, 3) @test z.metadata.order === 'C' - @test z.metadata.chunks === (2, 3) + @test z.metadata.chunks === (RegularChunks(2,0,2), RegularChunks(3,0,3)) @test z.metadata.fill_value === nothing @test z.metadata.compressor isa 
Zarr.BloscCompressor @test z.metadata.compressor.blocksize === 0 @@ -122,7 +123,7 @@ end @test metadata isa Zarr.Metadata @test metadata.zarr_format === 2 @test metadata.shape[] === size(A) - @test metadata.chunks === chunks + @test metadata.chunks === (RegularChunks(5,0,30),RegularChunks(10,0,20)) @test metadata.dtype === "