Skip to content

Commit 9c6f0a5

Browse files
authored
Add a groupby function (#591)
* add groupby * ambiguity fixes * update test * fix groupby more
1 parent 27c6d0e commit 9c6f0a5

File tree

21 files changed

+786
-53
lines changed

21 files changed

+786
-53
lines changed

Project.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ version = "0.25.8"
77
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
88
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
99
ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
10+
DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
1011
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
1112
Extents = "411431e0-e8b7-467b-b5e0-f676ba4f2910"
1213
Interfaces = "85a1e053-f937-4924-92a5-1367d23b7b87"
@@ -38,6 +39,7 @@ ColorTypes = "0.11"
3839
Combinatorics = "1"
3940
ConstructionBase = "1"
4041
CoordinateTransformations = "0.6"
42+
DataAPI = "1"
4143
DataFrames = "1"
4244
Dates = "1"
4345
Distributions = "0.25"

docs/Project.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,5 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
66
DimensionalData = "0703355e-b756-11e9-17c0-8b28908087d0"
77
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
88
DocumenterVitepress = "4710194d-e776-4893-9690-8d956a29c365"
9+
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
10+
Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"

docs/crash/course/groupby.jl

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
using DimensionalData
2+
using Dates
3+
using Statistics
4+
const DD = DimensionalData
5+
6+
# # Basics: DateTime operations we can use for grouping
7+
8+
# First let's look at the kinds of functions that can be used to group `DateTime`.
9+
# Other types will follow the same principles, but are usually simpler.
10+
11+
# Create a demo `DateTime` range
12+
tempo = range(DateTime(2000), step=Hour(1), length=365*24*2)
13+
14+
# Now let's see how some common functions work.
15+
16+
# The `hour` function will transform values to hour of the day - the integers `0:23`
17+
18+
# ### hour
19+
hour.(tempo)
20+
21+
# These do similar things with other time periods
22+
23+
# ### dayofweek
24+
dayofweek.(tempo)
25+
26+
# ### month
27+
month.(tempo)
28+
29+
# ### dayofyear
30+
dayofyear.(tempo)
31+
32+
# ## Tuple grouping
33+
34+
# Some functions return a tuple - we can also use tuples for grouping.
35+
# Tuples are sorted by comparing their values from left to right.
36+
37+
# ### yearmonth
38+
yearmonth.(tempo)
39+
40+
# We can create our own functions that return tuples
41+
yearday(x) = year(x), dayofyear(x)
42+
43+
yearhour(x) = year(x), hour(x)
44+
45+
# And you can probably guess what they do:
46+
yearday.(tempo)
47+
48+
# All of these functions can be used in `groupby` on `DateTime` objects.
49+
50+
# # Practical example: grouping by season
51+
52+
# ### TODOS: We will need several functions.
53+
54+
# # groupby operations
55+
# Here we use the same time functions from above
56+
57+
ds = rand(X(1:0.01:2), Ti(tempo))
58+
59+
# ## select by month, days, years and seasons
60+
# ### TODO, how do we select month 1 or 2, and even a group of them, i.e. [1,3,5]? Same for days, years and seasons.
61+
mean.(groupby(ds, Ti=>Bins(month, 1:2)))
62+
mean.(groupby(ds, Ti=>Bins(month, [1, 3, 5])))
63+
mean.(groupby(ds, Ti => season(; start=December)))
64+
mean.(groupby(ds, Ti => Bins(dayofyear, intervals(1:8:370))))
65+
mean.(groupby(ds, Ti => Bins(yearday, [[1,2,3], [4,5,6]], labels=x -> join(string.(x), ','))))
66+
mean.(groupby(ds, Ti => week))
67+
mean.(groupby(ds, Ti => hours(12; start=6, labels=x -> 6 in x ? :night : :day)))
68+
mean.(groupby(ds, Ti => dims(ds, Ti)))
69+
70+
# ### TODO, we need a new function that can return DJF (Dec-Jan-Feb), MAM (Mar-Apr-May)... etc.
71+
# THIS IS HARD. We need a succinct way to select around the end-start of the year.
72+
73+
# is combining month from different years
74+
mean.(groupby(ds, Ti=>month))
75+
76+
# Use three-month bins. The 13 is the open side of the last interval.
77+
mean.(groupby(ds, Ti=>Bins(yearmonth, intervals(1:3:12))))
78+
79+
mean.(groupby(ds, Ti=>Bins(month, 4))) # is combining month from different years
80+
81+
#
82+
mean.(groupby(ds, Ti=>year))
83+
84+
#
85+
mean.(groupby(ds, Ti=>yearmonth))
86+
87+
#
88+
mean.(groupby(ds, Ti=>hour))
89+
90+
#
91+
mean.(groupby(ds, Ti=>Dates.hour12))
92+
93+
# ### TODO. How could we incorporate resample? Let's say we have hourly resolution and want to resample every 3, 6, 12... hours?
94+
mean.(groupby(ds, Ti=>Bins(yearhour, intervals(1:3:24)))) # it will combine the same day from different year.
95+
96+
mean.(groupby(ds, Ti=>dayofyear)) # it will combine the same day from different year.
97+
98+
#
99+
mean.(groupby(ds, Ti=>yearmonthday)) # this does a daily mean aggregation.
100+
101+
#
102+
mean.(groupby(ds, Ti=>yearmonth)) # this does a monthly mean aggregation
103+
104+
#
105+
mean.(groupby(ds, Ti=>yearday)) # this does a daily mean aggregation
106+
107+
mean.(groupby(ds, Ti=>Bins(yearhour, 12))) # this does a daily mean aggregation
108+
109+
# ### TODO. Similar to the hourly resample, how do we do it for more than 1 day, let's say 8daily?
110+
mean.(groupby(ds, Ti=>Bins(dayofyear, map(x -> x:x+7, 1:8:370))))
111+
112+
# ### TODO: Group by Dims. This should include the rasters input sampling.
113+
mean.(groupby(ds, dims(ds, Ti)))
114+
115+
# ## Apply custom function (i.e. normalization) to grouped output.

src/DimensionalData.jl

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,16 @@ using Dates,
1717
using Base.Broadcast: Broadcasted, BroadcastStyle, DefaultArrayStyle, AbstractArrayStyle,
1818
Unknown
1919

20-
using Base: tail, OneTo, @propagate_inbounds
20+
using Base: tail, OneTo, Callable, @propagate_inbounds
2121

2222
# Ecosystem
2323
import Adapt,
2424
ArrayInterface,
2525
ConstructionBase,
26+
DataAPI,
2627
Extents,
2728
Interfaces,
29+
IntervalSets,
2830
InvertedIndices,
2931
IteratorInterfaceExtensions,
3032
RecipesBase,
@@ -34,23 +36,27 @@ import Adapt,
3436

3537
using RecipesBase: @recipe
3638

39+
# using IntervalSets: .., Interval
40+
3741
include("Dimensions/Dimensions.jl")
3842

3943
using .Dimensions
4044
using .Dimensions.LookupArrays
4145
using .Dimensions: StandardIndices, DimOrDimType, DimTuple, DimTupleOrEmpty, DimType, AllDims
4246
import .LookupArrays: metadata, set, _set, rebuild, basetypeof,
43-
order, span, sampling, locus, val, index, bounds, intervalbounds,
47+
order, span, sampling, locus, val, index, bounds, intervalbounds,
4448
hasselection, units, SelectorOrInterval
4549
import .Dimensions: dims, refdims, name, lookup, dimstride, kwdims, hasdim, label, _astuple
4650

51+
import DataAPI.groupby
52+
4753
export LookupArrays, Dimensions
4854

4955
# Dimension
5056
export X, Y, Z, Ti, Dim, Coord
5157

5258
# Selector
53-
export At, Between, Touches, Contains, Near, Where, All, .., Not
59+
export At, Between, Touches, Contains, Near, Where, All, .., Not, Bins
5460

5561
export AbstractDimArray, DimArray
5662

@@ -71,6 +77,8 @@ export dimnum, hasdim, hasselection, otherdims
7177
# utils
7278
export set, rebuild, reorder, modify, broadcast_dims, broadcast_dims!, mergedims, unmergedims
7379

80+
export groupby, season, months, hours, yeardays, monthdays, intervals, ranges
81+
7482
const DD = DimensionalData
7583

7684
# Common
@@ -96,6 +104,7 @@ include("tables.jl")
96104
include("plotrecipes.jl")
97105
include("utils.jl")
98106
include("set.jl")
107+
include("groupby.jl")
99108
include("precompile.jl")
100109
include("interface_tests.jl")
101110

src/Dimensions/Dimensions.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ using DimensionalData.Dimensions
1313
"""
1414
module Dimensions
1515

16-
import Adapt, ConstructionBase, Extents
16+
import Adapt, ConstructionBase, Extents, IntervalSets
1717
using Dates
1818

1919
include("../LookupArrays/LookupArrays.jl")

src/Dimensions/format.jl

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,12 @@ function format(dims::Tuple{<:Pair,Vararg{Pair}}, A::AbstractArray)
2626
end
2727
return format(dims, A)
2828
end
29-
format(dims::Tuple{Vararg{Any,N}}, A::AbstractArray{<:Any,N}) where N =
30-
format(dims, axes(A))
29+
format(dims::Tuple{Vararg{Any,N}}, A::AbstractArray{<:Any,N}) where N = format(dims, axes(A))
3130
@noinline format(dims::Tuple{Vararg{Any,M}}, A::AbstractArray{<:Any,N}) where {N,M} =
3231
throw(DimensionMismatch("Array A has $N axes, while the number of dims is $M: $(map(basetypeof, dims))"))
3332
format(dims::Tuple{Vararg{Any,N}}, axes::Tuple{Vararg{Any,N}}) where N = map(_format, dims, axes)
33+
format(d::Dimension{<:AbstractArray}) = _format(d, axes(val(d), 1))
34+
format(d::Dimension, axis::AbstractRange) = _format(d, axis)
3435

3536
_format(dimname::Symbol, axis::AbstractRange) = Dim{dimname}(NoLookup(axes(axis, 1)))
3637
_format(::Type{D}, axis::AbstractRange) where D<:Dimension = D(NoLookup(axes(axis, 1)))
@@ -110,8 +111,13 @@ _format(span::Irregular{<:Tuple}, D, index) = span
110111
_format(span::Explicit, D, index) = span
111112
# Sampling
112113
_format(sampling::AutoSampling, span::Span, D::Type, index) = Points()
114+
_format(::AutoSampling, ::Span, D::Type, ::AbstractArray{<:IntervalSets.Interval}) =
115+
Intervals(Start())
113116
_format(sampling::AutoSampling, span::Explicit, D::Type, index) =
114117
Intervals(_format(locus(sampling), D, index))
118+
# For ambiguity, not likely to happen in practice
119+
# Ambiguity-resolving method for `AutoSampling` + `Explicit` span on a lookup of
# `IntervalSets.Interval`s. The `sampling` and `index` arguments must be named
# because the body reads them; leaving them anonymous makes any call that
# dispatches here throw `UndefVarError`.
_format(sampling::AutoSampling, ::Explicit, D::Type, index::AbstractArray{<:IntervalSets.Interval}) =
    Intervals(_format(locus(sampling), D, index))
115121
_format(sampling::Points, span::Span, D::Type, index) = sampling
116122
_format(sampling::Points, span::Explicit, D::Type, index) = _explicitpoints_error()
117123
_format(sampling::Intervals, span::Span, D::Type, index) =

src/Dimensions/primitives.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ julia> dimnum(A, Y)
203203
```
204204
"""
205205
@inline function dimnum(x, q1, query...)
206-
all(hasdim(x, q1, query...)) || _extradimserror()
206+
all(hasdim(x, q1, query...)) || _extradimserror(otherdims(x, (q1, query)))
207207
_call_primitive(_dimnum, MaybeFirst(), x, q1, query...)
208208
end
209209
@inline dimnum(x, query::Function) =
@@ -757,5 +757,5 @@ _typemsg(a, b) = "Lookups do not all have the same type: $(order(a)), $(order(b)
757757
@noinline _valerror(a, b) = throw(DimensionMismatch(_valmsg(a, b)))
758758
@noinline _ordererror(a, b) = throw(DimensionMismatch(_ordermsg(a, b)))
759759
@noinline _metadataerror(a, b) = throw(DimensionMismatch(_metadatamsg(a, b)))
760-
@noinline _extradimserror(args...) = throw(ArgumentError(_extradimsmsg(args)))
760+
@noinline _extradimserror(args) = throw(ArgumentError(_extradimsmsg(args)))
761761
@noinline _dimsnotdefinederror() = throw(ArgumentError("Object does not define a `dims` method"))

src/Dimensions/show.jl

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,27 @@ function dimcolors(i)
1313
colors[c]
1414
end
1515

16-
function Base.show(io::IO, mime::MIME"text/plain", dims::DimTuple)
16+
function show_dims(io::IO, mime::MIME"text/plain", dims::DimTuple;
17+
colors=map(x -> get(io, :dimcolo, dimcolors(x)), ntuple(identity, length(dims)))
18+
)
1719
ctx = IOContext(io, :compact => true)
1820
inset = get(io, :inset, "")
1921
print(io, inset)
2022
if all(map(d -> !(parent(d) isa AbstractArray) || (parent(d) isa NoLookup), dims))
21-
dc = get(ctx, :dimcolor, dimcolors(1))
23+
dc = colors[1]
2224
printstyled(io, dimsymbols(1), ' '; color=dc)
2325
show(IOContext(ctx, :dimcolor => dc, :dimname_len => 0), mime, first(dims))
2426
foreach(enumerate(Base.tail(dims))) do (i, d)
2527
n = i + 1
2628
print(io, ", ")
27-
dc = get(ctx, :dimcolor, dimcolors(n))
29+
dc = colors[n]
2830
printstyled(io, dimsymbols(n), ' '; color=dc)
2931
show(IOContext(ctx, :dimcolor => dc, :dimname_len => 0), mime, d)
3032
end
3133
return 0
3234
else # Dims get a line each
3335
lines = 3
34-
dc = get(ctx, :dimcolor, dimcolors(1))
36+
dc = colors[1]
3537
printstyled(io, dimsymbols(1), ' '; color=dc)
3638
maxname = maximum(length string dim2key, dims)
3739
dim_ctx = IOContext(ctx, :dimcolor => dc, :dimname_len=> maxname)
@@ -41,14 +43,17 @@ function Base.show(io::IO, mime::MIME"text/plain", dims::DimTuple)
4143
lines += 1
4244
s = dimsymbols(n)
4345
print(io, ",\n", inset)
44-
dc = get(ctx, :dimcolor, dimcolors(n))
46+
dc = colors[n]
4547
printstyled(io, s, ' '; color=dc)
4648
dim_ctx = IOContext(ctx, :dimcolor => dc, :dimname_len => maxname)
4749
show(dim_ctx, mime, d)
4850
end
4951
return lines
5052
end
5153
end
54+
55+
Base.show(io::IO, mime::MIME"text/plain", dims::DimTuple) =
56+
show_dims(io, mime, dims)
5257
function Base.show(io::IO, mime::MIME"text/plain", dim::Dimension)
5358
get(io, :compact, false) && return show_compact(io, mime, dim)
5459
print_dimname(io, dim)
@@ -77,14 +82,14 @@ end
7782
dimcolor(io) = get(io, :dimcolor, dimcolors(1))
7883

7984
# print dims with description string and inset
80-
function print_dims(io::IO, mime, dims::Tuple{})
85+
function print_dims(io::IO, mime, dims::Tuple{}; kw...)
8186
@nospecialize io mime dims
8287
print(io, ": ")
8388
return 0
8489
end
85-
function print_dims(io::IO, mime, dims::Tuple)
90+
function print_dims(io::IO, mime, dims::Tuple; kw...)
8691
ctx = IOContext(io, :inset => " ")
87-
return show(ctx, mime, dims)
92+
return show_dims(ctx, mime, dims; kw...)
8893
end
8994
# print refdims with description string and inset
9095
function print_refdims(io::IO, mime, refdims::Tuple)

src/LookupArrays/lookup_arrays.jl

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,11 @@ ordered_lastindex(o::ForwardOrdered, l::LookupArray) = lastindex(parent(l))
4646
ordered_lastindex(o::ReverseOrdered, l::LookupArray) = firstindex(parent(l))
4747
ordered_lastindex(o::Unordered, l::LookupArray) = lastindex(parent(l))
4848

49-
function Base.searchsortedfirst(lookup::LookupArray, val; lt=<)
50-
searchsortedfirst(parent(lookup), unwrap(val); order=ordering(order(lookup)), lt=lt)
49+
function Base.searchsortedfirst(lookup::LookupArray, val; lt=<, kw...)
50+
searchsortedfirst(parent(lookup), unwrap(val); order=ordering(order(lookup)), lt=lt, kw...)
5151
end
52-
function Base.searchsortedlast(lookup::LookupArray, val; lt=<)
53-
searchsortedlast(parent(lookup), unwrap(val); order=ordering(order(lookup)), lt=lt)
52+
function Base.searchsortedlast(lookup::LookupArray, val; lt=<, kw...)
53+
searchsortedlast(parent(lookup), unwrap(val); order=ordering(order(lookup)), lt=lt, kw...)
5454
end
5555

5656
function Adapt.adapt_structure(to, l::LookupArray)
@@ -593,7 +593,8 @@ Base.:(==)(l1::Transformed, l2::Transformed) = typeof(l1) == typeof(l2) && f(l1)
593593

594594
intervalbounds(l::LookupArray, args...) = _intervalbounds_no_interval_error()
595595
intervalbounds(l::AbstractSampled, args...) = intervalbounds(span(l), sampling(l), l, args...)
596-
intervalbounds(span::Span, ::Points, l::LookupArray, args...) = _intervalbounds_no_interval_error()
596+
intervalbounds(span::Span, ::Points, ls::LookupArray) = map(l -> (l, l), ls)
597+
intervalbounds(span::Span, ::Points, ls::LookupArray, i::Int) = ls[i], ls[i]
597598
intervalbounds(span::Span, sampling::Intervals, l::LookupArray, i::Int) =
598599
intervalbounds(order(l), locus(sampling), span, l, i)
599600
function intervalbounds(order::ForwardOrdered, locus::Start, span::Span, l::LookupArray, i::Int)

src/LookupArrays/metadata.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ objects to [`set`](@ref) without ambiguity about where to put them.
1313
"""
1414
abstract type AbstractMetadata{X,T} end
1515

16-
const _MetadataContents =Union{AbstractDict,NamedTuple}
16+
const _MetadataContents = Union{AbstractDict,NamedTuple}
1717
const AllMetadata = Union{AbstractMetadata,AbstractDict}
1818

1919
Base.get(m::AbstractMetadata, args...) = get(val(m), args...)

0 commit comments

Comments
 (0)