Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@
* Make `transform!` on `SubDataFrame` faster
([#3070](https://github.com/JuliaData/DataFrames.jl/pull/3070))

## Integration changes

* Support `Tables.subset` and move `ByRow` definition to Tables.jl
([#3158](https://github.com/JuliaData/DataFrames.jl/pull/3158))

# DataFrames.jl v1.3.4 Patch Release Notes

## Bug fixes
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ Reexport = "0.1, 0.2, 1"
ShiftedArrays = "1"
SortingAlgorithms = "0.1, 0.2, 0.3, 1"
TableTraits = "0.4, 1"
Tables = "1.2"
Tables = "1.8"
Unitful = "1"
julia = "1"

Expand Down
1 change: 1 addition & 0 deletions src/DataFrames.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import LinearAlgebra: norm
using Markdown
using PrettyTables
using Random
using Tables: ByRow

import DataAPI,
DataAPI.allcombinations,
Expand Down
27 changes: 0 additions & 27 deletions src/abstractdataframe/selection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -309,33 +309,6 @@ function broadcast_pair(df::AbstractDataFrame, @nospecialize(p::AbstractVecOrMat
end
end

"""
ByRow

A type used for selection operations to signal that the wrapped function should
be applied to each element (row) of the selection.

The wrapped function is called exactly once for each element.
This differs from `map` and `broadcast`, which assume for some types of
source vectors (e.g. `SparseVector`) that the wrapped function is pure,
allowing them to call the function only once for multiple equal values.
When using such types, for maximal performance with pure functions
which are relatively costly, use `x -> map(f, x)` instead of `ByRow(f)`.

Note that `ByRow` always collects values returned by `fun` in a vector.
"""
struct ByRow{T} <: Function
    fun::T
end

# Column-wise call: run the wrapped function over the passed vectors in
# lockstep. `map` is reached through `invoke` with a plain `AbstractVector`
# signature so that the generic method is used and the function is called
# exactly once for each element.
function (f::ByRow)(cols::AbstractVector...)
    sig = Tuple{typeof(f.fun), ntuple(_ -> AbstractVector, length(cols))...}
    return invoke(map, sig, f.fun, cols...)
end

# Table-wise call: run the wrapped function on every element yielded by
# `Tables.namedtupleiterator(table)` and gather the results in a vector.
function (f::ByRow)(table::NamedTuple)
    rows = Tables.namedtupleiterator(table)
    return [f.fun(row) for row in rows]
end

# extend `funname` (defined in other/utils.jl): for a `ByRow` wrapper the
# call is delegated to the function it wraps
function funname(row::ByRow)
    return funname(row.fun)
end

Expand Down
9 changes: 9 additions & 0 deletions src/other/tables.jl
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,12 @@ IteratorInterfaceExtensions.getiterator(df::AbstractDataFrame) =
Tables.datavaluerows(Tables.columntable(df))
# both trait queries return `true` for any `AbstractDataFrame`
function IteratorInterfaceExtensions.isiterable(::AbstractDataFrame)
    return true
end
function TableTraits.isiterabletable(::AbstractDataFrame)
    return true
end

function Tables.subset(df::AbstractDataFrame, inds; view::Union{Bool, Nothing}=nothing)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we use @inline to ensure the return type is inferred?

Copy link
Member Author

@bkamins bkamins Sep 17, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added. It should not be a problem - the function is small (on newer Julia constant propagation also should be applied without inlining AFAICT, x-ref: JuliaLang/julia#43852).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added @inferred where applicable

res = view === true ? DataFrames.view(df, inds, :) : df[inds, :]
if res isa DataFrameRow && view == false
return copy(res)
else
return res
end
end
44 changes: 44 additions & 0 deletions test/tables.jl
Original file line number Diff line number Diff line change
Expand Up @@ -323,4 +323,48 @@ end
@test DataFrame === @inferred Tables.materializer(DataFrames.DataFrameColumns)
end

# Check `Tables.subset` on a `DataFrame` for every kind of row selector used
# below (`:`, integer vector, Boolean mask, single integer), each combined
# with the three states of the `view` keyword (unset, `false`, `true`).
@testset "Tables.subset" begin
    df = DataFrame(a=1:3, b=4:6)

    # all rows
    res = Tables.subset(df, :)
    @test res isa DataFrame
    @test res == DataFrame(a=1:3, b=4:6)
    res = Tables.subset(df, :, view=false)
    @test res isa DataFrame
    @test res == DataFrame(a=1:3, b=4:6)
    res = Tables.subset(df, :, view=true)
    @test res isa SubDataFrame
    @test res == DataFrame(a=1:3, b=4:6)

    # integer vector (also reorders the rows)
    res = Tables.subset(df, [3, 1])
    @test res isa DataFrame
    @test res == DataFrame(a=[3, 1], b=[6, 4])
    res = Tables.subset(df, [3, 1], view=false)
    @test res isa DataFrame
    @test res == DataFrame(a=[3, 1], b=[6, 4])
    res = Tables.subset(df, [3, 1], view=true)
    @test res isa SubDataFrame
    @test res == DataFrame(a=[3, 1], b=[6, 4])

    # Boolean mask; the mask is used for all three `view` states so that this
    # selector kind is covered with an explicit `view` as well
    res = Tables.subset(df, [true, false, true])
    @test res isa DataFrame
    @test res == DataFrame(a=[1, 3], b=[4, 6])
    res = Tables.subset(df, [true, false, true], view=false)
    @test res isa DataFrame
    @test res == DataFrame(a=[1, 3], b=[4, 6])
    res = Tables.subset(df, [true, false, true], view=true)
    @test res isa SubDataFrame
    @test res == DataFrame(a=[1, 3], b=[4, 6])

    # single integer: a `DataFrameRow` unless `view=false`, in which case a
    # `NamedTuple` is expected
    res = Tables.subset(df, 2)
    @test res isa DataFrameRow
    @test res == DataFrame(a=2, b=5)[1, :]
    res = Tables.subset(df, 2, view=false)
    @test res isa NamedTuple{(:a, :b), Tuple{Int, Int}}
    @test res == (a=2, b=5)
    res = Tables.subset(df, 2, view=true)
    @test res isa DataFrameRow
    @test res == DataFrame(a=2, b=5)[1, :]
end

end # module