From acc84d2134f6ab4623fbe09914449d5d75faecb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 26 Sep 2020 19:11:54 +0200 Subject: [PATCH 01/21] initial implementation for AbstractDataFrame --- src/abstractdataframe/selection.jl | 300 +++++++++++++++++++++-------- 1 file changed, 219 insertions(+), 81 deletions(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 0469b87fb3..57a3fe87b4 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -4,9 +4,10 @@ # normalize_selection function makes sure that whatever input format of idx is it # will end up in one of four canonical forms # 1) AbstractVector{Int} -# 2) Pair{Int, <:Pair{<:Base.Callable, Symbol}} -# 3) Pair{AbstractVector{Int}, <:Pair{<:Base.Callable, Symbol}} -# 4) Pair{AsTable, <:Pair{<:Base.Callable, Symbol}} +# 2) Pair{Int, <:Pair{<:Base.Callable, <:Union{Symbol, Vector{Symbol}, Type{AsTable}}}} +# 3) Pair{AbstractVector{Int}, <:Pair{<:Base.Callable, <:Union{Symbol, AbstractVector{Symbol}, Type{AsTable}}}} +# 4) Pair{AsTable, <:Pair{<:Base.Callable, <:Union{Symbol, Vector{Symbol}, Type{AsTable}}}} +# 5) Function """ ByRow @@ -14,22 +15,14 @@ A type used for selection operations to signal that the wrapped function should be applied to each element (row) of the selection. -Note that `ByRow` always collects values returned by `fun` in a vector. Therefore, -to allow for future extensions, returning `NamedTuple` or `DataFrameRow` -from `fun` is currently disallowed. +Note that `ByRow` always collects values returned by `fun` in a vector. """ struct ByRow{T} <: Function fun::T end -_by_row_helper(x::Any) = x -_by_row_helper(x::Union{NamedTuple, DataFrameRow}) = - throw(ArgumentError("return value of type $(typeof(x)) " * - "is currently not allowed with ByRow.")) - -(f::ByRow)(cols::AbstractVector...) 
= _by_row_helper.(f.fun.(cols...)) -(f::ByRow)(table::NamedTuple) = - _by_row_helper.(f.fun.(Tables.namedtupleiterator(table))) +(f::ByRow)(cols::AbstractVector...) = f.fun.(cols...) +(f::ByRow)(table::NamedTuple) = f.fun.(Tables.namedtupleiterator(table)) # add a method to funname defined in other/utils.jl funname(row::ByRow) = funname(row.fun) @@ -45,6 +38,9 @@ normalize_selection(idx::AbstractIndex, sel, renamecols::Bool) = end end +normalize_selection(idx::AbstractIndex, sel::Function, renamecols::Bool) = sel +normalize_selection(idx::AbstractIndex, sel::Colon, renamecols::Bool) = idx[:] + normalize_selection(idx::AbstractIndex, sel::Pair{typeof(nrow), Symbol}, renamecols::Bool) = length(idx) == 0 ? (Int[] => (() -> 0) => last(sel)) : (1 => length => last(sel)) @@ -70,8 +66,13 @@ normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, <:AbstractStrin normalize_selection(idx, first(sel) => Symbol(last(sel)), renamecols::Bool) function normalize_selection(idx::AbstractIndex, - sel::Pair{<:Any,<:Pair{<:Base.Callable, Symbol}}, + sel::Pair{<:Any,<:Pair{<:Base.Callable, + <:Union{Symbol, AbstractString, DataType, + AbstractVector{Symbol}, AbstractVector{<:AbstractString}}}}, renamecols::Bool) + if last(last(sel)) isa DataType + last(last(sel)) === AsTable || throw(ArgumentError("Only DataType supported as target is AsTable")) + end if first(sel) isa AsTable rawc = first(sel).cols wanttable = true @@ -98,15 +99,17 @@ function normalize_selection(idx::AbstractIndex, throw(ArgumentError("at least one column must be passed to a " * "`ByRow` transformation function")) end - return (wanttable ? AsTable(c) : c) => last(sel) + ls = last(sel) + if ls isa AbstractString + r = Symbol(ls) + elseif ls isa AbstractVector{<:AbstractString} + r = Symbol.(ls) + else + r = ls + end + return (wanttable ? 
AsTable(c) : c) => r end -normalize_selection(idx::AbstractIndex, - sel::Pair{<:Any,<:Pair{<:Base.Callable,<:AbstractString}}, - renamecols::Bool) = - normalize_selection(idx, first(sel) => first(last(sel)) => Symbol(last(last(sel))), - renamecols::Bool) - function normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex,<:Base.Callable}, renamecols::Bool) c = idx[first(sel)] @@ -170,18 +173,27 @@ function normalize_selection(idx::AbstractIndex, return (wanttable ? AsTable(c) : c) => fun => newcol end -function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable}, - <:Pair{<:Base.Callable, Symbol}}, +function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{Int}, AsTable}, + <:Pair{<:Base.Callable, + <:Union{Symbol, AbstractVector{Symbol}, DataType}}}}, df::AbstractDataFrame, newdf::DataFrame, - transformed_cols::Dict{Symbol, Any}, copycols::Bool, + transformed_cols::Set{Symbol}, copycols::Bool, allow_resizing_newdf::Ref{Bool}) - col_idx, (fun, newname) = nc + if nc isa Function + col_idx, fun, newname = nothing, nc, AsTable + else + col_idx, (fun, newname) = nc + end + if newname isa DataType + newname === AsTable || throw(ArgumentError("Only DataType supported as target is AsTable")) + end # It is allowed to request a tranformation operation into a newname column # only once. This is ensured by the logic related to transformed_cols dictionaly # in _manipulate, therefore in select_transform! such a duplicate should not happen - @assert !hasproperty(newdf, newname) cdf = eachcol(df) - if col_idx isa Int + if col_idx === nothing + res = fun(df) + elseif col_idx isa Int res = fun(df[!, col_idx]) elseif col_idx isa AsTable res = fun(Tables.columntable(select(df, col_idx.cols, copycols=false))) @@ -190,11 +202,166 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable}, @assert col_idx isa AbstractVector{Int} res = fun(map(c -> cdf[c], col_idx)...) 
end - if res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix} - throw(ArgumentError("return value from function $fun " * - "of type $(typeof(res)) is currently not allowed.")) + + if (newname === AsTable || newname isa AbstractVector{Symbol}) && + !(res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix}) + if res isa AbstractVector && !isempty(res) + p = pairs.(res) + ex = extrema(length, p) + ex[1] == ex[2] || throw(ArgumentError("returned elements must have the same length")) + kp1 = keys(p[1]) + all(x -> keys(x) == kp1, p) || throw(ArgumentError("keys of the returned elements must be identical")) + res = DataFrame() + for (i, n) in enumerate(kp1) + res[!, Symbol(n)] = [x[i] for x in p] + end + else + res = Tables.columntable(res) + end end - if res isa AbstractVector + + if res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix} + if newname isa Symbol + throw(ArgumentError("Table returned while a single column return value was requested")) + end + if res isa AbstractMatrix + colnames = gennames(size(res, 2)) + else + colnames = propertynames(res) + end + if !(newname === AsTable) + if length(colnames) != length(newname) + throw(ArgumentError("Number of returned columns does not match the " * + "length of requested output")) + end + colnames = newname + end + isempty(colnames) && return # nothing to do + + if any(in(transformed_cols), colnames) + throw(ArgumentError("Duplicate column name returned")) + else + startlen = length(transformed_cols) + union!(transformed_cols, colnames) + @assert startlen + length(colnames) == length(transformed_cols) + end + if res isa AbstractDataFrame + lr = nrow(res) + # allow shortening to 0 rows + if allow_resizing_newdf[] && nrow(newdf) == 1 + newdfcols = _columns(newdf) + for (i, col) in enumerate(newdfcols) + newdfcols[i] = fill!(similar(col, lr), first(col)) + end + end + + # !allow_resizing_newdf[] && ncol(newdf) == 0 + # means that we use `select` or `transform` 
not `combine` + if !allow_resizing_newdf[] && ncol(newdf) == 0 && lr != nrow(df) + throw(ArgumentError("length $(lr) of vector returned from " * + "function $fun is different from number of rows " * + "$(nrow(df)) of the source data frame.")) + end + allow_resizing_newdf[] = false + @assert length(colnames) == ncol(res) + for (newname, v) in zip(colnames, eachcol(res)) + vpar = parent(v) + parent_cols = col_idx isa AsTable ? col_idx.cols : col_idx + if copycols && !(fun isa ByRow) && + (v isa SubArray || any(i -> vpar === parent(cdf[i]), parent_cols)) + newdf[!, newname] = copy(v) + else + newdf[!, newname] = v + end + end + elseif res isa AbstractMatrix + lr = size(res, 1) + # allow shortening to 0 rows + if allow_resizing_newdf[] && nrow(newdf) == 1 + newdfcols = _columns(newdf) + for (i, col) in enumerate(newdfcols) + newdfcols[i] = fill!(similar(col, lr), first(col)) + end + end + + # !allow_resizing_newdf[] && ncol(newdf) == 0 + # means that we use `select` or `transform` not `combine` + if !allow_resizing_newdf[] && ncol(newdf) == 0 && lr != nrow(df) + throw(ArgumentError("length $(lr) of vector returned from " * + "function $fun is different from number of rows " * + "$(nrow(df)) of the source data frame.")) + end + allow_resizing_newdf[] = false + @assert length(colnames) == size(res, 2) + for (i, newname) in enumerate(colnames) + newdf[!, newname] = res[:, i] + end + elseif res isa NamedTuple + if all(v -> v isa AbstractVector, x) + lr = length(res[1]) + # allow shortening to 0 rows + if allow_resizing_newdf[] && nrow(newdf) == 1 + newdfcols = _columns(newdf) + for (i, col) in enumerate(newdfcols) + newdfcols[i] = fill!(similar(col, lr), first(col)) + end + end + + # !allow_resizing_newdf[] && ncol(newdf) == 0 + # means that we use `select` or `transform` not `combine` + if !allow_resizing_newdf[] && ncol(newdf) == 0 && lr != nrow(df) + throw(ArgumentError("length $(lr) of vector returned from " * + "function $fun is different from number of rows " * + 
"$(nrow(df)) of the source data frame.")) + end + allow_resizing_newdf[] = false + @assert length(colnames) == length(res) + for (newname, v) in zip(colnames, res) + vpar = parent(v) + parent_cols = col_idx isa AsTable ? col_idx.cols : col_idx + if copycols && !(fun isa ByRow) && + (v isa SubArray || any(i -> vpar === parent(cdf[i]), parent_cols)) + newdf[!, newname] = copy(v) + else + newdf[!, newname] = v + end + end + elseif any(v -> v isa AbstractVector, x) + throw(ArgumentError("mixing single values and vectors in a named tuple is not allowed")) + else + if ncol(newdf) == 0 + # if allow_resizing_newdf[] is false we know this is select or transform + rows = allow_resizing_newdf[] ? 1 : nrow(df) + else + # allow squashing a scalar to 0 rows + rows = nrow(newdf) + end + @assert length(colnames) == length(res) + for (newname, v) in zip(colnames, res) + # note that newdf potentially can contain c in general + newdf[!, newname] = fill!(Tables.allocatecolumn(typeof(v), rows), v) + end + end + elseif res isa DataFrameRow + if ncol(newdf) == 0 + # if allow_resizing_newdf[] is false we know this is select or transform + rows = allow_resizing_newdf[] ? 
1 : nrow(df) + else + # allow squashing a scalar to 0 rows + rows = nrow(newdf) + end + @assert length(colnames) == length(res) + for (newname, v) in zip(colnames, res) + # note that newdf potentially can contain c in general + newdf[!, newname] = fill!(Tables.allocatecolumn(typeof(v), rows), v) + end + end + elseif res isa AbstractVector + if newname in transformed_cols + throw(ArgumentError("duplicate name of a transformed column")) + else + push!(transformed_cols, newname) + end # allow shortening to 0 rows if allow_resizing_newdf[] && nrow(newdf) == 1 newdfcols = _columns(newdf) @@ -220,6 +387,11 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable}, newdf[!, newname] = res end else + if newname in transformed_cols + throw(ArgumentError("duplicate name of a transformed column")) + else + push!(transformed_cols, newname) + end res_unwrap = res isa Union{AbstractArray{<:Any, 0}, Ref} ? res[] : res if ncol(newdf) == 0 # if allow_resizing_newdf[] is false we know this is select or transform @@ -231,9 +403,6 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable}, newdf[!, newname] = fill!(Tables.allocatecolumn(typeof(res_unwrap), rows), res_unwrap) end - # mark that column transformation was applied - # nothing is not possible otherwise as a value in this dict - transformed_cols[newname] = nothing end SELECT_ARG_RULES = @@ -642,27 +811,17 @@ function _manipulate(df::AbstractDataFrame, normalized_cs, copycols::Bool, keepr # │ 1 │ 0.841471 │ 3 │ # │ 2 │ 0.909297 │ 4 │ # - # we compute column :a immediately when we process `:` although it is specified - # later by `:a=>sin=>:a` because we know from `transformed_cols` variable that - # it will be computed later via a transformation - transformed_cols = Dict{Symbol, Any}() - for nc in normalized_cs - if nc isa Pair - newname = last(last(nc)) - @assert newname isa Symbol - if haskey(transformed_cols, newname) - throw(ArgumentError("duplicate target column name 
$newname passed")) - end - transformed_cols[newname] = nc - end - end + # transformed_cols keeps a set of columns that were generated via a transformation + # up to this point. Note that single column selection and column renaming are + # considered to be transformations + transformed_cols = Set{Symbol}() # we allow resizing newdf only if up to some point only scalars were put # in it. The moment we put any vector into newdf its number of rows becomes fixed # Also if keeprows is true then we make sure to produce nrow(df) rows so resizing # is not allowed allow_resizing_newdf = Ref(!keeprows) for nc in normalized_cs - if nc isa AbstractVector{Int} + if nc isa AbstractVector{Int} # only this case is NOT considered to be a transformation allunique(nc) || throw(ArgumentError("duplicate column names selected")) for i in nc newname = _names(df)[i] @@ -670,42 +829,21 @@ function _manipulate(df::AbstractDataFrame, normalized_cs, copycols::Bool, keepr # we allow duplicate column names with selections applied earlier # and ignore them for convinience, to allow for e.g.
select(df, :x1, :) if !hasproperty(newdf, newname) - if haskey(transformed_cols, newname) - # if newdf does not have a column newname - # but a column transformation was requested for this column - # then apply the transformation immediately - # in such a case nct may not be nothing, as if it were - # nothing then newname should be preasent in newdf already - nct = transformed_cols[newname] - @assert nct !== nothing - select_transform!(nct, df, newdf, transformed_cols, copycols, - allow_resizing_newdf) - else - # allow shortening to 0 rows - if allow_resizing_newdf[] && nrow(newdf) == 1 - newdfcols = _columns(newdf) - for (i, col) in enumerate(newdfcols) - newdfcols[i] = fill!(similar(col, nrow(df)), first(col)) - end + # allow shortening to 0 rows + if allow_resizing_newdf[] && nrow(newdf) == 1 + newdfcols = _columns(newdf) + for (i, col) in enumerate(newdfcols) + newdfcols[i] = fill!(similar(col, nrow(df)), first(col)) end - # here even if keeprows is true all is OK - newdf[!, newname] = copycols ? df[:, i] : df[!, i] - allow_resizing_newdf[] = false end + # here even if keeprows is true all is OK + newdf[!, newname] = copycols ? 
df[:, i] : df[!, i] + allow_resizing_newdf[] = false end end else - # nc is normalized so it has a form src_cols => fun => Symbol - newname = last(last(nc)) - if hasproperty(newdf, newname) - # it is possible that the transformation has already been applied - # via multiple column selection, like in select(df, :, :x1 => :y1) - # but then transformed_cols[newname] must be nothing - @assert transformed_cols[newname] === nothing - else - select_transform!(nc, df, newdf, transformed_cols, copycols, - allow_resizing_newdf) - end + select_transform!(nc, df, newdf, transformed_cols, copycols, + allow_resizing_newdf) end end return newdf From 7477075d47a6d7b8c0128a33a76155fd509c00ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 26 Sep 2020 19:51:36 +0200 Subject: [PATCH 02/21] minor fixes --- src/abstractdataframe/selection.jl | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 57a3fe87b4..fdecb20deb 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -266,7 +266,7 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{ @assert length(colnames) == ncol(res) for (newname, v) in zip(colnames, eachcol(res)) vpar = parent(v) - parent_cols = col_idx isa AsTable ? col_idx.cols : col_idx + parent_cols = col_idx isa AsTable ? col_idx.cols : (col_idx === nothing ? (1:ncol(df)) : col_idx) if copycols && !(fun isa ByRow) && (v isa SubArray || any(i -> vpar === parent(cdf[i]), parent_cols)) newdf[!, newname] = copy(v) @@ -318,7 +318,7 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{ @assert length(colnames) == length(res) for (newname, v) in zip(colnames, res) vpar = parent(v) - parent_cols = col_idx isa AsTable ? col_idx.cols : col_idx + parent_cols = col_idx isa AsTable ? col_idx.cols : (col_idx === nothing ? 
(1:ncol(df)) : col_idx) if copycols && !(fun isa ByRow) && (v isa SubArray || any(i -> vpar === parent(cdf[i]), parent_cols)) newdf[!, newname] = copy(v) @@ -379,7 +379,7 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{ end allow_resizing_newdf[] = false respar = parent(res) - parent_cols = col_idx isa AsTable ? col_idx.cols : col_idx + parent_cols = col_idx isa AsTable ? col_idx.cols : (col_idx === nothing ? (1:ncol(df)) : col_idx) if copycols && !(fun isa ByRow) && (res isa SubArray || any(i -> respar === parent(cdf[i]), parent_cols)) newdf[!, newname] = copy(res) @@ -738,13 +738,12 @@ julia> combine(df, :a => sum, nrow, renamecols=false) combine(df::AbstractDataFrame, args...; renamecols::Bool=true) = manipulate(df, args..., copycols=true, keeprows=false, renamecols=renamecols) -function combine(arg, df::AbstractDataFrame; renamecols::Bool=true) - if nrow(df) == 0 - throw(ArgumentError("calling combine on a data frame with zero rows" * - " with transformation as a first argument is " * - "currently not supported")) +function combine(arg::Function, df::AbstractDataFrame; renamecols::Bool=true) + if arg isa Colon + throw(ArgumentError("Only transformations are allowed when function is a " * + "frist argument to combine")) end - return combine(arg, groupby(df, Symbol[]), renamecols=renamecols) + return combine(df, arg) end manipulate(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool, From f79228001251cb0c7d9c6835c0ac60bbffaf3d36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 26 Sep 2020 19:53:39 +0200 Subject: [PATCH 03/21] add missing methods --- src/abstractdataframe/selection.jl | 32 ++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index fdecb20deb..888c4f84db 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -555,6 +555,14 @@ 
julia> select!(df, AsTable(:) => ByRow(mean), renamecols=false) select!(df::DataFrame, args...; renamecols::Bool=true) = _replace_columns!(df, select(df, args..., copycols=false, renamecols=renamecols)) +function select!(arg::Function, df::AbstractDataFrame; renamecols::Bool=true) + if arg isa Colon + throw(ArgumentError("Only transformations are allowed when function is a " * + "frist argument to select!")) + end + return select!(df, arg) +end + """ transform!(df::DataFrame, args...; renamecols::Bool=true) @@ -567,6 +575,14 @@ See [`select!`](@ref) for detailed rules regarding accepted values for `args`. transform!(df::DataFrame, args...; renamecols::Bool=true) = select!(df, :, args..., renamecols=renamecols) +function transform!(arg::Function, df::AbstractDataFrame; renamecols::Bool=true) + if arg isa Colon + throw(ArgumentError("Only transformations are allowed when function is a " * + "frist argument to transform!")) + end + return transform!(df, arg) +end + """ select(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) @@ -690,6 +706,14 @@ julia> select(df, AsTable(:) => ByRow(mean), renamecols=false) select(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) = manipulate(df, args..., copycols=copycols, keeprows=true, renamecols=renamecols) +function select(arg::Function, df::AbstractDataFrame; renamecols::Bool=true) + if arg isa Colon + throw(ArgumentError("Only transformations are allowed when function is a " * + "frist argument to select")) + end + return select(df, arg) +end + """ transform(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) @@ -703,6 +727,14 @@ See [`select`](@ref) for detailed rules regarding accepted values for `args`. 
transform(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) = select(df, :, args..., copycols=copycols, renamecols=renamecols) +function transform(arg::Function, df::AbstractDataFrame; renamecols::Bool=true) + if arg isa Colon + throw(ArgumentError("Only transformations are allowed when function is a " * + "frist argument to transform")) + end + return transform(df, arg) +end + """ combine(df::AbstractDataFrame, args...; renamecols::Bool=true) combine(arg, df::AbstractDataFrame; renamecols::Bool=true) From 7e736fa25439845506a9f82b51275f667b04f78c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 26 Sep 2020 20:17:29 +0200 Subject: [PATCH 04/21] fix normalization problem --- src/abstractdataframe/selection.jl | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 888c4f84db..6787530a60 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -70,8 +70,9 @@ function normalize_selection(idx::AbstractIndex, <:Union{Symbol, AbstractString, DataType, AbstractVector{Symbol}, AbstractVector{<:AbstractString}}}}, renamecols::Bool) - if last(last(sel)) isa DataType - last(last(sel)) === AsTable || throw(ArgumentError("Only DataType supported as target is AsTable")) + lls = last(last(sel)) + if lls isa DataType + lls === AsTable || throw(ArgumentError("Only DataType supported as target is AsTable")) end if first(sel) isa AsTable rawc = first(sel).cols @@ -99,15 +100,17 @@ function normalize_selection(idx::AbstractIndex, throw(ArgumentError("at least one column must be passed to a " * "`ByRow` transformation function")) end - ls = last(sel) - if ls isa AbstractString - r = Symbol(ls) - elseif ls isa AbstractVector{<:AbstractString} - r = Symbol.(ls) + if lls isa AbstractString + r = Symbol(lls) + elseif lls isa AbstractVector{<:AbstractString} + r = Symbol.(lls) else - r = 
ls + r = lls end - return (wanttable ? AsTable(c) : c) => r + if r isa AbstractVector{Symbol} + allunique(r) || throw(ArgumentError("target column names must be unique")) + end + return (wanttable ? AsTable(c) : c) => first(last(sel)) => r end function normalize_selection(idx::AbstractIndex, From abb400614cd702df70a2a51a63962e5717d60385 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 26 Sep 2020 22:48:48 +0200 Subject: [PATCH 05/21] initial implementation so that tests pass --- src/abstractdataframe/selection.jl | 21 ++++++------ test/select.jl | 52 +++++++++++++++++++++--------- 2 files changed, 48 insertions(+), 25 deletions(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 6787530a60..1be7c65c72 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -207,16 +207,15 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{ end if (newname === AsTable || newname isa AbstractVector{Symbol}) && - !(res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix}) + !(res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix, AbstractArray{<:Any, 0}, Ref}) if res isa AbstractVector && !isempty(res) - p = pairs.(res) - ex = extrema(length, p) - ex[1] == ex[2] || throw(ArgumentError("returned elements must have the same length")) - kp1 = keys(p[1]) - all(x -> keys(x) == kp1, p) || throw(ArgumentError("keys of the returned elements must be identical")) + kp1 = keys(res[1]) + all(x -> keys(x) == kp1, res) || throw(ArgumentError("keys of the returned elements must be identical")) + true_res = res res = DataFrame() - for (i, n) in enumerate(kp1) - res[!, Symbol(n)] = [x[i] for x in p] + prepend = all(x -> x isa Integer, kp1) + for n in kp1 + res[!, prepend ? 
Symbol("x", n) : Symbol(n)] = [x[n] for x in true_res] end else res = Tables.columntable(res) @@ -300,7 +299,7 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{ newdf[!, newname] = res[:, i] end elseif res isa NamedTuple - if all(v -> v isa AbstractVector, x) + if all(v -> v isa AbstractVector, res) lr = length(res[1]) # allow shortening to 0 rows if allow_resizing_newdf[] && nrow(newdf) == 1 @@ -390,6 +389,10 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{ newdf[!, newname] = res end else + if newname === AsTable + @assert res isa Union{AbstractArray{<:Any, 0}, Ref} + newname = :x1 + end if newname in transformed_cols throw(ArgumentError("duplicate name of a transformed column")) else diff --git a/test/select.jl b/test/select.jl index f707a9e950..2fa0700f09 100644 --- a/test/select.jl +++ b/test/select.jl @@ -720,20 +720,32 @@ end @test_throws ArgumentError select(df, :x => x -> retval) @test_throws ArgumentError select(df, :x => x -> retval, copycols=false) @test_throws ArgumentError select!(df, :x => x -> retval) + + @test select(df, :x => ByRow(x -> retval)) == DataFrame(x_function = [retval]) + cdf = copy(df) + select!(cdf, :x => ByRow(x -> retval)) + @test cdf == DataFrame(x_function = [retval]) + if retval isa Union{NamedTuple, DataFrameRow} - @test_throws ArgumentError select(df, :x => ByRow(x -> retval)) - @test_throws ArgumentError select!(df, :x => ByRow(x -> retval)) - else - @test select(df, :x => ByRow(x -> retval)) == DataFrame(x_function = [retval]) - cdf = copy(df) - select!(cdf, :x => ByRow(x -> retval)) - @test cdf == DataFrame(x_function = [retval]) + @test select(df, :x => ByRow(x -> retval) => AsTable) == DataFrame(;retval...) 
+ elseif retval isa DataFrame + @test_throws MethodError select(df, :x => ByRow(x -> retval) => AsTable) + else # Matrix; surprising but following the API + @test select(df, :x => ByRow(x -> retval) => AsTable) == + DataFrame(["CartesianIndex($i, $j)" => 1.0 for i in 1:2, j in 1:2]...) + @test select(df, :x => ByRow(x -> retval) => [:a, :b, :c, :d]) == + DataFrame(a=1.0, b=1.0, c=1.0, d=1.0) end end for retval in [(1, 2), ones(2,2,2)] @test select(df, :x => x -> retval) == DataFrame(x_function = [retval]) @test select(df, :x => ByRow(x -> retval)) == DataFrame(x_function = [retval]) + if retval isa Tuple + @test select(df, :x => ByRow(x -> retval) => AsTable) == DataFrame(x1=1, x2=2) + else + @test select(df, :x => ByRow(x -> retval) => Symbol.("x", 1:8)) == DataFrame(ones(1, 8)) + end cdf = copy(df) select!(cdf, :x => x -> retval) @test cdf == DataFrame(x_function = [retval]) @@ -1122,8 +1134,13 @@ end hcat(df, DataFrame(a_b_c_function=[[(a = 1, b = 4, c = 7)], [(a = 2, b = 5, c = 8)], [(a = 3, b = 6, c = 9)]])) - @test_throws ArgumentError select(df, AsTable(:) => ByRow(identity)) - @test_throws ArgumentError select(df, AsTable(:) => ByRow(x -> df[1, :])) + @test select(df, AsTable(:) => ByRow(identity)) == + DataFrame(a_b_c_identity=[(a = 1, b = 4, c = 7), (a = 2, b = 5, c = 8), (a = 3, b = 6, c = 9)]) + @test select(df, AsTable(:) => ByRow(identity) => AsTable) == df + @test select(df, AsTable(:) => ByRow(x -> df[1, :])) == + DataFrame(a_b_c_function=fill(df[1,:], 3)) + @test select(df, AsTable(:) => ByRow(x -> df[1, :]) => AsTable) == + DataFrame(a=[1,1,1], b=4, c=7) @test_throws ArgumentError transform(df, AsTable(Not(:)) => ByRow(identity)) @test select(df, AsTable(Not(:)) => Ref) == repeat(DataFrame(Ref = NamedTuple()), nrow(df)) @@ -1164,12 +1181,14 @@ end @test combine(x -> Matrix(x), df) == rename(df, [:x1, :x2]) @test combine(x -> Ref(1:3), df) == DataFrame(x1=[1:3]) - @test_throws ArgumentError combine(df, x -> Ref(1:3)) + @test combine(df, x -> 
Ref(1:3)) == DataFrame(x1=[1:3]) - @test combine(AsTable(:) => identity, df) == df - @test combine((:) => cor, df) == DataFrame(x_y_cor = 1.0) - @test combine(:x => x -> Ref(1:3), df) == DataFrame(x_function=[1:3]) + @test_throws ArgumentError combine(df, AsTable(:) => identity) + @test combine(df, AsTable(:) => identity => AsTable) == df + @test combine(df, (:) => cor) == DataFrame(x_y_cor = 1.0) + @test combine(df, :x => x -> Ref(1:3)) == DataFrame(x_function=[1:3]) @test_throws ArgumentError combine(df, :x => x -> ones(1,1)) + @test combine(df, :x => (x -> ones(1,1)) => AsTable) == DataFrame(x1=1.0) df2 = combine(df, :x => identity) @test df2[:, 1] == df.x @@ -1188,8 +1207,9 @@ end @test combine(x -> Matrix(x), dfv) == rename(dfv, [:x1, :x2]) - @test combine(AsTable(:) => identity, dfv) == dfv - @test combine((:) => cor, dfv) == DataFrame(y_x_cor = 1.0) + @test_throws ArgumentError combine(dfv, AsTable(:) => identity) + @test combine(dfv, AsTable(:) => identity => AsTable) == dfv + @test combine(dfv, (:) => cor) == DataFrame(y_x_cor = 1.0) df2 = combine(dfv, :x => identity) @test df2[:, 1] == dfv.x @@ -1294,7 +1314,7 @@ end DataFrame(a=1:3, b=4:6, c=7:9, d=10:12, a_b=5:2:9, a_b_etc=22:4:30) @test combine(df, :a => +, [:a, :b] => +, All() => +, renamecols=false) == DataFrame(a=1:3, a_b=5:2:9, a_b_etc=22:4:30) - @test combine([:a, :b] => +, df, renamecols=false) == DataFrame(a_b=5:2:9) + @test combine(df, [:a, :b] => +, renamecols=false) == DataFrame(a_b=5:2:9) @test combine(identity, df, renamecols=false) == df df = DataFrame(a=1:3, b=4:6, c=7:9, d=10:12) From d93747565317b38cdbfff5ca06e894fb14518ee8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 29 Sep 2020 17:41:59 +0200 Subject: [PATCH 06/21] allow empty selector in ByRow. 
Initial tests --- src/abstractdataframe/selection.jl | 58 +++++++---- src/groupeddataframe/splitapplycombine.jl | 2 +- test/select.jl | 120 ++++++++++++++++++++++ 3 files changed, 157 insertions(+), 23 deletions(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 1be7c65c72..5021bed0ac 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -1,5 +1,6 @@ # TODO: -# * add combine(fun, df) for DataFrame with 0 rows +# * add handling of empty ByRow to filter, and select/transform/combine for GroupedDataFrame +# * add handling of multiple column return rules for select/transform/combine for GroupedDataFrame # normalize_selection function makes sure that whatever input format of idx is it # will end up in one of four canonical forms @@ -7,7 +8,7 @@ # 2) Pair{Int, <:Pair{<:Base.Callable, <:Union{Symbol, Vector{Symbol}, Type{AsTable}}}} # 3) Pair{AbstractVector{Int}, <:Pair{<:Base.Callable, <:Union{Symbol, AbstractVector{Symbol}, Type{AsTable}}}} # 4) Pair{AsTable, <:Pair{<:Base.Callable, <:Union{Symbol, Vector{Symbol}, Type{AsTable}}}} -# 5) Function +# 5) Callable """ ByRow @@ -38,7 +39,7 @@ normalize_selection(idx::AbstractIndex, sel, renamecols::Bool) = end end -normalize_selection(idx::AbstractIndex, sel::Function, renamecols::Bool) = sel +normalize_selection(idx::AbstractIndex, sel::Base.Callable, renamecols::Bool) = sel normalize_selection(idx::AbstractIndex, sel::Colon, renamecols::Bool) = idx[:] normalize_selection(idx::AbstractIndex, sel::Pair{typeof(nrow), Symbol}, @@ -96,10 +97,6 @@ function normalize_selection(idx::AbstractIndex, end end end - if length(c) == 0 && first(last(sel)) isa ByRow - throw(ArgumentError("at least one column must be passed to a " * - "`ByRow` transformation function")) - end if lls isa AbstractString r = Symbol(lls) elseif lls isa AbstractVector{<:AbstractString} @@ -149,10 +146,6 @@ function normalize_selection(idx::AbstractIndex, end end end - if length(c) 
== 0 && last(sel) isa ByRow - throw(ArgumentError("at least one column must be passed to a " * - "`ByRow` transformation function")) - end fun = last(sel) if length(c) > 3 prefix = join(@views(_names(idx)[c[1:2]]), '_') @@ -176,14 +169,14 @@ function normalize_selection(idx::AbstractIndex, return (wanttable ? AsTable(c) : c) => fun => newcol end -function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{Int}, AsTable}, +function select_transform!(nc::Union{Base.Callable, Pair{<:Union{Int, AbstractVector{Int}, AsTable}, <:Pair{<:Base.Callable, <:Union{Symbol, AbstractVector{Symbol}, DataType}}}}, df::AbstractDataFrame, newdf::DataFrame, transformed_cols::Set{Symbol}, copycols::Bool, allow_resizing_newdf::Ref{Bool}) - if nc isa Function - col_idx, fun, newname = nothing, nc, AsTable + if nc isa Base.Callable + col_idx, fun, newname = nothing, nc, nothing else col_idx, (fun, newname) = nc end @@ -199,15 +192,34 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{ elseif col_idx isa Int res = fun(df[!, col_idx]) elseif col_idx isa AsTable - res = fun(Tables.columntable(select(df, col_idx.cols, copycols=false))) + tbl = Tables.columntable(select(df, col_idx.cols, copycols=false)) + if isempty(tbl) && fun isa ByRow + if isempty(df) + T = Base.return_types(fun.fun, ())[1] + res = T[] + else + res = [fun.fun() for _ in 1:nrow(df)] + end + else + res = fun(tbl) + end else # it should be fast enough here as we do not expect to do it millions of times @assert col_idx isa AbstractVector{Int} - res = fun(map(c -> cdf[c], col_idx)...) + if isempty(col_idx) && fun isa ByRow + if isempty(df) + T = Base.return_types(fun.fun, ())[1] + res = T[] + else + res = [fun.fun() for _ in 1:nrow(df)] + end + else + res = fun(map(c -> cdf[c], col_idx)...) 
+ end end if (newname === AsTable || newname isa AbstractVector{Symbol}) && - !(res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix, AbstractArray{<:Any, 0}, Ref}) + !(res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix}) if res isa AbstractVector && !isempty(res) kp1 = keys(res[1]) all(x -> keys(x) == kp1, res) || throw(ArgumentError("keys of the returned elements must be identical")) @@ -231,7 +243,7 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{ else colnames = propertynames(res) end - if !(newname === AsTable) + if !(newname === AsTable || newname === nothing) if length(colnames) != length(newname) throw(ArgumentError("Number of returned columns does not match the " * "length of requested output")) @@ -328,7 +340,7 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{ newdf[!, newname] = v end end - elseif any(v -> v isa AbstractVector, x) + elseif any(v -> v isa AbstractVector, res) throw(ArgumentError("mixing single values and vectors in a named tuple is not allowed")) else if ncol(newdf) == 0 @@ -359,6 +371,9 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{ end end elseif res isa AbstractVector + if newname === nothing + newname = :x1 + end if newname in transformed_cols throw(ArgumentError("duplicate name of a transformed column")) else @@ -389,8 +404,7 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{ newdf[!, newname] = res end else - if newname === AsTable - @assert res isa Union{AbstractArray{<:Any, 0}, Ref} + if newname === nothing newname = :x1 end if newname in transformed_cols @@ -806,7 +820,7 @@ manipulate(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool, function manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool, renamecols::Bool) cs_vec = [] for v in cs - if v isa AbstractVector{<:Pair} + if v isa AbstractVecOrMat{<:Pair} append!(cs_vec, v) 
else push!(cs_vec, v) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index d7b1c23d86..b6f0595019 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -502,7 +502,7 @@ function _combine_prepare(gd::GroupedDataFrame, for p in cs if p === nrow push!(cs_vec, nrow => :nrow) - elseif p isa AbstractVector{<:Pair} + elseif p isa AbstractVecOrMat{<:Pair} append!(cs_vec, p) else push!(cs_vec, p) diff --git a/test/select.jl b/test/select.jl index 2fa0700f09..33f405c679 100644 --- a/test/select.jl +++ b/test/select.jl @@ -1326,4 +1326,124 @@ end @test df == DataFrame(a=1:3, b=4:6, c=7:9, d=10:12, a_b=5:2:9, a_b_etc=22:4:30) end +@testset "additional tests for new rules" begin +# select select! transform transform! combine +# Union{Type{AsTable}, Symbol, AbstractVector{Symbol}, AbstractString, AbstractVector{<:AbstractString}} +# DataFrame, SubDataFrame + @testset "SELECT(FUN, DF)" begin + for df in (DataFrame(a=1:2, b=3:4, c=5:6), view(DataFrame(a=1:3, b=3:5, c=5:7, d=11:13), 1:2, 1:3)) + @test select(sdf -> sdf.b, df) == DataFrame(x1=3:4) + @test select(sdf -> (b = 2sdf.b,), df) == DataFrame(b=[6,8]) + @test select(sdf -> (b = 1,), df) == DataFrame(b=[1, 1]) + @test_throws ArgumentError select(sdf -> (b = [1],), df) + @test select(sdf -> (b = [1, 5],), df) == DataFrame(b=[1, 5]) + @test select(sdf -> 1, df) == DataFrame(x1=[1, 1]) + @test select(sdf -> fill([1]), df) == DataFrame(x1=[[1], [1]]) + @test select(sdf -> Ref([1]), df) == DataFrame(x1=[[1], [1]]) + @test select(sdf -> "x", df) == DataFrame(x1=["x", "x"]) + @test select(sdf -> [[1,2],[3,4]], df) == DataFrame(x1=[[1,2],[3,4]]) + for ret in (DataFrame(), NamedTuple(), zeros(0,0), DataFrame(t=1)[1, 1:0]) + @test select(sdf -> ret, df) == DataFrame() + end + @test_throws ArgumentError select(sdf -> DataFrame(a=10), df) + @test_throws ArgumentError select(sdf -> zeros(1, 2), df) + @test select(sdf -> DataFrame(a=[10, 
11]), df) == DataFrame(a=[10, 11]) + @test select(sdf -> [10 11; 12 13], df) == DataFrame(x1=[10, 12], x2=[11, 13]) + @test select(sdf -> DataFrame(a=10)[1, :], df) == DataFrame(a=[10, 10]) + + @test transform(sdf -> sdf.b, df) == [df DataFrame(x1=3:4)] + @test transform(sdf -> (b = 2sdf.b,), df) == DataFrame(a=1:2, b=[6,8], c=5:6) + @test transform(sdf -> (b = 1,), df) == DataFrame(a=[1,2], b=[1, 1], c=[5,6]) + @test_throws ArgumentError transform(sdf -> (b = [1],), df) + @test transform(sdf -> (b = [1, 5],), df) == DataFrame(a=[1,2], b=[1, 5], c=[5,6]) + @test transform(sdf -> 1, df) == DataFrame(a=1:2, b=3:4, c=5:6, x1=1) + @test transform(sdf -> fill([1]), df) == DataFrame(a=1:2, b=3:4, c=5:6, x1=[[1],[1]]) + @test transform(sdf -> Ref([1]), df) == DataFrame(a=1:2, b=3:4, c=5:6, x1=[[1],[1]]) + @test transform(sdf -> "x", df) == DataFrame(a=1:2, b=3:4, c=5:6, x1="x") + @test transform(sdf -> [[1,2],[3,4]], df) == DataFrame(a=1:2, b=3:4, c=5:6, x1=[[1,2],[3,4]]) + for ret in (DataFrame(), NamedTuple(), zeros(0,0), DataFrame(t=1)[1, 1:0]) + @test transform(sdf -> ret, df) == df + end + @test_throws ArgumentError transform(sdf -> DataFrame(a=10), df) + @test_throws ArgumentError transform(sdf -> zeros(1, 2), df) + @test transform(sdf -> DataFrame(a=[10, 11]), df) == DataFrame(a=[10, 11], b=3:4, c=5:6) + @test transform(sdf -> [10 11; 12 13], df) == DataFrame(a=1:2, b=3:4, c=5:6, x1=[10, 12], x2=[11, 13]) + @test transform(sdf -> DataFrame(a=10)[1, :], df) == DataFrame(a=[10, 10], b=3:4, c=5:6) + + @test combine(sdf -> sdf.b, df) == DataFrame(x1=3:4) + @test combine(sdf -> (b = 2sdf.b,), df) == DataFrame(b=[6,8]) + @test combine(sdf -> (b = 1,), df) == DataFrame(b=[1]) + @test combine(sdf -> (b = [1],), df) == DataFrame(b=[1]) + @test combine(sdf -> (b = [1, 5],), df) == DataFrame(b=[1, 5]) + @test combine(sdf -> 1, df) == DataFrame(x1=[1]) + @test combine(sdf -> fill([1]), df) == DataFrame(x1=[[1]]) + @test combine(sdf -> Ref([1]), df) == DataFrame(x1=[[1]]) + 
@test combine(sdf -> "x", df) == DataFrame(x1=["x"]) + @test combine(sdf -> [[1,2],[3,4]], df) == DataFrame(x1=[[1,2],[3,4]]) + for ret in (DataFrame(), NamedTuple(), zeros(0,0), DataFrame(t=1)[1, 1:0]) + @test combine(sdf -> ret, df) == DataFrame() + end + @test combine(sdf -> DataFrame(a=10), df) == DataFrame(a=10) + @test combine(sdf -> zeros(1, 2), df) == DataFrame(x1=0, x2=0) + @test combine(sdf -> DataFrame(a=[10, 11]), df) == DataFrame(a=[10, 11]) + @test combine(sdf -> [10 11; 12 13], df) == DataFrame(x1=[10, 12], x2=[11, 13]) + @test combine(sdf -> DataFrame(a=10)[1, :], df) == DataFrame(a=[10]) + end + + df = DataFrame(a=1:2, b=3:4, c=5:6) + @test select!(sdf -> sdf.b, copy(df)) == DataFrame(x1=3:4) + @test select!(sdf -> (b = 2sdf.b,), copy(df)) == DataFrame(b=[6,8]) + @test select!(sdf -> (b = 1,), copy(df)) == DataFrame(b=[1, 1]) + @test_throws ArgumentError select!(sdf -> (b = [1],), copy(df)) + @test select!(sdf -> (b = [1, 5],), copy(df)) == DataFrame(b=[1, 5]) + @test select!(sdf -> 1, copy(df)) == DataFrame(x1=[1, 1]) + @test select!(sdf -> fill([1]), copy(df)) == DataFrame(x1=[[1], [1]]) + @test select!(sdf -> Ref([1]), copy(df)) == DataFrame(x1=[[1], [1]]) + @test select!(sdf -> "x", copy(df)) == DataFrame(x1=["x", "x"]) + @test select!(sdf -> [[1,2],[3,4]], copy(df)) == DataFrame(x1=[[1,2],[3,4]]) + for ret in (DataFrame(), NamedTuple(), zeros(0,0), DataFrame(t=1)[1, 1:0]) + @test select!(sdf -> ret, copy(df)) == DataFrame() + end + @test_throws ArgumentError select!(sdf -> DataFrame(a=10), copy(df)) + @test_throws ArgumentError select!(sdf -> zeros(1, 2), copy(df)) + @test select!(sdf -> DataFrame(a=[10, 11]), copy(df)) == DataFrame(a=[10, 11]) + @test select!(sdf -> [10 11; 12 13], copy(df)) == DataFrame(x1=[10, 12], x2=[11, 13]) + @test select!(sdf -> DataFrame(a=10)[1, :], copy(df)) == DataFrame(a=[10, 10]) + + @test transform!(sdf -> sdf.b, copy(df)) == [df DataFrame(x1=3:4)] + @test transform!(sdf -> (b = 2sdf.b,), copy(df)) == 
DataFrame(a=1:2, b=[6,8], c=5:6) + @test transform!(sdf -> (b = 1,), copy(df)) == DataFrame(a=[1,2], b=[1, 1], c=[5,6]) + @test_throws ArgumentError transform!(sdf -> (b = [1],), copy(df)) + @test transform!(sdf -> (b = [1, 5],), copy(df)) == DataFrame(a=[1,2], b=[1, 5], c=[5,6]) + @test transform!(sdf -> 1, copy(df)) == DataFrame(a=1:2, b=3:4, c=5:6, x1=1) + @test transform!(sdf -> fill([1]), copy(df)) == DataFrame(a=1:2, b=3:4, c=5:6, x1=[[1],[1]]) + @test transform!(sdf -> Ref([1]), copy(df)) == DataFrame(a=1:2, b=3:4, c=5:6, x1=[[1],[1]]) + @test transform!(sdf -> "x", copy(df)) == DataFrame(a=1:2, b=3:4, c=5:6, x1="x") + @test transform!(sdf -> [[1,2],[3,4]], copy(df)) == DataFrame(a=1:2, b=3:4, c=5:6, x1=[[1,2],[3,4]]) + for ret in (DataFrame(), NamedTuple(), zeros(0,0), DataFrame(t=1)[1, 1:0]) + @test transform!(sdf -> ret, copy(df)) == df + end + @test_throws ArgumentError transform!(sdf -> DataFrame(a=10), copy(df)) + @test_throws ArgumentError transform!(sdf -> zeros(1, 2), copy(df)) + @test transform!(sdf -> DataFrame(a=[10, 11]), copy(df)) == DataFrame(a=[10, 11], b=3:4, c=5:6) + @test transform!(sdf -> [10 11; 12 13], copy(df)) == DataFrame(a=1:2, b=3:4, c=5:6, x1=[10, 12], x2=[11, 13]) + @test transform!(sdf -> DataFrame(a=10)[1, :], copy(df)) == DataFrame(a=[10, 10], b=3:4, c=5:6) + end +end + +@testset "empty ByRow" begin + df = DataFrame(a=1:3) + @test select(df, [] => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1]) + @test combine(df, [] => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1]) + @test transform(df, [] => ByRow(() -> 1)) == DataFrame("a" => 1:3, "function" => [1, 1, 1]) + + df = DataFrame() + @test select(df, [] => ByRow(() -> 1)) == DataFrame("function" => []) + @test combine(df, [] => ByRow(() -> 1)) == DataFrame("function" => []) + @test transform(df, [] => ByRow(() -> 1)) == DataFrame("function" => []) + @test eltype(select(df, [] => ByRow(() -> 1)).function) == Int64 + @test eltype(combine(df, [] => ByRow(() -> 
1)).function) == Int64 + @test eltype(transform(df, [] => ByRow(() -> 1)).function) == Int64 +end + end # module From e8cacff5cded9e727a7134effeb99a248b23c998 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 30 Sep 2020 14:54:29 +0200 Subject: [PATCH 07/21] more tests of empty ByRow --- test/select.jl | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/test/select.jl b/test/select.jl index 33f405c679..9a8d9d50b2 100644 --- a/test/select.jl +++ b/test/select.jl @@ -1432,18 +1432,38 @@ end end @testset "empty ByRow" begin - df = DataFrame(a=1:3) - @test select(df, [] => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1]) - @test combine(df, [] => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1]) - @test transform(df, [] => ByRow(() -> 1)) == DataFrame("a" => 1:3, "function" => [1, 1, 1]) - - df = DataFrame() - @test select(df, [] => ByRow(() -> 1)) == DataFrame("function" => []) - @test combine(df, [] => ByRow(() -> 1)) == DataFrame("function" => []) - @test transform(df, [] => ByRow(() -> 1)) == DataFrame("function" => []) - @test eltype(select(df, [] => ByRow(() -> 1)).function) == Int64 - @test eltype(combine(df, [] => ByRow(() -> 1)).function) == Int64 - @test eltype(transform(df, [] => ByRow(() -> 1)).function) == Int64 + for sel in ([], AsTable([])) + df = DataFrame(a=1:3) + @test select(df, sel => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1]) + @test combine(df, sel => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1]) + @test transform(df, sel => ByRow(() -> 1)) == DataFrame("a" => 1:3, "function" => [1, 1, 1]) + + for df in (DataFrame(), DataFrame(a=[])) + @test select(df, sel => ByRow(() -> 1)) == DataFrame("function" => []) + @test combine(df, sel => ByRow(() -> 1)) == DataFrame("function" => []) + if ncol(df) == 0 + @test transform(df, sel => ByRow(() -> 1)) == DataFrame("function" => []) + else + @test transform(df, sel => ByRow(() -> 1)) == 
DataFrame("a" => [], "function" => []) + end + @test eltype(select(df, sel => ByRow(() -> 1)).function) == Int + @test eltype(combine(df, sel => ByRow(() -> 1)).function) == Int + @test eltype(transform(df, sel => ByRow(() -> 1)).function) == Int + + df2 = select(df, sel => ByRow(() -> (a=1,b="1")) => AsTable) + @test names(df2) == ["a", "b"] + @test eltype.(eachcol(df2)) == [Int, String] + df2 = select(df, sel => ByRow(() -> (a=1,b="1")) => [:p, :q]) + @test names(df2) == ["p", "q"] + @test eltype.(eachcol(df2)) == [Int, String] + + # here this follows Tables.jl behavior + for res in ([1, "1"], (1, "1")) + @test select(df, sel => ByRow(() -> res) => AsTable) == DataFrame() + @test_throws ArgumentError select(df, sel => ByRow(() -> res) => [:p, :q]) + end + end + end end end # module From de4bacf53fdcc22cd41b7854d8d5e6eca27f0221 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 30 Sep 2020 15:54:12 +0200 Subject: [PATCH 08/21] improved tests of empty ByRow --- src/abstractdataframe/selection.jl | 4 +- test/select.jl | 98 +++++++++++++++++++----------- 2 files changed, 66 insertions(+), 36 deletions(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 5021bed0ac..15580e34d9 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -195,10 +195,10 @@ function select_transform!(nc::Union{Base.Callable, Pair{<:Union{Int, AbstractVe tbl = Tables.columntable(select(df, col_idx.cols, copycols=false)) if isempty(tbl) && fun isa ByRow if isempty(df) - T = Base.return_types(fun.fun, ())[1] + T = Base.return_types(fun.fun, (NamedTuple{(),Tuple{}},))[1] res = T[] else - res = [fun.fun() for _ in 1:nrow(df)] + res = [fun.fun(NamedTuple()) for _ in 1:nrow(df)] end else res = fun(tbl) diff --git a/test/select.jl b/test/select.jl index 9a8d9d50b2..a5aa6dde82 100644 --- a/test/select.jl +++ b/test/select.jl @@ -766,16 +766,16 @@ end @test combine(df, r"z" => () -> y) == 
DataFrame(:function => y) @test select(df, r"z" => () -> x)[!, 1] === x # no copy even for copycols=true @test_throws MethodError select(df, r"z" => x -> 1) - @test_throws ArgumentError select(df, r"z" => ByRow(rand)) + @test select(df, r"z" => ByRow(() -> 1)) == DataFrame(:function => fill(1, 10)) @test select(df, r"z", copycols=false) == DataFrame() @test select(df, r"z" => () -> x, copycols=false) == DataFrame(:function => x) @test select(df, r"z" => () -> x, copycols=false)[!, 1] === x @test_throws MethodError select(df, r"z" => x -> 1, copycols=false) - @test_throws ArgumentError select(df, r"z" => ByRow(rand), copycols=false) + @test select(df, r"z" => ByRow(() -> 1)) == DataFrame(:function => fill(1, 10), copycols=false) @test_throws MethodError select!(df, r"z" => x -> 1) - @test_throws ArgumentError select!(df, r"z" => ByRow(rand)) + @test select!(df, r"z" => ByRow(() -> 1)) == DataFrame(:function => fill(1, 10)) @test_throws MethodError select!(df, r"z" => () -> x, copycols=false) select!(df, r"z" => () -> x) @@ -1141,7 +1141,8 @@ end DataFrame(a_b_c_function=fill(df[1,:], 3)) @test select(df, AsTable(:) => ByRow(x -> df[1, :]) => AsTable) == DataFrame(a=[1,1,1], b=4, c=7) - @test_throws ArgumentError transform(df, AsTable(Not(:)) => ByRow(identity)) + @test transform(df, AsTable(Not(:)) => + ByRow(identity)) == [df DataFrame(:identity => fill(NamedTuple(), nrow(df)))] @test select(df, AsTable(Not(:)) => Ref) == repeat(DataFrame(Ref = NamedTuple()), nrow(df)) @test combine(df, AsTable(Not(:)) => Ref) == DataFrame(Ref = NamedTuple()) @@ -1432,36 +1433,65 @@ end end @testset "empty ByRow" begin - for sel in ([], AsTable([])) - df = DataFrame(a=1:3) - @test select(df, sel => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1]) - @test combine(df, sel => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1]) - @test transform(df, sel => ByRow(() -> 1)) == DataFrame("a" => 1:3, "function" => [1, 1, 1]) - - for df in (DataFrame(), DataFrame(a=[])) - @test 
select(df, sel => ByRow(() -> 1)) == DataFrame("function" => []) - @test combine(df, sel => ByRow(() -> 1)) == DataFrame("function" => []) - if ncol(df) == 0 - @test transform(df, sel => ByRow(() -> 1)) == DataFrame("function" => []) - else - @test transform(df, sel => ByRow(() -> 1)) == DataFrame("a" => [], "function" => []) - end - @test eltype(select(df, sel => ByRow(() -> 1)).function) == Int - @test eltype(combine(df, sel => ByRow(() -> 1)).function) == Int - @test eltype(transform(df, sel => ByRow(() -> 1)).function) == Int - - df2 = select(df, sel => ByRow(() -> (a=1,b="1")) => AsTable) - @test names(df2) == ["a", "b"] - @test eltype.(eachcol(df2)) == [Int, String] - df2 = select(df, sel => ByRow(() -> (a=1,b="1")) => [:p, :q]) - @test names(df2) == ["p", "q"] - @test eltype.(eachcol(df2)) == [Int, String] - - # here this follows Tables.jl behavior - for res in ([1, "1"], (1, "1")) - @test select(df, sel => ByRow(() -> res) => AsTable) == DataFrame() - @test_throws ArgumentError select(df, sel => ByRow(() -> res) => [:p, :q]) - end + df = DataFrame(a=1:3) + + @test select(df, [] => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1]) + @test combine(df, [] => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1]) + @test transform(df, [] => ByRow(() -> 1)) == DataFrame("a" => 1:3, "function" => [1, 1, 1]) + + for df in (DataFrame(), DataFrame(a=[])) + @test select(df, [] => ByRow(() -> 1)) == DataFrame("function" => []) + @test combine(df, [] => ByRow(() -> 1)) == DataFrame("function" => []) + if ncol(df) == 0 + @test transform(df, [] => ByRow(() -> 1)) == DataFrame("function" => []) + else + @test transform(df, [] => ByRow(() -> 1)) == DataFrame("a" => [], "function" => []) + end + @test eltype(select(df, [] => ByRow(() -> 1)).function) == Int + @test eltype(combine(df, [] => ByRow(() -> 1)).function) == Int + @test eltype(transform(df, [] => ByRow(() -> 1)).function) == Int + + df2 = select(df, [] => ByRow(() -> (a=1,b="1")) => AsTable) + @test names(df2) 
== ["a", "b"] + @test eltype.(eachcol(df2)) == [Int, String] + df2 = select(df, [] => ByRow(() -> (a=1,b="1")) => [:p, :q]) + @test names(df2) == ["p", "q"] + @test eltype.(eachcol(df2)) == [Int, String] + + # here this follows Tables.jl behavior + for res in ([1, "1"], (1, "1")) + @test select(df, [] => ByRow(() -> res) => AsTable) == DataFrame() + @test_throws ArgumentError select(df, [] => ByRow(() -> res) => [:p, :q]) + end + end + + @test select(df, AsTable([]) => ByRow(x -> 1)) == DataFrame("function" => [1, 1, 1]) + @test combine(df, AsTable([]) => ByRow(x -> 1)) == DataFrame("function" => [1, 1, 1]) + @test transform(df, AsTable([]) => ByRow(x -> 1)) == DataFrame("a" => 1:3, "function" => [1, 1, 1]) + + for df in (DataFrame(), DataFrame(a=[])) + @test select(df, AsTable([]) => ByRow(x -> 1)) == DataFrame("function" => []) + @test combine(df, AsTable([]) => ByRow(x -> 1)) == DataFrame("function" => []) + if ncol(df) == 0 + @test transform(df, AsTable([]) => ByRow(x -> 1)) == DataFrame("function" => []) + else + @test transform(df, AsTable([]) => ByRow(x -> 1)) == DataFrame("a" => [], "function" => []) + end + @test eltype(select(df, AsTable([]) => ByRow(x -> 1)).function) == Int + @test eltype(combine(df, AsTable([]) => ByRow(x -> 1)).function) == Int + @test eltype(transform(df, AsTable([]) => ByRow(x -> 1)).function) == Int + + df2 = select(df, AsTable([]) => ByRow(x -> (a=1,b="1")) => AsTable) + @test names(df2) == ["a", "b"] + @test eltype.(eachcol(df2)) == [Int, String] + df2 = select(df, AsTable([]) => ByRow(x -> (a=1,b="1")) => [:p, :q]) + @test names(df2) == ["p", "q"] + @test eltype.(eachcol(df2)) == [Int, String] + + # here this follows Tables.jl behavior + for res in ([1, "1"], (1, "1")) + @test select(df, AsTable([]) => ByRow(x -> res) => AsTable) == DataFrame() + @test_throws ArgumentError select(df, AsTable([]) => ByRow(x -> res) => [:p, :q]) end end end From 4dedb3d18510eb3e60bb2db551e2ff63eea61079 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 1 Oct 2020 11:14:56 +0200 Subject: [PATCH 09/21] finalize tests --- src/abstractdataframe/selection.jl | 294 +++++++++++++---------------- test/grouping.jl | 6 +- test/select.jl | 73 +++++++ 3 files changed, 203 insertions(+), 170 deletions(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 15580e34d9..25e7695e43 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -169,39 +169,22 @@ function normalize_selection(idx::AbstractIndex, return (wanttable ? AsTable(c) : c) => fun => newcol end -function select_transform!(nc::Union{Base.Callable, Pair{<:Union{Int, AbstractVector{Int}, AsTable}, - <:Pair{<:Base.Callable, - <:Union{Symbol, AbstractVector{Symbol}, DataType}}}}, - df::AbstractDataFrame, newdf::DataFrame, - transformed_cols::Set{Symbol}, copycols::Bool, - allow_resizing_newdf::Ref{Bool}) - if nc isa Base.Callable - col_idx, fun, newname = nothing, nc, nothing - else - col_idx, (fun, newname) = nc - end - if newname isa DataType - newname === AsTable || throw(ArgumentError("Only DataType supported as target is AsTable")) - end - # It is allowed to request a tranformation operation into a newname column - # only once. This is ensured by the logic related to transformed_cols dictionaly - # in _manipulate, therefore in select_transform! 
such a duplicate should not happen - cdf = eachcol(df) +function _transformation_helper(df, col_idx, @nospecialize(fun)) if col_idx === nothing - res = fun(df) + return fun(df) elseif col_idx isa Int - res = fun(df[!, col_idx]) + return fun(df[!, col_idx]) elseif col_idx isa AsTable tbl = Tables.columntable(select(df, col_idx.cols, copycols=false)) if isempty(tbl) && fun isa ByRow if isempty(df) T = Base.return_types(fun.fun, (NamedTuple{(),Tuple{}},))[1] - res = T[] + return T[] else - res = [fun.fun(NamedTuple()) for _ in 1:nrow(df)] + return [fun.fun(NamedTuple()) for _ in 1:nrow(df)] end else - res = fun(tbl) + return fun(tbl) end else # it should be fast enough here as we do not expect to do it millions of times @@ -209,47 +192,124 @@ function select_transform!(nc::Union{Base.Callable, Pair{<:Union{Int, AbstractVe if isempty(col_idx) && fun isa ByRow if isempty(df) T = Base.return_types(fun.fun, ())[1] - res = T[] + return T[] else - res = [fun.fun() for _ in 1:nrow(df)] + return [fun.fun() for _ in 1:nrow(df)] end else - res = fun(map(c -> cdf[c], col_idx)...) + cdf = eachcol(df) + return fun(map(c -> cdf[c], col_idx)...) + end + end + throw(ErrorException("unreachable reached")) +end + +function _gen_colnames(@nospecialize(res), newname) + if res isa AbstractMatrix + colnames = gennames(size(res, 2)) + else + colnames = propertynames(res) + end + + if !(newname === AsTable || newname === nothing) + if length(colnames) != length(newname) + throw(ArgumentError("Number of returned columns does not match the " * + "length of requested output")) end + colnames = newname end + return colnames +end + +function _expand_to_table(@nospecialize(res)) + if res isa AbstractVector && !isempty(res) + kp1 = keys(res[1]) + all(x -> keys(x) == kp1, res) || throw(ArgumentError("keys of the returned elements must be identical")) + newres = DataFrame() + prepend = all(x -> x isa Integer, kp1) + for n in kp1 + newres[!, prepend ? 
Symbol("x", n) : Symbol(n)] = [x[n] for x in res] + end + return newres + else + return Tables.columntable(res) + end +end + +function _insert_row_multicolumn(df, newdf, allow_resizing_newdf, colnames, res) + if ncol(newdf) == 0 + # if allow_resizing_newdf[] is false we know this is select or transform + rows = allow_resizing_newdf[] ? 1 : nrow(df) + else + # allow squashing a scalar to 0 rows + rows = nrow(newdf) + end + @assert length(colnames) == length(res) + for (newname, v) in zip(colnames, res) + # note that newdf potentially can contain c in general + newdf[!, newname] = fill!(Tables.allocatecolumn(typeof(v), rows), v) + end +end + +function _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, @nospecialize(fun)) + # allow shortening to 0 rows + if allow_resizing_newdf[] && nrow(newdf) == 1 + newdfcols = _columns(newdf) + for (i, col) in enumerate(newdfcols) + newdfcols[i] = fill!(similar(col, lr), first(col)) + end + end + # !allow_resizing_newdf[] && ncol(newdf) == 0 + # means that we use `select` or `transform` not `combine` + if !allow_resizing_newdf[] && ncol(newdf) == 0 && lr != nrow(df) + throw(ArgumentError("length $(lr) of vector returned from " * + "function $fun is different from number of rows " * + "$(nrow(df)) of the source data frame.")) + end + allow_resizing_newdf[] = false +end + +function _add_col_check_copy(df, newdf, col_idx, copycols, @nospecialize(fun), newname, @nospecialize(v)) + cdf = eachcol(df) + vpar = parent(v) + parent_cols = col_idx isa AsTable ? col_idx.cols : (col_idx === nothing ? 
(1:ncol(df)) : col_idx) + if copycols && !(fun isa ByRow) && (v isa SubArray || any(i -> vpar === parent(cdf[i]), parent_cols)) + newdf[!, newname] = copy(v) + else + newdf[!, newname] = v + end +end + +function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{Int, AbstractVector{Int}, AsTable}, + <:Pair{<:Base.Callable, + <:Union{Symbol, AbstractVector{Symbol}, DataType}}}}), + df::AbstractDataFrame, newdf::DataFrame, + transformed_cols::Set{Symbol}, copycols::Bool, + allow_resizing_newdf::Ref{Bool}) + if nc isa Base.Callable + col_idx, fun, newname = nothing, nc, nothing + else + col_idx, (fun, newname) = nc + end + if newname isa DataType + newname === AsTable || throw(ArgumentError("Only DataType supported as target is AsTable")) + end + # It is allowed to request a tranformation operation into a newname column + # only once. This is ensured by the logic related to transformed_cols dictionaly + # in _manipulate, therefore in select_transform! such a duplicate should not happen + res = _transformation_helper(df, col_idx, fun) + if (newname === AsTable || newname isa AbstractVector{Symbol}) && !(res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix}) - if res isa AbstractVector && !isempty(res) - kp1 = keys(res[1]) - all(x -> keys(x) == kp1, res) || throw(ArgumentError("keys of the returned elements must be identical")) - true_res = res - res = DataFrame() - prepend = all(x -> x isa Integer, kp1) - for n in kp1 - res[!, prepend ? 
Symbol("x", n) : Symbol(n)] = [x[n] for x in true_res] - end - else - res = Tables.columntable(res) - end + res = _expand_to_table(res) end if res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix} if newname isa Symbol throw(ArgumentError("Table returned while a single column return value was requested")) end - if res isa AbstractMatrix - colnames = gennames(size(res, 2)) - else - colnames = propertynames(res) - end - if !(newname === AsTable || newname === nothing) - if length(colnames) != length(newname) - throw(ArgumentError("Number of returned columns does not match the " * - "length of requested output")) - end - colnames = newname - end + colnames = _gen_colnames(res, newname) isempty(colnames) && return # nothing to do if any(in(transformed_cols), colnames) @@ -261,51 +321,14 @@ function select_transform!(nc::Union{Base.Callable, Pair{<:Union{Int, AbstractVe end if res isa AbstractDataFrame lr = nrow(res) - # allow shortening to 0 rows - if allow_resizing_newdf[] && nrow(newdf) == 1 - newdfcols = _columns(newdf) - for (i, col) in enumerate(newdfcols) - newdfcols[i] = fill!(similar(col, lr), first(col)) - end - end - - # !allow_resizing_newdf[] && ncol(newdf) == 0 - # means that we use `select` or `transform` not `combine` - if !allow_resizing_newdf[] && ncol(newdf) == 0 && lr != nrow(df) - throw(ArgumentError("length $(lr) of vector returned from " * - "function $fun is different from number of rows " * - "$(nrow(df)) of the source data frame.")) - end - allow_resizing_newdf[] = false + _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun) @assert length(colnames) == ncol(res) for (newname, v) in zip(colnames, eachcol(res)) - vpar = parent(v) - parent_cols = col_idx isa AsTable ? col_idx.cols : (col_idx === nothing ? 
(1:ncol(df)) : col_idx) - if copycols && !(fun isa ByRow) && - (v isa SubArray || any(i -> vpar === parent(cdf[i]), parent_cols)) - newdf[!, newname] = copy(v) - else - newdf[!, newname] = v - end + _add_col_check_copy(df, newdf, col_idx, copycols, fun, newname, v) end elseif res isa AbstractMatrix lr = size(res, 1) - # allow shortening to 0 rows - if allow_resizing_newdf[] && nrow(newdf) == 1 - newdfcols = _columns(newdf) - for (i, col) in enumerate(newdfcols) - newdfcols[i] = fill!(similar(col, lr), first(col)) - end - end - - # !allow_resizing_newdf[] && ncol(newdf) == 0 - # means that we use `select` or `transform` not `combine` - if !allow_resizing_newdf[] && ncol(newdf) == 0 && lr != nrow(df) - throw(ArgumentError("length $(lr) of vector returned from " * - "function $fun is different from number of rows " * - "$(nrow(df)) of the source data frame.")) - end - allow_resizing_newdf[] = false + _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun) @assert length(colnames) == size(res, 2) for (i, newname) in enumerate(colnames) newdf[!, newname] = res[:, i] @@ -313,62 +336,18 @@ function select_transform!(nc::Union{Base.Callable, Pair{<:Union{Int, AbstractVe elseif res isa NamedTuple if all(v -> v isa AbstractVector, res) lr = length(res[1]) - # allow shortening to 0 rows - if allow_resizing_newdf[] && nrow(newdf) == 1 - newdfcols = _columns(newdf) - for (i, col) in enumerate(newdfcols) - newdfcols[i] = fill!(similar(col, lr), first(col)) - end - end - - # !allow_resizing_newdf[] && ncol(newdf) == 0 - # means that we use `select` or `transform` not `combine` - if !allow_resizing_newdf[] && ncol(newdf) == 0 && lr != nrow(df) - throw(ArgumentError("length $(lr) of vector returned from " * - "function $fun is different from number of rows " * - "$(nrow(df)) of the source data frame.")) - end - allow_resizing_newdf[] = false + _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun) @assert length(colnames) == length(res) for 
(newname, v) in zip(colnames, res) - vpar = parent(v) - parent_cols = col_idx isa AsTable ? col_idx.cols : (col_idx === nothing ? (1:ncol(df)) : col_idx) - if copycols && !(fun isa ByRow) && - (v isa SubArray || any(i -> vpar === parent(cdf[i]), parent_cols)) - newdf[!, newname] = copy(v) - else - newdf[!, newname] = v - end + _add_col_check_copy(df, newdf, col_idx, copycols, fun, newname, v) end elseif any(v -> v isa AbstractVector, res) throw(ArgumentError("mixing single values and vectors in a named tuple is not allowed")) else - if ncol(newdf) == 0 - # if allow_resizing_newdf[] is false we know this is select or transform - rows = allow_resizing_newdf[] ? 1 : nrow(df) - else - # allow squashing a scalar to 0 rows - rows = nrow(newdf) - end - @assert length(colnames) == length(res) - for (newname, v) in zip(colnames, res) - # note that newdf potentially can contain c in general - newdf[!, newname] = fill!(Tables.allocatecolumn(typeof(v), rows), v) - end + _insert_row_multicolumn(df, newdf, allow_resizing_newdf, colnames, res) end elseif res isa DataFrameRow - if ncol(newdf) == 0 - # if allow_resizing_newdf[] is false we know this is select or transform - rows = allow_resizing_newdf[] ? 
1 : nrow(df) - else - # allow squashing a scalar to 0 rows - rows = nrow(newdf) - end - @assert length(colnames) == length(res) - for (newname, v) in zip(colnames, res) - # note that newdf potentially can contain c in general - newdf[!, newname] = fill!(Tables.allocatecolumn(typeof(v), rows), v) - end + _insert_row_multicolumn(df, newdf, allow_resizing_newdf, colnames, res) end elseif res isa AbstractVector if newname === nothing @@ -379,30 +358,9 @@ function select_transform!(nc::Union{Base.Callable, Pair{<:Union{Int, AbstractVe else push!(transformed_cols, newname) end - # allow shortening to 0 rows - if allow_resizing_newdf[] && nrow(newdf) == 1 - newdfcols = _columns(newdf) - for (i, col) in enumerate(newdfcols) - newdfcols[i] = fill!(similar(col, length(res)), first(col)) - end - end - - # !allow_resizing_newdf[] && ncol(newdf) == 0 - # means that we use `select` or `transform` not `combine` - if !allow_resizing_newdf[] && ncol(newdf) == 0 && length(res) != nrow(df) - throw(ArgumentError("length $(length(res)) of vector returned from " * - "function $fun is different from number of rows " * - "$(nrow(df)) of the source data frame.")) - end - allow_resizing_newdf[] = false - respar = parent(res) - parent_cols = col_idx isa AsTable ? col_idx.cols : (col_idx === nothing ? 
(1:ncol(df)) : col_idx) - if copycols && !(fun isa ByRow) && - (res isa SubArray || any(i -> respar === parent(cdf[i]), parent_cols)) - newdf[!, newname] = copy(res) - else - newdf[!, newname] = res - end + lr = length(res) + _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun) + _add_col_check_copy(df, newdf, col_idx, copycols, fun, newname, res) else if newname === nothing newname = :x1 @@ -575,7 +533,7 @@ julia> select!(df, AsTable(:) => ByRow(mean), renamecols=false) select!(df::DataFrame, args...; renamecols::Bool=true) = _replace_columns!(df, select(df, args..., copycols=false, renamecols=renamecols)) -function select!(arg::Function, df::AbstractDataFrame; renamecols::Bool=true) +function select!(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) if arg isa Colon throw(ArgumentError("Only transformations are allowed when function is a " * "frist argument to select!")) @@ -595,7 +553,7 @@ See [`select!`](@ref) for detailed rules regarding accepted values for `args`. 
transform!(df::DataFrame, args...; renamecols::Bool=true) = select!(df, :, args..., renamecols=renamecols) -function transform!(arg::Function, df::AbstractDataFrame; renamecols::Bool=true) +function transform!(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) if arg isa Colon throw(ArgumentError("Only transformations are allowed when function is a " * "frist argument to transform!")) @@ -726,7 +684,7 @@ julia> select(df, AsTable(:) => ByRow(mean), renamecols=false) select(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) = manipulate(df, args..., copycols=copycols, keeprows=true, renamecols=renamecols) -function select(arg::Function, df::AbstractDataFrame; renamecols::Bool=true) +function select(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) if arg isa Colon throw(ArgumentError("Only transformations are allowed when function is a " * "frist argument to select")) @@ -747,7 +705,7 @@ See [`select`](@ref) for detailed rules regarding accepted values for `args`. 
transform(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) = select(df, :, args..., copycols=copycols, renamecols=renamecols) -function transform(arg::Function, df::AbstractDataFrame; renamecols::Bool=true) +function transform(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) if arg isa Colon throw(ArgumentError("Only transformations are allowed when function is a " * "frist argument to transform")) @@ -790,7 +748,7 @@ julia> combine(df, :a => sum, nrow, renamecols=false) combine(df::AbstractDataFrame, args...; renamecols::Bool=true) = manipulate(df, args..., copycols=true, keeprows=false, renamecols=renamecols) -function combine(arg::Function, df::AbstractDataFrame; renamecols::Bool=true) +function combine(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) if arg isa Colon throw(ArgumentError("Only transformations are allowed when function is a " * "frist argument to combine")) @@ -830,7 +788,7 @@ function manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool, rename copycols, keeprows) end -function _manipulate(df::AbstractDataFrame, normalized_cs, copycols::Bool, keeprows::Bool) +function _manipulate(df::AbstractDataFrame, @nospecialize(normalized_cs), copycols::Bool, keeprows::Bool) @assert !(df isa SubDataFrame && copycols==false) newdf = DataFrame() # the role of transformed_cols is the following diff --git a/test/grouping.jl b/test/grouping.jl index c8839c9fa9..817ca1b255 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -1977,8 +1977,10 @@ end [df DataFrame(x_function=[(-1,), (-2,) ,(-3,) ,(-4,) ,(-5,)], y_function=[(-6,), (-7,) ,(-8,) ,(-9,) ,(-10,)])] - @test_throws ArgumentError combine(gdf, AsTable([:x, :y]) => ByRow(identity)) - @test_throws ArgumentError combine(gdf, AsTable([:x, :y]) => ByRow(x -> df[1, :])) + @test combine(gdf, AsTable([:x, :y]) => ByRow(identity)) == + DataFrame(g=[1,1,1,2,2], x_y_identity=ByRow(identity)((x=1:5, y=6:10))) + @test combine(gdf, AsTable([:x, 
:y]) => ByRow(x -> df[1, :])) == + DataFrame(g=[1,1,1,2,2], x_y_function=fill(df[1, :], 5)) end @testset "test correctness of ungrouping" begin diff --git a/test/select.jl b/test/select.jl index a5aa6dde82..98eb2f3dc1 100644 --- a/test/select.jl +++ b/test/select.jl @@ -1430,6 +1430,79 @@ end @test transform!(sdf -> [10 11; 12 13], copy(df)) == DataFrame(a=1:2, b=3:4, c=5:6, x1=[10, 12], x2=[11, 13]) @test transform!(sdf -> DataFrame(a=10)[1, :], copy(df)) == DataFrame(a=[10, 10], b=3:4, c=5:6) end + + @testset "SELECT(DF, => AsTable)" begin + for df in (DataFrame(a=1:2, b=3:4, c=5:6), view(DataFrame(a=1:3, b=3:5, c=5:7, d=11:13), 1:2, 1:3)) + for fun in (select, combine, transform), + res in (DataFrame(), DataFrame(a=1,b=2)[1, :], ones(1,1), + (a=1,b=2), (a=[1], b=[2]), (a=1, b=[2])) + @test_throws ArgumentError fun(df, :a => x -> res) + @test_throws ArgumentError fun(df, :a => (x -> res) => :z) + end + for res in (DataFrame(x1=1, x2=2)[1, :], (x1=1,x2=2)) + @test select(df, :a => (x -> res) => AsTable) == DataFrame(x1=[1,1], x2=[2,2]) + @test transform(df, :a => (x -> res) => AsTable) == [df DataFrame(x1=[1,1], x2=[2,2])] + @test combine(df, :a => (x -> res) => AsTable) == DataFrame(x1=[1], x2=[2]) + @test select(df, :a => (x -> res) => [:p, :q]) == DataFrame(p=[1,1], q=[2,2]) + @test transform(df, :a => (x -> res) => [:p, :q]) == [df DataFrame(p=[1,1], q=[2,2])] + @test combine(df, :a => (x -> res) => [:p, :q]) == DataFrame(p=[1], q=[2]) + @test_throws ArgumentError select(df, :a => (x -> res) => [:p, :q, :r]) + @test_throws ArgumentError select(df, :a => (x -> res) => [:p]) + end + for res in (DataFrame(x1=1, x2=2), [1 2], Tables.table([1 2], header=[:x1, :x2]), + (x1=[1], x2=[2])) + @test combine(df, :a => (x -> res) => AsTable) == DataFrame(x1=1, x2=2) + @test combine(df, :a => (x -> res) => [:p, :q]) == DataFrame(p=1, q=2) + @test_throws ArgumentError combine(df, :a => (x -> res) => [:p]) + @test_throws ArgumentError select(df, :a => (x -> res) => AsTable) + 
@test_throws ArgumentError transform(df, :a => (x -> res) => AsTable) + end + @test combine(df, :a => ByRow(x -> [x,x+1]), + :a => ByRow(x -> [x, x+1]) => AsTable, + :a => ByRow(x -> [x, x+1]) => [:p, :q], + :a => ByRow(x -> (s=x, t=x+1)) => AsTable, + :a => (x -> (k=x, l=x.+1)) => AsTable, + :a => ByRow(x -> (s=x, t=x+1)) => :z) == + DataFrame(a_function=[[1, 2], [2, 3]], x1=[1, 2], x2=[2, 3], + p=[1, 2], q=[2, 3], s=[1, 2], t=[2, 3], k=[1, 2], l=[2, 3], + z=[(s=1, t=2), (s=2, t=3)]) + @test select(df, :a => ByRow(x -> [x,x+1]), + :a => ByRow(x -> [x, x+1]) => AsTable, + :a => ByRow(x -> [x, x+1]) => [:p, :q], + :a => ByRow(x -> (s=x, t=x+1)) => AsTable, + :a => (x -> (k=x, l=x.+1)) => AsTable, + :a => ByRow(x -> (s=x, t=x+1)) => :z) == + DataFrame(a_function=[[1, 2], [2, 3]], x1=[1, 2], x2=[2, 3], + p=[1, 2], q=[2, 3], s=[1, 2], t=[2, 3], k=[1, 2], l=[2, 3], + z=[(s=1, t=2), (s=2, t=3)]) + @test transform(df, :a => ByRow(x -> [x,x+1]), + :a => ByRow(x -> [x, x+1]) => AsTable, + :a => ByRow(x -> [x, x+1]) => [:p, :q], + :a => ByRow(x -> (s=x, t=x+1)) => AsTable, + :a => (x -> (k=x, l=x.+1)) => AsTable, + :a => ByRow(x -> (s=x, t=x+1)) => :z) == + [df DataFrame(a_function=[[1, 2], [2, 3]], x1=[1, 2], x2=[2, 3], + p=[1, 2], q=[2, 3], s=[1, 2], t=[2, 3], k=[1, 2], l=[2, 3], + z=[(s=1, t=2), (s=2, t=3)])] + @test_throws ArgumentError select(df, :a => (x -> [(a=1,b=2), (a=1, b=2, c=3)]) => AsTable) + @test_throws ArgumentError select(df, :a => (x -> [(a=1,b=2), (a=1, c=3)]) => AsTable) + @test_throws ArgumentError combine(df, :a => (x -> (a=1,b=2)) => :x) + end + end + + @testset "check correctness of duplicate column names" begin + for df in (DataFrame(a=1:2, b=3:4, c=5:6), view(DataFrame(a=1:3, b=3:5, c=5:7, d=11:13), 1:2, 1:3)) + @test select(df, :b, :) == DataFrame(b=3:4, a=1:2, c=5:6) + @test select(df, :b => :c, :) == DataFrame(c=3:4, a=1:2, b=3:4) + @test_throws ArgumentError select(df, :b => [:c, :d], :) + @test_throws ArgumentError select(df, :a, :a => x -> 
(a=[1,2], b=[3,4])) + @test_throws ArgumentError select(df, :a, :a => (x -> (a=[1,2], b=[3,4])) => AsTable) + @test select(df, [:b, :a], :a => (x -> (a=[11,12], b=[13,14])) => AsTable, :) == + DataFrame(b=[13, 14], a=[11, 12], c=[5, 6]) + @test select(df, [:b, :a], :a => (x -> (a=[11,12], b=[13,14])) => [:b, :a], :) == + DataFrame(b=[11, 12], a=[13, 14], c=[5, 6]) + end + end end @testset "empty ByRow" begin From f307d1d8bbb4bf611c47430580717fc2903e5369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 1 Oct 2020 12:43:37 +0200 Subject: [PATCH 10/21] change tests that now work --- test/grouping.jl | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/test/grouping.jl b/test/grouping.jl index 817ca1b255..ea9163980d 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -2700,12 +2700,8 @@ end @test isequal_typed(combine(df, :x => (x -> 1:2) => :y), DataFrame(y=1:2)) @test isequal_typed(combine(df, :x => (x -> x isa Vector{Int} ? "a" : 'a') => :y), DataFrame(y="a")) - - # in the future this should be DataFrame(nrow=0) - @test_throws ArgumentError combine(nrow, df) - - # in the future this should be DataFrame(a=1,b=2) - @test_throws ArgumentError combine(sdf -> DataFrame(a=1,b=2), df) + @test combine(nrow, df) == DataFrame(nrow=0) + @test combine(sdf -> DataFrame(a=1,b=2), df) == DataFrame(a=1,b=2) end @testset "disallowed tuple column selector" begin From 000b5c1f8cc919380612dc8c55baeb11a388ed2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 3 Oct 2020 15:03:09 +0200 Subject: [PATCH 11/21] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/selection.jl | 35 ++++++++++++++---------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 25e7695e43..b045c70bd1 100644 --- a/src/abstractdataframe/selection.jl +++ 
b/src/abstractdataframe/selection.jl @@ -68,8 +68,9 @@ normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, <:AbstractStrin function normalize_selection(idx::AbstractIndex, sel::Pair{<:Any,<:Pair{<:Base.Callable, - <:Union{Symbol, AbstractString, DataType, - AbstractVector{Symbol}, AbstractVector{<:AbstractString}}}}, + <:Union{Symbol, AbstractString, DataType, + AbstractVector{Symbol}, + AbstractVector{<:AbstractString}}}}, renamecols::Bool) lls = last(last(sel)) if lls isa DataType @@ -169,7 +170,7 @@ function normalize_selection(idx::AbstractIndex, return (wanttable ? AsTable(c) : c) => fun => newcol end -function _transformation_helper(df, col_idx, @nospecialize(fun)) +function _transformation_helper(df::AbstractDataFrame, col_idx, @nospecialize(fun)) if col_idx === nothing return fun(df) elseif col_idx isa Int @@ -178,7 +179,7 @@ function _transformation_helper(df, col_idx, @nospecialize(fun)) tbl = Tables.columntable(select(df, col_idx.cols, copycols=false)) if isempty(tbl) && fun isa ByRow if isempty(df) - T = Base.return_types(fun.fun, (NamedTuple{(),Tuple{}},))[1] + T = Core.Compiler.return_type(fun.fun, (NamedTuple{(),Tuple{}},)) return T[] else return [fun.fun(NamedTuple()) for _ in 1:nrow(df)] @@ -211,7 +212,7 @@ function _gen_colnames(@nospecialize(res), newname) colnames = propertynames(res) end - if !(newname === AsTable || newname === nothing) + if newname !== AsTable && newname !== nothing if length(colnames) != length(newname) throw(ArgumentError("Number of returned columns does not match the " * "length of requested output")) @@ -273,7 +274,7 @@ end function _add_col_check_copy(df, newdf, col_idx, copycols, @nospecialize(fun), newname, @nospecialize(v)) cdf = eachcol(df) vpar = parent(v) - parent_cols = col_idx isa AsTable ? col_idx.cols : (col_idx === nothing ? (1:ncol(df)) : col_idx) + parent_cols = col_idx isa AsTable ? 
col_idx.cols : something(col_idx, 1:ncol(df)) if copycols && !(fun isa ByRow) && (v isa SubArray || any(i -> vpar === parent(cdf[i]), parent_cols)) newdf[!, newname] = copy(v) else @@ -313,7 +314,8 @@ function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{I isempty(colnames) && return # nothing to do if any(in(transformed_cols), colnames) - throw(ArgumentError("Duplicate column name returned")) + throw(ArgumentError("Duplicate column name(s) returned: :" * + "$(join(intersect(colnames, transformed_cols), ", :"))")) else startlen = length(transformed_cols) union!(transformed_cols, colnames) @@ -354,7 +356,7 @@ function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{I newname = :x1 end if newname in transformed_cols - throw(ArgumentError("duplicate name of a transformed column")) + throw(ArgumentError("duplicate output column name: :$newname")) else push!(transformed_cols, newname) end @@ -366,7 +368,7 @@ function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{I newname = :x1 end if newname in transformed_cols - throw(ArgumentError("duplicate name of a transformed column")) + throw(ArgumentError("duplicate output column name: :$newname")) else push!(transformed_cols, newname) end @@ -535,8 +537,7 @@ select!(df::DataFrame, args...; renamecols::Bool=true) = function select!(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) if arg isa Colon - throw(ArgumentError("Only transformations are allowed when function is a " * - "frist argument to select!")) + throw(ArgumentError("First argument must be a transformation if the second argument is a data frame")) end return select!(df, arg) end @@ -555,8 +556,7 @@ transform!(df::DataFrame, args...; renamecols::Bool=true) = function transform!(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) if arg isa Colon - throw(ArgumentError("Only transformations are allowed when function is a " * - "frist argument to transform!")) + 
throw(ArgumentError("First argument must be a transformation if the second argument is a data frame")) end return transform!(df, arg) end @@ -686,8 +686,7 @@ select(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=tru function select(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) if arg isa Colon - throw(ArgumentError("Only transformations are allowed when function is a " * - "frist argument to select")) + throw(ArgumentError("First argument must be a transformation if the second argument is a data frame")) end return select(df, arg) end @@ -707,8 +706,7 @@ transform(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool= function transform(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) if arg isa Colon - throw(ArgumentError("Only transformations are allowed when function is a " * - "frist argument to transform")) + throw(ArgumentError("First argument must be a transformation if the second argument is a data frame")) end return transform(df, arg) end @@ -750,8 +748,7 @@ combine(df::AbstractDataFrame, args...; renamecols::Bool=true) = function combine(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) if arg isa Colon - throw(ArgumentError("Only transformations are allowed when function is a " * - "frist argument to combine")) + throw(ArgumentError("First argument 
must be a transformation if the second argument is a data frame")) end return combine(df, arg) end From 87455e2cb66ba80b454a6e8bb81051654fd7e329 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 3 Oct 2020 16:19:46 +0200 Subject: [PATCH 12/21] fixes after code review --- src/abstractdataframe/selection.jl | 169 ++++++++++++++++++----------- test/select.jl | 51 ++++----- 2 files changed, 125 insertions(+), 95 deletions(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index b045c70bd1..361e01b848 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -170,7 +170,9 @@ function normalize_selection(idx::AbstractIndex, return (wanttable ? AsTable(c) : c) => fun => newcol end -function _transformation_helper(df::AbstractDataFrame, col_idx, @nospecialize(fun)) +function _transformation_helper(df::AbstractDataFrame, + col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable}, + @nospecialize(fun)) if col_idx === nothing return fun(df) elseif col_idx isa Int @@ -178,12 +180,7 @@ function _transformation_helper(df::AbstractDataFrame, col_idx, @nospecialize(fu elseif col_idx isa AsTable tbl = Tables.columntable(select(df, col_idx.cols, copycols=false)) if isempty(tbl) && fun isa ByRow - if isempty(df) - T = Core.Compiler.return_type(fun.fun, (NamedTuple{(),Tuple{}},)) - return T[] - else - return [fun.fun(NamedTuple()) for _ in 1:nrow(df)] - end + return [fun.fun(NamedTuple()) for _ in 1:nrow(df)] else return fun(tbl) end @@ -191,12 +188,7 @@ function _transformation_helper(df::AbstractDataFrame, col_idx, @nospecialize(fu # it should be fast enough here as we do not expect to do it millions of times @assert col_idx isa AbstractVector{Int} if isempty(col_idx) && fun isa ByRow - if isempty(df) - T = Base.return_types(fun.fun, ())[1] - return T[] - else - return [fun.fun() for _ in 1:nrow(df)] - end + return [fun.fun() for _ in 1:nrow(df)] else cdf = eachcol(df) return 
fun(map(c -> cdf[c], col_idx)...) @@ -205,7 +197,8 @@ function _transformation_helper(df::AbstractDataFrame, col_idx, @nospecialize(fu throw(ErrorException("unreachable reached")) end -function _gen_colnames(@nospecialize(res), newname) +function _gen_colnames(@nospecialize(res), newname::Union{AbstractVector{Symbol}, + Type{AsTable}, Nothing}) if res isa AbstractMatrix colnames = gennames(size(res, 2)) else @@ -220,25 +213,31 @@ function _gen_colnames(@nospecialize(res), newname) colnames = newname end - return colnames + # fix the type to avoid unnecesarry compilations of methods + # this should be cheap + return colnames isa Vector{Symbol} ? colnames : collect(Symbol, colnames) end -function _expand_to_table(@nospecialize(res)) - if res isa AbstractVector && !isempty(res) - kp1 = keys(res[1]) - all(x -> keys(x) == kp1, res) || throw(ArgumentError("keys of the returned elements must be identical")) - newres = DataFrame() - prepend = all(x -> x isa Integer, kp1) - for n in kp1 - newres[!, prepend ? Symbol("x", n) : Symbol(n)] = [x[n] for x in res] - end - return newres - else - return Tables.columntable(res) +_expand_to_table(res) = Tables.columntable(res) +_expand_to_table(res::Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix}) = res + +function _expand_to_table(res::AbstractVector) + isempty(res) && return Tables.columntable(res) + kp1 = keys(res[1]) + if any(x -> !isequal(keys(x), kp1), res) + throw(ArgumentError("keys of the returned elements must be identical")) + end + newres = DataFrame() + prepend = all(x -> x isa Integer, kp1) + for n in kp1 + newres[!, prepend ? 
Symbol("x", n) : Symbol(n)] = [x[n] for x in res] end + return newres end -function _insert_row_multicolumn(df, newdf, allow_resizing_newdf, colnames, res) +function _insert_row_multicolumn(newdf::DataFrame, df::AbstractDataFrame, + allow_resizing_newdf::Ref{Bool}, colnames::AbstractVector{Symbol}, + res::Union{NamedTuple, DataFrameRow}) if ncol(newdf) == 0 # if allow_resizing_newdf[] is false we know this is select or transform rows = allow_resizing_newdf[] ? 1 : nrow(df) @@ -248,12 +247,14 @@ function _insert_row_multicolumn(df, newdf, allow_resizing_newdf, colnames, res) end @assert length(colnames) == length(res) for (newname, v) in zip(colnames, res) - # note that newdf potentially can contain c in general + # note that newdf potentially can contain newname in general newdf[!, newname] = fill!(Tables.allocatecolumn(typeof(v), rows), v) end end -function _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, @nospecialize(fun)) +function _fix_existing_columns_for_vector(newdf::DataFrame, df::AbstractDataFrame, + allow_resizing_newdf::Ref{Bool}, lr::Int, + @nospecialize(fun)) # allow shortening to 0 rows if allow_resizing_newdf[] && nrow(newdf) == 1 newdfcols = _columns(newdf) @@ -271,7 +272,10 @@ function _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, @ allow_resizing_newdf[] = false end -function _add_col_check_copy(df, newdf, col_idx, copycols, @nospecialize(fun), newname, @nospecialize(v)) +function _add_col_check_copy(newdf::DataFrame, df::AbstractDataFrame, + col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable}, + copycols::Bool, @nospecialize(fun), + newname::Symbol, @nospecialize(v)) cdf = eachcol(df) vpar = parent(v) parent_cols = col_idx isa AsTable ? 
col_idx.cols : something(col_idx, 1:ncol(df)) @@ -282,9 +286,71 @@ function _add_col_check_copy(df, newdf, col_idx, copycols, @nospecialize(fun), n end end +function _add_multicol_res(res::AbstractDataFrame, newdf::DataFrame, df::AbstractDataFrame, + colnames::AbstractVector{Symbol}, + allow_resizing_newdf::Ref{Bool}, @nospecialize(fun), + col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable}, + copycols::Bool, newname::Union{Nothing, Type{AsTable}, AbstractVector{Symbol}}) + lr = nrow(res) + _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun) + @assert length(colnames) == ncol(res) + for (newname, v) in zip(colnames, eachcol(res)) + _add_col_check_copy(newdf, df, col_idx, copycols, fun, newname, v) + end +end + +function _add_multicol_res(res::AbstractMatrix, newdf::DataFrame, df::AbstractDataFrame, + colnames::AbstractVector{Symbol}, + allow_resizing_newdf::Ref{Bool}, @nospecialize(fun), + col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable}, + copycols::Bool, newname::Union{Nothing, Type{AsTable}, AbstractVector{Symbol}}) + lr = size(res, 1) + _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun) + @assert length(colnames) == size(res, 2) + for (i, newname) in enumerate(colnames) + newdf[!, newname] = res[:, i] + end +end + +function _add_multicol_res(res::NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}, + newdf::DataFrame, df::AbstractDataFrame, + colnames::AbstractVector{Symbol}, + allow_resizing_newdf::Ref{Bool}, @nospecialize(fun), + col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable}, + copycols::Bool, newname::Union{Nothing, Type{AsTable}, AbstractVector{Symbol}}) + lr = length(res[1]) + _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun) + @assert length(colnames) == length(res) + for (newname, v) in zip(colnames, res) + _add_col_check_copy(newdf, df, col_idx, copycols, fun, newname, v) + end +end + +function _add_multicol_res(res::NamedTuple, newdf::DataFrame, 
df::AbstractDataFrame, + colnames::AbstractVector{Symbol}, + allow_resizing_newdf::Ref{Bool}, @nospecialize(fun), + col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable}, + copycols::Bool, newname::Union{Nothing, Type{AsTable}, AbstractVector{Symbol}}) + if any(v -> v isa AbstractVector, res) + throw(ArgumentError("mixing single values and vectors in a named tuple is not allowed")) + else + _insert_row_multicolumn(newdf, df, allow_resizing_newdf, colnames, res) + end +end + +function _add_multicol_res(res::DataFrameRow, newdf::DataFrame, df::AbstractDataFrame, + colnames::AbstractVector{Symbol}, + allow_resizing_newdf::Ref{Bool}, @nospecialize(fun), + col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable}, + copycols::Bool, newname::Union{Nothing, Type{AsTable}, AbstractVector{Symbol}}) + _insert_row_multicolumn(newdf, df, allow_resizing_newdf, colnames, res) +end + function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{Int, AbstractVector{Int}, AsTable}, - <:Pair{<:Base.Callable, - <:Union{Symbol, AbstractVector{Symbol}, DataType}}}}), + <:Pair{<:Base.Callable, + <:Union{Symbol, + AbstractVector{Symbol}, + DataType}}}}), df::AbstractDataFrame, newdf::DataFrame, transformed_cols::Set{Symbol}, copycols::Bool, allow_resizing_newdf::Ref{Bool}) @@ -301,8 +367,7 @@ function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{I # in _manipulate, therefore in select_transform! 
such a duplicate should not happen res = _transformation_helper(df, col_idx, fun) - if (newname === AsTable || newname isa AbstractVector{Symbol}) && - !(res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix}) + if (newname === AsTable || newname isa AbstractVector{Symbol}) res = _expand_to_table(res) end @@ -321,36 +386,8 @@ function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{I union!(transformed_cols, colnames) @assert startlen + length(colnames) == length(transformed_cols) end - if res isa AbstractDataFrame - lr = nrow(res) - _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun) - @assert length(colnames) == ncol(res) - for (newname, v) in zip(colnames, eachcol(res)) - _add_col_check_copy(df, newdf, col_idx, copycols, fun, newname, v) - end - elseif res isa AbstractMatrix - lr = size(res, 1) - _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun) - @assert length(colnames) == size(res, 2) - for (i, newname) in enumerate(colnames) - newdf[!, newname] = res[:, i] - end - elseif res isa NamedTuple - if all(v -> v isa AbstractVector, res) - lr = length(res[1]) - _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun) - @assert length(colnames) == length(res) - for (newname, v) in zip(colnames, res) - _add_col_check_copy(df, newdf, col_idx, copycols, fun, newname, v) - end - elseif any(v -> v isa AbstractVector, res) - throw(ArgumentError("mixing single values and vectors in a named tuple is not allowed")) - else - _insert_row_multicolumn(df, newdf, allow_resizing_newdf, colnames, res) - end - elseif res isa DataFrameRow - _insert_row_multicolumn(df, newdf, allow_resizing_newdf, colnames, res) - end + _add_multicol_res(res, newdf, df, colnames, allow_resizing_newdf, fun, + col_idx, copycols, newname) elseif res isa AbstractVector if newname === nothing newname = :x1 @@ -362,7 +399,7 @@ function select_transform!(@nospecialize(nc::Union{Base.Callable, 
Pair{<:Union{I end lr = length(res) _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun) - _add_col_check_copy(df, newdf, col_idx, copycols, fun, newname, res) + _add_col_check_copy(newdf, df, col_idx, copycols, fun, newname, res) else if newname === nothing newname = :x1 diff --git a/test/select.jl b/test/select.jl index 98eb2f3dc1..80e9c4018e 100644 --- a/test/select.jl +++ b/test/select.jl @@ -4,6 +4,10 @@ using DataFrames, Test, Random, Statistics, CategoricalArrays const ≅ = isequal +"""Check if passed data frames are `isequal` and have the same types of columns""" +isequal_coltyped(df1::AbstractDataFrame, df2::AbstractDataFrame) = + isequal(df1, df2) && typeof.(eachcol(df1)) == typeof.(eachcol(df2)) + Random.seed!(1234) @testset "select! Not" begin @@ -887,18 +891,14 @@ end @test select(df2, (:) => (+) => :d, :x1 => (x -> x) => :b, [] => (() -> v) => :a) == DataFrame([6 1 9], [:d, :b, :a]) - res = select(df3, [] => (() -> v) => :a, :x1 => x -> []) - @test propertynames(res) == [:a, :x1_function] && nrow(res) == 0 - @test eltype.(eachcol(res)) == [Int, Any] - res = select(df3, :x1 => x -> [], [] => (() -> v) => :a) - @test propertynames(res) == [:x1_function, :a] && nrow(res) == 0 - @test eltype.(eachcol(res)) == [Any, Int] - res = select(df3, [] => (() -> v) => :a, :x1) - @test propertynames(res) == [:a, :x1] && nrow(res) == 0 - @test eltype.(eachcol(res)) == [Int, Char] - res = select(df3, :x1, [] => (() -> v) => :a) - @test propertynames(res) == [:x1, :a] && nrow(res) == 0 - @test eltype.(eachcol(res)) == [Char, Int] + @test isequal_coltyped(select(df3, [] => (() -> v) => :a, :x1 => x -> []), + DataFrame(a=Int[], x1_function=Any[])) + @test isequal_coltyped(select(df3, :x1 => x -> [], [] => (() -> v) => :a), + DataFrame(x1_function=Any[], a=Int[])) + @test isequal_coltyped(select(df3, [] => (() -> v) => :a, :x1), + DataFrame(a=Int[], x1=Char[])) + @test isequal_coltyped(select(df3, :x1, [] => (() -> v) => :a), + DataFrame(x1=Char[], 
a=Int[])) end @test_throws ArgumentError select(df, [] => (() -> [9]) => :a, :) @test_throws ArgumentError select(df, :, [] => (() -> [9]) => :a) @@ -1328,10 +1328,7 @@ end end @testset "additional tests for new rules" begin -# select select! transform transform! combine -# Union{Type{AsTable}, Symbol, AbstractVector{Symbol}, AbstractString, AbstractVector{<:AbstractString}} -# DataFrame, SubDataFrame - @testset "SELECT(FUN, DF)" begin + @testset "transformation function with a function as first argument" begin for df in (DataFrame(a=1:2, b=3:4, c=5:6), view(DataFrame(a=1:3, b=3:5, c=5:7, d=11:13), 1:2, 1:3)) @test select(sdf -> sdf.b, df) == DataFrame(x1=3:4) @test select(sdf -> (b = 2sdf.b,), df) == DataFrame(b=[6,8]) @@ -1431,7 +1428,7 @@ end @test transform!(sdf -> DataFrame(a=10)[1, :], copy(df)) == DataFrame(a=[10, 10], b=3:4, c=5:6) end - @testset "SELECT(DF, => AsTable)" begin + @testset "transformation function with multiple columns as destination" begin for df in (DataFrame(a=1:2, b=3:4, c=5:6), view(DataFrame(a=1:3, b=3:5, c=5:7, d=11:13), 1:2, 1:3)) for fun in (select, combine, transform), res in (DataFrame(), DataFrame(a=1,b=2)[1, :], ones(1,1), @@ -1524,12 +1521,10 @@ end @test eltype(combine(df, [] => ByRow(() -> 1)).function) == Int @test eltype(transform(df, [] => ByRow(() -> 1)).function) == Int - df2 = select(df, [] => ByRow(() -> (a=1,b="1")) => AsTable) - @test names(df2) == ["a", "b"] - @test eltype.(eachcol(df2)) == [Int, String] - df2 = select(df, [] => ByRow(() -> (a=1,b="1")) => [:p, :q]) - @test names(df2) == ["p", "q"] - @test eltype.(eachcol(df2)) == [Int, String] + @test isequal_coltyped(select(df, [] => ByRow(() -> (a=1,b="1")) => AsTable), + DataFrame(a=Int[], b=String[])) + @test isequal_coltyped(select(df, [] => ByRow(() -> (a=1,b="1")) => [:p, :q]), + DataFrame(p=Int[], q=String[])) # here this follows Tables.jl behavior for res in ([1, "1"], (1, "1")) @@ -1554,12 +1549,10 @@ end @test eltype(combine(df, AsTable([]) => ByRow(x -> 
1)).function) == Int @test eltype(transform(df, AsTable([]) => ByRow(x -> 1)).function) == Int - df2 = select(df, AsTable([]) => ByRow(x -> (a=1,b="1")) => AsTable) - @test names(df2) == ["a", "b"] - @test eltype.(eachcol(df2)) == [Int, String] - df2 = select(df, AsTable([]) => ByRow(x -> (a=1,b="1")) => [:p, :q]) - @test names(df2) == ["p", "q"] - @test eltype.(eachcol(df2)) == [Int, String] + @test isequal_coltyped(select(df, AsTable([]) => ByRow(x -> (a=1,b="1")) => AsTable), + DataFrame(a=Int[], b=String[])) + @test isequal_coltyped(select(df, AsTable([]) => ByRow(x -> (a=1,b="1")) => [:p, :q]), + DataFrame(p=Int[], q=String[])) # here this follows Tables.jl behavior for res in ([1, "1"], (1, "1")) From e16e17a0b288dd3a2c722c82db4ed87e511f3111 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 3 Oct 2020 23:32:31 +0200 Subject: [PATCH 13/21] Update src/abstractdataframe/selection.jl Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/selection.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 361e01b848..7dfb0c0032 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -373,7 +373,7 @@ function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{I if res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix} if newname isa Symbol - throw(ArgumentError("Table returned while a single column return value was requested")) + throw(ArgumentError("Table returned but a single output column was expected")) end colnames = _gen_colnames(res, newname) isempty(colnames) && return # nothing to do From 0ef36cfd2c11c59aea14f93cd8b04d241a8c92c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 3 Oct 2020 23:41:46 +0200 Subject: [PATCH 14/21] Update src/abstractdataframe/selection.jl --- src/abstractdataframe/selection.jl | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 7dfb0c0032..7ef80a48a5 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -275,7 +275,7 @@ end function _add_col_check_copy(newdf::DataFrame, df::AbstractDataFrame, col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable}, copycols::Bool, @nospecialize(fun), - newname::Symbol, @nospecialize(v)) + newname::Symbol, v::AbstractVector) cdf = eachcol(df) vpar = parent(v) parent_cols = col_idx isa AsTable ? col_idx.cols : something(col_idx, 1:ncol(df)) From 7679adee2edd3913fbaae77502ff82694d1f5ac9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 4 Oct 2020 12:48:01 +0200 Subject: [PATCH 15/21] Update src/abstractdataframe/selection.jl Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/selection.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 7ef80a48a5..4c335c65fc 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -367,7 +367,7 @@ function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{I # in _manipulate, therefore in select_transform! 
such a duplicate should not happen res = _transformation_helper(df, col_idx, fun) - if (newname === AsTable || newname isa AbstractVector{Symbol}) + if newname === AsTable || newname isa AbstractVector{Symbol} res = _expand_to_table(res) end From 2ebe6889ebd38bbfb8c280e30b4d938603e520dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 5 Oct 2020 11:17:09 +0200 Subject: [PATCH 16/21] Update src/abstractdataframe/selection.jl Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/selection.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 4c335c65fc..4f52c4b3af 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -213,7 +213,7 @@ function _gen_colnames(@nospecialize(res), newname::Union{AbstractVector{Symbol} colnames = newname end - # fix the type to avoid unnecesarry compilations of methods + # fix the type to avoid unnecessary compilations of methods # this should be cheap return colnames isa Vector{Symbol} ? 
colnames : collect(Symbol, colnames) end From e1141bec10f743463a4fbdd12c0156d56474b132 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 5 Oct 2020 11:51:05 +0200 Subject: [PATCH 17/21] make rules for keys in AsTable on vector stricter --- NEWS.md | 4 ++++ src/abstractdataframe/selection.jl | 6 +++++- test/select.jl | 10 ++++------ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index e8acee6440..f5037937eb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,10 @@ ## Breaking changes +* the rules for transformations passed to `select`/`select!`, `transform`/`transform!`, + and `combine` have been made more flexible; in particular now it is allowed to + return multiple columns from a transformation function + [#2461](https://github.com/JuliaData/DataFrames.jl/pull/2461) * CategoricalArrays.jl is no longer reexported: call `using CategoricalArrays` to use it [#2404]((https://github.com/JuliaData/DataFrames.jl/pull/2404)). In the same vein, the `categorical` and `categorical!` functions diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 4f52c4b3af..1724f68027 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -224,11 +224,15 @@ _expand_to_table(res::Union{AbstractDataFrame, NamedTuple, DataFrameRow, Abstrac function _expand_to_table(res::AbstractVector) isempty(res) && return Tables.columntable(res) kp1 = keys(res[1]) + prepend = all(x -> x isa Integer, kp1) + if !(prepend || all(x -> x isa Symbol, kp1) || all(x -> x isa AbstractString, kp1)) + throw(ArgumentError("keys of the returned elements must be " * + "`Symbol`s, strings or integers")) + end if any(x -> !isequal(keys(x), kp1), res) throw(ArgumentError("keys of the returned elements must be identical")) end newres = DataFrame() - prepend = all(x -> x isa Integer, kp1) for n in kp1 newres[!, prepend ? 
Symbol("x", n) : Symbol(n)] = [x[n] for x in res] end diff --git a/test/select.jl b/test/select.jl index 80e9c4018e..23c2d0674b 100644 --- a/test/select.jl +++ b/test/select.jl @@ -734,11 +734,9 @@ end @test select(df, :x => ByRow(x -> retval) => AsTable) == DataFrame(;retval...) elseif retval isa DataFrame @test_throws MethodError select(df, :x => ByRow(x -> retval) => AsTable) - else # Matrix; surprising but following the API - @test select(df, :x => ByRow(x -> retval) => AsTable) == - DataFrame(["CartesianIndex($i, $j)" => 1.0 for i in 1:2, j in 1:2]...) - @test select(df, :x => ByRow(x -> retval) => [:a, :b, :c, :d]) == - DataFrame(a=1.0, b=1.0, c=1.0, d=1.0) + else # Matrix: wrong type of keys + @test_throws ArgumentError select(df, :x => ByRow(x -> retval) => AsTable) + @test_throws ArgumentError select(df, :x => ByRow(x -> retval) => [:a, :b, :c, :d]) end end @@ -748,7 +746,7 @@ end if retval isa Tuple @test select(df, :x => ByRow(x -> retval) => AsTable) == DataFrame(x1=1, x2=2) else - @test select(df, :x => ByRow(x -> retval) => Symbol.("x", 1:8)) == DataFrame(ones(1, 8)) + @test_throws ArgumentError select(df, :x => ByRow(x -> retval) => AsTable) end cdf = copy(df) select!(cdf, :x => x -> retval) From df1dd1922a8a125b01cbd56b5af18d0fb512d111 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 5 Oct 2020 12:38:05 +0200 Subject: [PATCH 18/21] update docstrings and the manual --- docs/src/lib/types.md | 3 +- docs/src/man/getting_started.md | 8 + src/abstractdataframe/selection.jl | 271 ++++++++++++++++++++++------- 3 files changed, 220 insertions(+), 62 deletions(-) diff --git a/docs/src/lib/types.md b/docs/src/lib/types.md index 0bb84afaa4..5073229c05 100644 --- a/docs/src/lib/types.md +++ b/docs/src/lib/types.md @@ -55,7 +55,8 @@ The `ByRow` type is a special type used for selection operations to signal that to each element (row) of the selection. 
The `AsTable` type is a special type used for selection operations to signal that the columns selected by a wrapped -selector should be passed as a `NamedTuple` to the function. +selector should be passed as a `NamedTuple` to the function or to signal that it is requested +to expand the return value of a transformation into multiple columns. ## [The design of handling of columns of a `DataFrame`](@id man-columnhandling) diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md index e6d697b947..df6b8bd899 100644 --- a/docs/src/man/getting_started.md +++ b/docs/src/man/getting_started.md @@ -627,6 +627,14 @@ julia> select(df, :x2, :x2 => ByRow(sqrt)) # transform columns by row ├─────┼───────┼─────────┤ │ 1 │ 3 │ 1.73205 │ │ 2 │ 4 │ 2.0 │ + +julia> select(df, AsTable(:) => ByRow(extrema) => [:lo, :hi]) # return multiple columns +2×2 DataFrame +│ Row │ lo │ hi │ +│ │ Int64 │ Int64 │ +├─────┼───────┼───────┤ +│ 1 │ 1 │ 5 │ +│ 2 │ 2 │ 6 │ ``` It is important to note that `select` always returns a data frame, diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 1724f68027..82f65226d5 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -432,35 +432,41 @@ SELECT_ARG_RULES = * Any index that is allowed for column indexing ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). + * A function or a type * Column transformation operations using the `Pair` notation that is - described below and vectors of such pairs. + described below and vectors or matrices of such pairs. Columns can be renamed using the `old_column => new_column_name` syntax, and transformed using the `old_column => fun => new_column_name` syntax. - `new_column_name` must be a `Symbol` or a string, and `fun` a function or a - type. If `old_column` is a `Symbol`, a string, or an integer then `fun` is - applied to the corresponding column vector. 
Otherwise `old_column` can be - any column indexing syntax, in which case `fun` will be passed the column - vectors specified by `old_column` as separate arguments. The only exception - is when `old_column` is an `AsTable` type wrapping a selector, in which case - `fun` is passed a `NamedTuple` containing the selected columns. + `new_column_name` must be a `Symbol` or a string, a vector of `Symbol` or + string, or `AsTable`, and `fun` a function or a type. If `old_column` is a + `Symbol`, a string, or an integer then `fun` is applied to the corresponding + column vector. Otherwise `old_column` can be any column indexing syntax, in + which case `fun` will be passed the column vectors specified by `old_column` + as separate arguments. The only exception is when `old_column` is an + `AsTable` type wrapping a selector, in which case `fun` is passed a + `NamedTuple` containing the selected columns. + + Column renaming and transformation operations can be passed wrapped in + vectors or matrices (this is useful when combined with broadcasting). + + # Rules when `new_column_name` is a `Symbol` or a string or is missing If `fun` returns a value of type other than `AbstractVector` then it will be broadcasted into a vector matching the target number of rows in the data frame, unless its type is one of `AbstractDataFrame`, `NamedTuple`, - `DataFrameRow`, `AbstractMatrix`, in which case an error is thrown as - currently these return types are not allowed. As a particular rule, values - wrapped in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and - then broadcasted. + `DataFrameRow`, `AbstractMatrix`, in which case an error is thrown. As a + particular rule, values wrapped in a `Ref` or a `0`-dimensional + `AbstractArray` are unwrapped and then broadcasted. To apply `fun` to each row instead of whole columns, it can be wrapped in a `ByRow` struct. 
In this case if `old_column` is a `Symbol`, a string, or an integer then `fun` is applied to each element (row) of `old_column` using broadcasting. Otherwise `old_column` can be any column indexing syntax, in which case `fun` will be passed one argument for each of the columns - specified by `old_column`. If `ByRow` is used it is not allowed for - `old_column` to select an empty set of columns nor for `fun` to return a - `NamedTuple` or a `DataFrameRow`. + specified by `old_column`. If `ByRow` is used it is allowed for + `old_column` to select an empty set of columns, in which case no arguments + are passed to `fun` for each row. Column transformation can also be specified using the short `old_column => fun` form. In this case, `new_column_name` is automatically generated as @@ -473,8 +479,52 @@ SELECT_ARG_RULES = It is not allowed to pass `renamecols=false` if `old_column` is empty as it would generate an empty column name. - Column renaming and transformation operations can be passed wrapped in - vectors (this is useful when combined with broadcasting). + # Rules when `new_column_name` is a vector of `Symbol` or a string or is `AsTable` + + In this case it is assumed that `fun` returns multiple columns. + + If `fun` returns one of `AbstractDataFrame`, `NamedTuple`, `DataFrameRow`, + `AbstractMatrix` then rules described below for `args` being a function or + a type apply. + + If `fun` returns an `AbstractVector` then each element of this vector must + support `keys` function that must return a collection of `Symbol`s, strings + or integers; the return value of `keys` must be identical for all elements. + Then as many columns are created as there are elements in the return value + of the `keys` function and their names are set to be equal to the key names, + except if `keys` returns integers, in which case they are prefixed by `x` + (so the column names are e.g. `x1`, `x2`, ...) 
+ + If `fun` returns a value of any other type then it is assumed that it is + a table conforming to Tables.jl API and the `Tables.columntable` function is + called on it to get the resulting columns and their names. + + Additionally if `new_column_name` is a vector of `Symbol` or string then column + names produced using the rules above are ignored and replaced by `new_column_name` + (the number of columns must be the same as the length `new_column_name` in this case). + + # Rules when element of `args` is a function or a type + + In this case a transformaton is passed `df` as a single argument. + + If the return value of the transformation is of `AbstractDataFrame`, + `NamedTuple`, `DataFrameRow` or `AbstractMatrix` then it is treated as + containing multiple columns. For `AbstractMatrix` column names are generated + as `x1`, `x2`, etc. For `AbstractDataFrame`, `NamedTuple` of vectors and + `AbstractMatrix` the columns are taken as is from the returned value. For + `DataFrameRow` and` NamedTuple` not containing any vectors the returned + value is broadcasted a vector matching the target number of rows in the data + frame. + + If the return value is an `AbstractVector` then it is used as-is. The resulting + column gets a name `x1`. + + In all other cases the return value is broadcasted into a vector matching + the target number of rows in the data frame. As a particular rule, values + wrapped in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and + then broadcasted. The resulting column gets a name `x1`. + + # Special rules As a special rule passing `nrow` without specifying `old_column` creates a column named `:nrow` containing a number of rows in a source data frame, and @@ -493,6 +543,7 @@ SELECT_ARG_RULES = """ select!(df::DataFrame, args...; renamecols::Bool=true) + select!(args::Callable, df::DataFrame; renamecols::Bool=true) Mutate `df` in place to retain only columns specified by `args...` and return it. 
The result is guaranteed to have the same number of rows as `df`, except when no @@ -547,16 +598,16 @@ julia> select!(df, :, [:c, :b] => (c,b) -> c .+ b .- sum(b)/length(b)) julia> df = DataFrame(a=1:3, b=4:6); -julia> select!(df, names(df) .=> sum); +julia> select!(df, names(df) .=> [minimum maximum]); julia> df -3×2 DataFrame -│ Row │ a_sum │ b_sum │ -│ │ Int64 │ Int64 │ -├─────┼───────┼───────┤ -│ 1 │ 6 │ 15 │ -│ 2 │ 6 │ 15 │ -│ 3 │ 6 │ 15 │ +3×4 DataFrame +│ Row │ a_minimum │ b_minimum │ a_maximum │ b_maximum │ +│ │ Int64 │ Int64 │ Int64 │ Int64 │ +├─────┼───────────┼───────────┼───────────┼───────────┤ +│ 1 │ 1 │ 4 │ 3 │ 6 │ +│ 2 │ 1 │ 4 │ 3 │ 6 │ +│ 3 │ 1 │ 4 │ 3 │ 6 │ julia> df = DataFrame(a=1:3, b=4:6); @@ -570,6 +621,36 @@ julia> select!(df, AsTable(:) => ByRow(mean), renamecols=false) │ 1 │ 2.5 │ │ 2 │ 3.5 │ │ 3 │ 4.5 │ + +julia> df = DataFrame(a=1:3, b=4:6); + +julia> select!(first, df) +3×2 DataFrame +│ Row │ a │ b │ +│ │ Int64 │ Int64 │ +├─────┼───────┼───────┤ +│ 1 │ 1 │ 4 │ +│ 2 │ 1 │ 4 │ +│ 3 │ 1 │ 4 │ + +julia> df = DataFrame(a=1:3, b=4:6, c=7:9) +3×3 DataFrame +│ Row │ a │ b │ c │ +│ │ Int64 │ Int64 │ Int64 │ +├─────┼───────┼───────┼───────┤ +│ 1 │ 1 │ 4 │ 7 │ +│ 2 │ 2 │ 5 │ 8 │ +│ 3 │ 3 │ 6 │ 9 │ + +julia> select!(df, AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => :stats, + AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => AsTable) +3×3 DataFrame +│ Row │ stats │ mean │ std │ +│ │ NamedTuple… │ Float64 │ Float64 │ +├─────┼─────────────────────────┼─────────┼─────────┤ +│ 1 │ (mean = 4.0, std = 3.0) │ 4.0 │ 3.0 │ +│ 2 │ (mean = 5.0, std = 3.0) │ 5.0 │ 3.0 │ +│ 3 │ (mean = 6.0, std = 3.0) │ 6.0 │ 3.0 │ ``` """ @@ -585,6 +666,7 @@ end """ transform!(df::DataFrame, args...; renamecols::Bool=true) + transform!(args::Callable, df::DataFrame; renamecols::Bool=true) Mutate `df` in place to add columns specified by `args...` and return it. The result is guaranteed to have the same number of rows as `df`. 
@@ -604,6 +686,7 @@ end """ select(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) + select(args::Callable, df::DataFrame; renamecols::Bool=true) Create a new data frame that contains columns from `df` specified by `args` and return it. The result is guaranteed to have the same number of rows as `df`, @@ -648,7 +731,7 @@ julia> df = DataFrame(a=1:3, b=4:6) │ 2 │ 2 │ 5 │ │ 3 │ 3 │ 6 │ -julia> select(df, :b) +julia> select(df, 2) 3×1 DataFrame │ Row │ b │ │ │ Int64 │ @@ -657,24 +740,6 @@ julia> select(df, :b) │ 2 │ 5 │ │ 3 │ 6 │ -julia> select(df, Not(:b)) # drop column :b from df -3×1 DataFrame -│ Row │ a │ -│ │ Int64 │ -├─────┼───────┤ -│ 1 │ 1 │ -│ 2 │ 2 │ -│ 3 │ 3 │ - -julia> select(df, :a => :c, :b) -3×2 DataFrame -│ Row │ c │ b │ -│ │ Int64 │ Int64 │ -├─────┼───────┼───────┤ -│ 1 │ 1 │ 4 │ -│ 2 │ 2 │ 5 │ -│ 3 │ 3 │ 6 │ - julia> select(df, :a => ByRow(sin) => :c, :b) 3×2 DataFrame │ Row │ c │ b │ @@ -693,23 +758,16 @@ julia> select(df, :, [:a, :b] => (a,b) -> a .+ b .- sum(b)/length(b)) │ 2 │ 2 │ 5 │ 2.0 │ │ 3 │ 3 │ 6 │ 4.0 │ -julia> select(df, names(df) .=> sum) -3×2 DataFrame -│ Row │ a_sum │ b_sum │ -│ │ Int64 │ Int64 │ -├─────┼───────┼───────┤ -│ 1 │ 6 │ 15 │ -│ 2 │ 6 │ 15 │ -│ 3 │ 6 │ 15 │ +julia> select(df, names(df) .=> [minimum maximum]) +3×4 DataFrame +│ Row │ a_minimum │ b_minimum │ a_maximum │ b_maximum │ +│ │ Int64 │ Int64 │ Int64 │ Int64 │ +├─────┼───────────┼───────────┼───────────┼───────────┤ +│ 1 │ 1 │ 4 │ 3 │ 6 │ +│ 2 │ 1 │ 4 │ 3 │ 6 │ +│ 3 │ 1 │ 4 │ 3 │ 6 │ -julia> select(df, names(df) .=> sum .=> [:A, :B]) -3×2 DataFrame -│ Row │ A │ B │ -│ │ Int64 │ Int64 │ -├─────┼───────┼───────┤ -│ 1 │ 6 │ 15 │ -│ 2 │ 6 │ 15 │ -│ 3 │ 6 │ 15 │ +julia> using Statistics julia> select(df, AsTable(:) => ByRow(mean), renamecols=false) 3×1 DataFrame @@ -719,6 +777,34 @@ julia> select(df, AsTable(:) => ByRow(mean), renamecols=false) │ 1 │ 2.5 │ │ 2 │ 3.5 │ │ 3 │ 4.5 │ + +julia> select(first, df) +3×2 DataFrame +│ Row │ a │ b │ +│ │ 
Int64 │ Int64 │ +├─────┼───────┼───────┤ +│ 1 │ 1 │ 4 │ +│ 2 │ 1 │ 4 │ +│ 3 │ 1 │ 4 │ + +julia> df = DataFrame(a=1:3, b=4:6, c=7:9) +3×3 DataFrame +│ Row │ a │ b │ c │ +│ │ Int64 │ Int64 │ Int64 │ +├─────┼───────┼───────┼───────┤ +│ 1 │ 1 │ 4 │ 7 │ +│ 2 │ 2 │ 5 │ 8 │ +│ 3 │ 3 │ 6 │ 9 │ + +julia> select(df, AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => :stats, + AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => AsTable) +3×3 DataFrame +│ Row │ stats │ mean │ std │ +│ │ NamedTuple… │ Float64 │ Float64 │ +├─────┼─────────────────────────┼─────────┼─────────┤ +│ 1 │ (mean = 4.0, std = 3.0) │ 4.0 │ 3.0 │ +│ 2 │ (mean = 5.0, std = 3.0) │ 5.0 │ 3.0 │ +│ 3 │ (mean = 6.0, std = 3.0) │ 6.0 │ 3.0 │ ``` """ @@ -734,6 +820,7 @@ end """ transform(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) + transform(args::Callable, df::DataFrame; renamecols::Bool=true) Create a new data frame that contains columns from `df` and adds columns specified by `args` and return it. @@ -754,7 +841,7 @@ end """ combine(df::AbstractDataFrame, args...; renamecols::Bool=true) - combine(arg, df::AbstractDataFrame; renamecols::Bool=true) + combine(args::Callable, df::AbstractDataFrame; renamecols::Bool=true) Create a new data frame that contains columns from `df` specified by `args` and return it. 
The result can have any number of rows that is determined by the @@ -782,6 +869,68 @@ julia> combine(df, :a => sum, nrow, renamecols=false) │ │ Int64 │ Int64 │ ├─────┼───────┼───────┤ │ 1 │ 6 │ 3 │ + +julia> combine(df, :a => ByRow(sin) => :c, :b) +3×2 DataFrame +│ Row │ c │ b │ +│ │ Float64 │ Int64 │ +├─────┼──────────┼───────┤ +│ 1 │ 0.841471 │ 4 │ +│ 2 │ 0.909297 │ 5 │ +│ 3 │ 0.14112 │ 6 │ + +julia> combine(df, :, [:a, :b] => (a,b) -> a .+ b .- sum(b)/length(b)) +3×3 DataFrame +│ Row │ a │ b │ a_b_function │ +│ │ Int64 │ Int64 │ Float64 │ +├─────┼───────┼───────┼──────────────┤ +│ 1 │ 1 │ 4 │ 0.0 │ +│ 2 │ 2 │ 5 │ 2.0 │ +│ 3 │ 3 │ 6 │ 4.0 │ + +julia> combine(df, names(df) .=> [minimum maximum]) +1×4 DataFrame +│ Row │ a_minimum │ b_minimum │ a_maximum │ b_maximum │ +│ │ Int64 │ Int64 │ Int64 │ Int64 │ +├─────┼───────────┼───────────┼───────────┼───────────┤ +│ 1 │ 1 │ 4 │ 3 │ 6 │ + +julia> using Statistics + +julia> combine(df, AsTable(:) => ByRow(mean), renamecols=false) +3×1 DataFrame +│ Row │ a_b │ +│ │ Float64 │ +├─────┼─────────┤ +│ 1 │ 2.5 │ +│ 2 │ 3.5 │ +│ 3 │ 4.5 │ + +julia> combine(first, df) +1×2 DataFrame +│ Row │ a │ b │ +│ │ Int64 │ Int64 │ +├─────┼───────┼───────┤ +│ 1 │ 1 │ 4 │ + +julia> df = DataFrame(a=1:3, b=4:6, c=7:9) +3×3 DataFrame +│ Row │ a │ b │ c │ +│ │ Int64 │ Int64 │ Int64 │ +├─────┼───────┼───────┼───────┤ +│ 1 │ 1 │ 4 │ 7 │ +│ 2 │ 2 │ 5 │ 8 │ +│ 3 │ 3 │ 6 │ 9 │ + +julia> combine(df, AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => :stats, + AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => AsTable) +3×3 DataFrame +│ Row │ stats │ mean │ std │ +│ │ NamedTuple… │ Float64 │ Float64 │ +├─────┼─────────────────────────┼─────────┼─────────┤ +│ 1 │ (mean = 4.0, std = 3.0) │ 4.0 │ 3.0 │ +│ 2 │ (mean = 5.0, std = 3.0) │ 5.0 │ 3.0 │ +│ 3 │ (mean = 6.0, std = 3.0) │ 6.0 │ 3.0 │ ``` """ combine(df::AbstractDataFrame, args...; renamecols::Bool=true) = From 1e1f87ca99b6423d8fe64519e7b21a459b9c79f3 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 7 Oct 2020 09:14:05 +0200 Subject: [PATCH 19/21] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/selection.jl | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 82f65226d5..68053d47de 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -438,8 +438,8 @@ SELECT_ARG_RULES = Columns can be renamed using the `old_column => new_column_name` syntax, and transformed using the `old_column => fun => new_column_name` syntax. - `new_column_name` must be a `Symbol` or a string, a vector of `Symbol` or - string, or `AsTable`, and `fun` a function or a type. If `old_column` is a + `new_column_name` must be a `Symbol` or a string, a vector of `Symbol`s or + strings, or `AsTable`. `fun` must be a function or a type. If `old_column` is a `Symbol`, a string, or an integer then `fun` is applied to the corresponding column vector. Otherwise `old_column` can be any column indexing syntax, in which case `fun` will be passed the column vectors specified by `old_column` @@ -450,7 +450,7 @@ SELECT_ARG_RULES = Column renaming and transformation operations can be passed wrapped in vectors or matrices (this is useful when combined with broadcasting). - # Rules when `new_column_name` is a `Symbol` or a string or is missing + # Rules when `new_column_name` is a `Symbol` or a string or is absent If `fun` returns a value of type other than `AbstractVector` then it will be broadcasted into a vector matching the target number of rows in the data @@ -465,8 +465,8 @@ SELECT_ARG_RULES = broadcasting. Otherwise `old_column` can be any column indexing syntax, in which case `fun` will be passed one argument for each of the columns specified by `old_column`. 
If `ByRow` is used it is allowed for - `old_column` to select an empty set of columns, in which case no arguments - are passed to `fun` for each row. + `old_column` to select an empty set of columns, in which case `fun` + is called for each row without any arguments. Column transformation can also be specified using the short `old_column => fun` form. In this case, `new_column_name` is automatically generated as @@ -479,7 +479,7 @@ SELECT_ARG_RULES = It is not allowed to pass `renamecols=false` if `old_column` is empty as it would generate an empty column name. - # Rules when `new_column_name` is a vector of `Symbol` or a string or is `AsTable` + # Rules when `new_column_name` is a vector of `Symbol`s or strings or is `AsTable` In this case it is assumed that `fun` returns multiple columns. @@ -493,36 +493,36 @@ SELECT_ARG_RULES = Then as many columns are created as there are elements in the return value of the `keys` function and their names are set to be equal to the key names, except if `keys` returns integers, in which case they are prefixed by `x` - (so the column names are e.g. `x1`, `x2`, ...) + (so the column names are e.g. `x1`, `x2`, ...). If `fun` returns a value of any other type then it is assumed that it is - a table conforming to Tables.jl API and the `Tables.columntable` function is + a table conforming to the Tables.jl API and the `Tables.columntable` function is called on it to get the resulting columns and their names. - Additionally if `new_column_name` is a vector of `Symbol` or string then column + Additionally if `new_column_name` is a vector of `Symbol`s or strings then column names produced using the rules above are ignored and replaced by `new_column_name` - (the number of columns must be the same as the length `new_column_name` in this case). + (the number of columns must be the same as the length of `new_column_name` in this case). 
# Rules when element of `args` is a function or a type - In this case a transformaton is passed `df` as a single argument. + In this case the function or type is called with `df` as a single argument. - If the return value of the transformation is of `AbstractDataFrame`, + If the return value of the transformation is one of `AbstractDataFrame`, `NamedTuple`, `DataFrameRow` or `AbstractMatrix` then it is treated as containing multiple columns. For `AbstractMatrix` column names are generated as `x1`, `x2`, etc. For `AbstractDataFrame`, `NamedTuple` of vectors and `AbstractMatrix` the columns are taken as is from the returned value. For `DataFrameRow` and `NamedTuple` not containing any vectors the returned - value is broadcasted a vector matching the target number of rows in the data + value is broadcasted to a vector matching the target number of rows in the data frame. If the return value is an `AbstractVector` then it is used as-is. The resulting - column gets a name `x1`. + column gets the name `x1`. In all other cases the return value is broadcasted into a vector matching the target number of rows in the data frame. As a particular rule, values wrapped in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and - then broadcasted. The resulting column gets a name `x1`. + then broadcasted. The resulting column gets the name `x1`.
# Special rules @@ -923,7 +923,7 @@ julia> df = DataFrame(a=1:3, b=4:6, c=7:9) │ 3 │ 3 │ 6 │ 9 │ julia> combine(df, AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => :stats, - AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => AsTable) + AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => AsTable) 3×3 DataFrame │ Row │ stats │ mean │ std │ │ │ NamedTuple… │ Float64 │ Float64 │ From 58b9d66d6ce842fa0d1af85ffddf2258dfb8cbae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 7 Oct 2020 09:35:13 +0200 Subject: [PATCH 20/21] update docs --- src/abstractdataframe/selection.jl | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 68053d47de..cb1be3d330 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -484,24 +484,26 @@ SELECT_ARG_RULES = In this case it is assumed that `fun` returns multiple columns. If `fun` returns one of `AbstractDataFrame`, `NamedTuple`, `DataFrameRow`, - `AbstractMatrix` then rules described below for `args` being a function or - a type apply. + `AbstractMatrix` then rules described in the section describing the case + when `args` is a function or a type apply. If `fun` returns an `AbstractVector` then each element of this vector must support `keys` function that must return a collection of `Symbol`s, strings or integers; the return value of `keys` must be identical for all elements. Then as many columns are created as there are elements in the return value - of the `keys` function and their names are set to be equal to the key names, - except if `keys` returns integers, in which case they are prefixed by `x` - (so the column names are e.g. `x1`, `x2`, ...). 
- - If `fun` returns a value of any other type then it is assumed that it is - a table conforming to the Tables.jl API and the `Tables.columntable` function is - called on it to get the resulting columns and their names. - - Additionally if `new_column_name` is a vector of `Symbol`s or strings then column - names produced using the rules above are ignored and replaced by `new_column_name` - (the number of columns must be the same as the length of `new_column_name` in this case). + of the `keys` function. If `new_column_name` is `AsTable` then their names + are set to be equal to the key names except if `keys` returns integers, in + which case they are prefixed by `x` (so the column names are e.g. `x1`, + `x2`, ...). If `new_column_name` is a vector of `Symbol`s or strings then + column names produced using the rules above are ignored and replaced by + `new_column_name` (the number of columns must be the same as the length of + `new_column_name` in this case). + + If `fun` returns a value of any other type then it is assumed that it is a + table conforming to the Tables.jl API and the `Tables.columntable` function + is called on it to get the resulting columns and their names. The names are + retained when `new_column_name` is `AsTable` and are replaced if + `new_column_name` is a vector of `Symbol`s or strings. 
# Rules when element of `args` is a function or a type From 2cdeb7da927c3be8e74a9c1610562004c2c3754a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 7 Oct 2020 22:37:28 +0200 Subject: [PATCH 21/21] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/selection.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index cb1be3d330..76393ccd85 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -453,11 +453,11 @@ SELECT_ARG_RULES = # Rules when `new_column_name` is a `Symbol` or a string or is absent If `fun` returns a value of type other than `AbstractVector` then it will be - broadcasted into a vector matching the target number of rows in the data + repeated in a vector matching the target number of rows in the data frame, unless its type is one of `AbstractDataFrame`, `NamedTuple`, `DataFrameRow`, `AbstractMatrix`, in which case an error is thrown. As a particular rule, values wrapped in a `Ref` or a `0`-dimensional - `AbstractArray` are unwrapped and then broadcasted. + `AbstractArray` are unwrapped and then repeated. To apply `fun` to each row instead of whole columns, it can be wrapped in a `ByRow` struct. In this case if `old_column` is a `Symbol`, a string, or an @@ -488,7 +488,7 @@ SELECT_ARG_RULES = when `args` is a function or a type apply. If `fun` returns an `AbstractVector` then each element of this vector must - support `keys` function that must return a collection of `Symbol`s, strings + support the `keys` function, which must return a collection of `Symbol`s, strings or integers; the return value of `keys` must be identical for all elements. Then as many columns are created as there are elements in the return value of the `keys` function. 
If `new_column_name` is `AsTable` then their names @@ -521,10 +521,10 @@ SELECT_ARG_RULES = If the return value is an `AbstractVector` then it is used as-is. The resulting column gets the name `x1`. - In all other cases the return value is broadcasted into a vector matching + In all other cases the return value is repeated in a vector matching the target number of rows in the data frame. As a particular rule, values wrapped in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and - then broadcasted. The resulting column gets the name `x1`. + then repeated. The resulting column gets the name `x1`. # Special rules