From acc84d2134f6ab4623fbe09914449d5d75faecb7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 26 Sep 2020 19:11:54 +0200
Subject: [PATCH 01/21] initial implementation for AbstractDataFrame

---
 src/abstractdataframe/selection.jl | 300 +++++++++++++++++++++--------
 1 file changed, 219 insertions(+), 81 deletions(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 0469b87fb3..57a3fe87b4 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -4,9 +4,10 @@
 # normalize_selection function makes sure that whatever input format of idx is it
 # will end up in one of four canonical forms
 # 1) AbstractVector{Int}
-# 2) Pair{Int, <:Pair{<:Base.Callable, Symbol}}
-# 3) Pair{AbstractVector{Int}, <:Pair{<:Base.Callable, Symbol}}
-# 4) Pair{AsTable, <:Pair{<:Base.Callable, Symbol}}
+# 2) Pair{Int, <:Pair{<:Base.Callable, <:Union{Symbol, AbstractVector{Symbol}, Type{AsTable}}}}
+# 3) Pair{AbstractVector{Int}, <:Pair{<:Base.Callable, <:Union{Symbol, AbstractVector{Symbol}, Type{AsTable}}}}
+# 4) Pair{AsTable, <:Pair{<:Base.Callable, <:Union{Symbol, AbstractVector{Symbol}, Type{AsTable}}}}
+# 5) Function
 
 """
     ByRow
@@ -14,22 +15,14 @@
 A type used for selection operations to signal that the wrapped function should
 be applied to each element (row) of the selection.
 
-Note that `ByRow` always collects values returned by `fun` in a vector. Therefore,
-to allow for future extensions, returning `NamedTuple` or `DataFrameRow`
-from `fun` is currently disallowed.
+Note that `ByRow` always collects values returned by `fun` in a vector.
 """
 struct ByRow{T} <: Function
     fun::T
 end
 
-_by_row_helper(x::Any) = x
-_by_row_helper(x::Union{NamedTuple, DataFrameRow}) =
-    throw(ArgumentError("return value of type $(typeof(x)) " *
-                        "is currently not allowed with ByRow."))
-
-(f::ByRow)(cols::AbstractVector...) = _by_row_helper.(f.fun.(cols...))
-(f::ByRow)(table::NamedTuple) =
-    _by_row_helper.(f.fun.(Tables.namedtupleiterator(table)))
+(f::ByRow)(cols::AbstractVector...) = f.fun.(cols...)
+(f::ByRow)(table::NamedTuple) = f.fun.(Tables.namedtupleiterator(table))
 
 # add a method to funname defined in other/utils.jl
 funname(row::ByRow) = funname(row.fun)
@@ -45,6 +38,9 @@ normalize_selection(idx::AbstractIndex, sel, renamecols::Bool) =
         end
     end
 
+normalize_selection(idx::AbstractIndex, sel::Function, renamecols::Bool) = sel
+normalize_selection(idx::AbstractIndex, sel::Colon, renamecols::Bool) = idx[:]
+
 normalize_selection(idx::AbstractIndex, sel::Pair{typeof(nrow), Symbol},
                     renamecols::Bool) =
     length(idx) == 0 ? (Int[] => (() -> 0) => last(sel)) : (1 => length => last(sel))
@@ -70,8 +66,13 @@ normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, <:AbstractStrin
     normalize_selection(idx, first(sel) => Symbol(last(sel)), renamecols::Bool)
 
 function normalize_selection(idx::AbstractIndex,
-                             sel::Pair{<:Any,<:Pair{<:Base.Callable, Symbol}},
+                             sel::Pair{<:Any,<:Pair{<:Base.Callable,
+                             <:Union{Symbol, AbstractString, DataType,
+                                     AbstractVector{Symbol}, AbstractVector{<:AbstractString}}}},
                              renamecols::Bool)
+    if last(last(sel)) isa DataType
+        last(last(sel)) === AsTable || throw(ArgumentError("Only DataType supported as target is AsTable"))
+    end
     if first(sel) isa AsTable
         rawc = first(sel).cols
         wanttable = true
@@ -98,15 +99,17 @@ function normalize_selection(idx::AbstractIndex,
         throw(ArgumentError("at least one column must be passed to a " *
                             "`ByRow` transformation function"))
     end
-    return (wanttable ? AsTable(c) : c) => last(sel)
+    ls = last(sel)
+    if ls isa AbstractString
+        r = Symbol(ls)
+    elseif ls isa AbstractVector{<:AbstractString}
+        r = Symbol.(ls)
+    else
+        r = ls
+    end
+    return (wanttable ? AsTable(c) : c) => r
 end
 
-normalize_selection(idx::AbstractIndex,
-                    sel::Pair{<:Any,<:Pair{<:Base.Callable,<:AbstractString}},
-                    renamecols::Bool) =
-    normalize_selection(idx, first(sel) => first(last(sel)) => Symbol(last(last(sel))),
-                        renamecols::Bool)
-
 function normalize_selection(idx::AbstractIndex,
                              sel::Pair{<:ColumnIndex,<:Base.Callable}, renamecols::Bool)
     c = idx[first(sel)]
@@ -170,18 +173,27 @@ function normalize_selection(idx::AbstractIndex,
     return (wanttable ? AsTable(c) : c) => fun => newcol
 end
 
-function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable},
-                                    <:Pair{<:Base.Callable, Symbol}},
+function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{Int}, AsTable},
+                                                    <:Pair{<:Base.Callable,
+                                                           <:Union{Symbol, AbstractVector{Symbol}, DataType}}}},
                            df::AbstractDataFrame, newdf::DataFrame,
-                           transformed_cols::Dict{Symbol, Any}, copycols::Bool,
+                           transformed_cols::Set{Symbol}, copycols::Bool,
                            allow_resizing_newdf::Ref{Bool})
-    col_idx, (fun, newname) = nc
+    if nc isa Function
+        col_idx, fun, newname = nothing, nc, AsTable
+    else
+        col_idx, (fun, newname) = nc
+    end
+    if newname isa DataType
+        newname === AsTable || throw(ArgumentError("Only DataType supported as target is AsTable"))
+    end
     # It is allowed to request a tranformation operation into a newname column
     # only once. This is ensured by the logic related to transformed_cols dictionaly
     # in _manipulate, therefore in select_transform! such a duplicate should not happen
-    @assert !hasproperty(newdf, newname)
     cdf = eachcol(df)
-    if col_idx isa Int
+    if col_idx === nothing
+        res = fun(df)
+    elseif col_idx isa Int
         res = fun(df[!, col_idx])
     elseif col_idx isa AsTable
         res = fun(Tables.columntable(select(df, col_idx.cols, copycols=false)))
@@ -190,11 +202,166 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable},
         @assert col_idx isa AbstractVector{Int}
         res = fun(map(c -> cdf[c], col_idx)...)
     end
-    if res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix}
-        throw(ArgumentError("return value from function $fun " *
-                            "of type $(typeof(res)) is currently not allowed."))
+
+    if (newname === AsTable || newname isa AbstractVector{Symbol}) &&
+        !(res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix})
+        if res isa AbstractVector && !isempty(res)
+            p = pairs.(res)
+            ex = extrema(length, p)
+            ex[1] == ex[2] || throw(ArgumentError("returned elements must have the same length"))
+            kp1 = keys(p[1])
+            all(x -> keys(x) == kp1, p) || throw(ArgumentError("keys of the returned elements must be identical"))
+            res = DataFrame()
+            for (i, n) in enumerate(kp1)
+                res[!, Symbol(n)] = [x[i] for x in p]
+            end
+        else
+            res = Tables.columntable(res)
+        end
     end
-    if res isa AbstractVector
+
+    if res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix}
+        if newname isa Symbol
+            throw(ArgumentError("Table returned while a single column return value was requested"))
+        end
+        if res isa AbstractMatrix
+            colnames = gennames(size(res, 2))
+        else
+            colnames = propertynames(res)
+        end
+        if !(newname === AsTable)
+            if length(colnames) != length(newname)
+                throw(ArgumentError("Number of returned columns does not match the " *
+                                    "length of requested output"))
+            end
+            colnames = newname
+        end
+        isempty(colnames) && return # nothing to do
+
+        if any(in(transformed_cols), colnames)
+            throw(ArgumentError("Duplicate column name returned"))
+        else
+            startlen = length(transformed_cols)
+            union!(transformed_cols, colnames)
+            @assert startlen + length(colnames) == length(transformed_cols)
+        end
+        if res isa AbstractDataFrame
+            lr = nrow(res)
+            # allow shortening to 0 rows
+            if allow_resizing_newdf[] && nrow(newdf) == 1
+                newdfcols = _columns(newdf)
+                for (i, col) in enumerate(newdfcols)
+                    newdfcols[i] = fill!(similar(col, lr), first(col))
+                end
+            end
+
+            # !allow_resizing_newdf[] && ncol(newdf) == 0
+            # means that we use `select` or `transform` not `combine`
+            if !allow_resizing_newdf[] && ncol(newdf) == 0 && lr != nrow(df)
+                throw(ArgumentError("length $(lr) of vector returned from " *
+                                    "function $fun is different from number of rows " *
+                                    "$(nrow(df)) of the source data frame."))
+            end
+            allow_resizing_newdf[] = false
+            @assert length(colnames) == ncol(res)
+            for (newname, v) in zip(colnames, eachcol(res))
+                vpar = parent(v)
+                parent_cols = col_idx isa AsTable ? col_idx.cols : col_idx
+                if copycols && !(fun isa ByRow) &&
+                    (v isa SubArray || any(i -> vpar === parent(cdf[i]), parent_cols))
+                    newdf[!, newname] = copy(v)
+                else
+                    newdf[!, newname] = v
+                end
+            end
+        elseif res isa AbstractMatrix
+            lr = size(res, 1)
+            # allow shortening to 0 rows
+            if allow_resizing_newdf[] && nrow(newdf) == 1
+                newdfcols = _columns(newdf)
+                for (i, col) in enumerate(newdfcols)
+                    newdfcols[i] = fill!(similar(col, lr), first(col))
+                end
+            end
+
+            # !allow_resizing_newdf[] && ncol(newdf) == 0
+            # means that we use `select` or `transform` not `combine`
+            if !allow_resizing_newdf[] && ncol(newdf) == 0 && lr != nrow(df)
+                throw(ArgumentError("length $(lr) of vector returned from " *
+                                    "function $fun is different from number of rows " *
+                                    "$(nrow(df)) of the source data frame."))
+            end
+            allow_resizing_newdf[] = false
+            @assert length(colnames) == size(res, 2)
+            for (i, newname) in enumerate(colnames)
+                newdf[!, newname] = res[:, i]
+            end
+        elseif res isa NamedTuple
+            if all(v -> v isa AbstractVector, x)
+                lr = length(res[1])
+                # allow shortening to 0 rows
+                if allow_resizing_newdf[] && nrow(newdf) == 1
+                    newdfcols = _columns(newdf)
+                    for (i, col) in enumerate(newdfcols)
+                        newdfcols[i] = fill!(similar(col, lr), first(col))
+                    end
+                end
+
+                # !allow_resizing_newdf[] && ncol(newdf) == 0
+                # means that we use `select` or `transform` not `combine`
+                if !allow_resizing_newdf[] && ncol(newdf) == 0 && lr != nrow(df)
+                    throw(ArgumentError("length $(lr) of vector returned from " *
+                                        "function $fun is different from number of rows " *
+                                        "$(nrow(df)) of the source data frame."))
+                end
+                allow_resizing_newdf[] = false
+                @assert length(colnames) == length(res)
+                for (newname, v) in zip(colnames, res)
+                    vpar = parent(v)
+                    parent_cols = col_idx isa AsTable ? col_idx.cols : col_idx
+                    if copycols && !(fun isa ByRow) &&
+                        (v isa SubArray || any(i -> vpar === parent(cdf[i]), parent_cols))
+                        newdf[!, newname] = copy(v)
+                    else
+                        newdf[!, newname] = v
+                    end
+                end
+            elseif any(v -> v isa AbstractVector, res)
+                throw(ArgumentError("mixing single values and vectors in a named tuple is not allowed"))
+            else
+                if ncol(newdf) == 0
+                    # if allow_resizing_newdf[] is false we know this is select or transform
+                    rows = allow_resizing_newdf[] ? 1 : nrow(df)
+                else
+                    # allow squashing a scalar to 0 rows
+                    rows = nrow(newdf)
+                end
+                @assert length(colnames) == length(res)
+                for (newname, v) in zip(colnames, res)
+                    # note that newdf potentially can contain c in general
+                    newdf[!, newname] = fill!(Tables.allocatecolumn(typeof(v), rows), v)
+                end
+            end
+        elseif res isa DataFrameRow
+            if ncol(newdf) == 0
+                # if allow_resizing_newdf[] is false we know this is select or transform
+                rows = allow_resizing_newdf[] ? 1 : nrow(df)
+            else
+                # allow squashing a scalar to 0 rows
+                rows = nrow(newdf)
+            end
+            @assert length(colnames) == length(res)
+            for (newname, v) in zip(colnames, res)
+                # note that newdf potentially can contain c in general
+                newdf[!, newname] = fill!(Tables.allocatecolumn(typeof(v), rows), v)
+            end
+        end
+    elseif res isa AbstractVector
+        if newname in transformed_cols
+            throw(ArgumentError("duplicate name of a transformed column"))
+        else
+            push!(transformed_cols, newname)
+        end
         # allow shortening to 0 rows
         if allow_resizing_newdf[] && nrow(newdf) == 1
             newdfcols = _columns(newdf)
@@ -220,6 +387,11 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable},
             newdf[!, newname] = res
         end
     else
+        if newname in transformed_cols
+            throw(ArgumentError("duplicate name of a transformed column"))
+        else
+            push!(transformed_cols, newname)
+        end
         res_unwrap = res isa Union{AbstractArray{<:Any, 0}, Ref} ? res[] : res
         if ncol(newdf) == 0
             # if allow_resizing_newdf[] is false we know this is select or transform
@@ -231,9 +403,6 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable},
         newdf[!, newname] = fill!(Tables.allocatecolumn(typeof(res_unwrap), rows),
                                   res_unwrap)
     end
-    # mark that column transformation was applied
-    # nothing is not possible otherwise as a value in this dict
-    transformed_cols[newname] = nothing
 end
 
 SELECT_ARG_RULES =
@@ -642,27 +811,17 @@ function _manipulate(df::AbstractDataFrame, normalized_cs, copycols::Bool, keepr
     # │ 1   │ 0.841471 │ 3     │
     # │ 2   │ 0.909297 │ 4     │
     #
-    # we compute column :a immediately when we process `:` although it is specified
-    # later by `:a=>sin=>:a` because we know from `transformed_cols` variable that
-    # it will be computed later via a transformation
-    transformed_cols = Dict{Symbol, Any}()
-    for nc in normalized_cs
-        if nc isa Pair
-            newname = last(last(nc))
-            @assert newname isa Symbol
-            if haskey(transformed_cols, newname)
-                throw(ArgumentError("duplicate target column name $newname passed"))
-            end
-            transformed_cols[newname] = nc
-        end
-    end
+    # transformed_cols keeps a set of columns that were generated via a transformation
+    # up to this point. Note that single column selection and column renaming are
+    # considered to be transformations
+    transformed_cols = Set{Symbol}()
     # we allow resizing newdf only if up to some point only scalars were put
     # in it. The moment we put any vector into newdf its number of rows becomes fixed
     # Also if keeprows is true then we make sure to produce nrow(df) rows so resizing
     # is not allowed
     allow_resizing_newdf = Ref(!keeprows)
     for nc in normalized_cs
-        if nc isa AbstractVector{Int}
+        if nc isa AbstractVector{Int} # only this case is NOT considered to be a transformation
             allunique(nc) || throw(ArgumentError("duplicate column names selected"))
             for i in nc
                 newname = _names(df)[i]
@@ -670,42 +829,21 @@ function _manipulate(df::AbstractDataFrame, normalized_cs, copycols::Bool, keepr
                 # we allow duplicate column names with selections applied earlier
                 # and ignore them for convinience, to allow for e.g. select(df, :x1, :)
                 if !hasproperty(newdf, newname)
-                    if haskey(transformed_cols, newname)
-                        # if newdf does not have a column newname
-                        # but a column transformation was requested for this column
-                        # then apply the transformation immediately
-                        # in such a case nct may not be nothing, as if it were
-                        # nothing then newname should be preasent in newdf already
-                        nct = transformed_cols[newname]
-                        @assert nct !== nothing
-                        select_transform!(nct, df, newdf, transformed_cols, copycols,
-                                          allow_resizing_newdf)
-                    else
-                        # allow shortening to 0 rows
-                        if allow_resizing_newdf[] && nrow(newdf) == 1
-                            newdfcols = _columns(newdf)
-                            for (i, col) in enumerate(newdfcols)
-                                newdfcols[i] = fill!(similar(col, nrow(df)), first(col))
-                            end
+                    # allow shortening to 0 rows
+                    if allow_resizing_newdf[] && nrow(newdf) == 1
+                        newdfcols = _columns(newdf)
+                        for (i, col) in enumerate(newdfcols)
+                            newdfcols[i] = fill!(similar(col, nrow(df)), first(col))
                         end
-                        # here even if keeprows is true all is OK
-                        newdf[!, newname] = copycols ? df[:, i] : df[!, i]
-                        allow_resizing_newdf[] = false
                     end
+                    # here even if keeprows is true all is OK
+                    newdf[!, newname] = copycols ? df[:, i] : df[!, i]
+                    allow_resizing_newdf[] = false
                 end
             end
         else
-            # nc is normalized so it has a form src_cols => fun => Symbol
-            newname = last(last(nc))
-            if hasproperty(newdf, newname)
-                # it is possible that the transformation has already been applied
-                # via multiple column selection, like in select(df, :, :x1 => :y1)
-                # but then transformed_cols[newname] must be nothing
-                @assert transformed_cols[newname] === nothing
-            else
-                select_transform!(nc, df, newdf, transformed_cols, copycols,
-                                  allow_resizing_newdf)
-            end
+            select_transform!(nc, df, newdf, transformed_cols, copycols,
+                              allow_resizing_newdf)
         end
     end
     return newdf

From 7477075d47a6d7b8c0128a33a76155fd509c00ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 26 Sep 2020 19:51:36 +0200
Subject: [PATCH 02/21] minor fixes

---
 src/abstractdataframe/selection.jl | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 57a3fe87b4..fdecb20deb 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -266,7 +266,7 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{
             @assert length(colnames) == ncol(res)
             for (newname, v) in zip(colnames, eachcol(res))
                 vpar = parent(v)
-                parent_cols = col_idx isa AsTable ? col_idx.cols : col_idx
+                parent_cols = col_idx isa AsTable ? col_idx.cols : (col_idx === nothing ? (1:ncol(df)) : col_idx)
                 if copycols && !(fun isa ByRow) &&
                     (v isa SubArray || any(i -> vpar === parent(cdf[i]), parent_cols))
                     newdf[!, newname] = copy(v)
@@ -318,7 +318,7 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{
                 @assert length(colnames) == length(res)
                 for (newname, v) in zip(colnames, res)
                     vpar = parent(v)
-                    parent_cols = col_idx isa AsTable ? col_idx.cols : col_idx
+                    parent_cols = col_idx isa AsTable ? col_idx.cols : (col_idx === nothing ? (1:ncol(df)) : col_idx)
                     if copycols && !(fun isa ByRow) &&
                         (v isa SubArray || any(i -> vpar === parent(cdf[i]), parent_cols))
                         newdf[!, newname] = copy(v)
@@ -379,7 +379,7 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{
         end
         allow_resizing_newdf[] = false
         respar = parent(res)
-        parent_cols = col_idx isa AsTable ? col_idx.cols : col_idx
+        parent_cols = col_idx isa AsTable ? col_idx.cols : (col_idx === nothing ? (1:ncol(df)) : col_idx)
         if copycols && !(fun isa ByRow) &&
             (res isa SubArray || any(i -> respar === parent(cdf[i]), parent_cols))
             newdf[!, newname] = copy(res)
@@ -738,13 +738,12 @@ julia> combine(df, :a => sum, nrow, renamecols=false)
 combine(df::AbstractDataFrame, args...; renamecols::Bool=true) =
     manipulate(df, args..., copycols=true, keeprows=false, renamecols=renamecols)
 
-function combine(arg, df::AbstractDataFrame; renamecols::Bool=true)
-    if nrow(df) == 0
-        throw(ArgumentError("calling combine on a data frame with zero rows" *
-                            " with transformation as a first argument is " *
-                            "currently not supported"))
+function combine(arg::Function, df::AbstractDataFrame; renamecols::Bool=true)
+    if arg isa Colon # `:` is itself a Function, so dispatch alone cannot reject it
+        throw(ArgumentError("Only transformations are allowed when function is a " *
+                            "first argument to combine"))
     end
-    return combine(arg, groupby(df, Symbol[]), renamecols=renamecols)
+    return combine(df, arg, renamecols=renamecols)
 end
 
 manipulate(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool,

From f79228001251cb0c7d9c6835c0ac60bbffaf3d36 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 26 Sep 2020 19:53:39 +0200
Subject: [PATCH 03/21] add missing methods

---
 src/abstractdataframe/selection.jl | 32 ++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index fdecb20deb..888c4f84db 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -555,6 +555,14 @@ julia> select!(df, AsTable(:) => ByRow(mean), renamecols=false)
 select!(df::DataFrame, args...; renamecols::Bool=true) =
     _replace_columns!(df, select(df, args..., copycols=false, renamecols=renamecols))
 
+function select!(arg::Function, df::AbstractDataFrame; renamecols::Bool=true)
+    if arg isa Colon # `:` is itself a Function, so dispatch alone cannot reject it
+        throw(ArgumentError("Only transformations are allowed when function is a " *
+                            "first argument to select!"))
+    end
+    return select!(df, arg, renamecols=renamecols)
+end
+
 """
     transform!(df::DataFrame, args...; renamecols::Bool=true)
 
@@ -567,6 +575,14 @@ See [`select!`](@ref) for detailed rules regarding accepted values for `args`.
 transform!(df::DataFrame, args...; renamecols::Bool=true) =
     select!(df, :, args..., renamecols=renamecols)
 
+function transform!(arg::Function, df::AbstractDataFrame; renamecols::Bool=true)
+    if arg isa Colon # `:` is itself a Function, so dispatch alone cannot reject it
+        throw(ArgumentError("Only transformations are allowed when function is a " *
+                            "first argument to transform!"))
+    end
+    return transform!(df, arg, renamecols=renamecols)
+end
+
 """
     select(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true)
 
@@ -690,6 +706,14 @@ julia> select(df, AsTable(:) => ByRow(mean), renamecols=false)
 select(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) =
     manipulate(df, args..., copycols=copycols, keeprows=true, renamecols=renamecols)
 
+function select(arg::Function, df::AbstractDataFrame; renamecols::Bool=true)
+    if arg isa Colon # `:` is itself a Function, so dispatch alone cannot reject it
+        throw(ArgumentError("Only transformations are allowed when function is a " *
+                            "first argument to select"))
+    end
+    return select(df, arg, renamecols=renamecols)
+end
+
 """
     transform(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true)
 
@@ -703,6 +727,14 @@ See [`select`](@ref) for detailed rules regarding accepted values for `args`.
 transform(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) =
     select(df, :, args..., copycols=copycols, renamecols=renamecols)
 
+function transform(arg::Function, df::AbstractDataFrame; renamecols::Bool=true)
+    if arg isa Colon # `:` is itself a Function, so dispatch alone cannot reject it
+        throw(ArgumentError("Only transformations are allowed when function is a " *
+                            "first argument to transform"))
+    end
+    return transform(df, arg, renamecols=renamecols)
+end
+
 """
     combine(df::AbstractDataFrame, args...; renamecols::Bool=true)
     combine(arg, df::AbstractDataFrame; renamecols::Bool=true)

From 7e736fa25439845506a9f82b51275f667b04f78c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 26 Sep 2020 20:17:29 +0200
Subject: [PATCH 04/21] fix normalization problem

---
 src/abstractdataframe/selection.jl | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 888c4f84db..6787530a60 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -70,8 +70,9 @@ function normalize_selection(idx::AbstractIndex,
                              <:Union{Symbol, AbstractString, DataType,
                                      AbstractVector{Symbol}, AbstractVector{<:AbstractString}}}},
                              renamecols::Bool)
-    if last(last(sel)) isa DataType
-        last(last(sel)) === AsTable || throw(ArgumentError("Only DataType supported as target is AsTable"))
+    lls = last(last(sel))
+    if lls isa DataType
+        lls === AsTable || throw(ArgumentError("Only DataType supported as target is AsTable"))
     end
     if first(sel) isa AsTable
         rawc = first(sel).cols
@@ -99,15 +100,17 @@ function normalize_selection(idx::AbstractIndex,
         throw(ArgumentError("at least one column must be passed to a " *
                             "`ByRow` transformation function"))
     end
-    ls = last(sel)
-    if ls isa AbstractString
-        r = Symbol(ls)
-    elseif ls isa AbstractVector{<:AbstractString}
-        r = Symbol.(ls)
+    if lls isa AbstractString
+        r = Symbol(lls)
+    elseif lls isa AbstractVector{<:AbstractString}
+        r = Symbol.(lls)
     else
-        r = ls
+        r = lls
     end
-    return (wanttable ? AsTable(c) : c) => r
+    if r isa AbstractVector{Symbol}
+        allunique(r) || throw(ArgumentError("target column names must be unique"))
+    end
+    return (wanttable ? AsTable(c) : c) => first(last(sel)) => r
 end
 
 function normalize_selection(idx::AbstractIndex,

From abb400614cd702df70a2a51a63962e5717d60385 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 26 Sep 2020 22:48:48 +0200
Subject: [PATCH 05/21] initial implementation so that tests pass

---
 src/abstractdataframe/selection.jl | 21 ++++++------
 test/select.jl                     | 52 +++++++++++++++++++++---------
 2 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 6787530a60..1be7c65c72 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -207,16 +207,15 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{
     end
 
     if (newname === AsTable || newname isa AbstractVector{Symbol}) &&
-        !(res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix})
+        !(res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix, AbstractArray{<:Any, 0}, Ref})
         if res isa AbstractVector && !isempty(res)
-            p = pairs.(res)
-            ex = extrema(length, p)
-            ex[1] == ex[2] || throw(ArgumentError("returned elements must have the same length"))
-            kp1 = keys(p[1])
-            all(x -> keys(x) == kp1, p) || throw(ArgumentError("keys of the returned elements must be identical"))
+            kp1 = keys(res[1])
+            all(x -> keys(x) == kp1, res) || throw(ArgumentError("keys of the returned elements must be identical"))
+            true_res = res
             res = DataFrame()
-            for (i, n) in enumerate(kp1)
-                res[!, Symbol(n)] = [x[i] for x in p]
+            prepend = all(x -> x isa Integer, kp1)
+            for n in kp1
+                res[!, prepend ? Symbol("x", n) : Symbol(n)] = [x[n] for x in true_res]
             end
         else
             res = Tables.columntable(res)
@@ -300,7 +299,7 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{
                 newdf[!, newname] = res[:, i]
             end
         elseif res isa NamedTuple
-            if all(v -> v isa AbstractVector, x)
+            if all(v -> v isa AbstractVector, res)
                 lr = length(res[1])
                 # allow shortening to 0 rows
                 if allow_resizing_newdf[] && nrow(newdf) == 1
@@ -390,6 +389,10 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{
             newdf[!, newname] = res
         end
     else
+        if newname === AsTable
+            @assert res isa Union{AbstractArray{<:Any, 0}, Ref}
+            newname = :x1
+        end
         if newname in transformed_cols
             throw(ArgumentError("duplicate name of a transformed column"))
         else
diff --git a/test/select.jl b/test/select.jl
index f707a9e950..2fa0700f09 100644
--- a/test/select.jl
+++ b/test/select.jl
@@ -720,20 +720,32 @@ end
         @test_throws ArgumentError select(df, :x => x -> retval)
         @test_throws ArgumentError select(df, :x => x -> retval, copycols=false)
         @test_throws ArgumentError select!(df, :x => x -> retval)
+
+        @test select(df, :x => ByRow(x -> retval)) == DataFrame(x_function = [retval])
+        cdf = copy(df)
+        select!(cdf, :x => ByRow(x -> retval))
+        @test cdf == DataFrame(x_function = [retval])
+
         if retval isa Union{NamedTuple, DataFrameRow}
-            @test_throws ArgumentError select(df, :x => ByRow(x -> retval))
-            @test_throws ArgumentError select!(df, :x => ByRow(x -> retval))
-        else
-            @test select(df, :x => ByRow(x -> retval)) == DataFrame(x_function = [retval])
-            cdf = copy(df)
-            select!(cdf, :x => ByRow(x -> retval))
-            @test cdf == DataFrame(x_function = [retval])
+            @test select(df, :x => ByRow(x -> retval) => AsTable) == DataFrame(;retval...)
+        elseif retval isa DataFrame
+            @test_throws MethodError select(df, :x => ByRow(x -> retval) => AsTable)
+        else # Matrix; surprising but following the API
+            @test select(df, :x => ByRow(x -> retval) => AsTable) ==
+                  DataFrame(["CartesianIndex($i, $j)" => 1.0 for i in 1:2, j in 1:2]...)
+            @test select(df, :x => ByRow(x -> retval) => [:a, :b, :c, :d]) ==
+                  DataFrame(a=1.0, b=1.0, c=1.0, d=1.0)
         end
     end
 
     for retval in [(1, 2), ones(2,2,2)]
         @test select(df, :x => x -> retval) == DataFrame(x_function = [retval])
         @test select(df, :x => ByRow(x -> retval)) == DataFrame(x_function = [retval])
+        if retval isa Tuple
+            @test select(df, :x => ByRow(x -> retval) => AsTable) == DataFrame(x1=1, x2=2)
+        else
+            @test select(df, :x => ByRow(x -> retval) => Symbol.("x", 1:8)) == DataFrame(ones(1, 8))
+        end
         cdf = copy(df)
         select!(cdf, :x => x -> retval)
         @test cdf == DataFrame(x_function = [retval])
@@ -1122,8 +1134,13 @@ end
           hcat(df, DataFrame(a_b_c_function=[[(a = 1, b = 4, c = 7)],
                                              [(a = 2, b = 5, c = 8)],
                                              [(a = 3, b = 6, c = 9)]]))
-    @test_throws ArgumentError select(df, AsTable(:) => ByRow(identity))
-    @test_throws ArgumentError select(df, AsTable(:) => ByRow(x -> df[1, :]))
+    @test select(df, AsTable(:) => ByRow(identity)) ==
+          DataFrame(a_b_c_identity=[(a = 1, b = 4, c = 7), (a = 2, b = 5, c = 8), (a = 3, b = 6, c = 9)])
+    @test select(df, AsTable(:) => ByRow(identity) => AsTable) == df
+    @test select(df, AsTable(:) => ByRow(x -> df[1, :])) ==
+          DataFrame(a_b_c_function=fill(df[1,:], 3))
+    @test select(df, AsTable(:) => ByRow(x -> df[1, :]) => AsTable) ==
+          DataFrame(a=[1,1,1], b=4, c=7)
     @test_throws ArgumentError transform(df, AsTable(Not(:)) => ByRow(identity))
 
     @test select(df, AsTable(Not(:)) => Ref) == repeat(DataFrame(Ref = NamedTuple()), nrow(df))
@@ -1164,12 +1181,14 @@ end
 
     @test combine(x -> Matrix(x), df) == rename(df, [:x1, :x2])
     @test combine(x -> Ref(1:3), df) == DataFrame(x1=[1:3])
-    @test_throws ArgumentError combine(df, x -> Ref(1:3))
+    @test combine(df, x -> Ref(1:3)) == DataFrame(x1=[1:3])
 
-    @test combine(AsTable(:) => identity, df) == df
-    @test combine((:) => cor, df) == DataFrame(x_y_cor = 1.0)
-    @test combine(:x => x -> Ref(1:3), df) == DataFrame(x_function=[1:3])
+    @test_throws ArgumentError combine(df, AsTable(:) => identity)
+    @test combine(df, AsTable(:) => identity => AsTable) == df
+    @test combine(df, (:) => cor) == DataFrame(x_y_cor = 1.0)
+    @test combine(df, :x => x -> Ref(1:3)) == DataFrame(x_function=[1:3])
     @test_throws ArgumentError combine(df, :x => x -> ones(1,1))
+    @test combine(df, :x => (x -> ones(1,1)) => AsTable) == DataFrame(x1=1.0)
 
     df2 = combine(df, :x => identity)
     @test df2[:, 1] == df.x
@@ -1188,8 +1207,9 @@ end
 
     @test combine(x -> Matrix(x), dfv) == rename(dfv, [:x1, :x2])
 
-    @test combine(AsTable(:) => identity, dfv) == dfv
-    @test combine((:) => cor, dfv) == DataFrame(y_x_cor = 1.0)
+    @test_throws ArgumentError combine(dfv, AsTable(:) => identity)
+    @test combine(dfv, AsTable(:) => identity => AsTable) == dfv
+    @test combine(dfv, (:) => cor) == DataFrame(y_x_cor = 1.0)
 
     df2 = combine(dfv, :x => identity)
     @test df2[:, 1] == dfv.x
@@ -1294,7 +1314,7 @@ end
           DataFrame(a=1:3, b=4:6, c=7:9, d=10:12, a_b=5:2:9, a_b_etc=22:4:30)
     @test combine(df, :a => +, [:a, :b] => +, All() => +, renamecols=false) ==
           DataFrame(a=1:3, a_b=5:2:9, a_b_etc=22:4:30)
-    @test combine([:a, :b] => +, df, renamecols=false) == DataFrame(a_b=5:2:9)
+    @test combine(df, [:a, :b] => +, renamecols=false) == DataFrame(a_b=5:2:9)
     @test combine(identity, df, renamecols=false) == df
 
     df = DataFrame(a=1:3, b=4:6, c=7:9, d=10:12)

From d93747565317b38cdbfff5ca06e894fb14518ee8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Tue, 29 Sep 2020 17:41:59 +0200
Subject: [PATCH 06/21] allow empty selector in ByRow. Initial tests

---
 src/abstractdataframe/selection.jl        |  58 +++++++----
 src/groupeddataframe/splitapplycombine.jl |   2 +-
 test/select.jl                            | 120 ++++++++++++++++++++++
 3 files changed, 157 insertions(+), 23 deletions(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 1be7c65c72..5021bed0ac 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -1,5 +1,6 @@
 # TODO:
-# * add combine(fun, df) for DataFrame with 0 rows
+# * add handling of empty ByRow to filter, and select/transform/combine for GroupedDataFrame
+# * add handling of multiple column return rules for select/transform/combine for GroupedDataFrame
 
 # normalize_selection function makes sure that whatever input format of idx is it
 # will end up in one of four canonical forms
@@ -7,7 +8,7 @@
 # 2) Pair{Int, <:Pair{<:Base.Callable, <:Union{Symbol, Vector{Symbol}, Type{AsTable}}}}
 # 3) Pair{AbstractVector{Int}, <:Pair{<:Base.Callable, <:Union{Symbol, AbstractVector{Symbol}, Type{AsTable}}}}
 # 4) Pair{AsTable, <:Pair{<:Base.Callable, <:Union{Symbol, Vector{Symbol}, Type{AsTable}}}}
-# 5) Function
+# 5) Callable
 
 """
     ByRow
@@ -38,7 +39,7 @@ normalize_selection(idx::AbstractIndex, sel, renamecols::Bool) =
         end
     end
 
-normalize_selection(idx::AbstractIndex, sel::Function, renamecols::Bool) = sel
+normalize_selection(idx::AbstractIndex, sel::Base.Callable, renamecols::Bool) = sel
 normalize_selection(idx::AbstractIndex, sel::Colon, renamecols::Bool) = idx[:]
 
 normalize_selection(idx::AbstractIndex, sel::Pair{typeof(nrow), Symbol},
@@ -96,10 +97,6 @@ function normalize_selection(idx::AbstractIndex,
                 end
             end
     end
-    if length(c) == 0 && first(last(sel)) isa ByRow
-        throw(ArgumentError("at least one column must be passed to a " *
-                            "`ByRow` transformation function"))
-    end
     if lls isa AbstractString
         r = Symbol(lls)
     elseif lls isa AbstractVector{<:AbstractString}
@@ -149,10 +146,6 @@ function normalize_selection(idx::AbstractIndex,
                 end
             end
     end
-    if length(c) == 0 && last(sel) isa ByRow
-        throw(ArgumentError("at least one column must be passed to a " *
-                            "`ByRow` transformation function"))
-    end
     fun = last(sel)
     if length(c) > 3
         prefix = join(@views(_names(idx)[c[1:2]]), '_')
@@ -176,14 +169,14 @@ function normalize_selection(idx::AbstractIndex,
     return (wanttable ? AsTable(c) : c) => fun => newcol
 end
 
-function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{Int}, AsTable},
+function select_transform!(nc::Union{Base.Callable, Pair{<:Union{Int, AbstractVector{Int}, AsTable},
                                                     <:Pair{<:Base.Callable,
                                                            <:Union{Symbol, AbstractVector{Symbol}, DataType}}}},
                            df::AbstractDataFrame, newdf::DataFrame,
                            transformed_cols::Set{Symbol}, copycols::Bool,
                            allow_resizing_newdf::Ref{Bool})
-    if nc isa Function
-        col_idx, fun, newname = nothing, nc, AsTable
+    if nc isa Base.Callable
+        col_idx, fun, newname = nothing, nc, nothing
     else
         col_idx, (fun, newname) = nc
     end
@@ -199,15 +192,34 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{
     elseif col_idx isa Int
         res = fun(df[!, col_idx])
     elseif col_idx isa AsTable
-        res = fun(Tables.columntable(select(df, col_idx.cols, copycols=false)))
+        tbl = Tables.columntable(select(df, col_idx.cols, copycols=false))
+        if isempty(tbl) && fun isa ByRow
+            if isempty(df)
+                T = Base.return_types(fun.fun, ())[1]
+                res = T[]
+            else
+                res = [fun.fun() for _ in 1:nrow(df)]
+            end
+        else
+            res = fun(tbl)
+        end
     else
         # it should be fast enough here as we do not expect to do it millions of times
         @assert col_idx isa AbstractVector{Int}
-        res = fun(map(c -> cdf[c], col_idx)...)
+        if isempty(col_idx) && fun isa ByRow
+            if isempty(df)
+                T = Base.return_types(fun.fun, ())[1]
+                res = T[]
+            else
+                res = [fun.fun() for _ in 1:nrow(df)]
+            end
+        else
+            res = fun(map(c -> cdf[c], col_idx)...)
+        end
     end
 
     if (newname === AsTable || newname isa AbstractVector{Symbol}) &&
-        !(res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix, AbstractArray{<:Any, 0}, Ref})
+        !(res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix})
         if res isa AbstractVector && !isempty(res)
             kp1 = keys(res[1])
             all(x -> keys(x) == kp1, res) || throw(ArgumentError("keys of the returned elements must be identical"))
@@ -231,7 +243,7 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{
         else
             colnames = propertynames(res)
         end
-        if !(newname === AsTable)
+        if !(newname === AsTable || newname === nothing)
             if length(colnames) != length(newname)
                 throw(ArgumentError("Number of returned columns does not match the " *
                                     "length of requested output"))
@@ -328,7 +340,7 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{
                         newdf[!, newname] = v
                     end
                 end
-            elseif any(v -> v isa AbstractVector, x)
+            elseif any(v -> v isa AbstractVector, res)
                 throw(ArgumentError("mixing single values and vectors in a named tuple is not allowed"))
             else
                 if ncol(newdf) == 0
@@ -359,6 +371,9 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{
             end
         end
     elseif res isa AbstractVector
+        if newname === nothing
+            newname = :x1
+        end
         if newname in transformed_cols
             throw(ArgumentError("duplicate name of a transformed column"))
         else
@@ -389,8 +404,7 @@ function select_transform!(nc::Union{Function, Pair{<:Union{Int, AbstractVector{
             newdf[!, newname] = res
         end
     else
-        if newname === AsTable
-            @assert res isa Union{AbstractArray{<:Any, 0}, Ref}
+        if newname === nothing
             newname = :x1
         end
         if newname in transformed_cols
@@ -806,7 +820,7 @@ manipulate(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool,
 function manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool, renamecols::Bool)
     cs_vec = []
     for v in cs
-        if v isa AbstractVector{<:Pair}
+        if v isa AbstractVecOrMat{<:Pair}
             append!(cs_vec, v)
         else
             push!(cs_vec, v)
diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index d7b1c23d86..b6f0595019 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -502,7 +502,7 @@ function _combine_prepare(gd::GroupedDataFrame,
     for p in cs
         if p === nrow
             push!(cs_vec, nrow => :nrow)
-        elseif p isa AbstractVector{<:Pair}
+        elseif p isa AbstractVecOrMat{<:Pair}
             append!(cs_vec, p)
         else
             push!(cs_vec, p)
diff --git a/test/select.jl b/test/select.jl
index 2fa0700f09..33f405c679 100644
--- a/test/select.jl
+++ b/test/select.jl
@@ -1326,4 +1326,124 @@ end
     @test df == DataFrame(a=1:3, b=4:6, c=7:9, d=10:12, a_b=5:2:9, a_b_etc=22:4:30)
 end
 
+@testset "additional tests for new rules" begin
+#    select select! transform transform! combine
+#    Union{Type{AsTable}, Symbol, AbstractVector{Symbol}, AbstractString, AbstractVector{<:AbstractString}}
+#    DataFrame, SubDataFrame
+    @testset "SELECT(FUN, DF)" begin
+        for df in (DataFrame(a=1:2, b=3:4, c=5:6), view(DataFrame(a=1:3, b=3:5, c=5:7, d=11:13), 1:2, 1:3))
+            @test select(sdf -> sdf.b, df) == DataFrame(x1=3:4)
+            @test select(sdf -> (b = 2sdf.b,), df) == DataFrame(b=[6,8])
+            @test select(sdf -> (b = 1,), df) == DataFrame(b=[1, 1])
+            @test_throws ArgumentError select(sdf -> (b = [1],), df)
+            @test select(sdf -> (b = [1, 5],), df) == DataFrame(b=[1, 5])
+            @test select(sdf -> 1, df) == DataFrame(x1=[1, 1])
+            @test select(sdf -> fill([1]), df) == DataFrame(x1=[[1], [1]])
+            @test select(sdf -> Ref([1]), df) == DataFrame(x1=[[1], [1]])
+            @test select(sdf -> "x", df) == DataFrame(x1=["x", "x"])
+            @test select(sdf -> [[1,2],[3,4]], df) == DataFrame(x1=[[1,2],[3,4]])
+            for ret in (DataFrame(), NamedTuple(), zeros(0,0), DataFrame(t=1)[1, 1:0])
+                @test select(sdf -> ret, df) == DataFrame()
+            end
+            @test_throws ArgumentError select(sdf -> DataFrame(a=10), df)
+            @test_throws ArgumentError select(sdf -> zeros(1, 2), df)
+            @test select(sdf -> DataFrame(a=[10, 11]), df) == DataFrame(a=[10, 11])
+            @test select(sdf -> [10 11; 12 13], df) == DataFrame(x1=[10, 12], x2=[11, 13])
+            @test select(sdf -> DataFrame(a=10)[1, :], df) == DataFrame(a=[10, 10])
+
+            @test transform(sdf -> sdf.b, df) == [df DataFrame(x1=3:4)]
+            @test transform(sdf -> (b = 2sdf.b,), df) == DataFrame(a=1:2, b=[6,8], c=5:6)
+            @test transform(sdf -> (b = 1,), df) == DataFrame(a=[1,2], b=[1, 1], c=[5,6])
+            @test_throws ArgumentError transform(sdf -> (b = [1],), df)
+            @test transform(sdf -> (b = [1, 5],), df) == DataFrame(a=[1,2], b=[1, 5], c=[5,6])
+            @test transform(sdf -> 1, df) == DataFrame(a=1:2, b=3:4, c=5:6, x1=1)
+            @test transform(sdf -> fill([1]), df) == DataFrame(a=1:2, b=3:4, c=5:6, x1=[[1],[1]])
+            @test transform(sdf -> Ref([1]), df) == DataFrame(a=1:2, b=3:4, c=5:6, x1=[[1],[1]])
+            @test transform(sdf -> "x", df) == DataFrame(a=1:2, b=3:4, c=5:6, x1="x")
+            @test transform(sdf -> [[1,2],[3,4]], df) == DataFrame(a=1:2, b=3:4, c=5:6, x1=[[1,2],[3,4]])
+            for ret in (DataFrame(), NamedTuple(), zeros(0,0), DataFrame(t=1)[1, 1:0])
+                @test transform(sdf -> ret, df) == df
+            end
+            @test_throws ArgumentError transform(sdf -> DataFrame(a=10), df)
+            @test_throws ArgumentError transform(sdf -> zeros(1, 2), df)
+            @test transform(sdf -> DataFrame(a=[10, 11]), df) == DataFrame(a=[10, 11], b=3:4, c=5:6)
+            @test transform(sdf -> [10 11; 12 13], df) == DataFrame(a=1:2, b=3:4, c=5:6, x1=[10, 12], x2=[11, 13])
+            @test transform(sdf -> DataFrame(a=10)[1, :], df) == DataFrame(a=[10, 10], b=3:4, c=5:6)
+
+            @test combine(sdf -> sdf.b, df) == DataFrame(x1=3:4)
+            @test combine(sdf -> (b = 2sdf.b,), df) == DataFrame(b=[6,8])
+            @test combine(sdf -> (b = 1,), df) == DataFrame(b=[1])
+            @test combine(sdf -> (b = [1],), df) == DataFrame(b=[1])
+            @test combine(sdf -> (b = [1, 5],), df) == DataFrame(b=[1, 5])
+            @test combine(sdf -> 1, df) == DataFrame(x1=[1])
+            @test combine(sdf -> fill([1]), df) == DataFrame(x1=[[1]])
+            @test combine(sdf -> Ref([1]), df) == DataFrame(x1=[[1]])
+            @test combine(sdf -> "x", df) == DataFrame(x1=["x"])
+            @test combine(sdf -> [[1,2],[3,4]], df) == DataFrame(x1=[[1,2],[3,4]])
+            for ret in (DataFrame(), NamedTuple(), zeros(0,0), DataFrame(t=1)[1, 1:0])
+                @test combine(sdf -> ret, df) == DataFrame()
+            end
+            @test combine(sdf -> DataFrame(a=10), df) == DataFrame(a=10)
+            @test combine(sdf -> zeros(1, 2), df) == DataFrame(x1=0, x2=0)
+            @test combine(sdf -> DataFrame(a=[10, 11]), df) == DataFrame(a=[10, 11])
+            @test combine(sdf -> [10 11; 12 13], df) == DataFrame(x1=[10, 12], x2=[11, 13])
+            @test combine(sdf -> DataFrame(a=10)[1, :], df) == DataFrame(a=[10])
+        end
+
+        df = DataFrame(a=1:2, b=3:4, c=5:6)
+        @test select!(sdf -> sdf.b, copy(df)) == DataFrame(x1=3:4)
+        @test select!(sdf -> (b = 2sdf.b,), copy(df)) == DataFrame(b=[6,8])
+        @test select!(sdf -> (b = 1,), copy(df)) == DataFrame(b=[1, 1])
+        @test_throws ArgumentError select!(sdf -> (b = [1],), copy(df))
+        @test select!(sdf -> (b = [1, 5],), copy(df)) == DataFrame(b=[1, 5])
+        @test select!(sdf -> 1, copy(df)) == DataFrame(x1=[1, 1])
+        @test select!(sdf -> fill([1]), copy(df)) == DataFrame(x1=[[1], [1]])
+        @test select!(sdf -> Ref([1]), copy(df)) == DataFrame(x1=[[1], [1]])
+        @test select!(sdf -> "x", copy(df)) == DataFrame(x1=["x", "x"])
+        @test select!(sdf -> [[1,2],[3,4]], copy(df)) == DataFrame(x1=[[1,2],[3,4]])
+        for ret in (DataFrame(), NamedTuple(), zeros(0,0), DataFrame(t=1)[1, 1:0])
+            @test select!(sdf -> ret, copy(df)) == DataFrame()
+        end
+        @test_throws ArgumentError select!(sdf -> DataFrame(a=10), copy(df))
+        @test_throws ArgumentError select!(sdf -> zeros(1, 2), copy(df))
+        @test select!(sdf -> DataFrame(a=[10, 11]), copy(df)) == DataFrame(a=[10, 11])
+        @test select!(sdf -> [10 11; 12 13], copy(df)) == DataFrame(x1=[10, 12], x2=[11, 13])
+        @test select!(sdf -> DataFrame(a=10)[1, :], copy(df)) == DataFrame(a=[10, 10])
+
+        @test transform!(sdf -> sdf.b, copy(df)) == [df DataFrame(x1=3:4)]
+        @test transform!(sdf -> (b = 2sdf.b,), copy(df)) == DataFrame(a=1:2, b=[6,8], c=5:6)
+        @test transform!(sdf -> (b = 1,), copy(df)) == DataFrame(a=[1,2], b=[1, 1], c=[5,6])
+        @test_throws ArgumentError transform!(sdf -> (b = [1],), copy(df))
+        @test transform!(sdf -> (b = [1, 5],), copy(df)) == DataFrame(a=[1,2], b=[1, 5], c=[5,6])
+        @test transform!(sdf -> 1, copy(df)) == DataFrame(a=1:2, b=3:4, c=5:6, x1=1)
+        @test transform!(sdf -> fill([1]), copy(df)) == DataFrame(a=1:2, b=3:4, c=5:6, x1=[[1],[1]])
+        @test transform!(sdf -> Ref([1]), copy(df)) == DataFrame(a=1:2, b=3:4, c=5:6, x1=[[1],[1]])
+        @test transform!(sdf -> "x", copy(df)) == DataFrame(a=1:2, b=3:4, c=5:6, x1="x")
+        @test transform!(sdf -> [[1,2],[3,4]], copy(df)) == DataFrame(a=1:2, b=3:4, c=5:6, x1=[[1,2],[3,4]])
+        for ret in (DataFrame(), NamedTuple(), zeros(0,0), DataFrame(t=1)[1, 1:0])
+            @test transform!(sdf -> ret, copy(df)) == df
+        end
+        @test_throws ArgumentError transform!(sdf -> DataFrame(a=10), copy(df))
+        @test_throws ArgumentError transform!(sdf -> zeros(1, 2), copy(df))
+        @test transform!(sdf -> DataFrame(a=[10, 11]), copy(df)) == DataFrame(a=[10, 11], b=3:4, c=5:6)
+        @test transform!(sdf -> [10 11; 12 13], copy(df)) == DataFrame(a=1:2, b=3:4, c=5:6, x1=[10, 12], x2=[11, 13])
+        @test transform!(sdf -> DataFrame(a=10)[1, :], copy(df)) == DataFrame(a=[10, 10], b=3:4, c=5:6)
+    end
+end
+
+@testset "empty ByRow" begin
+    df = DataFrame(a=1:3)
+    @test select(df, [] => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1])
+    @test combine(df, [] => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1])
+    @test transform(df, [] => ByRow(() -> 1)) == DataFrame("a" => 1:3, "function" => [1, 1, 1])
+
+    df = DataFrame()
+    @test select(df, [] => ByRow(() -> 1)) == DataFrame("function" => [])
+    @test combine(df, [] => ByRow(() -> 1)) == DataFrame("function" => [])
+    @test transform(df, [] => ByRow(() -> 1)) == DataFrame("function" => [])
+    @test eltype(select(df, [] => ByRow(() -> 1)).function) == Int64
+    @test eltype(combine(df, [] => ByRow(() -> 1)).function) == Int64
+    @test eltype(transform(df, [] => ByRow(() -> 1)).function) == Int64
+end
+
 end # module

From e8cacff5cded9e727a7134effeb99a248b23c998 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Wed, 30 Sep 2020 14:54:29 +0200
Subject: [PATCH 07/21] more tests of empty ByRow

---
 test/select.jl | 44 ++++++++++++++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 12 deletions(-)

diff --git a/test/select.jl b/test/select.jl
index 33f405c679..9a8d9d50b2 100644
--- a/test/select.jl
+++ b/test/select.jl
@@ -1432,18 +1432,38 @@ end
 end
 
 @testset "empty ByRow" begin
-    df = DataFrame(a=1:3)
-    @test select(df, [] => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1])
-    @test combine(df, [] => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1])
-    @test transform(df, [] => ByRow(() -> 1)) == DataFrame("a" => 1:3, "function" => [1, 1, 1])
-
-    df = DataFrame()
-    @test select(df, [] => ByRow(() -> 1)) == DataFrame("function" => [])
-    @test combine(df, [] => ByRow(() -> 1)) == DataFrame("function" => [])
-    @test transform(df, [] => ByRow(() -> 1)) == DataFrame("function" => [])
-    @test eltype(select(df, [] => ByRow(() -> 1)).function) == Int64
-    @test eltype(combine(df, [] => ByRow(() -> 1)).function) == Int64
-    @test eltype(transform(df, [] => ByRow(() -> 1)).function) == Int64
+    for sel in ([], AsTable([]))
+        df = DataFrame(a=1:3)
+        @test select(df, sel => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1])
+        @test combine(df, sel => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1])
+        @test transform(df, sel => ByRow(() -> 1)) == DataFrame("a" => 1:3, "function" => [1, 1, 1])
+
+        for df in (DataFrame(), DataFrame(a=[]))
+            @test select(df, sel => ByRow(() -> 1)) == DataFrame("function" => [])
+            @test combine(df, sel => ByRow(() -> 1)) == DataFrame("function" => [])
+            if ncol(df) == 0
+                @test transform(df, sel => ByRow(() -> 1)) == DataFrame("function" => [])
+            else
+                @test transform(df, sel => ByRow(() -> 1)) == DataFrame("a" => [], "function" => [])
+            end
+            @test eltype(select(df, sel => ByRow(() -> 1)).function) == Int
+            @test eltype(combine(df, sel => ByRow(() -> 1)).function) == Int
+            @test eltype(transform(df, sel => ByRow(() -> 1)).function) == Int
+
+            df2 = select(df, sel => ByRow(() -> (a=1,b="1")) => AsTable)
+            @test names(df2) == ["a", "b"]
+            @test eltype.(eachcol(df2)) == [Int, String]
+            df2 = select(df, sel => ByRow(() -> (a=1,b="1")) => [:p, :q])
+            @test names(df2) == ["p", "q"]
+            @test eltype.(eachcol(df2)) == [Int, String]
+
+            # here this follows Tables.jl behavior
+            for res in ([1, "1"], (1, "1"))
+                @test select(df, sel => ByRow(() -> res) => AsTable) == DataFrame()
+                @test_throws ArgumentError select(df, sel => ByRow(() -> res) => [:p, :q])
+            end
+        end
+    end
 end
 
 end # module

From de4bacf53fdcc22cd41b7854d8d5e6eca27f0221 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Wed, 30 Sep 2020 15:54:12 +0200
Subject: [PATCH 08/21] improved tests of empty ByRow

---
 src/abstractdataframe/selection.jl |  4 +-
 test/select.jl                     | 98 +++++++++++++++++++-----------
 2 files changed, 66 insertions(+), 36 deletions(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 5021bed0ac..15580e34d9 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -195,10 +195,10 @@ function select_transform!(nc::Union{Base.Callable, Pair{<:Union{Int, AbstractVe
         tbl = Tables.columntable(select(df, col_idx.cols, copycols=false))
         if isempty(tbl) && fun isa ByRow
             if isempty(df)
-                T = Base.return_types(fun.fun, ())[1]
+                T = Base.return_types(fun.fun, (NamedTuple{(),Tuple{}},))[1]
                 res = T[]
             else
-                res = [fun.fun() for _ in 1:nrow(df)]
+                res = [fun.fun(NamedTuple()) for _ in 1:nrow(df)]
             end
         else
             res = fun(tbl)
diff --git a/test/select.jl b/test/select.jl
index 9a8d9d50b2..a5aa6dde82 100644
--- a/test/select.jl
+++ b/test/select.jl
@@ -766,16 +766,16 @@ end
     @test combine(df, r"z" => () -> y) == DataFrame(:function => y)
     @test select(df, r"z" => () -> x)[!, 1] === x # no copy even for copycols=true
     @test_throws MethodError select(df, r"z" => x -> 1)
-    @test_throws ArgumentError select(df, r"z" => ByRow(rand))
+    @test select(df, r"z" => ByRow(() -> 1)) == DataFrame(:function => fill(1, 10))
 
     @test select(df, r"z", copycols=false) == DataFrame()
     @test select(df, r"z" => () -> x, copycols=false) == DataFrame(:function => x)
     @test select(df, r"z" => () -> x, copycols=false)[!, 1] === x
     @test_throws MethodError select(df, r"z" => x -> 1, copycols=false)
-    @test_throws ArgumentError select(df, r"z" => ByRow(rand), copycols=false)
+    @test select(df, r"z" => ByRow(() -> 1), copycols=false) == DataFrame(:function => fill(1, 10))
 
     @test_throws MethodError select!(df, r"z" => x -> 1)
-    @test_throws ArgumentError select!(df, r"z" => ByRow(rand))
+    @test select!(df, r"z" => ByRow(() -> 1)) == DataFrame(:function => fill(1, 10))
     @test_throws MethodError select!(df, r"z" => () -> x, copycols=false)
 
     select!(df, r"z" => () -> x)
@@ -1141,7 +1141,8 @@ end
           DataFrame(a_b_c_function=fill(df[1,:], 3))
     @test select(df, AsTable(:) => ByRow(x -> df[1, :]) => AsTable) ==
           DataFrame(a=[1,1,1], b=4, c=7)
-    @test_throws ArgumentError transform(df, AsTable(Not(:)) => ByRow(identity))
+    @test transform(df, AsTable(Not(:)) =>
+          ByRow(identity)) == [df DataFrame(:identity => fill(NamedTuple(), nrow(df)))]
 
     @test select(df, AsTable(Not(:)) => Ref) == repeat(DataFrame(Ref = NamedTuple()), nrow(df))
     @test combine(df, AsTable(Not(:)) => Ref) == DataFrame(Ref = NamedTuple())
@@ -1432,36 +1433,65 @@ end
 end
 
 @testset "empty ByRow" begin
-    for sel in ([], AsTable([]))
-        df = DataFrame(a=1:3)
-        @test select(df, sel => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1])
-        @test combine(df, sel => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1])
-        @test transform(df, sel => ByRow(() -> 1)) == DataFrame("a" => 1:3, "function" => [1, 1, 1])
-
-        for df in (DataFrame(), DataFrame(a=[]))
-            @test select(df, sel => ByRow(() -> 1)) == DataFrame("function" => [])
-            @test combine(df, sel => ByRow(() -> 1)) == DataFrame("function" => [])
-            if ncol(df) == 0
-                @test transform(df, sel => ByRow(() -> 1)) == DataFrame("function" => [])
-            else
-                @test transform(df, sel => ByRow(() -> 1)) == DataFrame("a" => [], "function" => [])
-            end
-            @test eltype(select(df, sel => ByRow(() -> 1)).function) == Int
-            @test eltype(combine(df, sel => ByRow(() -> 1)).function) == Int
-            @test eltype(transform(df, sel => ByRow(() -> 1)).function) == Int
-
-            df2 = select(df, sel => ByRow(() -> (a=1,b="1")) => AsTable)
-            @test names(df2) == ["a", "b"]
-            @test eltype.(eachcol(df2)) == [Int, String]
-            df2 = select(df, sel => ByRow(() -> (a=1,b="1")) => [:p, :q])
-            @test names(df2) == ["p", "q"]
-            @test eltype.(eachcol(df2)) == [Int, String]
-
-            # here this follows Tables.jl behavior
-            for res in ([1, "1"], (1, "1"))
-                @test select(df, sel => ByRow(() -> res) => AsTable) == DataFrame()
-                @test_throws ArgumentError select(df, sel => ByRow(() -> res) => [:p, :q])
-            end
+    df = DataFrame(a=1:3)
+
+    @test select(df, [] => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1])
+    @test combine(df, [] => ByRow(() -> 1)) == DataFrame("function" => [1, 1, 1])
+    @test transform(df, [] => ByRow(() -> 1)) == DataFrame("a" => 1:3, "function" => [1, 1, 1])
+
+    for df in (DataFrame(), DataFrame(a=[]))
+        @test select(df, [] => ByRow(() -> 1)) == DataFrame("function" => [])
+        @test combine(df, [] => ByRow(() -> 1)) == DataFrame("function" => [])
+        if ncol(df) == 0
+            @test transform(df, [] => ByRow(() -> 1)) == DataFrame("function" => [])
+        else
+            @test transform(df, [] => ByRow(() -> 1)) == DataFrame("a" => [], "function" => [])
+        end
+        @test eltype(select(df, [] => ByRow(() -> 1)).function) == Int
+        @test eltype(combine(df, [] => ByRow(() -> 1)).function) == Int
+        @test eltype(transform(df, [] => ByRow(() -> 1)).function) == Int
+
+        df2 = select(df, [] => ByRow(() -> (a=1,b="1")) => AsTable)
+        @test names(df2) == ["a", "b"]
+        @test eltype.(eachcol(df2)) == [Int, String]
+        df2 = select(df, [] => ByRow(() -> (a=1,b="1")) => [:p, :q])
+        @test names(df2) == ["p", "q"]
+        @test eltype.(eachcol(df2)) == [Int, String]
+
+        # here this follows Tables.jl behavior
+        for res in ([1, "1"], (1, "1"))
+            @test select(df, [] => ByRow(() -> res) => AsTable) == DataFrame()
+            @test_throws ArgumentError select(df, [] => ByRow(() -> res) => [:p, :q])
+        end
+    end
+
+    @test select(df, AsTable([]) => ByRow(x -> 1)) == DataFrame("function" => [1, 1, 1])
+    @test combine(df, AsTable([]) => ByRow(x -> 1)) == DataFrame("function" => [1, 1, 1])
+    @test transform(df, AsTable([]) => ByRow(x -> 1)) == DataFrame("a" => 1:3, "function" => [1, 1, 1])
+
+    for df in (DataFrame(), DataFrame(a=[]))
+        @test select(df, AsTable([]) => ByRow(x -> 1)) == DataFrame("function" => [])
+        @test combine(df, AsTable([]) => ByRow(x -> 1)) == DataFrame("function" => [])
+        if ncol(df) == 0
+            @test transform(df, AsTable([]) => ByRow(x -> 1)) == DataFrame("function" => [])
+        else
+            @test transform(df, AsTable([]) => ByRow(x -> 1)) == DataFrame("a" => [], "function" => [])
+        end
+        @test eltype(select(df, AsTable([]) => ByRow(x -> 1)).function) == Int
+        @test eltype(combine(df, AsTable([]) => ByRow(x -> 1)).function) == Int
+        @test eltype(transform(df, AsTable([]) => ByRow(x -> 1)).function) == Int
+
+        df2 = select(df, AsTable([]) => ByRow(x -> (a=1,b="1")) => AsTable)
+        @test names(df2) == ["a", "b"]
+        @test eltype.(eachcol(df2)) == [Int, String]
+        df2 = select(df, AsTable([]) => ByRow(x -> (a=1,b="1")) => [:p, :q])
+        @test names(df2) == ["p", "q"]
+        @test eltype.(eachcol(df2)) == [Int, String]
+
+        # here this follows Tables.jl behavior
+        for res in ([1, "1"], (1, "1"))
+            @test select(df, AsTable([]) => ByRow(x -> res) => AsTable) == DataFrame()
+            @test_throws ArgumentError select(df, AsTable([]) => ByRow(x -> res) => [:p, :q])
         end
     end
 end

From 4dedb3d18510eb3e60bb2db551e2ff63eea61079 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Thu, 1 Oct 2020 11:14:56 +0200
Subject: [PATCH 09/21] finalize tests

---
 src/abstractdataframe/selection.jl | 294 +++++++++++++----------------
 test/grouping.jl                   |   6 +-
 test/select.jl                     |  73 +++++++
 3 files changed, 203 insertions(+), 170 deletions(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 15580e34d9..25e7695e43 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -169,39 +169,22 @@ function normalize_selection(idx::AbstractIndex,
     return (wanttable ? AsTable(c) : c) => fun => newcol
 end
 
-function select_transform!(nc::Union{Base.Callable, Pair{<:Union{Int, AbstractVector{Int}, AsTable},
-                                                    <:Pair{<:Base.Callable,
-                                                           <:Union{Symbol, AbstractVector{Symbol}, DataType}}}},
-                           df::AbstractDataFrame, newdf::DataFrame,
-                           transformed_cols::Set{Symbol}, copycols::Bool,
-                           allow_resizing_newdf::Ref{Bool})
-    if nc isa Base.Callable
-        col_idx, fun, newname = nothing, nc, nothing
-    else
-        col_idx, (fun, newname) = nc
-    end
-    if newname isa DataType
-        newname === AsTable || throw(ArgumentError("Only DataType supported as target is AsTable"))
-    end
-    # It is allowed to request a tranformation operation into a newname column
-    # only once. This is ensured by the logic related to transformed_cols dictionaly
-    # in _manipulate, therefore in select_transform! such a duplicate should not happen
-    cdf = eachcol(df)
+function _transformation_helper(df, col_idx, @nospecialize(fun))
     if col_idx === nothing
-        res = fun(df)
+        return fun(df)
     elseif col_idx isa Int
-        res = fun(df[!, col_idx])
+        return fun(df[!, col_idx])
     elseif col_idx isa AsTable
         tbl = Tables.columntable(select(df, col_idx.cols, copycols=false))
         if isempty(tbl) && fun isa ByRow
             if isempty(df)
                 T = Base.return_types(fun.fun, (NamedTuple{(),Tuple{}},))[1]
-                res = T[]
+                return T[]
             else
-                res = [fun.fun(NamedTuple()) for _ in 1:nrow(df)]
+                return [fun.fun(NamedTuple()) for _ in 1:nrow(df)]
             end
         else
-            res = fun(tbl)
+            return fun(tbl)
         end
     else
         # it should be fast enough here as we do not expect to do it millions of times
@@ -209,47 +192,124 @@ function select_transform!(nc::Union{Base.Callable, Pair{<:Union{Int, AbstractVe
         if isempty(col_idx) && fun isa ByRow
             if isempty(df)
                 T = Base.return_types(fun.fun, ())[1]
-                res = T[]
+                return T[]
             else
-                res = [fun.fun() for _ in 1:nrow(df)]
+                return [fun.fun() for _ in 1:nrow(df)]
             end
         else
-            res = fun(map(c -> cdf[c], col_idx)...)
+            cdf = eachcol(df)
+            return fun(map(c -> cdf[c], col_idx)...)
+        end
+    end
+    throw(ErrorException("unreachable reached"))
+end
+
+function _gen_colnames(@nospecialize(res), newname)
+    if res isa AbstractMatrix
+        colnames = gennames(size(res, 2))
+    else
+        colnames = propertynames(res)
+    end
+
+    if !(newname === AsTable || newname === nothing)
+        if length(colnames) != length(newname)
+            throw(ArgumentError("Number of returned columns does not match the " *
+                                "length of requested output"))
         end
+        colnames = newname
     end
 
+    return colnames
+end
+
+function _expand_to_table(@nospecialize(res))
+    if res isa AbstractVector && !isempty(res)
+        kp1 = keys(res[1])
+        all(x -> keys(x) == kp1, res) || throw(ArgumentError("keys of the returned elements must be identical"))
+        newres = DataFrame()
+        prepend = all(x -> x isa Integer, kp1)
+        for n in kp1
+            newres[!, prepend ? Symbol("x", n) : Symbol(n)] = [x[n] for x in res]
+        end
+        return newres
+    else
+        return Tables.columntable(res)
+    end
+end
+
+function _insert_row_multicolumn(df, newdf, allow_resizing_newdf, colnames, res)
+    if ncol(newdf) == 0
+        # if allow_resizing_newdf[] is false we know this is select or transform
+        rows = allow_resizing_newdf[] ? 1 : nrow(df)
+    else
+        # allow squashing a scalar to 0 rows
+        rows = nrow(newdf)
+    end
+    @assert length(colnames) == length(res)
+    for (newname, v) in zip(colnames, res)
+        # note that newdf potentially can contain c in general
+        newdf[!, newname] = fill!(Tables.allocatecolumn(typeof(v), rows), v)
+    end
+end
+
+function _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, @nospecialize(fun))
+    # allow shortening to 0 rows
+    if allow_resizing_newdf[] && nrow(newdf) == 1
+        newdfcols = _columns(newdf)
+        for (i, col) in enumerate(newdfcols)
+            newdfcols[i] = fill!(similar(col, lr), first(col))
+        end
+    end
+    # !allow_resizing_newdf[] && ncol(newdf) == 0
+    # means that we use `select` or `transform` not `combine`
+    if !allow_resizing_newdf[] && ncol(newdf) == 0 && lr != nrow(df)
+        throw(ArgumentError("length $(lr) of vector returned from " *
+                            "function $fun is different from number of rows " *
+                            "$(nrow(df)) of the source data frame."))
+    end
+    allow_resizing_newdf[] = false
+end
+
+function _add_col_check_copy(df, newdf, col_idx, copycols, @nospecialize(fun), newname, @nospecialize(v))
+    cdf = eachcol(df)
+    vpar = parent(v)
+    parent_cols = col_idx isa AsTable ? col_idx.cols : (col_idx === nothing ? (1:ncol(df)) : col_idx)
+    if copycols && !(fun isa ByRow) && (v isa SubArray || any(i -> vpar === parent(cdf[i]), parent_cols))
+        newdf[!, newname] = copy(v)
+    else
+        newdf[!, newname] = v
+    end
+end
+
+function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{Int, AbstractVector{Int}, AsTable},
+                                                    <:Pair{<:Base.Callable,
+                                                           <:Union{Symbol, AbstractVector{Symbol}, DataType}}}}),
+                           df::AbstractDataFrame, newdf::DataFrame,
+                           transformed_cols::Set{Symbol}, copycols::Bool,
+                           allow_resizing_newdf::Ref{Bool})
+    if nc isa Base.Callable
+        col_idx, fun, newname = nothing, nc, nothing
+    else
+        col_idx, (fun, newname) = nc
+    end
+    if newname isa DataType
+        newname === AsTable || throw(ArgumentError("Only DataType supported as target is AsTable"))
+    end
+    # It is allowed to request a transformation operation into a newname column
+    # only once. This is ensured by the logic related to the transformed_cols set
+    # in _manipulate, therefore in select_transform! such a duplicate should not happen
+    res = _transformation_helper(df, col_idx, fun)
+
     if (newname === AsTable || newname isa AbstractVector{Symbol}) &&
         !(res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix})
-        if res isa AbstractVector && !isempty(res)
-            kp1 = keys(res[1])
-            all(x -> keys(x) == kp1, res) || throw(ArgumentError("keys of the returned elements must be identical"))
-            true_res = res
-            res = DataFrame()
-            prepend = all(x -> x isa Integer, kp1)
-            for n in kp1
-                res[!, prepend ? Symbol("x", n) : Symbol(n)] = [x[n] for x in true_res]
-            end
-        else
-            res = Tables.columntable(res)
-        end
+        res = _expand_to_table(res)
     end
 
     if res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix}
         if newname isa Symbol
             throw(ArgumentError("Table returned while a single column return value was requested"))
         end
-        if res isa AbstractMatrix
-            colnames = gennames(size(res, 2))
-        else
-            colnames = propertynames(res)
-        end
-        if !(newname === AsTable || newname === nothing)
-            if length(colnames) != length(newname)
-                throw(ArgumentError("Number of returned columns does not match the " *
-                                    "length of requested output"))
-            end
-            colnames = newname
-        end
+        colnames = _gen_colnames(res, newname)
         isempty(colnames) && return # nothing to do
 
         if any(in(transformed_cols), colnames)
@@ -261,51 +321,14 @@ function select_transform!(nc::Union{Base.Callable, Pair{<:Union{Int, AbstractVe
         end
         if res isa AbstractDataFrame
             lr = nrow(res)
-            # allow shortening to 0 rows
-            if allow_resizing_newdf[] && nrow(newdf) == 1
-                newdfcols = _columns(newdf)
-                for (i, col) in enumerate(newdfcols)
-                    newdfcols[i] = fill!(similar(col, lr), first(col))
-                end
-            end
-
-            # !allow_resizing_newdf[] && ncol(newdf) == 0
-            # means that we use `select` or `transform` not `combine`
-            if !allow_resizing_newdf[] && ncol(newdf) == 0 && lr != nrow(df)
-                throw(ArgumentError("length $(lr) of vector returned from " *
-                                    "function $fun is different from number of rows " *
-                                    "$(nrow(df)) of the source data frame."))
-            end
-            allow_resizing_newdf[] = false
+            _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun)
             @assert length(colnames) == ncol(res)
             for (newname, v) in zip(colnames, eachcol(res))
-                vpar = parent(v)
-                parent_cols = col_idx isa AsTable ? col_idx.cols : (col_idx === nothing ? (1:ncol(df)) : col_idx)
-                if copycols && !(fun isa ByRow) &&
-                    (v isa SubArray || any(i -> vpar === parent(cdf[i]), parent_cols))
-                    newdf[!, newname] = copy(v)
-                else
-                    newdf[!, newname] = v
-                end
+                _add_col_check_copy(df, newdf, col_idx, copycols, fun, newname, v)
             end
         elseif res isa AbstractMatrix
             lr = size(res, 1)
-            # allow shortening to 0 rows
-            if allow_resizing_newdf[] && nrow(newdf) == 1
-                newdfcols = _columns(newdf)
-                for (i, col) in enumerate(newdfcols)
-                    newdfcols[i] = fill!(similar(col, lr), first(col))
-                end
-            end
-
-            # !allow_resizing_newdf[] && ncol(newdf) == 0
-            # means that we use `select` or `transform` not `combine`
-            if !allow_resizing_newdf[] && ncol(newdf) == 0 && lr != nrow(df)
-                throw(ArgumentError("length $(lr) of vector returned from " *
-                                    "function $fun is different from number of rows " *
-                                    "$(nrow(df)) of the source data frame."))
-            end
-            allow_resizing_newdf[] = false
+            _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun)
             @assert length(colnames) == size(res, 2)
             for (i, newname) in enumerate(colnames)
                 newdf[!, newname] = res[:, i]
@@ -313,62 +336,18 @@ function select_transform!(nc::Union{Base.Callable, Pair{<:Union{Int, AbstractVe
         elseif res isa NamedTuple
             if all(v -> v isa AbstractVector, res)
                 lr = length(res[1])
-                # allow shortening to 0 rows
-                if allow_resizing_newdf[] && nrow(newdf) == 1
-                    newdfcols = _columns(newdf)
-                    for (i, col) in enumerate(newdfcols)
-                        newdfcols[i] = fill!(similar(col, lr), first(col))
-                    end
-                end
-
-                # !allow_resizing_newdf[] && ncol(newdf) == 0
-                # means that we use `select` or `transform` not `combine`
-                if !allow_resizing_newdf[] && ncol(newdf) == 0 && lr != nrow(df)
-                    throw(ArgumentError("length $(lr) of vector returned from " *
-                                        "function $fun is different from number of rows " *
-                                        "$(nrow(df)) of the source data frame."))
-                end
-                allow_resizing_newdf[] = false
+                _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun)
                 @assert length(colnames) == length(res)
                 for (newname, v) in zip(colnames, res)
-                    vpar = parent(v)
-                    parent_cols = col_idx isa AsTable ? col_idx.cols : (col_idx === nothing ? (1:ncol(df)) : col_idx)
-                    if copycols && !(fun isa ByRow) &&
-                        (v isa SubArray || any(i -> vpar === parent(cdf[i]), parent_cols))
-                        newdf[!, newname] = copy(v)
-                    else
-                        newdf[!, newname] = v
-                    end
+                    _add_col_check_copy(df, newdf, col_idx, copycols, fun, newname, v)
                 end
             elseif any(v -> v isa AbstractVector, res)
                 throw(ArgumentError("mixing single values and vectors in a named tuple is not allowed"))
             else
-                if ncol(newdf) == 0
-                    # if allow_resizing_newdf[] is false we know this is select or transform
-                    rows = allow_resizing_newdf[] ? 1 : nrow(df)
-                else
-                    # allow squashing a scalar to 0 rows
-                    rows = nrow(newdf)
-                end
-                @assert length(colnames) == length(res)
-                for (newname, v) in zip(colnames, res)
-                    # note that newdf potentially can contain c in general
-                    newdf[!, newname] = fill!(Tables.allocatecolumn(typeof(v), rows), v)
-                end
+                _insert_row_multicolumn(df, newdf, allow_resizing_newdf, colnames, res)
             end
         elseif res isa DataFrameRow
-            if ncol(newdf) == 0
-                # if allow_resizing_newdf[] is false we know this is select or transform
-                rows = allow_resizing_newdf[] ? 1 : nrow(df)
-            else
-                # allow squashing a scalar to 0 rows
-                rows = nrow(newdf)
-            end
-            @assert length(colnames) == length(res)
-            for (newname, v) in zip(colnames, res)
-                # note that newdf potentially can contain c in general
-                newdf[!, newname] = fill!(Tables.allocatecolumn(typeof(v), rows), v)
-            end
+            _insert_row_multicolumn(df, newdf, allow_resizing_newdf, colnames, res)
         end
     elseif res isa AbstractVector
         if newname === nothing
@@ -379,30 +358,9 @@ function select_transform!(nc::Union{Base.Callable, Pair{<:Union{Int, AbstractVe
         else
             push!(transformed_cols, newname)
         end
-        # allow shortening to 0 rows
-        if allow_resizing_newdf[] && nrow(newdf) == 1
-            newdfcols = _columns(newdf)
-            for (i, col) in enumerate(newdfcols)
-                newdfcols[i] = fill!(similar(col, length(res)), first(col))
-            end
-        end
-
-        # !allow_resizing_newdf[] && ncol(newdf) == 0
-        # means that we use `select` or `transform` not `combine`
-        if !allow_resizing_newdf[] && ncol(newdf) == 0 && length(res) != nrow(df)
-            throw(ArgumentError("length $(length(res)) of vector returned from " *
-                                "function $fun is different from number of rows " *
-                                "$(nrow(df)) of the source data frame."))
-        end
-        allow_resizing_newdf[] = false
-        respar = parent(res)
-        parent_cols = col_idx isa AsTable ? col_idx.cols : (col_idx === nothing ? (1:ncol(df)) : col_idx)
-        if copycols && !(fun isa ByRow) &&
-            (res isa SubArray || any(i -> respar === parent(cdf[i]), parent_cols))
-            newdf[!, newname] = copy(res)
-        else
-            newdf[!, newname] = res
-        end
+        lr = length(res)
+        _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun)
+        _add_col_check_copy(df, newdf, col_idx, copycols, fun, newname, res)
     else
         if newname === nothing
             newname = :x1
@@ -575,7 +533,7 @@ julia> select!(df, AsTable(:) => ByRow(mean), renamecols=false)
 select!(df::DataFrame, args...; renamecols::Bool=true) =
     _replace_columns!(df, select(df, args..., copycols=false, renamecols=renamecols))
 
-function select!(arg::Function, df::AbstractDataFrame; renamecols::Bool=true)
+function select!(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true)
     if arg isa Colon
         throw(ArgumentError("Only transformations are allowed when function is a " *
                             "frist argument to select!"))
@@ -595,7 +553,7 @@ See [`select!`](@ref) for detailed rules regarding accepted values for `args`.
 transform!(df::DataFrame, args...; renamecols::Bool=true) =
     select!(df, :, args..., renamecols=renamecols)
 
-function transform!(arg::Function, df::AbstractDataFrame; renamecols::Bool=true)
+function transform!(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true)
     if arg isa Colon
         throw(ArgumentError("Only transformations are allowed when function is a " *
                             "frist argument to transform!"))
@@ -726,7 +684,7 @@ julia> select(df, AsTable(:) => ByRow(mean), renamecols=false)
 select(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) =
     manipulate(df, args..., copycols=copycols, keeprows=true, renamecols=renamecols)
 
-function select(arg::Function, df::AbstractDataFrame; renamecols::Bool=true)
+function select(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true)
     if arg isa Colon
         throw(ArgumentError("Only transformations are allowed when function is a " *
                             "frist argument to select"))
@@ -747,7 +705,7 @@ See [`select`](@ref) for detailed rules regarding accepted values for `args`.
 transform(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) =
     select(df, :, args..., copycols=copycols, renamecols=renamecols)
 
-function transform(arg::Function, df::AbstractDataFrame; renamecols::Bool=true)
+function transform(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true)
     if arg isa Colon
         throw(ArgumentError("Only transformations are allowed when function is a " *
                             "frist argument to transform"))
@@ -790,7 +748,7 @@ julia> combine(df, :a => sum, nrow, renamecols=false)
 combine(df::AbstractDataFrame, args...; renamecols::Bool=true) =
     manipulate(df, args..., copycols=true, keeprows=false, renamecols=renamecols)
 
-function combine(arg::Function, df::AbstractDataFrame; renamecols::Bool=true)
+function combine(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true)
     if arg isa Colon
         throw(ArgumentError("Only transformations are allowed when function is a " *
                             "frist argument to combine"))
@@ -830,7 +788,7 @@ function manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool, rename
                     copycols, keeprows)
 end
 
-function _manipulate(df::AbstractDataFrame, normalized_cs, copycols::Bool, keeprows::Bool)
+function _manipulate(df::AbstractDataFrame, @nospecialize(normalized_cs), copycols::Bool, keeprows::Bool)
     @assert !(df isa SubDataFrame && copycols==false)
     newdf = DataFrame()
     # the role of transformed_cols is the following
diff --git a/test/grouping.jl b/test/grouping.jl
index c8839c9fa9..817ca1b255 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -1977,8 +1977,10 @@ end
           [df DataFrame(x_function=[(-1,), (-2,) ,(-3,) ,(-4,) ,(-5,)],
                         y_function=[(-6,), (-7,) ,(-8,) ,(-9,) ,(-10,)])]
 
-    @test_throws ArgumentError combine(gdf, AsTable([:x, :y]) => ByRow(identity))
-    @test_throws ArgumentError combine(gdf, AsTable([:x, :y]) => ByRow(x -> df[1, :]))
+    @test combine(gdf, AsTable([:x, :y]) => ByRow(identity)) ==
+          DataFrame(g=[1,1,1,2,2], x_y_identity=ByRow(identity)((x=1:5, y=6:10)))
+    @test combine(gdf, AsTable([:x, :y]) => ByRow(x -> df[1, :])) ==
+          DataFrame(g=[1,1,1,2,2], x_y_function=fill(df[1, :], 5))
 end
 
 @testset "test correctness of ungrouping" begin
diff --git a/test/select.jl b/test/select.jl
index a5aa6dde82..98eb2f3dc1 100644
--- a/test/select.jl
+++ b/test/select.jl
@@ -1430,6 +1430,79 @@ end
         @test transform!(sdf -> [10 11; 12 13], copy(df)) == DataFrame(a=1:2, b=3:4, c=5:6, x1=[10, 12], x2=[11, 13])
         @test transform!(sdf -> DataFrame(a=10)[1, :], copy(df)) == DataFrame(a=[10, 10], b=3:4, c=5:6)
     end
+
+    @testset "SELECT(DF, => AsTable)" begin
+        for df in (DataFrame(a=1:2, b=3:4, c=5:6), view(DataFrame(a=1:3, b=3:5, c=5:7, d=11:13), 1:2, 1:3))
+            for fun in (select, combine, transform),
+                res in (DataFrame(), DataFrame(a=1,b=2)[1, :], ones(1,1),
+                        (a=1,b=2), (a=[1], b=[2]), (a=1, b=[2]))
+                @test_throws ArgumentError fun(df, :a => x -> res)
+                @test_throws ArgumentError fun(df, :a => (x -> res) => :z)
+            end
+            for res in (DataFrame(x1=1, x2=2)[1, :], (x1=1,x2=2))
+                @test select(df, :a => (x -> res) => AsTable) == DataFrame(x1=[1,1], x2=[2,2])
+                @test transform(df, :a => (x -> res) => AsTable) == [df DataFrame(x1=[1,1], x2=[2,2])]
+                @test combine(df, :a => (x -> res) => AsTable) == DataFrame(x1=[1], x2=[2])
+                @test select(df, :a => (x -> res) => [:p, :q]) == DataFrame(p=[1,1], q=[2,2])
+                @test transform(df, :a => (x -> res) => [:p, :q]) == [df DataFrame(p=[1,1], q=[2,2])]
+                @test combine(df, :a => (x -> res) => [:p, :q]) == DataFrame(p=[1], q=[2])
+                @test_throws ArgumentError select(df, :a => (x -> res) => [:p, :q, :r])
+                @test_throws ArgumentError select(df, :a => (x -> res) => [:p])
+            end
+            for res in (DataFrame(x1=1, x2=2), [1 2], Tables.table([1 2], header=[:x1, :x2]),
+                        (x1=[1], x2=[2]))
+                @test combine(df, :a => (x -> res) => AsTable) == DataFrame(x1=1, x2=2)
+                @test combine(df, :a => (x -> res) => [:p, :q]) == DataFrame(p=1, q=2)
+                @test_throws ArgumentError combine(df, :a => (x -> res) => [:p])
+                @test_throws ArgumentError select(df, :a => (x -> res) => AsTable)
+                @test_throws ArgumentError transform(df, :a => (x -> res) => AsTable)
+            end
+            @test combine(df, :a => ByRow(x -> [x,x+1]),
+                          :a => ByRow(x -> [x, x+1]) => AsTable,
+                          :a => ByRow(x -> [x, x+1]) => [:p, :q],
+                          :a => ByRow(x -> (s=x, t=x+1)) => AsTable,
+                          :a => (x -> (k=x, l=x.+1)) => AsTable,
+                          :a => ByRow(x -> (s=x, t=x+1)) => :z) ==
+                  DataFrame(a_function=[[1, 2], [2, 3]], x1=[1, 2], x2=[2, 3],
+                            p=[1, 2], q=[2, 3], s=[1, 2], t=[2, 3], k=[1, 2], l=[2, 3],
+                            z=[(s=1, t=2), (s=2, t=3)])
+            @test select(df, :a => ByRow(x -> [x,x+1]),
+                         :a => ByRow(x -> [x, x+1]) => AsTable,
+                         :a => ByRow(x -> [x, x+1]) => [:p, :q],
+                         :a => ByRow(x -> (s=x, t=x+1)) => AsTable,
+                         :a => (x -> (k=x, l=x.+1)) => AsTable,
+                         :a => ByRow(x -> (s=x, t=x+1)) => :z) ==
+                  DataFrame(a_function=[[1, 2], [2, 3]], x1=[1, 2], x2=[2, 3],
+                            p=[1, 2], q=[2, 3], s=[1, 2], t=[2, 3], k=[1, 2], l=[2, 3],
+                            z=[(s=1, t=2), (s=2, t=3)])
+            @test transform(df, :a => ByRow(x -> [x,x+1]),
+                            :a => ByRow(x -> [x, x+1]) => AsTable,
+                            :a => ByRow(x -> [x, x+1]) => [:p, :q],
+                            :a => ByRow(x -> (s=x, t=x+1)) => AsTable,
+                            :a => (x -> (k=x, l=x.+1)) => AsTable,
+                            :a => ByRow(x -> (s=x, t=x+1)) => :z) ==
+                  [df DataFrame(a_function=[[1, 2], [2, 3]], x1=[1, 2], x2=[2, 3],
+                                p=[1, 2], q=[2, 3], s=[1, 2], t=[2, 3], k=[1, 2], l=[2, 3],
+                                z=[(s=1, t=2), (s=2, t=3)])]
+            @test_throws ArgumentError select(df, :a => (x -> [(a=1,b=2), (a=1, b=2, c=3)]) => AsTable)
+            @test_throws ArgumentError select(df, :a => (x -> [(a=1,b=2), (a=1, c=3)]) => AsTable)
+            @test_throws ArgumentError combine(df, :a => (x -> (a=1,b=2)) => :x)
+        end
+    end
+
+    @testset "check correctness of duplicate column names" begin
+        for df in (DataFrame(a=1:2, b=3:4, c=5:6), view(DataFrame(a=1:3, b=3:5, c=5:7, d=11:13), 1:2, 1:3))
+            @test select(df, :b, :) == DataFrame(b=3:4, a=1:2, c=5:6)
+            @test select(df, :b => :c, :) == DataFrame(c=3:4, a=1:2, b=3:4)
+            @test_throws ArgumentError select(df, :b => [:c, :d], :)
+            @test_throws ArgumentError select(df, :a, :a => x -> (a=[1,2], b=[3,4]))
+            @test_throws ArgumentError select(df, :a, :a => (x -> (a=[1,2], b=[3,4])) => AsTable)
+            @test select(df, [:b, :a], :a => (x -> (a=[11,12], b=[13,14])) => AsTable, :) ==
+                  DataFrame(b=[13, 14], a=[11, 12], c=[5, 6])
+            @test select(df, [:b, :a], :a => (x -> (a=[11,12], b=[13,14])) => [:b, :a], :) ==
+                  DataFrame(b=[11, 12], a=[13, 14], c=[5, 6])
+        end
+    end
 end
 
 @testset "empty ByRow" begin

From f307d1d8bbb4bf611c47430580717fc2903e5369 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Thu, 1 Oct 2020 12:43:37 +0200
Subject: [PATCH 10/21] change tests that now work

---
 test/grouping.jl | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/test/grouping.jl b/test/grouping.jl
index 817ca1b255..ea9163980d 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -2700,12 +2700,8 @@ end
     @test isequal_typed(combine(df, :x => (x -> 1:2) => :y), DataFrame(y=1:2))
     @test isequal_typed(combine(df, :x => (x -> x isa Vector{Int} ? "a" : 'a') => :y),
                         DataFrame(y="a"))
-
-    # in the future this should be DataFrame(nrow=0)
-    @test_throws ArgumentError combine(nrow, df)
-
-    # in the future this should be DataFrame(a=1,b=2)
-    @test_throws ArgumentError combine(sdf -> DataFrame(a=1,b=2), df)
+    @test combine(nrow, df) == DataFrame(nrow=0)
+    @test combine(sdf -> DataFrame(a=1,b=2), df) == DataFrame(a=1,b=2)
 end
 
 @testset "disallowed tuple column selector" begin

From 000b5c1f8cc919380612dc8c55baeb11a388ed2e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 3 Oct 2020 15:03:09 +0200
Subject: [PATCH 11/21] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/abstractdataframe/selection.jl | 35 ++++++++++++++----------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 25e7695e43..b045c70bd1 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -68,8 +68,9 @@ normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, <:AbstractStrin
 
 function normalize_selection(idx::AbstractIndex,
                              sel::Pair{<:Any,<:Pair{<:Base.Callable,
-                             <:Union{Symbol, AbstractString, DataType,
-                                     AbstractVector{Symbol}, AbstractVector{<:AbstractString}}}},
+                                                    <:Union{Symbol, AbstractString, DataType,
+                                                            AbstractVector{Symbol},
+                                                            AbstractVector{<:AbstractString}}}},
                              renamecols::Bool)
     lls = last(last(sel))
     if lls isa DataType
@@ -169,7 +170,7 @@ function normalize_selection(idx::AbstractIndex,
     return (wanttable ? AsTable(c) : c) => fun => newcol
 end
 
-function _transformation_helper(df, col_idx, @nospecialize(fun))
+function _transformation_helper(df::AbstractDataFrame, col_idx, @nospecialize(fun))
     if col_idx === nothing
         return fun(df)
     elseif col_idx isa Int
@@ -178,7 +179,7 @@ function _transformation_helper(df, col_idx, @nospecialize(fun))
         tbl = Tables.columntable(select(df, col_idx.cols, copycols=false))
         if isempty(tbl) && fun isa ByRow
             if isempty(df)
-                T = Base.return_types(fun.fun, (NamedTuple{(),Tuple{}},))[1]
+                T = Core.Compiler.return_type(fun.fun, (NamedTuple{(),Tuple{}},))
                 return T[]
             else
                 return [fun.fun(NamedTuple()) for _ in 1:nrow(df)]
@@ -211,7 +212,7 @@ function _gen_colnames(@nospecialize(res), newname)
         colnames = propertynames(res)
     end
 
-    if !(newname === AsTable || newname === nothing)
+    if newname !== AsTable && newname !== nothing
         if length(colnames) != length(newname)
             throw(ArgumentError("Number of returned columns does not match the " *
                                 "length of requested output"))
@@ -273,7 +274,7 @@ end
 function _add_col_check_copy(df, newdf, col_idx, copycols, @nospecialize(fun), newname, @nospecialize(v))
     cdf = eachcol(df)
     vpar = parent(v)
-    parent_cols = col_idx isa AsTable ? col_idx.cols : (col_idx === nothing ? (1:ncol(df)) : col_idx)
+    parent_cols = col_idx isa AsTable ? col_idx.cols : something(col_idx, 1:ncol(df))
     if copycols && !(fun isa ByRow) && (v isa SubArray || any(i -> vpar === parent(cdf[i]), parent_cols))
         newdf[!, newname] = copy(v)
     else
@@ -313,7 +314,8 @@ function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{I
         isempty(colnames) && return # nothing to do
 
         if any(in(transformed_cols), colnames)
-            throw(ArgumentError("Duplicate column name returned"))
+            throw(ArgumentError("Duplicate column name(s) returned: :" *
+                                "$(join(intersect(colnames, transformed_cols), ", :"))"))
         else
             startlen = length(transformed_cols)
             union!(transformed_cols, colnames)
@@ -354,7 +356,7 @@ function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{I
             newname = :x1
         end
         if newname in transformed_cols
-            throw(ArgumentError("duplicate name of a transformed column"))
+            throw(ArgumentError("duplicate output column name: :$newname"))
         else
             push!(transformed_cols, newname)
         end
@@ -366,7 +368,7 @@ function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{I
             newname = :x1
         end
         if newname in transformed_cols
-            throw(ArgumentError("duplicate name of a transformed column"))
+            throw(ArgumentError("duplicate output column name: :$newname"))
         else
             push!(transformed_cols, newname)
         end
@@ -535,8 +537,7 @@ select!(df::DataFrame, args...; renamecols::Bool=true) =
 
 function select!(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true)
     if arg isa Colon
-        throw(ArgumentError("Only transformations are allowed when function is a " *
-                            "frist argument to select!"))
+        throw(ArgumentError("First argument must be a transformation if the second argument is a data frame"))
     end
     return select!(df, arg)
 end
@@ -555,8 +556,7 @@ transform!(df::DataFrame, args...; renamecols::Bool=true) =
 
 function transform!(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true)
     if arg isa Colon
-        throw(ArgumentError("Only transformations are allowed when function is a " *
-                            "frist argument to transform!"))
+        throw(ArgumentError("First argument must be a transformation if the second argument is a data frame"))
     end
     return transform!(df, arg)
 end
@@ -686,8 +686,7 @@ select(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=tru
 
 function select(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true)
     if arg isa Colon
-        throw(ArgumentError("Only transformations are allowed when function is a " *
-                            "frist argument to select"))
+        throw(ArgumentError("First argument must be a transformation if the second argument is a data frame"))
     end
     return select(df, arg)
 end
@@ -707,8 +706,7 @@ transform(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=
 
 function transform(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true)
     if arg isa Colon
-        throw(ArgumentError("Only transformations are allowed when function is a " *
-                            "frist argument to transform"))
+        throw(ArgumentError("First argument must be a transformation if the second argument is a data frame"))
     end
     return transform(df, arg)
 end
@@ -750,8 +748,7 @@ combine(df::AbstractDataFrame, args...; renamecols::Bool=true) =
 
 function combine(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true)
     if arg isa Colon
-        throw(ArgumentError("Only transformations are allowed when function is a " *
-                            "frist argument to combine"))
+        throw(ArgumentError("First argument must be a transformation if the second argument is a data frame"))
     end
     return combine(df, arg)
 end

From 87455e2cb66ba80b454a6e8bb81051654fd7e329 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 3 Oct 2020 16:19:46 +0200
Subject: [PATCH 12/21] fixes after code review

---
 src/abstractdataframe/selection.jl | 169 ++++++++++++++++++-----------
 test/select.jl                     |  51 ++++-----
 2 files changed, 125 insertions(+), 95 deletions(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index b045c70bd1..361e01b848 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -170,7 +170,9 @@ function normalize_selection(idx::AbstractIndex,
     return (wanttable ? AsTable(c) : c) => fun => newcol
 end
 
-function _transformation_helper(df::AbstractDataFrame, col_idx, @nospecialize(fun))
+function _transformation_helper(df::AbstractDataFrame,
+                                col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable},
+                                @nospecialize(fun))
     if col_idx === nothing
         return fun(df)
     elseif col_idx isa Int
@@ -178,12 +180,7 @@ function _transformation_helper(df::AbstractDataFrame, col_idx, @nospecialize(fu
     elseif col_idx isa AsTable
         tbl = Tables.columntable(select(df, col_idx.cols, copycols=false))
         if isempty(tbl) && fun isa ByRow
-            if isempty(df)
-                T = Core.Compiler.return_type(fun.fun, (NamedTuple{(),Tuple{}},))
-                return T[]
-            else
-                return [fun.fun(NamedTuple()) for _ in 1:nrow(df)]
-            end
+            return [fun.fun(NamedTuple()) for _ in 1:nrow(df)]
         else
             return fun(tbl)
         end
@@ -191,12 +188,7 @@ function _transformation_helper(df::AbstractDataFrame, col_idx, @nospecialize(fu
         # it should be fast enough here as we do not expect to do it millions of times
         @assert col_idx isa AbstractVector{Int}
         if isempty(col_idx) && fun isa ByRow
-            if isempty(df)
-                T = Base.return_types(fun.fun, ())[1]
-                return T[]
-            else
-                return [fun.fun() for _ in 1:nrow(df)]
-            end
+            return [fun.fun() for _ in 1:nrow(df)]
         else
             cdf = eachcol(df)
             return fun(map(c -> cdf[c], col_idx)...)
@@ -205,7 +197,8 @@ function _transformation_helper(df::AbstractDataFrame, col_idx, @nospecialize(fu
     throw(ErrorException("unreachable reached"))
 end
 
-function _gen_colnames(@nospecialize(res), newname)
+function _gen_colnames(@nospecialize(res), newname::Union{AbstractVector{Symbol},
+                                                          Type{AsTable}, Nothing})
     if res isa AbstractMatrix
         colnames = gennames(size(res, 2))
     else
@@ -220,25 +213,31 @@ function _gen_colnames(@nospecialize(res), newname)
         colnames = newname
     end
 
-    return colnames
+    # fix the type to avoid unnecesarry compilations of methods
+    # this should be cheap
+    return colnames isa Vector{Symbol} ? colnames : collect(Symbol, colnames)
 end
 
-function _expand_to_table(@nospecialize(res))
-    if res isa AbstractVector && !isempty(res)
-        kp1 = keys(res[1])
-        all(x -> keys(x) == kp1, res) || throw(ArgumentError("keys of the returned elements must be identical"))
-        newres = DataFrame()
-        prepend = all(x -> x isa Integer, kp1)
-        for n in kp1
-            newres[!, prepend ? Symbol("x", n) : Symbol(n)] = [x[n] for x in res]
-        end
-        return newres
-    else
-        return Tables.columntable(res)
+_expand_to_table(res) = Tables.columntable(res)
+_expand_to_table(res::Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix}) = res
+
+function _expand_to_table(res::AbstractVector)
+    isempty(res) && return Tables.columntable(res)
+    kp1 = keys(res[1])
+    if any(x -> !isequal(keys(x), kp1), res)
+        throw(ArgumentError("keys of the returned elements must be identical"))
+    end
+    newres = DataFrame()
+    prepend = all(x -> x isa Integer, kp1)
+    for n in kp1
+        newres[!, prepend ? Symbol("x", n) : Symbol(n)] = [x[n] for x in res]
     end
+    return newres
 end
 
-function _insert_row_multicolumn(df, newdf, allow_resizing_newdf, colnames, res)
+function _insert_row_multicolumn(newdf::DataFrame, df::AbstractDataFrame,
+                                 allow_resizing_newdf::Ref{Bool}, colnames::AbstractVector{Symbol},
+                                 res::Union{NamedTuple, DataFrameRow})
     if ncol(newdf) == 0
         # if allow_resizing_newdf[] is false we know this is select or transform
         rows = allow_resizing_newdf[] ? 1 : nrow(df)
@@ -248,12 +247,14 @@ function _insert_row_multicolumn(df, newdf, allow_resizing_newdf, colnames, res)
     end
     @assert length(colnames) == length(res)
     for (newname, v) in zip(colnames, res)
-        # note that newdf potentially can contain c in general
+        # note that newdf potentially can contain newname in general
         newdf[!, newname] = fill!(Tables.allocatecolumn(typeof(v), rows), v)
     end
 end
 
-function _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, @nospecialize(fun))
+function _fix_existing_columns_for_vector(newdf::DataFrame, df::AbstractDataFrame,
+                                          allow_resizing_newdf::Ref{Bool}, lr::Int,
+                                          @nospecialize(fun))
     # allow shortening to 0 rows
     if allow_resizing_newdf[] && nrow(newdf) == 1
         newdfcols = _columns(newdf)
@@ -271,7 +272,10 @@ function _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, @
     allow_resizing_newdf[] = false
 end
 
-function _add_col_check_copy(df, newdf, col_idx, copycols, @nospecialize(fun), newname, @nospecialize(v))
+function _add_col_check_copy(newdf::DataFrame, df::AbstractDataFrame,
+                             col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable},
+                             copycols::Bool, @nospecialize(fun),
+                             newname::Symbol, @nospecialize(v))
     cdf = eachcol(df)
     vpar = parent(v)
     parent_cols = col_idx isa AsTable ? col_idx.cols : something(col_idx, 1:ncol(df))
@@ -282,9 +286,71 @@ function _add_col_check_copy(df, newdf, col_idx, copycols, @nospecialize(fun), n
     end
 end
 
+function _add_multicol_res(res::AbstractDataFrame, newdf::DataFrame, df::AbstractDataFrame,
+                           colnames::AbstractVector{Symbol},
+                           allow_resizing_newdf::Ref{Bool}, @nospecialize(fun),
+                           col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable},
+                           copycols::Bool, newname::Union{Nothing, Type{AsTable}, AbstractVector{Symbol}})
+    lr = nrow(res)
+    _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun)
+    @assert length(colnames) == ncol(res)
+    for (newname, v) in zip(colnames, eachcol(res))
+        _add_col_check_copy(newdf, df, col_idx, copycols, fun, newname, v)
+    end
+end
+
+function _add_multicol_res(res::AbstractMatrix, newdf::DataFrame, df::AbstractDataFrame,
+                           colnames::AbstractVector{Symbol},
+                           allow_resizing_newdf::Ref{Bool}, @nospecialize(fun),
+                           col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable},
+                           copycols::Bool, newname::Union{Nothing, Type{AsTable}, AbstractVector{Symbol}})
+    lr = size(res, 1)
+    _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun)
+    @assert length(colnames) == size(res, 2)
+    for (i, newname) in enumerate(colnames)
+        newdf[!, newname] = res[:, i]
+    end
+end
+
+function _add_multicol_res(res::NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}},
+                           newdf::DataFrame, df::AbstractDataFrame,
+                           colnames::AbstractVector{Symbol},
+                           allow_resizing_newdf::Ref{Bool}, @nospecialize(fun),
+                           col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable},
+                           copycols::Bool, newname::Union{Nothing, Type{AsTable}, AbstractVector{Symbol}})
+    lr = length(res[1])
+    _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun)
+    @assert length(colnames) == length(res)
+    for (newname, v) in zip(colnames, res)
+        _add_col_check_copy(newdf, df, col_idx, copycols, fun, newname, v)
+    end
+end
+
+function _add_multicol_res(res::NamedTuple, newdf::DataFrame, df::AbstractDataFrame,
+                           colnames::AbstractVector{Symbol},
+                           allow_resizing_newdf::Ref{Bool}, @nospecialize(fun),
+                           col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable},
+                           copycols::Bool, newname::Union{Nothing, Type{AsTable}, AbstractVector{Symbol}})
+    if any(v -> v isa AbstractVector, res)
+        throw(ArgumentError("mixing single values and vectors in a named tuple is not allowed"))
+    else
+        _insert_row_multicolumn(newdf, df, allow_resizing_newdf, colnames, res)
+    end
+end
+
+function _add_multicol_res(res::DataFrameRow, newdf::DataFrame, df::AbstractDataFrame,
+                           colnames::AbstractVector{Symbol},
+                           allow_resizing_newdf::Ref{Bool}, @nospecialize(fun),
+                           col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable},
+                           copycols::Bool, newname::Union{Nothing, Type{AsTable}, AbstractVector{Symbol}})
+    _insert_row_multicolumn(newdf, df, allow_resizing_newdf, colnames, res)
+end
+
 function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{Int, AbstractVector{Int}, AsTable},
-                                                    <:Pair{<:Base.Callable,
-                                                           <:Union{Symbol, AbstractVector{Symbol}, DataType}}}}),
+                                                                       <:Pair{<:Base.Callable,
+                                                                              <:Union{Symbol,
+                                                                                      AbstractVector{Symbol},
+                                                                                      DataType}}}}),
                            df::AbstractDataFrame, newdf::DataFrame,
                            transformed_cols::Set{Symbol}, copycols::Bool,
                            allow_resizing_newdf::Ref{Bool})
@@ -301,8 +367,7 @@ function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{I
     # in _manipulate, therefore in select_transform! such a duplicate should not happen
     res = _transformation_helper(df, col_idx, fun)
 
-    if (newname === AsTable || newname isa AbstractVector{Symbol}) &&
-        !(res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix})
+    if (newname === AsTable || newname isa AbstractVector{Symbol})
         res = _expand_to_table(res)
     end
 
@@ -321,36 +386,8 @@ function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{I
             union!(transformed_cols, colnames)
             @assert startlen + length(colnames) == length(transformed_cols)
         end
-        if res isa AbstractDataFrame
-            lr = nrow(res)
-            _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun)
-            @assert length(colnames) == ncol(res)
-            for (newname, v) in zip(colnames, eachcol(res))
-                _add_col_check_copy(df, newdf, col_idx, copycols, fun, newname, v)
-            end
-        elseif res isa AbstractMatrix
-            lr = size(res, 1)
-            _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun)
-            @assert length(colnames) == size(res, 2)
-            for (i, newname) in enumerate(colnames)
-                newdf[!, newname] = res[:, i]
-            end
-        elseif res isa NamedTuple
-            if all(v -> v isa AbstractVector, res)
-                lr = length(res[1])
-                _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun)
-                @assert length(colnames) == length(res)
-                for (newname, v) in zip(colnames, res)
-                    _add_col_check_copy(df, newdf, col_idx, copycols, fun, newname, v)
-                end
-            elseif any(v -> v isa AbstractVector, res)
-                throw(ArgumentError("mixing single values and vectors in a named tuple is not allowed"))
-            else
-                _insert_row_multicolumn(df, newdf, allow_resizing_newdf, colnames, res)
-            end
-        elseif res isa DataFrameRow
-            _insert_row_multicolumn(df, newdf, allow_resizing_newdf, colnames, res)
-        end
+        _add_multicol_res(res, newdf, df, colnames, allow_resizing_newdf, fun,
+                          col_idx, copycols, newname)
     elseif res isa AbstractVector
         if newname === nothing
             newname = :x1
@@ -362,7 +399,7 @@ function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{I
         end
         lr = length(res)
         _fix_existing_columns_for_vector(newdf, df, allow_resizing_newdf, lr, fun)
-        _add_col_check_copy(df, newdf, col_idx, copycols, fun, newname, res)
+        _add_col_check_copy(newdf, df, col_idx, copycols, fun, newname, res)
     else
         if newname === nothing
             newname = :x1
diff --git a/test/select.jl b/test/select.jl
index 98eb2f3dc1..80e9c4018e 100644
--- a/test/select.jl
+++ b/test/select.jl
@@ -4,6 +4,10 @@ using DataFrames, Test, Random, Statistics, CategoricalArrays
 
 const ≅ = isequal
 
+"""Check if passed data frames are `isequal` and have the same types of columns"""
+isequal_coltyped(df1::AbstractDataFrame, df2::AbstractDataFrame) =
+    isequal(df1, df2) && typeof.(eachcol(df1)) == typeof.(eachcol(df2))
+
 Random.seed!(1234)
 
 @testset "select! Not" begin
@@ -887,18 +891,14 @@ end
         @test select(df2, (:) => (+) => :d, :x1 => (x -> x) => :b, [] => (() -> v) => :a) ==
               DataFrame([6  1 9], [:d, :b, :a])
 
-        res = select(df3, [] => (() -> v) => :a, :x1 => x -> [])
-        @test propertynames(res) == [:a, :x1_function] && nrow(res) == 0
-        @test eltype.(eachcol(res)) == [Int, Any]
-        res = select(df3, :x1 => x -> [], [] => (() -> v) => :a)
-        @test propertynames(res) == [:x1_function, :a] && nrow(res) == 0
-        @test eltype.(eachcol(res)) == [Any, Int]
-        res = select(df3, [] => (() -> v) => :a, :x1)
-        @test propertynames(res) == [:a, :x1] && nrow(res) == 0
-        @test eltype.(eachcol(res)) == [Int, Char]
-        res = select(df3, :x1, [] => (() -> v) => :a)
-        @test propertynames(res) == [:x1, :a] && nrow(res) == 0
-        @test eltype.(eachcol(res)) == [Char, Int]
+        @test isequal_coltyped(select(df3, [] => (() -> v) => :a, :x1 => x -> []),
+                               DataFrame(a=Int[], x1_function=Any[]))
+        @test isequal_coltyped(select(df3, :x1 => x -> [], [] => (() -> v) => :a),
+                               DataFrame(x1_function=Any[], a=Int[]))
+        @test isequal_coltyped(select(df3, [] => (() -> v) => :a, :x1),
+                               DataFrame(a=Int[], x1=Char[]))
+        @test isequal_coltyped(select(df3, :x1, [] => (() -> v) => :a),
+                               DataFrame(x1=Char[], a=Int[]))
     end
     @test_throws ArgumentError select(df, [] => (() -> [9]) => :a, :)
     @test_throws ArgumentError select(df, :, [] => (() -> [9]) => :a)
@@ -1328,10 +1328,7 @@ end
 end
 
 @testset "additional tests for new rules" begin
-#    select select! transform transform! combine
-#    Union{Type{AsTable}, Symbol, AbstractVector{Symbol}, AbstractString, AbstractVector{<:AbstractString}}
-#    DataFrame, SubDataFrame
-    @testset "SELECT(FUN, DF)" begin
+    @testset "transformation function with a function as first argument" begin
         for df in (DataFrame(a=1:2, b=3:4, c=5:6), view(DataFrame(a=1:3, b=3:5, c=5:7, d=11:13), 1:2, 1:3))
             @test select(sdf -> sdf.b, df) == DataFrame(x1=3:4)
             @test select(sdf -> (b = 2sdf.b,), df) == DataFrame(b=[6,8])
@@ -1431,7 +1428,7 @@ end
         @test transform!(sdf -> DataFrame(a=10)[1, :], copy(df)) == DataFrame(a=[10, 10], b=3:4, c=5:6)
     end
 
-    @testset "SELECT(DF, => AsTable)" begin
+    @testset "transformation function with multiple columns as destination" begin
         for df in (DataFrame(a=1:2, b=3:4, c=5:6), view(DataFrame(a=1:3, b=3:5, c=5:7, d=11:13), 1:2, 1:3))
             for fun in (select, combine, transform),
                 res in (DataFrame(), DataFrame(a=1,b=2)[1, :], ones(1,1),
@@ -1524,12 +1521,10 @@ end
         @test eltype(combine(df, [] => ByRow(() -> 1)).function) == Int
         @test eltype(transform(df, [] => ByRow(() -> 1)).function) == Int
 
-        df2 = select(df, [] => ByRow(() -> (a=1,b="1")) => AsTable)
-        @test names(df2) == ["a", "b"]
-        @test eltype.(eachcol(df2)) == [Int, String]
-        df2 = select(df, [] => ByRow(() -> (a=1,b="1")) => [:p, :q])
-        @test names(df2) == ["p", "q"]
-        @test eltype.(eachcol(df2)) == [Int, String]
+        @test isequal_coltyped(select(df, [] => ByRow(() -> (a=1,b="1")) => AsTable),
+                               DataFrame(a=Int[], b=String[]))
+        @test isequal_coltyped(select(df, [] => ByRow(() -> (a=1,b="1")) => [:p, :q]),
+                               DataFrame(p=Int[], q=String[]))
 
         # here this follows Tables.jl behavior
         for res in ([1, "1"], (1, "1"))
@@ -1554,12 +1549,10 @@ end
         @test eltype(combine(df, AsTable([]) => ByRow(x -> 1)).function) == Int
         @test eltype(transform(df, AsTable([]) => ByRow(x -> 1)).function) == Int
 
-        df2 = select(df, AsTable([]) => ByRow(x -> (a=1,b="1")) => AsTable)
-        @test names(df2) == ["a", "b"]
-        @test eltype.(eachcol(df2)) == [Int, String]
-        df2 = select(df, AsTable([]) => ByRow(x -> (a=1,b="1")) => [:p, :q])
-        @test names(df2) == ["p", "q"]
-        @test eltype.(eachcol(df2)) == [Int, String]
+        @test isequal_coltyped(select(df, AsTable([]) => ByRow(x -> (a=1,b="1")) => AsTable),
+                               DataFrame(a=Int[], b=String[]))
+        @test isequal_coltyped(select(df, AsTable([]) => ByRow(x -> (a=1,b="1")) => [:p, :q]),
+                               DataFrame(p=Int[], q=String[]))
 
         # here this follows Tables.jl behavior
         for res in ([1, "1"], (1, "1"))

From e16e17a0b288dd3a2c722c82db4ed87e511f3111 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 3 Oct 2020 23:32:31 +0200
Subject: [PATCH 13/21] Update src/abstractdataframe/selection.jl

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/abstractdataframe/selection.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 361e01b848..7dfb0c0032 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -373,7 +373,7 @@ function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{I
 
     if res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix}
         if newname isa Symbol
-            throw(ArgumentError("Table returned while a single column return value was requested"))
+            throw(ArgumentError("Table returned but a single output column was expected"))
         end
         colnames = _gen_colnames(res, newname)
         isempty(colnames) && return # nothing to do

From 0ef36cfd2c11c59aea14f93cd8b04d241a8c92c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 3 Oct 2020 23:41:46 +0200
Subject: [PATCH 14/21] Update src/abstractdataframe/selection.jl

---
 src/abstractdataframe/selection.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 7dfb0c0032..7ef80a48a5 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -275,7 +275,7 @@ end
 function _add_col_check_copy(newdf::DataFrame, df::AbstractDataFrame,
                              col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable},
                              copycols::Bool, @nospecialize(fun),
-                             newname::Symbol, @nospecialize(v))
+                             newname::Symbol, v::AbstractVector)
     cdf = eachcol(df)
     vpar = parent(v)
     parent_cols = col_idx isa AsTable ? col_idx.cols : something(col_idx, 1:ncol(df))

From 7679adee2edd3913fbaae77502ff82694d1f5ac9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 4 Oct 2020 12:48:01 +0200
Subject: [PATCH 15/21] Update src/abstractdataframe/selection.jl

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/abstractdataframe/selection.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 7ef80a48a5..4c335c65fc 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -367,7 +367,7 @@ function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{I
     # in _manipulate, therefore in select_transform! such a duplicate should not happen
     res = _transformation_helper(df, col_idx, fun)
 
-    if (newname === AsTable || newname isa AbstractVector{Symbol})
+    if newname === AsTable || newname isa AbstractVector{Symbol}
         res = _expand_to_table(res)
     end
 

From 2ebe6889ebd38bbfb8c280e30b4d938603e520dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Mon, 5 Oct 2020 11:17:09 +0200
Subject: [PATCH 16/21] Update src/abstractdataframe/selection.jl

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/abstractdataframe/selection.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 4c335c65fc..4f52c4b3af 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -213,7 +213,7 @@ function _gen_colnames(@nospecialize(res), newname::Union{AbstractVector{Symbol}
         colnames = newname
     end
 
-    # fix the type to avoid unnecesarry compilations of methods
+    # fix the type to avoid unnecessary compilations of methods
     # this should be cheap
     return colnames isa Vector{Symbol} ? colnames : collect(Symbol, colnames)
 end

From e1141bec10f743463a4fbdd12c0156d56474b132 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Mon, 5 Oct 2020 11:51:05 +0200
Subject: [PATCH 17/21] make rules for keys in AsTable on vector stricter

---
 NEWS.md                            |  4 ++++
 src/abstractdataframe/selection.jl |  6 +++++-
 test/select.jl                     | 10 ++++------
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index e8acee6440..f5037937eb 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -2,6 +2,10 @@
 
 ## Breaking changes
 
+* the rules for transformations passed to `select`/`select!`, `transform`/`transform!`,
+  and `combine` have been made more flexible; in particular now it is allowed to
+  return multiple columns from a transformation function
+  [#2461](https://github.com/JuliaData/DataFrames.jl/pull/2461)
 * CategoricalArrays.jl is no longer reexported: call `using CategoricalArrays`
   to use it [#2404]((https://github.com/JuliaData/DataFrames.jl/pull/2404)).
   In the same vein, the `categorical` and `categorical!` functions
diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 4f52c4b3af..1724f68027 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -224,11 +224,15 @@ _expand_to_table(res::Union{AbstractDataFrame, NamedTuple, DataFrameRow, Abstrac
 function _expand_to_table(res::AbstractVector)
     isempty(res) && return Tables.columntable(res)
     kp1 = keys(res[1])
+    prepend = all(x -> x isa Integer, kp1)
+    if !(prepend || all(x -> x isa Symbol, kp1) || all(x -> x isa AbstractString, kp1))
+        throw(ArgumentError("keys of the returned elements must be " *
+                            "`Symbol`s, strings or integers"))
+    end
     if any(x -> !isequal(keys(x), kp1), res)
         throw(ArgumentError("keys of the returned elements must be identical"))
     end
     newres = DataFrame()
-    prepend = all(x -> x isa Integer, kp1)
     for n in kp1
         newres[!, prepend ? Symbol("x", n) : Symbol(n)] = [x[n] for x in res]
     end
diff --git a/test/select.jl b/test/select.jl
index 80e9c4018e..23c2d0674b 100644
--- a/test/select.jl
+++ b/test/select.jl
@@ -734,11 +734,9 @@ end
             @test select(df, :x => ByRow(x -> retval) => AsTable) == DataFrame(;retval...)
         elseif retval isa DataFrame
             @test_throws MethodError select(df, :x => ByRow(x -> retval) => AsTable)
-        else # Matrix; surprising but following the API
-            @test select(df, :x => ByRow(x -> retval) => AsTable) ==
-                  DataFrame(["CartesianIndex($i, $j)" => 1.0 for i in 1:2, j in 1:2]...)
-            @test select(df, :x => ByRow(x -> retval) => [:a, :b, :c, :d]) ==
-                  DataFrame(a=1.0, b=1.0, c=1.0, d=1.0)
+        else # Matrix: wrong type of keys
+            @test_throws ArgumentError select(df, :x => ByRow(x -> retval) => AsTable)
+            @test_throws ArgumentError select(df, :x => ByRow(x -> retval) => [:a, :b, :c, :d])
         end
     end
 
@@ -748,7 +746,7 @@ end
         if retval isa Tuple
             @test select(df, :x => ByRow(x -> retval) => AsTable) == DataFrame(x1=1, x2=2)
         else
-            @test select(df, :x => ByRow(x -> retval) => Symbol.("x", 1:8)) == DataFrame(ones(1, 8))
+            @test_throws ArgumentError select(df, :x => ByRow(x -> retval) => AsTable)
         end
         cdf = copy(df)
         select!(cdf, :x => x -> retval)

From df1dd1922a8a125b01cbd56b5af18d0fb512d111 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Mon, 5 Oct 2020 12:38:05 +0200
Subject: [PATCH 18/21] update docstrings and the manual

---
 docs/src/lib/types.md              |   3 +-
 docs/src/man/getting_started.md    |   8 +
 src/abstractdataframe/selection.jl | 271 ++++++++++++++++++++++-------
 3 files changed, 220 insertions(+), 62 deletions(-)

diff --git a/docs/src/lib/types.md b/docs/src/lib/types.md
index 0bb84afaa4..5073229c05 100644
--- a/docs/src/lib/types.md
+++ b/docs/src/lib/types.md
@@ -55,7 +55,8 @@ The `ByRow` type is a special type used for selection operations to signal that
 to each element (row) of the selection.
 
 The `AsTable` type is a special type used for selection operations to signal that the columns selected by a wrapped
-selector should be passed as a `NamedTuple` to the function.
+selector should be passed as a `NamedTuple` to the function or, when used as the destination
+of a transformation, to signal that its return value should be expanded into multiple columns.
 
 ## [The design of handling of columns of a `DataFrame`](@id man-columnhandling)
 
diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md
index e6d697b947..df6b8bd899 100644
--- a/docs/src/man/getting_started.md
+++ b/docs/src/man/getting_started.md
@@ -627,6 +627,14 @@ julia> select(df, :x2, :x2 => ByRow(sqrt)) # transform columns by row
 ├─────┼───────┼─────────┤
 │ 1   │ 3     │ 1.73205 │
 │ 2   │ 4     │ 2.0     │
+
+julia> select(df, AsTable(:) => ByRow(extrema) => [:lo, :hi]) # return multiple columns
+2×2 DataFrame
+│ Row │ lo    │ hi    │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 1     │ 5     │
+│ 2   │ 2     │ 6     │
 ```
 
 It is important to note that `select` always returns a data frame,
diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 1724f68027..82f65226d5 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -432,35 +432,41 @@ SELECT_ARG_RULES =
 
     * Any index that is allowed for column indexing
       ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
+    * A function or a type.
     * Column transformation operations using the `Pair` notation that is
-      described below and vectors of such pairs.
+      described below and vectors or matrices of such pairs.
 
     Columns can be renamed using the `old_column => new_column_name` syntax, and
     transformed using the `old_column => fun => new_column_name` syntax.
-    `new_column_name` must be a `Symbol` or a string, and `fun` a function or a
-    type. If `old_column` is a `Symbol`, a string, or an integer then `fun` is
-    applied to the corresponding column vector. Otherwise `old_column` can be
-    any column indexing syntax, in which case `fun` will be passed the column
-    vectors specified by `old_column` as separate arguments. The only exception
-    is when `old_column` is an `AsTable` type wrapping a selector, in which case
-    `fun` is passed a `NamedTuple` containing the selected columns.
+    `new_column_name` must be a `Symbol` or a string, a vector of `Symbol` or
+    string, or `AsTable`, and `fun` a function or a type. If `old_column` is a
+    `Symbol`, a string, or an integer then `fun` is applied to the corresponding
+    column vector. Otherwise `old_column` can be any column indexing syntax, in
+    which case `fun` will be passed the column vectors specified by `old_column`
+    as separate arguments. The only exception is when `old_column` is an
+    `AsTable` type wrapping a selector, in which case `fun` is passed a
+    `NamedTuple` containing the selected columns.
+
+    Column renaming and transformation operations can be passed wrapped in
+    vectors or matrices (this is useful when combined with broadcasting).
+
+    # Rules when `new_column_name` is a `Symbol` or a string or is missing
 
     If `fun` returns a value of type other than `AbstractVector` then it will be
     broadcasted into a vector matching the target number of rows in the data
     frame, unless its type is one of `AbstractDataFrame`, `NamedTuple`,
-    `DataFrameRow`, `AbstractMatrix`, in which case an error is thrown as
-    currently these return types are not allowed. As a particular rule, values
-    wrapped in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and
-    then broadcasted.
+    `DataFrameRow`, `AbstractMatrix`, in which case an error is thrown. As a
+    particular rule, values wrapped in a `Ref` or a `0`-dimensional
+    `AbstractArray` are unwrapped and then broadcasted.
 
     To apply `fun` to each row instead of whole columns, it can be wrapped in a
     `ByRow` struct. In this case if `old_column` is a `Symbol`, a string, or an
     integer then `fun` is applied to each element (row) of `old_column` using
     broadcasting. Otherwise `old_column` can be any column indexing syntax, in
     which case `fun` will be passed one argument for each of the columns
-    specified by `old_column`. If `ByRow` is used it is not allowed for
-    `old_column` to select an empty set of columns nor for `fun` to return a
-    `NamedTuple` or a `DataFrameRow`.
+    specified by `old_column`. If `ByRow` is used it is allowed for
+    `old_column` to select an empty set of columns, in which case no arguments
+    are passed to `fun` for each row.
 
     Column transformation can also be specified using the short `old_column =>
     fun` form. In this case, `new_column_name` is automatically generated as
@@ -473,8 +479,52 @@ SELECT_ARG_RULES =
     It is not allowed to pass `renamecols=false` if `old_column` is empty
     as it would generate an empty column name.
 
-    Column renaming and transformation operations can be passed wrapped in
-    vectors (this is useful when combined with broadcasting).
+    # Rules when `new_column_name` is a vector of `Symbol` or a string or is `AsTable`
+
+    In this case it is assumed that `fun` returns multiple columns.
+
+    If `fun` returns one of `AbstractDataFrame`, `NamedTuple`, `DataFrameRow`,
+    `AbstractMatrix` then rules described below for `args` being a function or
+    a type apply.
+
+    If `fun` returns an `AbstractVector` then each element of this vector must
+    support `keys` function that must return a collection of `Symbol`s, strings
+    or integers; the return value of `keys` must be identical for all elements.
+    Then as many columns are created as there are elements in the return value
+    of the `keys` function and their names are set to be equal to the key names,
+    except if `keys` returns integers, in which case they are prefixed by `x`
+    (so the column names are e.g. `x1`, `x2`, ...)
+
+    If `fun` returns a value of any other type then it is assumed that it is
+    a table conforming to Tables.jl API and the `Tables.columntable` function is
+    called on it to get the resulting columns and their names.
+
+    Additionally if `new_column_name` is a vector of `Symbol` or string then column
+    names produced using the rules above are ignored and replaced by `new_column_name`
+    (the number of columns must be the same as the length `new_column_name` in this case).
+
+    # Rules when element of `args` is a function or a type
+
+    In this case a transformaton is passed `df` as a single argument.
+
+    If the return value of the transformation is of `AbstractDataFrame`,
+    `NamedTuple`, `DataFrameRow` or `AbstractMatrix` then it is treated as
+    containing multiple columns. For `AbstractMatrix` column names are generated
+    as `x1`, `x2`, etc. For `AbstractDataFrame`, `NamedTuple` of vectors and
+    `AbstractMatrix` the columns are taken as is from the returned value. For
+    `DataFrameRow` and `NamedTuple` not containing any vectors the returned
+    value is broadcasted a vector matching the target number of rows in the data
+    frame.
+
+    If the return value is an `AbstractVector` then it is used as-is. The resulting
+    column gets a name `x1`.
+
+    In all other cases the return value is broadcasted into a vector matching
+    the target number of rows in the data frame. As a particular rule, values
+    wrapped in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and
+    then broadcasted. The resulting column gets a name `x1`.
+
+    # Special rules
 
     As a special rule passing `nrow` without specifying `old_column` creates a
     column named `:nrow` containing a number of rows in a source data frame, and
@@ -493,6 +543,7 @@ SELECT_ARG_RULES =
 
 """
     select!(df::DataFrame, args...; renamecols::Bool=true)
+    select!(args::Callable, df::DataFrame; renamecols::Bool=true)
 
 Mutate `df` in place to retain only columns specified by `args...` and return it.
 The result is guaranteed to have the same number of rows as `df`, except when no
@@ -547,16 +598,16 @@ julia> select!(df, :, [:c, :b] => (c,b) -> c .+ b .- sum(b)/length(b))
 
 julia> df = DataFrame(a=1:3, b=4:6);
 
-julia> select!(df, names(df) .=> sum);
+julia> select!(df, names(df) .=> [minimum maximum]);
 
 julia> df
-3×2 DataFrame
-│ Row │ a_sum │ b_sum │
-│     │ Int64 │ Int64 │
-├─────┼───────┼───────┤
-│ 1   │ 6     │ 15    │
-│ 2   │ 6     │ 15    │
-│ 3   │ 6     │ 15    │
+3×4 DataFrame
+│ Row │ a_minimum │ b_minimum │ a_maximum │ b_maximum │
+│     │ Int64     │ Int64     │ Int64     │ Int64     │
+├─────┼───────────┼───────────┼───────────┼───────────┤
+│ 1   │ 1         │ 4         │ 3         │ 6         │
+│ 2   │ 1         │ 4         │ 3         │ 6         │
+│ 3   │ 1         │ 4         │ 3         │ 6         │
 
 julia> df = DataFrame(a=1:3, b=4:6);
 
@@ -570,6 +621,36 @@ julia> select!(df, AsTable(:) => ByRow(mean), renamecols=false)
 │ 1   │ 2.5     │
 │ 2   │ 3.5     │
 │ 3   │ 4.5     │
+
+julia> df = DataFrame(a=1:3, b=4:6);
+
+julia> select!(first, df)
+3×2 DataFrame
+│ Row │ a     │ b     │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 1     │ 4     │
+│ 2   │ 1     │ 4     │
+│ 3   │ 1     │ 4     │
+
+julia> df = DataFrame(a=1:3, b=4:6, c=7:9)
+3×3 DataFrame
+│ Row │ a     │ b     │ c     │
+│     │ Int64 │ Int64 │ Int64 │
+├─────┼───────┼───────┼───────┤
+│ 1   │ 1     │ 4     │ 7     │
+│ 2   │ 2     │ 5     │ 8     │
+│ 3   │ 3     │ 6     │ 9     │
+
+julia> select!(df, AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => :stats,
+               AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => AsTable)
+3×3 DataFrame
+│ Row │ stats                   │ mean    │ std     │
+│     │ NamedTuple…             │ Float64 │ Float64 │
+├─────┼─────────────────────────┼─────────┼─────────┤
+│ 1   │ (mean = 4.0, std = 3.0) │ 4.0     │ 3.0     │
+│ 2   │ (mean = 5.0, std = 3.0) │ 5.0     │ 3.0     │
+│ 3   │ (mean = 6.0, std = 3.0) │ 6.0     │ 3.0     │
 ```
 
 """
@@ -585,6 +666,7 @@ end
 
 """
     transform!(df::DataFrame, args...; renamecols::Bool=true)
+    transform!(args::Callable, df::DataFrame; renamecols::Bool=true)
 
 Mutate `df` in place to add columns specified by `args...` and return it.
 The result is guaranteed to have the same number of rows as `df`.
@@ -604,6 +686,7 @@ end
 
 """
     select(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true)
+    select(args::Callable, df::DataFrame; renamecols::Bool=true)
 
 Create a new data frame that contains columns from `df` specified by `args` and
 return it. The result is guaranteed to have the same number of rows as `df`,
@@ -648,7 +731,7 @@ julia> df = DataFrame(a=1:3, b=4:6)
 │ 2   │ 2     │ 5     │
 │ 3   │ 3     │ 6     │
 
-julia> select(df, :b)
+julia> select(df, 2)
 3×1 DataFrame
 │ Row │ b     │
 │     │ Int64 │
@@ -657,24 +740,6 @@ julia> select(df, :b)
 │ 2   │ 5     │
 │ 3   │ 6     │
 
-julia> select(df, Not(:b)) # drop column :b from df
-3×1 DataFrame
-│ Row │ a     │
-│     │ Int64 │
-├─────┼───────┤
-│ 1   │ 1     │
-│ 2   │ 2     │
-│ 3   │ 3     │
-
-julia> select(df, :a => :c, :b)
-3×2 DataFrame
-│ Row │ c     │ b     │
-│     │ Int64 │ Int64 │
-├─────┼───────┼───────┤
-│ 1   │ 1     │ 4     │
-│ 2   │ 2     │ 5     │
-│ 3   │ 3     │ 6     │
-
 julia> select(df, :a => ByRow(sin) => :c, :b)
 3×2 DataFrame
 │ Row │ c        │ b     │
@@ -693,23 +758,16 @@ julia> select(df, :, [:a, :b] => (a,b) -> a .+ b .- sum(b)/length(b))
 │ 2   │ 2     │ 5     │ 2.0          │
 │ 3   │ 3     │ 6     │ 4.0          │
 
-julia> select(df, names(df) .=> sum)
-3×2 DataFrame
-│ Row │ a_sum │ b_sum │
-│     │ Int64 │ Int64 │
-├─────┼───────┼───────┤
-│ 1   │ 6     │ 15    │
-│ 2   │ 6     │ 15    │
-│ 3   │ 6     │ 15    │
+julia> select(df, names(df) .=> [minimum maximum])
+3×4 DataFrame
+│ Row │ a_minimum │ b_minimum │ a_maximum │ b_maximum │
+│     │ Int64     │ Int64     │ Int64     │ Int64     │
+├─────┼───────────┼───────────┼───────────┼───────────┤
+│ 1   │ 1         │ 4         │ 3         │ 6         │
+│ 2   │ 1         │ 4         │ 3         │ 6         │
+│ 3   │ 1         │ 4         │ 3         │ 6         │
 
-julia> select(df, names(df) .=> sum .=> [:A, :B])
-3×2 DataFrame
-│ Row │ A     │ B     │
-│     │ Int64 │ Int64 │
-├─────┼───────┼───────┤
-│ 1   │ 6     │ 15    │
-│ 2   │ 6     │ 15    │
-│ 3   │ 6     │ 15    │
+julia> using Statistics
 
 julia> select(df, AsTable(:) => ByRow(mean), renamecols=false)
 3×1 DataFrame
@@ -719,6 +777,34 @@ julia> select(df, AsTable(:) => ByRow(mean), renamecols=false)
 │ 1   │ 2.5     │
 │ 2   │ 3.5     │
 │ 3   │ 4.5     │
+
+julia> select(first, df)
+3×2 DataFrame
+│ Row │ a     │ b     │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 1     │ 4     │
+│ 2   │ 1     │ 4     │
+│ 3   │ 1     │ 4     │
+
+julia> df = DataFrame(a=1:3, b=4:6, c=7:9)
+3×3 DataFrame
+│ Row │ a     │ b     │ c     │
+│     │ Int64 │ Int64 │ Int64 │
+├─────┼───────┼───────┼───────┤
+│ 1   │ 1     │ 4     │ 7     │
+│ 2   │ 2     │ 5     │ 8     │
+│ 3   │ 3     │ 6     │ 9     │
+
+julia> select(df, AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => :stats,
+              AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => AsTable)
+3×3 DataFrame
+│ Row │ stats                   │ mean    │ std     │
+│     │ NamedTuple…             │ Float64 │ Float64 │
+├─────┼─────────────────────────┼─────────┼─────────┤
+│ 1   │ (mean = 4.0, std = 3.0) │ 4.0     │ 3.0     │
+│ 2   │ (mean = 5.0, std = 3.0) │ 5.0     │ 3.0     │
+│ 3   │ (mean = 6.0, std = 3.0) │ 6.0     │ 3.0     │
 ```
 
 """
@@ -734,6 +820,7 @@ end
 
 """
     transform(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true)
+    transform(args::Callable, df::DataFrame; renamecols::Bool=true)
 
 Create a new data frame that contains columns from `df` and adds columns
 specified by `args` and return it.
@@ -754,7 +841,7 @@ end
 
 """
     combine(df::AbstractDataFrame, args...; renamecols::Bool=true)
-    combine(arg, df::AbstractDataFrame; renamecols::Bool=true)
+    combine(args::Callable, df::AbstractDataFrame; renamecols::Bool=true)
 
 Create a new data frame that contains columns from `df` specified by `args` and
 return it. The result can have any number of rows that is determined by the
@@ -782,6 +869,68 @@ julia> combine(df, :a => sum, nrow, renamecols=false)
 │     │ Int64 │ Int64 │
 ├─────┼───────┼───────┤
 │ 1   │ 6     │ 3     │
+
+julia> combine(df, :a => ByRow(sin) => :c, :b)
+3×2 DataFrame
+│ Row │ c        │ b     │
+│     │ Float64  │ Int64 │
+├─────┼──────────┼───────┤
+│ 1   │ 0.841471 │ 4     │
+│ 2   │ 0.909297 │ 5     │
+│ 3   │ 0.14112  │ 6     │
+
+julia> combine(df, :, [:a, :b] => (a,b) -> a .+ b .- sum(b)/length(b))
+3×3 DataFrame
+│ Row │ a     │ b     │ a_b_function │
+│     │ Int64 │ Int64 │ Float64      │
+├─────┼───────┼───────┼──────────────┤
+│ 1   │ 1     │ 4     │ 0.0          │
+│ 2   │ 2     │ 5     │ 2.0          │
+│ 3   │ 3     │ 6     │ 4.0          │
+
+julia> combine(df, names(df) .=> [minimum maximum])
+1×4 DataFrame
+│ Row │ a_minimum │ b_minimum │ a_maximum │ b_maximum │
+│     │ Int64     │ Int64     │ Int64     │ Int64     │
+├─────┼───────────┼───────────┼───────────┼───────────┤
+│ 1   │ 1         │ 4         │ 3         │ 6         │
+
+julia> using Statistics
+
+julia> combine(df, AsTable(:) => ByRow(mean), renamecols=false)
+3×1 DataFrame
+│ Row │ a_b     │
+│     │ Float64 │
+├─────┼─────────┤
+│ 1   │ 2.5     │
+│ 2   │ 3.5     │
+│ 3   │ 4.5     │
+
+julia> combine(first, df)
+1×2 DataFrame
+│ Row │ a     │ b     │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 1     │ 4     │
+
+julia> df = DataFrame(a=1:3, b=4:6, c=7:9)
+3×3 DataFrame
+│ Row │ a     │ b     │ c     │
+│     │ Int64 │ Int64 │ Int64 │
+├─────┼───────┼───────┼───────┤
+│ 1   │ 1     │ 4     │ 7     │
+│ 2   │ 2     │ 5     │ 8     │
+│ 3   │ 3     │ 6     │ 9     │
+
+julia> combine(df, AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => :stats,
+                      AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => AsTable)
+3×3 DataFrame
+│ Row │ stats                   │ mean    │ std     │
+│     │ NamedTuple…             │ Float64 │ Float64 │
+├─────┼─────────────────────────┼─────────┼─────────┤
+│ 1   │ (mean = 4.0, std = 3.0) │ 4.0     │ 3.0     │
+│ 2   │ (mean = 5.0, std = 3.0) │ 5.0     │ 3.0     │
+│ 3   │ (mean = 6.0, std = 3.0) │ 6.0     │ 3.0     │
 ```
 """
 combine(df::AbstractDataFrame, args...; renamecols::Bool=true) =

From 1e1f87ca99b6423d8fe64519e7b21a459b9c79f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Wed, 7 Oct 2020 09:14:05 +0200
Subject: [PATCH 19/21] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/abstractdataframe/selection.jl | 32 +++++++++++++++---------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 82f65226d5..68053d47de 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -438,8 +438,8 @@ SELECT_ARG_RULES =
 
     Columns can be renamed using the `old_column => new_column_name` syntax, and
     transformed using the `old_column => fun => new_column_name` syntax.
-    `new_column_name` must be a `Symbol` or a string, a vector of `Symbol` or
-    string, or `AsTable`, and `fun` a function or a type. If `old_column` is a
+    `new_column_name` must be a `Symbol` or a string, a vector of `Symbol`s or
+    strings, or `AsTable`. `fun` must be a function or a type. If `old_column` is a
     `Symbol`, a string, or an integer then `fun` is applied to the corresponding
     column vector. Otherwise `old_column` can be any column indexing syntax, in
     which case `fun` will be passed the column vectors specified by `old_column`
@@ -450,7 +450,7 @@ SELECT_ARG_RULES =
     Column renaming and transformation operations can be passed wrapped in
     vectors or matrices (this is useful when combined with broadcasting).
 
-    # Rules when `new_column_name` is a `Symbol` or a string or is missing
+    # Rules when `new_column_name` is a `Symbol` or a string or is absent
 
     If `fun` returns a value of type other than `AbstractVector` then it will be
     broadcasted into a vector matching the target number of rows in the data
@@ -465,8 +465,8 @@ SELECT_ARG_RULES =
     broadcasting. Otherwise `old_column` can be any column indexing syntax, in
     which case `fun` will be passed one argument for each of the columns
     specified by `old_column`. If `ByRow` is used it is allowed for
-    `old_column` to select an empty set of columns, in which case no arguments
-    are passed to `fun` for each row.
+    `old_column` to select an empty set of columns, in which case `fun`
+    is called for each row without any arguments.
 
     Column transformation can also be specified using the short `old_column =>
     fun` form. In this case, `new_column_name` is automatically generated as
@@ -479,7 +479,7 @@ SELECT_ARG_RULES =
     It is not allowed to pass `renamecols=false` if `old_column` is empty
     as it would generate an empty column name.
 
-    # Rules when `new_column_name` is a vector of `Symbol` or a string or is `AsTable`
+    # Rules when `new_column_name` is a vector of `Symbol`s or strings or is `AsTable`
 
     In this case it is assumed that `fun` returns multiple columns.
 
@@ -493,36 +493,36 @@ SELECT_ARG_RULES =
     Then as many columns are created as there are elements in the return value
     of the `keys` function and their names are set to be equal to the key names,
     except if `keys` returns integers, in which case they are prefixed by `x`
-    (so the column names are e.g. `x1`, `x2`, ...)
+    (so the column names are e.g. `x1`, `x2`, ...).
 
     If `fun` returns a value of any other type then it is assumed that it is
-    a table conforming to Tables.jl API and the `Tables.columntable` function is
+    a table conforming to the Tables.jl API and the `Tables.columntable` function is
     called on it to get the resulting columns and their names.
 
-    Additionally if `new_column_name` is a vector of `Symbol` or string then column
+    Additionally if `new_column_name` is a vector of `Symbol`s or strings then column
     names produced using the rules above are ignored and replaced by `new_column_name`
-    (the number of columns must be the same as the length `new_column_name` in this case).
+    (the number of columns must be the same as the length of `new_column_name` in this case).
 
     # Rules when element of `args` is a function or a type
 
-    In this case a transformaton is passed `df` as a single argument.
+    In this case the function or type is called with `df` as a single argument.
 
-    If the return value of the transformation is of `AbstractDataFrame`,
+    If the return value of the transformation is one of `AbstractDataFrame`,
     `NamedTuple`, `DataFrameRow` or `AbstractMatrix` then it is treated as
     containing multiple columns. For `AbstractMatrix` column names are generated
     as `x1`, `x2`, etc. For `AbstractDataFrame`, `NamedTuple` of vectors and
     `AbstractMatrix` the columns are taken as is from the returned value. For
     `DataFrameRow` and `NamedTuple` not containing any vectors the returned
-    value is broadcasted a vector matching the target number of rows in the data
+    value is broadcasted to a vector matching the target number of rows in the data
     frame.
 
     If the return value is an `AbstractVector` then it is used as-is. The resulting
-    column gets a name `x1`.
+    column gets the name `x1`.
 
     In all other cases the return value is broadcasted into a vector matching
     the target number of rows in the data frame. As a particular rule, values
     wrapped in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and
-    then broadcasted. The resulting column gets a name `x1`.
+    then broadcasted. The resulting column gets the name `x1`.
 
     # Special rules
 
@@ -923,7 +923,7 @@ julia> df = DataFrame(a=1:3, b=4:6, c=7:9)
 │ 3   │ 3     │ 6     │ 9     │
 
 julia> combine(df, AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => :stats,
-                      AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => AsTable)
+               AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => AsTable)
 3×3 DataFrame
 │ Row │ stats                   │ mean    │ std     │
 │     │ NamedTuple…             │ Float64 │ Float64 │

From 58b9d66d6ce842fa0d1af85ffddf2258dfb8cbae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Wed, 7 Oct 2020 09:35:13 +0200
Subject: [PATCH 20/21] update docs

---
 src/abstractdataframe/selection.jl | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 68053d47de..cb1be3d330 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -484,24 +484,26 @@ SELECT_ARG_RULES =
     In this case it is assumed that `fun` returns multiple columns.
 
     If `fun` returns one of `AbstractDataFrame`, `NamedTuple`, `DataFrameRow`,
-    `AbstractMatrix` then rules described below for `args` being a function or
-    a type apply.
+    `AbstractMatrix` then the rules described in the section covering the case
+    when `args` is a function or a type apply.
 
     If `fun` returns an `AbstractVector` then each element of this vector must
     support `keys` function that must return a collection of `Symbol`s, strings
     or integers; the return value of `keys` must be identical for all elements.
     Then as many columns are created as there are elements in the return value
-    of the `keys` function and their names are set to be equal to the key names,
-    except if `keys` returns integers, in which case they are prefixed by `x`
-    (so the column names are e.g. `x1`, `x2`, ...).
-
-    If `fun` returns a value of any other type then it is assumed that it is
-    a table conforming to the Tables.jl API and the `Tables.columntable` function is
-    called on it to get the resulting columns and their names.
-
-    Additionally if `new_column_name` is a vector of `Symbol`s or strings then column
-    names produced using the rules above are ignored and replaced by `new_column_name`
-    (the number of columns must be the same as the length of `new_column_name` in this case).
+    of the `keys` function. If `new_column_name` is `AsTable` then their names
+    are set to be equal to the key names except if `keys` returns integers, in
+    which case they are prefixed by `x` (so the column names are e.g. `x1`,
+    `x2`, ...). If `new_column_name` is a vector of `Symbol`s or strings then
+    column names produced using the rules above are ignored and replaced by
+    `new_column_name` (the number of columns must be the same as the length of
+    `new_column_name` in this case).
+
+    If `fun` returns a value of any other type then it is assumed that it is a
+    table conforming to the Tables.jl API and the `Tables.columntable` function
+    is called on it to get the resulting columns and their names. The names are
+    retained when `new_column_name` is `AsTable` and are replaced if
+    `new_column_name` is a vector of `Symbol`s or strings.
 
     # Rules when element of `args` is a function or a type
 

From 2cdeb7da927c3be8e74a9c1610562004c2c3754a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Wed, 7 Oct 2020 22:37:28 +0200
Subject: [PATCH 21/21] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/abstractdataframe/selection.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index cb1be3d330..76393ccd85 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -453,11 +453,11 @@ SELECT_ARG_RULES =
     # Rules when `new_column_name` is a `Symbol` or a string or is absent
 
     If `fun` returns a value of type other than `AbstractVector` then it will be
-    broadcasted into a vector matching the target number of rows in the data
+    repeated in a vector matching the target number of rows in the data
     frame, unless its type is one of `AbstractDataFrame`, `NamedTuple`,
     `DataFrameRow`, `AbstractMatrix`, in which case an error is thrown. As a
     particular rule, values wrapped in a `Ref` or a `0`-dimensional
-    `AbstractArray` are unwrapped and then broadcasted.
+    `AbstractArray` are unwrapped and then repeated.
 
     To apply `fun` to each row instead of whole columns, it can be wrapped in a
     `ByRow` struct. In this case if `old_column` is a `Symbol`, a string, or an
@@ -488,7 +488,7 @@ SELECT_ARG_RULES =
     when `args` is a function or a type apply.
 
     If `fun` returns an `AbstractVector` then each element of this vector must
-    support `keys` function that must return a collection of `Symbol`s, strings
+    support the `keys` function, which must return a collection of `Symbol`s, strings
     or integers; the return value of `keys` must be identical for all elements.
     Then as many columns are created as there are elements in the return value
     of the `keys` function. If `new_column_name` is `AsTable` then their names
@@ -521,10 +521,10 @@ SELECT_ARG_RULES =
     If the return value is an `AbstractVector` then it is used as-is. The resulting
     column gets the name `x1`.
 
-    In all other cases the return value is broadcasted into a vector matching
+    In all other cases the return value is repeated in a vector matching
     the target number of rows in the data frame. As a particular rule, values
     wrapped in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and
-    then broadcasted. The resulting column gets the name `x1`.
+    then repeated. The resulting column gets the name `x1`.
 
     # Special rules