From a8402acdc659a16fadcd193319352e1405fb833b Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sat, 19 Mar 2022 16:07:13 +0100 Subject: [PATCH 1/6] Add `skipmissing` argument to `levels` Currently if one wants to get levels in their appropriate order, but add `missing` if present, the only solution is to do something like `union(levels(x), unique(x))`, which is inefficient. Support `skipmissing=false` to allow doing this in a single pass over the data. Use `@inline` to ensure that the return type can be inferred when the value of `skipmissing` is known statically. Also fix a type instability which existed for ranges. --- src/DataAPI.jl | 34 +++++++++++++++++++++++++------ test/runtests.jl | 52 ++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 72 insertions(+), 14 deletions(-) diff --git a/src/DataAPI.jl b/src/DataAPI.jl index f723983..0eab97a 100644 --- a/src/DataAPI.jl +++ b/src/DataAPI.jl @@ -106,22 +106,28 @@ definition. function describe end """ - levels(x) + levels(x; skipmissing=true) -Return a vector of unique values which occur or could occur in collection `x`, -omitting `missing` even if present. Values are returned in the preferred order -for the collection, with the result of [`sort`](@ref) as a default. +Return a vector of unique values which occur or could occur in collection `x`. +`missing` values are skipped unless `skipmissing=false` is passed. + +Values are returned in the preferred order for the collection, +with the result of [`sort`](@ref) as a default. If the collection is not sortable then the order of levels is unspecified. Contrary to [`unique`](@ref), this function may return values which do not actually occur in the data, and does not preserve their order of appearance in `x`. """ -function levels(x) +@inline levels(x; skipmissing::Bool=true) = + skipmissing ? _levels_skipmissing(x) : _levels_missing(x) + +function _levels_skipmissing(x) T = Base.nonmissingtype(eltype(x)) u = unique(x) # unique returns its input with copying for ranges # (and possibly for other types guaranteed to hold unique values) - nmu = (u === x || Base.mightalias(u, x)) ? filter(!ismissing, u) : filter!(!ismissing, u) + nmu = (u isa AbstractRange || u === x || Base.mightalias(u, x)) ? + filter(!ismissing, u) : filter!(!ismissing, u) levs = convert(AbstractArray{T}, nmu) try sort!(levs) @@ -130,6 +136,22 @@ function levels(x) levs end +function _levels_missing(x) + levs = convert(AbstractArray{eltype(x)}, unique(x)) + # unique returns its input with copying for ranges + # (and possibly for other types guaranteed to hold unique values) + docopy = x isa AbstractRange || levs === x || Base.mightalias(levs, x) + try + if docopy + levs = sort(levs) + else + sort!(levs) + end + catch + end + levs +end + """ Between(first, last) diff --git a/test/runtests.jl b/test/runtests.jl index a086310..a6c39d1 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,7 @@ using Test, DataAPI +const ≅ = isequal + @testset "DataAPI" begin @testset "defaultarray" begin @@ -29,15 +31,15 @@ end @testset "levels" begin - @test DataAPI.levels(1:1) == - DataAPI.levels([1]) == - DataAPI.levels([1, missing]) == - DataAPI.levels([missing, 1]) == + @test @inferred(DataAPI.levels(1:1)) == + @inferred(DataAPI.levels([1])) == + @inferred(DataAPI.levels([1, missing])) == + @inferred(DataAPI.levels([missing, 1])) == [1] - @test DataAPI.levels(2:-1:1) == - DataAPI.levels([2, 1]) == - DataAPI.levels(Any[2, 1]) == - DataAPI.levels([2, missing, 1]) == + @test @inferred(DataAPI.levels(2:-1:1)) == + @inferred(DataAPI.levels([2, 1])) == + @inferred(DataAPI.levels(Any[2, 1])) == + @inferred(DataAPI.levels([2, missing, 1])) == [1, 2] @test DataAPI.levels([missing, "a", "c", missing, "b"]) == ["a", "b", "c"] @test DataAPI.levels([Complex(0, 1), Complex(1, 0), missing]) == @@ -55,6 +57,40 @@ end @test isempty(DataAPI.levels([missing])) @test isempty(DataAPI.levels([])) + levels_skipmissing(x) = DataAPI.levels(x, skipmissing=false) + @test @inferred(levels_skipmissing(1:1)) ≅ + @inferred(levels_skipmissing([1])) ≅ + [1] + @test @inferred(levels_skipmissing([1, missing])) ≅ + @inferred(levels_skipmissing([missing, 1])) ≅ + [1, missing] + @test @inferred(levels_skipmissing(2:-1:1)) ≅ + @inferred(levels_skipmissing([2, 1])) ≅ + [1, 2] + @test @inferred(levels_skipmissing(Any[2, 1])) ≅ + [1, 2] + @test DataAPI.levels([2, missing, 1], skipmissing=false) ≅ + [1, 2, missing] + @test DataAPI.levels([missing, "a", "c", missing, "b"], skipmissing=false) ≅ + ["a", "b", "c", missing] + @test DataAPI.levels([Complex(0, 1), Complex(1, 0), missing], skipmissing=false) ≅ + [Complex(0, 1), Complex(1, 0), missing] + @test typeof(DataAPI.levels([1], skipmissing=false)) === Vector{Int} + @test typeof(DataAPI.levels([1, missing], skipmissing=false)) === + Vector{Union{Int, Missing}} + @test typeof(DataAPI.levels(["a"], skipmissing=false)) === Vector{String} + @test typeof(DataAPI.levels(["a", missing], skipmissing=false)) === + Vector{Union{String, Missing}} + @test typeof(DataAPI.levels(Real[1], skipmissing=false)) === Vector{Real} + @test typeof(DataAPI.levels(Union{Real,Missing}[1, missing], skipmissing=false)) === + Vector{Union{Real, Missing}} + @test typeof(DataAPI.levels(trues(1), skipmissing=false)) === Vector{Bool} + @test DataAPI.levels([missing], skipmissing=false) ≅ [missing] + @test DataAPI.levels([missing], skipmissing=false) isa Vector{Missing} + @test typeof(DataAPI.levels(Union{Int,Missing}[missing], skipmissing=false)) === + Vector{Union{Int,Missing}} + @test isempty(DataAPI.levels([], skipmissing=false)) + @test typeof(DataAPI.levels(Int[], skipmissing=false)) === Vector{Int} end @testset "Between" begin From 7700f0f38c0a9cbf11f56d9d042640170697a103 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 20 Mar 2022 15:01:51 +0100 Subject: [PATCH 2/6] Fix tests on Julia 1.0 --- test/runtests.jl | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index a6c39d1..9745467 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -61,9 +61,15 @@ end @test @inferred(levels_skipmissing(1:1)) ≅ @inferred(levels_skipmissing([1])) ≅ [1] - @test @inferred(levels_skipmissing([1, missing])) ≅ - @inferred(levels_skipmissing([missing, 1])) ≅ - [1, missing] + if VERSION >= v"1.6.0" + @test @inferred(levels_skipmissing([1, missing])) ≅ + @inferred(levels_skipmissing([missing, 1])) ≅ + [1, missing] + else + @test levels_skipmissing([1, missing]) ≅ + levels_skipmissing([missing, 1]) ≅ + [1, missing] + end @test @inferred(levels_skipmissing(2:-1:1)) ≅ @inferred(levels_skipmissing([2, 1])) ≅ [1, 2] From b4b9db5703e8696853097aef7d6a835c34a47558 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 20 Mar 2022 22:28:51 +0100 Subject: [PATCH 3/6] Add backward compatibility strategy based on `which` --- src/DataAPI.jl | 54 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/src/DataAPI.jl b/src/DataAPI.jl index 0eab97a..ded32a4 100644 --- a/src/DataAPI.jl +++ b/src/DataAPI.jl @@ -121,35 +121,47 @@ actually occur in the data, and does not preserve their order of appearance in ` @inline levels(x; skipmissing::Bool=true) = skipmissing ? _levels_skipmissing(x) : _levels_missing(x) +# The `which` check is here for backward compatibility: +# if a type implements a custom `levels` method but does not support +# keyword arguments, `levels(x, skipmissing=true/false)` will dispatch +# to the fallback methods here, and we take care of calling that custom method function _levels_skipmissing(x) - T = Base.nonmissingtype(eltype(x)) - u = unique(x) - # unique returns its input with copying for ranges - # (and possibly for other types guaranteed to hold unique values) - nmu = (u isa AbstractRange || u === x || Base.mightalias(u, x)) ? - filter(!ismissing, u) : filter!(!ismissing, u) - levs = convert(AbstractArray{T}, nmu) - try - sort!(levs) - catch + if which(DataAPI.levels, Tuple{typeof(x)}) === which(DataAPI.levels, Tuple{Any}) + T = Base.nonmissingtype(eltype(x)) + u = unique(x) + # unique returns its input with copying for ranges + # (and possibly for other types guaranteed to hold unique values) + nmu = (u isa AbstractRange || u === x || Base.mightalias(u, x)) ? + filter(!ismissing, u) : filter!(!ismissing, u) + levs = convert(AbstractArray{T}, nmu) + try + sort!(levs) + catch + end + return levs + else + return levels(x) end - levs end function _levels_missing(x) - levs = convert(AbstractArray{eltype(x)}, unique(x)) - # unique returns its input with copying for ranges - # (and possibly for other types guaranteed to hold unique values) - docopy = x isa AbstractRange || levs === x || Base.mightalias(levs, x) - try - if docopy - levs = sort(levs) - else + if which(DataAPI.levels, Tuple{typeof(x)}) === which(DataAPI.levels, Tuple{Any}) + u = convert(AbstractArray{eltype(x)}, unique(x)) + # unique returns its input with copying for ranges + # (and possibly for other types guaranteed to hold unique values) + levs = (x isa AbstractRange || u === x || Base.mightalias(u, x)) ? + Base.copymutable(u) : u + try sort!(levs) + catch end - catch + return levs + # This is a suboptimal fallback since it does a second pass over the data + elseif any(ismissing, x) + return [levels(x); missing] + else + return convert(AbstractArray{eltype(x)}, levels(x)) end - levs end """ From 88ffd6d3139aa864c50e92c7a2f15a9e43610ec1 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sat, 26 Mar 2022 16:21:49 +0100 Subject: [PATCH 4/6] Fix inference --- src/DataAPI.jl | 8 ++++++-- test/runtests.jl | 30 ++++++++++++++++++++---------- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/DataAPI.jl b/src/DataAPI.jl index ded32a4..5abbf6e 100644 --- a/src/DataAPI.jl +++ b/src/DataAPI.jl @@ -105,6 +105,9 @@ definition. """ function describe end +# Sentinel type needed to make `levels` inferrable +struct _Default end + """ levels(x; skipmissing=true) @@ -118,8 +121,9 @@ If the collection is not sortable then the order of levels is unspecified. Contrary to [`unique`](@ref), this function may return values which do not actually occur in the data, and does not preserve their order of appearance in `x`. """ -@inline levels(x; skipmissing::Bool=true) = - skipmissing ? _levels_skipmissing(x) : _levels_missing(x) +@inline levels(x; skipmissing::Union{Bool, _Default}=_Default()) = + skipmissing isa _Default || skipmissing ? + _levels_skipmissing(x) : _levels_missing(x) # The `which` check is here for backward compatibility: # if a type implements a custom `levels` method but does not support diff --git a/test/runtests.jl b/test/runtests.jl index 9745467..d63fa06 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -29,6 +29,12 @@ end end +struct TestArray <: AbstractVector{Union{Int, Missing}} +end +Base.size(x::TestArray) = (2,) +Base.getindex(x::TestArray, i) = (checkbounds(x, i); i == 1 ? missing : i-1) +DataAPI.levels(::TestArray) = [2, 1] + @testset "levels" begin @test @inferred(DataAPI.levels(1:1)) == @@ -57,23 +63,23 @@ end @test isempty(DataAPI.levels([missing])) @test isempty(DataAPI.levels([])) - levels_skipmissing(x) = DataAPI.levels(x, skipmissing=false) - @test @inferred(levels_skipmissing(1:1)) ≅ - @inferred(levels_skipmissing([1])) ≅ + levels_missing(x) = DataAPI.levels(x, skipmissing=false) + @test @inferred(levels_missing(1:1)) ≅ + @inferred(levels_missing([1])) ≅ [1] if VERSION >= v"1.6.0" - @test @inferred(levels_skipmissing([1, missing])) ≅ - @inferred(levels_skipmissing([missing, 1])) ≅ + @test @inferred(levels_missing([1, missing])) ≅ + @inferred(levels_missing([missing, 1])) ≅ [1, missing] else - @test levels_skipmissing([1, missing]) ≅ - levels_skipmissing([missing, 1]) ≅ + @test levels_missing([1, missing]) ≅ + levels_missing([missing, 1]) ≅ [1, missing] end - @test @inferred(levels_skipmissing(2:-1:1)) ≅ - @inferred(levels_skipmissing([2, 1])) ≅ + @test @inferred(levels_missing(2:-1:1)) ≅ + @inferred(levels_missing([2, 1])) ≅ [1, 2] - @test @inferred(levels_skipmissing(Any[2, 1])) ≅ + @test @inferred(levels_missing(Any[2, 1])) ≅ [1, 2] @test DataAPI.levels([2, missing, 1], skipmissing=false) ≅ [1, 2, missing] @@ -97,6 +103,10 @@ end Vector{Union{Int,Missing}} @test isempty(DataAPI.levels([], skipmissing=false)) @test typeof(DataAPI.levels(Int[], skipmissing=false)) === Vector{Int} + + @test DataAPI.levels(TestArray()) == + DataAPI.levels(TestArray(), skipmissing=true) == [2, 1] + @test DataAPI.levels(TestArray(), skipmissing=false) ≅ [2, 1, missing] end @testset "Between" begin From 675536a3391cf3871fe2376b7558fcccd8f40593 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sat, 26 Mar 2022 16:34:15 +0100 Subject: [PATCH 5/6] Fix Julia 1.0 --- test/runtests.jl | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index d63fa06..d62f2c0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,6 +2,13 @@ using Test, DataAPI const ≅ = isequal +# For `levels` tests +struct TestArray <: AbstractVector{Union{Int, Missing}} +end +Base.size(x::TestArray) = (2,) +Base.getindex(x::TestArray, i) = (checkbounds(x, i); i == 1 ? missing : i-1) +DataAPI.levels(::TestArray) = [2, 1] + @testset "DataAPI" begin @testset "defaultarray" begin @@ -29,12 +36,6 @@ end end -struct TestArray <: AbstractVector{Union{Int, Missing}} -end -Base.size(x::TestArray) = (2,) -Base.getindex(x::TestArray, i) = (checkbounds(x, i); i == 1 ? missing : i-1) -DataAPI.levels(::TestArray) = [2, 1] - @testset "levels" begin @test @inferred(DataAPI.levels(1:1)) == @@ -104,6 +105,9 @@ DataAPI.levels(::TestArray) = [2, 1] @test isempty(DataAPI.levels([], skipmissing=false)) @test typeof(DataAPI.levels(Int[], skipmissing=false)) === Vector{Int} + # Backward compatibility test: + # check that an array type which implements a `levels` method + # which does not accept keyword arguments works thanks to fallbacks @test DataAPI.levels(TestArray()) == DataAPI.levels(TestArray(), skipmissing=true) == [2, 1] @test DataAPI.levels(TestArray(), skipmissing=false) ≅ [2, 1, missing] From 8f36ce92ccf98600b064aae645dfe2be14584a3e Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 27 Mar 2022 11:59:41 +0200 Subject: [PATCH 6/6] Improve tests --- test/runtests.jl | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index d62f2c0..9782ac2 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,11 +3,12 @@ using Test, DataAPI const ≅ = isequal # For `levels` tests -struct TestArray <: AbstractVector{Union{Int, Missing}} +struct TestArray{T} <: AbstractVector{T} + x::Vector{T} end -Base.size(x::TestArray) = (2,) -Base.getindex(x::TestArray, i) = (checkbounds(x, i); i == 1 ? missing : i-1) -DataAPI.levels(::TestArray) = [2, 1] +Base.size(x::TestArray) = size(x.x) +Base.getindex(x::TestArray, i) = x.x[i] +DataAPI.levels(x::TestArray) = reverse(DataAPI.levels(x.x)) @testset "DataAPI" begin @@ -108,9 +109,21 @@ end # Backward compatibility test: # check that an array type which implements a `levels` method # which does not accept keyword arguments works thanks to fallbacks - @test DataAPI.levels(TestArray()) == - DataAPI.levels(TestArray(), skipmissing=true) == [2, 1] - @test DataAPI.levels(TestArray(), skipmissing=false) ≅ [2, 1, missing] + @test DataAPI.levels(TestArray([1, 2])) == + DataAPI.levels(TestArray([1, 2]), skipmissing=true) == + DataAPI.levels(TestArray([1, 2]), skipmissing=false) == [2, 1] + @test DataAPI.levels(TestArray([1, 2])) isa Vector{Int} + @test DataAPI.levels(TestArray([1, 2]), skipmissing=true) isa Vector{Int} + @test DataAPI.levels(TestArray([1, 2]), skipmissing=false) isa Vector{Int} + @test DataAPI.levels(TestArray([missing, 1, 2])) == + DataAPI.levels(TestArray([missing, 1, 2]), skipmissing=true) == [2, 1] + @test DataAPI.levels(TestArray([missing, 1, 2]), skipmissing=false) ≅ + [2, 1, missing] + @test DataAPI.levels(TestArray([missing, 1, 2])) isa Vector{Int} + @test DataAPI.levels(TestArray([missing, 1, 2]), skipmissing=true) isa + Vector{Int} + @test DataAPI.levels(TestArray([missing, 1, 2]), skipmissing=false) isa + Vector{Union{Int, Missing}} end @testset "Between" begin