From 4013a0fed75a6e779a7fafaca6576c8d37866090 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Tue, 19 Apr 2022 22:17:02 +0200 Subject: [PATCH 1/3] Implement `skipmissing` argument to `levels` The argument is added by DataAPI 1.10 (JuliaData/DataAPI.jl#46). When `skipmissing=true`, the method for `CategoricalArray` can be slightly more efficient than the fallback defined in DataAPI as it avoids calling `unique`. --- src/array.jl | 19 +++++++++++++++++-- src/subarray.jl | 1 - test/13_arraycommon.jl | 18 ++++++++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/array.jl b/src/array.jl index 40b34998..78efc09b 100644 --- a/src/array.jl +++ b/src/array.jl @@ -753,14 +753,29 @@ end leveltype(::Type{T}) where {T <: CategoricalArray} = leveltype(nonmissingtype(eltype(T))) """ - levels(x::CategoricalArray) + levels(x::CategoricalArray, skipmissing=true) levels(x::CategoricalValue) Return the levels of categorical array or value `x`. This may include levels which do not actually appear in the data (see [`droplevels!`](@ref)). +`missing` will be included only if it appears in the data and +`skipmissing=false` is passed. + +The returned vector is an internal field of `x` which must not be mutated +as doing so would corrupt it. """ -DataAPI.levels(A::CategoricalArray) = levels(A.pool) +@inline function DataAPI.levels(A::CatArrOrSub{T}; skipmissing::Bool=true) where T + if eltype(A) >: Missing && !skipmissing + if any(==(0), refs(A)) + T[levels(pool(A)); missing] + else + convert(Vector{T}, levels(pool(A))) + end + else + levels(pool(A)) + end +end """ levels!(A::CategoricalArray, newlevels::Vector; allowmissing::Bool=false) diff --git a/src/subarray.jl b/src/subarray.jl index 00b38480..3e5f3f39 100644 --- a/src/subarray.jl +++ b/src/subarray.jl @@ -1,6 +1,5 @@ # delegate methods for SubArrays to support view -DataAPI.levels(sa::SubArray{T,N,P}) where {T,N,P<:CategoricalArray} = levels(parent(sa)) isordered(sa::SubArray{T,N,P}) where {T,N,P<:CategoricalArray} = isordered(parent(sa)) # This method cannot support allowmissing=true since that would modify the parent levels!(sa::SubArray{T,N,P}, newlevels::Vector) where {T,N,P<:CategoricalArray} = diff --git a/test/13_arraycommon.jl b/test/13_arraycommon.jl index aed6ad86..eac92ad0 100644 --- a/test/13_arraycommon.jl +++ b/test/13_arraycommon.jl @@ -2260,4 +2260,22 @@ end Vector{CategoricalVector{<:Any, <:Integer, <:Any, <:Any, Union{}}} end +@testset "levels with skipmissing argument" begin + for x in (categorical(["a", "b", "a"], levels=["b", "c", "a"]), + view(categorical(["c", "b", "a"], levels=["b", "c", "a"]), 2:3)) + @test @inferred(levels(x)) == ["b", "c", "a"] + @test @inferred(levels(x, skipmissing=true)) == ["b", "c", "a"] + @test @inferred(levels(x, skipmissing=false)) == ["b", "c", "a"] + end + + for x in (categorical(Union{String, Missing}["a", "b", "a"], levels=["b", "c", "a"]), + view(categorical(Union{String, Missing}["c", "b", "a"], levels=["b", "c", "a"]), 2:3)) + @test @inferred(levels(x)) == ["b", "c", "a"] + @test levels(x, skipmissing=true) == ["b", "c", "a"] + @test levels(x, skipmissing=true) isa Vector{String} + @test levels(x, skipmissing=false) == ["b", "c", "a"] + @test levels(x, skipmissing=false) isa Vector{Union{String, Missing}} + end +end + end From 326615d156e05c7135f3190c175081d6fce05957 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Tue, 19 Apr 2022 22:26:55 +0200 Subject: [PATCH 2/3] Add missing tests --- test/13_arraycommon.jl | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/test/13_arraycommon.jl b/test/13_arraycommon.jl index eac92ad0..20d61ef0 100644 --- a/test/13_arraycommon.jl +++ b/test/13_arraycommon.jl @@ -2269,13 +2269,23 @@ end end for x in (categorical(Union{String, Missing}["a", "b", "a"], levels=["b", "c", "a"]), - view(categorical(Union{String, Missing}["c", "b", "a"], levels=["b", "c", "a"]), 2:3)) + view(categorical(Union{String, Missing}["c", "b", "a"], levels=["b", "c", "a"]), 2:3), + view(categorical(Union{String, Missing}[missing, "b", "a"], levels=["b", "c", "a"]), 2:3)) @test @inferred(levels(x)) == ["b", "c", "a"] @test levels(x, skipmissing=true) == ["b", "c", "a"] @test levels(x, skipmissing=true) isa Vector{String} @test levels(x, skipmissing=false) == ["b", "c", "a"] @test levels(x, skipmissing=false) isa Vector{Union{String, Missing}} end + + for x in (categorical(Union{String, Missing}["a", "b", missing], levels=["b", "c", "a"]), + view(categorical(Union{String, Missing}["c", "b", missing], levels=["b", "c", "a"]), 2:3)) + @test @inferred(levels(x)) == ["b", "c", "a"] + @test levels(x, skipmissing=true) == ["b", "c", "a"] + @test levels(x, skipmissing=true) isa Vector{String} + @test levels(x, skipmissing=false) ≅ ["b", "c", "a", missing] + @test levels(x, skipmissing=false) isa Vector{Union{String, Missing}} + end end end From 3c17e6985edce1c7ea3691f997c8b4605ae35078 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 23 Apr 2022 19:03:39 +0200 Subject: [PATCH 3/3] Update src/array.jl --- src/array.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array.jl b/src/array.jl index 78efc09b..ffbf66b8 100644 --- a/src/array.jl +++ b/src/array.jl @@ -753,7 +753,7 @@ end leveltype(::Type{T}) where {T <: CategoricalArray} = leveltype(nonmissingtype(eltype(T))) """ - levels(x::CategoricalArray, skipmissing=true) + levels(x::CategoricalArray; skipmissing=true) levels(x::CategoricalValue) Return the levels of categorical array or value `x`.