From 220198aca8eec1629699f8f072f7670432eb674b Mon Sep 17 00:00:00 2001 From: HenricoWitvliet <2179781+HenricoWitvliet@users.noreply.github.com> Date: Sun, 4 Apr 2021 16:45:54 +0200 Subject: [PATCH] Changed recode to accept more general collection types (#290) --- src/recode.jl | 29 ++++++++++++++++-------- test/16_recode.jl | 57 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 9 deletions(-) diff --git a/src/recode.jl b/src/recode.jl index f3ee6089..25854670 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -36,6 +36,19 @@ recode!(dest::CategoricalArray, src::AbstractArray, pairs::Pair...) = recode!(dest::CategoricalArray, src::CategoricalArray, pairs::Pair...) = recode!(dest, src, nothing, pairs...) +""" + recode_in(x, collection) + +Helper function to test if `x` is a member of `collection`. + +The default method tests whether any element of `collection` is `isequal` to +`x`. For `Set`s, `in` is used, as it is faster than the default method and equivalent to it. +A user-defined type could override this method to define an appropriate test function. +""" +@inline recode_in(x, ::Missing) = false +@inline recode_in(x, collection::Set) = x in collection +@inline recode_in(x, collection) = any(x ≅ y for y in collection) + function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs::Pair...) 
where {T} if length(dest) != length(src) throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))")) @@ -46,8 +59,8 @@ function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs for j in 1:length(pairs) p = pairs[j] - if ((isa(p.first, Union{AbstractArray, Tuple}) && any(x ≅ y for y in p.first)) || - x ≅ p.first) + # we use isequal and recode_in because we cannot really distinguish scalars from collections + if x ≅ p.first || recode_in(x, p.first) dest[i] = p.second @goto nextitem end @@ -99,8 +112,8 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa for j in 1:length(pairs) p = pairs[j] - if ((isa(p.first, Union{AbstractArray, Tuple}) && any(x ≅ y for y in p.first)) || - x ≅ p.first) + # we use isequal and recode_in because we cannot really distinguish scalars from collections + if x ≅ p.first || recode_in(x, p.first) drefs[i] = dupvals ? pairmap[j] : j @goto nextitem end @@ -166,7 +179,7 @@ function recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, for l in srclevels if !(any(x -> x ≅ l, firsts) || - any(f -> isa(f, Union{AbstractArray, Tuple}) && any(l ≅ y for y in f), firsts)) + any(f -> recode_in(l, f), firsts)) try push!(keptlevels, l) catch err @@ -200,8 +213,7 @@ function recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, # For missing values (0 if no missing in pairs' keys) levelsmap[1] = 0 for p in pairs - if ((isa(p.first, Union{AbstractArray, Tuple}) && any(ismissing, p.first)) || - ismissing(p.first)) + if (ismissing(p.first) || any(ismissing, p.first)) levelsmap[1] = get(dest.pool, p.second) break end @@ -214,8 +226,7 @@ function recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, @inbounds for (i, l) in enumerate(srclevels) for j in 1:length(pairs) p = pairs[j] - if ((isa(p.first, Union{AbstractArray, Tuple}) && any(l ≅ y for y in p.first)) || - l ≅ p.first) + if l ≅ p.first || recode_in(l, p.first) 
levelsmap[i+1] = pairmap[j] @goto nextitem end diff --git a/test/16_recode.jl b/test/16_recode.jl index 8512967a..757d744b 100644 --- a/test/16_recode.jl +++ b/test/16_recode.jl @@ -9,6 +9,48 @@ end const ≅ = isequal +@testset "recode_in" begin + @testset "collection is a string" begin + @test !CategoricalArrays.recode_in("a", "ab") + @test CategoricalArrays.recode_in('a', "ab") + @test !CategoricalArrays.recode_in('c', "ab") + @test !CategoricalArrays.recode_in(missing, "b") + end + @testset "collection without missing" begin + @test CategoricalArrays.recode_in(1, [1, 2]) + @test !CategoricalArrays.recode_in(1, [2, 3]) + end + @testset "collection with missing" begin + @test CategoricalArrays.recode_in(1, [1, 2, missing]) + @test !CategoricalArrays.recode_in(1, [2, missing]) + @test CategoricalArrays.recode_in(missing, [1, 2, missing]) + end + @testset "collection is a single value" begin + @test CategoricalArrays.recode_in(1, 1) + @test !CategoricalArrays.recode_in(1, missing) + @test !CategoricalArrays.recode_in(missing, missing) + end + @testset "tuple without missing" begin + @test CategoricalArrays.recode_in(1, (1, 2)) + @test !CategoricalArrays.recode_in(1, (2, 3)) + end + @testset "tuple with missing" begin + @test CategoricalArrays.recode_in(1, (1, 2, missing)) + @test !CategoricalArrays.recode_in(1, (2, missing)) + @test CategoricalArrays.recode_in(missing, (1, 2, missing)) + end + @testset "nested arrays" begin + @test CategoricalArrays.recode_in([1,2], [[1, 2], [3, 4]]) + @test !CategoricalArrays.recode_in([1, 3], [[1, 2], [3, 4]]) + end + @testset "NaN in array" begin + @test CategoricalArrays.recode_in(NaN, [1, 2, NaN]) + @test !CategoricalArrays.recode_in(NaN, [1, 2, 3]) + @test CategoricalArrays.recode_in(2, [1, 2, NaN]) + @test !CategoricalArrays.recode_in(3, [1, 2, NaN]) + end +end + ## Test recode!, used by recode # Test both recoding into x itself and into an uninitialized vector @@ -29,6 +71,21 @@ const ≅ = isequal end end +@testset "Recoding 
from $(typeof(x)) to $(typeof(y)) using a Set as the first argument in a pair" for + x in ([1:10;], CategoricalArray(1:10), CategoricalArray{Union{Int, Missing}}(1:10)), + y in (similar(x), Array{Int}(undef, size(x)), + CategoricalArray{Int}(undef, size(x)), + CategoricalArray{Union{Int, Missing}}(undef, size(x)), x) + + z = @inferred recode!(y, x, 1=>100, 2:4=>0, Set([5; 9:10])=>-1) + @test y === z + @test y == [100, 0, 0, 0, -1, 6, 7, 8, -1, -1] + if isa(y, CategoricalArray) + @test levels(y) == [6, 7, 8, 100, 0, -1] + @test !isordered(y) + end +end + @testset "Recoding from $(typeof(x)) to $(typeof(y)) with duplicate recoded values" for x in ([1:10;], CategoricalArray(1:10), CategoricalArray{Union{Int, Missing}}(1:10)), y in (similar(x), Array{Int}(undef, size(x)),