Skip to content

Commit

Permalink
Changed recode to accept more general collection types (#290)
Browse files Browse the repository at this point in the history
  • Loading branch information
HenricoWitvliet authored Apr 4, 2021
1 parent a898b11 commit 220198a
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 9 deletions.
29 changes: 20 additions & 9 deletions src/recode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,19 @@ recode!(dest::CategoricalArray, src::AbstractArray, pairs::Pair...) =
recode!(dest::CategoricalArray, src::CategoricalArray, pairs::Pair...) =
recode!(dest, src, nothing, pairs...)

"""
    recode_in(x, collection)

Return `true` if `x` is a member of `collection`, and `false` otherwise.

The default method tests whether any element of `collection` is `isequal` to `x`.
For `Set`s, `in` is used instead, as it is faster than the default method and
equivalent to it. A `missing` collection never contains anything, so the method
for `Missing` always returns `false`.

A user-defined type can add a method to this function to define an appropriate
membership test.
"""
@inline recode_in(x, ::Missing) = false
@inline recode_in(x, collection::Set) = x in collection
# `isequal` (rather than `==`) always returns a `Bool`, even when `missing` is involved,
# so `any` never sees a `missing` result.
@inline recode_in(x, collection) = any(isequal(x, y) for y in collection)

function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs::Pair...) where {T}
if length(dest) != length(src)
throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))"))
Expand All @@ -46,8 +59,8 @@ function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs

for j in 1:length(pairs)
p = pairs[j]
if ((isa(p.first, Union{AbstractArray, Tuple}) && any(x ≅ y for y in p.first)) ||
x ≅ p.first)
# we use isequal and recode_in because we cannot really distinguish scalars from collections
if x ≅ p.first || recode_in(x, p.first)
dest[i] = p.second
@goto nextitem
end
Expand Down Expand Up @@ -99,8 +112,8 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa

for j in 1:length(pairs)
p = pairs[j]
if ((isa(p.first, Union{AbstractArray, Tuple}) && any(x ≅ y for y in p.first)) ||
x ≅ p.first)
# we use isequal and recode_in because we cannot really distinguish scalars from collections
if x ≅ p.first || recode_in(x, p.first)
drefs[i] = dupvals ? pairmap[j] : j
@goto nextitem
end
Expand Down Expand Up @@ -166,7 +179,7 @@ function recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray,

for l in srclevels
if !(any(x -> x ≅ l, firsts) ||
any(f -> isa(f, Union{AbstractArray, Tuple}) && any(l ≅ y for y in f), firsts))
any(f -> recode_in(l, f), firsts))
try
push!(keptlevels, l)
catch err
Expand Down Expand Up @@ -200,8 +213,7 @@ function recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray,
# For missing values (0 if no missing in pairs' keys)
levelsmap[1] = 0
for p in pairs
if ((isa(p.first, Union{AbstractArray, Tuple}) && any(ismissing, p.first)) ||
ismissing(p.first))
if (ismissing(p.first) || any(ismissing, p.first))
levelsmap[1] = get(dest.pool, p.second)
break
end
Expand All @@ -214,8 +226,7 @@ function recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray,
@inbounds for (i, l) in enumerate(srclevels)
for j in 1:length(pairs)
p = pairs[j]
if ((isa(p.first, Union{AbstractArray, Tuple}) && any(l ≅ y for y in p.first)) ||
l ≅ p.first)
if l ≅ p.first || recode_in(l, p.first)
levelsmap[i+1] = pairmap[j]
@goto nextitem
end
Expand Down
57 changes: 57 additions & 0 deletions test/16_recode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,48 @@ end

const ≅ = isequal

@testset "recode_in" begin
    # Local shorthand for the unexported helper under test.
    rin = CategoricalArrays.recode_in

    # Membership is checked against the characters of the string, not substrings.
    @testset "collection is a string" begin
        @test !rin("a", "ab")
        @test rin('a', "ab")
        @test !rin('c', "ab")
        @test !rin(missing, "b")
    end

    @testset "collection without missing" begin
        @test rin(1, [1, 2])
        @test !rin(1, [2, 3])
    end

    # `missing` inside a collection is matched by `missing`, and does not
    # disturb lookups of other values.
    @testset "collection with missing" begin
        @test rin(1, [1, 2, missing])
        @test !rin(1, [2, missing])
        @test rin(missing, [1, 2, missing])
    end

    # A bare number matches itself; a bare `missing` contains nothing,
    # not even `missing`.
    @testset "collection is a single value" begin
        @test rin(1, 1)
        @test !rin(1, missing)
        @test !rin(missing, missing)
    end

    @testset "tuple without missing" begin
        @test rin(1, (1, 2))
        @test !rin(1, (2, 3))
    end

    @testset "tuple with missing" begin
        @test rin(1, (1, 2, missing))
        @test !rin(1, (2, missing))
        @test rin(missing, (1, 2, missing))
    end

    # Elements that are themselves arrays are compared as whole values.
    @testset "nested arrays" begin
        @test rin([1,2], [[1, 2], [3, 4]])
        @test !rin([1, 3], [[1, 2], [3, 4]])
    end

    # `NaN` is expected to be found in a collection that contains `NaN`.
    @testset "NaN in array" begin
        @test rin(NaN, [1, 2, NaN])
        @test !rin(NaN, [1, 2, 3])
        @test rin(2, [1, 2, NaN])
        @test !rin(3, [1, 2, NaN])
    end
end

## Test recode!, used by recode

# Test both recoding into x itself and into an uninitialized vector
Expand All @@ -29,6 +71,21 @@ const ≅ = isequal
end
end

@testset "Recoding from $(typeof(x)) to $(typeof(y)) using a Set as the first argument in a pair" for
    x in ([1:10;], CategoricalArray(1:10), CategoricalArray{Union{Int, Missing}}(1:10)),
    y in (similar(x), Array{Int}(undef, size(x)),
          CategoricalArray{Int}(undef, size(x)),
          CategoricalArray{Union{Int, Missing}}(undef, size(x)), x)

    # Pairs: 1 => 100, 2 through 4 => 0, and the members of the Set (5, 9, 10) => -1.
    # The expected result below keeps 6, 7 and 8, which match no pair, unchanged.
    recoded = @inferred recode!(y, x, 1=>100, 2:4=>0, Set([5; 9:10])=>-1)

    # recode! must hand back the very destination it was given.
    @test y === recoded
    @test y == [100, 0, 0, 0, -1, 6, 7, 8, -1, -1]

    # Level checks only apply when the destination is categorical.
    if y isa CategoricalArray
        @test levels(y) == [6, 7, 8, 100, 0, -1]
        @test !isordered(y)
    end
end

@testset "Recoding from $(typeof(x)) to $(typeof(y)) with duplicate recoded values" for
x in ([1:10;], CategoricalArray(1:10), CategoricalArray{Union{Int, Missing}}(1:10)),
y in (similar(x), Array{Int}(undef, size(x)),
Expand Down

0 comments on commit 220198a

Please sign in to comment.