From 8e2da16f24fc47c324db16f0bbc851cc8f022268 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 5 Dec 2017 22:40:20 +0100 Subject: [PATCH] Correctly handle recoding pair with value range and source containing missings (#106) --- src/recode.jl | 33 ++++++++++++++++++++++----------- test/16_recode.jl | 45 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 66 insertions(+), 12 deletions(-) diff --git a/src/recode.jl b/src/recode.jl index ffbdd908..a8486e83 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -6,9 +6,9 @@ const ≅ = isequal Fill `dest` with elements from `src`, replacing those matching a key of `pairs` with the corresponding value. -For each `Pair` in `pairs`, if the element is equal to (according to [`isequal`](@ref)) -or [`in`](@ref) the key (first item of the pair), then the corresponding value -(second item) is copied to `dest`. +For each `Pair` in `pairs`, if the element is equal to (according to [`isequal`](@ref)) +the key (first item of the pair) or to one of its entries if it is a collection, +then the corresponding value (second item) is copied to `dest`. If the element matches no key and `default` is not provided or `nothing`, it is copied as-is; if `default` is specified, it is used in place of the original element. `dest` and `src` must be of the same length, but not necessarily of the same type. @@ -16,6 +16,11 @@ Elements of `src` as well as values from `pairs` will be `convert`ed when possib on assignment. If an element matches more than one key, the first match is used. + recode!(dest::CategoricalArray, src::AbstractArray[, default::Any], pairs::Pair...) + +If `dest` is a `CategoricalArray` then the ordering of resulting levels is determined +by the order of passed `pairs` and `default` will be the last level if provided. + recode!(dest::AbstractArray, src::AbstractArray{>:Missing}[, default::Any], pairs::Pair...) 
If `src` contains missing values, they are never replaced with `default`: @@ -36,8 +41,8 @@ function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs for j in 1:length(pairs) p = pairs[j] - if (!isa(p.first, Union{AbstractArray, Tuple}) && x ≅ p.first) || - (isa(p.first, Union{AbstractArray, Tuple}) && x in p.first) + if ((isa(p.first, Union{AbstractArray, Tuple}) && any(x ≅ y for y in p.first)) || + x ≅ p.first) dest[i] = p.second @goto nextitem end @@ -89,8 +94,8 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa for j in 1:length(pairs) p = pairs[j] - if (!isa(p.first, Union{AbstractArray, Tuple}) && x ≅ p.first) || - (isa(p.first, Union{AbstractArray, Tuple}) && x in p.first) + if ((isa(p.first, Union{AbstractArray, Tuple}) && any(x ≅ y for y in p.first)) || + x ≅ p.first) drefs[i] = dupvals ? pairmap[j] : j @goto nextitem end @@ -146,7 +151,7 @@ function recode!(dest::CategoricalArray{T}, src::CategoricalArray, default::Any, for l in srclevels if !(any(x -> x ≅ l, firsts) || - any(f -> isa(f, Union{AbstractArray, Tuple}) && l in f, firsts)) + any(f -> isa(f, Union{AbstractArray, Tuple}) && any(l ≅ y for y in f), firsts)) try push!(keptlevels, l) catch err @@ -176,7 +181,8 @@ function recode!(dest::CategoricalArray{T}, src::CategoricalArray, default::Any, # For missing values (0 if no missing in pairs' keys) indexmap[1] = 0 for p in pairs - if ismissing(p.first) + if ((isa(p.first, Union{AbstractArray, Tuple}) && any(ismissing, p.first)) || + ismissing(p.first)) indexmap[1] = get(dest.pool, p.second) break end @@ -189,8 +195,8 @@ function recode!(dest::CategoricalArray{T}, src::CategoricalArray, default::Any, @inbounds for (i, l) in enumerate(srcindex) for j in 1:length(pairs) p = pairs[j] - if (!isa(p.first, Union{AbstractArray, Tuple}) && l ≅ p.first) || - (isa(p.first, Union{AbstractArray, Tuple}) && l in p.first) + if ((isa(p.first, Union{AbstractArray, Tuple}) && any(l ≅ y for y in p.first)) || + l ≅ 
p.first) indexmap[i+1] = pairmap[j] @goto nextitem end @@ -268,6 +274,11 @@ If the element matches no key and `default` is not provided or `nothing`, it is if `default` is specified, it is used in place of the original element. If an element matches more than one key, the first match is used. + recode(a::CategoricalArray[, default::Any], pairs::Pair...) + +If `a` is a `CategoricalArray` then the ordering of resulting levels is determined +by the order of passed `pairs` and `default` will be the last level if provided. + # Examples ```jldoctest julia> using CategoricalArrays diff --git a/test/16_recode.jl b/test/16_recode.jl index 158456c9..0c459bde 100644 --- a/test/16_recode.jl +++ b/test/16_recode.jl @@ -123,6 +123,7 @@ end @testset "Recoding from $(typeof(x)) to categorical array with missing values" for x in (["a", missing, "c", "d"], CategoricalArray(["a", missing, "c", "d"])) + # check that error is thrown y = Vector{String}(4) @test_throws MissingException recode!(y, x, "a", "c"=>"b") @@ -159,7 +160,21 @@ end end end -@testset "Recoding array with missings, no default and with missing as a key pair from $(typeof(x)) to $(typeof(y))" for +@testset "Collection in LHS recoding array with missings and no default from $(typeof(x)) to $(typeof(y))" for + x in (["1", missing, "3", "4", "5"], CategoricalArray(["1", missing, "3", "4", "5"])), + y in (similar(x), Array{Union{String, Missing}}(size(x)), + CategoricalArray{Union{String, Missing}}(size(x)), x) + + z = @inferred recode!(y, x, ["3","4"]=>"2") + @test y === z + @test y ≅ ["1", missing, "2", "2", "5"] + if isa(y, CategoricalArray) + @test levels(y) == ["1", "5", "2"] + @test !isordered(y) + end +end + +@testset "Recoding array with missings, default and with missing as a key pair from $(typeof(x)) to $(typeof(y))" for x in (["a", missing, "c", "d"], CategoricalArray(["a", missing, "c", "d"])), y in (similar(x), Array{Union{String, Missing}}(size(x)), CategoricalArray{Union{String, Missing}}(size(x)), x) @@ 
-173,6 +188,20 @@ end end end +@testset "Collection with missing in LHS recoding array with missings, default from $(typeof(x)) to $(typeof(y))" for + x in (["a", missing, "c", "d"], CategoricalArray(["a", missing, "c", "d"])), + y in (similar(x), Array{Union{String, Missing}}(size(x)), + CategoricalArray{Union{String, Missing}}(size(x)), x) + + z = @inferred recode!(y, x, "a", [missing, "c"]=>"b") + @test y === z + @test y == ["a", "b", "b", "a"] + if isa(y, CategoricalArray) + @test levels(y) == ["b", "a"] + @test !isordered(y) + end +end + @testset "Recoding array with missings, no default and with missing as a key pair from $(typeof(x)) to $(typeof(y))" for x in (["a", missing, "c", "d"], CategoricalArray(["a", missing, "c", "d"])), y in (similar(x), Array{Union{String, Missing}}(size(x)), @@ -187,6 +216,20 @@ end end end +@testset "Collection with missing in LHS recoding array with missings, no default from $(typeof(x)) to $(typeof(y))" for + x in (["a", missing, "c", "d"], CategoricalArray(["a", missing, "c", "d"])), + y in (similar(x), Array{Union{String, Missing}}(size(x)), + CategoricalArray{Union{String, Missing}}(size(x)), x) + + z = @inferred recode!(y, x, ["c", missing]=>"b") + @test y === z + @test y == ["a", "b", "b", "d"] + if isa(y, CategoricalArray) + @test levels(y) == ["a", "d", "b"] + @test !isordered(y) + end +end + @testset "Recoding into an array of incompatible size from $(typeof(x)) to $(typeof(y))" for x in (["a", missing, "c", "d"], CategoricalArray(["a", missing, "c", "d"])), y in (similar(x, 0), Array{Union{String, Missing}}(0),