Skip to content

Commit

Permalink
Remove valindex (#327)
Browse files Browse the repository at this point in the history
The `valindex` field used to make `getindex` faster, but this is no longer the case on
Julia 1.6. Because it triggers one allocation for each level, it also makes `getindex`
with slices slower with large pools.
  • Loading branch information
nalimilan authored Mar 31, 2021
1 parent 18671b1 commit 7e840a3
Show file tree
Hide file tree
Showing 8 changed files with 203 additions and 221 deletions.
2 changes: 1 addition & 1 deletion docs/src/implementation.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
- `refs`: an integer array that stores the position of the category level in the `levels` field of `CategoricalPool` for each `CategoricalArray` element; `0` denotes a missing value (for `CategoricalArray{Union{T, Missing}}` only).
- `pool`: the `CategoricalPool` object that maintains the levels of the array.

The `CategoricalPool{V,R,C}` type keeps track of the levels of type `V` and associates them with an integer reference code of type `R` (for internal use). It offers methods to add new levels, and efficiently get the integer index corresponding to a level and vice-versa. Whether the values of `CategoricalArray` are ordered or not is defined by an `ordered` field of the pool. Finally, `CategoricalPool{V,R,C}` keeps a `valindex` vector of value objects of type `C == CategoricalValue{V, R}`, so that `getindex` can return the existing object instead of allocating a new one.
The `CategoricalPool{V,R,C}` type keeps track of the levels of type `V` and associates them with an integer reference code of type `R` (for internal use). It offers methods to add new levels, and efficiently get the integer index corresponding to a level and vice-versa. Whether the values of `CategoricalArray` are ordered or not is defined by an `ordered` field of the pool.

Note that `CategoricalPool` levels are semi-mutable: new levels may be added, but existing ones can never be removed or reordered. This ensures that existing `CategoricalValue` objects remain valid and always point to the same level as when they were created. Therefore, `CategoricalArray`s create a new pool each time some of their levels are removed or reordered. This happens when calling `levels!`, but also when assigning a `CategoricalValue` via `setindex!`, `push!`, `append!`, `copy!` or `copyto!` (as new levels may be added to the front to preserve the relative order of both source and destination levels). Doing so requires updating all reference codes to point to the new pool, and makes it impossible to compare existing ordered `CategoricalValue` objects with values from the array using `<` and `>`.

Expand Down
4 changes: 3 additions & 1 deletion src/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1032,7 +1032,9 @@ function Base.sort!(v::CategoricalVector;
ord = Base.Sort.ord(lt, by, rev, order)
seen = counts .> 0
anymissing = eltype(v) >: Missing && seen[1]
levs = eltype(v) >: Missing ? [missing; v.pool.valindex] : v.pool.valindex
levs = eltype(v) >: Missing ?
eltype(v)[i == 0 ? missing : CategoricalValue(i, v.pool) for i in 0:length(v.pool)] :
eltype(v)[CategoricalValue(i, v.pool) for i in 1:length(v.pool)]
sortedlevs = sort!(Vector(view(levs, seen)), order=ord)
levelsmap = something.(indexin(sortedlevs, levs))
j = 0
Expand Down
5 changes: 1 addition & 4 deletions src/pool.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ end

Base.length(pool::CategoricalPool) = length(pool.levels)

Base.getindex(pool::CategoricalPool, i::Integer) = pool.valindex[i]
Base.getindex(pool::CategoricalPool, i::Integer) = CategoricalValue(i, pool)
Base.get(pool::CategoricalPool, level::Any) = pool.invindex[level]
Base.get(pool::CategoricalPool, level::Any, default::Any) = get(pool.invindex, level, default)

Expand All @@ -57,7 +57,6 @@ avoid doing a dict lookup twice

i = R(n + 1)
push!(pool.levels, x)
push!(pool.valindex, CategoricalValue(i, pool))
i
end

Expand Down Expand Up @@ -202,12 +201,10 @@ function levels!(pool::CategoricalPool{S, R}, newlevels::Vector;

empty!(pool.invindex)
resize!(pool.levels, n)
resize!(pool.valindex, n)
for i in 1:n
v = levs[i]
pool.levels[i] = v
pool.invindex[v] = i
pool.valindex[i] = CategoricalValue(i, pool)
end

return pool
Expand Down
5 changes: 1 addition & 4 deletions src/typedefs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ const SupportedTypes = Union{AbstractString, AbstractChar, Number}
mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer, V}
levels::Vector{T} # category levels ordered by their reference codes
invindex::Dict{T, R} # map from category levels to their reference codes
valindex::Vector{V} # "category value" objects 1-to-1 matching `index`
ordered::Bool

function CategoricalPool{T, R, V}(levels::Vector{T},
Expand Down Expand Up @@ -49,9 +48,7 @@ mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer, V}
if V !== CategoricalValue{T, R}
throw(ArgumentError("V must be CategoricalValue{T, R}"))
end
valindex = Vector{V}(undef, length(levels))
pool = new(levels, invindex, valindex, ordered)
pool.valindex .= CategoricalValue.(1:length(levels), Ref(pool))
pool = new(levels, invindex, ordered)
return pool
end
end
Expand Down
4 changes: 0 additions & 4 deletions test/05_copy.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ using CategoricalArrays: CategoricalPool
@test length(pool2) == 3
@test pool2.levels == ["d", "c", "b"]
@test pool2.invindex == Dict("d"=>1, "c"=>2, "b"=>3)
@test pool2.valindex == [CategoricalValue(i, pool2) for i in 1:3]
@test all(v -> v.pool === pool2, pool2.valindex)
@test pool2.ordered

levels!(pool2, ["d", "c", "b", "e"])
Expand All @@ -21,8 +19,6 @@ using CategoricalArrays: CategoricalPool
@test length(pool) == 3
@test pool.levels == ["d", "c", "b"]
@test pool.invindex == Dict("d"=>1, "c"=>2, "b"=>3)
@test pool.valindex == [CategoricalValue(i, pool) for i in 1:3]
@test all(v -> v.pool === pool, pool.valindex)
@test pool.ordered
end

Expand Down
12 changes: 1 addition & 11 deletions test/07_levels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ using CategoricalArrays: DefaultRefType, levels!
@test levels(pool) == [2, 1, 3]
@test all([levels(CategoricalValue(i, pool)) for i in 1:3] .=== Ref(levels(pool)))
@test pool.invindex == Dict(2=>1, 1=>2, 3=>3)
@test pool.valindex == [CategoricalValue(i, pool) for i in 1:3]

for rep in 1:3
push!(pool, 4)
Expand All @@ -22,7 +21,6 @@ using CategoricalArrays: DefaultRefType, levels!
@test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4)
@test get(pool, 4) === DefaultRefType(4)
@test pool[4] === CategoricalValue(4, pool)
@test pool.valindex == [CategoricalValue(i, pool) for i in 1:4]
end

for rep in 1:3
Expand All @@ -34,7 +32,6 @@ using CategoricalArrays: DefaultRefType, levels!
@test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5)
@test get(pool, 0) === DefaultRefType(5)
@test pool[5] === CategoricalValue(5, pool)
@test pool.valindex == [CategoricalValue(i, pool) for i in 1:5]
end

for rep in 1:3
Expand All @@ -48,7 +45,6 @@ using CategoricalArrays: DefaultRefType, levels!
@test get(pool, 11) === DefaultRefType(7)
@test pool[6] === CategoricalValue(6, pool)
@test pool[7] === CategoricalValue(7, pool)
@test pool.valindex == [CategoricalValue(i, pool) for i in 1:7]
end

for rep in 1:3
Expand All @@ -62,7 +58,6 @@ using CategoricalArrays: DefaultRefType, levels!
@test get(pool, 13) === DefaultRefType(9)
@test pool[8] === CategoricalValue(8, pool)
@test pool[9] === CategoricalValue(9, pool)
@test pool.valindex == [CategoricalValue(i, pool) for i in 1:9]
end

# Removing levels
Expand All @@ -86,7 +81,6 @@ using CategoricalArrays: DefaultRefType, levels!
@test get(pool, 14) === DefaultRefType(11)
@test pool[10] === CategoricalValue(10, pool)
@test pool[11] === CategoricalValue(11, pool)
@test pool.valindex == [CategoricalValue(i, pool) for i in 1:11]

# get!
ordered!(pool, true)
Expand All @@ -101,7 +95,6 @@ using CategoricalArrays: DefaultRefType, levels!
@test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9,
15=>10, 14=>11, 20=>12)
@test get(pool, 20) === DefaultRefType(12)
@test pool.valindex == [CategoricalValue(i, pool) for i in 1:12]

# get! with CategoricalValue adding new levels in conflicting order
v = CategoricalValue(2, CategoricalPool([100, 99, 4, 2]))
Expand All @@ -123,7 +116,6 @@ using CategoricalArrays: DefaultRefType, levels!
15=>10, 14=>11, 20=>12, 100=>13, 99=>14)
@test get(pool, 100) === DefaultRefType(13)
@test get(pool, 99) === DefaultRefType(14)
@test pool.valindex == [CategoricalValue(i, pool) for i in 1:14]

# get! with CategoricalValue not adding new levels
v = CategoricalValue(1, CategoricalPool([100, 2]))
Expand All @@ -133,8 +125,7 @@ using CategoricalArrays: DefaultRefType, levels!
@test length(pool) == 14
@test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99]
@test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9,
15=>10, 14=>11, 20=>12, 100=>13, 99=>14)
@test pool.valindex == [CategoricalValue(i, pool) for i in 1:14]
15=>10, 14=>11, 20=>12, 100=>13, 99=>14)

# get! with CategoricalValue from same pool
@test get!(pool, pool[1]) === DefaultRefType(1)
Expand All @@ -144,7 +135,6 @@ using CategoricalArrays: DefaultRefType, levels!
@test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99]
@test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9,
15=>10, 14=>11, 20=>12, 100=>13, 99=>14)
@test pool.valindex == [CategoricalValue(i, pool) for i in 1:14]

# get! with CategoricalValue conversion error
v = CategoricalValue(1, CategoricalPool(["a", "b"]))
Expand Down
Loading

0 comments on commit 7e840a3

Please sign in to comment.