Skip to content

Commit e1c57a2

Browse files
authored
Improve performance of constructor and levels! with many levels (JuliaData#100)
`levels!` was very slow due to repeated calls to `in`, either directly or via `issubset`. We do not actually need to call `levels!(::CategoricalArray, ::Vector)` from the constructor, since we know that all levels will be preserved (just reordered): we can call `levels!(::CategoricalPool, ::Vector)` directly. Also inline `get!(::CategoricalPool, x)` for maximum performance.
1 parent 97791c5 commit e1c57a2

File tree

3 files changed

+24
-5
lines changed

3 files changed

+24
-5
lines changed

benchmark/benchmarks.jl

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,20 @@ end
3030
@bench "CategoricalArray{String}" sumequals(ca, ca[1])
3131
@bench "CategoricalArray{Union{String, Missing}}" sumequals(nca, nca[1])
3232
end
33+
34+
# With many levels, checking whether some levels have been dropped can be very slow (#93)
35+
@benchgroup "CategoricalArray{String} with many levels" begin
36+
a = rand([@sprintf("id%010d", k) for k in 1:1000], 10000)
37+
@bench "CategoricalArray(::Vector{String})" CategoricalArray(a)
38+
39+
a = rand([@sprintf("id%010d", k) for k in 1:1000], 10000)
40+
ca = CategoricalArray(a)
41+
levs = levels(ca)
42+
@bench "levels! with original levels" levels!(ca, levs)
43+
44+
levs = reverse(levels(ca))
45+
@bench "levels! with resorted levels" levels!(ca, levs)
46+
47+
levs = [levels(ca); [@sprintf("id2%010d", k) for k in 1:1000]]
48+
@bench "levels! with many additional levels" levels!(ca, levs)
49+
end

src/array.jl

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ function convert(::Type{CategoricalArray{T, N, R}}, A::AbstractArray{S, N}) wher
246246
# if order is defined for level type, automatically apply it
247247
L = leveltype(res)
248248
if method_exists(isless, Tuple{L, L})
249-
levels!(res, sort(levels(res)))
249+
levels!(res.pool, sort(levels(res.pool)))
250250
end
251251

252252
res
@@ -553,9 +553,10 @@ function levels!(A::CategoricalArray{T}, newlevels::Vector; allow_missing=false)
553553
join(unique(filter(x->sum(newlevels.==x)>1, newlevels)), ", "))))
554554
end
555555

556-
# first pass to check whether changes can be applied without error
556+
# first pass: if some levels are removed, check whether changes can be applied without error
557557
# TODO: save original levels and undo changes in case of error to skip this step
558-
if !all(l->l in newlevels, index(A.pool))
558+
# equivalent to issubset, but faster because issubset is slow here (see JuliaLang/julia#24624)
559+
if !isempty(setdiff(index(A.pool), newlevels))
559560
deleted = [!(l in newlevels) for l in index(A.pool)]
560561
@inbounds for (i, x) in enumerate(A.refs)
561562
if T >: Missing

src/pool.jl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ Base.getindex(pool::CategoricalPool, i::Integer) = pool.valindex[i]
8888
Base.get(pool::CategoricalPool, level::Any) = pool.invindex[level]
8989
Base.get(pool::CategoricalPool, level::Any, default::Any) = get(pool.invindex, level, default)
9090

91-
function Base.get!(pool::CategoricalPool{T, R}, level::Any) where {T, R}
91+
@inline function Base.get!(pool::CategoricalPool{T, R}, level::Any) where {T, R}
9292
get!(pool.invindex, level) do
9393
x = convert(T, level)
9494
n = length(pool)
@@ -151,7 +151,8 @@ function levels!(pool::CategoricalPool{S, R}, newlevels::Vector) where {S, R}
151151
end
152152

153153
# No deletions: can preserve position of existing levels
154-
if issubset(pool.index, levs)
154+
# equivalent to issubset, but faster because issubset is slow here (see JuliaLang/julia#24624)
155+
if isempty(setdiff(pool.index, levs))
155156
append!(pool, setdiff(levs, pool.index))
156157
else
157158
empty!(pool.invindex)

0 commit comments

Comments
 (0)