From 4b12853fbdc4811c73d31727af418e0f79abc3af Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Mon, 26 May 2025 21:38:32 +0200 Subject: [PATCH 1/2] Make `levels` return a `CategoricalArray` Having `levels` preserve the eltype of the input is sometimes useful to write generic code. This is only slightly breaking as the result still compares equal to the previous behavior returning unwrapped values. --- benchmark/benchmarks.jl | 4 +- docs/src/using.md | 16 +++--- ext/CategoricalArraysArrowExt.jl | 2 + ext/CategoricalArraysRecipesBaseExt.jl | 2 +- src/array.jl | 56 +++++++++++---------- src/pool.jl | 29 ++++++----- src/recode.jl | 10 ++-- src/typedefs.jl | 5 +- src/value.jl | 8 +-- test/01_value.jl | 7 ++- test/07_levels.jl | 68 ++++++++++++++++++-------- test/11_array.jl | 2 +- test/12_missingarray.jl | 2 +- test/13_arraycommon.jl | 8 +-- test/14_view.jl | 3 +- 15 files changed, 134 insertions(+), 88 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 5c2ae42b..bf12f7c9 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -55,11 +55,11 @@ SUITE["many levels"]["CategoricalArray(::Vector{String})"] = a = rand([@sprintf("id%010d", k) for k in 1:1000], 10000) ca = CategoricalArray(a) -levs = levels(ca) +levs = unwrap.(levels(ca)) SUITE["many levels"]["levels! with original levels"] = @benchmarkable levels!(ca, levs) -levs = reverse(levels(ca)) +levs = reverse(unwrap.(levels(ca))) SUITE["many levels"]["levels!
with resorted levels"] = @benchmarkable levels!(ca, levs) diff --git a/docs/src/using.md b/docs/src/using.md index 9790e8cf..aaa6e36c 100644 --- a/docs/src/using.md +++ b/docs/src/using.md @@ -20,7 +20,7 @@ By default, the levels are lexically sorted, which is clearly not correct in our ```jldoctest using julia> levels(x) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Middle" "Old" "Young" @@ -68,7 +68,7 @@ To get rid of the `"Old"` group, just call the [`droplevels!`](@ref) function: ```jldoctest using julia> levels(x) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Young" "Middle" "Old" @@ -81,7 +81,7 @@ julia> droplevels!(x) "Young" julia> levels(x) -2-element Vector{String}: +2-element CategoricalArray{String,1,UInt32}: "Young" "Middle" @@ -139,7 +139,7 @@ Levels still need to be reordered manually: ```jldoctest using julia> levels(y) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Middle" "Old" "Young" @@ -263,7 +263,7 @@ true Likewise, assigning a `CategoricalValue` from `y` to an entry in `x` expands the levels of `x` with all levels from `y`, *respecting the ordering of levels of both vectors if possible*: ```jldoctest using julia> levels(x) -2-element Vector{String}: +2-element CategoricalArray{String,1,UInt32}: "Middle" "Old" @@ -271,7 +271,7 @@ julia> x[1] = y[1] CategoricalValue{String, UInt32} "Young" (1/2) julia> levels(x) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Young" "Middle" "Old" @@ -296,7 +296,7 @@ julia> ab = vcat(a, b) "c" julia> levels(ab) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "a" "b" "c" @@ -320,7 +320,7 @@ julia> ab2 = vcat(a, b) "c" julia> levels(ab2) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "a" "b" "c" diff --git a/ext/CategoricalArraysArrowExt.jl b/ext/CategoricalArraysArrowExt.jl index 3e764122..811870d2 100644 --- a/ext/CategoricalArraysArrowExt.jl +++ 
b/ext/CategoricalArraysArrowExt.jl @@ -7,6 +7,8 @@ import Arrow: ArrowTypes const CATARRAY_ARROWNAME = Symbol("JuliaLang.CategoricalArrays.CategoricalArray") ArrowTypes.arrowname(::Type{<:CategoricalValue}) = CATARRAY_ARROWNAME ArrowTypes.arrowmetadata(::Type{CategoricalValue{T, R}}) where {T, R} = string(R) +ArrowTypes.ArrowType(::Type{<:CategoricalValue{T}}) where {T} = T +ArrowTypes.toarrow(x::CategoricalValue) = unwrap(x) ArrowTypes.arrowname(::Type{Union{<:CategoricalValue, Missing}}) = CATARRAY_ARROWNAME ArrowTypes.arrowmetadata(::Type{Union{CategoricalValue{T, R}, Missing}}) where {T, R} = diff --git a/ext/CategoricalArraysRecipesBaseExt.jl b/ext/CategoricalArraysRecipesBaseExt.jl index 2642f838..656f3e3d 100644 --- a/ext/CategoricalArraysRecipesBaseExt.jl +++ b/ext/CategoricalArraysRecipesBaseExt.jl @@ -9,7 +9,7 @@ else end RecipesBase.@recipe function f(::Type{T}, v::T) where T <: CategoricalValue - level_strings = [map(string, levels(v)); missing] + level_strings = [map(string, CategoricalArrays._levels(v)); missing] ticks --> eachindex(level_strings) v -> ismissing(v) ? 
length(level_strings) : Int(CategoricalArrays.refcode(v)), i -> level_strings[Int(i)] diff --git a/src/array.jl b/src/array.jl index c462e7d4..a2103e2a 100644 --- a/src/array.jl +++ b/src/array.jl @@ -239,7 +239,7 @@ function CategoricalArray{T, N, R}(A::CategoricalArray{S, N, Q}; catch err err isa LevelsException || rethrow(err) throw(ArgumentError("encountered value(s) not in specified `levels`: " * - "$(setdiff(CategoricalArrays.levels(res), levels))")) + "$(setdiff(_levels(res), levels))")) end end return res @@ -358,18 +358,18 @@ function _convert(::Type{CategoricalArray{T, N, R}}, A::AbstractArray{S, N}; copyto!(res, A) if levels !== nothing - CategoricalArrays.levels(res) == levels || + _levels(res) == levels || throw(ArgumentError("encountered value(s) not in specified `levels`: " * - "$(setdiff(CategoricalArrays.levels(res), levels))")) + "$(setdiff(_levels(res), levels))")) else # if order is defined for level type, automatically apply it L = leveltype(res) if Base.OrderStyle(L) isa Base.Ordered - levels!(res, sort(CategoricalArrays.levels(res))) + levels!(res, sort(_levels(res))) elseif hasmethod(isless, (L, L)) # isless may throw an error, e.g. 
for AbstractArray{T} of unordered T try - levels!(res, sort(CategoricalArrays.levels(res))) + levels!(res, sort(_levels(res))) catch e e isa MethodError || rethrow(e) end @@ -382,7 +382,7 @@ end # From CategoricalArray (preserve levels, ordering and R) function convert(::Type{CategoricalArray{T, N, R}}, A::CategoricalArray{S, N}) where {S, T, N, R} if length(A.pool) > typemax(R) - throw(LevelsException{T, R}(levels(A)[typemax(R)+1:end])) + throw(LevelsException{T, R}(_levels(A)[typemax(R)+1:end])) end if !(T >: Missing) && S >: Missing && any(iszero, A.refs) @@ -460,7 +460,7 @@ size(A::CategoricalArray) = size(A.refs) Base.IndexStyle(::Type{<:CategoricalArray}) = IndexLinear() function update_refs!(A::CategoricalArray, newlevels::AbstractVector) - oldlevels = levels(A) + oldlevels = _levels(A) levelsmap = similar(A.refs, length(oldlevels)+1) # 0 maps to a missing value levelsmap[1] = 0 @@ -478,7 +478,7 @@ function merge_pools!(A::CatArrOrSub, updaterefs::Bool=true, updatepool::Bool=true) newlevels, ordered = merge_pools(pool(A), pool(B)) - oldlevels = levels(A) + oldlevels = _levels(A) pA = A isa SubArray ? 
parent(A) : A ordered!(pA, ordered) # If A's levels are an ordered superset of new (merged) pool, no need to recompute refs @@ -537,8 +537,8 @@ function copyto!(dest::CatArrOrSub{T, N, R}, dstart::Integer, # try converting src to dest type to avoid partial copy corruption of dest # in the event that the src cannot be copied into dest - slevs = convert(Vector{T}, levels(src)) - dlevs = levels(dest) + slevs = convert(Vector{T}, _levels(src)) + dlevs = _levels(dest) if eltype(src) >: Missing && !(eltype(dest) >: Missing) && !all(x -> x > 0, srefs) throw(MissingException("cannot copy array with missing values to an array with element type $T")) end @@ -591,7 +591,7 @@ function copyto!(dest::CatArrOrSub{T1, N, R}, dstart::Integer, return invoke(copyto!, Tuple{AbstractArray, Integer, AbstractArray, Integer, Integer}, dest, dstart, src, sstart, n) end - newdestlevs = destlevs = copy(levels(dest)) # copy since we need original levels below + newdestlevs = destlevs = copy(_levels(dest)) # copy since we need original levels below srclevsnm = T2 >: Missing ? setdiff(srclevs, [missing]) : srclevs if !(srclevsnm ⊆ destlevs) # if order is defined for level type, automatically apply it @@ -701,7 +701,7 @@ While this will reduce memory use, this function is type-unstable, which can aff performance inside the function where the call is made. Therefore, use it with caution. """ function compress(A::CategoricalArray{T, N}) where {T, N} - R = reftype(length(levels(A.pool))) + R = reftype(length(_levels(A.pool))) convert(CategoricalArray{T, N, R}, A) end @@ -719,11 +719,11 @@ decompress(A::CategoricalArray{T, N}) where {T, N} = convert(CategoricalArray{T, N, DefaultRefType}, A) function vcat(A::CategoricalArray...) - ordered = any(isordered, A) && all(a->isordered(a) || isempty(levels(a)), A) - newlevels, ordered = mergelevels(ordered, map(levels, A)...) 
+ ordered = any(isordered, A) && all(a->isordered(a) || isempty(_levels(a)), A) + newlevels, ordered = mergelevels(ordered, map(_levels, A)...) refsvec = map(A) do a - ii = convert(Vector{Int}, indexin(levels(a.pool), newlevels)) + ii = convert(Vector{Int}, indexin(_levels(a.pool), newlevels)) [x==0 ? 0 : ii[x] for x in a.refs]::Array{Int,ndims(a)} end @@ -761,23 +761,25 @@ This may include levels which do not actually appear in the data `missing` will be included only if it appears in the data and `skipmissing=false` is passed. -The returned vector is an internal field of `x` which must not be mutated +The returned vector is owned by `x` and must not be mutated as doing so would corrupt it. """ -@inline function DataAPI.levels(A::CatArrOrSub{T}; skipmissing::Bool=true) where T +@inline function DataAPI.levels(A::CatArrOrSub; skipmissing::Bool=true) if eltype(A) >: Missing && !skipmissing if any(==(0), refs(A)) - T[levels(pool(A)); missing] + eltype(A)[levels(pool(A)); missing] else - convert(Vector{T}, levels(pool(A))) + levels_missing(pool(A)) end else levels(pool(A)) end end +_levels(A::CatArrOrSub) = _levels(pool(A)) + """ - levels!(A::CategoricalArray, newlevels::Vector; allowmissing::Bool=false) + levels!(A::CategoricalArray, newlevels::AbstractVector; allowmissing::Bool=false) Set the levels categorical array `A`. The order of appearance of levels will be respected by [`levels`](@ref DataAPI.levels), which may affect display of results in some operations; if `A` is @@ -791,7 +793,7 @@ Else, `newlevels` must include all levels which appear in the data. 
""" function levels!(A::CategoricalArray{T, N, R}, newlevels::AbstractVector; allowmissing::Bool=false) where {T, N, R} - (levels(A) == newlevels) && return A # nothing to do + (_levels(A) == newlevels) && return A # nothing to do # map each new level to its ref code newlv2ref = Dict{eltype(newlevels), Int}() @@ -806,7 +808,7 @@ function levels!(A::CategoricalArray{T, N, R}, newlevels::AbstractVector; end # map each old ref code to new ref code (or 0 if no such level) - oldlevels = levels(pool(A)) + oldlevels = _levels(pool(A)) oldref2newref = fill(0, length(oldlevels) + 1) for (i, lv) in enumerate(oldlevels) oldref2newref[i + 1] = get(newlv2ref, lv, 0) @@ -867,7 +869,7 @@ end function _uniquerefs(A::CatArrOrSub{T}) where T arefs = refs(A) res = similar(arefs, 0) - nlevels = length(levels(A)) + nlevels = length(_levels(A)) maxunique = nlevels + (T >: Missing ? 1 : 0) seen = fill(false, nlevels + 1) # always +1 for 0 (missing ref) @inbounds for ref in arefs @@ -900,7 +902,7 @@ returned by [`levels`](@ref DataAPI.levels)). """ function droplevels!(A::CategoricalArray) arefs = refs(A) - nlevels = length(levels(A)) + 1 # +1 for missing + nlevels = length(_levels(A)) + 1 # +1 for missing seen = fill(false, nlevels) seen[1] = true # assume that missing is always observed to simplify checks nseen = 1 @@ -913,7 +915,7 @@ function droplevels!(A::CategoricalArray) end # replace the pool - A.pool = typeof(pool(A))(@inbounds(levels(A)[view(seen, 2:nlevels)]), isordered(A)) + A.pool = typeof(pool(A))(@inbounds(_levels(A)[view(seen, 2:nlevels)]), isordered(A)) # recode refs to keep only the seen ones (optimized version of update_refs!()) seen[1] = false # to start levelsmap from 0 levelsmap = cumsum(seen) @@ -1030,7 +1032,7 @@ end ordered=_isordered(A), compress::Bool=false) where {T, N, R} # @inline is needed so that return type is inferred when compress is not provided - RefType = compress ? reftype(length(CategoricalArrays.levels(A))) : R + RefType = compress ? 
reftype(length(_levels(A))) : R CategoricalArray{T, N, RefType}(A, levels=levels, ordered=ordered) end @@ -1043,7 +1045,7 @@ function in(x::CategoricalValue, y::CategoricalArray{T, N, R}) where {T, N, R} if x.pool === y.pool return refcode(x) in y.refs else - ref = get(y.pool, levels(x.pool)[refcode(x)], zero(R)) + ref = get(y.pool, _levels(x.pool)[refcode(x)], zero(R)) return ref != 0 ? ref in y.refs : false end end diff --git a/src/pool.jl b/src/pool.jl index 9753a76d..2df7e345 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -21,8 +21,8 @@ Base.convert(::Type{CategoricalPool{S}}, pool::CategoricalPool{T, R}) where {S, convert(CategoricalPool{S, R}, pool) function Base.convert(::Type{CategoricalPool{T, R}}, pool::CategoricalPool) where {T, R <: Integer} - if length(levels(pool)) > typemax(R) - throw(LevelsException{T, R}(levels(pool)[typemax(R)+1:end])) + if length(pool.levels) > typemax(R) + throw(LevelsException{T, R}(pool.levels[typemax(R)+1:end])) end levelsT = convert(Vector{T}, pool.levels) @@ -37,10 +37,10 @@ Base.copy(pool::CategoricalPool{T, R}) where {T, R} = function Base.show(io::IO, pool::CategoricalPool{T, R}) where {T, R} @static if VERSION >= v"1.6.0" @printf(io, "%s{%s, %s}([%s])", CategoricalPool, T, R, - join(map(repr, levels(pool)), ", ")) + join(map(repr, pool.levels), ", ")) else @printf(io, "%s{%s,%s}([%s])", CategoricalPool, T, R, - join(map(repr, levels(pool)), ", ")) + join(map(repr, pool.levels), ", ")) end pool.ordered && print(io, " with ordered levels") @@ -65,6 +65,7 @@ it doesn't do this itself to avoid doing a dict lookup twice i = R(n + 1) push!(pool.levels, x) + push!(pool.levelsinds, i) pool_hash = pool.hash if pool_hash !== nothing pool.hash = hash(x, pool_hash) @@ -185,10 +186,10 @@ function merge_pools(a::CategoricalPool{T}, b::CategoricalPool) where {T} newlevs = T[] ordered = isordered(a) elseif length(a) == 0 - newlevs = Vector{T}(levels(b)) + newlevs = Vector{T}(b.levels) ordered = isordered(b) elseif length(b) == 0 - 
newlevs = copy(levels(a)) + newlevs = copy(a.levels) ordered = isordered(a) else ordered = isordered(a) && (isordered(b) || b ⊆ a) @@ -200,7 +201,7 @@ end @inline function Base.hash(pool::CategoricalPool, h::UInt) if pool.hash === nothing - pool.hash = hashlevels(levels(pool)) + pool.hash = hashlevels(pool.levels) end hash(pool.hash, h) end @@ -246,9 +247,9 @@ end # Contrary to the CategoricalArray one, this method only allows adding new levels at the end # so that existing CategoricalValue objects still point to the same value -function levels!(pool::CategoricalPool{S, R}, newlevels::Vector; +function levels!(pool::CategoricalPool{S, R}, newlevels::AbstractVector; checkunique::Bool=true) where {S, R} - levs = convert(Vector{S}, newlevels) + levs = newlevels isa CategoricalVector{S} ? newlevels : convert(Vector{S}, newlevels) if checkunique && !allunique(levs) throw(ArgumentError(string("duplicated levels found in levs: ", join(unique(filter(x->sum(levs.==x)>1, levs)), ", ")))) @@ -259,24 +260,30 @@ function levels!(pool::CategoricalPool{S, R}, newlevels::Vector; n = length(levs) if n > typemax(R) - throw(LevelsException{S, R}(setdiff(levs, levels(pool))[typemax(R)-length(levels(pool))+1:end])) + throw(LevelsException{S, R}(setdiff(levs, pool.levels)[typemax(R)-length(pool.levels)+1:end])) end empty!(pool.invindex) resize!(pool.levels, n) + resize!(pool.levelsinds, n) pool.hash = nothing pool.equalto = C_NULL pool.subsetof = C_NULL for i in 1:n v = levs[i] pool.levels[i] = v + pool.levelsinds[i] = i pool.invindex[v] = i end return pool end -DataAPI.levels(pool::CategoricalPool) = pool.levels +DataAPI.levels(pool::CategoricalPool{T}) where {T} = + CategoricalVector{T}(pool.levelsinds, pool) +levels_missing(pool::CategoricalPool{T}) where {T} = + CategoricalVector{Union{T, Missing}}(pool.levelsinds, pool) +_levels(pool::CategoricalPool) = pool.levels isordered(pool::CategoricalPool) = pool.ordered ordered!(pool::CategoricalPool, ordered) = (pool.ordered = ordered; 
pool) diff --git a/src/recode.jl b/src/recode.jl index 141f9967..ff258e60 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -111,7 +111,7 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau levels!(dest.pool, filter!(!ismissing, unique(vals))) # In the absence of duplicated recoded values, we do not need to lookup the reference # for each pair in the loop, which is more efficient (with loop unswitching) - dupvals = length(vals) != length(levels(dest.pool)) + dupvals = length(vals) != length(_levels(dest.pool)) drefs = dest.refs pairmap = [ismissing(v) ? zero(R) : get(dest.pool, v) for v in vals] @@ -150,7 +150,7 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau # Put existing levels first, and sort them if possible # for consistency with CategoricalArray - oldlevels = setdiff(levels(dest), vals) + oldlevels = setdiff(_levels(dest), vals) filter!(!ismissing, oldlevels) L = eltype(oldlevels) if Base.OrderStyle(L) isa Base.Ordered @@ -163,7 +163,7 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau e isa MethodError || rethrow(e) end end - levels!(dest, union(oldlevels, levels(dest))) + levels!(dest, union(oldlevels, _levels(dest))) dest end @@ -174,7 +174,7 @@ function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, vals = T[p.second for p in pairs] if default === nothing - srclevels = levels(src) + srclevels = _levels(src) # Remove recoded levels as they won't appear in result keptlevels = Vector{T}(undef, 0) @@ -201,7 +201,7 @@ function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, ordered = false end - srclevels = src.pool === dest.pool ? copy(levels(src.pool)) : levels(src.pool) + srclevels = src.pool === dest.pool ? 
copy(_levels(src.pool)) : _levels(src.pool) if length(levs) > length(srclevels) && view(levs, 1:length(srclevels)) == srclevels levels!(dest.pool, levs) else diff --git a/src/typedefs.jl b/src/typedefs.jl index 0f9aa414..238bb995 100644 --- a/src/typedefs.jl +++ b/src/typedefs.jl @@ -8,6 +8,7 @@ const SupportedTypes = Union{AbstractString, AbstractChar, Number} # * `R` integer type for referencing category levels mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer} levels::Vector{T} # category levels ordered by their reference codes + levelsinds::Vector{R} # set to 1:length(levels), used by `levels(p)` invindex::Dict{T, R} # map from category levels to their reference codes ordered::Bool # whether levels can be compared using < hash::Union{UInt, Nothing} # hash of levels @@ -45,8 +46,8 @@ mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer} invindex::Dict{T, R}, ordered::Bool, hash::Union{UInt, Nothing}=nothing) where {T, R} - pool = new(levels, invindex, ordered, hash, C_NULL, C_NULL) - return pool + return new(levels, 1:length(levels), invindex, + ordered, hash, C_NULL, C_NULL) end end diff --git a/src/value.jl b/src/value.jl index ae962adb..a1633204 100644 --- a/src/value.jl +++ b/src/value.jl @@ -27,6 +27,8 @@ reftype(x::Any) = reftype(typeof(x)) pool(x::CategoricalValue) = x.pool refcode(x::CategoricalValue) = x.ref isordered(x::CategoricalValue) = isordered(x.pool) +DataAPI.levels(x::CategoricalValue) = levels(pool(x)) +_levels(x::CategoricalValue) = _levels(pool(x)) # extract the type of the original value from array eltype `T` unwrap_catvaluetype(::Type{T}) where {T} = T @@ -42,7 +44,7 @@ unwrap_catvaluetype(::Type{T}) where {T <: CategoricalValue} = leveltype(T) Get the value wrapped by categorical value `x`. If `x` is `Missing` return `missing`. 
""" -DataAPI.unwrap(x::CategoricalValue) = levels(x)[refcode(x)] +DataAPI.unwrap(x::CategoricalValue) = _levels(x)[refcode(x)] """ levelcode(x::CategoricalValue) @@ -59,10 +61,8 @@ Return `missing`. """ levelcode(x::Missing) = missing -DataAPI.levels(x::CategoricalValue) = levels(pool(x)) - function cat_promote_type(::Type{S}, ::Type{T}) where {S, T} - U = promote_type(S, T) + U = promote_type(unwrap_catvaluetype(S), unwrap_catvaluetype(T)) U <: Union{SupportedTypes, Missing} ? U : typeintersect(Union{SupportedTypes, Missing}, Union{S, T}) end diff --git a/test/01_value.jl b/test/01_value.jl index 39f58b67..8c60ae7f 100644 --- a/test/01_value.jl +++ b/test/01_value.jl @@ -22,6 +22,8 @@ end for i in 1:3 x = CategoricalValue(pool, i) + @test levels(x) == levels(pool) + @test levels(x) isa CategoricalVector{String, UInt32} @test leveltype(x) === String @test leveltype(typeof(x)) === String @test reftype(x) === DefaultRefType @@ -48,6 +50,8 @@ end for i in 1:3 x = CategoricalValue(pool, i) + @test levels(x) == levels(pool) + @test levels(x) isa CategoricalVector{String, UInt8} @test leveltype(x) === String @test leveltype(typeof(x)) === String @test reftype(x) === UInt8 @@ -68,7 +72,8 @@ end for x in (CategoricalValue(pool, 1), arr, view(arr, 2:3)) for (i, v) in enumerate(levels(pool)) @test CategoricalValue(v, x) === - CategoricalValue(float(v), x) === + CategoricalValue(unwrap(v), x) === + CategoricalValue(float(unwrap(v)), x) === CategoricalValue(CategoricalValue(pool, i), x) === CategoricalValue(pool, i) end diff --git a/test/07_levels.jl b/test/07_levels.jl index 25c54be0..b54e4d52 100644 --- a/test/07_levels.jl +++ b/test/07_levels.jl @@ -1,15 +1,16 @@ module TestLevels using Test using CategoricalArrays -using CategoricalArrays: DefaultRefType, levels!, hashlevels +using CategoricalArrays: DefaultRefType, levels!, hashlevels, _levels @testset "CategoricalPool{Int} updates levels and order correctly" begin pool = CategoricalPool([2, 1, 3]) - @test 
isa(levels(pool), Vector{Int}) + @test isa(levels(pool), CategoricalVector{Int, DefaultRefType}) @test length(pool) === 3 - @test levels(pool) == [2, 1, 3] - @test all([levels(CategoricalValue(pool, i)) for i in 1:3] .=== Ref(levels(pool))) + @test levels(pool) == _levels(pool) == [2, 1, 3] + @test all([levels(CategoricalValue(pool, i)) for i in 1:3] .== Ref(levels(pool))) + @test pool.levelsinds == 1:3 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -20,7 +21,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) === 4 - @test levels(pool) == [2, 1, 3, 4] + @test levels(pool) == _levels(pool) == [2, 1, 3, 4] + @test pool.levelsinds == 1:4 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -34,7 +36,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) === 5 - @test levels(pool) == [2, 1, 3, 4, 0] + @test levels(pool) == _levels(pool) == [2, 1, 3, 4, 0] + @test pool.levelsinds == 1:5 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -48,7 +51,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) === 7 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11] + @test levels(pool) == _levels(pool) == [2, 1, 3, 4, 0, 10, 11] + @test pool.levelsinds == 1:7 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -64,7 +68,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) === 9 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13] + @test levels(pool) == _levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13] + @test pool.levelsinds == 1:9 @test pool.invindex == 
Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -84,15 +89,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels # Adding levels while preserving existing ones levs = [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14] @test levels!(pool, levs) === pool - @test levels(pool) == levs - @test levels(pool) !== levs - @test pool.hash === nothing - @test pool.equalto == C_NULL - @test pool.subsetof == C_NULL - + @test levels(pool) == _levels(pool) == levs + @test pool.levels !== levs @test isa(pool.levels, Vector{Int}) - @test length(pool) === 11 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14] + @test pool.levelsinds == 1:11 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11) @test pool.hash === nothing @@ -109,7 +109,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) == 12 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20] + @test levels(pool) == + _levels(pool) == + [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20] + @test pool.levelsinds == 1:12 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11, 20=>12) @test pool.hash === nothing @@ -128,7 +131,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) == 14 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test levels(pool) == + _levels(pool) == + [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test pool.levelsinds == 1:14 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11, 20=>12, 100=>13, 99=>14) @test pool.hash === nothing @@ -143,7 +149,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) == 14 - @test levels(pool) == [2, 1, 
3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test levels(pool) == + _levels(pool) == + [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test pool.levelsinds == 1:14 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11, 20=>12, 100=>13, 99=>14) @test pool.hash === CategoricalArrays.hashlevels(levels(pool)) @@ -155,7 +164,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) == 14 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test levels(pool) == + _levels(pool) == + [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test pool.levelsinds == 1:14 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11, 20=>12, 100=>13, 99=>14) @test pool.hash === CategoricalArrays.hashlevels(levels(pool)) @@ -178,6 +190,22 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test !isordered(p2) end +@testset "levels!(::CategoricalPool, ::CategoricalVector)" begin + pool = CategoricalPool([2, 1, 3]) + levels!(pool, categorical([2, 1, 3, 4])) + @test levels(pool) == [2, 1, 3, 4] + + pool = CategoricalPool([2, 1, 3]) + levels!(pool, categorical([2.0, 1.0, 3.0, 4.0])) + @test levels(pool) == [2, 1, 3, 4] + + pool = CategoricalPool([2, 1, 3]) + @test_throws ArgumentError levels!(pool, categorical([2, 2, 1, 3, 4])) + + pool = CategoricalPool([2, 1, 3]) + @test_throws ArgumentError levels!(pool, categorical(1:3)) +end + @testset "overflow of reftype is detected and doesn't corrupt levels" begin res = @test_throws LevelsException{Int, UInt8} CategoricalPool{Int, UInt8}(collect(256:-1:1)) @test res.value.levels == [1] diff --git a/test/11_array.jl b/test/11_array.jl index b474cfe1..4f332640 100644 --- a/test/11_array.jl +++ b/test/11_array.jl @@ -746,7 +746,7 @@ end @test y == unique(x) x = CategoricalArray(String[]) - @test isa(levels(x), Vector{String}) && 
isempty(levels(x)) + @test isa(levels(x), CategoricalVector{String, DefaultRefType}) && isempty(levels(x)) @test isa(unique(x), typeof(x)) && isempty(unique(x)) @test levels!(x, ["Young", "Middle", "Old"]) === x @test levels(x) == ["Young", "Middle", "Old"] diff --git a/test/12_missingarray.jl b/test/12_missingarray.jl index a2204e40..5c2ed3a9 100644 --- a/test/12_missingarray.jl +++ b/test/12_missingarray.jl @@ -1160,7 +1160,7 @@ end @test unique(x) ≅ ["Old", "Young", "Middle", missing] x = CategoricalArray((Union{String, Missing})[missing]) - @test isa(levels(x), Vector{String}) && isempty(levels(x)) + @test isa(levels(x), CategoricalVector{String, DefaultRefType}) && isempty(levels(x)) @test unique(x) ≅ [missing] @test levels!(x, ["Young", "Middle", "Old"]) === x @test levels(x) == ["Young", "Middle", "Old"] diff --git a/test/13_arraycommon.jl b/test/13_arraycommon.jl index 02b51bd7..ac5fd424 100644 --- a/test/13_arraycommon.jl +++ b/test/13_arraycommon.jl @@ -2326,18 +2326,18 @@ end view(categorical(Union{String, Missing}[missing, "b", "a"], levels=["b", "c", "a"]), 2:3)) @test @inferred(levels(x)) == ["b", "c", "a"] @test levels(x, skipmissing=true) == ["b", "c", "a"] - @test levels(x, skipmissing=true) isa Vector{String} + @test levels(x, skipmissing=true) isa CategoricalVector{String} @test levels(x, skipmissing=false) == ["b", "c", "a"] - @test levels(x, skipmissing=false) isa Vector{Union{String, Missing}} + @test levels(x, skipmissing=false) isa CategoricalVector{Union{String, Missing}} end for x in (categorical(Union{String, Missing}["a", "b", missing], levels=["b", "c", "a"]), view(categorical(Union{String, Missing}["c", "b", missing], levels=["b", "c", "a"]), 2:3)) @test @inferred(levels(x)) == ["b", "c", "a"] @test levels(x, skipmissing=true) == ["b", "c", "a"] - @test levels(x, skipmissing=true) isa Vector{String} + @test levels(x, skipmissing=true) isa CategoricalVector{String} @test levels(x, skipmissing=false) ≅ ["b", "c", "a", missing] - @test 
levels(x, skipmissing=false) isa Vector{Union{String, Missing}} + @test levels(x, skipmissing=false) isa CategoricalVector{Union{String, Missing}} end end diff --git a/test/14_view.jl b/test/14_view.jl index 79b20812..11853853 100644 --- a/test/14_view.jl +++ b/test/14_view.jl @@ -11,7 +11,8 @@ const ≅ = isequal x = CategoricalArray{Union{T, eltype(a)}}(a, ordered=order) v = view(x, inds) - @test levels(v) === levels(x) + @test levels(x) isa CategoricalVector{nonmissingtype(eltype(a))} + @test levels(v) == levels(x) @test unique(v) == (ndims(v) > 0 ? unique(a[inds]) : [a[inds]]) @test isordered(v) === isordered(x) end From 87b50fc4fcade15c3c8aad1c92ac4c5f7cff9973 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Tue, 27 May 2025 23:03:22 +0200 Subject: [PATCH 2/2] Fix doctests --- docs/src/using.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/using.md b/docs/src/using.md index aaa6e36c..24c452b0 100644 --- a/docs/src/using.md +++ b/docs/src/using.md @@ -251,7 +251,7 @@ julia> xy = vcat(x, y) "Middle" julia> levels(xy) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Young" "Middle" "Old"