diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 5c2ae42b..bf12f7c9 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -55,11 +55,11 @@ SUITE["many levels"]["CategoricalArray(::Vector{String})"] = a = rand([@sprintf("id%010d", k) for k in 1:1000], 10000) ca = CategoricalArray(a) -levs = levels(ca) +levs = unwrap.(levels(ca)) SUITE["many levels"]["levels! with original levels"] = @benchmarkable levels!(ca, levs) -levs = reverse(levels(ca)) +levs = reverse(unwrap.(levels(ca))) SUITE["many levels"]["levels! with resorted levels"] = @benchmarkable levels!(ca, levs) diff --git a/docs/src/using.md b/docs/src/using.md index 9790e8cf..24c452b0 100644 --- a/docs/src/using.md +++ b/docs/src/using.md @@ -20,7 +20,7 @@ By default, the levels are lexically sorted, which is clearly not correct in our ```jldoctest using julia> levels(x) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Middle" "Old" "Young" @@ -68,7 +68,7 @@ To get rid of the `"Old"` group, just call the [`droplevels!`](@ref) function: ```jldoctest using julia> levels(x) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Young" "Middle" "Old" @@ -81,7 +81,7 @@ julia> droplevels!(x) "Young" julia> levels(x) -2-element Vector{String}: +2-element CategoricalArray{String,1,UInt32}: "Young" "Middle" @@ -139,7 +139,7 @@ Levels still need to be reordered manually: ```jldoctest using julia> levels(y) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Middle" "Old" "Young" @@ -251,7 +251,7 @@ julia> xy = vcat(x, y) "Middle" julia> levels(xy) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Young" "Middle" "Old" @@ -263,7 +263,7 @@ true Likewise, assigning a `CategoricalValue` from `y` to an entry in `x` expands the levels of `x` with all levels from `y`, *respecting the ordering of levels of both vectors if possible*: ```jldoctest using julia> levels(x) -2-element Vector{String}: +2-element CategoricalArray{String,1,UInt32}: "Middle" "Old" @@ -271,7 +271,7 @@ julia> x[1] = y[1] CategoricalValue{String, UInt32} "Young" (1/2) julia> levels(x) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Young" "Middle" "Old" @@ -296,7 +296,7 @@ julia> ab = vcat(a, b) "c" julia> levels(ab) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "a" "b" "c" @@ -320,7 +320,7 @@ julia> ab2 = vcat(a, b) "c" julia> levels(ab2) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "a" "b" "c" diff --git a/ext/CategoricalArraysArrowExt.jl b/ext/CategoricalArraysArrowExt.jl index 3e764122..811870d2 100644 --- a/ext/CategoricalArraysArrowExt.jl +++ b/ext/CategoricalArraysArrowExt.jl @@ -7,6 +7,8 @@ import Arrow: ArrowTypes const CATARRAY_ARROWNAME = Symbol("JuliaLang.CategoricalArrays.CategoricalArray") ArrowTypes.arrowname(::Type{<:CategoricalValue}) = CATARRAY_ARROWNAME ArrowTypes.arrowmetadata(::Type{CategoricalValue{T, R}}) where {T, R} = string(R) +ArrowTypes.ArrowType(::Type{<:CategoricalValue{T}}) where {T} = T +ArrowTypes.toarrow(x::CategoricalValue) = unwrap(x) ArrowTypes.arrowname(::Type{Union{<:CategoricalValue, Missing}}) = CATARRAY_ARROWNAME ArrowTypes.arrowmetadata(::Type{Union{CategoricalValue{T, R}, Missing}}) where {T, R} = diff --git a/ext/CategoricalArraysRecipesBaseExt.jl b/ext/CategoricalArraysRecipesBaseExt.jl index 2642f838..656f3e3d 100644 --- a/ext/CategoricalArraysRecipesBaseExt.jl +++ b/ext/CategoricalArraysRecipesBaseExt.jl @@ -9,7 +9,7 @@ else end RecipesBase.@recipe function f(::Type{T}, v::T) where T <: CategoricalValue - level_strings = [map(string, levels(v)); missing] + level_strings = [map(string, CategoricalArrays._levels(v)); missing] ticks --> eachindex(level_strings) v -> ismissing(v) ? length(level_strings) : Int(CategoricalArrays.refcode(v)), i -> level_strings[Int(i)] diff --git a/src/array.jl b/src/array.jl index c462e7d4..a2103e2a 100644 --- a/src/array.jl +++ b/src/array.jl @@ -239,7 +239,7 @@ function CategoricalArray{T, N, R}(A::CategoricalArray{S, N, Q}; catch err err isa LevelsException || rethrow(err) throw(ArgumentError("encountered value(s) not in specified `levels`: " * - "$(setdiff(CategoricalArrays.levels(res), levels))")) + "$(setdiff(_levels(res), levels))")) end end return res @@ -358,18 +358,18 @@ function _convert(::Type{CategoricalArray{T, N, R}}, A::AbstractArray{S, N}; copyto!(res, A) if levels !== nothing - CategoricalArrays.levels(res) == levels || + _levels(res) == levels || throw(ArgumentError("encountered value(s) not in specified `levels`: " * - "$(setdiff(CategoricalArrays.levels(res), levels))")) + "$(setdiff(_levels(res), levels))")) else # if order is defined for level type, automatically apply it L = leveltype(res) if Base.OrderStyle(L) isa Base.Ordered - levels!(res, sort(CategoricalArrays.levels(res))) + levels!(res, sort(_levels(res))) elseif hasmethod(isless, (L, L)) # isless may throw an error, e.g. for AbstractArray{T} of unordered T try - levels!(res, sort(CategoricalArrays.levels(res))) + levels!(res, sort(_levels(res))) catch e e isa MethodError || rethrow(e) end @@ -382,7 +382,7 @@ end # From CategoricalArray (preserve levels, ordering and R) function convert(::Type{CategoricalArray{T, N, R}}, A::CategoricalArray{S, N}) where {S, T, N, R} if length(A.pool) > typemax(R) - throw(LevelsException{T, R}(levels(A)[typemax(R)+1:end])) + throw(LevelsException{T, R}(_levels(A)[typemax(R)+1:end])) end if !(T >: Missing) && S >: Missing && any(iszero, A.refs) @@ -460,7 +460,7 @@ size(A::CategoricalArray) = size(A.refs) Base.IndexStyle(::Type{<:CategoricalArray}) = IndexLinear() function update_refs!(A::CategoricalArray, newlevels::AbstractVector) - oldlevels = levels(A) + oldlevels = _levels(A) levelsmap = similar(A.refs, length(oldlevels)+1) # 0 maps to a missing value levelsmap[1] = 0 @@ -478,7 +478,7 @@ function merge_pools!(A::CatArrOrSub, updaterefs::Bool=true, updatepool::Bool=true) newlevels, ordered = merge_pools(pool(A), pool(B)) - oldlevels = levels(A) + oldlevels = _levels(A) pA = A isa SubArray ? parent(A) : A ordered!(pA, ordered) # If A's levels are an ordered superset of new (merged) pool, no need to recompute refs @@ -537,8 +537,8 @@ function copyto!(dest::CatArrOrSub{T, N, R}, dstart::Integer, # try converting src to dest type to avoid partial copy corruption of dest # in the event that the src cannot be copied into dest - slevs = convert(Vector{T}, levels(src)) - dlevs = levels(dest) + slevs = convert(Vector{T}, _levels(src)) + dlevs = _levels(dest) if eltype(src) >: Missing && !(eltype(dest) >: Missing) && !all(x -> x > 0, srefs) throw(MissingException("cannot copy array with missing values to an array with element type $T")) end @@ -591,7 +591,7 @@ function copyto!(dest::CatArrOrSub{T1, N, R}, dstart::Integer, return invoke(copyto!, Tuple{AbstractArray, Integer, AbstractArray, Integer, Integer}, dest, dstart, src, sstart, n) end - newdestlevs = destlevs = copy(levels(dest)) # copy since we need original levels below + newdestlevs = destlevs = copy(_levels(dest)) # copy since we need original levels below srclevsnm = T2 >: Missing ? setdiff(srclevs, [missing]) : srclevs if !(srclevsnm ⊆ destlevs) # if order is defined for level type, automatically apply it @@ -701,7 +701,7 @@ While this will reduce memory use, this function is type-unstable, which can aff performance inside the function where the call is made. Therefore, use it with caution. """ function compress(A::CategoricalArray{T, N}) where {T, N} - R = reftype(length(levels(A.pool))) + R = reftype(length(_levels(A.pool))) convert(CategoricalArray{T, N, R}, A) end @@ -719,11 +719,11 @@ decompress(A::CategoricalArray{T, N}) where {T, N} = convert(CategoricalArray{T, N, DefaultRefType}, A) function vcat(A::CategoricalArray...) - ordered = any(isordered, A) && all(a->isordered(a) || isempty(levels(a)), A) - newlevels, ordered = mergelevels(ordered, map(levels, A)...) + ordered = any(isordered, A) && all(a->isordered(a) || isempty(_levels(a)), A) + newlevels, ordered = mergelevels(ordered, map(_levels, A)...) refsvec = map(A) do a - ii = convert(Vector{Int}, indexin(levels(a.pool), newlevels)) + ii = convert(Vector{Int}, indexin(_levels(a.pool), newlevels)) [x==0 ? 0 : ii[x] for x in a.refs]::Array{Int,ndims(a)} end @@ -761,23 +761,25 @@ This may include levels which do not actually appear in the data `missing` will be included only if it appears in the data and `skipmissing=false` is passed. -The returned vector is an internal field of `x` which must not be mutated +The returned vector is owned by `x` and must not be mutated as doing so would corrupt it. """ -@inline function DataAPI.levels(A::CatArrOrSub{T}; skipmissing::Bool=true) where T +@inline function DataAPI.levels(A::CatArrOrSub; skipmissing::Bool=true) if eltype(A) >: Missing && !skipmissing if any(==(0), refs(A)) - T[levels(pool(A)); missing] + eltype(A)[levels(pool(A)); missing] else - convert(Vector{T}, levels(pool(A))) + levels_missing(pool(A)) end else levels(pool(A)) end end +_levels(A::CatArrOrSub) = _levels(pool(A)) + """ - levels!(A::CategoricalArray, newlevels::Vector; allowmissing::Bool=false) + levels!(A::CategoricalArray, newlevels::AbstractVector; allowmissing::Bool=false) Set the levels categorical array `A`. The order of appearance of levels will be respected by [`levels`](@ref DataAPI.levels), which may affect display of results in some operations; if `A` is @@ -791,7 +793,7 @@ Else, `newlevels` must include all levels which appear in the data. """ function levels!(A::CategoricalArray{T, N, R}, newlevels::AbstractVector; allowmissing::Bool=false) where {T, N, R} - (levels(A) == newlevels) && return A # nothing to do + (_levels(A) == newlevels) && return A # nothing to do # map each new level to its ref code newlv2ref = Dict{eltype(newlevels), Int}() @@ -806,7 +808,7 @@ function levels!(A::CategoricalArray{T, N, R}, newlevels::AbstractVector; end # map each old ref code to new ref code (or 0 if no such level) - oldlevels = levels(pool(A)) + oldlevels = _levels(pool(A)) oldref2newref = fill(0, length(oldlevels) + 1) for (i, lv) in enumerate(oldlevels) oldref2newref[i + 1] = get(newlv2ref, lv, 0) @@ -867,7 +869,7 @@ end function _uniquerefs(A::CatArrOrSub{T}) where T arefs = refs(A) res = similar(arefs, 0) - nlevels = length(levels(A)) + nlevels = length(_levels(A)) maxunique = nlevels + (T >: Missing ? 1 : 0) seen = fill(false, nlevels + 1) # always +1 for 0 (missing ref) @inbounds for ref in arefs @@ -900,7 +902,7 @@ returned by [`levels`](@ref DataAPI.levels)). """ function droplevels!(A::CategoricalArray) arefs = refs(A) - nlevels = length(levels(A)) + 1 # +1 for missing + nlevels = length(_levels(A)) + 1 # +1 for missing seen = fill(false, nlevels) seen[1] = true # assume that missing is always observed to simplify checks nseen = 1 @@ -913,7 +915,7 @@ function droplevels!(A::CategoricalArray) end # replace the pool - A.pool = typeof(pool(A))(@inbounds(levels(A)[view(seen, 2:nlevels)]), isordered(A)) + A.pool = typeof(pool(A))(@inbounds(_levels(A)[view(seen, 2:nlevels)]), isordered(A)) # recode refs to keep only the seen ones (optimized version of update_refs!()) seen[1] = false # to start levelsmap from 0 levelsmap = cumsum(seen) @@ -1030,7 +1032,7 @@ end ordered=_isordered(A), compress::Bool=false) where {T, N, R} # @inline is needed so that return type is inferred when compress is not provided - RefType = compress ? reftype(length(CategoricalArrays.levels(A))) : R + RefType = compress ? reftype(length(_levels(A))) : R CategoricalArray{T, N, RefType}(A, levels=levels, ordered=ordered) end @@ -1043,7 +1045,7 @@ function in(x::CategoricalValue, y::CategoricalArray{T, N, R}) where {T, N, R} if x.pool === y.pool return refcode(x) in y.refs else - ref = get(y.pool, levels(x.pool)[refcode(x)], zero(R)) + ref = get(y.pool, _levels(x.pool)[refcode(x)], zero(R)) return ref != 0 ? ref in y.refs : false end end diff --git a/src/pool.jl b/src/pool.jl index 9753a76d..2df7e345 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -21,8 +21,8 @@ Base.convert(::Type{CategoricalPool{S}}, pool::CategoricalPool{T, R}) where {S, convert(CategoricalPool{S, R}, pool) function Base.convert(::Type{CategoricalPool{T, R}}, pool::CategoricalPool) where {T, R <: Integer} - if length(levels(pool)) > typemax(R) - throw(LevelsException{T, R}(levels(pool)[typemax(R)+1:end])) + if length(pool.levels) > typemax(R) + throw(LevelsException{T, R}(pool.levels[typemax(R)+1:end])) end levelsT = convert(Vector{T}, pool.levels) @@ -37,10 +37,10 @@ Base.copy(pool::CategoricalPool{T, R}) where {T, R} = function Base.show(io::IO, pool::CategoricalPool{T, R}) where {T, R} @static if VERSION >= v"1.6.0" @printf(io, "%s{%s, %s}([%s])", CategoricalPool, T, R, - join(map(repr, levels(pool)), ", ")) + join(map(repr, pool.levels), ", ")) else @printf(io, "%s{%s,%s}([%s])", CategoricalPool, T, R, - join(map(repr, levels(pool)), ", ")) + join(map(repr, pool.levels), ", ")) end pool.ordered && print(io, " with ordered levels") @@ -65,6 +65,7 @@ it doesn't do this itself to avoid doing a dict lookup twice i = R(n + 1) push!(pool.levels, x) + push!(pool.levelsinds, i) pool_hash = pool.hash if pool_hash !== nothing pool.hash = hash(x, pool_hash) @@ -185,10 +186,10 @@ function merge_pools(a::CategoricalPool{T}, b::CategoricalPool) where {T} newlevs = T[] ordered = isordered(a) elseif length(a) == 0 - newlevs = Vector{T}(levels(b)) + newlevs = Vector{T}(b.levels) ordered = isordered(b) elseif length(b) == 0 - newlevs = copy(levels(a)) + newlevs = copy(a.levels) ordered = isordered(a) else ordered = isordered(a) && (isordered(b) || b ⊆ a) @@ -200,7 +201,7 @@ end @inline function Base.hash(pool::CategoricalPool, h::UInt) if pool.hash === nothing - pool.hash = hashlevels(levels(pool)) + pool.hash = hashlevels(pool.levels) end hash(pool.hash, h) end @@ -246,9 +247,9 @@ end # Contrary to the CategoricalArray one, this method only allows adding new levels at the end # so that existing CategoricalValue objects still point to the same value -function levels!(pool::CategoricalPool{S, R}, newlevels::Vector; +function levels!(pool::CategoricalPool{S, R}, newlevels::AbstractVector; checkunique::Bool=true) where {S, R} - levs = convert(Vector{S}, newlevels) + levs = newlevels isa CategoricalVector{S} ? newlevels : convert(Vector{S}, newlevels) if checkunique && !allunique(levs) throw(ArgumentError(string("duplicated levels found in levs: ", join(unique(filter(x->sum(levs.==x)>1, levs)), ", ")))) @@ -259,24 +260,30 @@ function levels!(pool::CategoricalPool{S, R}, newlevels::Vector; n = length(levs) if n > typemax(R) - throw(LevelsException{S, R}(setdiff(levs, levels(pool))[typemax(R)-length(levels(pool))+1:end])) + throw(LevelsException{S, R}(setdiff(levs, pool.levels)[typemax(R)-length(pool.levels)+1:end])) end empty!(pool.invindex) resize!(pool.levels, n) + resize!(pool.levelsinds, n) pool.hash = nothing pool.equalto = C_NULL pool.subsetof = C_NULL for i in 1:n v = levs[i] pool.levels[i] = v + pool.levelsinds[i] = i pool.invindex[v] = i end return pool end -DataAPI.levels(pool::CategoricalPool) = pool.levels +DataAPI.levels(pool::CategoricalPool{T}) where {T} = + CategoricalVector{T}(pool.levelsinds, pool) +levels_missing(pool::CategoricalPool{T}) where {T} = + CategoricalVector{Union{T, Missing}}(pool.levelsinds, pool) +_levels(pool::CategoricalPool) = pool.levels isordered(pool::CategoricalPool) = pool.ordered ordered!(pool::CategoricalPool, ordered) = (pool.ordered = ordered; pool) diff --git a/src/recode.jl b/src/recode.jl index 141f9967..ff258e60 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -111,7 +111,7 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau levels!(dest.pool, filter!(!ismissing, unique(vals))) # In the absence of duplicated recoded values, we do not need to lookup the reference # for each pair in the loop, which is more efficient (with loop unswitching) - dupvals = length(vals) != length(levels(dest.pool)) + dupvals = length(vals) != length(_levels(dest.pool)) drefs = dest.refs pairmap = [ismissing(v) ? zero(R) : get(dest.pool, v) for v in vals] @@ -150,7 +150,7 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau # Put existing levels first, and sort them if possible # for consistency with CategoricalArray - oldlevels = setdiff(levels(dest), vals) + oldlevels = setdiff(_levels(dest), vals) filter!(!ismissing, oldlevels) L = eltype(oldlevels) if Base.OrderStyle(L) isa Base.Ordered @@ -163,7 +163,7 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau e isa MethodError || rethrow(e) end end - levels!(dest, union(oldlevels, levels(dest))) + levels!(dest, union(oldlevels, _levels(dest))) dest end @@ -174,7 +174,7 @@ function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, vals = T[p.second for p in pairs] if default === nothing - srclevels = levels(src) + srclevels = _levels(src) # Remove recoded levels as they won't appear in result keptlevels = Vector{T}(undef, 0) @@ -201,7 +201,7 @@ function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, ordered = false end - srclevels = src.pool === dest.pool ? copy(levels(src.pool)) : levels(src.pool) + srclevels = src.pool === dest.pool ? copy(_levels(src.pool)) : _levels(src.pool) if length(levs) > length(srclevels) && view(levs, 1:length(srclevels)) == srclevels levels!(dest.pool, levs) else diff --git a/src/typedefs.jl b/src/typedefs.jl index 0f9aa414..238bb995 100644 --- a/src/typedefs.jl +++ b/src/typedefs.jl @@ -8,6 +8,7 @@ const SupportedTypes = Union{AbstractString, AbstractChar, Number} # * `R` integer type for referencing category levels mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer} levels::Vector{T} # category levels ordered by their reference codes + levelsinds::Vector{R} # set to 1:length(levels), used by `levels(p)` invindex::Dict{T, R} # map from category levels to their reference codes ordered::Bool # whether levels can be compared using < hash::Union{UInt, Nothing} # hash of levels @@ -45,8 +46,8 @@ mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer} invindex::Dict{T, R}, ordered::Bool, hash::Union{UInt, Nothing}=nothing) where {T, R} - pool = new(levels, invindex, ordered, hash, C_NULL, C_NULL) - return pool + return new(levels, 1:length(levels), invindex, + ordered, hash, C_NULL, C_NULL) end end diff --git a/src/value.jl b/src/value.jl index ae962adb..a1633204 100644 --- a/src/value.jl +++ b/src/value.jl @@ -27,6 +27,8 @@ reftype(x::Any) = reftype(typeof(x)) pool(x::CategoricalValue) = x.pool refcode(x::CategoricalValue) = x.ref isordered(x::CategoricalValue) = isordered(x.pool) +DataAPI.levels(x::CategoricalValue) = levels(pool(x)) +_levels(x::CategoricalValue) = _levels(pool(x)) # extract the type of the original value from array eltype `T` unwrap_catvaluetype(::Type{T}) where {T} = T @@ -42,7 +44,7 @@ unwrap_catvaluetype(::Type{T}) where {T <: CategoricalValue} = leveltype(T) Get the value wrapped by categorical value `x`. If `x` is `Missing` return `missing`. """ -DataAPI.unwrap(x::CategoricalValue) = levels(x)[refcode(x)] +DataAPI.unwrap(x::CategoricalValue) = _levels(x)[refcode(x)] """ levelcode(x::CategoricalValue) @@ -59,10 +61,8 @@ Return `missing`. """ levelcode(x::Missing) = missing -DataAPI.levels(x::CategoricalValue) = levels(pool(x)) - function cat_promote_type(::Type{S}, ::Type{T}) where {S, T} - U = promote_type(S, T) + U = promote_type(unwrap_catvaluetype(S), unwrap_catvaluetype(T)) U <: Union{SupportedTypes, Missing} ? U : typeintersect(Union{SupportedTypes, Missing}, Union{S, T}) end diff --git a/test/01_value.jl b/test/01_value.jl index 39f58b67..8c60ae7f 100644 --- a/test/01_value.jl +++ b/test/01_value.jl @@ -22,6 +22,8 @@ end for i in 1:3 x = CategoricalValue(pool, i) + @test levels(x) == levels(pool) + @test levels(x) isa CategoricalVector{String, UInt32} @test leveltype(x) === String @test leveltype(typeof(x)) === String @test reftype(x) === DefaultRefType @@ -48,6 +50,8 @@ end for i in 1:3 x = CategoricalValue(pool, i) + @test levels(x) == levels(pool) + @test levels(x) isa CategoricalVector{String, UInt8} @test leveltype(x) === String @test leveltype(typeof(x)) === String @test reftype(x) === UInt8 @@ -68,7 +72,8 @@ end for x in (CategoricalValue(pool, 1), arr, view(arr, 2:3)) for (i, v) in enumerate(levels(pool)) @test CategoricalValue(v, x) === - CategoricalValue(float(v), x) === + CategoricalValue(unwrap(v), x) === + CategoricalValue(float(unwrap(v)), x) === CategoricalValue(CategoricalValue(pool, i), x) === CategoricalValue(pool, i) end diff --git a/test/07_levels.jl b/test/07_levels.jl index 25c54be0..b54e4d52 100644 --- a/test/07_levels.jl +++ b/test/07_levels.jl @@ -1,15 +1,16 @@ module TestLevels using Test using CategoricalArrays -using CategoricalArrays: DefaultRefType, levels!, hashlevels +using CategoricalArrays: DefaultRefType, levels!, hashlevels, _levels @testset "CategoricalPool{Int} updates levels and order correctly" begin pool = CategoricalPool([2, 1, 3]) - @test isa(levels(pool), Vector{Int}) + @test isa(levels(pool), CategoricalVector{Int, DefaultRefType}) @test length(pool) === 3 - @test levels(pool) == [2, 1, 3] - @test all([levels(CategoricalValue(pool, i)) for i in 1:3] .=== Ref(levels(pool))) + @test levels(pool) == _levels(pool) == [2, 1, 3] + @test all([levels(CategoricalValue(pool, i)) for i in 1:3] .== Ref(levels(pool))) + @test pool.levelsinds == 1:3 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -20,7 +21,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) === 4 - @test levels(pool) == [2, 1, 3, 4] + @test levels(pool) == _levels(pool) == [2, 1, 3, 4] + @test pool.levelsinds == 1:4 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -34,7 +36,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) === 5 - @test levels(pool) == [2, 1, 3, 4, 0] + @test levels(pool) == _levels(pool) == [2, 1, 3, 4, 0] + @test pool.levelsinds == 1:5 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -48,7 +51,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) === 7 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11] + @test levels(pool) == _levels(pool) == [2, 1, 3, 4, 0, 10, 11] + @test pool.levelsinds == 1:7 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -64,7 +68,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) === 9 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13] + @test levels(pool) == _levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13] + @test pool.levelsinds == 1:9 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -84,15 +89,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels # Adding levels while preserving existing ones levs = [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14] @test levels!(pool, levs) === pool - @test levels(pool) == levs - @test levels(pool) !== levs - @test pool.hash === nothing - @test pool.equalto == C_NULL - @test pool.subsetof == C_NULL - + @test levels(pool) == _levels(pool) == levs + @test pool.levels !== levs @test isa(pool.levels, Vector{Int}) - @test length(pool) === 11 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14] + @test pool.levelsinds == 1:11 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11) @test pool.hash === nothing @@ -109,7 +109,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) == 12 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20] + @test levels(pool) == + _levels(pool) == + [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20] + @test pool.levelsinds == 1:12 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11, 20=>12) @test pool.hash === nothing @@ -128,7 +131,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) == 14 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test levels(pool) == + _levels(pool) == + [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test pool.levelsinds == 1:14 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11, 20=>12, 100=>13, 99=>14) @test pool.hash === nothing @@ -143,7 +149,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) == 14 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test levels(pool) == + _levels(pool) == + [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test pool.levelsinds == 1:14 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11, 20=>12, 100=>13, 99=>14) @test pool.hash === CategoricalArrays.hashlevels(levels(pool)) @@ -155,7 +164,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) == 14 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test levels(pool) == + _levels(pool) == + [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test pool.levelsinds == 1:14 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11, 20=>12, 100=>13, 99=>14) @test pool.hash === CategoricalArrays.hashlevels(levels(pool)) @@ -178,6 +190,22 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test !isordered(p2) end +@testset "levels!(::CategoricalPool, ::CategoricalVector)" begin + pool = CategoricalPool([2, 1, 3]) + levels!(pool, categorical([2, 1, 3, 4])) + @test levels(pool) == [2, 1, 3, 4] + + pool = CategoricalPool([2, 1, 3]) + levels!(pool, categorical([2.0, 1.0, 3.0, 4.0])) + @test levels(pool) == [2, 1, 3, 4] + + pool = CategoricalPool([2, 1, 3]) + @test_throws ArgumentError levels!(pool, categorical([2, 2, 1, 3, 4])) + + pool = CategoricalPool([2, 1, 3]) + @test_throws ArgumentError levels!(pool, categorical(1:3)) +end + @testset "overflow of reftype is detected and doesn't corrupt levels" begin res = @test_throws LevelsException{Int, UInt8} CategoricalPool{Int, UInt8}(collect(256:-1:1)) @test res.value.levels == [1] diff --git a/test/11_array.jl b/test/11_array.jl index b474cfe1..4f332640 100644 --- a/test/11_array.jl +++ b/test/11_array.jl @@ -746,7 +746,7 @@ end @test y == unique(x) x = CategoricalArray(String[]) - @test isa(levels(x), Vector{String}) && isempty(levels(x)) + @test isa(levels(x), CategoricalVector{String, DefaultRefType}) && isempty(levels(x)) @test isa(unique(x), typeof(x)) && isempty(unique(x)) @test levels!(x, ["Young", "Middle", "Old"]) === x @test levels(x) == ["Young", "Middle", "Old"] diff --git a/test/12_missingarray.jl b/test/12_missingarray.jl index a2204e40..5c2ed3a9 100644 --- a/test/12_missingarray.jl +++ b/test/12_missingarray.jl @@ -1160,7 +1160,7 @@ end @test unique(x) ≅ ["Old", "Young", "Middle", missing] x = CategoricalArray((Union{String, Missing})[missing]) - @test isa(levels(x), Vector{String}) && isempty(levels(x)) + @test isa(levels(x), CategoricalVector{String, DefaultRefType}) && isempty(levels(x)) @test unique(x) ≅ [missing] @test levels!(x, ["Young", "Middle", "Old"]) === x @test levels(x) == ["Young", "Middle", "Old"] diff --git a/test/13_arraycommon.jl b/test/13_arraycommon.jl index 02b51bd7..ac5fd424 100644 --- a/test/13_arraycommon.jl +++ b/test/13_arraycommon.jl @@ -2326,18 +2326,18 @@ end view(categorical(Union{String, Missing}[missing, "b", "a"], levels=["b", "c", "a"]), 2:3)) @test @inferred(levels(x)) == ["b", "c", "a"] @test levels(x, skipmissing=true) == ["b", "c", "a"] - @test levels(x, skipmissing=true) isa Vector{String} + @test levels(x, skipmissing=true) isa CategoricalVector{String} @test levels(x, skipmissing=false) == ["b", "c", "a"] - @test levels(x, skipmissing=false) isa Vector{Union{String, Missing}} + @test levels(x, skipmissing=false) isa CategoricalVector{Union{String, Missing}} end for x in (categorical(Union{String, Missing}["a", "b", missing], levels=["b", "c", "a"]), view(categorical(Union{String, Missing}["c", "b", missing], levels=["b", "c", "a"]), 2:3)) @test @inferred(levels(x)) == ["b", "c", "a"] @test levels(x, skipmissing=true) == ["b", "c", "a"] - @test levels(x, skipmissing=true) isa Vector{String} + @test levels(x, skipmissing=true) isa CategoricalVector{String} @test levels(x, skipmissing=false) ≅ ["b", "c", "a", missing] - @test levels(x, skipmissing=false) isa Vector{Union{String, Missing}} + @test levels(x, skipmissing=false) isa CategoricalVector{Union{String, Missing}} end end diff --git a/test/14_view.jl b/test/14_view.jl index 79b20812..11853853 100644 --- a/test/14_view.jl +++ b/test/14_view.jl @@ -11,7 +11,8 @@ const ≅ = isequal x = CategoricalArray{Union{T, eltype(a)}}(a, ordered=order) v = view(x, inds) - @test levels(v) === levels(x) + @test levels(x) isa CategoricalVector{nonmissingtype(eltype(a))} + @test levels(v) == levels(x) @test unique(v) == (ndims(v) > 0 ? unique(a[inds]) : [a[inds]]) @test isordered(v) === isordered(x) end