Skip to content

Commit e4a13b1

Browse files
authored
Simplify default cut labels (#422)
1) The quantile number isn't needed in most cases in the label, and anyway it's shown when printing an ordered `CategoricalValue`. Only use it by default when `allowempty=true` to avoid data-dependent errors if there are duplicate levels. 2) Round breaks by default to a number of significant digits chosen by `sigdigits`. This number is increased if necessary for breaks to remain unique. This generates labels which are not completely correct as rounding may make the left break greater than a value which is included in the interval, but this is generally minor and expected. Taking the floor rather than rounding would be more correct, but it can generate unexpected labels due to floating point trickiness (e.g. `floor(0.0003, sigdigits=4)` gives 0.0002999). This is what R does. Add a deprecation to avoid breaking custom `labels` functions which did not accept `sigdigits`.
1 parent b16588b commit e4a13b1

File tree

4 files changed

+246
-90
lines changed

4 files changed

+246
-90
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ CategoricalArraysStructTypesExt = "StructTypes"
2828

2929
[compat]
3030
Arrow = "2"
31-
Compat = "3.37, 4"
31+
Compat = "3.47, 4.10"
3232
DataAPI = "1.6"
3333
JSON = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21"
3434
JSON3 = "1.1.2"

src/CategoricalArrays.jl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,12 @@ module CategoricalArrays
1111
import DataAPI: unwrap
1212
export unwrap
1313

14+
using Compat
15+
@compat public default_formatter, numbered_formatter
16+
1417
using DataAPI
1518
using Missings
1619
using Printf
17-
import Compat
1820

1921
# JuliaLang/julia#36810
2022
if VERSION < v"1.5.2"

src/extras.jl

Lines changed: 146 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,67 @@ function fill_refs!(refs::AbstractArray, X::AbstractArray,
2727
end
2828
end
2929

30+
if VERSION >= v"1.10"
    # Dynamic precision (`*`) in format strings is only used on Julia 1.10 and later;
    # the format is built once here rather than on every call
    const CUT_FMT = Printf.Format("%.*g")
end

# Convert a break to the string used in labels: floating point values are printed
# with `sigdigits` significant digits, everything else goes through `string`
_cut_break_string(x, sigdigits::Integer) = string(x)
function _cut_break_string(x::AbstractFloat, sigdigits::Integer)
    @static if VERSION >= v"1.10"
        return Printf.format(CUT_FMT, sigdigits, x)
    else
        # Without dynamic precision the number of digits has to be
        # interpolated into the format string itself
        return Printf.format(Printf.Format("%.$(sigdigits)g"), x)
    end
end

"""
    CategoricalArrays.default_formatter(from, to, i::Integer;
                                        leftclosed::Bool, rightclosed::Bool,
                                        sigdigits::Integer)

Provide the default label format for the `cut(x, breaks)` method.

The label consists of `from` and `to` separated by `", "`. It starts with `"["`
if `leftclosed` is `true` and with `"("` otherwise, and it ends with `"]"` if
`rightclosed` is `true` and with `")"` otherwise (giving for example `"[from, to)"`).
The group index `i` is not used.

If they are floating point values, `from` and `to` are turned into strings using
`@sprintf("%.*g", sigdigits, x)`. Values of other types are converted using `string`.
"""
function default_formatter(from, to, i::Integer;
                           leftclosed::Bool, rightclosed::Bool,
                           sigdigits::Integer)
    from_str = _cut_break_string(from, sigdigits)
    to_str = _cut_break_string(to, sigdigits)
    return string(leftclosed ? "[" : "(", from_str, ", ", to_str,
                  rightclosed ? "]" : ")")
end
66+
"""
    CategoricalArrays.numbered_formatter(from, to, i::Integer;
                                         leftclosed::Bool, rightclosed::Bool,
                                         sigdigits::Integer)

Provide the default label format for the `cut(x, ngroups)` method
when `allowempty=true`.

The label is the group index `i` followed by `": "` and by the label that
[`CategoricalArrays.default_formatter`](@ref) returns for the same arguments,
giving for example `"i: [from, to)"`.
"""
function numbered_formatter(from, to, i::Integer;
                            leftclosed::Bool, rightclosed::Bool,
                            sigdigits::Integer)
    # Build the plain interval label first, then prefix it with the group index
    interval = default_formatter(from, to, i;
                                 leftclosed=leftclosed, rightclosed=rightclosed,
                                 sigdigits=sigdigits)
    return string(i, ": ", interval)
end
3786

3887
@doc raw"""
3988
cut(x::AbstractArray, breaks::AbstractVector;
4089
labels::Union{AbstractVector,Function},
90+
sigdigits::Integer=3,
4191
extend::Union{Bool,Missing}=false, allowempty::Bool=false)
4292
4393
Cut a numeric array into intervals at values `breaks`
@@ -49,15 +99,25 @@ the last interval, which is closed on both ends, i.e. `[lower, upper]`.
4999
If `x` accepts missing values (i.e. `eltype(x) >: Missing`) the returned array will
50100
also accept them.
51101
102+
!!! note
103+
For floating point data, breaks may be rounded to `sigdigits` significant digits
104+
when generating interval labels, meaning that they may not reflect exactly the cutpoints
105+
used.
106+
52107
# Keyword arguments
53108
* `extend::Union{Bool, Missing}=false`: when `false`, an error is raised if some values
54109
in `x` fall outside of the breaks; when `true`, breaks are automatically added to include
55110
all values in `x`; when `missing`, values outside of the breaks generate `missing` entries.
56111
* `labels::Union{AbstractVector, Function}`: a vector of strings, characters
57-
or numbers giving the names to use for
58-
the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates
112+
or numbers giving the names to use for the intervals; or a function
113+
`f(from, to, i::Integer; leftclosed::Bool, rightclosed::Bool, sigdigits::Integer)` that generates
59114
the labels from the left and right interval boundaries and the group index. Defaults to
60-
`"[from, to)"` (or `"[from, to]"` for the rightmost interval if `extend == true`).
115+
[`CategoricalArrays.default_formatter`](@ref), giving `"[from, to)"` (or `"[from, to]"`
116+
for the rightmost interval).
117+
* `sigdigits::Integer=3`: the minimum number of significant digits to use in labels.
118+
This value is increased automatically if necessary so that rounded breaks are unique.
119+
Only used for floating point types and when `labels` is a function, in which case it
120+
is passed to it as a keyword argument.
61121
* `allowempty::Bool=false`: when `false`, an error is raised if some breaks other than
62122
the last one appear multiple times, generating empty intervals; when `true`,
63123
duplicate breaks are allowed and the intervals they generate are kept as
@@ -69,19 +129,19 @@ julia> using CategoricalArrays
69129
70130
julia> cut(-1:0.5:1, [0, 1], extend=true)
71131
5-element CategoricalArray{String,1,UInt32}:
72-
"[-1.0, 0.0)"
73-
"[-1.0, 0.0)"
74-
"[0.0, 1.0]"
75-
"[0.0, 1.0]"
76-
"[0.0, 1.0]"
132+
"[-1, 0)"
133+
"[-1, 0)"
134+
"[0, 1]"
135+
"[0, 1]"
136+
"[0, 1]"
77137
78138
julia> cut(-1:0.5:1, 2)
79139
5-element CategoricalArray{String,1,UInt32}:
80-
"Q1: [-1.0, 0.0)"
81-
"Q1: [-1.0, 0.0)"
82-
"Q2: [0.0, 1.0]"
83-
"Q2: [0.0, 1.0]"
84-
"Q2: [0.0, 1.0]"
140+
"[-1, 0)"
141+
"[-1, 0)"
142+
"[0, 1]"
143+
"[0, 1]"
144+
"[0, 1]"
85145
86146
julia> cut(-1:0.5:1, 2, labels=["A", "B"])
87147
5-element CategoricalArray{String,1,UInt32}:
@@ -114,15 +174,17 @@ julia> cut(-1:0.5:1, 3, labels=fmt)
114174
@inline function cut(x::AbstractArray, breaks::AbstractVector;
                     extend::Union{Bool, Missing}=false,
                     labels::Union{AbstractVector{<:SupportedTypes},Function}=default_formatter,
                     sigdigits::Integer=3,
                     allowempty::Bool=false)
    # Keyword arguments are forwarded positionally to the `_cut` method
    binned = _cut(x, breaks, extend, labels, sigdigits, allowempty)
    return binned
end
120181

121182
# Separate function for inferability (thanks to inlining of cut)
122183
function _cut(x::AbstractArray{T, N}, breaks::AbstractVector,
123184
extend::Union{Bool, Missing},
124185
labels::Union{AbstractVector{<:SupportedTypes},Function},
125-
allowempty::Bool=false) where {T, N}
186+
sigdigits::Integer,
187+
allowempty::Bool) where {T, N}
126188
if !issorted(breaks)
127189
breaks = sort(breaks)
128190
end
@@ -179,21 +241,60 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector,
179241
end
180242
end
181243

244+
# Find the smallest number of significant digits (starting from `sigdigits`)
# for which distinct consecutive breaks are still printed as distinct strings
if eltype(breaks) <: AbstractFloat
    while true
        # Set when two distinct neighbouring breaks are printed identically
        collision = false
        for i in 2:lastindex(breaks)
            b1 = breaks[i-1]
            b2 = breaks[i]
            # Identical breaks cannot be told apart by adding digits
            isequal(b1, b2) && continue

            # `CUT_FMT` is only defined on Julia 1.10 and later, so this check
            # has to match the one guarding its definition
            @static if VERSION >= v"1.10"
                b1_str = Printf.format(CUT_FMT, sigdigits, b1)
                b2_str = Printf.format(CUT_FMT, sigdigits, b2)
            else
                b1_str = Printf.format(Printf.Format("%.$(sigdigits)g"), b1)
                b2_str = Printf.format(Printf.Format("%.$(sigdigits)g"), b2)
            end
            if b1_str == b2_str
                collision = true
                break
            end
        end
        # Stop only after a full pass without any collision. Testing the loop
        # index instead would also stop when the collision is on the last pair
        # of breaks, and would fail when there are fewer than two breaks
        collision || break
        sigdigits += 1
    end
end
182268
n = length(breaks)
183269
n >= 2 || throw(ArgumentError("at least two breaks must be provided when extend is not true"))
184270
if labels isa Function
185271
from = breaks[1:n-1]
186272
to = breaks[2:n]
187-
firstlevel = labels(from[1], to[1], 1,
188-
leftclosed=!isequal(breaks[1], breaks[2]), rightclosed=false)
273+
local firstlevel
try
    firstlevel = labels(from[1], to[1], 1,
                        leftclosed=!isequal(breaks[1], breaks[2]), rightclosed=false,
                        sigdigits=sigdigits)
catch err
    # An unsupported keyword argument is reported as a `MethodError`. Any other
    # exception is a genuine failure of `labels` and must not be hidden behind
    # the deprecation path
    err isa MethodError || rethrow()
    # Support functions defined before v1.0, where sigdigits did not exist
    Base.depwarn("`labels` function is now required to accept a `sigdigits` keyword argument",
                 :cut)
    labels_orig = labels
    # Wrap the legacy function so that later calls can pass `sigdigits`,
    # which is simply dropped
    labels = (from, to, i; leftclosed, rightclosed, sigdigits) ->
        labels_orig(from, to, i; leftclosed=leftclosed, rightclosed=rightclosed)
    firstlevel = labels_orig(from[1], to[1], 1,
                             leftclosed=!isequal(breaks[1], breaks[2]), rightclosed=false)
end
189288
levs = Vector{typeof(firstlevel)}(undef, n-1)
190289
levs[1] = firstlevel
191290
for i in 2:n-2
192291
levs[i] = labels(from[i], to[i], i,
193-
leftclosed=!isequal(breaks[i], breaks[i+1]), rightclosed=false)
292+
leftclosed=!isequal(breaks[i], breaks[i+1]), rightclosed=false,
293+
sigdigits=sigdigits)
194294
end
195295
levs[end] = labels(from[end], to[end], n-1,
196-
leftclosed=true, rightclosed=true)
296+
leftclosed=true, rightclosed=true,
297+
sigdigits=sigdigits)
197298
else
198299
length(labels) == n-1 ||
199300
throw(ArgumentError("labels must be of length $(n-1), but got length $(length(labels))"))
@@ -213,14 +314,6 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector,
213314
CategoricalArray{S, N}(refs, pool)
214315
end
215316

216-
"""
217-
quantile_formatter(from, to, i; leftclosed, rightclosed)
218-
219-
Provide the default label format for the `cut(x, ngroups)` method.
220-
"""
221-
quantile_formatter(from, to, i; leftclosed, rightclosed) =
222-
string("Q", i, ": ", leftclosed ? "[" : "(", from, ", ", to, rightclosed ? "]" : ")")
223-
224317
"""
225318
Find first value in (sorted) `v` which is greater than or equal to each quantile
226319
in (sorted) `qs`.
@@ -247,6 +340,7 @@ end
247340
"""
248341
cut(x::AbstractArray, ngroups::Integer;
249342
labels::Union{AbstractVector{<:AbstractString},Function},
343+
sigdigits::Integer=3,
250344
allowempty::Bool=false)
251345
252346
Cut a numeric array into `ngroups` quantiles.
@@ -257,19 +351,32 @@ but breaks are taken from actual data values instead of estimated quantiles.
257351
If `x` contains `missing` values, they are automatically skipped when computing
258352
quantiles.
259353
354+
!!! note
355+
For floating point data, breaks may be rounded to `sigdigits` significant digits
356+
when generating interval labels, meaning that they may not reflect exactly the cutpoints
357+
used.
358+
260359
# Keyword arguments
261360
* `labels::Union{AbstractVector, Function}`: a vector of strings, characters
262-
or numbers giving the names to use for
263-
the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates
361+
or numbers giving the names to use for the intervals; or a function
362+
`f(from, to, i::Integer; leftclosed::Bool, rightclosed::Bool, sigdigits::Integer)` that generates
264363
the labels from the left and right interval boundaries and the group index. Defaults to
265-
`"Qi: [from, to)"` (or `"Qi: [from, to]"` for the rightmost interval).
364+
[`CategoricalArrays.default_formatter`](@ref), giving `"[from, to)"` (or `"[from, to]"`
365+
for the rightmost interval) if `allowempty=false`, otherwise to
366+
[`CategoricalArrays.numbered_formatter`](@ref), which prefixes the label with the quantile
367+
number to ensure uniqueness.
368+
* `sigdigits::Integer=3`: the minimum number of significant digits to use when rounding
369+
breaks for inclusion in generated labels. This value is increased automatically if necessary
370+
so that rounded breaks are unique. Only used for floating point types and when `labels` is a
371+
function, in which case it is passed to it as a keyword argument.
266372
* `allowempty::Bool=false`: when `false`, an error is raised if some quantiles breakpoints
267373
other than the last one are equal, generating empty intervals;
268374
when `true`, duplicate breaks are allowed and the intervals they generate are kept as
269375
unused levels (but duplicate labels are not allowed).
270376
"""
271377
function cut(x::AbstractArray, ngroups::Integer;
272-
labels::Union{AbstractVector{<:SupportedTypes},Function}=quantile_formatter,
378+
labels::Union{AbstractVector{<:SupportedTypes},Function,Nothing}=nothing,
379+
sigdigits::Integer=3,
273380
allowempty::Bool=false)
274381
ngroups >= 1 || throw(ArgumentError("ngroups must be strictly positive (got $ngroups)"))
275382
sorted_x = eltype(x) >: Missing ? sort!(collect(skipmissing(x))) : sort(x)
@@ -286,5 +393,8 @@ function cut(x::AbstractArray, ngroups::Integer;
286393
"Pass `allowempty=true` to allow empty quantiles or " *
287394
"choose a lower value for `ngroups`."))
288395
end
289-
cut(x, breaks; labels=labels, allowempty=allowempty)
396+
if labels === nothing
397+
labels = allowempty ? numbered_formatter : default_formatter
398+
end
399+
return cut(x, breaks; labels=labels, sigdigits=sigdigits, allowempty=allowempty)
290400
end

0 commit comments

Comments
 (0)