Skip to content

Commit 5a5a69f

Browse files
Merge pull request #282 from TeemuSailynoja/issue-267-ecdf-plots-with-confidence-bands
Issue 267 ecdf plots with confidence bands
2 parents 0f931bc + f5c0dc3 commit 5a5a69f

20 files changed

+1763
-73
lines changed

NAMESPACE

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ export(mcmc_nuts_treedepth)
7575
export(mcmc_pairs)
7676
export(mcmc_parcoord)
7777
export(mcmc_parcoord_data)
78+
export(mcmc_rank_ecdf)
7879
export(mcmc_rank_hist)
7980
export(mcmc_rank_overlay)
8081
export(mcmc_recover_hist)
@@ -131,6 +132,8 @@ export(ppc_loo_pit_data)
131132
export(ppc_loo_pit_overlay)
132133
export(ppc_loo_pit_qq)
133134
export(ppc_loo_ribbon)
135+
export(ppc_pit_ecdf)
136+
export(ppc_pit_ecdf_grouped)
134137
export(ppc_ribbon)
135138
export(ppc_ribbon_data)
136139
export(ppc_ribbon_grouped)

R/helpers-ppc.R

Lines changed: 318 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,32 @@ validate_predictions <- function(predictions, n_obs = NULL) {
9898
}
9999

100100

101+
#' Validate PIT
102+
#'
103+
#' Checks that `pit` is numeric, doesn't have any NAs, and is either a vector,
104+
#' or a 1-D array with values in [0,1].
105+
#'
106+
#' @param pit The 'pit' object from the user.
107+
#' @return Either throws an error or returns a numeric vector.
108+
#' @noRd
109+
validate_pit <- function(pit) {
110+
stopifnot(is.numeric(pit))
111+
112+
if (!is_vector_or_1Darray(pit)) {
113+
abort("'pit' must be a vector or 1D array.")
114+
}
115+
116+
if (any(pit > 1) || any(pit < 0)) {
117+
abort("'pit' must only contain values between 0 and 1.")
118+
}
119+
120+
if (anyNA(pit)) {
121+
abort("NAs not allowed in 'pit'.")
122+
}
123+
124+
unname(pit)
125+
}
126+
101127
#' Validate group
102128
#'
103129
#' Checks that grouping variable has correct number of observations and is
@@ -267,6 +293,298 @@ melt_and_stack <- function(y, yrep) {
267293
}
268294

269295

296+
#' Obtain the coverage parameter for simultaneous confidence bands for ECDFs
297+
#'
298+
#' @param N Length of sample.
299+
#' @param L Number of chains. Used for MCMC, defaults to 1 for ppc.
300+
#' @param K Number of equally spaced evaluation points (1:K / K). Defaults to N.
301+
#' @param prob Desired simultaneous coverage (0,1).
302+
#' @param M number of simulations to run, if simulation method is used.
303+
#' @param interpolate_adj Boolean defining whether to interpolate the confidence
304+
#' bands from precomputed values. Interpolation provides a faster plot with the
305+
#' trade-off of possible loss of accuracy.
306+
#' @return The adjusted coverage parameter yielding the desired simultaneous
307+
#' coverage of the ECDF traces.
308+
#' @noRd
309+
adjust_gamma <- function(N,
310+
L = 1,
311+
K = N,
312+
prob = 0.99,
313+
M = 1000,
314+
interpolate_adj = FALSE) {
315+
if (! all_counts(c(K, N, L))) {
316+
abort("Parameters 'N', 'L' and 'K' must be positive integers.")
317+
}
318+
if (prob >= 1 || prob <= 0) {
319+
abort("Value of 'prob' must be in (0,1).")
320+
}
321+
if (interpolate_adj == TRUE) {
322+
gamma <- interpolate_gamma(N = N, K = K, prob = prob, L = L)
323+
} else if (L == 1) {
324+
gamma <- adjust_gamma_optimize(N = N, K = K, prob = prob)
325+
} else {
326+
gamma <- adjust_gamma_simulate(N = N, L = L, K = K, prob = prob, M = M)
327+
}
328+
gamma
329+
}
330+
331+
#' Adjust coverage parameter for single sample using the optimization method.
332+
#' @param N Length of sample.
333+
#' @param K Number of equally spaced evaluation points (1:K / K). Defaults to N.
334+
#' @param prob Desired simultaneous coverage (0,1).
335+
#' @return The adjusted coverage parameter yielding the desired simultaneous
336+
#' coverage of the ECDF traces.
337+
#' @noRd
338+
adjust_gamma_optimize <- function(N, K, prob) {
339+
target <- function(gamma, prob, N, K) {
340+
z <- 1:(K - 1) / K
341+
z1 <- c(0, z)
342+
z2 <- c(z, 1)
343+
344+
# pre-compute quantiles and use symmetry for increased efficiency.
345+
x2_lower <- qbinom(gamma / 2, N, z2)
346+
x2_upper <- c(N - rev(x2_lower)[2:K], 1)
347+
348+
# Compute the total probability of trajectories inside the confidence
349+
# intervals. Initialize the set and corresponding probabilities known
350+
# to be 0 and 1 for the starting value z1 = 0.
351+
x1 <- 0
352+
p_int <- 1
353+
for (i in seq_along(z1)) {
354+
p_int <- p_interior(
355+
p_int = p_int,
356+
x1 = x1,
357+
x2 = x2_lower[i]: x2_upper[i],
358+
z1 = z1[i],
359+
z2 = z2[i],
360+
N = N
361+
)
362+
x1 <- x2_lower[i]:x2_upper[i]
363+
}
364+
return(abs(prob - sum(p_int)))
365+
}
366+
optimize(target, c(0, 1 - prob), prob = prob, N = N, K = K)$minimum
367+
}
368+
369+
#' Adjust coverage parameter for multiple chains using the simulation method.
370+
#' In short, 'M' simulations of 'L' standard uniform chains are run and the
371+
#' confidence bands are set to cover 100 * 'prob' % of these simulations.
372+
#' @param N Length of sample.
373+
#' @param L Number of chains. Used for MCMC, defaults to 1 for ppc.
374+
#' @param K Number of equally spaced evaluation points (1:K / K). Defaults to N.
375+
#' @param prob Desired simultaneous coverage (0,1).
376+
#' @param M number of simulations to run.
377+
#' @return The adjusted coverage parameter yielding the desired simultaneous
378+
#' coverage of the ECDF traces.
379+
#' @noRd
380+
adjust_gamma_simulate <- function(N, L, K, prob, M) {
381+
gamma <- numeric(M)
382+
z <- (1:(K - 1)) / K # Rank ECDF evaluation points.
383+
n <- N * (L - 1)
384+
k <- floor(z * N * L)
385+
for (m in seq_len(M)) {
386+
u <- u_scale(replicate(L, runif(N))) # Fractional ranks of sample chains
387+
scaled_ecdfs <- apply(outer(u, z, "<="), c(2, 3), sum)
388+
# Find the smalles marginal probability of the simulation run
389+
gamma[m] <- 2 * min(
390+
apply(
391+
scaled_ecdfs, 1, phyper, m = N, n = n, k = k
392+
),
393+
apply(
394+
scaled_ecdfs - 1, 1, phyper, m = N, n = n, k = k, lower.tail = FALSE
395+
)
396+
)
397+
}
398+
alpha_quantile(gamma, 1 - prob)
399+
}
400+
401+
#' Approximate the required adjustement to obtain simultaneous confidence bands
402+
#' of an ECDF plot with interpolation with regards to N and K from precomputed
403+
#' values for a fixed set of prob and L values.
404+
#' @param N Length of sample.
405+
#' @param L Number of chains. Used for MCMC, defaults to 1 for ppc.
406+
#' @param prob Desired simultaneous coverage (0,1).
407+
#' @param K Number of equally spaced evaluation points (1:K / K). Defaults to N.
408+
#' @return The approximated adjusted coverage parameter yielding the desired
409+
#' simultaneous coverage of the ECDF traces.
410+
#' @noRd
411+
interpolate_gamma <- function(N, K, prob, L) {
412+
# Find the precomputed values ueful for the interpolation task.
413+
vals <- get_interpolation_values(N, K, L, prob)
414+
# Largest lower bound and smalles upper bound for N among precomputed values.
415+
N_lb <- max(vals[vals$N <= N, ]$N)
416+
N_ub <- min(vals[vals$N >= N, ]$N)
417+
# Approximate largest lower bound and smalles upper bound for gamma.
418+
log_gamma_lb <- approx(
419+
x = log(vals[vals$N == N_lb, ]$K),
420+
y = log(vals[vals$N == N_lb, ]$val),
421+
xout = log(K)
422+
)$y
423+
log_gamma_ub <- approx(
424+
x = log(vals[vals$N == N_ub, ]$K),
425+
y = log(vals[vals$N == N_ub, ]$val),
426+
xout = log(K)
427+
)$y
428+
if (N_ub == N_lb) {
429+
log_gamma_approx <- log_gamma_lb
430+
} else {
431+
# Approximate log_gamma for the desired value of N.
432+
log_gamma_approx <- approx(
433+
x = log(c(N_lb, N_ub)),
434+
y = c(log_gamma_lb, log_gamma_ub),
435+
xout = log(N)
436+
)$y
437+
}
438+
exp(log_gamma_approx)
439+
}
440+
441+
#' Filter the precomputed values useful for the interpolation task given to
442+
#' interpolate_gamma. Check, if the task is possible with the availabel data.
443+
#' @param N Length of sample.
444+
#' @param K Number of equally spaced evaluation points (1:K / K). Defaults to N.
445+
#' @param L Number of chains. Used for MCMC, defaults to 1 for ppc.
446+
#' @param prob Desired simultaneous coverage (0,1).
447+
#' @return A data.frame containing the relevant precomputed values.
448+
#' @noRd
449+
get_interpolation_values <- function(N, K, L, prob) {
450+
for (dim in c("L", "prob")) {
451+
if (all(get(dim) != bayesplot:::gamma_adj[, dim])) {
452+
stop(paste(
453+
"No precomputed values to interpolate from for '", dim, "' = ",
454+
get(dim),
455+
".\n",
456+
"Values of '", dim, "' available for interpolation: ",
457+
paste(unique(bayesplot:::gamma_adj[, dim]), collapse = ", "),
458+
".",
459+
sep = ""
460+
))
461+
}
462+
}
463+
vals <- bayesplot:::gamma_adj[
464+
bayesplot:::gamma_adj$L == L & bayesplot:::gamma_adj$prob == prob,
465+
]
466+
if (N > max(vals$N)) {
467+
stop(paste(
468+
"No precomputed values to interpolate from for sample length of ",
469+
N,
470+
".\n",
471+
"Please use a subsample of length ",
472+
max(vals$N),
473+
" or smaller, or consider setting 'interpolate_adj' = FALSE.",
474+
sep = ""
475+
))
476+
}
477+
if (N < min(vals$N)) {
478+
stop(paste(
479+
"No precomputed values to interpolate from for sample length of ",
480+
N,
481+
".\n",
482+
"Please use a subsample of length ",
483+
min(vals$N),
484+
" or larger, or consider setting 'interpolate_adj' = FALSE.",
485+
sep = ""
486+
))
487+
}
488+
if (K > max(vals[vals$N <= N, ]$K)) {
489+
stop(paste(
490+
"No precomputed values available for interpolation for 'K' = ",
491+
K,
492+
".\n",
493+
"Try either setting a value of 'K' <= ",
494+
max(vals[vals$N <= N, ]$K),
495+
"or 'interpolate_adj' = FALSE.",
496+
sep = ""
497+
))
498+
}
499+
if (K < min(vals[vals$N <= N, ]$K)) {
500+
stop(paste(
501+
"No precomputed values available for interpolation for 'K' = ",
502+
K,
503+
".\n",
504+
"Try either setting a value of 'K' >= ",
505+
min(vals[vals$N <= N, ]$K),
506+
"or 'interpolate_adj' = FALSE.",
507+
sep = ""
508+
))
509+
}
510+
vals
511+
}
512+
513+
#' A helper function for 'adjust_gamma_optimize' defining the probability that
514+
#' a scaled ECDF stays within the supplied bounds between two evaluation points.
515+
#' @param p_int For each value in x1, the probability that the ECDF has stayed
516+
#' within the bounds until z1 and takes the value in x1 at z1.
517+
#' @param x1 Vector of scaled ECDF values at the left end of the interval, z1.
518+
#' @param x2 Vector of scaled ECDF values at the right end of the interval, z2.
519+
#' @param z1 Left evaluation point in [0,1]
520+
#' @param z2 Right evaluation point in [0,1] with z2 > z1.
521+
#' @param N Total number of values in the sample.
522+
#' @return A vector containing the probability to transitioning from the values
523+
#' in x1 to each of the values in x2 weighted by the probabilities in p_int.
524+
#' @noRd
525+
p_interior <- function(p_int, x1, x2, z1, z2, N) {
526+
# Ratio between the length of the evaluation interval and the total length of
527+
# the interval left to cover by ECDF.
528+
z_tilde <- (z2 - z1) / (1 - z1)
529+
# Number of samples left to cover by ECDF.
530+
N_tilde <- rep(N - x1, each = length(x2))
531+
532+
p_int <- rep(p_int, each = length(x2))
533+
x_diff <- outer(x2, x1, "-")
534+
# Pobability of each transition from a value in x1 to a value in x2.
535+
p_x2_int <- p_int * dbinom(x_diff, N_tilde, z_tilde)
536+
rowSums(p_x2_int)
537+
}
538+
539+
#' A helper function for 'adjust_alpha_simulate'
540+
#' 100 * `alpha` percent of the trials in 'gamma' are allowed to be rejected.
541+
#' In case of ties, return the largest value dominating at most
542+
#' 100 * (alpha + tol) percent of the values.
543+
#' @noRd
544+
alpha_quantile <- function(gamma, alpha, tol = 0.001) {
545+
a <- unname(quantile(gamma, probs = alpha))
546+
a_tol <- unname(quantile(gamma, probs = alpha + tol))
547+
if (a == a_tol) {
548+
if (min(gamma) < a) {
549+
# take the largest value that doesn't exceed the tolerance.
550+
a <- max(gamma[gamma < a])
551+
}
552+
}
553+
a
554+
}
555+
556+
#' Compute simultaneous confidence intervals with the given adjusted coverage
557+
#' parameter gamma.
558+
#' @param gamma Adjusted coverage parameter for the marginal distribution
559+
#' (binomial for PIT values and hypergeometric for rank transformed chains).
560+
#' @param N Sample length.
561+
#' @param K Number of uniformly spaced evaluation points.
562+
#' @param L Number of MCMC-chains. (1 for ppc)
563+
#' @return A list with upper and lower confidence interval values at the
564+
#' evaluation points.
565+
#' @noRd
566+
ecdf_intervals <- function(gamma, N, K, L = 1) {
567+
lims <- list()
568+
z <- seq(0, 1, length.out = K + 1)
569+
if (L == 1) {
570+
lims$lower <- qbinom(gamma / 2, N, z)
571+
lims$upper <- qbinom(1 - gamma / 2, N, z)
572+
} else {
573+
n <- N * (L - 1)
574+
k <- floor(z * L * N)
575+
lims$lower <- qhyper(gamma / 2, N, n, k)
576+
lims$upper <- qhyper(1 - gamma / 2, N, n, k)
577+
}
578+
lims
579+
}
580+
581+
#' Helper for 'adjust_gamma_simulate`
582+
#' Transforms observations in 'x' into their corresponding fractional ranks.
583+
#' @noRd
584+
u_scale <- function(x) {
585+
array(rank(x) / length(x), dim = dim(x), dimnames = dimnames(x))
586+
}
587+
270588
# labels ----------------------------------------------------------------
271589
create_rep_ids <- function(ids) paste('italic(y)[rep] (', ids, ")")
272590
y_label <- function() expression(italic(y))

0 commit comments

Comments
 (0)