diff --git a/R/geom-histogram.R b/R/geom-histogram.R index dafc181f15..7bd832b611 100644 --- a/R/geom-histogram.R +++ b/R/geom-histogram.R @@ -17,6 +17,12 @@ #' one change at a time. You may need to look at a few options to uncover #' the full story behind your data. #' +#' By default, the _height_ of the bars represents the counts within each bin. +#' However, there are situations where this behavior might produce misleading +#' plots (e.g., when non-equal-width bins are used), in which case it might be +#' preferable to have the _area_ of the bars represent the counts (by setting +#' `aes(y = after_stat(count / width))`). See example below. +#' #' In addition to `geom_histogram()`, you can create a histogram plot by using #' `scale_x_binned()` with [geom_bar()]. This method by default plots tick marks #' in between each bar. @@ -63,6 +69,18 @@ #' ggplot(diamonds, aes(price, after_stat(density), colour = cut)) + #' geom_freqpoly(binwidth = 500) #' +#' +#' # When using non-equal-width bins, we should set the area of the bars to +#' # represent the counts (not the height). +#' # Here we're using 10 equi-probable bins: +#' price_bins <- quantile(diamonds$price, probs = seq(0, 1, length = 11)) +#' +#' ggplot(diamonds, aes(price)) + +#' geom_histogram(breaks = price_bins, color = "black") # misleading (height = count) +#' +#' ggplot(diamonds, aes(price, after_stat(count / width))) + +#' geom_histogram(breaks = price_bins, color = "black") # area = count +#' #' if (require("ggplot2movies")) { #' # Often we don't want the height of the bar to represent the #' # count of observations, but the sum of some other variable. diff --git a/man/geom_histogram.Rd b/man/geom_histogram.Rd index 1f290dbcdc..a241aa2ba4 100644 --- a/man/geom_histogram.Rd +++ b/man/geom_histogram.Rd @@ -192,6 +192,12 @@ different number of bins. You can also experiment modifying the \code{binwidth} one change at a time. You may need to look at a few options to uncover the full story behind your data. 
+By default, the \emph{height} of the bars represents the counts within each bin. +However, there are situations where this behavior might produce misleading +plots (e.g., when non-equal-width bins are used), in which case it might be +preferable to have the \emph{area} of the bars represent the counts (by setting +\code{aes(y = after_stat(count / width))}). See example below. + In addition to \code{geom_histogram()}, you can create a histogram plot by using \code{scale_x_binned()} with \code{\link[=geom_bar]{geom_bar()}}. This method by default plots tick marks in between each bar. @@ -255,6 +261,18 @@ ggplot(diamonds, aes(price, colour = cut)) + ggplot(diamonds, aes(price, after_stat(density), colour = cut)) + geom_freqpoly(binwidth = 500) + +# When using non-equal-width bins, we should set the area of the bars to +# represent the counts (not the height). +# Here we're using 10 equi-probable bins: +price_bins <- quantile(diamonds$price, probs = seq(0, 1, length = 11)) + +ggplot(diamonds, aes(price)) + + geom_histogram(breaks = price_bins, color = "black") # misleading (height = count) + +ggplot(diamonds, aes(price, after_stat(count / width))) + + geom_histogram(breaks = price_bins, color = "black") # area = count + if (require("ggplot2movies")) { # Often we don't want the height of the bar to represent the # count of observations, but the sum of some other variable.