Skip to content

Multidimensional histogram #5400

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions xarray/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from .core.alignment import align, broadcast
from .core.combine import combine_by_coords, combine_nested
from .core.common import ALL_DIMS, full_like, ones_like, zeros_like
from .core.computation import apply_ufunc, corr, cov, dot, polyval, where
from .core.computation import apply_ufunc, corr, cov, dot, hist, polyval, where
from .core.concat import concat
from .core.dataarray import DataArray
from .core.dataset import Dataset
Expand Down Expand Up @@ -57,7 +57,7 @@
"cov",
"corr",
"full_like",
"infer_freq",
"hist" "infer_freq",
"load_dataarray",
"load_dataset",
"map_blocks",
Expand Down
76 changes: 76 additions & 0 deletions xarray/core/computation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1724,3 +1724,79 @@ def _calc_idxminmax(
res.attrs = indx.attrs

return res


def hist(*datarrays, bins=None, dim=None, weights=None, density=False):
    """
    Histogram applied along specified dimensions.

    If the supplied arguments are chunked dask arrays it will use
    `dask.array.blockwise` internally to parallelize over all chunks.

    .. note::
        This function is a placeholder: the implementation has not been
        added yet, so every call currently raises ``NotImplementedError``.
        The documentation below describes the intended behaviour.

    Parameters
    ----------
    *datarrays : xarray.DataArray
        Input data. The number of input arguments determines the dimensionality
        of the histogram. For example, two arguments produce a 2D histogram.
    bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional
        If a list, there should be one entry for each item in ``datarrays``.
        The bin specification:

          * If int, the number of bins for all arguments in ``datarrays``.
          * If array_like, the bin edges for all arguments in ``datarrays``.
          * If a list of ints, the number of bins for every argument in
            ``datarrays``.
          * If a list of arrays, the bin edges for each argument in
            ``datarrays`` (required format for Dask inputs).
          * A combination [int, array] or [array, int], where int
            is the number of bins and array is the bin edges.
          * If a list of DataArrays, the bins for each argument in
            ``datarrays``. The DataArrays can be multidimensional, but must
            not have any dimensions shared with the ``dim`` argument.

        When bin edges are specified, all but the last (righthand-most) bin
        include the left edge and exclude the right edge. The last bin
        includes both edges.

        A ``TypeError`` will be raised if ``datarrays`` contains dask arrays
        and ``bins`` are not specified explicitly as a list of arrays.
    dim : tuple of strings, optional
        Dimensions over which the histogram is computed. The default is to
        compute the histogram of the flattened array, i.e. over all dimensions.
    weights : array_like, optional
        An array of weights, of the same shape as the input data. Each value
        in the input data only contributes its associated weight towards the
        bin count (instead of 1). If ``density`` is True, the weights are
        normalized, so that the integral of the density over the range
        remains 1. NaNs in the weights input will fill the entire bin with
        NaNs. If there are NaNs in the weights input call ``.fillna(0.)``
        before running ``hist()``.
    density : bool, optional
        If ``False``, the result will contain the number of samples in
        each bin. If ``True``, the result is the value of the
        probability *density* function at the bin, normalized such that
        the *integral* over the range is 1. Note that the sum of the
        histogram values will not be equal to 1 unless bins of unit
        width are chosen; it is not a probability *mass* function.

    Returns
    -------
    hist : xarray.DataArray
        A single dataarray which contains the values of the histogram. See
        ``density`` and ``weights`` for a description of the possible
        semantics.

        The returned dataarray will have one additional coordinate for each
        dataarray supplied, named as ``var_bins`` (``var`` standing for the
        name of the corresponding input dataarray), which contains the
        positions of the centres of each bin.

    Raises
    ------
    NotImplementedError
        Always, until the implementation is added.

    See Also
    --------
    DataArray.hist
    Dataset.hist
    numpy.histogramdd
    dask.array.blockwise
    """

    # TODO xhistogram code goes here
    raise NotImplementedError("xarray.hist has not been implemented yet")
77 changes: 77 additions & 0 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -4599,6 +4599,83 @@ def drop_duplicates(
indexes = {dim: ~self.get_index(dim).duplicated(keep=keep)}
return self.isel(indexes)

def hist(self, dim=None, bins=None, weights=None, density=False):
    """
    Histogram of this array, applied along specified dimensions.

    Thin wrapper which forwards this array as the single input of
    :py:func:`xarray.hist`.

    If the supplied arguments are chunked dask arrays it will use
    `dask.array.blockwise` internally to parallelize over all chunks.

    Parameters
    ----------
    dim : tuple of strings, optional
        Dimensions over which the histogram is computed. The default is to
        compute the histogram of the flattened array, i.e. over all dimensions.
    bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional
        The bin specification. If a list, there should be one entry for each
        input array (here: a single entry, for this array).

          * If int, the number of bins.
          * If array_like, the bin edges.
          * If a list of ints, the number of bins for every input array.
          * If a list of arrays, the bin edges for each input array
            (required format for Dask inputs).
          * If a list of DataArrays, the bins for each input array.
            The DataArrays can be multidimensional, but must not have any
            dimensions shared with the ``dim`` argument.

        When bin edges are specified, all but the last (righthand-most) bin
        include the left edge and exclude the right edge. The last bin
        includes both edges.

        A ``TypeError`` will be raised if this array wraps a dask array and
        ``bins`` are not specified explicitly as a list of arrays.
    weights : array_like, optional
        An array of weights, of the same shape as this array. Each value in
        this array only contributes its associated weight towards the bin
        count (instead of 1). If ``density`` is True, the weights are
        normalized, so that the integral of the density over the range
        remains 1. NaNs in the weights input will fill the entire bin with
        NaNs. If there are NaNs in the weights input call ``.fillna(0.)``
        before running ``hist()``.
    density : bool, optional
        If ``False``, the result will contain the number of samples in
        each bin. If ``True``, the result is the value of the
        probability *density* function at the bin, normalized such that
        the *integral* over the range is 1. Note that the sum of the
        histogram values will not be equal to 1 unless bins of unit
        width are chosen; it is not a probability *mass* function.

    Returns
    -------
    hist : xarray.DataArray
        A single dataarray which contains the values of the histogram. See
        ``density`` and ``weights`` for a description of the possible
        semantics.

        The returned dataarray will have one additional coordinate, named as
        ``var_bins`` (``var`` standing for the name of this array), which
        contains the positions of the centres of each bin.

    See Also
    --------
    xarray.hist
    Dataset.hist
    numpy.histogramdd
    dask.array.blockwise
    """

    from .computation import hist

    # ``hist`` collects its inputs through ``*datarrays``, so this array must
    # be passed as a positional argument itself. Wrapping it in a list would
    # hand the function one list object instead of one DataArray.
    return hist(self, dim=dim, bins=bins, weights=weights, density=density)

# this needs to be at the end, or mypy will confuse with `str`
# https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names
str = utils.UncachedAccessor(StringAccessor)
81 changes: 81 additions & 0 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7627,3 +7627,84 @@ def _wrapper(Y, *coords_, **kwargs):
result.attrs = self.attrs.copy()

return result

def hist(self, vars=None, dim=None, bins=None, weights=None, density=False):
    """
    Histogram of variables in this dataset, applied along specified dimensions.

    Thin wrapper which selects the requested variables and forwards them, in
    order, as the inputs of :py:func:`xarray.hist`.

    If the supplied arguments are chunked dask arrays it will use
    `dask.array.blockwise` internally to parallelize over all chunks.

    Parameters
    ----------
    vars : str or list of str
        Variables on the Dataset to use as input data. The number of variables
        determines the dimensionality of the histogram. For example, two
        variables produce a 2D histogram. A single name may be given as a
        plain string. Although the signature defaults to ``None``, a value
        must be supplied.
    dim : tuple of strings, optional
        Dimensions over which the histogram is computed. The default is to
        compute the histogram of the flattened array, i.e. over all dimensions.
    bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional
        If a list, there should be one entry for each item in ``vars``.
        The bin specification:

          * If int, the number of bins for all variables in ``vars``.
          * If array_like, the bin edges for all variables in ``vars``.
          * If a list of ints, the number of bins for every variable in
            ``vars``.
          * If a list of arrays, the bin edges for each variable in ``vars``
            (required format for Dask inputs).
          * A combination [int, array] or [array, int], where int
            is the number of bins and array is the bin edges.
          * If a list of DataArrays, the bins for each variable in ``vars``.
            The DataArrays can be multidimensional, but must not have any
            dimensions shared with the ``dim`` argument.

        When bin edges are specified, all but the last (righthand-most) bin
        include the left edge and exclude the right edge. The last bin
        includes both edges.

        A ``TypeError`` will be raised if the selected variables contain dask
        arrays and ``bins`` are not specified explicitly as a list of arrays.
    weights : array_like, optional
        An array of weights, of the same shape as the input data. Each value
        in the input data only contributes its associated weight towards the
        bin count (instead of 1). If ``density`` is True, the weights are
        normalized, so that the integral of the density over the range
        remains 1. NaNs in the weights input will fill the entire bin with
        NaNs. If there are NaNs in the weights input call ``.fillna(0.)``
        before running ``hist()``.
    density : bool, optional
        If ``False``, the result will contain the number of samples in
        each bin. If ``True``, the result is the value of the
        probability *density* function at the bin, normalized such that
        the *integral* over the range is 1. Note that the sum of the
        histogram values will not be equal to 1 unless bins of unit
        width are chosen; it is not a probability *mass* function.

    Returns
    -------
    hist : xarray.DataArray
        A single dataarray which contains the values of the histogram. See
        ``density`` and ``weights`` for a description of the possible
        semantics.

        The returned dataarray will have one additional coordinate for each
        variable supplied, named as ``var_bins`` (``var`` standing for the
        name of the corresponding variable), which contains the positions of
        the centres of each bin.

    Raises
    ------
    TypeError
        If ``vars`` is not given.

    See Also
    --------
    xarray.hist
    DataArray.hist
    numpy.histogramdd
    dask.array.blockwise
    """

    # Validate before doing any work so the caller gets an explicit message
    # instead of "'NoneType' object is not iterable".
    if vars is None:
        raise TypeError(
            "vars must be given: pass the name(s) of the variables to histogram"
        )
    # A bare string would otherwise be iterated character by character.
    if isinstance(vars, str):
        vars = [vars]

    from .computation import hist

    # ``hist`` collects its inputs through ``*datarrays``: unpack the selected
    # variables so each becomes its own input (one histogram dimension each)
    # instead of passing a single list object.
    return hist(
        *(self[var] for var in vars),
        dim=dim,
        bins=bins,
        weights=weights,
        density=density,
    )
56 changes: 49 additions & 7 deletions xarray/plot/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@
import functools

import numpy as np
import pandas as pd

from .facetgrid import _easy_facetgrid
from .facetgrid import FacetGrid, _easy_facetgrid
from .utils import (
_add_colorbar,
_assert_valid_xy,
Expand Down Expand Up @@ -379,6 +378,9 @@ def step(darray, *args, where="pre", drawstyle=None, ds=None, **kwargs):

def hist(
darray,
row=None,
col=None,
col_wrap=None,
figsize=None,
size=None,
aspect=None,
Expand All @@ -391,19 +393,25 @@ def hist(
yticks=None,
xlim=None,
ylim=None,
bins=None,
weights=None,
density=False,
**kwargs,
):
"""
Histogram of DataArray.

Wraps :py:func:`matplotlib:matplotlib.pyplot.hist`.

Plots *N*-dimensional arrays by first flattening the array.
Plots *N*-dimensional arrays by first flattening the array and calculating
the histogram via :py:func:`DataArray.hist`.

Parameters
----------
darray : DataArray
Can have any number of dimensions.
Can have any number of dimensions, but will be reduced to 1 dimension
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe you have plans for this already but will this handle 2d-histograms like the mean oxygen example in https://xhistogram.readthedocs.io/en/latest/tutorial.html#Averaging-a-variable?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it will be able to calculate 2D (or N-D in general) histograms in xarray.hist(), it just won't be able to plot them in xarray.plot.hist(), because you need something like an xarray.plot.imshow() to plot the results of 2D histogram instead.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure whether you feel it is at all confusing to use the same word "hist" to refer both to the function that calculates the histogram and to the plotting function?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's fine. da.hist() vs. da.plot.hist() feels distinct enough for me.
I think I prefer the functions being called the proper name, da.histogram() and da.plot.histogram(), instead of the shorthand though.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's fine. da.hist() vs. da.plot.hist() feels distinct enough for me.

Cool.

I think I prefer the functions being called the proper name

That would also be marginally easier for the current users of xhistogram, who have been using xhistogram.histogram so would only need to change the import.

However the two arguments I can see against that are:

  1. Deprecation cycle - I would like consistency between the calculation function and the plot function, but the plot function already exists and is so named because it wraps matplotlib.pyplot.hist. Changing that would require a deprecation cycle, but I think with the changes I'm proposing here we could get away without a deprecation cycle (because we're only adding optional arguments, not changing them).
  2. We have many shorthand names for high-level functions already - cov, corr, concat, it would not be unusually terse.

I don't really have a strong opinion though.

Copy link
Contributor

@Illviljan Illviljan Jun 4, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The deprecation cycle wouldn't hinder your progress since I'm imagining simply just aliasing the method with a warning for a while.

numpy and dask also use .histogram() for their histograms. matplotlib, pandas and hvplot use .hist() for the plotting, holoviews uses .Histogram().
Maybe it's just easier to follow the upstream dependencies' decisions then, to avoid messing with the muscle memory (even though it's in my opinion "bad" muscle memory)?

And I'm not a fan of those shorthand names either, covid, corrosion, interpret? But I'm fine with just following what upstream are doing, there's something satisfying when you can just jump between the different data structures and everything just works.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You make very good points! I think I agree with you now.

covid, corrosion, interpret

🤣

by the histogram calculation before plotting (faceted plots will retain
more dimensions).
figsize : tuple, optional
A tuple (width, height) of the figure in inches.
Mutually exclusive with ``size`` and ``ax``.
Expand All @@ -416,16 +424,50 @@ def hist(
ax : matplotlib axes object, optional
Axes on which to plot. By default, use the current axes.
Mutually exclusive with ``size`` and ``figsize``.
bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional
See :py:func:`DataArray.hist`.
weights : array_like, optional
See :py:func:`DataArray.hist`.
density : bool, optional
See :py:func:`DataArray.hist`.
**kwargs : optional
Additional keyword arguments to :py:func:`matplotlib:matplotlib.pyplot.hist`.

"""

# compute the dims to count over
reduce_dims = set(darray.dims) - set([row, col])

# Handle facetgrids first
if row or col:
allargs = locals().copy()
allargs.update(allargs.pop("kwargs"))
allargs.pop("darray")

g = FacetGrid(
data=darray,
col=col,
row=row,
col_wrap=col_wrap,
sharex=False,
sharey=False,
figsize=figsize,
aspect=aspect,
size=size,
subplot_kws=kwargs,
)

return g.map(hist, **kwargs)

ax = get_axis(figsize, size, aspect, ax)

no_nan = np.ravel(darray.values)
no_nan = no_nan[pd.notnull(no_nan)]
h = darray.hist(bins=bins, dim=reduce_dims, weights=weights, density=density)
counts = h.values
bins = h.coords[f"{darray.name}_bins"].values

primitive = ax.hist(no_nan, **kwargs)
# Use the weights kwarg to avoid recomputing histogram in matplotlib
# (see matplotlib.pyplot.hist docstring)
primitive = ax.hist(bins[:-1], weights=counts, **kwargs)

ax.set_title("Histogram")
ax.set_xlabel(label_from_attrs(darray))
Expand Down