Skip to content

Refactor approach for making final bin right-edge inclusive #44

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jun 22, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 20 additions & 19 deletions xhistogram/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from functools import reduce
from collections.abc import Iterable
from numpy import (
digitize,
searchsorted,
bincount,
reshape,
ravel_multi_index,
Expand Down Expand Up @@ -154,24 +154,25 @@ def _bincount_2d_vectorized(
nbins = [len(b) for b in bins]
hist_shapes = [nb + 1 for nb in nbins]

# a marginally faster implementation would be to use searchsorted,
# like numpy histogram itself does
# https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L864-L882
# for now we stick with `digitize` because it's easy to understand how it works

# Add small increment to the last bin edge to make the final bin right-edge inclusive
# Note, this is the approach taken by sklearn, e.g.
# https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/calibration.py#L592
# but a better approach would be to use something like _search_sorted_inclusive() in
# numpy histogram. This is an additional motivation for moving to searchsorted
bins = [np.concatenate((b[:-1], b[-1:] + 1e-8)) for b in bins]

# the maximum possible value of of digitize is nbins
# for right=False:
# The maximum possible value of searchsorted is nbins
# For _searchsorted_inclusive:
# - 0 corresponds to a < b[0]
# - i corresponds to bins[i-1] <= a < b[i]
# - nbins corresponds to a a >= b[1]
each_bin_indices = [digitize(a, b) for a, b in zip(args, bins)]
# - i corresponds to b[i-1] <= a < b[i]
# - nbins-1 corresponds to b[-2] <= a <= b[-1]
# - nbins corresponds to a >= b[-1]
def _searchsorted_inclusive(a, b):
"""
Like `searchsorted`, but where the last bin is also right-edge inclusive.
"""
# Similar to implementation in np.histogramdd
# see https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L1056
bin_indices = searchsorted(b, a, side="right")
on_edge = a == b[-1]
# Shift these points one bin to the left.
bin_indices[on_edge] -= 1
return bin_indices

each_bin_indices = [_searchsorted_inclusive(a, b) for a, b in zip(args, bins)]
# product of the bins gives the joint distribution
if N_inputs > 1:
bin_indices = ravel_multi_index(each_bin_indices, hist_shapes)
Expand Down Expand Up @@ -321,7 +322,7 @@ def histogram(

See Also
--------
numpy.histogram, numpy.bincount, numpy.digitize
numpy.histogram, numpy.bincount, numpy.searchsorted
"""

a0 = args[0]
Expand Down
21 changes: 21 additions & 0 deletions xhistogram/test/test_core.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
import pandas as pd

from itertools import combinations
import dask.array as dsa
Expand Down Expand Up @@ -313,3 +314,23 @@ def test_ensure_correctly_formatted_range(in_out):
else:
with pytest.raises(ValueError):
_ensure_correctly_formatted_range(range_in, n)


@pytest.mark.parametrize("block_size", [None, 1, 2])
@pytest.mark.parametrize("use_dask", [False, True])
def test_histogram_results_datetime(use_dask, block_size):
"""Test computing histogram of datetime objects"""
data = pd.date_range(start="2000-06-01", periods=5)
if use_dask:
data = dsa.asarray(data, chunks=(5,))
# everything should be in the second bin (index 1)
bins = np.array(
[
np.datetime64("1999-01-01"),
np.datetime64("2000-01-01"),
np.datetime64("2001-01-01"),
]
)
h = histogram(data, bins=bins, block_size=block_size)[0]
expected = np.histogram(data, bins=bins)[0]
np.testing.assert_allclose(h, expected)