Merge pull request #44 from dougiesquire/refactor_bin_alignment_with_np

dougiesquire · web-flow · commit 5c12e178cfee · 2021-06-22T10:25:33.000+10:00
Refactor approach for making final bin right-edge inclusive
diff --git a/xhistogram/core.py b/xhistogram/core.py
@@ -8,7 +8,7 @@
 from functools import reduce
 from collections.abc import Iterable
 from numpy import (
-    digitize,
+    searchsorted,
     bincount,
     reshape,
     ravel_multi_index,
@@ -154,24 +154,26 @@ def _bincount_2d_vectorized(
     nbins = [len(b) for b in bins]
     hist_shapes = [nb + 1 for nb in nbins]
 
-    # a marginally faster implementation would be to use searchsorted,
-    # like numpy histogram itself does
-    # https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L864-L882
-    # for now we stick with `digitize` because it's easy to understand how it works
-
-    # Add small increment to the last bin edge to make the final bin right-edge inclusive
-    # Note, this is the approach taken by sklearn, e.g.
-    # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/calibration.py#L592
-    # but a better approach would be to use something like _search_sorted_inclusive() in
-    # numpy histogram. This is an additional motivation for moving to searchsorted
-    bins = [np.concatenate((b[:-1], b[-1:] + 1e-8)) for b in bins]
-
-    # the maximum possible value of of digitize is nbins
-    # for right=False:
+    # The maximum possible value of searchsorted is nbins
+    # For _searchsorted_inclusive:
     #   - 0 corresponds to a < b[0]
-    #   - i corresponds to bins[i-1] <= a < b[i]
-    #   - nbins corresponds to a a >= b[1]
-    each_bin_indices = [digitize(a, b) for a, b in zip(args, bins)]
+    #   - i corresponds to b[i-1] <= a < b[i]
+    #   - nbins-1 corresponds to b[-2] <= a <= b[-1]
+    #   - nbins corresponds to a >= b[-1]
+    def _searchsorted_inclusive(a, b):
+        """
+        Like `searchsorted`, but where the last bin is also right-edge inclusive.
+        """
+        # Similar to implementation in np.histogramdd
+        # see https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L1056
+        # This assumes the bins (b) are sorted
+        bin_indices = searchsorted(b, a, side="right")
+        on_edge = a == b[-1]
+        # Shift these points one bin to the left.
+        bin_indices[on_edge] -= 1
+        return bin_indices
+
+    each_bin_indices = [_searchsorted_inclusive(a, b) for a, b in zip(args, bins)]
     # product of the bins gives the joint distribution
     if N_inputs > 1:
         bin_indices = ravel_multi_index(each_bin_indices, hist_shapes)
@@ -327,7 +329,7 @@ def histogram(
 
     See Also
     --------
-    numpy.histogram, numpy.bincount, numpy.digitize
+    numpy.histogram, numpy.bincount, numpy.searchsorted
     """
 
     a0 = args[0]
diff --git a/xhistogram/test/test_core.py b/xhistogram/test/test_core.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pandas as pd
 
 from itertools import combinations
 import dask.array as dsa
@@ -328,3 +329,23 @@ def test_ensure_correctly_formatted_range(in_out):
     else:
         with pytest.raises(ValueError):
             _ensure_correctly_formatted_range(range_in, n)
+
+
+@pytest.mark.parametrize("block_size", [None, 1, 2])
+@pytest.mark.parametrize("use_dask", [False, True])
+def test_histogram_results_datetime(use_dask, block_size):
+    """Test computing histogram of datetime objects"""
+    data = pd.date_range(start="2000-06-01", periods=5)
+    if use_dask:
+        data = dsa.asarray(data, chunks=(5,))
+    # everything should be in the second bin (index 1)
+    bins = np.array(
+        [
+            np.datetime64("1999-01-01"),
+            np.datetime64("2000-01-01"),
+            np.datetime64("2001-01-01"),
+        ]
+    )
+    h = histogram(data, bins=bins, block_size=block_size)[0]
+    expected = np.histogram(data, bins=bins)[0]
+    np.testing.assert_allclose(h, expected)