feat: Only allocate shapesys parameter if yield and uncrt are positive, nonzero (#775)

lukasheinrich · web-flow · commit e85864c57049 · 2020-02-28T19:55:33.000-08:00
- improve handling of shapesys by only allocating nuisance parameters for valid, physically meaningful bins (yield and uncertainty are nonzero and positive)
- add documentation about this particular "feature" of shapesys
diff --git a/docs/likelihood.rst b/docs/likelihood.rst
@@ -132,7 +132,27 @@ shown below:
 
    { "name": "mod_name", "type": "shapesys", "data": [1.0, 1.5, 2.0] }
 
-An example of an uncorrelated shape modifier with three absolute uncertainty terms for a 3-bin channel.
+An example of an uncorrelated shape modifier with three absolute uncertainty
+terms for a 3-bin channel.
+
+.. warning::
+
+   Nuisance parameters will not be allocated for any bins where either
+
+     * the sample's nominal expected rate is zero, or
+     * the absolute uncertainty is zero.
+
+   These values are, in the context of uncorrelated shape uncertainties,
+   unphysical. If this situation occurs, one needs to go back and understand
+   the inputs as this is undefined behavior in HistFactory.
+
+The previous example will allocate three nuisance parameters for ``mod_name``.
+The following example will allocate only two nuisance parameters for a 3-bin
+channel:
+
+.. code:: json
+
+   { "name": "mod_name", "type": "shapesys", "data": [1.0, 0.0, 2.0] }
 
 Correlated Shape (histosys)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/src/pyhf/modifiers/shapesys.py b/src/pyhf/modifiers/shapesys.py
@@ -13,18 +13,24 @@
 class shapesys(object):
     @classmethod
     def required_parset(cls, sample_data, modifier_data):
+        # count the number of bins with nonzero, positive yields
+        valid_bins = [
+            (sample_bin > 0 and modifier_bin > 0)
+            for sample_bin, modifier_bin in zip(modifier_data, sample_data)
+        ]
+        n_parameters = sum(valid_bins)
         return {
             'paramset_type': constrained_by_poisson,
-            'n_parameters': len(sample_data),
+            'n_parameters': n_parameters,
             'modifier': cls.__name__,
             'is_constrained': cls.is_constrained,
             'is_shared': False,
-            'inits': (1.0,) * len(sample_data),
-            'bounds': ((1e-10, 10.0),) * len(sample_data),
+            'inits': (1.0,) * n_parameters,
+            'bounds': ((1e-10, 10.0),) * n_parameters,
             # nb: auxdata/factors set by finalize. Set to non-numeric to crash
             # if we fail to set auxdata/factors correctly
-            'auxdata': (None,) * len(sample_data),
-            'factors': (None,) * len(sample_data),
+            'auxdata': (None,) * n_parameters,
+            'factors': (None,) * n_parameters,
         }
 
 
@@ -66,17 +72,36 @@ def __init__(self, shapesys_mods, pdfconfig, mega_mods, batch_size=None):
             (len(shapesys_mods), self.batch_size or 1, 1),
         )
         # access field is shape (sys, batch, globalbin)
-        for s, syst_access in enumerate(self._access_field):
-            for t, batch_access in enumerate(syst_access):
-                selection = self.param_viewer.index_selection[s][t]
-                for b, bin_access in enumerate(batch_access):
-                    self._access_field[s, t, b] = (
-                        selection[bin_access] if bin_access < len(selection) else 0
-                    )
+
+        # reindex it based on current masking
+        self._reindex_access_field(pdfconfig)
 
         self._precompute()
         events.subscribe('tensorlib_changed')(self._precompute)
 
+    def _reindex_access_field(self, pdfconfig):  # rewrite self._access_field in place
+        for syst_index, syst_access in enumerate(self._access_field):
+            if not pdfconfig.param_set(self._shapesys_mods[syst_index]).n_parameters:
+                self._access_field[syst_index] = 0  # no parameters: whole row is 0
+                continue
+            for batch_index, batch_access in enumerate(syst_access):
+                selection = self.param_viewer.index_selection[syst_index][batch_index]
+                access_field_for_syst_and_batch = default_backend.zeros(
+                    len(batch_access)
+                )  # bins not covered by the mask below keep the value 0
+                singular_sample_index = [
+                    idx
+                    for idx, syst in enumerate(
+                        default_backend.astensor(self._shapesys_mask)[syst_index, :, 0]
+                    )
+                    if any(syst)
+                ][-1]  # last index along axis 1 whose mask has any True entry
+                sample_mask = self._shapesys_mask[syst_index][singular_sample_index][0]
+                access_field_for_syst_and_batch[sample_mask] = selection  # masked bins only
+                self._access_field[
+                    syst_index, batch_index
+                ] = access_field_for_syst_and_batch
+
     def _precompute(self):
         tensorlib, _ = get_backend()
         if not self.param_viewer.index_selection:
@@ -91,6 +116,8 @@ def _precompute(self):
 
     def finalize(self, pdfconfig):
         for uncert_this_mod, pname in zip(self.__shapesys_uncrt, self._shapesys_mods):
+            if not pdfconfig.param_set(pname).n_parameters:
+                continue
             unc_nom = default_backend.astensor(
                 [x for x in uncert_this_mod[:, :, :] if any(x[0][x[0] > 0])]
             )
diff --git a/src/pyhf/pdf.py b/src/pyhf/pdf.py
@@ -188,7 +188,10 @@ def _nominal_and_modifiers_from_spec(config, spec):
                     )  # broadcasting
                 elif mtype in ['shapesys', 'staterror']:
                     uncrt = thismod['data'] if thismod else [0.0] * len(nom)
-                    maskval = [True if thismod else False] * len(nom)
+                    if mtype == 'shapesys':
+                        maskval = [(x > 0 and y > 0) for x, y in zip(uncrt, nom)]
+                    else:
+                        maskval = [True if thismod else False] * len(nom)
                     mega_mods[key][s]['data']['mask'] += maskval
                     mega_mods[key][s]['data']['uncrt'] += uncrt
                     mega_mods[key][s]['data']['nom_data'] += nom
diff --git a/tests/test_combined_modifiers.py b/tests/test_combined_modifiers.py
@@ -504,6 +504,88 @@ def test_normfactor(backend):
     assert np.allclose(mod[1, 0, 3], [1.0, 8.0, 8.0])
 
 
+def test_shapesys_zero(backend):
+    mc = MockConfig(
+        par_map={
+            'SigXsecOverSM': {
+                'paramset': paramset(n_parameters=1, inits=[0], bounds=[[0, 10]]),
+                'slice': slice(0, 1),
+            },
+            'syst': {
+                'paramset': paramset(
+                    n_parameters=5, inits=[0] * 5, bounds=[[0, 10]] * 5
+                ),
+                'slice': slice(1, 6),
+            },
+            'syst_lowstats': {
+                'paramset': paramset(
+                    n_parameters=0, inits=[0] * 0, bounds=[[0, 10]] * 0
+                ),
+                'slice': slice(6, 6),
+            },
+        },
+        channels=['channel1'],
+        channel_nbins={'channel1': 6},
+        par_order=['SigXsecOverSM', 'syst', 'syst_lowstats'],
+        samples=['signal', 'background'],
+    )
+
+    mega_mods = {
+        'shapesys/syst': {
+            'background': {
+                'type': 'shapesys',
+                'name': 'syst',
+                'data': {
+                    'mask': [True, True, False, True, True, True],
+                    'nom_data': [100.0, 90.0, 0.0, 70, 0.1, 50],
+                    'uncrt': [10, 9, 1, 0.0, 0.1, 5],
+                },
+            },
+            'signal': {
+                'type': 'shapesys',
+                'name': 'syst',
+                'data': {
+                    'mask': [False, False, False, False, False, False],
+                    'nom_data': [20.0, 10.0, 5.0, 3.0, 2.0, 1.0],
+                    'uncrt': [10, 9, 1, 0.0, 0.1, 5],
+                },
+            },
+        },
+        'shapesys/syst_lowstats': {
+            'background': {
+                'type': 'shapesys',
+                'name': 'syst_lowstats',
+                'data': {
+                    'mask': [False, False, False, False, False, False],
+                    'nom_data': [100.0, 90.0, 0.0, 70, 0.1, 50],
+                    'uncrt': [0, 0, 0, 0, 0, 0],
+                },
+            },
+            'signal': {
+                'type': 'shapesys',
+                'name': 'syst',
+                'data': {
+                    'mask': [False, False, False, False, False, False],
+                    'nom_data': [20.0, 10.0, 5.0, 3.0, 2.0, 1.0],
+                    'uncrt': [10, 9, 1, 0.0, 0.1, 5],
+                },
+            },
+        },
+    }
+    hsc = shapesys_combined(
+        [('syst', 'shapesys'), ('syst_lowstats', 'shapesys')], mc, mega_mods
+    )
+
+    mod = hsc.apply(pyhf.tensorlib.astensor([-10, 1.1, 1.2, 1.3, -20, -30]))
+    shape = pyhf.tensorlib.shape(mod)
+    assert shape == (2, 2, 1, 6)
+
+    # expect the 'background' sample to have a single masked bin for 'syst'
+    assert mod[0, 1, 0, 2] == 1.0
+    # expect the 'background' sample to have all bins masked for 'syst_lowstats'
+    assert np.all(kappa == 1 for kappa in mod[1, 1, 0])
+
+
 def test_shapefactor(backend):
     mc = MockConfig(
         par_map={
diff --git a/tests/test_pdf.py b/tests/test_pdf.py
@@ -182,6 +182,56 @@ def test_pdf_integration_staterror(backend):
     )
 
 
+def test_pdf_integration_shapesys_zeros(backend):
+    spec = {
+        "channels": [
+            {
+                "name": "channel1",
+                "samples": [
+                    {
+                        "data": [20.0, 10.0, 5.0, 3.0, 2.0, 1.0],
+                        "modifiers": [
+                            {"data": None, "name": "mu", "type": "normfactor"}
+                        ],
+                        "name": "signal",
+                    },
+                    {
+                        "data": [100.0, 90, 0.0, 70, 0.1, 50],
+                        "modifiers": [
+                            {
+                                "data": [10, 9, 1, 0.0, 0.1, 5],
+                                "name": "syst",
+                                "type": "shapesys",
+                            },
+                            {
+                                "data": [0, 0, 0, 0, 0, 0],
+                                "name": "syst_lowstats",
+                                "type": "shapesys",
+                            },
+                        ],
+                        "name": "background1",
+                    },
+                ],
+            }
+        ]
+    }
+    pdf = pyhf.Model(spec)
+    par_set_syst = pdf.config.param_set('syst')
+    par_set_syst_lowstats = pdf.config.param_set('syst_lowstats')
+
+    assert par_set_syst.n_parameters == 4  # bin 2: zero yield, bin 3: zero uncertainty
+    assert par_set_syst_lowstats.n_parameters == 0  # every uncertainty is zero
+    tensorlib, _ = backend
+    nominal_sq = tensorlib.power(tensorlib.astensor([100.0, 90, 0.0, 70, 0.1, 50]), 2)
+    uncerts_sq = tensorlib.power(tensorlib.astensor([10, 9, 1, 0.0, 0.1, 5]), 2)
+    factors = tensorlib.divide(nominal_sq, uncerts_sq)  # (nominal / uncertainty) ** 2
+    indices = tensorlib.astensor([0, 1, 4, 5], dtype='int')  # bins keeping a parameter
+    assert pytest.approx(tensorlib.tolist(par_set_syst.factors)) == tensorlib.tolist(
+        tensorlib.gather(factors, indices)
+    )
+    assert getattr(par_set_syst_lowstats, 'factors', None) is None  # absent or None
+
+
 @pytest.mark.only_numpy
 def test_pdf_integration_histosys(backend):
     source = json.load(open('validation/data/2bin_histosys_example2.json'))
diff --git a/tests/test_validation.py b/tests/test_validation.py
@@ -4,6 +4,7 @@
 import json
 import pytest
 import os
+import numpy as np
 
 
 @pytest.fixture(scope='module')
@@ -846,3 +847,80 @@ def test_import_roundtrip(tmpdir, toplvl, basedir):
     assert abs(CLs_obs_after - CLs_obs_before) / CLs_obs_before < tolerance
     for result, expected_result in zip(CLs_exp_set_after, CLs_exp_set_before):
         assert abs(result - expected_result) / expected_result < tolerance
+
+
+def test_shapesys_nuisparfilter_validation():
+    reference_root_results = {  # expected CLs values the pyhf result is compared to
+        "CLs_exp": [
+            2.702197937866914e-05,
+            0.00037099917612576155,
+            0.004360634386335687,
+            0.03815031509701916,
+            0.20203027564155074,
+        ],
+        "CLs_obs": 0.004360634405484502,
+    }
+    null = None  # presumably lets the JSON-style spec below be pasted as-is; confirm
+    spec = {
+        "channels": [
+            {
+                "name": "channel1",
+                "samples": [
+                    {
+                        "data": [20, 10],
+                        "modifiers": [
+                            {
+                                "data": null,
+                                "name": "SigXsecOverSM",
+                                "type": "normfactor",
+                            }
+                        ],
+                        "name": "signal",
+                    },
+                    {
+                        "data": [100, 10],
+                        "modifiers": [  # the shapesys uncertainty is zero in the second bin
+                            {"data": [10, 0], "name": "syst", "type": "shapesys"}
+                        ],
+                        "name": "background1",
+                    },
+                ],
+            }
+        ],
+        "measurements": [
+            {
+                "config": {
+                    "parameters": [
+                        {
+                            "auxdata": [1],
+                            "bounds": [[0.5, 1.5]],
+                            "inits": [1],
+                            "name": "lumi",
+                            "sigmas": [0.1],
+                        }
+                    ],
+                    "poi": "SigXsecOverSM",
+                },
+                "name": "GaussExample",
+            }
+        ],
+        "observations": [{"data": [100, 10], "name": "channel1"}],
+        "version": "1.0.0",
+    }
+    w = pyhf.Workspace(spec)
+    m = w.model(
+        modifier_settings={
+            'normsys': {'interpcode': 'code4'},
+            'histosys': {'interpcode': 'code4p'},
+        },
+    )
+    d = w.data(m)
+    obs, exp = pyhf.infer.hypotest(1.0, d, m, return_expected_set=True)  # tested at mu=1
+    pyhf_results = {'CLs_obs': obs[0], 'CLs_exp': [e[0] for e in exp]}
+
+    assert np.allclose(
+        reference_root_results['CLs_obs'], pyhf_results['CLs_obs'], atol=1e-4, rtol=1e-5
+    )
+    assert np.allclose(
+        reference_root_results['CLs_exp'], pyhf_results['CLs_exp'], atol=1e-4, rtol=1e-5
+    )