|
| 1 | +import numpy as np |
| 2 | +import mpmath |
| 3 | +from mpmath import mp |
| 4 | + |
| 5 | + |
| 6 | +class ReferenceDistribution: |
| 7 | + """Minimalist distribution infrastructure for generating reference data. |
| 8 | +
|
| 9 | + The purpose is to generate reference values for unit tests of SciPy |
| 10 | + distribution accuracy and robustness. |
| 11 | +
|
| 12 | + Handles array input with standard broadcasting rules, and method |
| 13 | + implementations are easily compared against their mathematical definitions. |
| 14 | + No attempt is made to handle edge cases or be fast, and arbitrary precision |
| 15 | + arithmetic is trusted for accuracy rather than making the method |
| 16 | + implementations "smart". |
| 17 | +
|
| 18 | + Notes |
| 19 | + ----- |
| 20 | +
|
| 21 | + In this infrastructure, distributions families are classes, and |
| 22 | + fully-specified distributions (i.e. with definite values of all family |
| 23 | + parameters) are instances of these classes. Typically, the public methods |
| 24 | + accept as input only the argument at which the at which the function is to |
| 25 | + be evaluated. Unlike SciPy distributions, they never accept values of |
| 26 | + distribution family shape, location, or scale parameters. A few |
| 27 | + other parameters are noteworthy: |
| 28 | +
|
| 29 | + - All methods accept `dtype` to control the output data type. The default |
| 30 | + is `np.float64`, but `object` or `mp.mpf` may be |
| 31 | + specified to output the full `mpf`. |
| 32 | + - `ppf`/`isf` accept a `guess` because they use a scalar rootfinder |
| 33 | + to invert the `cdf`/`sf`. This is passed directly into the `x0` method |
| 34 | + of `mpmath.findroot`; see its documentation for details. |
| 35 | + - moment accepts `order`, an integer that specifies the order of the (raw) |
| 36 | + moment, and `center`, which is the value about which the moment is |
| 37 | + taken. The default is to calculate the mean and use it to calculate |
| 38 | + central moments; passing ``0`` results in a noncentral moment. For |
| 39 | + efficiency, the mean can be passed explicitly if it is already known. |
| 40 | +
|
| 41 | + Follow the example of SkewNormal to generate new reference distributions, |
| 42 | + overriding only `__init__` and `_pdf`*. Use the reference distributions to |
| 43 | + generate reference values for unit tests of SciPy distribution method |
| 44 | + precision and robustness (e.g. for extreme arguments). If the a SciPy |
| 45 | + methods implementation is independent and yet the output matches reference |
| 46 | + values generated with this infrastructure, it is unlikely that the SciPy |
| 47 | + and reference values are both inaccurate. |
| 48 | +
|
| 49 | + * If the SciPy output *doesn't* match and the cause appears to be |
| 50 | + inaccuracy of the reference values (e.g. due to numerical issues that |
| 51 | + mpmath's arbitrary precision arithmetic doesn't handle), then it may be |
| 52 | + appropriate to override a method of the reference distribution rather than |
| 53 | + relying on the generic implementation. Otherwise, hesitate to override |
| 54 | + methods: the generic implementations are mathematically correct and easy |
| 55 | + to verify, whereas an override introduces many possibilities of mistakes, |
| 56 | + requires more time to write, and requires more time to review. |
| 57 | +
|
| 58 | + In general, do not create custom unit tests to ensure that |
| 59 | + SciPy distribution methods are *correct* (in the sense of being consistent |
| 60 | + with the rest of the distribution methods); generic tests take care of |
| 61 | + that. |
| 62 | + """ |
| 63 | + |
| 64 | + def __init__(self, **kwargs): |
| 65 | + try: |
| 66 | + if mpmath.dps is not None: |
| 67 | + message = ( |
| 68 | + "`mpmath.dps` has been assigned. This is not " |
| 69 | + "intended usage; instead, assign the desired " |
| 70 | + "precision to `mpmath.mp.dps` (e.g. `from mpmath " |
| 71 | + "as mp; mp.dps = 50." |
| 72 | + ) |
| 73 | + raise RuntimeError(message) |
| 74 | + except AttributeError: |
| 75 | + mpmath.dps = None |
| 76 | + |
| 77 | + if mp.dps <= 15: |
| 78 | + message = ( |
| 79 | + "`mpmath.mp.dps <= 15`. Set a higher precision (e.g." |
| 80 | + "`50`) to use this distribution." |
| 81 | + ) |
| 82 | + raise RuntimeError(message) |
| 83 | + |
| 84 | + self._params = {key: self._make_mpf_array(val) for key, val in kwargs.items()} |
| 85 | + |
| 86 | + def _make_mpf_array(self, x): |
| 87 | + shape = np.shape(x) |
| 88 | + x = np.asarray(x, dtype=np.float64).ravel() |
| 89 | + return np.asarray([mp.mpf(xi) for xi in x]).reshape(shape)[()] |
| 90 | + |
| 91 | + def _pdf(self, x): |
| 92 | + raise NotImplementedError("_pdf must be overridden.") |
| 93 | + |
| 94 | + def _cdf(self, x, **kwargs): |
| 95 | + if (self._cdf.__func__ is ReferenceDistribution._cdf) and ( |
| 96 | + self._sf.__func__ is not ReferenceDistribution._sf |
| 97 | + ): |
| 98 | + return mp.one - self._sf(x, **kwargs) |
| 99 | + |
| 100 | + a, b = self._support(**kwargs) |
| 101 | + res = mp.quad(lambda x: self._pdf(x, **kwargs), (a, x)) |
| 102 | + res = res if res < 0.5 else mp.one - self._sf(x, **kwargs) |
| 103 | + return res |
| 104 | + |
| 105 | + def _sf(self, x, **kwargs): |
| 106 | + if (self._sf.__func__ is ReferenceDistribution._sf) and ( |
| 107 | + self._cdf.__func__ is not ReferenceDistribution._cdf |
| 108 | + ): |
| 109 | + return mp.one - self._cdf(x, **kwargs) |
| 110 | + |
| 111 | + a, b = self._support(**kwargs) |
| 112 | + res = mp.quad(lambda x: self._pdf(x, **kwargs), (x, b)) |
| 113 | + res = res if res < 0.5 else mp.one - self._cdf(x, **kwargs) |
| 114 | + return res |
| 115 | + |
| 116 | + def _ppf(self, p, guess=0, **kwargs): |
| 117 | + if (self._ppf.__func__ is ReferenceDistribution._ppf) and ( |
| 118 | + self._isf.__func__ is not ReferenceDistribution._isf |
| 119 | + ): |
| 120 | + return self._isf(mp.one - p, guess, **kwargs) |
| 121 | + |
| 122 | + def f(x): |
| 123 | + return self._cdf(x, **kwargs) - p |
| 124 | + |
| 125 | + return mp.findroot(f, guess) |
| 126 | + |
| 127 | + def _isf(self, p, guess=0, **kwargs): |
| 128 | + if (self._isf.__func__ is ReferenceDistribution._isf) and ( |
| 129 | + self._ppf.__func__ is not ReferenceDistribution._ppf |
| 130 | + ): |
| 131 | + return self._ppf(mp.one - p, guess, **kwargs) |
| 132 | + |
| 133 | + def f(x): |
| 134 | + return self._sf(x, **kwargs) - p |
| 135 | + |
| 136 | + return mp.findroot(f, guess) |
| 137 | + |
| 138 | + def _logpdf(self, x, **kwargs): |
| 139 | + return mp.log(self._pdf(x, **kwargs)) |
| 140 | + |
| 141 | + def _logcdf(self, x, **kwargs): |
| 142 | + return mp.log(self._cdf(x, **kwargs)) |
| 143 | + |
| 144 | + def _logsf(self, x, **kwargs): |
| 145 | + return mp.log(self._sf(x, **kwargs)) |
| 146 | + |
| 147 | + def _support(self, **kwargs): |
| 148 | + return -mp.inf, mp.inf |
| 149 | + |
| 150 | + def _entropy(self, **kwargs): |
| 151 | + def integrand(x): |
| 152 | + logpdf = self._logpdf(x, **kwargs) |
| 153 | + pdf = mp.exp(logpdf) |
| 154 | + return -pdf * logpdf |
| 155 | + |
| 156 | + a, b = self._support(**kwargs) |
| 157 | + return mp.quad(integrand, (a, b)) |
| 158 | + |
| 159 | + def _mean(self, **kwargs): |
| 160 | + return self._moment(order=1, center=0, **kwargs) |
| 161 | + |
| 162 | + def _var(self, **kwargs): |
| 163 | + mu = self._mean(**kwargs) |
| 164 | + return self._moment(order=2, center=mu, **kwargs) |
| 165 | + |
| 166 | + def _skew(self, **kwargs): |
| 167 | + mu = self._mean(**kwargs) |
| 168 | + u2 = self._moment(order=2, center=mu, **kwargs) |
| 169 | + sigma = mp.sqrt(u2) |
| 170 | + u3 = self._moment(order=3, center=mu, **kwargs) |
| 171 | + return u3 / sigma**3 |
| 172 | + |
| 173 | + def _kurtosis(self, **kwargs): |
| 174 | + mu = self._mean(**kwargs) |
| 175 | + u2 = self._moment(order=2, center=mu, **kwargs) |
| 176 | + u4 = self._moment(order=4, center=mu, **kwargs) |
| 177 | + return u4 / u2**2 - 3 |
| 178 | + |
| 179 | + def _moment(self, order, center, **kwargs): |
| 180 | + def integrand(x): |
| 181 | + return self._pdf(x, **kwargs) * (x - center) ** order |
| 182 | + |
| 183 | + if center is None: |
| 184 | + center = self._mean(**kwargs) |
| 185 | + |
| 186 | + a, b = self._support(**kwargs) |
| 187 | + return mp.quad(integrand, (a, b)) |
| 188 | + |
| 189 | + def pdf(self, x, dtype=np.float64): |
| 190 | + fun = np.vectorize(self._pdf) |
| 191 | + x = self._make_mpf_array(x) |
| 192 | + res = fun(x, **self._params) |
| 193 | + return np.asarray(res, dtype=dtype)[()] |
| 194 | + |
| 195 | + def cdf(self, x, dtype=np.float64): |
| 196 | + fun = np.vectorize(self._cdf) |
| 197 | + x = self._make_mpf_array(x) |
| 198 | + res = fun(x, **self._params) |
| 199 | + return np.asarray(res, dtype=dtype)[()] |
| 200 | + |
| 201 | + def sf(self, x, dtype=np.float64): |
| 202 | + fun = np.vectorize(self._sf) |
| 203 | + x = self._make_mpf_array(x) |
| 204 | + res = fun(x, **self._params) |
| 205 | + return np.asarray(res, dtype=dtype)[()] |
| 206 | + |
| 207 | + def ppf(self, x, guess=0, dtype=np.float64): |
| 208 | + fun = np.vectorize(self._ppf, excluded={1}) # don't vectorize guess |
| 209 | + x = self._make_mpf_array(x) |
| 210 | + res = fun(x, guess, **self._params) |
| 211 | + return np.asarray(res, dtype=dtype)[()] |
| 212 | + |
| 213 | + def isf(self, x, guess=0, dtype=np.float64): |
| 214 | + fun = np.vectorize(self._isf, excluded={1}) # don't vectorize guess |
| 215 | + x = self._make_mpf_array(x) |
| 216 | + res = fun(x, guess, **self._params) |
| 217 | + return np.asarray(res, dtype=dtype)[()] |
| 218 | + |
| 219 | + def logpdf(self, x, dtype=np.float64): |
| 220 | + fun = np.vectorize(self._logpdf) |
| 221 | + x = self._make_mpf_array(x) |
| 222 | + res = fun(x, **self._params) |
| 223 | + return np.asarray(res, dtype=dtype)[()] |
| 224 | + |
| 225 | + def logcdf(self, x, dtype=np.float64): |
| 226 | + fun = np.vectorize(self._logcdf) |
| 227 | + x = self._make_mpf_array(x) |
| 228 | + res = fun(x, **self._params) |
| 229 | + return np.asarray(res, dtype=dtype)[()] |
| 230 | + |
| 231 | + def logsf(self, x, dtype=np.float64): |
| 232 | + fun = np.vectorize(self._logsf) |
| 233 | + x = self._make_mpf_array(x) |
| 234 | + res = fun(x, **self._params) |
| 235 | + return np.asarray(res, dtype=dtype)[()] |
| 236 | + |
| 237 | + def support(self, dtype=np.float64): |
| 238 | + fun = np.vectorize(self._support) |
| 239 | + res = fun(**self._params) |
| 240 | + return np.asarray(res, dtype=dtype)[()] |
| 241 | + |
| 242 | + def entropy(self, dtype=np.float64): |
| 243 | + fun = np.vectorize(self._entropy) |
| 244 | + res = fun(**self._params) |
| 245 | + return np.asarray(res, dtype=dtype)[()] |
| 246 | + |
| 247 | + def mean(self, dtype=np.float64): |
| 248 | + fun = np.vectorize(self._mean) |
| 249 | + res = fun(**self._params) |
| 250 | + return np.asarray(res, dtype=dtype)[()] |
| 251 | + |
| 252 | + def var(self, dtype=np.float64): |
| 253 | + fun = np.vectorize(self._var) |
| 254 | + res = fun(**self._params) |
| 255 | + return np.asarray(res, dtype=dtype)[()] |
| 256 | + |
| 257 | + def skew(self, dtype=np.float64): |
| 258 | + fun = np.vectorize(self._skew) |
| 259 | + res = fun(**self._params) |
| 260 | + return np.asarray(res, dtype=dtype)[()] |
| 261 | + |
| 262 | + def kurtosis(self, dtype=np.float64): |
| 263 | + fun = np.vectorize(self._kurtosis) |
| 264 | + res = fun(**self._params) |
| 265 | + return np.asarray(res, dtype=dtype)[()] |
| 266 | + |
| 267 | + def moment(self, order, center=None, dtype=np.float64): |
| 268 | + fun = np.vectorize(self._moment) |
| 269 | + order = self._make_mpf_array(order) |
| 270 | + res = fun(order, **self._params) |
| 271 | + return np.asarray(res, dtype=dtype)[()] |
| 272 | + |
| 273 | + |
| 274 | +class LogNormal(ReferenceDistribution): |
| 275 | + def __init__(self, *, s): |
| 276 | + super().__init__(s=s) |
| 277 | + |
| 278 | + def _support(self, s): |
| 279 | + return 0, mp.inf |
| 280 | + |
| 281 | + def _pdf(self, x, s): |
| 282 | + return ( |
| 283 | + mp.one |
| 284 | + / (s * x * mp.sqrt(2 * mp.pi)) |
| 285 | + * mp.exp(-mp.one / 2 * (mp.log(x) / s) ** 2) |
| 286 | + ) |
| 287 | + |
| 288 | + def _cdf(self, x, s): |
| 289 | + return mp.ncdf(mp.log(x) / s) |
| 290 | + |
| 291 | + |
| 292 | +class Normal(ReferenceDistribution): |
| 293 | + def _pdf(self, x): |
| 294 | + return mp.npdf(x) |
0 commit comments