Skip to content

Commit 19a4564

Browse files
committed
refactor some numeric stuff into algo.py
1 parent 7c68aba commit 19a4564

File tree

4 files changed

+293
-262
lines changed

4 files changed

+293
-262
lines changed

probscale/algo.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
import numpy
2+
3+
4+
def _make_boot_index(elements, niter):
5+
""" Generate an array of bootstrap sample sets
6+
7+
Parameters
8+
----------
9+
elements : int
10+
The number of rows in the original dataset.
11+
niter : int
12+
Number of iteration for the bootstrapping.
13+
14+
Returns
15+
-------
16+
index : numpy array
17+
A collection of random *indices* that can be used to randomly
18+
sample a dataset ``niter`` times.
19+
20+
"""
21+
return numpy.random.randint(low=0, high=elements, size=(niter, elements))
22+
23+
24+
def _fit_simple(x, y, xhat, fitlogs=None):
25+
"""
26+
Simple linear fit of x and y data using ``numpy.polyfit``.
27+
28+
Parameters
29+
----------
30+
x, y : array-like
31+
fitlogs : str, optional.
32+
Defines which data should be log-transformed. Valid values are
33+
'x', 'y', or 'both'.
34+
35+
Returns
36+
-------
37+
xhat, yhat : array-like
38+
Estimates of x and y based on the linear fit
39+
results : dict
40+
Dictionary of the fit coefficients
41+
42+
See also
43+
--------
44+
numpy.polyfit
45+
46+
"""
47+
48+
# do the best-fit
49+
coeffs = numpy.polyfit(x, y, 1)
50+
51+
results = {
52+
'slope': coeffs[0],
53+
'intercept': coeffs[1]
54+
}
55+
56+
# estimate y values
57+
yhat = _estimate_from_fit(xhat, coeffs[0], coeffs[1],
58+
xlog=fitlogs in ['x', 'both'],
59+
ylog=fitlogs in ['y', 'both'])
60+
61+
return yhat, results
62+
63+
64+
def _bs_resid(x, y, xhat, fitlogs=None, niter=10000, alpha=0.05):
65+
index = _make_boot_index(len(x), niter)
66+
yhat, results = _fit_simple(x, y, xhat, fitlogs=fitlogs)
67+
resid = y - yhat
68+
bs_y = y + resid[index]
69+
70+
percentiles = 100 * numpy.array([alpha*0.5, 1 - alpha*0.5])
71+
yhat_lo, yhat_hi = numpy.percentile(bs_y, percentiles, axis=0)
72+
return yhat_lo, yhat_hi
73+
74+
75+
def _bs_fit(x, y, xhat, fitlogs=None, niter=10000, alpha=0.05):
76+
"""
77+
Percentile method bootstrapping of linear fit of x and y data using
78+
``numpy.polyfit``.
79+
80+
Parameters
81+
----------
82+
x, y : array-like
83+
fitlogs : str, optional.
84+
Defines which data should be log-transformed. Valid values are
85+
'x', 'y', or 'both'.
86+
niter : int, optional (default is 10000)
87+
Number of bootstrap iterations to use
88+
alpha : float, optional
89+
Confidence level of the estimate.
90+
91+
Returns
92+
-------
93+
xhat, yhat : array-like
94+
Estimates of x and y based on the linear fit
95+
results : dict
96+
Dictionary of the fit coefficients
97+
98+
See also
99+
--------
100+
numpy.polyfit
101+
102+
"""
103+
104+
index = _make_boot_index(len(x), niter)
105+
yhat_array = numpy.array([
106+
_fit_simple(x[ii], y[ii], xhat, fitlogs=fitlogs)[0]
107+
for ii in index
108+
])
109+
110+
percentiles = 100 * numpy.array([alpha*0.5, 1 - alpha*0.5])
111+
yhat_lo, yhat_hi = numpy.percentile(yhat_array, percentiles, axis=0)
112+
return yhat_lo, yhat_hi
113+
114+
115+
def _estimate_from_fit(xhat, slope, intercept, xlog=False, ylog=False):
116+
""" Estimate the dependent variables of a linear fit given x-data
117+
and linear parameters.
118+
119+
Parameters
120+
----------
121+
xhat : numpy array or pandas Series/DataFrame
122+
The input independent variable of the fit
123+
slope : float
124+
Slope of the best-fit line
125+
intercept : float
126+
y-intercept of the best-fit line
127+
xlog, ylog : bool (default = False)
128+
Toggles whether or not the logs of the x- or y- data should be
129+
used to perform the regression.
130+
131+
Returns
132+
-------
133+
yhat : numpy array
134+
Estimate of the dependent variable.
135+
136+
"""
137+
138+
xhat = numpy.asarray(xhat)
139+
if ylog:
140+
if xlog:
141+
yhat = numpy.exp(intercept) * xhat ** slope
142+
else:
143+
yhat = numpy.exp(intercept) * numpy.exp(slope) ** xhat
144+
145+
else:
146+
if xlog:
147+
yhat = slope * numpy.log(xhat) + intercept
148+
149+
else:
150+
yhat = slope * xhat + intercept
151+
152+
return yhat

probscale/tests/test_algo.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
import numpy
2+
3+
import pytest
4+
import numpy.testing as nptest
5+
6+
from probscale import algo
7+
8+
9+
def test__make_boot_index():
10+
result = algo._make_boot_index(5, 5000)
11+
assert result.shape == (5000, 5)
12+
assert result.min() == 0
13+
assert result.max() == 4
14+
15+
16+
@pytest.fixture
17+
def plot_data():
18+
data = numpy.array([
19+
3.113, 3.606, 4.046, 4.046, 4.710, 6.140, 6.978,
20+
2.000, 4.200, 4.620, 5.570, 5.660, 5.860, 6.650,
21+
6.780, 6.790, 7.500, 7.500, 7.500, 8.630, 8.710,
22+
8.990, 9.850, 10.820, 11.250, 11.250, 12.200, 14.920,
23+
16.770, 17.810, 19.160, 19.190, 19.640, 20.180, 22.970,
24+
])
25+
return data
26+
27+
28+
@pytest.mark.parametrize(('fitlogs', 'known_yhat'), [
29+
(None, numpy.array([0.7887, 3.8946, 7.0005, 10.1065, 13.2124, 16.3183])),
30+
('x', numpy.array([0.2711, 1.2784, 1.5988, 1.7953, 1.9373, 2.0487])),
31+
('y', numpy.array([2.2006e+00, 4.9139e+01, 1.0972e+03, 2.4501e+04, 5.4711e+05, 1.2217e+07])),
32+
('both', numpy.array([1.3114, 3.5908, 4.9472, 6.0211, 6.9402, 7.7577])),
33+
])
34+
def test__fit_simple(plot_data, fitlogs, known_yhat):
35+
x = numpy.arange(1, len(plot_data)+1)
36+
known_results = {'slope': 0.5177, 'intercept': 0.2711}
37+
xhat = x[::6]
38+
yhat, results = algo._fit_simple(x, plot_data, xhat, fitlogs=fitlogs)
39+
assert abs(results['intercept'] - known_results['intercept']) < 0.0001
40+
assert abs(results['slope'] - known_results['slope']) < 0.0001
41+
nptest.assert_allclose(yhat, known_yhat, rtol=0.0001)
42+
43+
44+
@pytest.mark.parametrize(('fitlogs', 'known_lo', 'known_hi'), [
45+
(None, numpy.array([-0.7944, 2.7051, 6.1974, 9.2612, 11.9382, 14.4290]),
46+
numpy.array([ 2.1447, 4.8360, 7.7140, 10.8646, 14.1014, 17.4432])),
47+
('x', numpy.array([-1.4098, -0.2210, 0.1387, 0.3585, 0.5147, 0.6417]),
48+
numpy.array([ 1.7067, 2.5661, 2.8468, 3.0169, 3.1400, 3.2341])),
49+
('y', numpy.array([4.5187e-01, 1.4956e+01, 4.9145e+02, 1.0522e+04, 1.5299e+05, 1.8468e+06]),
50+
numpy.array([8.5396e+00, 1.2596e+02, 2.2396e+03, 5.2290e+04, 1.3310e+06, 3.7627e+07])),
51+
('both', numpy.array([0.2442, 0.8017, 1.1488, 1.4312, 1.6731, 1.8997]),
52+
numpy.array([5.5107, 13.0148 , 17.232, 20.4285, 23.1035, 25.3843])),
53+
])
54+
def test__bs_fit(plot_data, fitlogs, known_lo, known_hi):
55+
numpy.random.seed(0)
56+
x = numpy.arange(1, len(plot_data)+1)
57+
xhat = x[::6]
58+
yhat_lo, yhat_hi = algo._bs_fit(x, plot_data, xhat, fitlogs=fitlogs, niter=1000)
59+
60+
nptest.assert_allclose(yhat_lo, known_lo, rtol=0.001)
61+
nptest.assert_allclose(yhat_hi, known_hi, rtol=0.001)
62+
63+
64+
class Test__estimate_from_fit(object):
65+
def setup(self):
66+
self.x = numpy.arange(1, 11, 0.5)
67+
self.slope = 2
68+
self.intercept = 3.5
69+
70+
self.known_ylinlin = numpy.array([
71+
5.5, 6.5, 7.5, 8.5, 9.5, 10.5, 11.5, 12.5, 13.5,
72+
14.5, 15.5, 16.5, 17.5, 18.5, 19.5, 20.5, 21.5, 22.5,
73+
23.5, 24.5
74+
])
75+
76+
77+
self.known_yloglin = numpy.array([
78+
3.5 , 4.31093022, 4.88629436, 5.33258146, 5.69722458,
79+
6.00552594, 6.27258872, 6.50815479, 6.71887582, 6.90949618,
80+
7.08351894, 7.24360435, 7.3918203 , 7.52980604, 7.65888308,
81+
7.78013233, 7.89444915, 8.0025836 , 8.10517019, 8.20275051
82+
])
83+
84+
self.known_yloglog = numpy.array([
85+
33.11545196, 74.50976691, 132.46180783, 206.97157474,
86+
298.03906763, 405.66428649, 529.84723134, 670.58790216,
87+
827.88629897, 1001.74242175, 1192.15627051, 1399.12784525,
88+
1622.65714598, 1862.74417268, 2119.38892536, 2392.59140402,
89+
2682.35160865, 2988.66953927, 3311.54519587, 3650.97857845
90+
])
91+
92+
self.known_ylinlog = numpy.array([
93+
2.44691932e+02, 6.65141633e+02, 1.80804241e+03,
94+
4.91476884e+03, 1.33597268e+04, 3.63155027e+04,
95+
9.87157710e+04, 2.68337287e+05, 7.29416370e+05,
96+
1.98275926e+06, 5.38969848e+06, 1.46507194e+07,
97+
3.98247844e+07, 1.08254988e+08, 2.94267566e+08,
98+
7.99902177e+08, 2.17435955e+09, 5.91052206e+09,
99+
1.60664647e+10, 4.36731791e+10
100+
])
101+
102+
def test_linlin(self):
103+
ylinlin = algo._estimate_from_fit(self.x, self.slope, self.intercept,
104+
xlog=False, ylog=False)
105+
nptest.assert_array_almost_equal(ylinlin, self.known_ylinlin)
106+
107+
def test_loglin(self):
108+
yloglin = algo._estimate_from_fit(self.x, self.slope, self.intercept,
109+
xlog=True, ylog=False)
110+
nptest.assert_array_almost_equal(yloglin, self.known_yloglin)
111+
112+
def test_loglog(self):
113+
yloglog = algo._estimate_from_fit(self.x, self.slope, self.intercept,
114+
xlog=True, ylog=True)
115+
nptest.assert_array_almost_equal(yloglog, self.known_yloglog)
116+
117+
def test_linlog(self):
118+
ylinlog = algo._estimate_from_fit(self.x, self.slope, self.intercept,
119+
xlog=False, ylog=True)
120+
percent_diff = numpy.abs(ylinlog - self.known_ylinlog) / self.known_ylinlog
121+
nptest.assert_array_almost_equal(
122+
percent_diff,
123+
numpy.zeros(self.x.shape[0]),
124+
decimal=5
125+
)

0 commit comments

Comments
 (0)