Skip to content

Commit b851187

Browse files
refactored single test unit tests to ease phasing in of new noise models
1 parent b5fe1cd commit b851187

File tree

3 files changed

+507
-213
lines changed

3 files changed

+507
-213
lines changed

diffxpy/unit_test/test_single_de.py

Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
import unittest
2+
import logging
3+
import numpy as np
4+
import pandas as pd
5+
import scipy.stats as stats
6+
7+
from batchglm.api.models.glm_nb import Simulator
8+
import diffxpy.api as de
9+
10+
11+
class _TestSingleDE:
12+
13+
def _prepare_data(self, n_cells: int = 2000, n_genes: int = 100):
14+
"""
15+
16+
:param n_cells: Number of cells to simulate (number of observations per test).
17+
:param n_genes: Number of genes to simulate (number of tests).
18+
"""
19+
num_non_de = n_genes // 2
20+
sim = Simulator(num_observations=n_cells, num_features=n_genes)
21+
sim.generate_sample_description(num_batches=0, num_conditions=2)
22+
sim.generate_params(
23+
rand_fn_ave=lambda shape: np.random.poisson(500, shape) + 1,
24+
rand_fn=lambda shape: np.abs(np.random.uniform(1, 0.5, shape))
25+
)
26+
sim.params["a_var"][1, :num_non_de] = 0
27+
sim.params["b_var"][1, :num_non_de] = 0
28+
sim.params["isDE"] = ("features",), np.arange(n_genes) >= num_non_de
29+
sim.generate_data()
30+
31+
return sim
32+
33+
def _eval(self, sim, test):
34+
idx_de = np.where(sim.params["isDE"] == True)[0]
35+
idx_nonde = np.where(sim.params["isDE"] == False)[0]
36+
37+
frac_de_of_non_de = np.sum(test.qval[idx_nonde] < 0.05) / len(idx_nonde)
38+
frac_de_of_de = np.sum(test.qval[idx_de] < 0.05) / len(idx_de)
39+
40+
logging.getLogger("diffxpy").info(
41+
'fraction of non-DE genes with q-value < 0.05: %.1f%%' %
42+
float(100 * frac_de_of_non_de)
43+
)
44+
logging.getLogger("diffxpy").info(
45+
'fraction of DE genes with q-value < 0.05: %.1f%%' %
46+
float(100 * frac_de_of_de)
47+
)
48+
assert frac_de_of_non_de <= 0.1, "too many false-positives"
49+
assert frac_de_of_de >= 0.5, "too many false-negatives"
50+
51+
return sim
52+
53+
def _test_rank_de(self, n_cells: int = 2000, n_genes: int = 100):
54+
"""
55+
:param n_cells: Number of cells to simulate (number of observations per test).
56+
:param n_genes: Number of genes to simulate (number of tests).
57+
"""
58+
logging.getLogger("tensorflow").setLevel(logging.ERROR)
59+
logging.getLogger("batchglm").setLevel(logging.WARNING)
60+
logging.getLogger("diffxpy").setLevel(logging.WARNING)
61+
62+
sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes)
63+
64+
test = de.test.rank_test(
65+
data=sim.X,
66+
grouping="condition",
67+
sample_description=sim.sample_description,
68+
dtype="float64"
69+
)
70+
71+
self._eval(sim=sim, test=test)
72+
73+
return True
74+
75+
def _test_t_test_de(self, n_cells: int = 2000, n_genes: int = 100):
76+
"""
77+
:param n_cells: Number of cells to simulate (number of observations per test).
78+
:param n_genes: Number of genes to simulate (number of tests).
79+
"""
80+
logging.getLogger("tensorflow").setLevel(logging.ERROR)
81+
logging.getLogger("batchglm").setLevel(logging.WARNING)
82+
logging.getLogger("diffxpy").setLevel(logging.WARNING)
83+
84+
sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes)
85+
86+
test = de.test.t_test(
87+
data=sim.X,
88+
grouping="condition",
89+
sample_description=sim.sample_description,
90+
dtype="float64"
91+
)
92+
93+
self._eval(sim=sim, test=test)
94+
95+
return True
96+
97+
def _test_wald_de(
98+
self,
99+
n_cells: int,
100+
n_genes: int,
101+
noise_model: str
102+
):
103+
"""
104+
:param n_cells: Number of cells to simulate (number of observations per test).
105+
:param n_genes: Number of genes to simulate (number of tests).
106+
:param noise_model: Noise model to use for data fitting.
107+
"""
108+
logging.getLogger("tensorflow").setLevel(logging.ERROR)
109+
logging.getLogger("batchglm").setLevel(logging.WARNING)
110+
logging.getLogger("diffxpy").setLevel(logging.WARNING)
111+
112+
sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes)
113+
114+
test = de.test.wald(
115+
data=sim.X,
116+
factor_loc_totest="condition",
117+
formula_loc="~ 1 + condition",
118+
sample_description=sim.sample_description,
119+
noise_model=noise_model,
120+
training_strategy="DEFAULT",
121+
dtype="float64"
122+
)
123+
124+
self._eval(sim=sim, test=test)
125+
126+
return True
127+
128+
def _test_lrt_de(
129+
self,
130+
n_cells: int,
131+
n_genes: int,
132+
noise_model: str
133+
):
134+
"""
135+
:param n_cells: Number of cells to simulate (number of observations per test).
136+
:param n_genes: Number of genes to simulate (number of tests).
137+
:param noise_model: Noise model to use for data fitting.
138+
"""
139+
logging.getLogger("tensorflow").setLevel(logging.ERROR)
140+
logging.getLogger("batchglm").setLevel(logging.WARNING)
141+
logging.getLogger("diffxpy").setLevel(logging.WARNING)
142+
143+
sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes)
144+
145+
test = de.test.lrt(
146+
data=sim.X,
147+
full_formula_loc="~ 1 + condition",
148+
full_formula_scale="~ 1",
149+
reduced_formula_loc="~ 1",
150+
reduced_formula_scale="~ 1",
151+
sample_description=sim.sample_description,
152+
noise_model=noise_model,
153+
training_strategy="DEFAULT",
154+
dtype="float64"
155+
)
156+
157+
self._eval(sim=sim, test=test)
158+
159+
return True
160+
161+
162+
class TestSingleDE_STANDARD(_TestSingleDE, unittest.TestCase):
163+
"""
164+
Noise model-independent tests unit tests that tests false positive and false negative rates.
165+
"""
166+
167+
def test_ttest_de(
168+
self,
169+
n_cells: int = 2000,
170+
n_genes: int = 200
171+
):
172+
"""
173+
:param n_cells: Number of cells to simulate (number of observations per test).
174+
:param n_genes: Number of genes to simulate (number of tests).
175+
"""
176+
return self._test_t_test_de(
177+
n_cells=n_cells,
178+
n_genes=n_genes
179+
)
180+
181+
def test_rank_de(
182+
self,
183+
n_cells: int = 2000,
184+
n_genes: int = 200
185+
):
186+
"""
187+
:param n_cells: Number of cells to simulate (number of observations per test).
188+
:param n_genes: Number of genes to simulate (number of tests).
189+
"""
190+
return self._test_rank_de(
191+
n_cells=n_cells,
192+
n_genes=n_genes
193+
)
194+
195+
196+
class TestSingleDE_NB(_TestSingleDE, unittest.TestCase):
197+
"""
198+
Negative binomial noise model unit tests that tests false positive and false negative rates.
199+
"""
200+
201+
def test_wald_de_nb(
202+
self,
203+
n_cells: int = 2000,
204+
n_genes: int = 200
205+
):
206+
"""
207+
:param n_cells: Number of cells to simulate (number of observations per test).
208+
:param n_genes: Number of genes to simulate (number of tests).
209+
"""
210+
return self._test_wald_de(
211+
n_cells=n_cells,
212+
n_genes=n_genes,
213+
noise_model="nb"
214+
)
215+
216+
def test_lrt_de_nb(
217+
self,
218+
n_cells: int = 2000,
219+
n_genes: int = 200
220+
):
221+
"""
222+
:param n_cells: Number of cells to simulate (number of observations per test).
223+
:param n_genes: Number of genes to simulate (number of tests).
224+
"""
225+
return self._test_lrt_de(
226+
n_cells=n_cells,
227+
n_genes=n_genes,
228+
noise_model="nb"
229+
)
230+
231+
232+
class TestSingleDE_NORM(_TestSingleDE, unittest.TestCase):
233+
"""
234+
Normal noise model unit tests that tests false positive and false negative rates.
235+
"""
236+
237+
def test_wald_de_norm(
238+
self,
239+
n_cells: int = 2000,
240+
n_genes: int = 200
241+
):
242+
"""
243+
:param n_cells: Number of cells to simulate (number of observations per test).
244+
:param n_genes: Number of genes to simulate (number of tests).
245+
"""
246+
return self._test_wald_de(
247+
n_cells=n_cells,
248+
n_genes=n_genes,
249+
noise_model="norm"
250+
)
251+
252+
def test_lrt_de_norm(
253+
self,
254+
n_cells: int = 2000,
255+
n_genes: int = 200
256+
):
257+
"""
258+
:param n_cells: Number of cells to simulate (number of observations per test).
259+
:param n_genes: Number of genes to simulate (number of tests).
260+
"""
261+
return self._test_lrt_de(
262+
n_cells=n_cells,
263+
n_genes=n_genes,
264+
noise_model="norm"
265+
)
266+
267+
if __name__ == '__main__':
268+
unittest.main()
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
import unittest
2+
import logging
3+
import numpy as np
4+
import pandas as pd
5+
import scipy.stats as stats
6+
7+
from batchglm.api.models.glm_nb import Simulator
8+
import diffxpy.api as de
9+
10+
11+
class TestSingleExternalLibs(unittest.TestCase):
12+
13+
def _prepare_data(self, n_cells: int = 2000, n_genes: int = 100):
14+
"""
15+
16+
:param n_cells: Number of cells to simulate (number of observations per test).
17+
:param n_genes: Number of genes to simulate (number of tests).
18+
"""
19+
sim = Simulator(num_observations=n_cells, num_features=n_genes)
20+
sim.generate_sample_description(num_batches=0, num_conditions=2)
21+
sim.generate_params(
22+
rand_fn_ave=lambda shape: np.random.poisson(500, shape) + 1,
23+
rand_fn=lambda shape: np.abs(np.random.uniform(1, 0.5, shape))
24+
)
25+
sim.generate_data()
26+
27+
return sim
28+
29+
def _eval(self, test, ref_pvals):
30+
test_pval = test.pval
31+
pval_dev = np.abs(test_pval - ref_pvals)
32+
log_pval_dev = np.abs(np.log(test_pval+1e-200) - np.log(ref_pvals+1e-200))
33+
max_dev = np.max(pval_dev)
34+
max_log_dev = np.max(log_pval_dev)
35+
mean_dev = np.mean(log_pval_dev)
36+
logging.getLogger("diffxpy").info(
37+
'maximum absolute p-value deviation: %f' %
38+
float(max_dev)
39+
)
40+
logging.getLogger("diffxpy").info(
41+
'maximum absolute log p-value deviation: %f' %
42+
float(max_log_dev)
43+
)
44+
logging.getLogger("diffxpy").info(
45+
'mean absolute log p-value deviation: %f' %
46+
float(mean_dev)
47+
)
48+
assert max_dev < 1e-3, "maximum deviation too large"
49+
assert max_log_dev < 1e-1, "maximum deviation in log space too large"
50+
51+
def test_t_test_ref(self, n_cells: int = 2000, n_genes: int = 100):
52+
"""
53+
Test if de.test.t_test() generates the same p-value distribution as scipy t-test.
54+
55+
:param n_cells: Number of cells to simulate (number of observations per test).
56+
:param n_genes: Number of genes to simulate (number of tests).
57+
"""
58+
logging.getLogger("tensorflow").setLevel(logging.ERROR)
59+
logging.getLogger("batchglm").setLevel(logging.WARNING)
60+
logging.getLogger("diffxpy").setLevel(logging.INFO)
61+
62+
sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes)
63+
64+
test = de.test.t_test(
65+
data=sim.X,
66+
grouping="condition",
67+
sample_description=sim.sample_description,
68+
dtype="float64"
69+
)
70+
71+
# Run scipy t-tests as a reference.
72+
conds = np.unique(sim.sample_description["condition"].values)
73+
ind_a = np.where(sim.sample_description["condition"] == conds[0])[0]
74+
ind_b = np.where(sim.sample_description["condition"] == conds[1])[0]
75+
scipy_pvals = stats.ttest_ind(a=sim.X[ind_a, :], b=sim.X[ind_b, :], axis=0, equal_var=False).pvalue
76+
77+
self._eval(test=test, ref_pvals=scipy_pvals)
78+
79+
return True
80+
81+
def test_rank_ref(self, n_cells: int = 2000, n_genes: int = 100):
82+
"""
83+
Test if de.test.rank_test() generates the same p-value distribution as scipy t-test.
84+
85+
:param n_cells: Number of cells to simulate (number of observations per test).
86+
:param n_genes: Number of genes to simulate (number of tests).
87+
"""
88+
logging.getLogger("tensorflow").setLevel(logging.ERROR)
89+
logging.getLogger("batchglm").setLevel(logging.WARNING)
90+
logging.getLogger("diffxpy").setLevel(logging.INFO)
91+
92+
sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes)
93+
94+
test = de.test.rank_test(
95+
data=sim.X,
96+
grouping="condition",
97+
sample_description=sim.sample_description,
98+
dtype="float64"
99+
)
100+
101+
# Run scipy t-tests as a reference.
102+
conds = np.unique(sim.sample_description["condition"].values)
103+
ind_a = np.where(sim.sample_description["condition"] == conds[0])[0]
104+
ind_b = np.where(sim.sample_description["condition"] == conds[1])[0]
105+
scipy_pvals = np.array([
106+
stats.mannwhitneyu(x=sim.X[ind_a, i], y=sim.X[ind_b, i],
107+
use_continuity=True, alternative="two-sided").pvalue
108+
for i in range(sim.X.shape[1])
109+
])
110+
111+
self._eval(test=test, ref_pvals=scipy_pvals)
112+
113+
return True
114+
115+
116+
if __name__ == '__main__':
117+
unittest.main()

0 commit comments

Comments
 (0)