Skip to content

Commit 45d9724

Browse files
introduced new constraint interface
1 parent efa841e commit 45d9724

File tree

4 files changed

+111
-66
lines changed

4 files changed

+111
-66
lines changed

diffxpy/api/utils.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1-
import batchglm.data as data_utils
1+
from batchglm.data import constraint_matrix_from_string, setup_constrained
2+
from batchglm.data import design_matrix, design_matrix_from_xarray, design_matrix_from_anndata
3+
from batchglm.data import view_coef_names

diffxpy/testing/tests.py

Lines changed: 73 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ def lrt(
207207
gene_names: Union[np.ndarray, list] = None,
208208
sample_description: pd.DataFrame = None,
209209
noise_model="nb",
210-
size_factors: np.ndarray = None,
210+
size_factors: Union[np.ndarray, pd.core.series.Series, np.ndarray] = None,
211211
batch_size: int = None,
212212
training_strategy: Union[str, List[Dict[str, object]], Callable] = "DEFAULT",
213213
quick_scale: bool = False,
@@ -220,8 +220,7 @@ def lrt(
220220
Note that lrt() does not support constraints in its current form. Please
221221
use wald() for constraints.
222222
223-
:param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations.
224-
Input data matrix (observations x features) or (cells x genes).
223+
:param data: Input data matrix (observations x features) or (cells x genes).
225224
:param full_formula_loc: formula
226225
Full model formula for location parameter model.
227226
If not specified, `full_formula` will be used instead.
@@ -264,7 +263,8 @@ def lrt(
264263
265264
- 'nb': default
266265
:param size_factors: 1D array of transformed library size factors for each cell in the
267-
same order as in data
266+
same order as in data or string-type column identifier of size-factor containing
267+
column in sample description.
268268
:param batch_size: the batch size to use for the estimator
269269
:param training_strategy: {str, function, list} training strategy to use. Can be:
270270
@@ -302,27 +302,35 @@ def lrt(
302302
gene_names = parse_gene_names(data, gene_names)
303303
X = parse_data(data, gene_names)
304304
sample_description = parse_sample_description(data, sample_description)
305-
size_factors = parse_size_factors(size_factors=size_factors, data=X)
305+
size_factors = parse_size_factors(
306+
size_factors=size_factors,
307+
data=X,
308+
sample_description=sample_description
309+
)
306310

307311
full_design_loc = data_utils.design_matrix(
308312
sample_description=sample_description,
309313
formula=full_formula_loc,
310-
as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values]
314+
as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values],
315+
return_type="patsy"
311316
)
312317
reduced_design_loc = data_utils.design_matrix(
313318
sample_description=sample_description,
314319
formula=reduced_formula_loc,
315-
as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values]
320+
as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values],
321+
return_type="patsy"
316322
)
317323
full_design_scale = data_utils.design_matrix(
318324
sample_description=sample_description,
319325
formula=full_formula_scale,
320-
as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values]
326+
as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values],
327+
return_type="patsy"
321328
)
322329
reduced_design_scale = data_utils.design_matrix(
323330
sample_description=sample_description,
324331
formula=reduced_formula_scale,
325-
as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values]
332+
as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values],
333+
return_type="patsy"
326334
)
327335

328336
reduced_model = _fit(
@@ -388,7 +396,7 @@ def wald(
388396
constraints_loc: np.ndarray = None,
389397
constraints_scale: np.ndarray = None,
390398
noise_model: str = "nb",
391-
size_factors: np.ndarray = None,
399+
size_factors: Union[np.ndarray, pd.core.series.Series, str] = None,
392400
batch_size: int = None,
393401
training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO",
394402
quick_scale: bool = False,
@@ -417,7 +425,7 @@ def wald(
417425
:param as_numeric:
418426
Which columns of sample_description to treat as numeric and
419427
not as categorical. This yields columns in the design matrix
420-
which do not correpond to one-hot encoded discrete factors.
428+
which do not correspond to one-hot encoded discrete factors.
421429
This makes sense for number of genes, time, pseudotime or space
422430
for example.
423431
:param init_a: (Optional) Low-level initial values for a.
@@ -465,7 +473,8 @@ def wald(
465473
are indicated by a 1. It is highly recommended to only use this option
466474
together with prebuilt design matrix for the scale model, dmat_scale.
467475
:param size_factors: 1D array of transformed library size factors for each cell in the
468-
same order as in data
476+
same order as in data or string-type column identifier of size-factor containing
477+
column in sample description.
469478
:param noise_model: str, noise model to use in model-based unit_test. Possible options:
470479
471480
- 'nb': default
@@ -488,10 +497,14 @@ def wald(
488497
if len(kwargs) != 0:
489498
logging.getLogger("diffxpy").debug("additional kwargs: %s", str(kwargs))
490499

491-
if dmat_loc is None and formula_loc is None:
492-
raise ValueError("Supply either dmat_loc or formula_loc or formula.")
493-
if dmat_scale is None and formula_scale is None:
494-
raise ValueError("Supply either dmat_loc or formula_loc or formula.")
500+
if (dmat_loc is None and formula_loc is None) or \
501+
(dmat_loc is not None and formula_loc is not None):
502+
raise ValueError("Supply either dmat_loc or formula_loc.")
503+
if (dmat_scale is None and formula_scale is None) or \
504+
(dmat_scale is not None and formula_scale != "~1"):
505+
raise ValueError("Supply either dmat_scale or formula_scale.")
506+
if dmat_loc is not None and factor_loc_totest is not None:
507+
raise ValueError("Supply coef_to_test and not factor_loc_totest if dmat_loc is supplied.")
495508
# Check that factor_loc_totest and coef_to_test are lists and not single strings:
496509
if isinstance(factor_loc_totest, str):
497510
factor_loc_totest = [factor_loc_totest]
@@ -505,13 +518,18 @@ def wald(
505518
X = parse_data(data, gene_names)
506519
if dmat_loc is None and dmat_scale is None:
507520
sample_description = parse_sample_description(data, sample_description)
508-
size_factors = parse_size_factors(size_factors=size_factors, data=X)
521+
size_factors = parse_size_factors(
522+
size_factors=size_factors,
523+
data=X,
524+
sample_description=sample_description
525+
)
509526

510527
if dmat_loc is None:
511528
design_loc = data_utils.design_matrix(
512529
sample_description=sample_description,
513530
formula=formula_loc,
514-
as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values]
531+
as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values],
532+
return_type="patsy"
515533
)
516534
# Check that closed-form is not used if numeric predictors are used and model is not "norm".
517535
if isinstance(init_a, str):
@@ -533,7 +551,8 @@ def wald(
533551
design_scale = data_utils.design_matrix(
534552
sample_description=sample_description,
535553
formula=formula_scale,
536-
as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values]
554+
as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values],
555+
return_type="patsy"
537556
)
538557
# Check that closed-form is not used if numeric predictors are used and model is not "norm".
539558
if isinstance(init_b, str):
@@ -1645,10 +1664,8 @@ def continuous_1d(
16451664
init_b: Union[np.ndarray, str] = "standard",
16461665
gene_names: Union[np.ndarray, list] = None,
16471666
sample_description=None,
1648-
dmat_loc: Union[patsy.design_info.DesignMatrix, xr.Dataset] = None,
1649-
dmat_scale: Union[patsy.design_info.DesignMatrix, xr.Dataset] = None,
1650-
constraints_loc: np.ndarray = None,
1651-
constraints_scale: np.ndarray = None,
1667+
constraints_loc: Union[Tuple[str], List[str]] = (),
1668+
constraints_scale: Union[Tuple[str], List[str]] = (),
16521669
noise_model: str = 'nb',
16531670
size_factors: np.ndarray = None,
16541671
batch_size: int = None,
@@ -1696,11 +1713,10 @@ def continuous_1d(
16961713
this will be propagated across all coefficients which represent this covariate
16971714
in the spline basis space.
16981715
:param as_numeric:
1699-
Which columns of sample_description to treat as numeric and
1700-
not as categorical. This yields columns in the design matrix
1701-
which do not correpond to one-hot encoded discrete factors.
1702-
This makes sense for number of genes, time, pseudotime or space
1703-
for example.
1716+
Which columns of sample_description to treat as numeric and not as categorical.
1717+
This yields columns in the design matrix which do not correpond to one-hot encoded discrete factors.
1718+
This makes sense for library depth for example. Do not use this for the covariate that you
1719+
want to extrpolate with using a spline-basis!
17041720
:param test: str, statistical test to use. Possible options:
17051721
17061722
- 'wald': default
@@ -1719,34 +1735,33 @@ def continuous_1d(
17191735
* "auto": automatically choose best initialization
17201736
* "standard": initialize with zeros
17211737
- np.ndarray: direct initialization of 'b'
1722-
:param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these
1738+
:param gene_names: optional list/array of gene names which will be used if `data` does
1739+
not implicitly store these
17231740
:param sample_description: optional pandas.DataFrame containing sample annotations
1724-
:param dmat_loc: Pre-built location model design matrix.
1725-
This over-rides formula_loc and sample description information given in
1726-
data or sample_description.
1727-
:param dmat_scale: Pre-built scale model design matrix.
1728-
This over-rides formula_scale and sample description information given in
1729-
data or sample_description.
1730-
:param constraints_loc: : Constraints for location model.
1731-
Array with constraints in rows and model parameters in columns.
1732-
Each constraint contains non-zero entries for the a of parameters that
1733-
has to sum to zero. This constraint is enforced by binding one parameter
1734-
to the negative sum of the other parameters, effectively representing that
1735-
parameter as a function of the other parameters. This dependent
1736-
parameter is indicated by a -1 in this array, the independent parameters
1737-
of that constraint (which may be dependent at an earlier constraint)
1738-
are indicated by a 1. It is highly recommended to only use this option
1739-
together with prebuilt design matrix for the location model, dmat_loc.
1740-
:param constraints_scale: : Constraints for scale model.
1741-
Array with constraints in rows and model parameters in columns.
1742-
Each constraint contains non-zero entries for the a of parameters that
1743-
has to sum to zero. This constraint is enforced by binding one parameter
1744-
to the negative sum of the other parameters, effectively representing that
1745-
parameter as a function of the other parameters. This dependent
1746-
parameter is indicated by a -1 in this array, the independent parameters
1747-
of that constraint (which may be dependent at an earlier constraint)
1748-
are indicated by a 1. It is highly recommended to only use this option
1749-
together with prebuilt design matrix for the scale model, dmat_scale.
1741+
:param constraints_loc: Grouped factors to enfore equality constraints on for location model.
1742+
Every element of the iteratable corresponds to one set of equality constraints.
1743+
Each set has to be a dictionary of the form {x: y} where x is the factor to be constrained
1744+
and y is a factor by which levels of x are grouped and then constrained. Set y="1" to constrain
1745+
all levels of x to sum to one, a single equality constraint.
1746+
1747+
E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
1748+
zero. This is applicable if repeats of a an experiment within each condition
1749+
are independent so that the set-up ~1+condition+batch is perfectly confounded.
1750+
1751+
Can only group by non-constrained effects right now, use constraint_matrix_from_string
1752+
for other cases.
1753+
:param constraints_scale: Grouped factors to enfore equality constraints on for scale model.
1754+
Every element of the iteratable corresponds to one set of equality constraints.
1755+
Each set has to be a dictionary of the form {x: y} where x is the factor to be constrained
1756+
and y is a factor by which levels of x are grouped and then constrained. Set y="1" to constrain
1757+
all levels of x to sum to one, a single equality constraint.
1758+
1759+
E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
1760+
zero. This is applicable if repeats of a an experiment within each condition
1761+
are independent so that the set-up ~1+condition+batch is perfectly confounded.
1762+
1763+
Can only group by non-constrained effects right now, use constraint_matrix_from_string
1764+
for other cases.
17501765
:param noise_model: str, noise model to use in model-based unit_test. Possible options:
17511766
17521767
- 'nb': default
@@ -1757,9 +1772,9 @@ def continuous_1d(
17571772
17581773
- str: will use Estimator.TrainingStrategy[training_strategy] to train
17591774
- function: Can be used to implement custom training function will be called as
1760-
`training_strategy(estimator)`.
1761-
- list of keyword dicts containing method arguments: Will call Estimator.train() once with each dict of
1762-
method arguments.
1775+
`training_strategy(estimator)`.
1776+
- list of keyword dicts containing method arguments: Will call Estimator.train()
1777+
once with each dict of method arguments.
17631778
17641779
Example:
17651780

diffxpy/testing/utils.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55
import pandas as pd
66
import patsy
7+
import scipy
78
import xarray as xr
89

910
from batchglm import data as data_utils
@@ -31,7 +32,17 @@ def parse_data(data, gene_names) -> xr.DataArray:
3132
return X
3233

3334

34-
def parse_sample_description(data, sample_description=None) -> pd.DataFrame:
35+
def parse_sample_description(
36+
data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix],
37+
sample_description: Union[pd.DataFrame, None]
38+
) -> pd.DataFrame:
39+
"""
40+
Parse sample description from input.
41+
42+
:param data: Input data matrix (observations x features) or (cells x genes).
43+
:param sample_description: pandas.DataFrame containing sample annotations, can be None.
44+
:return: Assembled sample annotations.
45+
"""
3546
if sample_description is None:
3647
if anndata is not None and isinstance(data, anndata.AnnData):
3748
sample_description = data_utils.sample_description_from_anndata(
@@ -58,10 +69,27 @@ def parse_sample_description(data, sample_description=None) -> pd.DataFrame:
5869
return sample_description
5970

6071

61-
def parse_size_factors(size_factors, data):
72+
def parse_size_factors(
73+
size_factors: Union[np.ndarray, pd.core.series.Series, np.ndarray],
74+
data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix],
75+
sample_description: pd.DataFrame
76+
) -> Union[np.ndarray, None]:
77+
"""
78+
Parse size-factors from input.
79+
80+
:param size_factors: 1D array of transformed library size factors for each cell in the
81+
same order as in data or string-type column identifier of size-factor containing
82+
column in sample description.
83+
:param data: Input data matrix (observations x features) or (cells x genes).
84+
:param sample_description: optional pandas.DataFrame containing sample annotations
85+
:return: Assebled size-factors.
86+
"""
6287
if size_factors is not None:
6388
if isinstance(size_factors, pd.core.series.Series):
6489
size_factors = size_factors.values
90+
elif isinstance(size_factors, str):
91+
assert size_factors in sample_description.columns, ""
92+
size_factors = sample_description[size_factors].values
6593
assert size_factors.shape[0] == data.shape[0], "data matrix and size factors must contain same number of cells"
6694
assert np.all(size_factors > 0), "size_factors <= 0 found, please remove these cells"
6795
return size_factors

diffxpy/unit_test/test_constrained.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,12 @@ def test_null_distribution_wald_constrained(self, n_genes: int = 100):
4747
dmat_est_scale = de.test.design_matrix(dmat=dmat_est)
4848

4949
# Build constraints:
50-
constraints_loc = de.utils.data_utils.build_equality_constraints_string(
50+
constraints_loc = de.utils.data_utils.constraint_matrix_from_string(
5151
dmat=dmat_est_loc,
5252
constraints=["bio1+bio2=0", "bio3+bio4=0"],
5353
dims=["design_loc_params", "loc_params"]
5454
)
55-
constraints_scale = de.utils.data_utils.build_equality_constraints_string(
55+
constraints_scale = de.utils.data_utils.constraint_matrix_from_string(
5656
dmat=dmat_est_scale,
5757
constraints=["bio1+bio2=0", "bio3+bio4=0"],
5858
dims=["design_scale_params", "scale_params"]
@@ -131,7 +131,7 @@ def test_null_distribution_wald_constrained_2layer(self, n_genes: int = 100):
131131
dmat_est_scale = de.test.design_matrix(dmat=dmat_est.iloc[:, [0]])
132132

133133
# Build constraints:
134-
constraints_loc = de.utils.data_utils.build_equality_constraints_string(
134+
constraints_loc = de.utils.data_utils.constraint_matrix_from_string(
135135
dmat=dmat_est_loc,
136136
constraints=["bio1+bio2=0",
137137
"bio3+bio4=0",
@@ -207,14 +207,14 @@ def test_null_distribution_wald_multi_constrained_2layer(self, n_genes: int = 50
207207
dmat_est_scale = de.test.design_matrix(dmat=dmat_est)
208208

209209
# Build constraints:
210-
constraints_loc = de.utils.data_utils.build_equality_constraints_string(
210+
constraints_loc = de.utils.data_utils.constraint_matrix_from_string(
211211
dmat=dmat_est_loc,
212212
constraints=["bio1+bio2=0",
213213
"bio3+bio4=0",
214214
"bio5+bio6=0"],
215215
dims=["design_loc_params", "loc_params"]
216216
)
217-
constraints_scale = de.utils.data_utils.build_equality_constraints_string(
217+
constraints_scale = de.utils.data_utils.constraint_matrix_from_string(
218218
dmat=dmat_est_scale,
219219
constraints=["bio1+bio2=0",
220220
"bio3+bio4=0",

0 commit comments

Comments
 (0)