Skip to content

Commit c44898a

Browse files
finalized addition of new constraints at all interface levels up to test.wald()
1 parent 7d6dbec commit c44898a

File tree

5 files changed

+230
-105
lines changed

5 files changed

+230
-105
lines changed

diffxpy/api/utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1-
from diffxpy.testing.utils import constraint_matrix_from_string, setup_constrained
1+
from diffxpy.testing.utils import constraint_matrix_from_string, constraint_matrix_from_dict, \
2+
constraint_system_from_star
23
from diffxpy.testing.utils import design_matrix, design_matrix_from_xarray, design_matrix_from_anndata
34
from diffxpy.testing.utils import view_coef_names, preview_coef_names

diffxpy/testing/tests.py

Lines changed: 150 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717
DifferentialExpressionTestZTestLazy, DifferentialExpressionTestZTest, DifferentialExpressionTestPairwise, \
1818
DifferentialExpressionTestVsRest, _DifferentialExpressionTestMulti, DifferentialExpressionTestByPartition, \
1919
DifferentialExpressionTestWaldCont, DifferentialExpressionTestLRTCont
20-
from .utils import parse_gene_names, parse_data, parse_sample_description, parse_size_factors, parse_grouping
20+
from .utils import parse_gene_names, parse_data, parse_sample_description, parse_size_factors, parse_grouping, \
21+
constraint_system_from_star
2122

2223
# Use this to suppress matrix subclass PendingDeprecationWarnings from numpy:
2324
np.warnings.filterwarnings("ignore")
@@ -393,8 +394,8 @@ def wald(
393394
sample_description: Union[None, pd.DataFrame] = None,
394395
dmat_loc: Union[patsy.design_info.DesignMatrix, xr.Dataset] = None,
395396
dmat_scale: Union[patsy.design_info.DesignMatrix, xr.Dataset] = None,
396-
constraints_loc: np.ndarray = None,
397-
constraints_scale: np.ndarray = None,
397+
constraints_loc: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None,
398+
constraints_scale: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None,
398399
noise_model: str = "nb",
399400
size_factors: Union[np.ndarray, pd.core.series.Series, str] = None,
400401
batch_size: int = None,
@@ -451,26 +452,72 @@ def wald(
451452
:param dmat_scale: Pre-built scale model design matrix.
452453
This over-rides formula_scale and sample description information given in
453454
data or sample_description.
454-
:param constraints_loc: : Constraints for location model.
455-
Array with constraints in rows and model parameters in columns.
456-
Each constraint contains non-zero entries for the a of parameters that
457-
has to sum to zero. This constraint is enforced by binding one parameter
458-
to the negative sum of the other parameters, effectively representing that
459-
parameter as a function of the other parameters. This dependent
460-
parameter is indicated by a -1 in this array, the independent parameters
461-
of that constraint (which may be dependent at an earlier constraint)
462-
are indicated by a 1. It is highly recommended to only use this option
463-
together with prebuilt design matrix for the location model, dmat_loc.
464-
:param constraints_scale: : Constraints for scale model.
465-
Array with constraints in rows and model parameters in columns.
466-
Each constraint contains non-zero entries for the a of parameters that
467-
has to sum to zero. This constraint is enforced by binding one parameter
468-
to the negative sum of the other parameters, effectively representing that
469-
parameter as a function of the other parameters. This dependent
470-
parameter is indicated by a -1 in this array, the independent parameters
471-
of that constraint (which may be dependent at an earlier constraint)
472-
are indicated by a 1. It is highly recommended to only use this option
473-
together with prebuilt design matrix for the scale model, dmat_scale.
455+
:param constraints_loc: Constraints for location model. Can be one of the following:
456+
457+
- np.ndarray:
458+
Array with constraints in rows and model parameters in columns.
459+
Each constraint contains non-zero entries for the set of parameters that
460+
has to sum to zero. This constraint is enforced by binding one parameter
461+
to the negative sum of the other parameters, effectively representing that
462+
parameter as a function of the other parameters. This dependent
463+
parameter is indicated by a -1 in this array, the independent parameters
464+
of that constraint (which may be dependent at an earlier constraint)
465+
are indicated by a 1. You should only use this option
466+
together with prebuilt design matrix for the location model, dmat_loc,
467+
for example via de.utils.setup_constrained().
468+
- dict:
469+
Every element of the dictionary corresponds to one set of equality constraints.
470+
Each set has to be an entry of the form {..., x: y, ...}
471+
where x is the factor to be constrained and y is a factor by which levels of x are grouped
472+
and then constrained. Set y="1" to constrain all levels of x to sum to one,
473+
a single equality constraint.
474+
475+
E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
476+
zero. This is applicable if repeats of an experiment within each condition
477+
are independent so that the set-up ~1+condition+batch is perfectly confounded.
478+
479+
Can only group by non-constrained effects right now, use constraint_matrix_from_string
480+
for other cases.
481+
- list of strings or tuple of strings:
482+
String encoded equality constraints.
483+
484+
E.g. ["batch1 + batch2 + batch3 = 0"]
485+
- None:
486+
No constraints are used, this is equivalent to using an identity matrix as a
487+
constraint matrix.
488+
:param constraints_scale: Constraints for scale model. Can be one of the following:
489+
490+
- np.ndarray:
491+
Array with constraints in rows and model parameters in columns.
492+
Each constraint contains non-zero entries for the set of parameters that
493+
has to sum to zero. This constraint is enforced by binding one parameter
494+
to the negative sum of the other parameters, effectively representing that
495+
parameter as a function of the other parameters. This dependent
496+
parameter is indicated by a -1 in this array, the independent parameters
497+
of that constraint (which may be dependent at an earlier constraint)
498+
are indicated by a 1. You should only use this option
499+
together with prebuilt design matrix for the scale model, dmat_scale,
500+
for example via de.utils.setup_constrained().
501+
- dict:
502+
Every element of the dictionary corresponds to one set of equality constraints.
503+
Each set has to be an entry of the form {..., x: y, ...}
504+
where x is the factor to be constrained and y is a factor by which levels of x are grouped
505+
and then constrained. Set y="1" to constrain all levels of x to sum to one,
506+
a single equality constraint.
507+
508+
E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
509+
zero. This is applicable if repeats of an experiment within each condition
510+
are independent so that the set-up ~1+condition+batch is perfectly confounded.
511+
512+
Can only group by non-constrained effects right now, use constraint_matrix_from_string
513+
for other cases.
514+
- list of strings or tuple of strings:
515+
String encoded equality constraints.
516+
517+
E.g. ["batch1 + batch2 + batch3 = 0"]
518+
- None:
519+
No constraints are used, this is equivalent to using an identity matrix as a
520+
constraint matrix.
474521
:param size_factors: 1D array of transformed library size factors for each cell in the
475522
same order as in data or string-type column identifier of size-factor containing
476523
column in sample description.
@@ -523,50 +570,26 @@ def wald(
523570
sample_description=sample_description
524571
)
525572

526-
if dmat_loc is None:
527-
design_loc = data_utils.design_matrix(
528-
sample_description=sample_description,
529-
formula=formula_loc,
530-
as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values],
531-
return_type="patsy"
532-
)
533-
# Check that closed-form is not used if numeric predictors are used and model is not "norm".
534-
if isinstance(init_a, str):
535-
if np.any([True if x in as_numeric else False for x in sample_description.columns.values]):
536-
if noise_model.lower() not in ["normal", "norm"]:
537-
if init_a == "closed_form":
538-
init_a = "standard"
539-
logging.getLogger("diffxpy").warning(
540-
"Setting init_a to standard as numeric predictors were supplied.")
541-
logging.getLogger("diffxpy").warning(
542-
"Closed-form initialisation is not possible" +
543-
" for noise model %s with numeric predictors." % noise_model)
544-
elif init_a == "AUTO":
545-
init_a = "standard"
546-
else:
547-
design_loc = dmat_loc
548-
549-
if dmat_scale is None:
550-
design_scale = data_utils.design_matrix(
551-
sample_description=sample_description,
552-
formula=formula_scale,
553-
as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values],
554-
return_type="patsy"
555-
)
556-
# Check that closed-form is not used if numeric predictors are used and model is not "norm".
557-
if isinstance(init_b, str):
558-
if np.any([True if x in as_numeric else False for x in sample_description.columns.values]):
559-
if init_b == "closed_form":
560-
init_b = "standard"
561-
logging.getLogger("diffxpy").warning(
562-
"Setting init_b to standard as numeric predictors were supplied.")
563-
logging.getLogger("diffxpy").warning(
564-
"Closed-form initialisation is not possible" +
565-
" for noise model %s with numeric predictors." % noise_model)
566-
elif init_b == "AUTO":
567-
init_b = "standard"
568-
else:
569-
design_scale = dmat_scale
573+
logging.getLogger("diffxpy").debug("building location model")
574+
design_loc, constraints_loc = constraint_system_from_star(
575+
dmat=dmat_loc,
576+
sample_description=sample_description,
577+
formula=formula_loc,
578+
as_numeric=as_numeric,
579+
constraints=constraints_loc,
580+
dims=["design_loc_params", "loc_params"],
581+
return_type="patsy"
582+
)
583+
logging.getLogger("diffxpy").debug("building scale model")
584+
design_scale, constraints_scale = constraint_system_from_star(
585+
dmat=dmat_scale,
586+
sample_description=sample_description,
587+
formula=formula_scale,
588+
as_numeric=as_numeric,
589+
constraints=constraints_scale,
590+
dims=["design_scale_params", "scale_params"],
591+
return_type="patsy"
592+
)
570593

571594
# Define indices of coefficients to test:
572595
constraints_loc_temp = constraints_loc if constraints_loc is not None else np.eye(design_loc.shape[-1])
@@ -1661,8 +1684,8 @@ def continuous_1d(
16611684
init_b: Union[np.ndarray, str] = "standard",
16621685
gene_names: Union[np.ndarray, list] = None,
16631686
sample_description=None,
1664-
constraints_loc: Union[Tuple[str], List[str]] = (),
1665-
constraints_scale: Union[Tuple[str], List[str]] = (),
1687+
constraints_loc: Union[dict, None] = None,
1688+
constraints_scale: Union[dict, None] = None,
16661689
noise_model: str = 'nb',
16671690
size_factors: np.ndarray = None,
16681691
batch_size: int = None,
@@ -1676,7 +1699,7 @@ def continuous_1d(
16761699
16771700
This function wraps the selected statistical test for
16781701
scenarios with continuous covariates and performs the necessary
1679-
spline basis transformation of the continuous covariate so that the
1702+
spline basis transformation of the continuous co-variate so that the
16801703
problem can be framed as a GLM.
16811704
16821705
Note that direct supply of dmats is not enabled as this function wraps
@@ -1685,6 +1708,11 @@ def continuous_1d(
16851708
perform these spline basis transforms outside of diffxpy and feed the
16861709
dmat directly to one of the test routines wald() or lrt().
16871710
1711+
The constraint interface only supports dictionary-formatted constraints and
1712+
string-formatted constraints but not array-formatted constraint matrices as
1713+
design matrices are built within this function and the shape of constraint
1714+
matrices depends on the output of this function.
1715+
16881716
:param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations.
16891717
Input data matrix (observations x features) or (cells x genes).
16901718
:param continuous: str
@@ -1735,32 +1763,56 @@ def continuous_1d(
17351763
:param gene_names: optional list/array of gene names which will be used if `data` does
17361764
not implicitly store these
17371765
:param sample_description: optional pandas.DataFrame containing sample annotations
1738-
:param constraints_loc: Grouped factors to enfore equality constraints on for location model.
1739-
Every element of the dictionary corresponds to one set of equality constraints.
1740-
Each set has to be be an entry of the form {..., x: y, ...}
1741-
where x is the factor to be constrained and y is a factor by which levels of x are grouped
1742-
and then constrained. Set y="1" to constrain all levels of x to sum to one,
1743-
a single equality constraint.
1744-
1745-
E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
1746-
zero. This is applicable if repeats of a an experiment within each condition
1747-
are independent so that the set-up ~1+condition+batch is perfectly confounded.
1748-
1749-
Can only group by non-constrained effects right now, use constraint_matrix_from_string
1750-
for other cases.
1751-
:param constraints_scale: Grouped factors to enfore equality constraints on for scale model.
1752-
Every element of the dictionary corresponds to one set of equality constraints.
1753-
Each set has to be be an entry of the form {..., x: y, ...}
1754-
where x is the factor to be constrained and y is a factor by which levels of x are grouped
1755-
and then constrained. Set y="1" to constrain all levels of x to sum to one,
1756-
a single equality constraint.
1757-
1758-
E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
1759-
zero. This is applicable if repeats of a an experiment within each condition
1760-
are independent so that the set-up ~1+condition+batch is perfectly confounded.
1761-
1762-
Can only group by non-constrained effects right now, use constraint_matrix_from_string
1763-
for other cases.
1766+
:param constraints_loc: Constraints for location model. Can be one of the following:
1767+
1768+
- dict:
1769+
Every element of the dictionary corresponds to one set of equality constraints.
1770+
Each set has to be an entry of the form {..., x: y, ...}
1771+
where x is the factor to be constrained and y is a factor by which levels of x are grouped
1772+
and then constrained. Set y="1" to constrain all levels of x to sum to one,
1773+
a single equality constraint.
1774+
1775+
E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
1776+
zero. This is applicable if repeats of an experiment within each condition
1777+
are independent so that the set-up ~1+condition+batch is perfectly confounded.
1778+
1779+
Can only group by non-constrained effects right now, use constraint_matrix_from_string
1780+
for other cases.
1781+
- list of strings or tuple of strings:
1782+
String encoded equality constraints.
1783+
1784+
E.g. ["batch1 + batch2 + batch3 = 0"]
1785+
- None:
1786+
No constraints are used, this is equivalent to using an identity matrix as a
1787+
constraint matrix.
1788+
1789+
Note that np.ndarray encoded full constraint matrices are not supported here as the design
1790+
matrices are built within this function.
1791+
:param constraints_scale: Constraints for scale model. Can be one of the following:
1792+
1793+
- dict:
1794+
Every element of the dictionary corresponds to one set of equality constraints.
1795+
Each set has to be an entry of the form {..., x: y, ...}
1796+
where x is the factor to be constrained and y is a factor by which levels of x are grouped
1797+
and then constrained. Set y="1" to constrain all levels of x to sum to one,
1798+
a single equality constraint.
1799+
1800+
E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
1801+
zero. This is applicable if repeats of an experiment within each condition
1802+
are independent so that the set-up ~1+condition+batch is perfectly confounded.
1803+
1804+
Can only group by non-constrained effects right now, use constraint_matrix_from_string
1805+
for other cases.
1806+
- list of strings or tuple of strings:
1807+
String encoded equality constraints.
1808+
1809+
E.g. ["batch1 + batch2 + batch3 = 0"]
1810+
- None:
1811+
No constraints are used, this is equivalent to using an identity matrix as a
1812+
constraint matrix.
1813+
1814+
Note that np.ndarray encoded full constraint matrices are not supported here as the design
1815+
matrices are built within this function.
17641816
:param noise_model: str, noise model to use in model-based unit_test. Possible options:
17651817
17661818
- 'nb': default
@@ -1879,6 +1931,8 @@ def continuous_1d(
18791931
init_b=init_b,
18801932
gene_names=gene_names,
18811933
sample_description=sample_description,
1934+
constraints_loc=constraints_loc,
1935+
constraints_scale=constraints_scale,
18821936
noise_model=noise_model,
18831937
size_factors=size_factors,
18841938
batch_size=batch_size,

0 commit comments

Comments
 (0)