Skip to content

Commit 7d6dbec

Browse files
more changes to constraint api, constraint unit tests check out
1 parent 45d9724 commit 7d6dbec

File tree

6 files changed

+248
-130
lines changed

6 files changed

+248
-130
lines changed

diffxpy/api/test.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
11
from diffxpy.testing import lrt, wald, t_test, rank_test, two_sample, pairwise, \
22
versus_rest, partition, continuous_1d
3-
from diffxpy.testing import design_matrix, coef_names

diffxpy/api/utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
from batchglm.data import constraint_matrix_from_string, setup_constrained
2-
from batchglm.data import design_matrix, design_matrix_from_xarray, design_matrix_from_anndata
3-
from batchglm.data import view_coef_names
1+
from diffxpy.testing.utils import constraint_matrix_from_string, setup_constrained
2+
from diffxpy.testing.utils import design_matrix, design_matrix_from_xarray, design_matrix_from_anndata
3+
from diffxpy.testing.utils import view_coef_names, preview_coef_names

diffxpy/testing/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
11
from .tests import lrt, wald, t_test, rank_test, two_sample, pairwise, \
22
versus_rest, partition, continuous_1d
3-
from .utils import design_matrix, coef_names

diffxpy/testing/tests.py

Lines changed: 24 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -384,13 +384,13 @@ def wald(
384384
data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix],
385385
factor_loc_totest: Union[str, List[str]] = None,
386386
coef_to_test: Union[str, List[str]] = None,
387-
formula_loc: str = None,
388-
formula_scale: str = "~1",
387+
formula_loc: Union[None, str] = None,
388+
formula_scale: Union[None, str] = "~1",
389389
as_numeric: Union[List[str], Tuple[str], str] = (),
390390
init_a: Union[np.ndarray, str] = "AUTO",
391391
init_b: Union[np.ndarray, str] = "AUTO",
392392
gene_names: Union[np.ndarray, list] = None,
393-
sample_description: pd.DataFrame = None,
393+
sample_description: Union[None, pd.DataFrame] = None,
394394
dmat_loc: Union[patsy.design_info.DesignMatrix, xr.Dataset] = None,
395395
dmat_scale: Union[patsy.design_info.DesignMatrix, xr.Dataset] = None,
396396
constraints_loc: np.ndarray = None,
@@ -406,8 +406,7 @@ def wald(
406406
"""
407407
Perform Wald test for differential expression for each gene.
408408
409-
:param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations.
410-
Input data matrix (observations x features) or (cells x genes).
409+
:param data: Input data matrix (observations x features) or (cells x genes).
411410
:param factor_loc_totest: str, list of strings
412411
List of factors of formula to test with Wald test.
413412
E.g. "condition" or ["batch", "condition"] if formula_loc would be "~ 1 + batch + condition"
@@ -589,18 +588,16 @@ def wald(
589588
elif coef_to_test is not None:
590589
# Directly select coefficients to test from design matrix (xarray):
591590
# Check that coefficients to test are not dependent parameters if constraints are given:
592-
# TODO: design_loc is sometimes xarray and sometimes patsy when it arrives here,
593-
# should it not always be xarray?
594-
if isinstance(design_loc, patsy.design_info.DesignMatrix):
595-
col_indices = np.asarray([
596-
design_loc.design_info.column_names.index(x)
597-
for x in coef_to_test
598-
])
599-
else:
600-
col_indices = np.asarray([
601-
list(np.asarray(design_loc.coords['design_params'])).index(x)
602-
for x in coef_to_test
603-
])
591+
coef_loc_names = data_utils.view_coef_names(design_loc).tolist()
592+
if not np.all([x in coef_loc_names for x in coef_to_test]):
593+
raise ValueError(
594+
"the requested test coefficients %s were not found in model coefficients %s" %
595+
(", ".join([x for x in coef_to_test if x not in coef_loc_names]),
596+
", ".join(coef_loc_names))
597+
)
598+
col_indices = np.asarray([
599+
coef_loc_names.index(x) for x in coef_to_test
600+
])
604601
else:
605602
raise ValueError("either set factor_loc_totest or coef_to_test")
606603
# Check that all tested coefficients are independent:
@@ -1739,10 +1736,11 @@ def continuous_1d(
17391736
not implicitly store these
17401737
:param sample_description: optional pandas.DataFrame containing sample annotations
17411738
:param constraints_loc: Grouped factors to enforce equality constraints on for location model.
1742-
Every element of the iteratable corresponds to one set of equality constraints.
1743-
Each set has to be a dictionary of the form {x: y} where x is the factor to be constrained
1744-
and y is a factor by which levels of x are grouped and then constrained. Set y="1" to constrain
1745-
all levels of x to sum to one, a single equality constraint.
1739+
Every element of the dictionary corresponds to one set of equality constraints.
1740+
Each set has to be an entry of the form {..., x: y, ...}
1741+
where x is the factor to be constrained and y is a factor by which levels of x are grouped
1742+
and then constrained. Set y="1" to constrain all levels of x to sum to one,
1743+
a single equality constraint.
17461744
17471745
E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
17481746
zero. This is applicable if repeats of an experiment within each condition
@@ -1751,10 +1749,11 @@ def continuous_1d(
17511749
Can only group by non-constrained effects right now, use constraint_matrix_from_string
17521750
for other cases.
17531751
:param constraints_scale: Grouped factors to enforce equality constraints on for scale model.
1754-
Every element of the iteratable corresponds to one set of equality constraints.
1755-
Each set has to be a dictionary of the form {x: y} where x is the factor to be constrained
1756-
and y is a factor by which levels of x are grouped and then constrained. Set y="1" to constrain
1757-
all levels of x to sum to one, a single equality constraint.
1752+
Every element of the dictionary corresponds to one set of equality constraints.
1753+
Each set has to be an entry of the form {..., x: y, ...}
1754+
where x is the factor to be constrained and y is a factor by which levels of x are grouped
1755+
and then constrained. Set y="1" to constrain all levels of x to sum to one,
1756+
a single equality constraint.
17581757
17591758
E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
17601759
zero. This is applicable if repeats of an experiment within each condition

diffxpy/testing/utils.py

Lines changed: 97 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
1-
from typing import Union
2-
31
import anndata
42
import numpy as np
53
import pandas as pd
64
import patsy
75
import scipy
6+
from typing import List, Tuple, Union
87
import xarray as xr
98

109
from batchglm import data as data_utils
10+
# Relay util functions for diffxpy api. design_matrix and preview_coef_names are redefined here.
11+
from batchglm.data import constraint_matrix_from_string, setup_constrained
12+
from batchglm.data import design_matrix_from_xarray, design_matrix_from_anndata
13+
from batchglm.data import view_coef_names
1114

1215

1316
def parse_gene_names(data, gene_names):
@@ -95,64 +98,6 @@ def parse_size_factors(
9598
return size_factors
9699

97100

98-
def design_matrix(
99-
data=None,
100-
sample_description: pd.DataFrame = None,
101-
formula: str = None,
102-
dmat: pd.DataFrame = None
103-
) -> Union[patsy.design_info.DesignMatrix, xr.Dataset]:
104-
""" Build design matrix for fit of generalized linear model.
105-
106-
This is necessary for wald tests and likelihood ratio tests.
107-
This function only carries through formatting if dmat is directly supplied.
108-
109-
:param data: input data
110-
:param formula: model formula.
111-
:param sample_description: optional pandas.DataFrame containing sample annotations
112-
:param dmat: model design matrix
113-
"""
114-
if data is None and sample_description is None and dmat is None:
115-
raise ValueError("Supply either data or sample_description or dmat.")
116-
if dmat is None and formula is None:
117-
raise ValueError("Supply either dmat or formula.")
118-
119-
if dmat is None:
120-
sample_description = parse_sample_description(data, sample_description)
121-
dmat = data_utils.design_matrix(sample_description=sample_description, formula=formula)
122-
123-
return dmat
124-
else:
125-
ar = xr.DataArray(dmat, dims=("observations", "design_params"))
126-
ar.coords["design_params"] = dmat.columns
127-
128-
ds = xr.Dataset({
129-
"design": ar,
130-
})
131-
132-
return ds
133-
134-
135-
def coef_names(
136-
data=None,
137-
sample_description: pd.DataFrame = None,
138-
formula: str = None,
139-
dmat: pd.DataFrame = None
140-
) -> list:
141-
""" Output coefficient names of model only.
142-
143-
:param data: input data
144-
:param formula: model formula.
145-
:param sample_description: optional pandas.DataFrame containing sample annotations
146-
:param dmat: model design matrix
147-
"""
148-
return design_matrix(
149-
data=data,
150-
sample_description=sample_description,
151-
formula=formula,
152-
dmat=dmat
153-
).design_info.column_names
154-
155-
156101
def parse_grouping(data, sample_description, grouping):
157102
if isinstance(grouping, str):
158103
sample_description = parse_sample_description(data, sample_description)
@@ -171,4 +116,95 @@ def dmat_unique(dmat, sample_description):
171116
dmat, idx = np.unique(dmat, axis=0, return_index=True)
172117
sample_description = sample_description.iloc[idx].reset_index(drop=True)
173118

174-
return dmat, sample_description
119+
return dmat, sample_description
120+
121+
122+
def design_matrix(
        data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray,
                    scipy.sparse.csr_matrix] = None,
        sample_description: Union[None, pd.DataFrame] = None,
        formula: Union[None, str] = None,
        as_numeric: Union[List[str], Tuple[str], str] = (),
        dmat: Union[pd.DataFrame, None] = None,
        return_type: str = "xarray",
) -> Union[patsy.design_info.DesignMatrix, xr.Dataset, pd.DataFrame]:
    """ Create a design matrix from some sample description.

    This function defaults to perform formatting if dmat is directly supplied as a pd.DataFrame.
    This function relays batchglm.data.design_matrix() to behave like the other wrappers in diffxpy.

    :param data: Input data matrix (observations x features) or (cells x genes).
        Only used to look up the sample description if dmat is not supplied.
    :param sample_description: pandas.DataFrame of length "num_observations" containing explanatory variables as columns
    :param formula: model formula as string, describing the relations of the explanatory variables.

        E.g. '~ 1 + batch + confounder'
    :param as_numeric:
        Which columns of sample_description to treat as numeric and
        not as categorical. This yields columns in the design matrix
        which do not correspond to one-hot encoded discrete factors.
        This makes sense for number of genes, time, pseudotime or space
        for example.
        A single column may be given as a plain string.
    :param dmat: a model design matrix as a pd.DataFrame
    :param return_type: type of the returned value.

        - "patsy": return plain patsy.design_info.DesignMatrix object
        - "dataframe": return pd.DataFrame with observations as rows and params as columns
        - "xarray": return xr.Dataset with design matrix as ds["design"] and the sample description embedded as
            one variable per column
    :return: The result of batchglm.data.design_matrix() for the chosen return_type.
    :raises ValueError: If none of data, sample_description and dmat is supplied,
        or if neither dmat nor formula is supplied.
    """
    if data is None and sample_description is None and dmat is None:
        raise ValueError("supply either data or sample_description or dmat")
    if dmat is None and formula is None:
        raise ValueError("supply either dmat or formula")

    # A bare string must be wrapped before the membership test below: "x in some_str" is a
    # substring test, so as_numeric="time" would otherwise also match a column named "t".
    if isinstance(as_numeric, str):
        as_numeric = [as_numeric]

    if dmat is None:
        sample_description = parse_sample_description(data, sample_description)

    if sample_description is not None:
        # One flag per column of the sample description: True means "treat as categorical".
        as_categorical = [x not in as_numeric for x in sample_description.columns.values]
    else:
        # No sample description available (dmat was supplied directly).
        as_categorical = True

    return data_utils.design_matrix(
        sample_description=sample_description,
        formula=formula,
        as_categorical=as_categorical,
        dmat=dmat,
        return_type=return_type
    )
176+
177+
178+
def preview_coef_names(
        sample_description: pd.DataFrame,
        formula: str,
        as_numeric: Union[List[str], Tuple[str], str] = ()
) -> np.ndarray:
    """
    Preview the coefficient names a model would have, without building it.

    Thin relay to batchglm.data.preview_coef_names() that accepts the same
    `as_numeric` convention as the other diffxpy wrappers.

    :param sample_description: pandas.DataFrame of length "num_observations" containing explanatory variables as columns
    :param formula: model formula as string, describing the relations of the explanatory variables.

        E.g. '~ 1 + batch + confounder'
    :param as_numeric:
        Which columns of sample_description to treat as numeric and
        not as categorical. This yields columns in the design matrix
        which do not correspond to one-hot encoded discrete factors.
        This makes sense for number of genes, time, pseudotime or space
        for example.
        A single column may be given as a plain string.
    :return: The coefficient names as returned by batchglm.data.preview_coef_names().
    """
    # Bring as_numeric into list form; a bare string names exactly one column.
    if isinstance(as_numeric, str):
        numeric_columns = [as_numeric]
    elif isinstance(as_numeric, tuple):
        numeric_columns = list(as_numeric)
    else:
        numeric_columns = as_numeric

    # One flag per column of the sample description: True means "treat as categorical".
    column_names = sample_description.columns.values
    categorical_flags = [name not in numeric_columns for name in column_names]

    return data_utils.preview_coef_names(
        sample_description=sample_description,
        formula=formula,
        as_categorical=categorical_flags
    )

0 commit comments

Comments
 (0)