more changes to constraint api, constraint unit tests check out

davidsebfischer · davidsebfischer · commit 7d6dbec07839 · 2019-04-12T15:47:28.000+02:00
diff --git a/diffxpy/api/test.py b/diffxpy/api/test.py
@@ -1,3 +1,2 @@
 from diffxpy.testing import lrt, wald, t_test, rank_test, two_sample, pairwise, \
     versus_rest, partition, continuous_1d
-from diffxpy.testing import design_matrix, coef_names
diff --git a/diffxpy/api/utils.py b/diffxpy/api/utils.py
@@ -1,3 +1,3 @@
-from batchglm.data import constraint_matrix_from_string, setup_constrained
-from batchglm.data import design_matrix, design_matrix_from_xarray, design_matrix_from_anndata
-from batchglm.data import view_coef_names
+from diffxpy.testing.utils import constraint_matrix_from_string, setup_constrained
+from diffxpy.testing.utils import design_matrix, design_matrix_from_xarray, design_matrix_from_anndata
+from diffxpy.testing.utils import view_coef_names, preview_coef_names
diff --git a/diffxpy/testing/__init__.py b/diffxpy/testing/__init__.py
@@ -1,3 +1,2 @@
 from .tests import lrt, wald, t_test, rank_test, two_sample, pairwise, \
     versus_rest, partition, continuous_1d
-from .utils import design_matrix, coef_names
diff --git a/diffxpy/testing/tests.py b/diffxpy/testing/tests.py
@@ -384,13 +384,13 @@ def wald(
         data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix],
         factor_loc_totest: Union[str, List[str]] = None,
         coef_to_test: Union[str, List[str]] = None,
-        formula_loc: str = None,
-        formula_scale: str = "~1",
+        formula_loc: Union[None, str] = None,
+        formula_scale: Union[None, str] = "~1",
         as_numeric: Union[List[str], Tuple[str], str] = (),
         init_a: Union[np.ndarray, str] = "AUTO",
         init_b: Union[np.ndarray, str] = "AUTO",
         gene_names: Union[np.ndarray, list] = None,
-        sample_description: pd.DataFrame = None,
+        sample_description: Union[None, pd.DataFrame] = None,
         dmat_loc: Union[patsy.design_info.DesignMatrix, xr.Dataset] = None,
         dmat_scale: Union[patsy.design_info.DesignMatrix, xr.Dataset] = None,
         constraints_loc: np.ndarray = None,
@@ -406,8 +406,7 @@ def wald(
     """
     Perform Wald test for differential expression for each gene.
 
-    :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations.
-        Input data matrix (observations x features) or (cells x genes).
+    :param data: Input data matrix (observations x features) or (cells x genes).
     :param factor_loc_totest: str, list of strings
         List of factors of formula to test with Wald test.
         E.g. "condition" or ["batch", "condition"] if formula_loc would be "~ 1 + batch + condition"
@@ -589,18 +588,16 @@ def wald(
     elif coef_to_test is not None:
         # Directly select coefficients to test from design matrix (xarray):
         # Check that coefficients to test are not dependent parameters if constraints are given:
-        # TODO: design_loc is sometimes xarray and sometimes patsy when it arrives here,
-        # should it not always be xarray?
-        if isinstance(design_loc, patsy.design_info.DesignMatrix):
-            col_indices = np.asarray([
-                design_loc.design_info.column_names.index(x)
-                for x in coef_to_test
-            ])
-        else:
-            col_indices = np.asarray([
-                list(np.asarray(design_loc.coords['design_params'])).index(x)
-                for x in coef_to_test
-            ])
+        coef_loc_names = data_utils.view_coef_names(design_loc).tolist()
+        if not np.all([x in coef_loc_names for x in coef_to_test]):
+            raise ValueError(
+                "the requested test coefficients %s were not found in model coefficients %s" %
+                (", ".join([x for x in coef_to_test if x not in coef_loc_names]),
+                 ", ".join(coef_loc_names))
+            )
+        col_indices = np.asarray([
+            coef_loc_names.index(x) for x in coef_to_test
+        ])
     else:
         raise ValueError("either set factor_loc_totest or coef_to_test")
     # Check that all tested coefficients are independent:
@@ -1739,10 +1736,11 @@ def continuous_1d(
         not implicitly store these
     :param sample_description: optional pandas.DataFrame containing sample annotations
     :param constraints_loc: Grouped factors to enfore equality constraints on for location model.
-        Every element of the iteratable corresponds to one set of equality constraints.
-        Each set has to be a dictionary of the form {x: y} where x is the factor to be constrained
-        and y is a factor by which levels of x are grouped and then constrained. Set y="1" to constrain
-        all levels of x to sum to one, a single equality constraint.
+        Every element of the dictionary corresponds to one set of equality constraints.
+        Each set has to be an entry of the form {..., x: y, ...}
+        where x is the factor to be constrained and y is a factor by which levels of x are grouped
+        and then constrained. Set y="1" to constrain all levels of x to sum to one,
+        a single equality constraint.
 
             E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
                 zero. This is applicable if repeats of a an experiment within each condition
@@ -1751,10 +1749,11 @@ def continuous_1d(
         Can only group by non-constrained effects right now, use constraint_matrix_from_string
         for other cases.
     :param constraints_scale: Grouped factors to enfore equality constraints on for scale model.
-        Every element of the iteratable corresponds to one set of equality constraints.
-        Each set has to be a dictionary of the form {x: y} where x is the factor to be constrained
-        and y is a factor by which levels of x are grouped and then constrained. Set y="1" to constrain
-        all levels of x to sum to one, a single equality constraint.
+        Every element of the dictionary corresponds to one set of equality constraints.
+        Each set has to be an entry of the form {..., x: y, ...}
+        where x is the factor to be constrained and y is a factor by which levels of x are grouped
+        and then constrained. Set y="1" to constrain all levels of x to sum to one,
+        a single equality constraint.
 
             E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
                 zero. This is applicable if repeats of a an experiment within each condition
diff --git a/diffxpy/testing/utils.py b/diffxpy/testing/utils.py
@@ -1,13 +1,16 @@
-from typing import Union
-
 import anndata
 import numpy as np
 import pandas as pd
 import patsy
 import scipy
+from typing import List, Tuple, Union
 import xarray as xr
 
 from batchglm import data as data_utils
+# Relay util functions for diffxpy api. design_matrix and preview_coef_names are redefined here.
+from batchglm.data import constraint_matrix_from_string, setup_constrained
+from batchglm.data import design_matrix_from_xarray, design_matrix_from_anndata
+from batchglm.data import view_coef_names
 
 
 def parse_gene_names(data, gene_names):
@@ -95,64 +98,6 @@ def parse_size_factors(
     return size_factors
 
 
-def design_matrix(
-        data=None,
-        sample_description: pd.DataFrame = None,
-        formula: str = None,
-        dmat: pd.DataFrame = None
-) -> Union[patsy.design_info.DesignMatrix, xr.Dataset]:
-    """ Build design matrix for fit of generalized linear model.
-
-    This is necessary for wald tests and likelihood ratio tests.
-    This function only carries through formatting if dmat is directly supplied.
-
-    :param data: input data
-    :param formula: model formula.
-    :param sample_description: optional pandas.DataFrame containing sample annotations
-    :param dmat: model design matrix
-    """
-    if data is None and sample_description is None and dmat is None:
-        raise ValueError("Supply either data or sample_description or dmat.")
-    if dmat is None and formula is None:
-        raise ValueError("Supply either dmat or formula.")
-
-    if dmat is None:
-        sample_description = parse_sample_description(data, sample_description)
-        dmat = data_utils.design_matrix(sample_description=sample_description, formula=formula)
-
-        return dmat
-    else:
-        ar = xr.DataArray(dmat, dims=("observations", "design_params"))
-        ar.coords["design_params"] = dmat.columns
-
-        ds = xr.Dataset({
-            "design": ar,
-        })
-
-        return ds
-
-
-def coef_names(
-        data=None,
-        sample_description: pd.DataFrame = None,
-        formula: str = None,
-        dmat: pd.DataFrame = None
-) -> list:
-    """ Output coefficient names of model only.
-
-    :param data: input data
-    :param formula: model formula.
-    :param sample_description: optional pandas.DataFrame containing sample annotations
-    :param dmat: model design matrix
-    """
-    return design_matrix(
-        data=data,
-        sample_description=sample_description,
-        formula=formula,
-        dmat=dmat
-    ).design_info.column_names
-
-
 def parse_grouping(data, sample_description, grouping):
     if isinstance(grouping, str):
         sample_description = parse_sample_description(data, sample_description)
@@ -171,4 +116,95 @@ def dmat_unique(dmat, sample_description):
     dmat, idx = np.unique(dmat, axis=0, return_index=True)
     sample_description = sample_description.iloc[idx].reset_index(drop=True)
 
-    return dmat, sample_description
+    return dmat, sample_description
+
+
+def design_matrix(
+        data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray,
+                    scipy.sparse.csr_matrix] = None,
+        sample_description: Union[None, pd.DataFrame] = None,
+        formula: Union[None, str] = None,
+        as_numeric: Union[List[str], Tuple[str], str] = (),
+        dmat: Union[pd.DataFrame, None] = None,
+        return_type: str = "xarray",
+) -> Union[patsy.design_info.DesignMatrix, xr.Dataset, pd.DataFrame]:
+    """ Create a design matrix from some sample description.
+
+    This function defaults to perform formatting if dmat is directly supplied as a pd.DataFrame.
+    This function relays batchglm.data.design_matrix() to behave like the other wrappers in diffxpy.
+
+    :param data: Input data matrix (observations x features) or (cells x genes).
+    :param sample_description: pandas.DataFrame of length "num_observations" containing explanatory variables as columns
+    :param formula: model formula as string, describing the relations of the explanatory variables.
+
+        E.g. '~ 1 + batch + confounder'
+    :param as_numeric:
+        Which columns of sample_description to treat as numeric and
+        not as categorical. This yields columns in the design matrix
+        which do not correspond to one-hot encoded discrete factors.
+        This makes sense for number of genes, time, pseudotime or space
+        for example.
+    :param dmat: a model design matrix as a pd.DataFrame
+    :param return_type: type of the returned value.
+
+        - "patsy": return plain patsy.design_info.DesignMatrix object
+        - "dataframe": return pd.DataFrame with observations as rows and params as columns
+        - "xarray": return xr.Dataset with design matrix as ds["design"] and the sample description embedded as
+            one variable per column
+    :param dmat: model design matrix
+    """
+    if data is None and sample_description is None and dmat is None:
+        raise ValueError("supply either data or sample_description or dmat")
+    if dmat is None and formula is None:
+        raise ValueError("supply either dmat or formula")
+
+    if dmat is None:
+        sample_description = parse_sample_description(data, sample_description)
+
+    if sample_description is not None:
+        as_categorical = [False if x in as_numeric else True for x in sample_description.columns.values]
+    else:
+        as_categorical = True
+
+    return data_utils.design_matrix(
+        sample_description=sample_description,
+        formula=formula,
+        as_categorical=as_categorical,
+        dmat=dmat,
+        return_type=return_type
+    )
+
+
+def preview_coef_names(
+        sample_description: pd.DataFrame,
+        formula: str,
+        as_numeric: Union[List[str], Tuple[str], str] = ()
+) -> np.ndarray:
+    """
+    Return coefficient names of model.
+
+    Use this to preview what the model would look like.
+    This function relays batchglm.data.preview_coef_names() to behave like the other wrappers in diffxpy.
+
+    :param sample_description: pandas.DataFrame of length "num_observations" containing explanatory variables as columns
+    :param formula: model formula as string, describing the relations of the explanatory variables.
+
+        E.g. '~ 1 + batch + confounder'
+    :param as_numeric:
+        Which columns of sample_description to treat as numeric and
+        not as categorical. This yields columns in the design matrix
+        which do not correspond to one-hot encoded discrete factors.
+        This makes sense for number of genes, time, pseudotime or space
+        for example.
+    :return: Coefficient names of the model.
+    """
+    if isinstance(as_numeric, str):
+        as_numeric = [as_numeric]
+    if isinstance(as_numeric, tuple):
+        as_numeric = list(as_numeric)
+
+    return data_utils.preview_coef_names(
+        sample_description=sample_description,
+        formula=formula,
+        as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values]
+    )
diff --git a/diffxpy/unit_test/test_constrained.py b/diffxpy/unit_test/test_constrained.py

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,2 @@`
`1`	`1`	`from diffxpy.testing import lrt, wald, t_test, rank_test, two_sample, pairwise, \`
`2`	`2`	`versus_rest, partition, continuous_1d`
`3`		`-from diffxpy.testing import design_matrix, coef_names`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,2 @@`
`1`	`1`	`from .tests import lrt, wald, t_test, rank_test, two_sample, pairwise, \`
`2`	`2`	`versus_rest, partition, continuous_1d`
`3`		`-from .utils import design_matrix, coef_names`