1
- from typing import Union
2
-
3
1
import anndata
4
2
import numpy as np
5
3
import pandas as pd
6
4
import patsy
7
5
import scipy
6
+ from typing import List , Tuple , Union
8
7
import xarray as xr
9
8
10
9
from batchglm import data as data_utils
10
+ # Relay util functions for diffxpy api. design_matrix and preview_coef_names are redefined here.
11
+ from batchglm .data import constraint_matrix_from_string , setup_constrained
12
+ from batchglm .data import design_matrix_from_xarray , design_matrix_from_anndata
13
+ from batchglm .data import view_coef_names
11
14
12
15
13
16
def parse_gene_names (data , gene_names ):
@@ -95,64 +98,6 @@ def parse_size_factors(
95
98
return size_factors
96
99
97
100
98
- def design_matrix (
99
- data = None ,
100
- sample_description : pd .DataFrame = None ,
101
- formula : str = None ,
102
- dmat : pd .DataFrame = None
103
- ) -> Union [patsy .design_info .DesignMatrix , xr .Dataset ]:
104
- """ Build design matrix for fit of generalized linear model.
105
-
106
- This is necessary for wald tests and likelihood ratio tests.
107
- This function only carries through formatting if dmat is directly supplied.
108
-
109
- :param data: input data
110
- :param formula: model formula.
111
- :param sample_description: optional pandas.DataFrame containing sample annotations
112
- :param dmat: model design matrix
113
- """
114
- if data is None and sample_description is None and dmat is None :
115
- raise ValueError ("Supply either data or sample_description or dmat." )
116
- if dmat is None and formula is None :
117
- raise ValueError ("Supply either dmat or formula." )
118
-
119
- if dmat is None :
120
- sample_description = parse_sample_description (data , sample_description )
121
- dmat = data_utils .design_matrix (sample_description = sample_description , formula = formula )
122
-
123
- return dmat
124
- else :
125
- ar = xr .DataArray (dmat , dims = ("observations" , "design_params" ))
126
- ar .coords ["design_params" ] = dmat .columns
127
-
128
- ds = xr .Dataset ({
129
- "design" : ar ,
130
- })
131
-
132
- return ds
133
-
134
-
135
- def coef_names (
136
- data = None ,
137
- sample_description : pd .DataFrame = None ,
138
- formula : str = None ,
139
- dmat : pd .DataFrame = None
140
- ) -> list :
141
- """ Output coefficient names of model only.
142
-
143
- :param data: input data
144
- :param formula: model formula.
145
- :param sample_description: optional pandas.DataFrame containing sample annotations
146
- :param dmat: model design matrix
147
- """
148
- return design_matrix (
149
- data = data ,
150
- sample_description = sample_description ,
151
- formula = formula ,
152
- dmat = dmat
153
- ).design_info .column_names
154
-
155
-
156
101
def parse_grouping (data , sample_description , grouping ):
157
102
if isinstance (grouping , str ):
158
103
sample_description = parse_sample_description (data , sample_description )
@@ -171,4 +116,95 @@ def dmat_unique(dmat, sample_description):
171
116
dmat , idx = np .unique (dmat , axis = 0 , return_index = True )
172
117
sample_description = sample_description .iloc [idx ].reset_index (drop = True )
173
118
174
- return dmat , sample_description
119
+ return dmat , sample_description
120
+
121
+
122
def design_matrix(
        data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray,
                    scipy.sparse.csr_matrix] = None,
        sample_description: Union[None, pd.DataFrame] = None,
        formula: Union[None, str] = None,
        as_numeric: Union[List[str], Tuple[str], str] = (),
        dmat: Union[pd.DataFrame, None] = None,
        return_type: str = "xarray",
) -> Union[patsy.design_info.DesignMatrix, xr.Dataset, pd.DataFrame]:
    """ Create a design matrix from some sample description.

    This function defaults to perform formatting if dmat is directly supplied as a pd.DataFrame.
    This function relays batchglm.data.design_matrix() to behave like the other wrappers in diffxpy.

    :param data: Input data matrix (observations x features) or (cells x genes).
        Only used to look up the sample description if dmat is not supplied.
    :param sample_description: pandas.DataFrame of length "num_observations" containing explanatory variables as columns
    :param formula: model formula as string, describing the relations of the explanatory variables.

        E.g. '~ 1 + batch + confounder'
    :param as_numeric:
        Which columns of sample_description to treat as numeric and
        not as categorical. This yields columns in the design matrix
        which do not correspond to one-hot encoded discrete factors.
        This makes sense for number of genes, time, pseudotime or space
        for example.
        A single column name may be given as a plain string.
    :param dmat: a model design matrix as a pd.DataFrame
    :param return_type: type of the returned value.

        - "patsy": return plain patsy.design_info.DesignMatrix object
        - "dataframe": return pd.DataFrame with observations as rows and params as columns
        - "xarray": return xr.Dataset with design matrix as ds["design"] and the sample description embedded as
            one variable per column
    :return: The value produced by batchglm.data.design_matrix() for the requested return_type.
    :raises ValueError: If none of data, sample_description and dmat is given,
        or if neither dmat nor formula is given.
    """
    if data is None and sample_description is None and dmat is None:
        raise ValueError("supply either data or sample_description or dmat")
    if dmat is None and formula is None:
        raise ValueError("supply either dmat or formula")

    # A bare string must be wrapped before the membership test below: "x in some_str"
    # is a substring search, so as_numeric="time" would otherwise also match a column
    # called "t" or "im". This mirrors the normalisation done in preview_coef_names().
    if isinstance(as_numeric, str):
        as_numeric = [as_numeric]

    if dmat is None:
        # Only resolve the sample description when the design matrix still has to be built.
        sample_description = parse_sample_description(data, sample_description)

    if sample_description is not None:
        # One flag per column of the sample description: True = categorical, False = numeric.
        as_categorical = [x not in as_numeric for x in sample_description.columns.values]
    else:
        # No sample description available (dmat given directly): fall back to a scalar flag.
        as_categorical = True

    return data_utils.design_matrix(
        sample_description=sample_description,
        formula=formula,
        as_categorical=as_categorical,
        dmat=dmat,
        return_type=return_type
    )
176
+
177
+
178
def preview_coef_names(
        sample_description: pd.DataFrame,
        formula: str,
        as_numeric: Union[List[str], Tuple[str], str] = ()
) -> np.ndarray:
    """
    Preview the coefficient names a model formula would produce.

    Thin relay around batchglm.data.preview_coef_names() so that this helper takes
    an ``as_numeric`` column selection like the other wrappers in diffxpy.

    :param sample_description: pandas.DataFrame of length "num_observations" containing explanatory variables as columns
    :param formula: model formula as string, describing the relations of the explanatory variables.

        E.g. '~ 1 + batch + confounder'
    :param as_numeric:
        Columns of sample_description to treat as numeric rather than categorical.
        Such columns are not one-hot encoded in the design matrix, which is
        appropriate for e.g. number of genes, time, pseudotime or space.
        May be a single column name, a list or a tuple of column names.
    :return: The coefficient names as returned by batchglm.data.preview_coef_names().
    """
    # Bring the column selection into list form: a lone name is wrapped,
    # a tuple is converted, anything else is passed through untouched.
    if isinstance(as_numeric, str):
        numeric_columns = [as_numeric]
    elif isinstance(as_numeric, tuple):
        numeric_columns = list(as_numeric)
    else:
        numeric_columns = as_numeric

    # One flag per column of the sample description: True = categorical, False = numeric.
    categorical_mask = [
        column not in numeric_columns
        for column in sample_description.columns.values
    ]

    return data_utils.preview_coef_names(
        sample_description=sample_description,
        formula=formula,
        as_categorical=categorical_mask
    )
0 commit comments