1
+ import logging
1
2
import numpy as np
2
3
import pandas as pd
4
+ from typing import Union
3
5
4
6
from ..stats import stats
5
7
from ..testing import correction
6
- from ..testing .base import _DifferentialExpressionTest
8
+ from ..testing .det import _DifferentialExpressionTest
7
9
10
+ logger = logging .getLogger (__name__ )
8
11
9
- class RefSets () :
12
+ class RefSets :
10
13
"""
11
14
Class for a list of gene sets.
12
15
@@ -18,7 +21,7 @@ class RefSets():
18
21
.gmt files can be downloaded from http://software.broadinstitute.org/gsea/msigdb/collections.jsp for example.
19
22
"""
20
23
21
- class _Set () :
24
+ class _Set :
22
25
"""
23
26
Class for a single gene set.
24
27
"""
@@ -182,18 +185,19 @@ def overlap(self, enq_set: set, set_id=None):
182
185
for x in self .sets :
183
186
x .intersect = x .genes .intersection (enq_set )
184
187
else :
185
- x .intersect = self .get_set (id ).genes .intersection (enq_set )
188
+ x .intersect = self .get_set (id ).genes .intersection (enq_set ) # bug
186
189
187
190
188
191
def test (
189
- RefSets : RefSets ,
190
- DETest : _DifferentialExpressionTest = None ,
191
- pval : np .array = None ,
192
- gene_ids : list = None ,
192
+ ref : RefSets ,
193
+ det : Union [ _DifferentialExpressionTest , None ] = None ,
194
+ pval : Union [ np .array , None ] = None ,
195
+ gene_ids : Union [ list , None ] = None ,
193
196
de_threshold = 0.05 ,
197
+ incl_all_zero = False ,
194
198
all_ids = None ,
195
- clean_ref = True ,
196
- upper = False
199
+ clean_ref = False ,
200
+ capital = True
197
201
):
198
202
""" Perform gene set enrichment.
199
203
@@ -214,51 +218,61 @@ def test(
214
218
which can be matched against the identifieres in the sets in RefSets.
215
219
:param de_threshold:
216
220
Significance threshold at which a differential test (a multiple-testing
217
- corrected p-value) is called siginficant. This
221
+ corrected p-value) is called siginficant. T
222
+ :param incl_all_zero:
223
+ Wehther to include genes in gene universe which were all zero.
218
224
:param all_ids:
219
225
Set of all gene identifiers, this is used as the background set in the
220
226
hypergeometric test. Only supply this if not all genes were tested
221
227
and are supplied above in DETest or gene_ids.
222
228
:param clean_ref:
223
229
Whether or not to only retain gene identifiers in RefSets that occur in
224
230
the background set of identifiers supplied here through all_ids.
225
- :param upper :
231
+ :param capital :
226
232
Make all gene IDs captial.
227
233
"""
228
234
return Enrich (
229
- RefSets = RefSets ,
230
- DETest = DETest ,
235
+ ref = ref ,
236
+ det = det ,
231
237
pval = pval ,
232
238
gene_ids = gene_ids ,
233
239
de_threshold = de_threshold ,
240
+ incl_all_zero = incl_all_zero ,
234
241
all_ids = all_ids ,
235
242
clean_ref = clean_ref ,
236
- upper = upper )
243
+ capital = capital
244
+ )
237
245
238
246
239
- class Enrich () :
247
+ class Enrich :
240
248
"""
241
249
"""
242
250
243
251
def __init__ (
244
252
self ,
245
- RefSets : RefSets ,
246
- DETest : _DifferentialExpressionTest = None ,
247
- pval : np .array = None ,
248
- gene_ids : list = None ,
249
- de_threshold = 0.05 ,
250
- all_ids = None ,
251
- clean_ref = True ,
252
- upper = False
253
+ ref : RefSets ,
254
+ det : Union [_DifferentialExpressionTest , None ],
255
+ pval : Union [np .array , None ],
256
+ gene_ids : Union [list , None ],
257
+ de_threshold ,
258
+ incl_all_zero ,
259
+ all_ids ,
260
+ clean_ref ,
261
+ capital
253
262
):
254
263
self ._n_overlaps = None
255
264
self ._pval_enrich = None
256
265
self ._qval_enrich = None
257
266
# Load multiple-testing-corrected differential expression
258
267
# p-values from differential expression output.
259
- if DETest is not None :
260
- self ._qval_de = DETest .qval
261
- self ._gene_ids = DETest .gene_ids
268
+ if det is not None :
269
+ if incl_all_zero :
270
+ self ._qval_de = det .qval
271
+ self ._gene_ids = det .gene_ids
272
+ else :
273
+ idx_not_all_zero = np .where (np .logical_not (det .summary ()["zero_mean" ].values ))[0 ]
274
+ self ._qval_de = det .qval [idx_not_all_zero ]
275
+ self ._gene_ids = det .gene_ids [idx_not_all_zero ]
262
276
elif pval is not None and gene_ids is not None :
263
277
self ._qval_de = np .asarray (pval )
264
278
self ._gene_ids = gene_ids
@@ -268,8 +282,11 @@ def __init__(
268
282
# Select significant genes based on user defined threshold.
269
283
if any ([x is np .nan for x in self ._gene_ids ]):
270
284
idx_notnan = np .where ([x is not np .nan for x in self ._gene_ids ])[0 ]
271
- print ('Discarded ' + str (len (self ._gene_ids ) - len (idx_notnan )) + ' nan gene ids, leaving ' +
272
- str (len (idx_notnan )) + ' genes.' )
285
+ logger .info (
286
+ " Discarded %i nan gene ids, leaving %i genes." ,
287
+ len (self ._gene_ids ) - len (idx_notnan ),
288
+ len (idx_notnan )
289
+ )
273
290
self ._qval_de = self ._qval_de [idx_notnan ]
274
291
self ._gene_ids = self ._gene_ids [idx_notnan ]
275
292
@@ -280,25 +297,31 @@ def __init__(
280
297
else :
281
298
self ._all_ids = set (self ._gene_ids )
282
299
283
- if upper == True :
300
+ if capital :
284
301
self ._gene_ids = [x .upper () for x in self ._gene_ids ]
285
302
self ._all_ids = set ([x .upper () for x in self ._all_ids ])
303
+ self ._significant_ids = set ([x .upper () for x in self ._significant_ids ])
286
304
287
305
# Generate diagnostic statistic of number of possible overlaps in total.
288
- print (str (len (set (self ._all_ids ).intersection (set (RefSets ._genes )))) +
289
- ' overlaps found between refset (' + str (len (RefSets ._genes )) +
290
- ') and provided gene list (' + str (len (self ._all_ids )) + ').' )
291
- self .missing_genes = list (set (RefSets ._genes ).difference (set (self ._all_ids )))
306
+ logger .info (
307
+ " %i overlaps found between refset (%i) and provided gene list (%i)." ,
308
+ len (set (self ._all_ids ).intersection (set (ref ._genes ))),
309
+ len (ref ._genes ),
310
+ len (self ._all_ids )
311
+ )
312
+ self .missing_genes = list (set (ref ._genes ).difference (set (self ._all_ids )))
292
313
# Clean reference set to only contains ids that were observed in
293
314
# current study if required.
294
- self .RefSets = RefSets
295
- if clean_ref == True :
315
+ self .RefSets = ref
316
+ if clean_ref :
296
317
self .RefSets .clean (self ._all_ids )
297
318
# Print if there are empty sets.
298
319
idx_nonempty = np .where ([len (x .genes ) > 0 for x in self .RefSets .sets ])[0 ]
299
320
if len (self .RefSets .sets ) - len (idx_nonempty ) > 0 :
300
- print ('Found ' + str (len (self .RefSets .sets ) - len (idx_nonempty )) +
301
- ' empty sets, removing those.' )
321
+ logger .info (
322
+ " Found %i empty sets, removing those." ,
323
+ len (self .RefSets .sets ) - len (idx_nonempty )
324
+ )
302
325
self .RefSets = self .RefSets .subset (idx = idx_nonempty )
303
326
elif len (idx_nonempty ) == 0 :
304
327
raise ValueError ('all RefSets were empty' )
@@ -356,7 +379,7 @@ def grepv_sets(self, x):
356
379
"""
357
380
return self .RefSets .grepv_sets (x )
358
381
359
- def set (id ):
382
+ def set (self , id ):
360
383
"""
361
384
Return the set with a given set identifier.
362
385
"""
@@ -374,9 +397,11 @@ def significant_set_ids(self, threshold=0.05) -> np.array:
374
397
"""
375
398
return [self .RefSets ._ids [i ] for i in np .where (self .qval <= threshold )[0 ]]
376
399
377
- def summary (self ) -> pd .DataFrame :
400
+ def summary (self , sort = True ) -> pd .DataFrame :
378
401
"""
379
402
Summarize gene set enrichement analysis as an output table.
403
+
404
+ :param sort: Whether to sort table by p-value.
380
405
"""
381
406
res = pd .DataFrame ({
382
407
"set" : self .RefSets ._ids ,
@@ -388,5 +413,15 @@ def summary(self) -> pd.DataFrame:
388
413
"background" : len (self ._all_ids )
389
414
})
390
415
# Sort by p-value
391
- res = res .iloc [np .argsort (res ['pval' ].values ), :]
416
+ if sort :
417
+ res = res .iloc [np .argsort (res ['pval' ].values ), :]
392
418
return res
419
+
420
+ def set_summary (self , id : str ):
421
+ """
422
+ Summarize gene set enrichement analysis for a given set.
423
+ :param id: Gene set to enquire.
424
+
425
+ :return: Slice of summary table.
426
+ """
427
+ return self .summary (sort = False ).iloc [self .RefSets ._ids .index (id ), :]
0 commit comments