19
19
20
20
class _BaseDiversityHandler (ABC ):
21
21
"""Abstract class for handling diversity data and metadata."""
22
def __init__(
    self,
    data=None,
    metadata: pd.DataFrame = None,
    max_levels_per_category: int = 5,
    min_count_per_level: int = 3
):
    """Store diversity data and a filtered copy of the sample metadata.

    The metadata is filtered in three steps: non-categorical columns are
    removed, categorical columns with an unsuitable number of levels are
    removed, and levels with too few samples are set to missing.

    :param data: Diversity data, stored unchanged on ``self.data``
    :type data: object

    :param metadata: Sample metadata. The passed DataFrame is never
        modified; a filtered copy is stored on ``self.metadata``. If
        None, ``self.metadata`` is set to None and no filtering occurs.
    :type metadata: pd.DataFrame

    :param max_levels_per_category: Max number of levels in a category to
        keep. Any categorical columns that have more than this number of
        unique levels (or fewer than two) will not be saved, defaults
        to 5.
    :type max_levels_per_category: int

    :param min_count_per_level: Min number of samples in a given category
        level to keep. Any levels that have fewer than this many samples
        are replaced by missing values, defaults to 3.
    :type min_count_per_level: int
    """
    self.data = data

    if metadata is None:
        # The signature allows metadata=None, so do not crash on .copy();
        # keep the attribute defined for downstream attribute access.
        self.metadata = None
        return

    # Work on a copy so the caller's DataFrame is left untouched
    metadata = metadata.copy()

    # Tracked separately so the warning below only reports columns that
    # were dropped for the reason the warning states.
    non_categorical_cols = []
    cols_to_drop = []
    levels_to_drop = dict()

    for col in metadata.columns:
        values = metadata[col]

        # Drop non-categorical columns. Both plain object columns and
        # pandas' dedicated string dtype count as categorical here.
        is_categorical = (
            pd.api.types.is_object_dtype(values.dtype)
            or isinstance(values.dtype, pd.StringDtype)
        )
        if not is_categorical:
            non_categorical_cols.append(col)
            continue

        # Drop columns with only one level or more than max
        # (missing values are not counted as a level)
        num_levels = values.nunique()
        if not (1 < num_levels <= max_levels_per_category):
            cols_to_drop.append(col)
            continue

        # Drop levels that have fewer than min_count_per_level samples
        level_count = values.value_counts()
        under_thresh = level_count[level_count < min_count_per_level]
        if not under_thresh.empty:
            levels_under_thresh = list(under_thresh.index)
            # Assign the result back to the frame: an in-place replace on
            # the selected column may operate on a temporary and leave
            # the DataFrame unchanged.
            metadata[col] = values.mask(values.isin(levels_under_thresh))
            levels_to_drop[col] = levels_under_thresh
            # NOTE(review): a column can be left with fewer than two
            # levels after this step; it is still kept. Confirm whether
            # downstream code expects that.

    if cols_to_drop:
        warn(
            "Some categories have been dropped because they had either "
            "only one level or too many. Use the max_levels_per_category "
            "argument to modify this threshold.\n "
            f"Dropped columns: { cols_to_drop } "
        )
    if levels_to_drop:
        warn(
            "Some categorical levels have been dropped because they "
            "did not have enough samples. Use the min_count_per_level "
            "argument to modify this threshold.\n "
            f"Dropped levels: { levels_to_drop } "
        )

    self.metadata = metadata.drop(
        columns=non_categorical_cols + cols_to_drop
    )
25
78
26
79
@property
27
80
def samples (self ):
@@ -168,7 +221,7 @@ def _single_power_analysis(
168
221
:type power: float
169
222
170
223
:returns: Collection of values from power analysis
171
- :rtype: evident.power .PowerAnalysisResult
224
+ :rtype: evident.results .PowerAnalysisResult
172
225
"""
173
226
power_func = self ._create_partial_power_func (
174
227
column = column ,
@@ -235,7 +288,7 @@ def _bulk_power_analysis(
235
288
:type power: sequence of floats
236
289
237
290
:returns: Collection of values from power analyses
238
- :rtype: evident.power .PowerAnalysisResults
291
+ :rtype: evident.results .PowerAnalysisResults
239
292
"""
240
293
# Convert all to list so we can use Cartesian product
241
294
difference = _listify (difference )
@@ -311,12 +364,31 @@ def _create_partial_power_func(
311
364
312
365
313
366
class AlphaDiversityHandler (_BaseDiversityHandler ):
314
- """Handler for alpha diversity data."""
315
367
def __init__ (
316
368
self ,
317
369
data : pd .Series ,
318
- metadata : pd .DataFrame
370
+ metadata : pd .DataFrame ,
371
+ max_levels_per_category : int = 5 ,
372
+ min_count_per_level : int = 3
319
373
):
374
+ """Handler for alpha diversity data.
375
+
376
+ :param data: Alpha diversity vector
377
+ :type data: pd.Series
378
+
379
+ :param metadata: Sample metadata
380
+ :type metadata: pd.DataFrame
381
+
382
+ :param max_levels_per_category: Max number of levels in a category to
383
+ keep. Any categorical columns that have more than this number of
384
+ unique levels will not be saved, defaults to 5.
385
+ :type max_levels_per_category: int
386
+
387
+ :param min_count_per_level: Min number of samples in a given category
388
+ level to keep. Any levels that have fewer than this many samples
389
+ will not be saved, defaults to 3.
390
+ :type min_count_per_level: int
391
+ """
320
392
if not isinstance (data , pd .Series ):
321
393
raise ValueError ("data must be of type pandas.Series" )
322
394
if data .isna ().any ():
@@ -329,7 +401,9 @@ def __init__(
329
401
330
402
super ().__init__ (
331
403
data = data .loc [samps_in_common ],
332
- metadata = metadata .loc [samps_in_common ]
404
+ metadata = metadata .loc [samps_in_common ],
405
+ max_levels_per_category = max_levels_per_category ,
406
+ min_count_per_level = min_count_per_level
333
407
)
334
408
335
409
def subset_values (self , ids : list ) -> np .array :
@@ -338,12 +412,31 @@ def subset_values(self, ids: list) -> np.array:
338
412
339
413
340
414
class BetaDiversityHandler (_BaseDiversityHandler ):
341
- """Handler for beta diversity data."""
342
415
def __init__ (
343
416
self ,
344
417
data : DistanceMatrix ,
345
- metadata : pd .DataFrame
418
+ metadata : pd .DataFrame ,
419
+ max_levels_per_category : int = 5 ,
420
+ min_count_per_level : int = 3
346
421
):
422
+ """Handler for beta diversity data.
423
+
424
+ :param data: Beta diversity distance matrix
425
+ :type data: skbio.DistanceMatrix
426
+
427
+ :param metadata: Sample metadata
428
+ :type metadata: pd.DataFrame
429
+
430
+ :param max_levels_per_category: Max number of levels in a category to
431
+ keep. Any categorical columns that have more than this number of
432
+ unique levels will not be saved, defaults to 5.
433
+ :type max_levels_per_category: int
434
+
435
+ :param min_count_per_level: Min number of samples in a given category
436
+ level to keep. Any levels that have fewer than this many samples
437
+ will not be saved, defaults to 3.
438
+ :type min_count_per_level: int
439
+ """
347
440
if not isinstance (data , DistanceMatrix ):
348
441
raise ValueError ("data must be of type skbio.DistanceMatrix" )
349
442
@@ -353,7 +446,9 @@ def __init__(
353
446
354
447
super ().__init__ (
355
448
data = data .filter (samps_in_common ),
356
- metadata = metadata .loc [samps_in_common ]
449
+ metadata = metadata .loc [samps_in_common ],
450
+ max_levels_per_category = max_levels_per_category ,
451
+ min_count_per_level = min_count_per_level
357
452
)
358
453
359
454
def subset_values (self , ids : list ) -> np .array :
0 commit comments