
Commit acce724

Merge pull request #27 from gibsramen/level-limits
Restructure level limits in DiversityHandler
2 parents c102b9a + 0675bc5

10 files changed (196 additions, 196 deletions)


README.md

Lines changed: 4 additions & 4 deletions

@@ -84,6 +84,10 @@ The main data structure in evident is the 'DiversityHandler'.
 This is the way that evident stores the diversity data and metadata for power calculations.
 For our alpha diversity example, we'll load the `AlphaDiversityHandler` class from evident.
 `AlphaDiversityHandler` takes as input the pandas Series with the diversity values and the pandas DataFrame containing the sample metadata.
+By default, evident will only consider metadata columns with, at max, 5 levels.
+To modify this behavior, provide a value for the `max_levels_per_category` argument.
+Additionally, evident will not consider any category levels represented by fewer than 3 samples.
+To modify this behavior, use the `min_count_per_level` argument.
 
 ```python
 adh = evident.AlphaDiversityHandler(faith_pd, metadata)
@@ -158,10 +162,6 @@ bokeh serve --show app
 ```
 
 This should open up a browser window where you can modify the chosen column, significance, level, and observations.
-By default, this interactive view will only consider metadata columns with, at max, 5 levels.
-To modify this behavior, use the `max_levels_per_category` argument in `create_bokeh_app`.
-Additionally, this interactive view will not consider any category levels represented by fewer than 3 samples.
-To modify this behavior, use the `min_count_per_level` argument.
 We also provide a command line script to generate an interactive app using some test data.
 You can access this script at `evident/tests/make_interactive.py`.
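For a quick sense of the relocated options, here is a minimal sketch using the `faith_pd` Series and `metadata` DataFrame from the README example; the threshold values chosen here are arbitrary:

```python
import evident

# Loosen both thresholds relative to the defaults (5 levels, 3 samples).
adh = evident.AlphaDiversityHandler(
    faith_pd,
    metadata,
    max_levels_per_category=10,
    min_count_per_level=5,
)
```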

evident/diversity_handler.py

Lines changed: 105 additions & 10 deletions

@@ -19,9 +19,62 @@
 
 class _BaseDiversityHandler(ABC):
     """Abstract class for handling diversity data and metadata."""
-    def __init__(self, data=None, metadata: pd.DataFrame = None):
+    def __init__(
+        self,
+        data=None,
+        metadata: pd.DataFrame = None,
+        max_levels_per_category: int = 5,
+        min_count_per_level: int = 3
+    ):
         self.data = data
-        self.metadata = metadata
+        metadata = metadata.copy()
+
+        cols_to_drop = []
+        levels_to_drop = dict()
+
+        warn_msg_num_levels = False
+        warn_msg_level_count = False
+        for col in metadata.columns:
+            # Drop non-categorical columns
+            if metadata[col].dtype != np.dtype("object"):
+                cols_to_drop.append(col)
+                continue
+
+            # Drop columns with only one level or more than max
+            num_uniq_cols = len(metadata[col].dropna().unique())
+            if not (1 < num_uniq_cols <= max_levels_per_category):
+                cols_to_drop.append(col)
+                warn_msg_num_levels = True
+                continue
+
+            # Drop levels that have fewer than min_count_per_level samples
+            level_count = metadata[col].value_counts()
+            under_thresh = level_count[level_count < min_count_per_level]
+            if not under_thresh.empty:
+                levels_under_thresh = list(under_thresh.index)
+                metadata[col].replace(
+                    {x: np.nan for x in levels_under_thresh},
+                    inplace=True
+                )
+                levels_to_drop[col] = levels_under_thresh
+                warn_msg_level_count = True
+
+        if warn_msg_num_levels:
+            warn(
+                "Some categories have been dropped because they had either "
+                "only one level or too many. Use the max_levels_per_category "
+                "argument to modify this threshold.\n"
+                f"Dropped columns: {cols_to_drop}"
+            )
+        if warn_msg_level_count:
+            warn(
+                "Some categorical levels have been dropped because they "
+                "did not have enough samples. Use the min_count_per_level "
+                "argument to modify this threshold.\n"
+                f"Dropped levels: {levels_to_drop}"
+            )
+
+        self.metadata = metadata.drop(columns=cols_to_drop)
 
     @property
     def samples(self):
@@ -168,7 +221,7 @@ def _single_power_analysis(
         :type power: float
 
         :returns: Collection of values from power analysis
-        :rtype: evident.power.PowerAnalysisResult
+        :rtype: evident.results.PowerAnalysisResult
         """
         power_func = self._create_partial_power_func(
             column=column,
@@ -235,7 +288,7 @@ def _bulk_power_analysis(
         :type power: sequence of floats
 
         :returns: Collection of values from power analyses
-        :rtype: evident.power.PowerAnalysisResults
+        :rtype: evident.results.PowerAnalysisResults
         """
         # Convert all to list so we can use Cartesian product
         difference = _listify(difference)
@@ -311,12 +364,31 @@
 
 
 class AlphaDiversityHandler(_BaseDiversityHandler):
-    """Handler for alpha diversity data."""
     def __init__(
         self,
         data: pd.Series,
-        metadata: pd.DataFrame
+        metadata: pd.DataFrame,
+        max_levels_per_category: int = 5,
+        min_count_per_level: int = 3
     ):
+        """Handler for alpha diversity data.
+
+        :param data: Alpha diversity vector
+        :type data: pd.Series
+
+        :param metadata: Sample metadata
+        :type metadata: pd.DataFrame
+
+        :param max_levels_per_category: Max number of levels in a category to
+            keep. Any categorical columns that have more than this number of
+            unique levels will not be saved, defaults to 5.
+        :type max_levels_per_category: int
+
+        :param min_count_per_level: Min number of samples in a given category
+            level to keep. Any levels that have fewer than this many samples
+            will not be saved, defaults to 3.
+        :type min_count_per_level: int
+        """
         if not isinstance(data, pd.Series):
             raise ValueError("data must be of type pandas.Series")
         if data.isna().any():
@@ -329,7 +401,9 @@ def __init__(
 
         super().__init__(
             data=data.loc[samps_in_common],
-            metadata=metadata.loc[samps_in_common]
+            metadata=metadata.loc[samps_in_common],
+            max_levels_per_category=max_levels_per_category,
+            min_count_per_level=min_count_per_level
         )
 
     def subset_values(self, ids: list) -> np.array:
@@ -338,12 +412,31 @@ def subset_values(self, ids: list) -> np.array:
 
 
 class BetaDiversityHandler(_BaseDiversityHandler):
-    """Handler for beta diversity data."""
     def __init__(
         self,
         data: DistanceMatrix,
-        metadata: pd.DataFrame
+        metadata: pd.DataFrame,
+        max_levels_per_category: int = 5,
+        min_count_per_level: int = 3
     ):
+        """Handler for beta diversity data.
+
+        :param data: Beta diversity distance matrix
+        :type data: skbio.DistanceMatrix
+
+        :param metadata: Sample metadata
+        :type metadata: pd.DataFrame
+
+        :param max_levels_per_category: Max number of levels in a category to
+            keep. Any categorical columns that have more than this number of
+            unique levels will not be saved, defaults to 5.
+        :type max_levels_per_category: int
+
+        :param min_count_per_level: Min number of samples in a given category
+            level to keep. Any levels that have fewer than this many samples
+            will not be saved, defaults to 3.
+        :type min_count_per_level: int
+        """
        if not isinstance(data, DistanceMatrix):
            raise ValueError("data must be of type skbio.DistanceMatrix")
 
@@ -353,7 +446,9 @@ def __init__(
 
         super().__init__(
             data=data.filter(samps_in_common),
-            metadata=metadata.loc[samps_in_common]
+            metadata=metadata.loc[samps_in_common],
+            max_levels_per_category=max_levels_per_category,
+            min_count_per_level=min_count_per_level
         )
 
     def subset_values(self, ids: list) -> np.array:
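To see the relocated filtering end to end, here is a small sketch against the new `_BaseDiversityHandler.__init__` logic; the sample IDs, column names, and values are made up for illustration:

```python
import numpy as np
import pandas as pd

import evident

ids = [f"S{i}" for i in range(10)]
metadata = pd.DataFrame(
    {
        # Kept, but level "c" has only 2 samples (< 3), so "c" is
        # replaced with NaN and reported in the level-count warning.
        "group": ["a"] * 5 + ["b"] * 3 + ["c"] * 2,
        # Dropped: 10 unique levels exceeds max_levels_per_category=5.
        "site": [f"site{i}" for i in range(10)],
        # Dropped: float dtype is not treated as categorical.
        "age": np.arange(10.0),
    },
    index=ids,
)
faith_pd = pd.Series(np.linspace(1.0, 5.0, 10), index=ids, name="faith_pd")

adh = evident.AlphaDiversityHandler(faith_pd, metadata)  # emits both warnings
print(adh.metadata.columns.tolist())  # ['group']
```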

evident/interactive.py

Lines changed: 1 addition & 55 deletions

@@ -1,8 +1,5 @@
 import os
 import shutil
-from warnings import warn
-
-import numpy as np
 
 from evident.diversity_handler import (_BaseDiversityHandler,
                                        AlphaDiversityHandler,
@@ -12,8 +9,6 @@
 def create_bokeh_app(
     diversity_handler: _BaseDiversityHandler,
     output: os.PathLike,
-    max_levels_per_category: int = 5,
-    min_count_per_level: int = 3
 ) -> None:
     """Creates interactive power analysis using Bokeh.
 
@@ -22,15 +17,6 @@ def create_bokeh_app(
 
     :param output: Location to create Bokeh app
     :type output: os.PathLike
-
-    :param max_levels_per_category: Max number of levels in a category to
-        keep. Any categorical columns that have more than this number of
-        unique levels will not be saved, defaults to 5.
-    :type max_levels_per_category: int
-
-    :param min_count_per_level: Min number of samples in a given category
-        level to keep. Any levels that have fewer than this many samples
-        will not be saved, defaults = 3.
     """
     curr_path = os.path.dirname(__file__)
     support_files = os.path.join(curr_path, "support_files")
@@ -40,49 +26,9 @@ def create_bokeh_app(
     data_dir = os.path.join(output, "data")
     os.mkdir(data_dir)
 
-    # Process metadata
     md = diversity_handler.metadata.copy()
-    cols_to_drop = []
-    warn_msg_num_levels = False
-    warn_msg_level_count = False
-    for col in md.columns:
-        # Drop non-categorical columns
-        if md[col].dtype != np.dtype("object"):
-            cols_to_drop.append(col)
-            continue
-
-        # Drop columns with only one level or more than max
-        if not (1 < len(md[col].dropna().unique()) <= max_levels_per_category):
-            cols_to_drop.append(col)
-            warn_msg_num_levels = True
-            continue
-
-        # Drop levels that have fewer than min_count_per_level samples
-        level_count = md[col].value_counts()
-        under_thresh = level_count[level_count < min_count_per_level]
-        if not under_thresh.empty:
-            levels_under_thresh = list(under_thresh.index)
-            md[col].replace(
-                {x: np.nan for x in levels_under_thresh},
-                inplace=True
-            )
-            warn_msg_level_count = True
-
-    if warn_msg_num_levels:
-        warn(
-            "Some categories have been dropped because they had either only "
-            "one level or too many. Use the max_levels_per_category "
-            "argument to modify this threshold."
-        )
-    if warn_msg_level_count:
-        warn(
-            "Some categorical levels have been dropped because they "
-            "did not have enough samples. Use the min_count_per_level "
-            "argument to modify this threshold."
-        )
-
     md_loc = os.path.join(data_dir, "metadata.tsv")
-    md.drop(columns=cols_to_drop).to_csv(md_loc, sep="\t", index=True)
+    md.to_csv(md_loc, sep="\t", index=True)
 
     data = diversity_handler.data
     if isinstance(diversity_handler, AlphaDiversityHandler):
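Since the handler now owns the level limits, `create_bokeh_app` takes only the handler and an output path. A sketch, reusing the `adh` handler built above with a hypothetical output directory name:

```python
from evident.interactive import create_bokeh_app

# The metadata was already filtered when adh was constructed, so no
# max_levels_per_category/min_count_per_level arguments here anymore.
create_bokeh_app(adh, "power_app")
```

Afterwards, `bokeh serve --show power_app` launches the interactive view as described in the README.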

evident/q2/_methods.py

Lines changed: 30 additions & 12 deletions

@@ -12,13 +12,17 @@
 def alpha_power_analysis(
     alpha_diversity: pd.Series,
     sample_metadata: CategoricalMetadataColumn,
+    max_levels_per_category: int = 5,
+    min_count_per_level: int = 3,
     alpha: list = None,
     power: list = None,
     total_observations: list = None,
-    difference: list = None
+    difference: list = None,
 ) -> pd.DataFrame:
     res = _power_analysis(alpha_diversity, sample_metadata,
-                          AlphaDiversityHandler, alpha=alpha, power=power,
+                          AlphaDiversityHandler,
+                          max_levels_per_category, min_count_per_level,
+                          alpha=alpha, power=power,
                           total_observations=total_observations,
                           difference=difference)
     return res
@@ -27,22 +31,28 @@ def alpha_power_analysis(
 def beta_power_analysis(
     beta_diversity: DistanceMatrix,
     sample_metadata: CategoricalMetadataColumn,
+    max_levels_per_category: int = 5,
+    min_count_per_level: int = 3,
     alpha: list = None,
     power: list = None,
     total_observations: list = None,
-    difference: list = None
+    difference: list = None,
 ) -> pd.DataFrame:
     res = _power_analysis(beta_diversity, sample_metadata,
-                          BetaDiversityHandler, alpha=alpha, power=power,
+                          BetaDiversityHandler,
+                          max_levels_per_category, min_count_per_level,
+                          alpha=alpha, power=power,
                           total_observations=total_observations,
                           difference=difference)
     return res
 
 
-def _power_analysis(data, metadata, handler, **kwargs):
+def _power_analysis(data, metadata, handler, max_levels_per_category,
+                    min_count_per_level, **kwargs):
     md = metadata.to_series()
     column = md.name
-    dh = handler(data, md.to_frame())
+    dh = handler(data, md.to_frame(), max_levels_per_category,
+                 min_count_per_level)
     res = dh.power_analysis(column, **kwargs)
     return res.to_dataframe()
 
@@ -52,11 +62,14 @@ def alpha_effect_size_by_category(
     sample_metadata: Metadata,
     columns: List[str],
     pairwise: bool = False,
-    n_jobs: int = None
+    n_jobs: int = None,
+    max_levels_per_category: int = 5,
+    min_count_per_level: int = 3
 ) -> pd.DataFrame:
     res = _effect_size_by_category(alpha_diversity, sample_metadata,
                                    AlphaDiversityHandler, columns, pairwise,
-                                   n_jobs)
+                                   n_jobs, max_levels_per_category,
+                                   min_count_per_level)
     return res
 
 
@@ -65,17 +78,22 @@ def beta_effect_size_by_category(
     sample_metadata: Metadata,
     columns: List[str],
     pairwise: bool = False,
-    n_jobs: int = None
+    n_jobs: int = None,
+    max_levels_per_category: int = 5,
+    min_count_per_level: int = 3
 ) -> pd.DataFrame:
     res = _effect_size_by_category(beta_diversity, sample_metadata,
                                    BetaDiversityHandler, columns, pairwise,
-                                   n_jobs)
+                                   n_jobs, max_levels_per_category,
+                                   min_count_per_level)
     return res
 
 
 def _effect_size_by_category(data, metadata, handler, columns, pairwise,
-                             n_jobs):
-    dh = handler(data, metadata.to_dataframe())
+                             n_jobs, max_levels_per_category,
+                             min_count_per_level):
+    dh = handler(data, metadata.to_dataframe(), max_levels_per_category,
+                 min_count_per_level)
     if pairwise:
         res = pairwise_effect_size_by_category(dh, columns, n_jobs=n_jobs)
     else:
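The QIIME 2 entry points now simply forward both thresholds to the handler. A rough sketch of the direct calls that `_power_analysis` boils down to for a single column; the column name and power-analysis values are illustrative:

```python
import evident

dh = evident.AlphaDiversityHandler(
    faith_pd,
    metadata[["group"]],  # one categorical column, as a DataFrame
    max_levels_per_category=5,
    min_count_per_level=3,
)
# With difference, alpha, and power given, the power analysis solves
# for the remaining quantity (total observations).
res = dh.power_analysis("group", difference=0.5, alpha=0.05, power=0.8)
print(res.to_dataframe())
```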
