Skip to content

Commit a62e4a0

Browse files
authored
[data/preprocessors] Fix StandardScaler to handle NaN stats (#51281)
<!-- Thank you for your contribution! Please review https://github.com/ray-project/ray/blob/master/CONTRIBUTING.rst before opening a pull request. --> <!-- Please add a reviewer to the assignee section when you create a PR. If you don't have the access to it, we will shortly find a reviewer and assign them to your PR. --> ## Why are these changes needed? When `StandardScaler.stats_` has a missing (`NaN`) mean or standard deviation for a column, short-circuit the transform for that column and return it filled with `NaN`. ## Related issue number Closes #51243 ## Checks - [ ] I've signed off every commit(by using the -s flag, i.e., `git commit -s`) in this PR. - [ ] I've run `scripts/format.sh` to lint the changes in this PR. - [ ] I've included any doc changes needed for https://docs.ray.io/en/master/. - [ ] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file. - [ ] I've made sure the tests are passing. Note that there might be a few flaky tests, see the recent failures at https://flakey-tests.ray.io/ - Testing Strategy - [ ] Unit tests - [ ] Release tests - [ ] This PR is not tested :( --------- Signed-off-by: Praveen Gorthy <praveeng@anyscale.com>
1 parent e2e4c32 commit a62e4a0

File tree

2 files changed

+32
-7
lines changed

2 files changed

+32
-7
lines changed

python/ray/data/preprocessors/scaler.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,10 @@ def column_standard_scaler(s: pd.Series):
9595
s_mean = self.stats_[f"mean({s.name})"]
9696
s_std = self.stats_[f"std({s.name})"]
9797

98+
if s_std is None or s_mean is None:
99+
s[:] = np.nan
100+
return s
101+
98102
# Handle division by zero.
99103
# TODO: extend this to handle near-zero values.
100104
if s_std == 0:

python/ray/data/tests/preprocessors/test_scaler.py

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import pandas as pd
22
import pytest
3+
import numpy as np
34

45
import ray
56
from ray.data.preprocessor import PreprocessorNotFittedException
@@ -263,33 +264,50 @@ def test_standard_scaler():
263264
col_a = [-1, 0, 1, 2]
264265
col_b = [1, 1, 5, 5]
265266
col_c = [1, 1, 1, None]
266-
in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b, "C": col_c})
267-
ds = ray.data.from_pandas(in_df)
267+
col_d = [None, None, None, None]
268+
sample_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b, "C": col_c, "D": col_d})
269+
ds = ray.data.from_pandas(sample_df)
268270

269-
scaler = StandardScaler(["B", "C"])
271+
scaler = StandardScaler(["B", "C", "D"])
270272

271273
# Transform with unfitted preprocessor.
272274
with pytest.raises(PreprocessorNotFittedException):
273275
scaler.transform(ds)
274276

275277
# Fit data.
276-
scaler.fit(ds)
278+
scaler = scaler.fit(ds)
277279
assert scaler.stats_ == {
278280
"mean(B)": 3.0,
279281
"mean(C)": 1.0,
282+
"mean(D)": None,
280283
"std(B)": 2.0,
281284
"std(C)": 0.0,
285+
"std(D)": None,
282286
}
283287

284288
# Transform data.
285-
transformed = scaler.transform(ds)
289+
in_col_a = [-1, 0, 1, 2]
290+
in_col_b = [1, 1, 5, 5]
291+
in_col_c = [1, 1, 1, None]
292+
in_col_d = [0, None, None, None]
293+
in_df = pd.DataFrame.from_dict(
294+
{"A": in_col_a, "B": in_col_b, "C": in_col_c, "D": in_col_d}
295+
)
296+
in_ds = ray.data.from_pandas(in_df)
297+
transformed = scaler.transform(in_ds)
286298
out_df = transformed.to_pandas()
287299

288300
processed_col_a = col_a
289301
processed_col_b = [-1.0, -1.0, 1.0, 1.0]
290302
processed_col_c = [0.0, 0.0, 0.0, None]
303+
processed_col_d = [np.nan, np.nan, np.nan, np.nan]
291304
expected_df = pd.DataFrame.from_dict(
292-
{"A": processed_col_a, "B": processed_col_b, "C": processed_col_c}
305+
{
306+
"A": processed_col_a,
307+
"B": processed_col_b,
308+
"C": processed_col_c,
309+
"D": processed_col_d,
310+
}
293311
)
294312

295313
pd.testing.assert_frame_equal(out_df, expected_df, check_like=True)
@@ -298,20 +316,23 @@ def test_standard_scaler():
298316
pred_col_a = [1, 2, 3]
299317
pred_col_b = [3, 5, 7]
300318
pred_col_c = [0, 1, 2]
319+
pred_col_d = [None, None, None]
301320
pred_in_df = pd.DataFrame.from_dict(
302-
{"A": pred_col_a, "B": pred_col_b, "C": pred_col_c}
321+
{"A": pred_col_a, "B": pred_col_b, "C": pred_col_c, "D": pred_col_d}
303322
)
304323

305324
pred_out_df = scaler.transform_batch(pred_in_df)
306325

307326
pred_processed_col_a = pred_col_a
308327
pred_processed_col_b = [0.0, 1.0, 2.0]
309328
pred_processed_col_c = [-1.0, 0.0, 1.0]
329+
pred_processed_col_d = [None, None, None]
310330
pred_expected_df = pd.DataFrame.from_dict(
311331
{
312332
"A": pred_processed_col_a,
313333
"B": pred_processed_col_b,
314334
"C": pred_processed_col_c,
335+
"D": pred_processed_col_d,
315336
}
316337
)
317338

0 commit comments

Comments
 (0)