
Commit b72252e

lucyleeow and ogrisel authored
TST Fix typo, lint test_target_encoder.py (scikit-learn#26958)
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
1 parent cb15a82 commit b72252e

File tree

1 file changed: +45 -29 lines changed


sklearn/preprocessing/tests/test_target_encoder.py

Lines changed: 45 additions & 29 deletions
@@ -61,54 +61,70 @@ def _encode_target(X_ordinal, y_int, n_categories, smooth):
 @pytest.mark.parametrize("smooth", [5.0, "auto"])
 @pytest.mark.parametrize("target_type", ["binary", "continuous"])
 def test_encoding(categories, unknown_value, global_random_seed, smooth, target_type):
-    """Check encoding for binary and continuous targets."""
+    """Check encoding for binary and continuous targets.
+
+    Compare the values returned by `TargetEncoder.fit_transform` against the
+    expected encodings for cv splits from a naive reference Python
+    implementation in _encode_target.
+    """

-    X_train_array = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=np.int64).T
-    X_test_array = np.array([[0, 1, 2]], dtype=np.int64).T
     n_categories = 3
-    n_samples = X_train_array.shape[0]
+    X_train_int_array = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=np.int64).T
+    X_test_int_array = np.array([[0, 1, 2]], dtype=np.int64).T
+    n_samples = X_train_int_array.shape[0]

     if categories == "auto":
-        X_train = X_train_array
+        X_train = X_train_int_array
+        X_test = X_test_int_array
     else:
-        X_train = categories[0][X_train_array]
+        X_train = categories[0][X_train_int_array]
+        X_test = categories[0][X_test_int_array]

-    if categories == "auto":
-        X_test = X_test_array
-    else:
-        X_test = categories[0][X_test_array]
     X_test = np.concatenate((X_test, [[unknown_value]]))

-    rng = np.random.RandomState(global_random_seed)
-
+    data_rng = np.random.RandomState(global_random_seed)
+    n_splits = 3
     if target_type == "binary":
-        y_int = rng.randint(low=0, high=2, size=n_samples)
+        y_int = data_rng.randint(low=0, high=2, size=n_samples)
         target_names = np.array(["cat", "dog"], dtype=object)
         y_train = target_names[y_int]
-        cv = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)
+
     else:  # target_type == continuous
-        y_int = rng.uniform(low=-10, high=20, size=n_samples)
+        y_int = data_rng.uniform(low=-10, high=20, size=n_samples)
         y_train = y_int
-        cv = KFold(n_splits=3, random_state=0, shuffle=True)

-    shuffled_idx = rng.permutation(n_samples)
-    X_train_array = X_train_array[shuffled_idx]
+    shuffled_idx = data_rng.permutation(n_samples)
+    X_train_int_array = X_train_int_array[shuffled_idx]
     X_train = X_train[shuffled_idx]
     y_train = y_train[shuffled_idx]
     y_int = y_int[shuffled_idx]

-    # Get encodings for cv splits to validate `fit_transform`
-    expected_X_fit_transform = np.empty_like(X_train_array, dtype=np.float64)
+    # Define our CV splitting strategy
+    if target_type == "binary":
+        cv = StratifiedKFold(
+            n_splits=n_splits, random_state=global_random_seed, shuffle=True
+        )
+    else:
+        cv = KFold(n_splits=n_splits, random_state=global_random_seed, shuffle=True)
+
+    # Compute the expected values using our reference Python implementation of
+    # target encoding:
+    expected_X_fit_transform = np.empty_like(X_train_int_array, dtype=np.float64)

-    for train_idx, test_idx in cv.split(X_train_array, y_train):
-        X_, y_ = X_train_array[train_idx, 0], y_int[train_idx]
+    for train_idx, test_idx in cv.split(X_train_int_array, y_train):
+        X_, y_ = X_train_int_array[train_idx, 0], y_int[train_idx]
         cur_encodings = _encode_target(X_, y_, n_categories, smooth)
         expected_X_fit_transform[test_idx, 0] = cur_encodings[
-            X_train_array[test_idx, 0]
+            X_train_int_array[test_idx, 0]
         ]

+    # Check that we can obtain the same encodings by calling `fit_transform` on
+    # the estimator with the same CV parameters:
     target_encoder = TargetEncoder(
-        smooth=smooth, categories=categories, cv=3, random_state=0
+        smooth=smooth,
+        categories=categories,
+        cv=n_splits,
+        random_state=global_random_seed,
     )

     X_fit_transform = target_encoder.fit_transform(X_train, y_train)
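
Note: the body of the `_encode_target` reference helper is defined earlier in the test file and is not shown in this diff; only its signature appears in the hunk header above. As a rough sketch only, assuming it follows the shrinkage formula documented for `TargetEncoder` with a numeric `smooth` (the `smooth="auto"` empirical Bayes variant is left out), such a naive reference could look like:

import numpy as np

def _encode_target_sketch(X_ordinal, y_int, n_categories, smooth):
    # Hypothetical stand-in for the test's reference helper: encode each
    # category by its target mean shrunk toward the global target mean, with
    # `smooth` acting as a pseudo-count. Categories absent from the data fall
    # back to the global mean.
    y_mean = np.mean(y_int)
    encodings = np.full(n_categories, y_mean, dtype=np.float64)
    for cat in range(n_categories):
        mask = X_ordinal == cat
        n_cat = np.count_nonzero(mask)
        if n_cat > 0:
            shrinkage = n_cat / (n_cat + smooth)
            encodings[cat] = shrinkage * np.mean(y_int[mask]) + (1 - shrinkage) * y_mean
    return encodings

With `smooth=0.0` the shrinkage factor is 1 and each category is encoded by its raw per-category mean, which is the behaviour exercised by `test_smooth_zero` in a later hunk.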
@@ -120,12 +136,12 @@ def test_encoding(categories, unknown_value, global_random_seed, smooth, target_
     # compute encodings for all data to validate `transform`
     y_mean = np.mean(y_int)
     expected_encodings = _encode_target(
-        X_train_array[:, 0], y_int, n_categories, smooth
+        X_train_int_array[:, 0], y_int, n_categories, smooth
     )
     assert_allclose(target_encoder.encodings_[0], expected_encodings)
     assert target_encoder.target_mean_ == pytest.approx(y_mean)

-    # Transform on test data, the last value is unknown is it is encoded as the target
+    # Transform on test data, the last value is unknown so it is encoded as the target
     # mean
     expected_X_test_transform = np.concatenate(
         (expected_encodings, np.array([y_mean]))
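
The behaviour asserted at the end of this hunk, an unknown category being encoded as the overall target mean, can also be observed directly through the public API. A minimal, hypothetical example (category labels and target values are made up for illustration):

import numpy as np
from sklearn.preprocessing import TargetEncoder

X = np.array([["a"], ["a"], ["b"], ["b"]], dtype=object)
y = np.array([10.0, 12.0, 20.0, 22.0])

enc = TargetEncoder(smooth="auto", target_type="continuous").fit(X, y)
# "c" was never seen during fit, so it is encoded as the global target mean.
X_new = enc.transform(np.array([["c"]], dtype=object))
assert np.isclose(X_new[0, 0], enc.target_mean_)

Plain `fit` is used here rather than `fit_transform`, so no cross fitting is involved; the example only illustrates how categories unseen during `fit` are handled at `transform` time.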
@@ -394,15 +410,15 @@ def test_smooth_zero():
     # it will be encoded as the mean of the second half
     assert_allclose(X_trans[0], np.mean(y[5:]))

-    # category 1 does nto exist in the first half, thus it will be encoded as
+    # category 1 does not exist in the first half, thus it will be encoded as
     # the mean of the first half
     assert_allclose(X_trans[-1], np.mean(y[:5]))


 @pytest.mark.parametrize("smooth", [0.0, 1e3, "auto"])
 def test_invariance_of_encoding_under_label_permutation(smooth, global_random_seed):
     # Check that the encoding does not depend on the integer of the value of
-    # the integer labels. This is quite of a trivial property but it is helpful
+    # the integer labels. This is quite a trivial property but it is helpful
     # to understand the following test.
     rng = np.random.RandomState(global_random_seed)
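
The invariance property described in the comment above can be illustrated outside of the test as well. A small sketch, not the actual test code, that relabels the integer categories and compares the fitted encodings:

import numpy as np
from sklearn.preprocessing import TargetEncoder

rng = np.random.RandomState(0)
X = rng.randint(0, 3, size=(60, 1))
y = rng.uniform(size=60)

# Relabel the three categories with different integers; the fitted encodings
# should be the same values, just indexed by the permuted category labels.
permutation = np.array([2, 0, 1])
X_permuted = permutation[X[:, 0]].reshape(-1, 1)

enc = TargetEncoder(smooth=5.0, random_state=0).fit(X, y)
enc_permuted = TargetEncoder(smooth=5.0, random_state=0).fit(X_permuted, y)

for original_label, new_label in enumerate(permutation):
    # Integer categories are stored sorted, so category `c` sits at index `c`.
    np.testing.assert_allclose(
        enc.encodings_[0][original_label], enc_permuted.encodings_[0][new_label]
    )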
@@ -440,7 +456,7 @@ def test_invariance_of_encoding_under_label_permutation(smooth, global_random_se
 @pytest.mark.parametrize("smooth", [0.0, "auto"])
 def test_target_encoding_for_linear_regression(smooth, global_random_seed):
     # Check some expected statistical properties when fitting a linear
-    # regression model on target encoded features depending on there relation
+    # regression model on target encoded features depending on their relation
     # with that target.

     # In this test, we use the Ridge class with the "lsqr" solver and a little
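
The body of this test is truncated here, so the following is only a hedged sketch of the kind of setup the comment describes: a Ridge model with the "lsqr" solver fitted on a cross-fitted target encoding of an informative categorical feature. The data generation and parameter values are illustrative and not taken from the actual test:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import TargetEncoder

rng = np.random.RandomState(0)
n_samples = 1000
X_cat = rng.randint(0, 10, size=(n_samples, 1))
y = X_cat[:, 0].astype(np.float64) + rng.normal(scale=0.1, size=n_samples)

# Cross-fitted encodings from `fit_transform` limit target leakage, so a
# downstream linear model regressing y on the encoded feature is expected to
# recover a coefficient close to 1.
X_encoded = TargetEncoder(smooth="auto", random_state=0).fit_transform(X_cat, y)
model = Ridge(alpha=1e-6, solver="lsqr").fit(X_encoded, y)
print(model.coef_, model.intercept_)

Because the cross-fitted encoding of an informative category approximates the conditional mean of the target, the fitted slope should land near 1.0 and the intercept near 0.0 in this synthetic setup.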
