@@ -61,54 +61,70 @@ def _encode_target(X_ordinal, y_int, n_categories, smooth):
@pytest.mark.parametrize("smooth", [5.0, "auto"])
@pytest.mark.parametrize("target_type", ["binary", "continuous"])
def test_encoding(categories, unknown_value, global_random_seed, smooth, target_type):
-    """Check encoding for binary and continuous targets."""
+    """Check encoding for binary and continuous targets.
+
+    Compare the values returned by `TargetEncoder.fit_transform` against the
+    expected encodings for cv splits from a naive reference Python
+    implementation in _encode_target.
+    """

-    X_train_array = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=np.int64).T
-    X_test_array = np.array([[0, 1, 2]], dtype=np.int64).T
    n_categories = 3
-    n_samples = X_train_array.shape[0]
+    X_train_int_array = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=np.int64).T
+    X_test_int_array = np.array([[0, 1, 2]], dtype=np.int64).T
+    n_samples = X_train_int_array.shape[0]

    if categories == "auto":
-        X_train = X_train_array
+        X_train = X_train_int_array
+        X_test = X_test_int_array
    else:
-        X_train = categories[0][X_train_array]
+        X_train = categories[0][X_train_int_array]
+        X_test = categories[0][X_test_int_array]

-    if categories == "auto":
-        X_test = X_test_array
-    else:
-        X_test = categories[0][X_test_array]
-
    X_test = np.concatenate((X_test, [[unknown_value]]))

-    rng = np.random.RandomState(global_random_seed)
-
+    data_rng = np.random.RandomState(global_random_seed)
+    n_splits = 3
    if target_type == "binary":
-        y_int = rng.randint(low=0, high=2, size=n_samples)
+        y_int = data_rng.randint(low=0, high=2, size=n_samples)
        target_names = np.array(["cat", "dog"], dtype=object)
        y_train = target_names[y_int]
-        cv = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)
+
    else:  # target_type == continuous
-        y_int = rng.uniform(low=-10, high=20, size=n_samples)
+        y_int = data_rng.uniform(low=-10, high=20, size=n_samples)
        y_train = y_int
-        cv = KFold(n_splits=3, random_state=0, shuffle=True)

-    shuffled_idx = rng.permutation(n_samples)
-    X_train_array = X_train_array[shuffled_idx]
+    shuffled_idx = data_rng.permutation(n_samples)
+    X_train_int_array = X_train_int_array[shuffled_idx]
    X_train = X_train[shuffled_idx]
    y_train = y_train[shuffled_idx]
    y_int = y_int[shuffled_idx]

-    # Get encodings for cv splits to validate `fit_transform`
-    expected_X_fit_transform = np.empty_like(X_train_array, dtype=np.float64)
+    # Define our CV splitting strategy
+    if target_type == "binary":
+        cv = StratifiedKFold(
+            n_splits=n_splits, random_state=global_random_seed, shuffle=True
+        )
+    else:
+        cv = KFold(n_splits=n_splits, random_state=global_random_seed, shuffle=True)
+
+    # Compute the expected values using our reference Python implementation of
+    # target encoding:
+    expected_X_fit_transform = np.empty_like(X_train_int_array, dtype=np.float64)

-    for train_idx, test_idx in cv.split(X_train_array, y_train):
-        X_, y_ = X_train_array[train_idx, 0], y_int[train_idx]
+    for train_idx, test_idx in cv.split(X_train_int_array, y_train):
+        X_, y_ = X_train_int_array[train_idx, 0], y_int[train_idx]
        cur_encodings = _encode_target(X_, y_, n_categories, smooth)
        expected_X_fit_transform[test_idx, 0] = cur_encodings[
-            X_train_array[test_idx, 0]
+            X_train_int_array[test_idx, 0]
        ]

+    # Check that we can obtain the same encodings by calling `fit_transform` on
+    # the estimator with the same CV parameters:
    target_encoder = TargetEncoder(
-        smooth=smooth, categories=categories, cv=3, random_state=0
+        smooth=smooth,
+        categories=categories,
+        cv=n_splits,
+        random_state=global_random_seed,
    )

    X_fit_transform = target_encoder.fit_transform(X_train, y_train)
@@ -120,12 +136,12 @@ def test_encoding(categories, unknown_value, global_random_seed, smooth, target_
    # compute encodings for all data to validate `transform`
    y_mean = np.mean(y_int)
    expected_encodings = _encode_target(
-        X_train_array[:, 0], y_int, n_categories, smooth
+        X_train_int_array[:, 0], y_int, n_categories, smooth
    )
    assert_allclose(target_encoder.encodings_[0], expected_encodings)
    assert target_encoder.target_mean_ == pytest.approx(y_mean)

-    # Transform on test data, the last value is unknown is it is encoded as the target
+    # Transform on test data, the last value is unknown so it is encoded as the target
    # mean
    expected_X_test_transform = np.concatenate(
        (expected_encodings, np.array([y_mean]))
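Note: the naive reference helper `_encode_target` named in the hunk header and in the new docstring is not itself part of this diff. As a rough sketch of what such a reference could look like, assuming each category mean is shrunk toward the global target mean, with an empirical-Bayes style shrinkage for `smooth="auto"` as described in the TargetEncoder documentation:

import numpy as np


def _encode_target(X_ordinal, y_int, n_categories, smooth):
    # Naive reference: shrink each category mean toward the global mean.
    # For a float `smooth`, category c is encoded as
    #     (sum(y[X == c]) + smooth * mean(y)) / (count(X == c) + smooth).
    # The "auto" branch is an assumption of this sketch (empirical Bayes
    # shrinkage; assumes y is not constant).
    y_mean = np.mean(y_int)
    y_variance = np.var(y_int)
    encodings = np.full(n_categories, y_mean, dtype=np.float64)
    for c in range(n_categories):
        y_subset = y_int[X_ordinal == c]
        n_c = y_subset.shape[0]
        if n_c == 0:
            continue  # unseen category: fall back to the global mean
        if smooth == "auto":
            m = np.var(y_subset) / y_variance
            shrinkage = n_c / (n_c + m)
            encodings[c] = shrinkage * np.mean(y_subset) + (1 - shrinkage) * y_mean
        else:
            encodings[c] = (np.sum(y_subset) + smooth * y_mean) / (n_c + smooth)
    return encodings

Falling back to the global mean for unseen categories mirrors what the test below expects from `transform` on unknown values.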
@@ -394,15 +410,15 @@ def test_smooth_zero():
    # it will be encoded as the mean of the second half
    assert_allclose(X_trans[0], np.mean(y[5:]))

-    # category 1 does nto exist in the first half, thus it will be encoded as
+    # category 1 does not exist in the first half, thus it will be encoded as
    # the mean of the first half
    assert_allclose(X_trans[-1], np.mean(y[:5]))


@pytest.mark.parametrize("smooth", [0.0, 1e3, "auto"])
def test_invariance_of_encoding_under_label_permutation(smooth, global_random_seed):
    # Check that the encoding does not depend on the integer of the value of
-    # the integer labels. This is quite of a trivial property but it is helpful
+    # the integer labels. This is quite a trivial property but it is helpful
    # to understand the following test.
    rng = np.random.RandomState(global_random_seed)
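As an illustrative standalone sketch of the invariance property (not the test body, which this diff does not show): relabeling the categories through a permutation should leave the per-sample encodings unchanged, because each sample keeps the same (category, target) pairing and the CV splits do not depend on the category values.

import numpy as np
from sklearn.preprocessing import TargetEncoder

rng = np.random.RandomState(0)
X = rng.randint(0, 3, size=(1000, 1))
y = rng.normal(size=1000) + X[:, 0]  # the target depends on the category

# Relabel the categories with a permutation: 0 -> 2, 1 -> 0, 2 -> 1.
permutation = np.array([2, 0, 1])
X_permuted = permutation[X[:, 0]].reshape(-1, 1)

# Same random_state, hence identical internal CV splits for both encoders.
encoded = TargetEncoder(smooth="auto", random_state=0).fit_transform(X, y)
encoded_permuted = TargetEncoder(smooth="auto", random_state=0).fit_transform(
    X_permuted, y
)
np.testing.assert_allclose(encoded, encoded_permuted)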
@@ -440,7 +456,7 @@ def test_invariance_of_encoding_under_label_permutation(smooth, global_random_se
@pytest.mark.parametrize("smooth", [0.0, "auto"])
def test_target_encoding_for_linear_regression(smooth, global_random_seed):
    # Check some expected statistical properties when fitting a linear
-    # regression model on target encoded features depending on there relation
+    # regression model on target encoded features depending on their relation
    # with that target.

    # In this test, we use the Ridge class with the "lsqr" solver and a little
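The body of this test is also outside the diff. A hedged sketch of the kind of property it describes, assuming one informative categorical feature and one independent noise feature: after target encoding with internal cross fitting, a linear model should put a coefficient close to 1 on the informative column, while the noise column contributes next to nothing.

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import TargetEncoder

rng = np.random.RandomState(0)
n_samples = 50_000
y = rng.normal(size=n_samples)

# Informative feature: the category is the quartile bin of the target.
x_informative = np.digitize(y, bins=np.percentile(y, [25, 50, 75]))
# Noise feature: categories drawn independently of the target.
x_noise = rng.randint(0, 4, size=n_samples)

X = np.column_stack([x_informative, x_noise])
X_encoded = TargetEncoder(smooth=0.0, random_state=0).fit_transform(X, y)

coef = Ridge(alpha=1e-6, solver="lsqr").fit(X_encoded, y).coef_
# The informative encoded column predicts y with slope ~1.
assert abs(coef[0] - 1.0) < 0.1
# Thanks to cross fitting, the encoded noise column has almost no variance,
# so its contribution to the predictions stays negligible.
assert abs(coef[1]) * X_encoded[:, 1].std() < 0.05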