10
10
11
11
from ..base import BaseEstimator , TransformerMixin
12
12
from ..utils import check_array
13
- from ..utils import safe_mask
14
13
from ..utils .fixes import astype
15
14
from ..utils .sparsefuncs import _get_median
16
15
from ..utils .validation import check_is_fitted
@@ -103,21 +102,11 @@ class Imputer(BaseEstimator, TransformerMixin):
103
102
- If `axis=0` and X is encoded as a CSR matrix;
104
103
- If `axis=1` and X is encoded as a CSC matrix.
105
104
106
- add_indicator_features : boolean, optional (default=False)
107
- If True, the transformed ``X`` will have binary indicator features
108
- appended. Each indicator corresponds to an input feature with at
109
- least one missing value and marks which elements have been imputed.
110
-
111
105
Attributes
112
106
----------
113
107
statistics_ : array of shape (n_features,)
114
108
The imputation fill value for each feature if axis == 0.
115
109
116
- imputed_features_ : array of shape (n_features_with_missing, )
117
- The input features which have been imputed during transform.
118
- The size of this attribute will be the number of features with
119
- at least one missing value (and fewer than all in the axis=0 case).
120
-
121
110
Notes
122
111
-----
123
112
- When ``axis=0``, columns which only contained missing values at `fit`
@@ -127,13 +116,12 @@ class Imputer(BaseEstimator, TransformerMixin):
127
116
contain missing values).
128
117
"""
129
118
def __init__ (self , missing_values = "NaN" , strategy = "mean" ,
130
- axis = 0 , verbose = 0 , copy = True , add_indicator_features = False ):
119
+ axis = 0 , verbose = 0 , copy = True ):
131
120
self .missing_values = missing_values
132
121
self .strategy = strategy
133
122
self .axis = axis
134
123
self .verbose = verbose
135
124
self .copy = copy
136
- self .add_indicator_features = add_indicator_features
137
125
138
126
def fit (self , X , y = None ):
139
127
"""Fit the imputer on X.
@@ -311,74 +299,13 @@ def _dense_fit(self, X, strategy, missing_values, axis):
311
299
312
300
return most_frequent
313
301
314
- def _sparse_transform (self , X , valid_stats , valid_idx ):
315
- """transformer on sparse data."""
316
- mask = _get_mask (X .data , self .missing_values )
317
- indexes = np .repeat (np .arange (len (X .indptr ) - 1 , dtype = np .int ),
318
- np .diff (X .indptr ))[mask ]
319
-
320
- X .data [mask ] = astype (valid_stats [indexes ], X .dtype ,
321
- copy = False )
322
-
323
- mask_matrix = X .__class__ ((mask , X .indices .copy (),
324
- X .indptr .copy ()), shape = X .shape ,
325
- dtype = X .dtype )
326
- mask_matrix .eliminate_zeros () # removes explicit False entries
327
- features_with_missing_values = mask_matrix .sum (axis = 0 ).A .nonzero ()[1 ]
328
- features_mask = safe_mask (mask_matrix , features_with_missing_values )
329
- imputed_mask = mask_matrix [:, features_mask ]
330
- if self .axis == 0 :
331
- self .imputed_features_ = valid_idx [features_with_missing_values ]
332
- else :
333
- self .imputed_features_ = features_with_missing_values
334
-
335
- if self .add_indicator_features :
336
- X = sparse .hstack ((X , imputed_mask ))
337
-
338
- return X
339
-
340
- def _dense_transform (self , X , valid_stats , valid_idx ):
341
- """transformer on dense data."""
342
- mask = _get_mask (X , self .missing_values )
343
- n_missing = np .sum (mask , axis = self .axis )
344
- values = np .repeat (valid_stats , n_missing )
345
-
346
- if self .axis == 0 :
347
- coordinates = np .where (mask .transpose ())[::- 1 ]
348
- else :
349
- coordinates = mask
350
-
351
- X [coordinates ] = values
352
-
353
- features_with_missing_values = np .where (np .any
354
- (mask , axis = 0 ))[0 ]
355
- imputed_mask = mask [:, features_with_missing_values ]
356
- if self .axis == 0 :
357
- self .imputed_features_ = valid_idx [features_with_missing_values ]
358
- else :
359
- self .imputed_features_ = features_with_missing_values
360
-
361
- if self .add_indicator_features :
362
- X = np .hstack ((X , imputed_mask ))
363
-
364
- return X
365
-
366
302
def transform (self , X ):
367
303
"""Impute all missing values in X.
368
304
369
305
Parameters
370
306
----------
371
- X : {array-like, sparse matrix}, shape = ( n_samples, n_features)
307
+ X : {array-like, sparse matrix}, shape = [ n_samples, n_features]
372
308
The input data to complete.
373
-
374
- Returns
375
- -------
376
- X_new : {array-like, sparse matrix},
377
- Transformed array.
378
- shape (n_samples, n_features_new) when
379
- ``add_indicator_features`` is False,
380
- shape (n_samples, n_features_new + len(imputed_features_))
381
- when ``add_indicator_features`` is True.
382
309
"""
383
310
if self .axis == 0 :
384
311
check_is_fitted (self , 'statistics_' )
@@ -410,27 +337,39 @@ def transform(self, X):
410
337
invalid_mask = np .isnan (statistics )
411
338
valid_mask = np .logical_not (invalid_mask )
412
339
valid_statistics = statistics [valid_mask ]
413
- valid_idx = np .where (valid_mask )[0 ]
340
+ valid_statistics_indexes = np .where (valid_mask )[0 ]
414
341
missing = np .arange (X .shape [not self .axis ])[invalid_mask ]
415
342
416
343
if self .axis == 0 and invalid_mask .any ():
417
344
if self .verbose :
418
345
warnings .warn ("Deleting features without "
419
346
"observed values: %s" % missing )
420
- X = X [:, valid_idx ]
347
+ X = X [:, valid_statistics_indexes ]
421
348
elif self .axis == 1 and invalid_mask .any ():
422
349
raise ValueError ("Some rows only contain "
423
350
"missing values: %s" % missing )
424
351
425
352
# Do actual imputation
426
353
if sparse .issparse (X ) and self .missing_values != 0 :
427
- # sparse matrix and missing values is not zero
428
- X = self ._sparse_transform (X , valid_statistics , valid_idx )
354
+ mask = _get_mask (X .data , self .missing_values )
355
+ indexes = np .repeat (np .arange (len (X .indptr ) - 1 , dtype = np .int ),
356
+ np .diff (X .indptr ))[mask ]
357
+
358
+ X .data [mask ] = astype (valid_statistics [indexes ], X .dtype ,
359
+ copy = False )
429
360
else :
430
- # sparse with zero as missing value and dense matrix
431
361
if sparse .issparse (X ):
432
362
X = X .toarray ()
433
363
434
- X = self ._dense_transform (X , valid_statistics , valid_idx )
364
+ mask = _get_mask (X , self .missing_values )
365
+ n_missing = np .sum (mask , axis = self .axis )
366
+ values = np .repeat (valid_statistics , n_missing )
367
+
368
+ if self .axis == 0 :
369
+ coordinates = np .where (mask .transpose ())[::- 1 ]
370
+ else :
371
+ coordinates = mask
372
+
373
+ X [coordinates ] = values
435
374
436
375
return X
0 commit comments