@@ -159,7 +159,8 @@ def _char_wb_ngrams(self, text_document):
159
159
"""Whitespace sensitive char-n-gram tokenization.
160
160
161
161
Tokenize text_document into a sequence of character n-grams
162
- excluding any whitespace (operating only inside word boundaries)"""
162
+ operating only inside word boundaries. n-grams at the edges
163
+ of words are padded with space."""
163
164
# normalize white spaces
164
165
text_document = self._white_spaces.sub(" ", text_document)
165
166
@@ -354,7 +355,7 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
354
355
analyzer : string, {'word', 'char', 'char_wb'} or callable
355
356
Whether the feature should be made of word or character n-grams.
356
357
Option 'char_wb' creates character n-grams only from text inside
357
- word boundaries.
358
+ word boundaries; n-grams at the edges of words are padded with space.
358
359
359
360
If a callable is passed it is used to extract the sequence of features
360
361
out of the raw, unprocessed input.
@@ -553,7 +554,7 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
553
554
analyzer : string, {'word', 'char', 'char_wb'} or callable
554
555
Whether the feature should be made of word or character n-grams.
555
556
Option 'char_wb' creates character n-grams only from text inside
556
- word boundaries.
557
+ word boundaries; n-grams at the edges of words are padded with space.
557
558
558
559
If a callable is passed it is used to extract the sequence of features
559
560
out of the raw, unprocessed input.
0 commit comments