@@ -8,26 +8,32 @@
 import Foundation
 import Hub
 
+public enum PreTokenizerOption: String {
+    case firstSection
+}
+
+public typealias PreTokenizerOptions = Set<PreTokenizerOption>
+
 public protocol PreTokenizer {
-    func preTokenize(text: String, firstSection: Bool) -> [String]
-    func preTokenize(texts: [String], firstSection: Bool) -> [String]
-    func callAsFunction(texts: [String], firstSection: Bool) -> [String]
-    func callAsFunction(text: String, firstSection: Bool) -> [String]
+    func preTokenize(text: String, options: PreTokenizerOptions) -> [String]
+    func preTokenize(texts: [String], options: PreTokenizerOptions) -> [String]
+    func callAsFunction(texts: [String], options: PreTokenizerOptions) -> [String]
+    func callAsFunction(text: String, options: PreTokenizerOptions) -> [String]
 
     init(config: Config)
 }
 
 extension PreTokenizer {
-    func preTokenize(texts: [String], firstSection: Bool = true) -> [String] {
-        texts.flatMap { preTokenize(text: $0, firstSection: firstSection) }
+    func preTokenize(texts: [String], options: PreTokenizerOptions = [.firstSection]) -> [String] {
+        texts.flatMap { preTokenize(text: $0, options: options) }
    }
 
-    func callAsFunction(texts: [String], firstSection: Bool = true) -> [String] {
-        return preTokenize(texts: texts, firstSection: firstSection)
+    func callAsFunction(texts: [String], options: PreTokenizerOptions = [.firstSection]) -> [String] {
+        return preTokenize(texts: texts, options: options)
    }
 
-    func callAsFunction(text: String, firstSection: Bool = true) -> [String] {
-        return preTokenize(text: text, firstSection: firstSection)
+    func callAsFunction(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
+        return preTokenize(text: text, options: options)
    }
 }
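The `firstSection: Bool` flag becomes an extensible `PreTokenizerOptions` option set, so new per-call flags can be introduced later without changing every conformer's signature again. A minimal usage sketch, under assumptions not shown in this diff (that `Config` accepts an empty dictionary and that `WhitespacePreTokenizer` is visible to the caller):

```swift
import Hub

// Hypothetical setup: an empty config is assumed to be valid here.
let pre = WhitespacePreTokenizer(config: Config([:]))

// Existing call sites behave as before: the text is treated as the first section.
let first = pre(text: "Hello world")               // options defaults to [.firstSection]

// Callers tokenizing a later segment of a split input opt out explicitly.
let later = pre(text: "Hello world", options: [])
```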
@@ -70,9 +76,9 @@ class PreTokenizerSequence: PreTokenizer {
         preTokenizers = configs.compactMap { PreTokenizerFactory.fromConfig(config: $0) }
     }
 
-    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
+    func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
         preTokenizers.reduce([text]) { current, preTokenizer in
-            preTokenizer(texts: current, firstSection: firstSection)
+            preTokenizer(texts: current, options: options)
         }
     }
 }
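`PreTokenizerSequence` now threads the same options set through every stage of the fold. The reduce above is equivalent to this loop (a sketch written in terms of the method's own `text` and `options` parameters):

```swift
// Equivalent expansion of the reduce: each stage sees the previous
// stage's output, and the options set is passed through unchanged.
var current = [text]
for preTokenizer in preTokenizers {
    current = preTokenizer(texts: current, options: options)
}
return current
```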
@@ -84,7 +90,7 @@ class WhitespacePreTokenizer: PreTokenizer {
         re = #"\S+"#
     }
 
-    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
+    func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
         return text.ranges(of: re).map { String(text[$0]) }
     }
 }
@@ -124,7 +130,7 @@ class MetaspacePreTokenizer: PreTokenizer {
 
     // https://github.com/huggingface/tokenizers/blob/accd0650b802f2180df40ef1def3bce32156688e/tokenizers/src/pre_tokenizers/metaspace.rs#L114
     // https://github.com/xenova/transformers.js/blob/b07336d8f7ff57453cc164cc68aead2a79cbd57e/src/tokenizers.js#L2153
-    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
+    func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
         let normalized = text.replacingOccurrences(of: " ", with: stringReplacement)
 
         // We add a prefix space if:
@@ -140,7 +146,7 @@ class MetaspacePreTokenizer: PreTokenizer {
             if prependScheme == .always {
                 prepend = stringReplacement
             }
-            if prependScheme == .first && firstSection {
+            if prependScheme == .first && options.contains(.firstSection) {
                 prepend = stringReplacement
             }
         }
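With `prependScheme == .first`, the replacement is now prepended only when the caller's options contain `.firstSection`, so a tokenizer processing the later sections of a split input can pass `options: []` to suppress the prefix. A hedged illustration (the `metaspace` instance, its config, and the `▁` replacement are assumed, not shown in this diff):

```swift
// Assuming prependScheme == .first and stringReplacement == "▁":
metaspace(text: "Hello world")               // ["▁Hello▁world"] (prefix added: first section)
metaspace(text: "Hello world", options: [])  // ["Hello▁world"]  (no prefix: later section)
```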
@@ -163,7 +169,7 @@ class ByteLevelPreTokenizer: PreTokenizer {
         useRegex = config.useRegex?.boolValue ?? true
     }
 
-    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
+    func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
         // Split on whitespace and punctuation
         let tokens = useRegex ? text.ranges(of: RE).map({ String(text[$0]) }) : [text]
         return tokens.map { token in
@@ -185,7 +191,7 @@ class PunctuationPreTokenizer: PreTokenizer {
         re = "[^\(PUNCTUATION_REGEX)]+|[\(PUNCTUATION_REGEX)]+"
     }
 
-    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
+    func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
         // Ref: https://github.com/xenova/transformers.js/blob/27920d84831e323275b38f0b5186644b7936e1a2/src/tokenizers.js#L1138
         return text.ranges(of: re).map { String(text[$0]) }
     }
@@ -199,7 +205,7 @@ class DigitsPreTokenizer: PreTokenizer {
         re = "[^\\d]+|\\d\(individualDigits ? "" : "+")"
     }
 
-    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
+    func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
         return text.ranges(of: re).map { String(text[$0]) }
     }
 }
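The interpolated pattern matches either whole digit runs or single digits depending on `individualDigits`. A sketch of the two modes (the `digits` instance is hypothetical):

```swift
// individualDigits == false: pattern "[^\\d]+|\\d+" yields ["abc", "123", "def"]
// individualDigits == true:  pattern "[^\\d]+|\\d"  yields ["abc", "1", "2", "3", "def"]
let pieces = digits(text: "abc123def")
```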
@@ -213,7 +219,7 @@ class SplitPreTokenizer: PreTokenizer {
         invert = config.invert?.boolValue ?? false
     }
 
-    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
+    func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
         guard let pattern = pattern else { return [text] }
         return pattern.split(text, invert: invert)
     }