8
8
import Foundation
9
9
import Hub
10
10
11
/// Per-invocation flags that alter pre-tokenizer behavior.
public enum PreTokenizerOption: String {
    /// The text being processed is the first section of the input. Consumed by
    /// pre-tokenizers (e.g. Metaspace with `prependScheme == .first`) that only
    /// prepend the replacement string on the first section.
    case firstSection
}

/// The set of options threaded through the pre-tokenization pipeline.
public typealias PreTokenizerOptions = Set<PreTokenizerOption>
16
+
11
17
/// Splits raw text into pre-tokens before model tokenization.
///
/// Conformers must implement `preTokenize(text:options:)`; the collection and
/// `callAsFunction` variants have default implementations in the extension below.
public protocol PreTokenizer {
    /// Splits a single input string into pre-tokens.
    func preTokenize(text: String, options: PreTokenizerOptions) -> [String]

    /// Splits each input string and concatenates the resulting pre-tokens.
    func preTokenize(texts: [String], options: PreTokenizerOptions) -> [String]

    /// Function-call sugar for `preTokenize(texts:options:)`.
    func callAsFunction(texts: [String], options: PreTokenizerOptions) -> [String]

    /// Function-call sugar for `preTokenize(text:options:)`.
    func callAsFunction(text: String, options: PreTokenizerOptions) -> [String]

    /// Creates a pre-tokenizer from its tokenizer-config description.
    init(config: Config)
}
19
25
20
26
extension PreTokenizer {
    /// Default implementation: pre-tokenizes each string and flattens the results,
    /// preserving input order.
    func preTokenize(texts: [String], options: PreTokenizerOptions = [.firstSection]) -> [String] {
        var pieces: [String] = []
        for text in texts {
            pieces.append(contentsOf: preTokenize(text: text, options: options))
        }
        return pieces
    }

    /// Function-call sugar forwarding to `preTokenize(texts:options:)`.
    func callAsFunction(texts: [String], options: PreTokenizerOptions = [.firstSection]) -> [String] {
        preTokenize(texts: texts, options: options)
    }

    /// Function-call sugar forwarding to `preTokenize(text:options:)`.
    func callAsFunction(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
        preTokenize(text: text, options: options)
    }
}
34
39
35
40
enum PreTokenizerType : String {
@@ -71,9 +76,9 @@ class PreTokenizerSequence: PreTokenizer {
71
76
preTokenizers = configs. compactMap { PreTokenizerFactory . fromConfig ( config: $0) }
72
77
}
73
78
74
- func preTokenize( text: String ) -> [ String ] {
79
+ func preTokenize( text: String , options : PreTokenizerOptions = [ . firstSection ] ) -> [ String ] {
75
80
preTokenizers. reduce ( [ text] ) { current, preTokenizer in
76
- preTokenizer ( texts: current)
81
+ preTokenizer ( texts: current, options : options )
77
82
}
78
83
}
79
84
}
@@ -85,7 +90,7 @@ class WhitespacePreTokenizer: PreTokenizer {
85
90
re = #"\S+"#
86
91
}
87
92
88
- func preTokenize( text: String ) -> [ String ] {
93
+ func preTokenize( text: String , options : PreTokenizerOptions = [ . firstSection ] ) -> [ String ] {
89
94
return text. ranges ( of: re) . map { String ( text [ $0] ) }
90
95
}
91
96
}
@@ -125,7 +130,7 @@ class MetaspacePreTokenizer: PreTokenizer {
125
130
126
131
// https://github.com/huggingface/tokenizers/blob/accd0650b802f2180df40ef1def3bce32156688e/tokenizers/src/pre_tokenizers/metaspace.rs#L114
127
132
// https://github.com/xenova/transformers.js/blob/b07336d8f7ff57453cc164cc68aead2a79cbd57e/src/tokenizers.js#L2153
128
- func preTokenize( text: String ) -> [ String ] {
133
+ func preTokenize( text: String , options : PreTokenizerOptions = [ . firstSection ] ) -> [ String ] {
129
134
let normalized = text. replacingOccurrences ( of: " " , with: stringReplacement)
130
135
131
136
// We add a prefix space if:
@@ -141,7 +146,7 @@ class MetaspacePreTokenizer: PreTokenizer {
141
146
if prependScheme == . always {
142
147
prepend = stringReplacement
143
148
}
144
- if prependScheme == . first /* && first_section */ {
149
+ if prependScheme == . first && options . contains ( . firstSection ) {
145
150
prepend = stringReplacement
146
151
}
147
152
}
@@ -164,7 +169,7 @@ class ByteLevelPreTokenizer: PreTokenizer {
164
169
useRegex = config. useRegex? . boolValue ?? true
165
170
}
166
171
167
- func preTokenize( text: String ) -> [ String ] {
172
+ func preTokenize( text: String , options : PreTokenizerOptions = [ . firstSection ] ) -> [ String ] {
168
173
// Split on whitespace and punctuation
169
174
let tokens = useRegex ? text. ranges ( of: RE) . map ( { String ( text [ $0] ) } ) : [ text]
170
175
return tokens. map { token in
@@ -186,7 +191,7 @@ class PunctuationPreTokenizer: PreTokenizer {
186
191
re = " [^ \( PUNCTUATION_REGEX) ]+|[ \( PUNCTUATION_REGEX) ]+ "
187
192
}
188
193
189
- func preTokenize( text: String ) -> [ String ] {
194
+ func preTokenize( text: String , options : PreTokenizerOptions = [ . firstSection ] ) -> [ String ] {
190
195
// Ref: https://github.com/xenova/transformers.js/blob/27920d84831e323275b38f0b5186644b7936e1a2/src/tokenizers.js#L1138
191
196
return text. ranges ( of: re) . map { String ( text [ $0] ) }
192
197
}
@@ -200,7 +205,7 @@ class DigitsPreTokenizer: PreTokenizer {
200
205
re = " [^ \\ d]+| \\ d \( individualDigits ? " " : " + " ) "
201
206
}
202
207
203
- func preTokenize( text: String ) -> [ String ] {
208
+ func preTokenize( text: String , options : PreTokenizerOptions = [ . firstSection ] ) -> [ String ] {
204
209
return text. ranges ( of: re) . map { String ( text [ $0] ) }
205
210
}
206
211
}
@@ -214,7 +219,7 @@ class SplitPreTokenizer: PreTokenizer {
214
219
invert = config. invert? . boolValue ?? false
215
220
}
216
221
217
- func preTokenize( text: String ) -> [ String ] {
222
+ func preTokenize( text: String , options : PreTokenizerOptions = [ . firstSection ] ) -> [ String ] {
218
223
guard let pattern = pattern else { return [ text] }
219
224
return pattern. split ( text, invert: invert)
220
225
}
@@ -248,7 +253,7 @@ extension StringSplitPattern {
248
253
}
249
254
}
250
255
251
- extension String {
256
+ public extension String {
252
257
func ranges( of string: String , options: CompareOptions = . regularExpression) -> [ Range < Index > ] {
253
258
var result : [ Range < Index > ] = [ ]
254
259
var start = startIndex
@@ -277,6 +282,42 @@ extension String {
277
282
return result
278
283
}
279
284
285
+ /// This version supports capture groups, wheres the one above doesn't
286
+ func split( by captureRegex: NSRegularExpression ) -> [ String ] {
287
+ // Find the matching capture groups
288
+ let selfRange = NSRange ( startIndex..< endIndex, in: self )
289
+ let matches = captureRegex. matches ( in: self , options: [ ] , range: selfRange)
290
+
291
+ if matches. first == nil { return [ self ] }
292
+
293
+ var result : [ String ] = [ ]
294
+ var start = startIndex
295
+ for match in matches {
296
+ // Append prefix before matched separator
297
+ let prefixEnd = index ( startIndex, offsetBy: match. range. lowerBound)
298
+ if start < prefixEnd {
299
+ result. append ( String ( self [ start..< prefixEnd] ) )
300
+ }
301
+ start = index ( startIndex, offsetBy: match. range. upperBound)
302
+
303
+ // Append separator, supporting capture groups
304
+ for r in ( 0 ..< match. numberOfRanges) . reversed ( ) {
305
+ let matchRange = match. range ( at: r)
306
+ if let sepRange = Range ( matchRange, in: self ) {
307
+ result. append ( String ( self [ sepRange] ) )
308
+ break
309
+ }
310
+ }
311
+ }
312
+
313
+ // Append remaining suffix
314
+ let beginningOfEnd = index ( startIndex, offsetBy: matches. last!. range. upperBound)
315
+ if beginningOfEnd < endIndex {
316
+ result. append ( String ( self [ beginningOfEnd... ] ) )
317
+ }
318
+
319
+ return result
320
+ }
280
321
}
281
322
282
323
public enum SplitDelimiterBehavior {
0 commit comments