@@ -9,27 +9,26 @@ import Foundation
 import Hub
 
 public protocol PreTokenizer {
-    func preTokenize(text: String) -> [String]
-    func preTokenize(texts: [String]) -> [String]
-    func callAsFunction(texts: [String]) -> [String]
-    func callAsFunction(text: String) -> [String]
+    func preTokenize(text: String, firstSection: Bool) -> [String]
+    func preTokenize(texts: [String], firstSection: Bool) -> [String]
+    func callAsFunction(texts: [String], firstSection: Bool) -> [String]
+    func callAsFunction(text: String, firstSection: Bool) -> [String]
 
     init(config: Config)
 }
 
 extension PreTokenizer {
-    func preTokenize(texts: [String]) -> [String] {
-        texts.flatMap { preTokenize(text: $0) }
+    func preTokenize(texts: [String], firstSection: Bool = true) -> [String] {
+        texts.flatMap { preTokenize(text: $0, firstSection: firstSection) }
     }
 
-    func callAsFunction(texts: [String]) -> [String] {
-        return preTokenize(texts: texts)
+    func callAsFunction(texts: [String], firstSection: Bool = true) -> [String] {
+        return preTokenize(texts: texts, firstSection: firstSection)
     }
 
-    func callAsFunction(text: String) -> [String] {
-        return preTokenize(text: text)
+    func callAsFunction(text: String, firstSection: Bool = true) -> [String] {
+        return preTokenize(text: text, firstSection: firstSection)
     }
-
 }
 
 enum PreTokenizerType: String {
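The protocol requirements now take `firstSection` explicitly, while the extension supplies `= true` defaults so existing call sites keep compiling. A minimal sketch of that source-compatibility effect, using a hypothetical conformer that is not part of this PR:

```swift
// Hypothetical conformer, for illustration only.
struct FakePreTokenizer: PreTokenizer {
    init(config: Config) {}

    // A default argument on the witness satisfies the protocol requirement
    // and keeps the old one-argument spelling available.
    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
        text.split(separator: " ").map(String.init)
    }
}

// Both spellings compile; the first behaves exactly as before this change:
//   preTokenizer(text: "hello world")
//   preTokenizer(text: "hello world", firstSection: false)
```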
@@ -71,9 +70,9 @@ class PreTokenizerSequence: PreTokenizer {
         preTokenizers = configs.compactMap { PreTokenizerFactory.fromConfig(config: $0) }
     }
 
-    func preTokenize(text: String) -> [String] {
+    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
         preTokenizers.reduce([text]) { current, preTokenizer in
-            preTokenizer(texts: current)
+            preTokenizer(texts: current, firstSection: firstSection)
         }
     }
 }
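For a sequence, the `reduce` threads the same flag into every stage. Unrolled for a hypothetical two-stage sequence `[a, b]`, the loop above is equivalent to:

```swift
// Hypothetical unrolling; a and b are the sequence's pre-tokenizers.
let afterA = a(texts: [text], firstSection: firstSection)
let afterB = b(texts: afterA, firstSection: firstSection)
```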
@@ -85,7 +84,7 @@ class WhitespacePreTokenizer: PreTokenizer {
         re = #"\S+"#
     }
 
-    func preTokenize(text: String) -> [String] {
+    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
         return text.ranges(of: re).map { String(text[$0]) }
     }
 }
@@ -125,7 +124,7 @@ class MetaspacePreTokenizer: PreTokenizer {
 
     // https://github.com/huggingface/tokenizers/blob/accd0650b802f2180df40ef1def3bce32156688e/tokenizers/src/pre_tokenizers/metaspace.rs#L114
     // https://github.com/xenova/transformers.js/blob/b07336d8f7ff57453cc164cc68aead2a79cbd57e/src/tokenizers.js#L2153
-    func preTokenize(text: String) -> [String] {
+    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
         let normalized = text.replacingOccurrences(of: " ", with: stringReplacement)
 
         // We add a prefix space if:
@@ -141,7 +140,7 @@ class MetaspacePreTokenizer: PreTokenizer {
             if prependScheme == .always {
                 prepend = stringReplacement
             }
-            if prependScheme == .first /* && first_section */ {
+            if prependScheme == .first && firstSection {
                 prepend = stringReplacement
             }
         }
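This hunk is the behavioral point of the new parameter: with `prependScheme == .first`, the replacement character is prepended only when the current text is the first section of the input, mirroring the `first_section` flag in the Rust implementation linked above. A hedged sketch of the intended effect, assuming the conventional `"▁"` (U+2581) replacement and a hypothetical `metaspace` instance (Config construction omitted):

```swift
// Illustrative only; exact splitting depends on the rest of the class,
// which is outside this hunk.
metaspace(text: "how are you", firstSection: true)   // first piece gains a leading "▁"
metaspace(text: "how are you", firstSection: false)  // no "▁" is prepended
```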
@@ -164,7 +163,7 @@ class ByteLevelPreTokenizer: PreTokenizer {
         useRegex = config.useRegex?.boolValue ?? true
     }
 
-    func preTokenize(text: String) -> [String] {
+    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
         // Split on whitespace and punctuation
         let tokens = useRegex ? text.ranges(of: RE).map({ String(text[$0]) }) : [text]
         return tokens.map { token in
@@ -186,7 +185,7 @@ class PunctuationPreTokenizer: PreTokenizer {
         re = "[^\(PUNCTUATION_REGEX)]+|[\(PUNCTUATION_REGEX)]+"
     }
 
-    func preTokenize(text: String) -> [String] {
+    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
         // Ref: https://github.com/xenova/transformers.js/blob/27920d84831e323275b38f0b5186644b7936e1a2/src/tokenizers.js#L1138
         return text.ranges(of: re).map { String(text[$0]) }
     }
@@ -200,7 +199,7 @@ class DigitsPreTokenizer: PreTokenizer {
         re = "[^\\d]+|\\d\(individualDigits ? "" : "+")"
     }
 
-    func preTokenize(text: String) -> [String] {
+    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
         return text.ranges(of: re).map { String(text[$0]) }
     }
 }
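The interpolated pattern switches between splitting out each digit and keeping digit runs together. A self-contained sketch of the two regexes using the standard library's `Regex` (the project's own `ranges(of:)` helper over a `String` pattern is assumed to match this behavior):

```swift
let text = "call 911 now"

// individualDigits == true -> pattern "[^\\d]+|\\d": one piece per digit.
let perDigit = text.ranges(of: try! Regex("[^\\d]+|\\d")).map { String(text[$0]) }
// ["call ", "9", "1", "1", " now"]

// individualDigits == false -> pattern "[^\\d]+|\\d+": digit runs kept whole.
let runs = text.ranges(of: try! Regex("[^\\d]+|\\d+")).map { String(text[$0]) }
// ["call ", "911", " now"]
```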
@@ -214,7 +213,7 @@ class SplitPreTokenizer: PreTokenizer {
         invert = config.invert?.boolValue ?? false
     }
 
-    func preTokenize(text: String) -> [String] {
+    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
         guard let pattern = pattern else { return [text] }
         return pattern.split(text, invert: invert)
     }