@@ -8,26 +8,32 @@
 import Foundation
 import Hub
 
+public enum PreTokenizerOption: String {
+    case firstSection
+}
+
+public typealias PreTokenizerOptions = Set<PreTokenizerOption>
+
 public protocol PreTokenizer {
-    func preTokenize(text: String, firstSection: Bool) -> [String]
-    func preTokenize(texts: [String], firstSection: Bool) -> [String]
-    func callAsFunction(texts: [String], firstSection: Bool) -> [String]
-    func callAsFunction(text: String, firstSection: Bool) -> [String]
+    func preTokenize(text: String, options: PreTokenizerOptions) -> [String]
+    func preTokenize(texts: [String], options: PreTokenizerOptions) -> [String]
+    func callAsFunction(texts: [String], options: PreTokenizerOptions) -> [String]
+    func callAsFunction(text: String, options: PreTokenizerOptions) -> [String]
 
     init(config: Config)
 }
 
 extension PreTokenizer {
-    func preTokenize(texts: [String], firstSection: Bool = true) -> [String] {
-        texts.flatMap { preTokenize(text: $0, firstSection: firstSection) }
+    func preTokenize(texts: [String], options: PreTokenizerOptions = [.firstSection]) -> [String] {
+        texts.flatMap { preTokenize(text: $0, options: options) }
    }
 
-    func callAsFunction(texts: [String], firstSection: Bool = true) -> [String] {
-        return preTokenize(texts: texts, firstSection: firstSection)
+    func callAsFunction(texts: [String], options: PreTokenizerOptions = [.firstSection]) -> [String] {
+        return preTokenize(texts: texts, options: options)
    }
 
-    func callAsFunction(text: String, firstSection: Bool = true) -> [String] {
-        return preTokenize(text: text, firstSection: firstSection)
+    func callAsFunction(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
+        return preTokenize(text: text, options: options)
    }
 }
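The `firstSection: Bool` flag becomes an extensible `PreTokenizerOptions` option set, so new per-call flags can be introduced later without changing every conformer's signature again. A minimal usage sketch, under assumptions not shown in this diff (that `Config` accepts an empty dictionary and that `WhitespacePreTokenizer` is visible to the caller):

```swift
import Hub

// Hypothetical setup: an empty config is assumed to be valid here.
let pre = WhitespacePreTokenizer(config: Config([:]))

// Existing call sites behave as before: the text is treated as the first section.
let first = pre(text: "Hello world")               // options defaults to [.firstSection]

// Callers tokenizing a later segment of a split input opt out explicitly.
let later = pre(text: "Hello world", options: [])
```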
@@ -70,9 +76,9 @@ class PreTokenizerSequence: PreTokenizer {
         preTokenizers = configs.compactMap { PreTokenizerFactory.fromConfig(config: $0) }
     }
 
-    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
+    func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
         preTokenizers.reduce([text]) { current, preTokenizer in
-            preTokenizer(texts: current, firstSection: firstSection)
+            preTokenizer(texts: current, options: options)
         }
     }
 }
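`PreTokenizerSequence` now threads the same options set through every stage of the fold. The reduce above is equivalent to this loop (a sketch written in terms of the method's own `text` and `options` parameters):

```swift
// Equivalent expansion of the reduce: each stage sees the previous
// stage's output, and the options set is passed through unchanged.
var current = [text]
for preTokenizer in preTokenizers {
    current = preTokenizer(texts: current, options: options)
}
return current
```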
@@ -84,7 +90,7 @@ class WhitespacePreTokenizer: PreTokenizer {
         re = #"\S+"#
     }
 
-    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
+    func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
         return text.ranges(of: re).map { String(text[$0]) }
     }
 }
@@ -124,7 +130,7 @@ class MetaspacePreTokenizer: PreTokenizer {
 
     // https://github.com/huggingface/tokenizers/blob/accd0650b802f2180df40ef1def3bce32156688e/tokenizers/src/pre_tokenizers/metaspace.rs#L114
     // https://github.com/xenova/transformers.js/blob/b07336d8f7ff57453cc164cc68aead2a79cbd57e/src/tokenizers.js#L2153
-    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
+    func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
         let normalized = text.replacingOccurrences(of: " ", with: stringReplacement)
 
         // We add a prefix space if:
@@ -140,7 +146,7 @@ class MetaspacePreTokenizer: PreTokenizer {
             if prependScheme == .always {
                 prepend = stringReplacement
             }
-            if prependScheme == .first && firstSection {
+            if prependScheme == .first && options.contains(.firstSection) {
                 prepend = stringReplacement
             }
         }
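With `prependScheme == .first`, the replacement is now prepended only when the caller's options contain `.firstSection`, so a tokenizer processing the later sections of a split input can pass `options: []` to suppress the prefix. A hedged illustration (the `metaspace` instance, its config, and the `▁` replacement are assumed, not shown in this diff):

```swift
// Assuming prependScheme == .first and stringReplacement == "▁":
metaspace(text: "Hello world")               // ["▁Hello▁world"] (prefix added: first section)
metaspace(text: "Hello world", options: [])  // ["Hello▁world"]  (no prefix: later section)
```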
@@ -163,7 +169,7 @@ class ByteLevelPreTokenizer: PreTokenizer {
         useRegex = config.useRegex?.boolValue ?? true
     }
 
-    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
+    func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
         // Split on whitespace and punctuation
         let tokens = useRegex ? text.ranges(of: RE).map({ String(text[$0]) }) : [text]
         return tokens.map { token in
@@ -185,7 +191,7 @@ class PunctuationPreTokenizer: PreTokenizer {
         re = "[^\(PUNCTUATION_REGEX)]+|[\(PUNCTUATION_REGEX)]+"
     }
 
-    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
+    func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
         // Ref: https://github.com/xenova/transformers.js/blob/27920d84831e323275b38f0b5186644b7936e1a2/src/tokenizers.js#L1138
         return text.ranges(of: re).map { String(text[$0]) }
     }
@@ -199,7 +205,7 @@ class DigitsPreTokenizer: PreTokenizer {
         re = "[^\\d]+|\\d\(individualDigits ? "" : "+")"
     }
 
-    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
+    func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
         return text.ranges(of: re).map { String(text[$0]) }
     }
 }
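The interpolated pattern matches either whole digit runs or single digits depending on `individualDigits`. A sketch of the two modes (the `digits` instance is hypothetical):

```swift
// individualDigits == false: pattern "[^\\d]+|\\d+" yields ["abc", "123", "def"]
// individualDigits == true:  pattern "[^\\d]+|\\d"  yields ["abc", "1", "2", "3", "def"]
let pieces = digits(text: "abc123def")
```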
@@ -213,7 +219,7 @@ class SplitPreTokenizer: PreTokenizer {
         invert = config.invert?.boolValue ?? false
     }
 
-    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
+    func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
         guard let pattern = pattern else { return [text] }
         return pattern.split(text, invert: invert)
     }