Update PreTokenizers to take a `firstSection` flag so Metaspace can apply its `.first` prepend scheme only to the first section

pcuenca · pcuenca · commit 2eb5995e5a57 · 2024-04-26T21:14:05.000+02:00
diff --git a/Sources/Tokenizers/PreTokenizer.swift b/Sources/Tokenizers/PreTokenizer.swift
@@ -9,27 +9,26 @@ import Foundation
 import Hub
 
 public protocol PreTokenizer {
-    func preTokenize(text: String) -> [String]
-    func preTokenize(texts: [String]) -> [String]
-    func callAsFunction(texts: [String]) -> [String]
-    func callAsFunction(text: String) -> [String]
+    func preTokenize(text: String, firstSection: Bool) -> [String]
+    func preTokenize(texts: [String], firstSection: Bool) -> [String]
+    func callAsFunction(texts: [String], firstSection: Bool) -> [String]
+    func callAsFunction(text: String, firstSection: Bool) -> [String]
 
     init(config: Config)
 }
 
 extension PreTokenizer {
-    func preTokenize(texts: [String]) -> [String] {
-        texts.flatMap { preTokenize(text: $0) }
+    func preTokenize(texts: [String], firstSection: Bool = true) -> [String] {
+        texts.flatMap { preTokenize(text: $0, firstSection: firstSection) }
     }
 
-    func callAsFunction(texts: [String]) -> [String] {
-        return preTokenize(texts: texts)
+    func callAsFunction(texts: [String], firstSection: Bool = true) -> [String] {
+        return preTokenize(texts: texts, firstSection: firstSection)
     }
     
-    func callAsFunction(text: String) -> [String] {
-        return preTokenize(text: text)
+    func callAsFunction(text: String, firstSection: Bool = true) -> [String] {
+        return preTokenize(text: text, firstSection: firstSection)
     }
-    
 }
 
 enum PreTokenizerType: String {
@@ -71,9 +70,9 @@ class PreTokenizerSequence: PreTokenizer {
         preTokenizers = configs.compactMap { PreTokenizerFactory.fromConfig(config: $0) }
     }
     
-    func preTokenize(text: String) -> [String] {
+    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
         preTokenizers.reduce([text]) { current, preTokenizer in
-            preTokenizer(texts: current)
+            preTokenizer(texts: current, firstSection: firstSection)
         }
     }
 }
@@ -85,7 +84,7 @@ class WhitespacePreTokenizer: PreTokenizer {
         re = #"\S+"#
     }
 
-    func preTokenize(text: String) -> [String] {
+    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
         return text.ranges(of: re).map { String(text[$0]) }
     }
 }
@@ -125,7 +124,7 @@ class MetaspacePreTokenizer: PreTokenizer {
     
     // https://github.com/huggingface/tokenizers/blob/accd0650b802f2180df40ef1def3bce32156688e/tokenizers/src/pre_tokenizers/metaspace.rs#L114
     // https://github.com/xenova/transformers.js/blob/b07336d8f7ff57453cc164cc68aead2a79cbd57e/src/tokenizers.js#L2153
-    func preTokenize(text: String) -> [String] {
+    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
         let normalized = text.replacingOccurrences(of: " ", with: stringReplacement)
         
         // We add a prefix space if:
@@ -141,7 +140,7 @@ class MetaspacePreTokenizer: PreTokenizer {
             if prependScheme == .always {
                 prepend = stringReplacement
             }
-            if prependScheme == .first /* && first_section */ {
+            if prependScheme == .first && firstSection {
                 prepend = stringReplacement
             }
         }
@@ -164,7 +163,7 @@ class ByteLevelPreTokenizer: PreTokenizer {
         useRegex = config.useRegex?.boolValue ?? true
     }
     
-    func preTokenize(text: String) -> [String] {
+    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
         // Split on whitespace and punctuation
         let tokens = useRegex ? text.ranges(of: RE).map({ String(text[$0]) }) : [text]
         return tokens.map { token in
@@ -186,7 +185,7 @@ class PunctuationPreTokenizer: PreTokenizer {
         re = "[^\(PUNCTUATION_REGEX)]+|[\(PUNCTUATION_REGEX)]+"
     }
 
-    func preTokenize(text: String) -> [String] {
+    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
         // Ref: https://github.com/xenova/transformers.js/blob/27920d84831e323275b38f0b5186644b7936e1a2/src/tokenizers.js#L1138
         return text.ranges(of: re).map { String(text[$0]) }
     }
@@ -200,7 +199,7 @@ class DigitsPreTokenizer: PreTokenizer {
         re = "[^\\d]+|\\d\(individualDigits ? "" : "+")"
     }
 
-    func preTokenize(text: String) -> [String] {
+    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
         return text.ranges(of: re).map { String(text[$0]) }
     }
 }
@@ -214,7 +213,7 @@ class SplitPreTokenizer: PreTokenizer {
         invert = config.invert?.boolValue ?? false
     }
 
-    func preTokenize(text: String) -> [String] {
+    func preTokenize(text: String, firstSection: Bool = true) -> [String] {
         guard let pattern = pattern else { return [text] }
         return pattern.split(text, invert: invert)
     }
diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift
@@ -185,9 +185,9 @@ public class PreTrainedTokenizer: Tokenizer {
         model = try TokenizerModel.from(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData, addedTokens: addedTokens)
     }
 
-    func preTokenize(_ text: String) -> [String] {
+    func preTokenize(_ text: String, firstSection: Bool) -> [String] {
         guard let preTokenizer = preTokenizer else { return [text] }
-        return preTokenizer(text: text)
+        return preTokenizer(text: text, firstSection: firstSection)
     }
 
     func normalize(_ text: String) -> String {
@@ -229,9 +229,9 @@ public class PreTrainedTokenizer: Tokenizer {
         } else {
             sections = [text]
         }
-        return sections.map { x in
+        return sections.enumerated().map { section, x in
             if addedTokens.contains(x) { return [x] }
-            return preTokenize(normalize(x)).flatMap { model($0) }
+            return preTokenize(normalize(x), firstSection: section == 0).flatMap { model($0) }
         }.flatMap { $0 }
     }
 

Original file line number	Diff line number	Diff line change
`@@ -9,27 +9,26 @@ import Foundation`
`9`	`9`	`import Hub`
`10`	`10`
`11`	`11`	`public protocol PreTokenizer {`
`12`		`- func preTokenize(text: String) -> [String]`
`13`		`- func preTokenize(texts: [String]) -> [String]`
`14`		`- func callAsFunction(texts: [String]) -> [String]`
`15`		`- func callAsFunction(text: String) -> [String]`
	`12`	`+ func preTokenize(text: String, firstSection: Bool) -> [String]`
	`13`	`+ func preTokenize(texts: [String], firstSection: Bool) -> [String]`
	`14`	`+ func callAsFunction(texts: [String], firstSection: Bool) -> [String]`
	`15`	`+ func callAsFunction(text: String, firstSection: Bool) -> [String]`
`16`	`16`
`17`	`17`	`init(config: Config)`
`18`	`18`	`}`
`19`	`19`
`20`	`20`	`extension PreTokenizer {`
`21`		`- func preTokenize(texts: [String]) -> [String] {`
`22`		`- texts.flatMap { preTokenize(text: $0) }`
	`21`	`+ func preTokenize(texts: [String], firstSection: Bool = true) -> [String] {`
	`22`	`+ texts.flatMap { preTokenize(text: $0, firstSection: firstSection) }`
`23`	`23`	`}`
`24`	`24`
`25`		`- func callAsFunction(texts: [String]) -> [String] {`
`26`		`- return preTokenize(texts: texts)`
	`25`	`+ func callAsFunction(texts: [String], firstSection: Bool = true) -> [String] {`
	`26`	`+ return preTokenize(texts: texts, firstSection: firstSection)`
`27`	`27`	`}`
`28`	`28`
`29`		`- func callAsFunction(text: String) -> [String] {`
`30`		`- return preTokenize(text: text)`
	`29`	`+ func callAsFunction(text: String, firstSection: Bool = true) -> [String] {`
	`30`	`+ return preTokenize(text: text, firstSection: firstSection)`
`31`	`31`	`}`
`32`		`-`
`33`	`32`	`}`
`34`	`33`
`35`	`34`	`enum PreTokenizerType: String {`
`@@ -71,9 +70,9 @@ class PreTokenizerSequence: PreTokenizer {`
`71`	`70`	`preTokenizers = configs.compactMap { PreTokenizerFactory.fromConfig(config: $0) }`
`72`	`71`	`}`
`73`	`72`
`74`		`- func preTokenize(text: String) -> [String] {`
	`73`	`+ func preTokenize(text: String, firstSection: Bool = true) -> [String] {`
`75`	`74`	`preTokenizers.reduce([text]) { current, preTokenizer in`
`76`		`- preTokenizer(texts: current)`
	`75`	`+ preTokenizer(texts: current, firstSection: firstSection)`
`77`	`76`	`}`
`78`	`77`	`}`
`79`	`78`	`}`
`@@ -85,7 +84,7 @@ class WhitespacePreTokenizer: PreTokenizer {`
`85`	`84`	`re = #"\S+"#`
`86`	`85`	`}`
`87`	`86`
`88`		`- func preTokenize(text: String) -> [String] {`
	`87`	`+ func preTokenize(text: String, firstSection: Bool = true) -> [String] {`
`89`	`88`	`return text.ranges(of: re).map { String(text[$0]) }`
`90`	`89`	`}`
`91`	`90`	`}`
`@@ -125,7 +124,7 @@ class MetaspacePreTokenizer: PreTokenizer {`
`125`	`124`
`126`	`125`	`// https://github.com/huggingface/tokenizers/blob/accd0650b802f2180df40ef1def3bce32156688e/tokenizers/src/pre_tokenizers/metaspace.rs#L114`
`127`	`126`	`// https://github.com/xenova/transformers.js/blob/b07336d8f7ff57453cc164cc68aead2a79cbd57e/src/tokenizers.js#L2153`
`128`		`- func preTokenize(text: String) -> [String] {`
	`127`	`+ func preTokenize(text: String, firstSection: Bool = true) -> [String] {`
`129`	`128`	`let normalized = text.replacingOccurrences(of: " ", with: stringReplacement)`
`130`	`129`
`131`	`130`	`// We add a prefix space if:`
`@@ -141,7 +140,7 @@ class MetaspacePreTokenizer: PreTokenizer {`
`141`	`140`	`if prependScheme == .always {`
`142`	`141`	`prepend = stringReplacement`
`143`	`142`	`}`
`144`		`- if prependScheme == .first /* && first_section */ {`
	`143`	`+ if prependScheme == .first && firstSection {`
`145`	`144`	`prepend = stringReplacement`
`146`	`145`	`}`
`147`	`146`	`}`
`@@ -164,7 +163,7 @@ class ByteLevelPreTokenizer: PreTokenizer {`
`164`	`163`	`useRegex = config.useRegex?.boolValue ?? true`
`165`	`164`	`}`
`166`	`165`
`167`		`- func preTokenize(text: String) -> [String] {`
	`166`	`+ func preTokenize(text: String, firstSection: Bool = true) -> [String] {`
`168`	`167`	`// Split on whitespace and punctuation`
`169`	`168`	`let tokens = useRegex ? text.ranges(of: RE).map({ String(text[$0]) }) : [text]`
`170`	`169`	`return tokens.map { token in`
`@@ -186,7 +185,7 @@ class PunctuationPreTokenizer: PreTokenizer {`
`186`	`185`	`re = "[^\(PUNCTUATION_REGEX)]+\|[\(PUNCTUATION_REGEX)]+"`
`187`	`186`	`}`
`188`	`187`
`189`		`- func preTokenize(text: String) -> [String] {`
	`188`	`+ func preTokenize(text: String, firstSection: Bool = true) -> [String] {`
`190`	`189`	`// Ref: https://github.com/xenova/transformers.js/blob/27920d84831e323275b38f0b5186644b7936e1a2/src/tokenizers.js#L1138`
`191`	`190`	`return text.ranges(of: re).map { String(text[$0]) }`
`192`	`191`	`}`
`@@ -200,7 +199,7 @@ class DigitsPreTokenizer: PreTokenizer {`
`200`	`199`	`re = "[^\\d]+\|\\d\(individualDigits ? "" : "+")"`
`201`	`200`	`}`
`202`	`201`
`203`		`- func preTokenize(text: String) -> [String] {`
	`202`	`+ func preTokenize(text: String, firstSection: Bool = true) -> [String] {`
`204`	`203`	`return text.ranges(of: re).map { String(text[$0]) }`
`205`	`204`	`}`
`206`	`205`	`}`
`@@ -214,7 +213,7 @@ class SplitPreTokenizer: PreTokenizer {`
`214`	`213`	`invert = config.invert?.boolValue ?? false`
`215`	`214`	`}`
`216`	`215`
`217`		`- func preTokenize(text: String) -> [String] {`
	`216`	`+ func preTokenize(text: String, firstSection: Bool = true) -> [String] {`
`218`	`217`	`guard let pattern = pattern else { return [text] }`
`219`	`218`	`return pattern.split(text, invert: invert)`
`220`	`219`	`}`
Original file line number	Diff line number	Diff line change
`@@ -185,9 +185,9 @@ public class PreTrainedTokenizer: Tokenizer {`
`185`	`185`	`model = try TokenizerModel.from(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData, addedTokens: addedTokens)`
`186`	`186`	`}`
`187`	`187`
`188`		`- func preTokenize(_ text: String) -> [String] {`
	`188`	`+ func preTokenize(_ text: String, firstSection: Bool) -> [String] {`
`189`	`189`	`guard let preTokenizer = preTokenizer else { return [text] }`
`190`		`- return preTokenizer(text: text)`
	`190`	`+ return preTokenizer(text: text, firstSection: firstSection)`
`191`	`191`	`}`
`192`	`192`
`193`	`193`	`func normalize(_ text: String) -> String {`
`@@ -229,9 +229,9 @@ public class PreTrainedTokenizer: Tokenizer {`
`229`	`229`	`} else {`
`230`	`230`	`sections = [text]`
`231`	`231`	`}`
`232`		`- return sections.map { x in`
	`232`	`+ return sections.enumerated().map { section, x in`
`233`	`233`	`if addedTokens.contains(x) { return [x] }`
`234`		`- return preTokenize(normalize(x)).flatMap { model($0) }`
	`234`	`+ return preTokenize(normalize(x), firstSection: section == 0).flatMap { model($0) }`
`235`	`235`	`}.flatMap { $0 }`
`236`	`236`	`}`
`237`	`237`