
Commit 1a001b5

Add RobertaProcessing (#48)
* Add RobertaProcessing
* Test RobertaProcessing
* Test RobertaProcessing
* Trim spaces from tokens pair
* Document variables
* Comment on trim spaces
1 parent c754d14 commit 1a001b5

4 files changed: 151 additions & 2 deletions

Package.swift

Lines changed: 2 additions & 1 deletion
@@ -30,6 +30,7 @@ let package = Package(
         .testTarget(name: "HubTests", dependencies: ["Hub"]),
         .testTarget(name: "PreTokenizerTests", dependencies: ["Tokenizers", "Hub"]),
         .testTarget(name: "TensorUtilsTests", dependencies: ["TensorUtils"]),
-        .testTarget(name: "NormalizerTests", dependencies: ["Tokenizers", "Hub"])
+        .testTarget(name: "NormalizerTests", dependencies: ["Tokenizers", "Hub"]),
+        .testTarget(name: "PostProcessorTests", dependencies: ["Tokenizers", "Hub"])
     ]
 )

Sources/Hub/Hub.swift

Lines changed: 3 additions & 0 deletions
@@ -98,6 +98,9 @@ public struct Config {
         guard let list = value as? [Any] else { return nil }
         return list.map { Config($0 as! [String : Any]) }
     }
+
+    /// Tuple of token identifier and string value
+    public var tokenValue: (UInt, String)? { value as? (UInt, String) }
 }
 
 public class LanguageModelConfigurationFromHub {
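For context, `tokenValue` gives post-processors typed access to a special-token entry stored in a `Config` as an `(id, token)` tuple. A minimal sketch of that access pattern, using the same `Config` dictionary shape as the tests below (the `"cls"` key and `<s>` string here are illustrative, not taken from the commit):

import Hub

// Hypothetical config entry carrying a class token as an (id, string) tuple.
let config = Config(["cls": (0, "<s>") as (UInt, String)])

// `tokenValue` unwraps the tuple; it is nil when the stored value has another type.
if let (id, token) = config.cls?.tokenValue {
    print(id, token)   // prints: 0 <s>
}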

Sources/Tokenizers/PostProcessor.swift

Lines changed: 63 additions & 1 deletion
@@ -24,7 +24,7 @@ extension PostProcessor {
 enum PostProcessorType: String {
     case TemplateProcessing
     case ByteLevel
-    // case RobertaProcessing
+    case RobertaProcessing
 }
 
 struct PostProcessorFactory {
@@ -35,6 +35,7 @@ struct PostProcessorFactory {
         switch type {
         case .TemplateProcessing: return TemplateProcessing(config: config)
         case .ByteLevel : return ByteLevelPostProcessor(config: config)
+        case .RobertaProcessing : return RobertaProcessing(config: config)
         default : fatalError("Unsupported PostProcessor type: \(typeName)")
         }
     }
@@ -75,3 +76,64 @@ class ByteLevelPostProcessor: PostProcessor {
     required public init(config: Config) {}
     func postProcess(tokens: [String], tokensPair: [String]? = nil) -> [String] { tokens }
 }
+
+class RobertaProcessing: PostProcessor {
+    private let sep: (UInt, String)
+    private let cls: (UInt, String)
+    /// Trim all remaining space, or leave one space character if `addPrefixSpace` is `true`.
+    private let trimOffset: Bool
+    /// Keep one space character on each side. Depends on `trimOffsets` being `true`.
+    private let addPrefixSpace: Bool
+
+    required public init(config: Config) {
+        guard let sep = config.sep?.tokenValue else { fatalError("Missing `sep` processor configuration") }
+        guard let cls = config.cls?.tokenValue else { fatalError("Missing `cls` processor configuration") }
+        self.sep = sep
+        self.cls = cls
+        self.trimOffset = config.trimOffset?.boolValue ?? true
+        self.addPrefixSpace = config.addPrefixSpace?.boolValue ?? true
+    }
+
+    func postProcess(tokens: [String], tokensPair: [String]?) -> [String] {
+        var outTokens = tokens
+        var tokensPair = tokensPair
+        if trimOffset {
+            if addPrefixSpace {
+                outTokens = outTokens.map({ trimExtraSpaces(token: $0) })
+                tokensPair = tokensPair?.map({ trimExtraSpaces(token: $0) })
+            } else {
+                outTokens = outTokens.map({ $0.trimmingCharacters(in: .whitespaces) })
+                tokensPair = tokensPair?.map({ $0.trimmingCharacters(in: .whitespaces) })
+            }
+        }
+
+        outTokens = [self.cls.1] + outTokens + [self.sep.1]
+        if let tokensPair = tokensPair, !tokensPair.isEmpty {
+            // Yes, it adds another `sep`.
+            // https://github.com/facebookresearch/fairseq/blob/main/fairseq/models/roberta/hub_interface.py#L58-L65
+            outTokens += [self.sep.1] + tokensPair + [self.sep.1]
+        }
+
+        return outTokens
+    }
+
+    /// Some tokens need one space around them
+    /// https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/pre_tokenizers/byte_level.rs#L203-L235
+    private func trimExtraSpaces(token: String) -> String {
+        let prefixOffset = findPrefixIndex(text: token)
+        let suffixOffset = findSuffixIndex(text: token)
+        let prefixIndex = token.index(token.startIndex, offsetBy: prefixOffset)
+        let suffixIndex = token.index(token.startIndex, offsetBy: token.count - suffixOffset)
+        return String(token[prefixIndex..<suffixIndex])
+    }
+
+    private func findPrefixIndex(text: String) -> Int {
+        guard !text.isEmpty, text.first!.isWhitespace else { return 0 }
+        return text.prefix(while: { $0.isWhitespace }).count - 1
+    }
+
+    private func findSuffixIndex(text: String) -> Int {
+        guard !text.isEmpty, text.last!.isWhitespace else { return 0 }
+        return text.reversed().prefix(while: { $0.isWhitespace }).count - 1
+    }
+}
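For a quick sense of what the new processor produces, here is a minimal usage sketch (not part of the commit; the `<s>`/`</s>` strings and ids are illustrative RoBERTa-style values, and `@testable` imports are needed because the type is internal, as in the tests below):

@testable import Tokenizers
@testable import Hub

// Hypothetical special tokens; real values would come from the tokenizer's config.
let config = Config(["cls": (0, "<s>") as (UInt, String),
                     "sep": (2, "</s>") as (UInt, String)])
let processor = RobertaProcessing(config: config)

// Single sequence: wrapped as <s> ... </s>; single leading spaces survive
// because `trimOffset` and `addPrefixSpace` default to true.
let single = processor.postProcess(tokens: [" Hello", " world"], tokensPair: nil)
// single == ["<s>", " Hello", " world", "</s>"]

// Sequence pair: the separator is doubled between the two segments,
// matching the fairseq RoBERTa input format linked in the code above.
let pair = processor.postProcess(tokens: [" Hello"], tokensPair: [" How", " are", " you"])
// pair == ["<s>", " Hello", "</s>", "</s>", " How", " are", " you", "</s>"]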
Tests/PostProcessorTests/PostProcessorTests.swift

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
+import XCTest
+@testable import Tokenizers
+@testable import Hub
+
+class PostProcessorTests: XCTestCase {
+    func testRobertaProcessing() {
+        let testCases: [(Config, [String], [String]?, [String])] = [
+            // Should keep spaces; uneven spaces; ignore `addPrefixSpace`.
+            (
+                Config(["cls": (0, "[HEAD]") as (UInt, String),
+                        "sep": (0, "[END]") as (UInt, String),
+                        "trimOffset": false,
+                        "addPrefixSpace": true,
+                       ]),
+                [" The", " sun", "sets ", " in ", " the ", "west"],
+                nil,
+                ["[HEAD]", " The", " sun", "sets ", " in ", " the ", "west", "[END]"]
+            ),
+            // Should leave only one space around each token.
+            (
+                Config(["cls": (0, "[START]") as (UInt, String),
+                        "sep": (0, "[BREAK]") as (UInt, String),
+                        "trimOffset": true,
+                        "addPrefixSpace": true,
+                       ]),
+                [" The ", " sun", "sets ", " in ", " the ", "west"],
+                nil,
+                ["[START]", " The ", " sun", "sets ", " in ", " the ", "west", "[BREAK]"]
+            ),
+            // Should ignore empty tokens pair.
+            (
+                Config(["cls": (0, "[START]") as (UInt, String),
+                        "sep": (0, "[BREAK]") as (UInt, String),
+                        "trimOffset": true,
+                        "addPrefixSpace": true,
+                       ]),
+                [" The ", " sun", "sets ", " in ", " the ", "west"],
+                [],
+                ["[START]", " The ", " sun", "sets ", " in ", " the ", "west", "[BREAK]"]
+            ),
+            // Should trim all whitespace.
+            (
+                Config(["cls": (0, "[CLS]") as (UInt, String),
+                        "sep": (0, "[SEP]") as (UInt, String),
+                        "trimOffset": true,
+                        "addPrefixSpace": false,
+                       ]),
+                [" The ", " sun", "sets ", " in ", " the ", "west"],
+                nil,
+                ["[CLS]", "The", "sun", "sets", "in", "the", "west", "[SEP]"]
+            ),
+            // Should add tokens.
+            (
+                Config(["cls": (0, "[CLS]") as (UInt, String),
+                        "sep": (0, "[SEP]") as (UInt, String),
+                        "trimOffset": true,
+                        "addPrefixSpace": true,
+                       ]),
+                [" The ", " sun", "sets ", " in ", " the ", "west"],
+                [".", "The", " cat ", " is ", " sitting ", " on", "the ", "mat"],
+                ["[CLS]", " The ", " sun", "sets ", " in ", " the ", "west", "[SEP]",
+                 "[SEP]", ".", "The", " cat ", " is ", " sitting ", " on", "the ",
+                 "mat", "[SEP]"]
+            ),
+            (
+                Config(["cls": (0, "[CLS]") as (UInt, String),
+                        "sep": (0, "[SEP]") as (UInt, String),
+                        "trimOffset": true,
+                        "addPrefixSpace": true,
+                       ]),
+                ["", "", ","],
+                ["", "", "!"],
+                ["[CLS]", "", "", ",", "[SEP]", "[SEP]", "", "", "!", "[SEP]"]
+            ),
+        ]
+
+        for (config, tokens, tokensPair, expect) in testCases {
+            let processor = RobertaProcessing(config: config)
+            let output = processor.postProcess(tokens: tokens, tokensPair: tokensPair)
+            XCTAssertEqual(output, expect)
+        }
+    }
+}
