From 207aa94e978917a359a91056d030da6bba4dd8be Mon Sep 17 00:00:00 2001 From: Jan Krukowski Date: Thu, 6 Mar 2025 15:17:42 +0100 Subject: [PATCH 1/2] Added RobertaTokenizer --- Sources/Hub/Hub.swift | 18 ++++++++++- Sources/Tokenizers/Tokenizer.swift | 3 +- Tests/HubTests/HubTests.swift | 36 ++++++++++++++++++++++ Tests/TokenizersTests/TokenizerTests.swift | 28 +++++++++++++++++ 4 files changed, 83 insertions(+), 2 deletions(-) diff --git a/Sources/Hub/Hub.swift b/Sources/Hub/Hub.swift index b303736..eaa0ee5 100644 --- a/Sources/Hub/Hub.swift +++ b/Sources/Hub/Hub.swift @@ -100,7 +100,23 @@ public struct Config { } /// Tuple of token identifier and string value - public var tokenValue: (UInt, String)? { value as? (UInt, String) } + public var tokenValue: (UInt, String)? { + if let value = value as? (UInt, String) { + return value + } + if let value = value as? (String, UInt) { + return (value.1, value.0) + } + if let value = value as? [Any] { + if let stringValue = value.first as? String, let intValue = value.dropFirst().first as? UInt { + return (intValue, stringValue) + } + if let intValue = value.first as? UInt, let stringValue = value.dropFirst().first as? String { + return (intValue, stringValue) + } + } + return nil + } } public class LanguageModelConfigurationFromHub { diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift index db53337..0a87196 100644 --- a/Sources/Tokenizers/Tokenizer.swift +++ b/Sources/Tokenizers/Tokenizer.swift @@ -90,7 +90,8 @@ struct TokenizerModel { "WhisperTokenizer" : WhisperTokenizer.self, "CohereTokenizer" : CohereTokenizer.self, "Qwen2Tokenizer" : Qwen2Tokenizer.self, - "PreTrainedTokenizer": BPETokenizer.self + "PreTrainedTokenizer": BPETokenizer.self, + "RobertaTokenizer" : BPETokenizer.self, ] static func unknownToken(from tokenizerConfig: Config) -> String? 
{ diff --git a/Tests/HubTests/HubTests.swift b/Tests/HubTests/HubTests.swift index 1d7bc86..b69473b 100644 --- a/Tests/HubTests/HubTests.swift +++ b/Tests/HubTests/HubTests.swift @@ -118,4 +118,40 @@ class HubTests: XCTestCase { let vocab_dict = config.dictionary["vocab"] as! [String: Int] XCTAssertNotEqual(vocab_dict.count, 2) } + + func testConfigTokenValue() throws { + let config1 = Config(["cls": (100, "str") as (UInt, String)]) + let tokenValue1 = config1.cls?.tokenValue + XCTAssertEqual(tokenValue1?.0, 100) + XCTAssertEqual(tokenValue1?.1, "str") + + let config2 = Config(["cls": ("str", 100) as (String, UInt)]) + let tokenValue2 = config2.cls?.tokenValue + XCTAssertEqual(tokenValue2?.0, 100) + XCTAssertEqual(tokenValue2?.1, "str") + + let config3 = Config(["cls": [100 as UInt, "str" as String] as [Any]]) + let tokenValue3 = config3.cls?.tokenValue + XCTAssertEqual(tokenValue3?.0, 100) + XCTAssertEqual(tokenValue3?.1, "str") + + let config4 = Config(["cls": ["str" as String, 100 as UInt] as [Any]]) + let tokenValue4 = config4.cls?.tokenValue + XCTAssertEqual(tokenValue4?.0, 100) + XCTAssertEqual(tokenValue4?.1, "str") + + let data5 = #"{"cls": [100, "str"]}"#.data(using: .utf8)! + let dict5 = try JSONSerialization.jsonObject(with: data5, options: []) as! [NSString: Any] + let config5 = Config(dict5) + let tokenValue5 = config5.cls?.tokenValue + XCTAssertEqual(tokenValue5?.0, 100) + XCTAssertEqual(tokenValue5?.1, "str") + + let data6 = #"{"cls": ["str", 100]}"#.data(using: .utf8)! + let dict6 = try JSONSerialization.jsonObject(with: data6, options: []) as! 
[NSString: Any] + let config6 = Config(dict6) + let tokenValue6 = config6.cls?.tokenValue + XCTAssertEqual(tokenValue6?.0, 100) + XCTAssertEqual(tokenValue6?.1, "str") + } } diff --git a/Tests/TokenizersTests/TokenizerTests.swift b/Tests/TokenizersTests/TokenizerTests.swift index eae7003..7ef4472 100644 --- a/Tests/TokenizersTests/TokenizerTests.swift +++ b/Tests/TokenizersTests/TokenizerTests.swift @@ -212,6 +212,34 @@ class BertSpacesTests: XCTestCase { } } +class RobertaTests: XCTestCase { + func testEncodeDecode() async throws { + guard let tokenizer = try await AutoTokenizer.from(pretrained: "ibm-granite/granite-embedding-30m-english") as? PreTrainedTokenizer else { + XCTFail() + return + } + + XCTAssertEqual(tokenizer.tokenize(text: "l'eure"), ["l", "'", "e", "ure"]) + XCTAssertEqual(tokenizer.encode(text: "l'eure"), [0, 462, 108, 242, 2407, 2]) + XCTAssertEqual(tokenizer.decode(tokens: tokenizer.encode(text: "l'eure"), skipSpecialTokens: true), "l'eure") + + XCTAssertEqual(tokenizer.tokenize(text: "mąka"), ["m", "Ä", "ħ", "ka"]) + XCTAssertEqual(tokenizer.encode(text: "mąka"), [0, 119, 649, 5782, 2348, 2]) + + XCTAssertEqual(tokenizer.tokenize(text: "département"), ["d", "é", "part", "ement"]) + XCTAssertEqual(tokenizer.encode(text: "département"), [0, 417, 1140, 7755, 6285, 2]) + + XCTAssertEqual(tokenizer.tokenize(text: "Who are you?"), ["Who", "Ġare", "Ġyou", "?"]) + XCTAssertEqual(tokenizer.encode(text: "Who are you?"), [0, 12375, 32, 47, 116, 2]) + + XCTAssertEqual(tokenizer.tokenize(text: " Who are you? "), ["ĠWho", "Ġare", "Ġyou", "?", "Ġ"]) + XCTAssertEqual(tokenizer.encode(text: " Who are you? 
"), [0, 3394, 32, 47, 116, 1437, 2]) + + XCTAssertEqual(tokenizer.tokenize(text: "<s>Who are you?</s>"), ["<s>", "Who", "Ġare", "Ġyou", "?", "</s>"]) + XCTAssertEqual(tokenizer.encode(text: "<s>Who are you?</s>"), [0, 0, 12375, 32, 47, 116, 2, 2]) + } +} + struct EncodedTokenizerSamplesDataset: Decodable { let text: String From 52f9c4267b7bd4cb53f461aec0396b3cbe8f5270 Mon Sep 17 00:00:00 2001 From: Jan Krukowski Date: Wed, 12 Mar 2025 14:18:30 +0100 Subject: [PATCH 2/2] Made parsing more restrictive, fixed tests --- Sources/Hub/Hub.swift | 18 ++++------- Tests/HubTests/HubTests.swift | 30 +++---------------- .../PostProcessorTests.swift | 24 +++++++-------- Tests/TokenizersTests/TokenizerTests.swift | 2 +- 4 files changed, 22 insertions(+), 52 deletions(-) diff --git a/Sources/Hub/Hub.swift b/Sources/Hub/Hub.swift index eaa0ee5..15e6cf5 100644 --- a/Sources/Hub/Hub.swift +++ b/Sources/Hub/Hub.swift @@ -101,21 +101,13 @@ public struct Config { /// Tuple of token identifier and string value public var tokenValue: (UInt, String)? { - if let value = value as? (UInt, String) { - return value - } - if let value = value as? (String, UInt) { - return (value.1, value.0) + guard let value = value as? [Any] else { + return nil } - if let value = value as? [Any] { - if let stringValue = value.first as? String, let intValue = value.dropFirst().first as? UInt { - return (intValue, stringValue) - } - if let intValue = value.first as? UInt, let stringValue = value.dropFirst().first as? 
UInt else { + return nil } - return nil + return (intValue, stringValue) } } diff --git a/Tests/HubTests/HubTests.swift b/Tests/HubTests/HubTests.swift index b69473b..7c1f3e2 100644 --- a/Tests/HubTests/HubTests.swift +++ b/Tests/HubTests/HubTests.swift @@ -120,38 +120,16 @@ class HubTests: XCTestCase { } func testConfigTokenValue() throws { - let config1 = Config(["cls": (100, "str") as (UInt, String)]) + let config1 = Config(["cls": ["str" as String, 100 as UInt] as [Any]]) let tokenValue1 = config1.cls?.tokenValue XCTAssertEqual(tokenValue1?.0, 100) XCTAssertEqual(tokenValue1?.1, "str") - let config2 = Config(["cls": ("str", 100) as (String, UInt)]) + let data = #"{"cls": ["str", 100]}"#.data(using: .utf8)! + let dict = try JSONSerialization.jsonObject(with: data, options: []) as! [NSString: Any] + let config2 = Config(dict) let tokenValue2 = config2.cls?.tokenValue XCTAssertEqual(tokenValue2?.0, 100) XCTAssertEqual(tokenValue2?.1, "str") - - let config3 = Config(["cls": [100 as UInt, "str" as String] as [Any]]) - let tokenValue3 = config3.cls?.tokenValue - XCTAssertEqual(tokenValue3?.0, 100) - XCTAssertEqual(tokenValue3?.1, "str") - - let config4 = Config(["cls": ["str" as String, 100 as UInt] as [Any]]) - let tokenValue4 = config4.cls?.tokenValue - XCTAssertEqual(tokenValue4?.0, 100) - XCTAssertEqual(tokenValue4?.1, "str") - - let data5 = #"{"cls": [100, "str"]}"#.data(using: .utf8)! - let dict5 = try JSONSerialization.jsonObject(with: data5, options: []) as! [NSString: Any] - let config5 = Config(dict5) - let tokenValue5 = config5.cls?.tokenValue - XCTAssertEqual(tokenValue5?.0, 100) - XCTAssertEqual(tokenValue5?.1, "str") - - let data6 = #"{"cls": ["str", 100]}"#.data(using: .utf8)! - let dict6 = try JSONSerialization.jsonObject(with: data6, options: []) as! 
[NSString: Any] - let config6 = Config(dict6) - let tokenValue6 = config6.cls?.tokenValue - XCTAssertEqual(tokenValue6?.0, 100) - XCTAssertEqual(tokenValue6?.1, "str") } } diff --git a/Tests/PostProcessorTests/PostProcessorTests.swift b/Tests/PostProcessorTests/PostProcessorTests.swift index 347bc38..4046d6d 100644 --- a/Tests/PostProcessorTests/PostProcessorTests.swift +++ b/Tests/PostProcessorTests/PostProcessorTests.swift @@ -7,8 +7,8 @@ class PostProcessorTests: XCTestCase { let testCases: [(Config, [String], [String]?, [String])] = [ // Should keep spaces; uneven spaces; ignore `addPrefixSpace`. ( - Config(["cls": (0, "[HEAD]") as (UInt, String), - "sep": (0, "[END]") as (UInt, String), + Config(["cls": ["[HEAD]", 0 as UInt], + "sep": ["[END]", 0 as UInt], "trimOffset": false, "addPrefixSpace": true, ]), @@ -18,8 +18,8 @@ class PostProcessorTests: XCTestCase { ), // Should leave only one space around each token. ( - Config(["cls": (0, "[START]") as (UInt, String), - "sep": (0, "[BREAK]") as (UInt, String), + Config(["cls": ["[START]", 0 as UInt], + "sep": ["[BREAK]", 0 as UInt], "trimOffset": true, "addPrefixSpace": true, ]), @@ -29,8 +29,8 @@ class PostProcessorTests: XCTestCase { ), // Should ignore empty tokens pair. ( - Config(["cls": (0, "[START]") as (UInt, String), - "sep": (0, "[BREAK]") as (UInt, String), + Config(["cls": ["[START]", 0 as UInt], + "sep": ["[BREAK]", 0 as UInt], "trimOffset": true, "addPrefixSpace": true, ]), @@ -40,8 +40,8 @@ class PostProcessorTests: XCTestCase { ), // Should trim all whitespace. ( - Config(["cls": (0, "[CLS]") as (UInt, String), - "sep": (0, "[SEP]") as (UInt, String), + Config(["cls": ["[CLS]", 0 as UInt], + "sep": ["[SEP]", 0 as UInt], "trimOffset": true, "addPrefixSpace": false, ]), @@ -51,8 +51,8 @@ class PostProcessorTests: XCTestCase { ), // Should add tokens. 
( - Config(["cls": (0, "[CLS]") as (UInt, String), - "sep": (0, "[SEP]") as (UInt, String), + Config(["cls": ["[CLS]", 0 as UInt], + "sep": ["[SEP]", 0 as UInt], "trimOffset": true, "addPrefixSpace": true, ]), @@ -63,8 +63,8 @@ class PostProcessorTests: XCTestCase { "mat", "[SEP]"] ), ( - Config(["cls": (0, "[CLS]") as (UInt, String), - "sep": (0, "[SEP]") as (UInt, String), + Config(["cls": ["[CLS]", 0 as UInt], + "sep": ["[SEP]", 0 as UInt], "trimOffset": true, "addPrefixSpace": true, ]), diff --git a/Tests/TokenizersTests/TokenizerTests.swift b/Tests/TokenizersTests/TokenizerTests.swift index 7ef4472..13d666a 100644 --- a/Tests/TokenizersTests/TokenizerTests.swift +++ b/Tests/TokenizersTests/TokenizerTests.swift @@ -214,7 +214,7 @@ class BertSpacesTests: XCTestCase { class RobertaTests: XCTestCase { func testEncodeDecode() async throws { - guard let tokenizer = try await AutoTokenizer.from(pretrained: "ibm-granite/granite-embedding-30m-english") as? PreTrainedTokenizer else { + guard let tokenizer = try await AutoTokenizer.from(pretrained: "FacebookAI/roberta-base") as? PreTrainedTokenizer else { XCTFail() return }