diff --git a/Sources/Hub/Hub.swift b/Sources/Hub/Hub.swift
index 1c2cd22..fe8f461 100644
--- a/Sources/Hub/Hub.swift
+++ b/Sources/Hub/Hub.swift
@@ -133,7 +133,15 @@ public struct Config {
     }

     /// Tuple of token identifier and string value
-    public var tokenValue: (UInt, String)? { value as? (UInt, String) }
+    public var tokenValue: (UInt, String)? {
+        guard let value = value as? [Any] else {
+            return nil
+        }
+        guard let stringValue = value.first as? String, let intValue = value.dropFirst().first as? UInt else {
+            return nil
+        }
+        return (intValue, stringValue)
+    }
 }

 public class LanguageModelConfigurationFromHub {
diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift
index 1926933..82d7ff0 100644
--- a/Sources/Tokenizers/Tokenizer.swift
+++ b/Sources/Tokenizers/Tokenizer.swift
@@ -101,6 +101,7 @@ struct TokenizerModel {
         "BertTokenizer": BertTokenizer.self,
         "DistilbertTokenizer": BertTokenizer.self,
         "DistilBertTokenizer": BertTokenizer.self,
+        "RobertaTokenizer": BPETokenizer.self,
         "CodeGenTokenizer": CodeGenTokenizer.self,
         "CodeLlamaTokenizer": CodeLlamaTokenizer.self,
         "FalconTokenizer": FalconTokenizer.self,
@@ -230,7 +231,7 @@ public extension Tokenizer {
     func callAsFunction(_ text: String, addSpecialTokens: Bool = true) -> [Int] {
         encode(text: text, addSpecialTokens: addSpecialTokens)
     }
-    
+
     func decode(tokens: [Int]) -> String {
         decode(tokens: tokens, skipSpecialTokens: false)
     }
diff --git a/Tests/HubTests/HubTests.swift b/Tests/HubTests/HubTests.swift
index 1f726b0..00d638e 100644
--- a/Tests/HubTests/HubTests.swift
+++ b/Tests/HubTests/HubTests.swift
@@ -117,4 +117,18 @@ class HubTests: XCTestCase {
         let vocab_dict = config.dictionary["vocab"] as! [String: Int]
         XCTAssertNotEqual(vocab_dict.count, 2)
     }
+
+    func testConfigTokenValue() throws {
+        let config1 = Config(["cls": ["str" as String, 100 as UInt] as [Any]])
+        let tokenValue1 = config1.cls?.tokenValue
+        XCTAssertEqual(tokenValue1?.0, 100)
+        XCTAssertEqual(tokenValue1?.1, "str")
+
+        let data = #"{"cls": ["str", 100]}"#.data(using: .utf8)!
+        let dict = try JSONSerialization.jsonObject(with: data, options: []) as! [NSString: Any]
+        let config2 = Config(dict)
+        let tokenValue2 = config2.cls?.tokenValue
+        XCTAssertEqual(tokenValue2?.0, 100)
+        XCTAssertEqual(tokenValue2?.1, "str")
+    }
 }
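Aside, for context on the `tokenValue` change: `JSONSerialization` never produces Swift tuples, so the old `value as? (UInt, String)` cast always failed for configs loaded from JSON. The new accessor reads the `[string, id]` array shape that tokenizer JSON actually carries and returns it flipped as `(id, string)`. A minimal usage sketch, mirroring the shapes from `testConfigTokenValue` above (the `"[CLS]"`/`101` values are illustrative, not from a real config):

    // Built the way JSONSerialization would deliver it: a heterogeneous array.
    let config = Config(["cls": ["[CLS]" as String, 101 as UInt] as [Any]])

    // tokenValue flips the JSON order [string, id] into (id, string).
    if let (id, token) = config.cls?.tokenValue {
        print("\(token) -> \(id)")  // [CLS] -> 101
    }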
diff --git a/Tests/PostProcessorTests/PostProcessorTests.swift b/Tests/PostProcessorTests/PostProcessorTests.swift
index 0e46cb2..e106ac6 100644
--- a/Tests/PostProcessorTests/PostProcessorTests.swift
+++ b/Tests/PostProcessorTests/PostProcessorTests.swift
@@ -7,8 +7,8 @@ class PostProcessorTests: XCTestCase {
         let testCases: [(Config, [String], [String]?, [String])] = [
             // Should keep spaces; uneven spaces; ignore `addPrefixSpace`.
             (
-                Config(["cls": (0, "[HEAD]") as (UInt, String),
-                        "sep": (0, "[END]") as (UInt, String),
+                Config(["cls": ["[HEAD]", 0 as UInt],
+                        "sep": ["[END]", 0 as UInt],
                         "trimOffset": false,
                         "addPrefixSpace": true]),
                 [" The", " sun", "sets ", " in ", " the ", "west"],
@@ -17,8 +17,8 @@ class PostProcessorTests: XCTestCase {
             ),
             // Should leave only one space around each token.
             (
-                Config(["cls": (0, "[START]") as (UInt, String),
-                        "sep": (0, "[BREAK]") as (UInt, String),
+                Config(["cls": ["[START]", 0 as UInt],
+                        "sep": ["[BREAK]", 0 as UInt],
                         "trimOffset": true,
                         "addPrefixSpace": true]),
                 [" The ", " sun", "sets ", " in ", " the ", "west"],
@@ -27,8 +27,8 @@ class PostProcessorTests: XCTestCase {
             ),
             // Should ignore empty tokens pair.
             (
-                Config(["cls": (0, "[START]") as (UInt, String),
-                        "sep": (0, "[BREAK]") as (UInt, String),
+                Config(["cls": ["[START]", 0 as UInt],
+                        "sep": ["[BREAK]", 0 as UInt],
                         "trimOffset": true,
                         "addPrefixSpace": true]),
                 [" The ", " sun", "sets ", " in ", " the ", "west"],
@@ -37,8 +37,8 @@ class PostProcessorTests: XCTestCase {
             ),
             // Should trim all whitespace.
             (
-                Config(["cls": (0, "[CLS]") as (UInt, String),
-                        "sep": (0, "[SEP]") as (UInt, String),
+                Config(["cls": ["[CLS]", 0 as UInt],
+                        "sep": ["[SEP]", 0 as UInt],
                         "trimOffset": true,
                         "addPrefixSpace": false]),
                 [" The ", " sun", "sets ", " in ", " the ", "west"],
@@ -47,8 +47,8 @@ class PostProcessorTests: XCTestCase {
             ),
             // Should add tokens.
             (
-                Config(["cls": (0, "[CLS]") as (UInt, String),
-                        "sep": (0, "[SEP]") as (UInt, String),
+                Config(["cls": ["[CLS]", 0 as UInt],
+                        "sep": ["[SEP]", 0 as UInt],
                         "trimOffset": true,
                         "addPrefixSpace": true]),
                 [" The ", " sun", "sets ", " in ", " the ", "west"],
@@ -58,8 +58,8 @@ class PostProcessorTests: XCTestCase {
                  "mat", "[SEP]"]
             ),
             (
-                Config(["cls": (0, "[CLS]") as (UInt, String),
-                        "sep": (0, "[SEP]") as (UInt, String),
+                Config(["cls": ["[CLS]", 0 as UInt],
+                        "sep": ["[SEP]", 0 as UInt],
                         "trimOffset": true,
                         "addPrefixSpace": true]),
                 [" 你 ", " 好 ", ","],
diff --git a/Tests/TokenizersTests/TokenizerTests.swift b/Tests/TokenizersTests/TokenizerTests.swift
index 1911feb..3d9c2ea 100644
--- a/Tests/TokenizersTests/TokenizerTests.swift
+++ b/Tests/TokenizersTests/TokenizerTests.swift
@@ -212,6 +212,34 @@ class BertSpacesTests: XCTestCase {
     }
 }

+class RobertaTests: XCTestCase {
+    func testEncodeDecode() async throws {
+        guard let tokenizer = try await AutoTokenizer.from(pretrained: "FacebookAI/roberta-base") as? PreTrainedTokenizer else {
+            XCTFail()
+            return
+        }
+
+        XCTAssertEqual(tokenizer.tokenize(text: "l'eure"), ["l", "'", "e", "ure"])
+        XCTAssertEqual(tokenizer.encode(text: "l'eure"), [0, 462, 108, 242, 2407, 2])
+        XCTAssertEqual(tokenizer.decode(tokens: tokenizer.encode(text: "l'eure"), skipSpecialTokens: true), "l'eure")
+
+        XCTAssertEqual(tokenizer.tokenize(text: "mąka"), ["m", "Ä", "ħ", "ka"])
+        XCTAssertEqual(tokenizer.encode(text: "mąka"), [0, 119, 649, 5782, 2348, 2])
+
+        XCTAssertEqual(tokenizer.tokenize(text: "département"), ["d", "Ã©", "part", "ement"])
+        XCTAssertEqual(tokenizer.encode(text: "département"), [0, 417, 1140, 7755, 6285, 2])
+
+        XCTAssertEqual(tokenizer.tokenize(text: "Who are you?"), ["Who", "Ġare", "Ġyou", "?"])
+        XCTAssertEqual(tokenizer.encode(text: "Who are you?"), [0, 12375, 32, 47, 116, 2])
+
+        XCTAssertEqual(tokenizer.tokenize(text: " Who are you? "), ["ĠWho", "Ġare", "Ġyou", "?", "Ġ"])
+        XCTAssertEqual(tokenizer.encode(text: " Who are you? "), [0, 3394, 32, 47, 116, 1437, 2])
+
+        XCTAssertEqual(tokenizer.tokenize(text: "<s>Who are you?</s>"), ["<s>", "Who", "Ġare", "Ġyou", "?", "</s>"])
+        XCTAssertEqual(tokenizer.encode(text: "<s>Who are you?</s>"), [0, 0, 12375, 32, 47, 116, 2, 2])
+    }
+}
+
 struct EncodedTokenizerSamplesDataset: Decodable {
     let text: String
     // Bad naming, not just for bpe.
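Aside, for context on the expectations above: the odd-looking tokens ("Ä", "ħ", "Ã©", "Ġ") are an artifact of GPT-2-style byte-level BPE, which RoBERTa shares. Every UTF-8 byte is remapped to a printable character before merges run, so "ą" (bytes 0xC4 0x85) surfaces as "Ä" plus "ħ", "é" (0xC3 0xA9) as "Ã©", and "Ġ" marks a leading space. A rough sketch of that byte-to-character alphabet (my own illustration of the GPT-2 scheme, not code from this patch):

    // Every byte 0...255 gets a printable scalar: printable ASCII and two
    // Latin-1 ranges map to themselves; the rest are shifted past U+00FF
    // so no token character is invisible or unprintable.
    func byteToUnicode() -> [UInt8: Character] {
        var mapping = [UInt8: Character]()
        var shift = 0
        for byte in 0...255 {
            if (33...126).contains(byte) || (161...172).contains(byte) || (174...255).contains(byte) {
                mapping[UInt8(byte)] = Character(Unicode.Scalar(UInt32(byte))!)
            } else {
                // e.g. space (0x20) -> U+0120 "Ġ"; byte 0x85 -> U+0127 "ħ"
                mapping[UInt8(byte)] = Character(Unicode.Scalar(UInt32(256 + shift))!)
                shift += 1
            }
        }
        return mapping
    }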
@@ -239,16 +267,16 @@ struct EncodedData: Decodable {
 class TokenizerTester {
     let encodedSamplesFilename: String
     let unknownTokenId: Int?
-    
+
     private var configuration: LanguageModelConfigurationFromHub?
     private var edgeCases: [EdgeCase]?
     private var _tokenizer: Tokenizer?
-    
+
     init(hubModelName: String, encodedSamplesFilename: String, unknownTokenId: Int?, hubApi: HubApi) {
         configuration = LanguageModelConfigurationFromHub(modelName: hubModelName, hubApi: hubApi)
         self.encodedSamplesFilename = encodedSamplesFilename
         self.unknownTokenId = unknownTokenId
-        
+
         // Read the edge cases dataset
         edgeCases = {
             let url = Bundle.module.url(forResource: "tokenizer_tests", withExtension: "json")!
@@ -259,7 +287,7 @@ class TokenizerTester {
             return cases[hubModelName]
         }()
     }
-    
+
     lazy var dataset: EncodedTokenizerSamplesDataset = {
         let url = Bundle.module.url(forResource: encodedSamplesFilename, withExtension: "json")!
         let json = try! Data(contentsOf: url)
@@ -267,7 +295,7 @@ class TokenizerTester {
         let dataset = try! decoder.decode(EncodedTokenizerSamplesDataset.self, from: json)
         return dataset
     }()
-    
+
     var tokenizer: Tokenizer? {
         get async {
             guard _tokenizer == nil else { return _tokenizer! }
@@ -283,7 +311,7 @@ class TokenizerTester {
             return _tokenizer
         }
     }
-    
+
     var tokenizerModel: TokenizingModel? {
         get async {
             // The model is not usually accessible; maybe it should
@@ -291,7 +319,7 @@ class TokenizerTester {
             return (tokenizer as! PreTrainedTokenizer).model
         }
     }
-    
+
     func testTokenize() async {
         let tokenized = await tokenizer?.tokenize(text: dataset.text)
         XCTAssertEqual(
@@ -299,7 +327,7 @@ class TokenizerTester {
             dataset.bpe_tokens
         )
     }
-    
+
     func testEncode() async {
         let encoded = await tokenizer?.encode(text: dataset.text)
         XCTAssertEqual(
@@ -307,7 +335,7 @@ class TokenizerTester {
             dataset.token_ids
         )
     }
-    
+
     func testDecode() async {
         let decoded = await tokenizer?.decode(tokens: dataset.token_ids)
         XCTAssertEqual(
@@ -315,7 +343,7 @@ class TokenizerTester {
             dataset.decoded_text
         )
     }
-    
+
     /// Test encode and decode for a few edge cases
     func testEdgeCases() async {
         guard let edgeCases else {
@@ -339,7 +367,7 @@ class TokenizerTester {
             )
         }
     }
-    
+
     func testUnknownToken() async {
         guard let model = await tokenizerModel else { return }
         XCTAssertEqual(model.unknownTokenId, unknownTokenId)
@@ -361,10 +389,10 @@
 class TokenizerTests: XCTestCase {
     /// Parallel testing in Xcode (when enabled) uses different processes, so this shouldn't be a problem
     static var _tester: TokenizerTester? = nil
-    
+
     class var hubModelName: String? { nil }
     class var encodedSamplesFilename: String? { nil }
-    
+
     /// Known id retrieved from Python, to verify it was parsed correctly
     class var unknownTokenId: Int? { nil }

@@ -399,25 +427,25 @@ class TokenizerTests: XCTestCase {
             await tester.testTokenize()
         }
     }
-    
+
     func testEncode() async {
         if let tester = Self._tester {
             await tester.testEncode()
         }
     }
-    
+
     func testDecode() async {
         if let tester = Self._tester {
             await tester.testDecode()
         }
     }
-    
+
     func testEdgeCases() async {
         if let tester = Self._tester {
             await tester.testEdgeCases()
         }
     }
-    
+
     func testUnknownToken() async {
         if let tester = Self._tester {
             await tester.testUnknownToken()
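With the registry entry in place, the new path can be exercised end to end. A hedged usage sketch (the calls mirror `RobertaTests` above; the encoded ids are the ones the test asserts, and the decoded string assumes the round-trip behavior checked there for "l'eure"):

    import Tokenizers

    // "RobertaTokenizer" in tokenizer_config.json now resolves to BPETokenizer.
    let tokenizer = try await AutoTokenizer.from(pretrained: "FacebookAI/roberta-base")

    let ids = tokenizer.encode(text: "Who are you?")
    print(ids)  // [0, 12375, 32, 47, 116, 2]
    print(tokenizer.decode(tokens: ids, skipSpecialTokens: true))  // Who are you?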