diff --git a/Sources/Hub/Hub.swift b/Sources/Hub/Hub.swift
index 1c2cd22..fe8f461 100644
--- a/Sources/Hub/Hub.swift
+++ b/Sources/Hub/Hub.swift
@@ -133,7 +133,16 @@ public struct Config {
     }
 
     /// Tuple of token identifier and string value
-    public var tokenValue: (UInt, String)? { value as? (UInt, String) }
+    public var tokenValue: (UInt, String)? {
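+        // The value is serialized in JSON as a two-element array: the token string first, then its numeric id.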
+        guard let value = value as? [Any] else {
+            return nil
+        }
+        guard let stringValue = value.first as? String, let intValue = value.dropFirst().first as? UInt else {
+            return nil
+        }
+        return (intValue, stringValue)
+    }
 }
 
 public class LanguageModelConfigurationFromHub {
diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift
index 1926933..82d7ff0 100644
--- a/Sources/Tokenizers/Tokenizer.swift
+++ b/Sources/Tokenizers/Tokenizer.swift
@@ -101,6 +101,8 @@ struct TokenizerModel {
         "BertTokenizer": BertTokenizer.self,
         "DistilbertTokenizer": BertTokenizer.self,
         "DistilBertTokenizer": BertTokenizer.self,
+ "RobertaTokenizer": BPETokenizer.self,
"CodeGenTokenizer": CodeGenTokenizer.self,
"CodeLlamaTokenizer": CodeLlamaTokenizer.self,
"FalconTokenizer": FalconTokenizer.self,
@@ -230,7 +231,7 @@ public extension Tokenizer {
     func callAsFunction(_ text: String, addSpecialTokens: Bool = true) -> [Int] {
         encode(text: text, addSpecialTokens: addSpecialTokens)
     }
-    
+
     func decode(tokens: [Int]) -> String {
         decode(tokens: tokens, skipSpecialTokens: false)
     }
diff --git a/Tests/HubTests/HubTests.swift b/Tests/HubTests/HubTests.swift
index 1f726b0..00d638e 100644
--- a/Tests/HubTests/HubTests.swift
+++ b/Tests/HubTests/HubTests.swift
@@ -117,4 +117,19 @@ class HubTests: XCTestCase {
         let vocab_dict = config.dictionary["vocab"] as! [String: Int]
         XCTAssertNotEqual(vocab_dict.count, 2)
     }
+
+    func testConfigTokenValue() throws {
+        let config1 = Config(["cls": ["str" as String, 100 as UInt] as [Any]])
+        let tokenValue1 = config1.cls?.tokenValue
+        XCTAssertEqual(tokenValue1?.0, 100)
+        XCTAssertEqual(tokenValue1?.1, "str")
+
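+        // JSONSerialization produces NSString keys and NSNumber values; verify those bridge correctly too.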
+        let data = #"{"cls": ["str", 100]}"#.data(using: .utf8)!
+        let dict = try JSONSerialization.jsonObject(with: data, options: []) as! [NSString: Any]
+        let config2 = Config(dict)
+        let tokenValue2 = config2.cls?.tokenValue
+        XCTAssertEqual(tokenValue2?.0, 100)
+        XCTAssertEqual(tokenValue2?.1, "str")
+    }
 }
diff --git a/Tests/PostProcessorTests/PostProcessorTests.swift b/Tests/PostProcessorTests/PostProcessorTests.swift
index 0e46cb2..e106ac6 100644
--- a/Tests/PostProcessorTests/PostProcessorTests.swift
+++ b/Tests/PostProcessorTests/PostProcessorTests.swift
@@ -7,8 +7,8 @@ class PostProcessorTests: XCTestCase {
         let testCases: [(Config, [String], [String]?, [String])] = [
             // Should keep spaces; uneven spaces; ignore `addPrefixSpace`.
             (
-                Config(["cls": (0, "[HEAD]") as (UInt, String),
-                        "sep": (0, "[END]") as (UInt, String),
+                Config(["cls": ["[HEAD]", 0 as UInt],
+                        "sep": ["[END]", 0 as UInt],
                         "trimOffset": false,
                         "addPrefixSpace": true]),
                 [" The", " sun", "sets ", " in ", " the ", "west"],
@@ -17,8 +17,8 @@ class PostProcessorTests: XCTestCase {
             ),
             // Should leave only one space around each token.
             (
-                Config(["cls": (0, "[START]") as (UInt, String),
-                        "sep": (0, "[BREAK]") as (UInt, String),
+                Config(["cls": ["[START]", 0 as UInt],
+                        "sep": ["[BREAK]", 0 as UInt],
                         "trimOffset": true,
                         "addPrefixSpace": true]),
                 [" The ", " sun", "sets ", " in ", " the ", "west"],
@@ -27,8 +27,8 @@ class PostProcessorTests: XCTestCase {
             ),
             // Should ignore empty tokens pair.
            (
-                Config(["cls": (0, "[START]") as (UInt, String),
-                        "sep": (0, "[BREAK]") as (UInt, String),
+                Config(["cls": ["[START]", 0 as UInt],
+                        "sep": ["[BREAK]", 0 as UInt],
                         "trimOffset": true,
                         "addPrefixSpace": true]),
                 [" The ", " sun", "sets ", " in ", " the ", "west"],
@@ -37,8 +37,8 @@ class PostProcessorTests: XCTestCase {
             ),
             // Should trim all whitespace.
             (
-                Config(["cls": (0, "[CLS]") as (UInt, String),
-                        "sep": (0, "[SEP]") as (UInt, String),
+                Config(["cls": ["[CLS]", 0 as UInt],
+                        "sep": ["[SEP]", 0 as UInt],
                         "trimOffset": true,
                         "addPrefixSpace": false]),
                 [" The ", " sun", "sets ", " in ", " the ", "west"],
@@ -47,8 +47,8 @@ class PostProcessorTests: XCTestCase {
             ),
             // Should add tokens.
             (
-                Config(["cls": (0, "[CLS]") as (UInt, String),
-                        "sep": (0, "[SEP]") as (UInt, String),
+                Config(["cls": ["[CLS]", 0 as UInt],
+                        "sep": ["[SEP]", 0 as UInt],
                         "trimOffset": true,
                         "addPrefixSpace": true]),
                 [" The ", " sun", "sets ", " in ", " the ", "west"],
@@ -58,8 +58,8 @@ class PostProcessorTests: XCTestCase {
                  "mat", "[SEP]"]
             ),
             (
-                Config(["cls": (0, "[CLS]") as (UInt, String),
-                        "sep": (0, "[SEP]") as (UInt, String),
+                Config(["cls": ["[CLS]", 0 as UInt],
+                        "sep": ["[SEP]", 0 as UInt],
                         "trimOffset": true,
                         "addPrefixSpace": true]),
                 [" 你 ", " 好 ", ","],
diff --git a/Tests/TokenizersTests/TokenizerTests.swift b/Tests/TokenizersTests/TokenizerTests.swift
index 1911feb..3d9c2ea 100644
--- a/Tests/TokenizersTests/TokenizerTests.swift
+++ b/Tests/TokenizersTests/TokenizerTests.swift
@@ -212,6 +212,37 @@ class BertSpacesTests: XCTestCase {
     }
 }
 
+class RobertaTests: XCTestCase {
+    func testEncodeDecode() async throws {
+        guard let tokenizer = try await AutoTokenizer.from(pretrained: "FacebookAI/roberta-base") as? PreTrainedTokenizer else {
+            XCTFail()
+            return
+        }
+
+        XCTAssertEqual(tokenizer.tokenize(text: "l'eure"), ["l", "'", "e", "ure"])
+        XCTAssertEqual(tokenizer.encode(text: "l'eure"), [0, 462, 108, 242, 2407, 2])
+        XCTAssertEqual(tokenizer.decode(tokens: tokenizer.encode(text: "l'eure"), skipSpecialTokens: true), "l'eure")
+
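+        // Byte-level BPE splits unseen non-ASCII characters into byte symbols: "ą" (UTF-8 0xC4 0x85) surfaces as "Ä" + "ħ".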
+ XCTAssertEqual(tokenizer.tokenize(text: "mąka"), ["m", "Ä", "ħ", "ka"])
+ XCTAssertEqual(tokenizer.encode(text: "mąka"), [0, 119, 649, 5782, 2348, 2])
+
+ XCTAssertEqual(tokenizer.tokenize(text: "département"), ["d", "é", "part", "ement"])
+ XCTAssertEqual(tokenizer.encode(text: "département"), [0, 417, 1140, 7755, 6285, 2])
+
+ XCTAssertEqual(tokenizer.tokenize(text: "Who are you?"), ["Who", "Ġare", "Ġyou", "?"])
+ XCTAssertEqual(tokenizer.encode(text: "Who are you?"), [0, 12375, 32, 47, 116, 2])
+
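+        // "Ġ" (U+0120) is the byte-level stand-in for a space; the trailing space becomes a bare "Ġ" token (id 1437 here).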
+ XCTAssertEqual(tokenizer.tokenize(text: " Who are you? "), ["ĠWho", "Ġare", "Ġyou", "?", "Ġ"])
+ XCTAssertEqual(tokenizer.encode(text: " Who are you? "), [0, 3394, 32, 47, 116, 1437, 2])
+
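+        // Special tokens written literally in the input are tokenized as themselves, and the post-processor
+        // still adds BOS/EOS around the sequence, hence the doubled 0 and 2 ids.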
+ XCTAssertEqual(tokenizer.tokenize(text: "Who are you?"), ["", "Who", "Ġare", "Ġyou", "?", ""])
+ XCTAssertEqual(tokenizer.encode(text: "Who are you?"), [0, 0, 12375, 32, 47, 116, 2, 2])
+ }
+}
+
 struct EncodedTokenizerSamplesDataset: Decodable {
     let text: String
     // Bad naming, not just for bpe.
@@ -239,16 +267,16 @@ struct EncodedData: Decodable {
 class TokenizerTester {
     let encodedSamplesFilename: String
     let unknownTokenId: Int?
-    
+
     private var configuration: LanguageModelConfigurationFromHub?
     private var edgeCases: [EdgeCase]?
     private var _tokenizer: Tokenizer?
-    
+
     init(hubModelName: String, encodedSamplesFilename: String, unknownTokenId: Int?, hubApi: HubApi) {
         configuration = LanguageModelConfigurationFromHub(modelName: hubModelName, hubApi: hubApi)
         self.encodedSamplesFilename = encodedSamplesFilename
         self.unknownTokenId = unknownTokenId
-        
+
         // Read the edge cases dataset
         edgeCases = {
             let url = Bundle.module.url(forResource: "tokenizer_tests", withExtension: "json")!
@@ -259,7 +287,7 @@ class TokenizerTester {
             return cases[hubModelName]
         }()
     }
-    
+
     lazy var dataset: EncodedTokenizerSamplesDataset = {
         let url = Bundle.module.url(forResource: encodedSamplesFilename, withExtension: "json")!
         let json = try! Data(contentsOf: url)
@@ -267,7 +295,7 @@ class TokenizerTester {
         let dataset = try! decoder.decode(EncodedTokenizerSamplesDataset.self, from: json)
         return dataset
     }()
-    
+
     var tokenizer: Tokenizer? {
         get async {
             guard _tokenizer == nil else { return _tokenizer! }
@@ -283,7 +311,7 @@ class TokenizerTester {
             return _tokenizer
         }
     }
-    
+
     var tokenizerModel: TokenizingModel? {
         get async {
             // The model is not usually accessible; maybe it should
@@ -291,7 +319,7 @@ class TokenizerTester {
             return (tokenizer as! PreTrainedTokenizer).model
         }
     }
-    
+
     func testTokenize() async {
         let tokenized = await tokenizer?.tokenize(text: dataset.text)
         XCTAssertEqual(
@@ -299,7 +327,7 @@ class TokenizerTester {
             dataset.bpe_tokens
         )
     }
-    
+
     func testEncode() async {
         let encoded = await tokenizer?.encode(text: dataset.text)
         XCTAssertEqual(
@@ -307,7 +335,7 @@ class TokenizerTester {
             dataset.token_ids
         )
     }
-    
+
     func testDecode() async {
         let decoded = await tokenizer?.decode(tokens: dataset.token_ids)
         XCTAssertEqual(
@@ -315,7 +343,7 @@ class TokenizerTester {
             dataset.decoded_text
         )
     }
-    
+
     /// Test encode and decode for a few edge cases
     func testEdgeCases() async {
         guard let edgeCases else {
@@ -339,7 +367,7 @@ class TokenizerTester {
             )
         }
     }
-    
+
     func testUnknownToken() async {
         guard let model = await tokenizerModel else { return }
         XCTAssertEqual(model.unknownTokenId, unknownTokenId)
@@ -361,10 +389,10 @@
 class TokenizerTests: XCTestCase {
     /// Parallel testing in Xcode (when enabled) uses different processes, so this shouldn't be a problem
     static var _tester: TokenizerTester? = nil
-    
+
     class var hubModelName: String? { nil }
     class var encodedSamplesFilename: String? { nil }
-    
+
     /// Known id retrieved from Python, to verify it was parsed correctly
     class var unknownTokenId: Int? { nil }
 
@@ -399,25 +427,25 @@
             await tester.testTokenize()
         }
     }
-    
+
     func testEncode() async {
         if let tester = Self._tester {
             await tester.testEncode()
         }
     }
-    
+
     func testDecode() async {
         if let tester = Self._tester {
             await tester.testDecode()
         }
     }
-    
+
     func testEdgeCases() async {
         if let tester = Self._tester {
             await tester.testEdgeCases()
         }
     }
-    
+
     func testUnknownToken() async {
         if let tester = Self._tester {
             await tester.testUnknownToken()