|
7 | 7 | //
|
8 | 8 |
|
9 | 9 | @testable import Tokenizers
|
10 |
| -import XCTest |
| 10 | +@testable import Hub |
| 11 | + |
11 | 12 |
|
12 | 13 | class BertTokenizerTests: XCTestCase {
|
13 | 14 | override func setUp() {
|
@@ -175,4 +176,35 @@ class BertTokenizerTests: XCTestCase {
|
175 | 176 | XCTAssertEqual(decoded, String(expected))
|
176 | 177 | }
|
177 | 178 | }
|
| 179 | + |
/// Verifies that `BertTokenizer` resolves caller-supplied added tokens in both
/// directions (token -> id and id -> token), and that the special tokens are
/// still mapped to the ids asserted at the end of the test.
///
/// The tokenizer configuration and data are obtained through
/// `LanguageModelConfigurationFromHub` for "google-bert/bert-base-uncased", using a
/// `HubApi` whose download base is a "huggingface-tests" folder inside the
/// user's caches directory.
func testBertTokenizerAddedTokensRecognized() async throws {
    // Unwrap through XCTest so a missing caches directory fails this test only,
    // instead of trapping and taking the whole test process down.
    let cachesDirectory = try XCTUnwrap(
        FileManager.default.urls(for: .cachesDirectory, in: .userDomainMask).first,
        "caches directory is unavailable"
    )
    let base: URL = cachesDirectory.appending(component: "huggingface-tests")
    let hubApi = HubApi(downloadBase: base)
    let configuration = LanguageModelConfigurationFromHub(modelName: "google-bert/bert-base-uncased", hubApi: hubApi)

    // XCTUnwrap takes a non-async autoclosure, so await the value first and unwrap afterwards.
    let loadedTokenizerConfig = try await configuration.tokenizerConfig
    let tokenizerConfig = try XCTUnwrap(loadedTokenizerConfig, "missing tokenizer config")
    let tokenizerData = try await configuration.tokenizerData

    let addedTokens = [
        "[ROAD]": 60_001,
        "[RIVER]": 60_002,
        "[BUILDING]": 60_003,
        "[PARK]": 60_004,
        "[BUFFER]": 60_005,
        "[INTERSECT]": 60_006,
        "[UNION]": 60_007,
    ]
    let tokenizer = try BertTokenizer(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData, addedTokens: addedTokens)

    // Each added token must round-trip: token -> id and id -> token.
    for (token, idx) in addedTokens {
        XCTAssertEqual(tokenizer.convertTokenToId(token), idx, "unexpected id for added token \(token)")
        XCTAssertEqual(tokenizer.convertIdToToken(idx), token, "unexpected token for added id \(idx)")
    }

    // Reading added_tokens from tokenizer.json
    XCTAssertEqual(tokenizer.convertTokenToId("[PAD]"), 0)
    XCTAssertEqual(tokenizer.convertTokenToId("[UNK]"), 100)
    XCTAssertEqual(tokenizer.convertTokenToId("[CLS]"), 101)
    XCTAssertEqual(tokenizer.convertTokenToId("[SEP]"), 102)
    XCTAssertEqual(tokenizer.convertTokenToId("[MASK]"), 103)
}
178 | 210 | }
|
0 commit comments