Skip to content

Commit 0f23067

Browse files
authored
Llama 3.2 tokenizer tests (#130)
1 parent e491a33 commit 0f23067

File tree

3 files changed

+7
-1
lines changed

3 files changed

+7
-1
lines changed
Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
{"text": "Fatouville-Grestain est une commune du Nord-Ouest du d\u00e9partement de l'Eure situ\u00e9e au \nbord de l'estuaire de la Seine et \u00e0 proximit\u00e9 du d\u00e9partement du Calvados. Selon l'atlas des paysages \nde Haute-Normandie, elle appartient \u00e0 la r\u00e9gion naturelle du Lieuvin. Toutefois, l'Agreste, le service \nde la statistique et de la prospective du minist\u00e8re de l'Agriculture, de l'Agroalimentaire et de la For\u00eat, \nla classe au sein du pays d'Auge (en tant que r\u00e9gion agricole).La commune est \u00e0 moins de dix kilom\u00e8tres \u00e0 \nl'est de Honfleur, \u00e0 autant de Beuzeville et \u00e0 environ dix-sept kilom\u00e8tres de Pont-Audemer.", "bpe_tokens": ["Fat", "ou", "ville", "-G", "rest", "ain", "\u0120est", "\u0120une", "\u0120commune", "\u0120du", "\u0120Nord", "-O", "uest", "\u0120du", "\u0120d\u00c3\u00a9part", "ement", "\u0120de", "\u0120l", "'E", "ure", "\u0120situ", "\u00c3\u00a9e", "\u0120au", "\u0120\u010a", "b", "ord", "\u0120de", "\u0120l", "'est", "ua", "ire", "\u0120de", "\u0120la", "\u0120Se", "ine", "\u0120et", "\u0120\u00c3\u0142", "\u0120prox", "imit\u00c3\u00a9", "\u0120du", "\u0120d\u00c3\u00a9part", "ement", "\u0120du", "\u0120Cal", "v", "ados", ".", "\u0120Sel", "on", "\u0120l", "'", "at", "las", "\u0120des", "\u0120pays", "ages", "\u0120\u010a", "de", "\u0120Ha", "ute", "-N", "orm", "and", "ie", ",", "\u0120elle", "\u0120appart", "ient", "\u0120\u00c3\u0142", "\u0120la", "\u0120r\u00c3\u00a9gion", "\u0120nature", "lle", "\u0120du", "\u0120Lie", "u", "vin", ".", "\u0120Tout", "ef", "ois", ",", "\u0120l", "'", "Ag", "reste", ",", "\u0120le", "\u0120service", "\u0120\u010a", "de", "\u0120la", "\u0120statist", "ique", "\u0120et", "\u0120de", "\u0120la", "\u0120prospective", "\u0120du", "\u0120minist", "\u00c3\u00a8re", "\u0120de", "\u0120l", "'A", "gricult", "ure", ",", "\u0120de", "\u0120l", "'", "Ag", "ro", "al", "iment", "aire", "\u0120et", "\u0120de", "\u0120la", "\u0120For", "\u00c3\u00aat", 
",", "\u0120\u010a", "la", "\u0120classe", "\u0120au", "\u0120sein", "\u0120du", "\u0120pays", "\u0120d", "'A", "uge", "\u0120(", "en", "\u0120tant", "\u0120que", "\u0120r\u00c3\u00a9gion", "\u0120agr", "ic", "ole", ").", "La", "\u0120commune", "\u0120est", "\u0120\u00c3\u0142", "\u0120moins", "\u0120de", "\u0120d", "ix", "\u0120kil", "om", "\u00c3\u00a8tres", "\u0120\u00c3\u0142", "\u0120\u010a", "l", "'est", "\u0120de", "\u0120Hon", "f", "leur", ",", "\u0120\u00c3\u0142", "\u0120aut", "ant", "\u0120de", "\u0120Be", "uze", "ville", "\u0120et", "\u0120\u00c3\u0142", "\u0120environ", "\u0120d", "ix", "-se", "pt", "\u0120kil", "om", "\u00c3\u00a8tres", "\u0120de", "\u0120Pont", "-A", "ud", "emer", "."], "token_ids": [128000, 69557, 283, 8078, 12279, 4014, 467, 1826, 6316, 79245, 3930, 30281, 24540, 3121, 3930, 76235, 1133, 409, 326, 89048, 554, 10109, 8047, 8065, 720, 65, 541, 409, 326, 17771, 4381, 556, 409, 1208, 1369, 483, 1880, 3869, 22267, 99481, 3930, 76235, 1133, 3930, 3400, 85, 5670, 13, 24082, 263, 326, 6, 266, 14833, 951, 21935, 1154, 720, 451, 14433, 1088, 11500, 494, 438, 648, 11, 27549, 74756, 1188, 3869, 1208, 88100, 7138, 66601, 3930, 22213, 84, 9799, 13, 80905, 830, 30148, 11, 326, 6, 9219, 100034, 11, 514, 2532, 720, 451, 1208, 20719, 2428, 1880, 409, 1208, 33547, 3930, 49904, 12339, 409, 326, 52374, 13130, 554, 11, 409, 326, 6, 9219, 299, 278, 3904, 12267, 1880, 409, 1208, 1789, 49530, 11, 720, 4355, 37417, 8065, 19910, 3930, 21935, 294, 52374, 4838, 320, 268, 37622, 1744, 88100, 40574, 292, 1286, 570, 8921, 79245, 1826, 3869, 40970, 409, 294, 953, 15395, 316, 75104, 3869, 720, 75, 17771, 409, 16958, 69, 36077, 11, 3869, 3154, 519, 409, 2893, 91311, 8078, 1880, 3869, 50026, 294, 953, 7962, 418, 15395, 316, 75104, 409, 40870, 6830, 664, 41996, 13], "decoded_text": "<|begin_of_text|>Fatouville-Grestain est une commune du Nord-Ouest du d\u00e9partement de l'Eure situ\u00e9e au \nbord de l'estuaire de la Seine et \u00e0 proximit\u00e9 du 
d\u00e9partement du Calvados. Selon l'atlas des paysages \nde Haute-Normandie, elle appartient \u00e0 la r\u00e9gion naturelle du Lieuvin. Toutefois, l'Agreste, le service \nde la statistique et de la prospective du minist\u00e8re de l'Agriculture, de l'Agroalimentaire et de la For\u00eat, \nla classe au sein du pays d'Auge (en tant que r\u00e9gion agricole).La commune est \u00e0 moins de dix kilom\u00e8tres \u00e0 \nl'est de Honfleur, \u00e0 autant de Beuzeville et \u00e0 environ dix-sept kilom\u00e8tres de Pont-Audemer."}

Tests/TokenizersTests/Resources/tokenizer_tests.json

100644 → 100755
Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

Tests/TokenizersTests/TokenizerTests.swift

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,11 @@ class LlamaTokenizerTests: TokenizerTests {
3737
}
3838
}
3939

40+
/// Tokenizer test case for Llama 3.2.
///
/// A `TokenizerTests` subclass that adds no test methods of its own; it only
/// overrides the two class-level configuration properties below.
// NOTE(review): presumably the inherited tests read these overrides to pick the
// tokenizer and the expected-output fixture — confirm against `TokenizerTests`.
class Llama32TokenizerTests: TokenizerTests {
41+
// Model name for this test case. The value looks like a Hub repository id
// ("<organization>/<model>") — TODO confirm how the base class resolves it.
override class var hubModelName: String? { "mlx-community/Llama-3.2-3B-Instruct-4bit" }
42+
// Base name (no extension) of the encoded-samples fixture for this model.
// NOTE(review): presumably matches a bundled JSON resource with `text`,
// `bpe_tokens`, `token_ids` and `decoded_text` fields — verify in Resources.
override class var encodedSamplesFilename: String? { "llama_3.2_encoded" }
43+
}
44+
4045
class WhisperLargeTokenizerTests: TokenizerTests {
4146
override class var hubModelName: String? { "openai/whisper-large-v2" }
4247
override class var encodedSamplesFilename: String? { "whisper_large_v2_encoded" }

0 commit comments

Comments
 (0)