Skip to content

Commit b61ab1a

Browse files
CISC authored and qnixsynapse committed
tests : add test-tokenizers-repo (ggml-org#14017)
1 parent 11d112f commit b61ab1a

File tree

2 files changed

+72
-2
lines changed

2 files changed

+72
-2
lines changed

tests/CMakeLists.txt

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,34 @@ function(llama_test target)
4242
set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
4343
endfunction()
4444

45+
# Registers an already existing command (for example a script) as a CTest test.
# Nothing is built by this function.
# Required args:
# - target: the command to run; it is passed as-is to add_test(COMMAND ...)
# Optional args:
# - NAME: name of the test (defaults to the value of `target`)
# - LABEL: label for the test (defaults to main)
# - WORKING_DIRECTORY: directory to run the test in (defaults to .)
# - ARGS: arguments to pass to the command
function(llama_test_cmd target)
    include(CMakeParseArguments)
    set(options)
    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
    set(multiValueArgs ARGS)
    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

    # A misspelled keyword (e.g. WORKING_DIR) would otherwise be dropped without
    # any diagnostic and the test would quietly run with the defaults.
    if (DEFINED LLAMA_TEST_UNPARSED_ARGUMENTS)
        message(WARNING "llama_test_cmd(${target}): ignoring unknown arguments: ${LLAMA_TEST_UNPARSED_ARGUMENTS}")
    endif()

    if (NOT DEFINED LLAMA_TEST_LABEL)
        set(LLAMA_TEST_LABEL "main")
    endif()
    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
        set(LLAMA_TEST_WORKING_DIRECTORY .)
    endif()
    if (DEFINED LLAMA_TEST_NAME)
        set(TEST_NAME "${LLAMA_TEST_NAME}")
    else()
        set(TEST_NAME "${target}")
    endif()

    # LLAMA_TEST_ARGS is intentionally left unquoted so that every list element
    # becomes a separate command-line argument.
    add_test(
        NAME "${TEST_NAME}"
        WORKING_DIRECTORY "${LLAMA_TEST_WORKING_DIRECTORY}"
        COMMAND "${target}"
        ${LLAMA_TEST_ARGS})

    set_property(TEST "${TEST_NAME}" PROPERTY LABELS "${LLAMA_TEST_LABEL}")
endfunction()
72+
4573
# Builds and runs a test source file.
4674
# Optional args:
4775
# - NAME: name of the executable & test target (defaults to the source file name without extension)
@@ -97,8 +125,14 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE
97125
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
98126
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
99127

100-
# TODO: missing HF tokenizer for this model in convert_hf_to_gguf_update.py, see https://github.com/ggml-org/llama.cpp/pull/13847
101-
# llama_test(test-tokenizer-0 NAME test-tokenizer-0-nomic-bert-moe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-nomic-bert-moe.gguf)
128+
# Runs the tokenizer test against every vocab of the ggml-org/vocabs repository.
# The driver is a bash script, so the test is not registered on Windows.
if (NOT WIN32)
    llama_test_cmd(
        "${CMAKE_CURRENT_SOURCE_DIR}/test-tokenizers-repo.sh"
        NAME test-tokenizers-ggml-vocabs
        WORKING_DIRECTORY "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}"
        ARGS
            https://huggingface.co/ggml-org/vocabs
            "${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocabs"
            # Pass the test executable explicitly (optional 3rd script argument)
            # instead of relying on the script's "./test-tokenizer-0" default,
            # which is only correct when the binary sits directly in the working
            # directory (not the case for multi-config generators).
            # NOTE(review): assumes test-tokenizer-0 is an executable target
            # defined earlier in this file - confirm.
            $<TARGET_FILE:test-tokenizer-0>
    )
endif()
102136

103137
if (LLAMA_LLGUIDANCE)
104138
llama_build_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)

tests/test-tokenizers-repo.sh

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#!/bin/bash
#
# Clones (or updates) a git repository of tokenizer vocabs and runs the
# tokenizer test executable on every *.gguf in it that has matching
# <file>.gguf.inp and <file>.gguf.out files next to it.
#
# Usage: test-tokenizers-repo.sh <git-repo> <target-folder> [<test-exe>]
#   <git-repo>       repository to clone
#   <target-folder>  local checkout (cloned if missing, pulled if it exists)
#   <test-exe>       test executable to run (default: ./test-tokenizer-0)
#
# Exit status: 0 if at least one vocab was tested and all of them passed,
#              1 otherwise.

if [ $# -lt 2 ]; then
    printf "Usage: %s <git-repo> <target-folder> [<test-exe>]\n" "$0"
    exit 1
fi

if [ $# -eq 3 ]; then
    toktest=$3
else
    toktest="./test-tokenizer-0"
fi

if [ ! -x "$toktest" ]; then
    printf "Test executable \"%s\" not found!\n" "$toktest"
    exit 1
fi

repo=$1
folder=$2

if [ -d "$folder/.git" ]; then
    # "&&" keeps git from pulling in the current directory if cd fails.
    # A failed pull (e.g. no network) is not fatal: the existing checkout is
    # still tested.
    (cd "$folder" && git pull) || printf "Warning: could not update \"%s\", using the existing checkout\n" "$folder"
else
    # Without a checkout there is nothing to test, so a failed clone fails the run.
    if ! git clone "$repo" "$folder"; then
        printf "Failed to clone \"%s\" into \"%s\"\n" "$repo" "$folder"
        exit 1
    fi
fi

tested=0
failed=0

# find is used instead of "shopt -s globstar" so that the script also works
# with bash 3.x (globstar needs bash >= 4); .git is pruned from the search.
# The file list is read on fd 3 so the test executable keeps the script's stdin.
while IFS= read -r -d '' gguf <&3; do
    if [ -f "$gguf.inp" ] && [ -f "$gguf.out" ]; then
        tested=$((tested + 1))
        # Remember every failure; the exit status of the loop alone would only
        # reflect the last vocab that was processed.
        if ! "$toktest" "$gguf"; then
            printf "Tokenizer test failed for \"%s\"\n" "$gguf"
            failed=$((failed + 1))
        fi
    else
        printf "Found \"%s\" without matching inp/out files, ignoring...\n" "$gguf"
    fi
done 3< <(find "$folder" -name .git -prune -o -name '*.gguf' -print0)

if [ "$tested" -eq 0 ]; then
    printf "No vocabs with inp/out files found in \"%s\"\n" "$folder"
    exit 1
fi

if [ "$failed" -ne 0 ]; then
    printf "%d of %d tokenizer tests failed\n" "$failed" "$tested"
    exit 1
fi

exit 0
36+

0 commit comments

Comments
 (0)