Skip to content

Commit 3943a4c

Browse files
author
lexasub
committed
tool: fix conversion of text/parquet to custom format
1 parent bff2db8 commit 3943a4c

File tree

4 files changed

+54
-28
lines changed

4 files changed

+54
-28
lines changed

tools/dataset-converter/CMakeLists.txt

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ if(LLAMA_PARQUET)
55
find_package(Parquet REQUIRED)
66
endif()
77

8-
add_library(dataset-to-gguf-lib
8+
add_library(dataset-to-gguf-lib STATIC
99
dataset-to-gguf/llama-gguf-writer.cpp
1010
dataset-to-gguf/llama-gguf-file.cpp
1111
dataset-to-gguf/llama-dataset-reader/llama-text-data-reader.cpp
@@ -14,28 +14,22 @@ add_library(dataset-to-gguf-lib
1414
dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.cpp
1515
)
1616

17-
# Link libraries for dataset-to-gguf-lib
17+
target_compile_features(dataset-to-gguf-lib PRIVATE cxx_std_17)
18+
1819
target_link_libraries(dataset-to-gguf-lib common llama ${CMAKE_THREAD_LIBS_INIT})
1920
if(LLAMA_PARQUET)
2021
target_link_libraries(dataset-to-gguf-lib Arrow::arrow_shared Parquet::parquet_shared)
2122
endif()
22-
target_compile_features(dataset-to-gguf-lib PRIVATE cxx_std_11)
23-
2423

2524
add_executable(convert-to-train-gguf convert-to-train-gguf.cpp)
26-
target_link_libraries(convert-to-train-gguf PRIVATE dataset-to-gguf-lib) # Link to the new library
27-
target_compile_features(convert-to-train-gguf PRIVATE cxx_std_11) # Apply C++ standard to the executable
25+
add_dependencies(convert-to-train-gguf dataset-to-gguf-lib)
26+
target_link_libraries(convert-to-train-gguf PRIVATE dataset-to-gguf-lib)
2827

29-
# Define the executable for the unit tests
3028
set(TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS dataset-to-gguf-unit-tests)
3129
add_executable(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} dataset-to-gguf/tests/dataset-to-gguf-tests.cpp)
32-
33-
# Link necessary libraries for the test executable
30+
add_dependencies(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} dataset-to-gguf-lib)
3431
target_link_libraries(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} PRIVATE common llama dataset-to-gguf-lib)
3532

36-
# Ensure C++17 for filesystem usage for the test executable
37-
target_compile_features(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} PRIVATE cxx_std_17)
38-
3933
add_test(
4034
NAME ${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} #
4135
COMMAND $<TARGET_FILE:${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS}>

tools/dataset-converter/convert-to-train-gguf.cpp

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,8 @@ int main(int argc, char ** argv) {
7979
tokenizer_model_reader.llama_gguf_reader_get_metadata_str("general.name", "N/A").c_str());
8080
printf(" Tokenizer Model Architecture: %s\n",
8181
tokenizer_model_reader.llama_gguf_reader_get_metadata_str("general.architecture", "N/A").c_str());
82-
printf(" Tokenizer Model Tensor Count: %ld\n",
83-
tokenizer_model_reader.llama_gguf_reader_get_tensor_count());
82+
printf(" Tokenizer Model Tensor Count: %llu\n",
83+
static_cast<long long>(tokenizer_model_reader.llama_gguf_reader_get_tensor_count()));
8484
printf(" Diagnostic Test: Tokenizer Model GGUF read successful.\n");
8585
} else {
8686
fprintf(stderr, "error: Diagnostic Test: Tokenizer Model GGUF read failed to initialize.\n");
@@ -100,12 +100,10 @@ int main(int argc, char ** argv) {
100100
llama_gguf_converter converter;
101101
bool success = converter.llama_gguf_converter_convert(params, model);
102102

103-
// Clean up llama model
104-
llama_model_free(model);
105-
llama_backend_free();
106-
107103
if (!success) {
108104
fprintf(stderr, "error: GGUF conversion failed.\n");
105+
llama_model_free(model); // Free model on conversion failure
106+
llama_backend_free();
109107
return 1;
110108
}
111109

@@ -120,12 +118,14 @@ int main(int argc, char ** argv) {
120118

121119
if (!reader.llama_gguf_reader_is_initialized()) {
122120
fprintf(stderr, "error: llama_gguf_reader failed to initialize for preview.\n");
121+
llama_model_free(model); // Free model before exiting
122+
llama_backend_free();
123123
return 1;
124124
}
125125

126126
printf(" Dataset Name: %s\n",
127127
reader.llama_gguf_reader_get_metadata_str("training.dataset.name", "N/A").c_str());
128-
printf(" Sequence Count: %lu\n", reader.llama_gguf_reader_get_metadata_u64("training.sequence.count", 0));
128+
printf(" Sequence Count: %llu\n", static_cast<long long>(reader.llama_gguf_reader_get_metadata_u64("training.sequence.count", 0)));
129129
printf(" Tokenizer Model: %s\n",
130130
reader.llama_gguf_reader_get_metadata_str("training.tokenizer.gguf.model", "N/A").c_str());
131131

@@ -153,14 +153,19 @@ int main(int argc, char ** argv) {
153153
std::string detokenized_text = "";
154154
// Buffer for a single token
155155
std::array<char, 256> piece_buf; // Large enough buffer for a single token
156-
for (llama_token token : sequence_tokens) {
157-
int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
158-
piece_buf.data(), piece_buf.size(), 1, false);
159-
if (n_chars > 0) {
160-
detokenized_text.append(piece_buf.data(), n_chars);
156+
// Ensure model is valid before calling llama_model_get_vocab
157+
if (model != nullptr) {
158+
for (llama_token token : sequence_tokens) {
159+
int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
160+
piece_buf.data(), piece_buf.size(), 1, false);
161+
if (n_chars > 0) {
162+
detokenized_text.append(piece_buf.data(), n_chars);
163+
}
161164
}
165+
printf(" Detokenized: \"%s\"\n", detokenized_text.c_str());
166+
} else {
167+
fprintf(stderr, " Warning: Cannot detokenize preview, model is null.\n");
162168
}
163-
printf(" Detokenized: \"%s\"\n", detokenized_text.c_str());
164169
}
165170

166171
} else {
@@ -173,10 +178,16 @@ int main(int argc, char ** argv) {
173178

174179
} catch (const std::runtime_error & e) {
175180
fprintf(stderr, "error: GGUF preview failed: %s\n", e.what());
181+
llama_model_free(model); // Free model before exiting
182+
llama_backend_free();
176183
return 1;
177184
}
178185
printf("--- End of GGUF file preview ---\n");
179186
}
180187

188+
// Clean up llama model and backend after all usage
189+
llama_model_free(model);
190+
llama_backend_free();
191+
181192
return 0;
182193
}

tools/dataset-converter/dataset-to-gguf/llama-gguf-file.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,6 @@ struct llama_gguf_file {
108108

109109
private:
110110
struct gguf_context * m_ctx; // The underlying GGUF context
111-
struct ggml_context * m_ggml_ctx; // ggml_context for tensor data when reading
112111

113112
// Private helper function to find a key by name.
114113
// key: The key name to find.

tools/dataset-converter/dataset-to-gguf/tests/dataset-to-gguf-tests.cpp

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,27 @@ static std::string g_test_model_path = "../../gte-small.Q2_K.gguf"; // Specif
2828
return false; \
2929
} \
3030
} while (0)
31+
bool SetUpLlamaBackend();
32+
bool Testllama_gguf_file_DefaultConstructorInitializesContext();
33+
bool Testllama_gguf_file_ConstructorFromFileThrowsOnError();
34+
bool Testllama_gguf_file_SetAndGetMetadataString();
35+
bool Testllama_gguf_file_SetAndGetMetadataU64();
36+
bool Testllama_gguf_file_SetAndGetMetadataStringArray();
37+
bool CreateTestllama_gguf_file(const std::string & path, llama_model * model_ptr);
38+
bool Testllama_gguf_reader_ConstructorInitializesFromFile();
39+
bool Testllama_gguf_reader_GetMetadata();
40+
bool Testllama_gguf_reader_GetTensorCount();
41+
bool Testllama_gguf_reader_GetTensorNameAndTypeAndSize();
42+
bool Testllama_gguf_reader_ReadTensorData();
43+
bool Testllama_gguf_reader_ReadTensorDataInvalidIndex();
44+
bool TestTextDataReader_OpenFile();
45+
bool TestTextDataReader_ReadNextSequenceTextMode();
46+
bool TestTextDataReader_ReadNextSequencePreTokenizedMode();
47+
bool TestTextDataReader_ResetFunctionality();
48+
bool TestTextDataReader_GetTotalSequences();
49+
bool Testllama_gguf_converter_ConvertTextFileSuccess();
50+
void TearDownLlamaBackend();
51+
3152

3253
// Global setup for llama.cpp backend
3354
bool SetUpLlamaBackend() {
@@ -419,9 +440,10 @@ bool Testllama_gguf_converter_ConvertTextFileSuccess() {
419440
params.max_seq_len = 128;
420441
params.pre_tokenized = false;
421442
params.dataset_format = "text";
443+
#ifdef LLAMA_PARQUET
422444
params.parquet_text_column = "text"; // Not used for text, but for completeness
423445
params.parquet_tokens_column = "tokens"; // Not used for text, but for completeness
424-
446+
#endif
425447
llama_gguf_converter converter;
426448
TEST_ASSERT(converter.llama_gguf_converter_convert(params, g_llama_model), "GGUF conversion failed");
427449

@@ -448,7 +470,7 @@ bool Testllama_gguf_converter_ConvertTextFileSuccess() {
448470
// Main function to run all tests
449471
// =============================================================================
450472

451-
int main(int argc, char ** argv) {
473+
int main() {
452474
printf("Running dataset-to-gguf tests...\n\n");
453475

454476
// Global setup for llama.cpp backend

0 commit comments

Comments
 (0)