Skip to content

Commit 92bbf15

Browse files
author
lexasub
committed
tool: fix conversion of text/parquet to custom format
1 parent bff2db8 commit 92bbf15

File tree

3 files changed

+48
-16
lines changed

3 files changed

+48
-16
lines changed

tools/dataset-converter/convert-to-train-gguf.cpp

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,8 @@ int main(int argc, char ** argv) {
7979
tokenizer_model_reader.llama_gguf_reader_get_metadata_str("general.name", "N/A").c_str());
8080
printf(" Tokenizer Model Architecture: %s\n",
8181
tokenizer_model_reader.llama_gguf_reader_get_metadata_str("general.architecture", "N/A").c_str());
82-
printf(" Tokenizer Model Tensor Count: %ld\n",
83-
tokenizer_model_reader.llama_gguf_reader_get_tensor_count());
82+
printf(" Tokenizer Model Tensor Count: %llu\n",
83+
static_cast<long long>(tokenizer_model_reader.llama_gguf_reader_get_tensor_count()));
8484
printf(" Diagnostic Test: Tokenizer Model GGUF read successful.\n");
8585
} else {
8686
fprintf(stderr, "error: Diagnostic Test: Tokenizer Model GGUF read failed to initialize.\n");
@@ -100,12 +100,10 @@ int main(int argc, char ** argv) {
100100
llama_gguf_converter converter;
101101
bool success = converter.llama_gguf_converter_convert(params, model);
102102

103-
// Clean up llama model
104-
llama_model_free(model);
105-
llama_backend_free();
106-
107103
if (!success) {
108104
fprintf(stderr, "error: GGUF conversion failed.\n");
105+
llama_model_free(model); // Free model on conversion failure
106+
llama_backend_free();
109107
return 1;
110108
}
111109

@@ -120,12 +118,14 @@ int main(int argc, char ** argv) {
120118

121119
if (!reader.llama_gguf_reader_is_initialized()) {
122120
fprintf(stderr, "error: llama_gguf_reader failed to initialize for preview.\n");
121+
llama_model_free(model); // Free model before exiting
122+
llama_backend_free();
123123
return 1;
124124
}
125125

126126
printf(" Dataset Name: %s\n",
127127
reader.llama_gguf_reader_get_metadata_str("training.dataset.name", "N/A").c_str());
128-
printf(" Sequence Count: %lu\n", reader.llama_gguf_reader_get_metadata_u64("training.sequence.count", 0));
128+
printf(" Sequence Count: %llu\n", static_cast<long long>(reader.llama_gguf_reader_get_metadata_u64("training.sequence.count", 0)));
129129
printf(" Tokenizer Model: %s\n",
130130
reader.llama_gguf_reader_get_metadata_str("training.tokenizer.gguf.model", "N/A").c_str());
131131

@@ -153,14 +153,19 @@ int main(int argc, char ** argv) {
153153
std::string detokenized_text = "";
154154
// Buffer for a single token
155155
std::array<char, 256> piece_buf; // Large enough buffer for a single token
156-
for (llama_token token : sequence_tokens) {
157-
int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
158-
piece_buf.data(), piece_buf.size(), 1, false);
159-
if (n_chars > 0) {
160-
detokenized_text.append(piece_buf.data(), n_chars);
156+
// Ensure model is valid before calling llama_model_get_vocab
157+
if (model != nullptr) {
158+
for (llama_token token : sequence_tokens) {
159+
int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
160+
piece_buf.data(), piece_buf.size(), 1, false);
161+
if (n_chars > 0) {
162+
detokenized_text.append(piece_buf.data(), n_chars);
163+
}
161164
}
165+
printf(" Detokenized: \"%s\"\n", detokenized_text.c_str());
166+
} else {
167+
fprintf(stderr, " Warning: Cannot detokenize preview, model is null.\n");
162168
}
163-
printf(" Detokenized: \"%s\"\n", detokenized_text.c_str());
164169
}
165170

166171
} else {
@@ -173,10 +178,16 @@ int main(int argc, char ** argv) {
173178

174179
} catch (const std::runtime_error & e) {
175180
fprintf(stderr, "error: GGUF preview failed: %s\n", e.what());
181+
llama_model_free(model); // Free model before exiting
182+
llama_backend_free();
176183
return 1;
177184
}
178185
printf("--- End of GGUF file preview ---\n");
179186
}
180187

188+
// Clean up llama model and backend after all usage
189+
llama_model_free(model);
190+
llama_backend_free();
191+
181192
return 0;
182193
}

tools/dataset-converter/dataset-to-gguf/llama-gguf-file.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,6 @@ struct llama_gguf_file {
108108

109109
private:
110110
struct gguf_context * m_ctx; // The underlying GGUF context
111-
struct ggml_context * m_ggml_ctx; // ggml_context for tensor data when reading
112111

113112
// Private helper function to find a key by name.
114113
// key: The key name to find.

tools/dataset-converter/dataset-to-gguf/tests/dataset-to-gguf-tests.cpp

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,27 @@ static std::string g_test_model_path = "../../gte-small.Q2_K.gguf"; // Specif
2828
return false; \
2929
} \
3030
} while (0)
31+
bool SetUpLlamaBackend();
32+
bool Testllama_gguf_file_DefaultConstructorInitializesContext();
33+
bool Testllama_gguf_file_ConstructorFromFileThrowsOnError();
34+
bool Testllama_gguf_file_SetAndGetMetadataString();
35+
bool Testllama_gguf_file_SetAndGetMetadataU64();
36+
bool Testllama_gguf_file_SetAndGetMetadataStringArray();
37+
bool CreateTestllama_gguf_file(const std::string & path, llama_model * model_ptr);
38+
bool Testllama_gguf_reader_ConstructorInitializesFromFile();
39+
bool Testllama_gguf_reader_GetMetadata();
40+
bool Testllama_gguf_reader_GetTensorCount();
41+
bool Testllama_gguf_reader_GetTensorNameAndTypeAndSize();
42+
bool Testllama_gguf_reader_ReadTensorData();
43+
bool Testllama_gguf_reader_ReadTensorDataInvalidIndex();
44+
bool TestTextDataReader_OpenFile();
45+
bool TestTextDataReader_ReadNextSequenceTextMode();
46+
bool TestTextDataReader_ReadNextSequencePreTokenizedMode();
47+
bool TestTextDataReader_ResetFunctionality();
48+
bool TestTextDataReader_GetTotalSequences();
49+
bool Testllama_gguf_converter_ConvertTextFileSuccess();
50+
void TearDownLlamaBackend();
51+
3152

3253
// Global setup for llama.cpp backend
3354
bool SetUpLlamaBackend() {
@@ -419,9 +440,10 @@ bool Testllama_gguf_converter_ConvertTextFileSuccess() {
419440
params.max_seq_len = 128;
420441
params.pre_tokenized = false;
421442
params.dataset_format = "text";
443+
#ifdef LLAMA_PARQUET
422444
params.parquet_text_column = "text"; // Not used for text, but for completeness
423445
params.parquet_tokens_column = "tokens"; // Not used for text, but for completeness
424-
446+
#endif
425447
llama_gguf_converter converter;
426448
TEST_ASSERT(converter.llama_gguf_converter_convert(params, g_llama_model), "GGUF conversion failed");
427449

@@ -448,7 +470,7 @@ bool Testllama_gguf_converter_ConvertTextFileSuccess() {
448470
// Main function to run all tests
449471
// =============================================================================
450472

451-
int main(int argc, char ** argv) {
473+
int main() {
452474
printf("Running dataset-to-gguf tests...\n\n");
453475

454476
// Global setup for llama.cpp backend

0 commit comments

Comments
 (0)