Skip to content

Commit 3943a4c

Browse files
author
lexasub
committed
tool: fix conversion of text/parquet to custom format
1 parent bff2db8 commit 3943a4c

File tree

4 files changed

+54
-28
lines changed

4 files changed

+54
-28
lines changed

tools/dataset-converter/CMakeLists.txt

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ if(LLAMA_PARQUET)
55
find_package(Parquet REQUIRED)
66
endif()
77

8-
add_library(dataset-to-gguf-lib
8+
add_library(dataset-to-gguf-lib STATIC
99
dataset-to-gguf/llama-gguf-writer.cpp
1010
dataset-to-gguf/llama-gguf-file.cpp
1111
dataset-to-gguf/llama-dataset-reader/llama-text-data-reader.cpp
@@ -14,28 +14,22 @@ add_library(dataset-to-gguf-lib
1414
dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.cpp
1515
)
1616

17-
# Link libraries for dataset-to-gguf-lib
17+
target_compile_features(dataset-to-gguf-lib PRIVATE cxx_std_17)
18+
1819
target_link_libraries(dataset-to-gguf-lib common llama ${CMAKE_THREAD_LIBS_INIT})
1920
if(LLAMA_PARQUET)
2021
target_link_libraries(dataset-to-gguf-lib Arrow::arrow_shared Parquet::parquet_shared)
2122
endif()
22-
target_compile_features(dataset-to-gguf-lib PRIVATE cxx_std_11)
23-
2423

2524
add_executable(convert-to-train-gguf convert-to-train-gguf.cpp)
26-
target_link_libraries(convert-to-train-gguf PRIVATE dataset-to-gguf-lib) # Link to the new library
27-
target_compile_features(convert-to-train-gguf PRIVATE cxx_std_11) # Apply C++ standard to the executable
25+
add_dependencies(convert-to-train-gguf dataset-to-gguf-lib)
26+
target_link_libraries(convert-to-train-gguf PRIVATE dataset-to-gguf-lib)
2827

29-
# Define the executable for the unit tests
3028
set(TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS dataset-to-gguf-unit-tests)
3129
add_executable(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} dataset-to-gguf/tests/dataset-to-gguf-tests.cpp)
32-
33-
# Link necessary libraries for the test executable
30+
add_dependencies(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} dataset-to-gguf-lib)
3431
target_link_libraries(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} PRIVATE common llama dataset-to-gguf-lib)
3532

36-
# Ensure C++17 for filesystem usage for the test executable
37-
target_compile_features(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} PRIVATE cxx_std_17)
38-
3933
add_test(
4034
NAME ${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} #
4135
COMMAND $<TARGET_FILE:${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS}>

tools/dataset-converter/convert-to-train-gguf.cpp

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,8 @@ int main(int argc, char ** argv) {
7979
tokenizer_model_reader.llama_gguf_reader_get_metadata_str("general.name", "N/A").c_str());
8080
printf(" Tokenizer Model Architecture: %s\n",
8181
tokenizer_model_reader.llama_gguf_reader_get_metadata_str("general.architecture", "N/A").c_str());
82-
printf(" Tokenizer Model Tensor Count: %ld\n",
83-
tokenizer_model_reader.llama_gguf_reader_get_tensor_count());
82+
printf(" Tokenizer Model Tensor Count: %llu\n",
83+
static_cast<long long>(tokenizer_model_reader.llama_gguf_reader_get_tensor_count()));
8484
printf(" Diagnostic Test: Tokenizer Model GGUF read successful.\n");
8585
} else {
8686
fprintf(stderr, "error: Diagnostic Test: Tokenizer Model GGUF read failed to initialize.\n");
@@ -100,12 +100,10 @@ int main(int argc, char ** argv) {
100100
llama_gguf_converter converter;
101101
bool success = converter.llama_gguf_converter_convert(params, model);
102102

103-
// Clean up llama model
104-
llama_model_free(model);
105-
llama_backend_free();
106-
107103
if (!success) {
108104
fprintf(stderr, "error: GGUF conversion failed.\n");
105+
llama_model_free(model); // Free model on conversion failure
106+
llama_backend_free();
109107
return 1;
110108
}
111109

@@ -120,12 +118,14 @@ int main(int argc, char ** argv) {
120118

121119
if (!reader.llama_gguf_reader_is_initialized()) {
122120
fprintf(stderr, "error: llama_gguf_reader failed to initialize for preview.\n");
121+
llama_model_free(model); // Free model before exiting
122+
llama_backend_free();
123123
return 1;
124124
}
125125

126126
printf(" Dataset Name: %s\n",
127127
reader.llama_gguf_reader_get_metadata_str("training.dataset.name", "N/A").c_str());
128-
printf(" Sequence Count: %lu\n", reader.llama_gguf_reader_get_metadata_u64("training.sequence.count", 0));
128+
printf(" Sequence Count: %llu\n", static_cast<long long>(reader.llama_gguf_reader_get_metadata_u64("training.sequence.count", 0)));
129129
printf(" Tokenizer Model: %s\n",
130130
reader.llama_gguf_reader_get_metadata_str("training.tokenizer.gguf.model", "N/A").c_str());
131131

@@ -153,14 +153,19 @@ int main(int argc, char ** argv) {
153153
std::string detokenized_text = "";
154154
// Buffer for a single token
155155
std::array<char, 256> piece_buf; // Large enough buffer for a single token
156-
for (llama_token token : sequence_tokens) {
157-
int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
158-
piece_buf.data(), piece_buf.size(), 1, false);
159-
if (n_chars > 0) {
160-
detokenized_text.append(piece_buf.data(), n_chars);
156+
// Ensure model is valid before calling llama_model_get_vocab
157+
if (model != nullptr) {
158+
for (llama_token token : sequence_tokens) {
159+
int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
160+
piece_buf.data(), piece_buf.size(), 1, false);
161+
if (n_chars > 0) {
162+
detokenized_text.append(piece_buf.data(), n_chars);
163+
}
161164
}
165+
printf(" Detokenized: \"%s\"\n", detokenized_text.c_str());
166+
} else {
167+
fprintf(stderr, " Warning: Cannot detokenize preview, model is null.\n");
162168
}
163-
printf(" Detokenized: \"%s\"\n", detokenized_text.c_str());
164169
}
165170

166171
} else {
@@ -173,10 +178,16 @@ int main(int argc, char ** argv) {
173178

174179
} catch (const std::runtime_error & e) {
175180
fprintf(stderr, "error: GGUF preview failed: %s\n", e.what());
181+
llama_model_free(model); // Free model before exiting
182+
llama_backend_free();
176183
return 1;
177184
}
178185
printf("--- End of GGUF file preview ---\n");
179186
}
180187

188+
// Clean up llama model and backend after all usage
189+
llama_model_free(model);
190+
llama_backend_free();
191+
181192
return 0;
182193
}

tools/dataset-converter/dataset-to-gguf/llama-gguf-file.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,6 @@ struct llama_gguf_file {
108108

109109
private:
110110
struct gguf_context * m_ctx; // The underlying GGUF context
111-
struct ggml_context * m_ggml_ctx; // ggml_context for tensor data when reading
112111

113112
// Private helper function to find a key by name.
114113
// key: The key name to find.

tools/dataset-converter/dataset-to-gguf/tests/dataset-to-gguf-tests.cpp

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,27 @@ static std::string g_test_model_path = "../../gte-small.Q2_K.gguf"; // Specif
2828
return false; \
2929
} \
3030
} while (0)
31+
bool SetUpLlamaBackend();
32+
bool Testllama_gguf_file_DefaultConstructorInitializesContext();
33+
bool Testllama_gguf_file_ConstructorFromFileThrowsOnError();
34+
bool Testllama_gguf_file_SetAndGetMetadataString();
35+
bool Testllama_gguf_file_SetAndGetMetadataU64();
36+
bool Testllama_gguf_file_SetAndGetMetadataStringArray();
37+
bool CreateTestllama_gguf_file(const std::string & path, llama_model * model_ptr);
38+
bool Testllama_gguf_reader_ConstructorInitializesFromFile();
39+
bool Testllama_gguf_reader_GetMetadata();
40+
bool Testllama_gguf_reader_GetTensorCount();
41+
bool Testllama_gguf_reader_GetTensorNameAndTypeAndSize();
42+
bool Testllama_gguf_reader_ReadTensorData();
43+
bool Testllama_gguf_reader_ReadTensorDataInvalidIndex();
44+
bool TestTextDataReader_OpenFile();
45+
bool TestTextDataReader_ReadNextSequenceTextMode();
46+
bool TestTextDataReader_ReadNextSequencePreTokenizedMode();
47+
bool TestTextDataReader_ResetFunctionality();
48+
bool TestTextDataReader_GetTotalSequences();
49+
bool Testllama_gguf_converter_ConvertTextFileSuccess();
50+
void TearDownLlamaBackend();
51+
3152

3253
// Global setup for llama.cpp backend
3354
bool SetUpLlamaBackend() {
@@ -419,9 +440,10 @@ bool Testllama_gguf_converter_ConvertTextFileSuccess() {
419440
params.max_seq_len = 128;
420441
params.pre_tokenized = false;
421442
params.dataset_format = "text";
443+
#ifdef LLAMA_PARQUET
422444
params.parquet_text_column = "text"; // Not used for text, but for completeness
423445
params.parquet_tokens_column = "tokens"; // Not used for text, but for completeness
424-
446+
#endif
425447
llama_gguf_converter converter;
426448
TEST_ASSERT(converter.llama_gguf_converter_convert(params, g_llama_model), "GGUF conversion failed");
427449

@@ -448,7 +470,7 @@ bool Testllama_gguf_converter_ConvertTextFileSuccess() {
448470
// Main function to run all tests
449471
// =============================================================================
450472

451-
int main(int argc, char ** argv) {
473+
int main() {
452474
printf("Running dataset-to-gguf tests...\n\n");
453475

454476
// Global setup for llama.cpp backend

0 commit comments

Comments
 (0)