diff --git a/CMakeLists.txt b/CMakeLists.txt index c79ccd09e097c..060d079477aa1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,8 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() +message("CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}") + # Add path to modules list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/") @@ -84,6 +86,12 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) # 3rd party libs option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON) option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF) +option(LLAMA_PARQUET "Enable Parquet dataset support via Arrow/Parquet C++" OFF) + + +if(LLAMA_PARQUET) + add_definitions(-DLLAMA_PARQUET) +endif() # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) diff --git a/common/arg.cpp b/common/arg.cpp index 56827a65908be..ec09c7c22239b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1477,7 +1477,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, int value) { params.n_chunks = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL})); + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"-fa", "--flash-attn"}, string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), @@ -1539,7 +1539,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } params.in_files.push_back(value); } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_FINETUNE})); add_opt(common_arg( {"-bf", "--binary-file"}, "FNAME", "binary file containing the prompt (default: none)", @@ -2609,9 +2609,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-o", "--output", "--output-file"}, "FNAME", string_format("output file (default: '%s')", params.out_file.c_str()), [](common_params & params, const std::string & value) { - params.out_file = value; + params.out_file = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS})); + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE})); add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), @@ -3423,5 +3423,45 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--dataset-format"}, " ", + string_format("type of input data (e.g., 'text', 'parquet') (default: %s)", params.dataset_format.c_str()), + [](common_params & params, const std::string & format) { + params.dataset_format = format; //TODO ENUM CLASS + } + ).set_examples({LLAMA_EXAMPLE_FINETUNE})); + + add_opt(common_arg( + {"--max-seq-len"}, " ", + string_format("max sequence length (default: %d)", params.max_seq_len), + [](common_params & params, int32_t max_seq_len) { + params.max_seq_len = max_seq_len; + } + ).set_examples({LLAMA_EXAMPLE_FINETUNE})); + + add_opt(common_arg( + {"--pre-tokenized"}, + 
string_format("input file contains pre-tokenized data (space-separated token IDs)"), + [](common_params & params) { + params.pre_tokenized = true; + } + ).set_examples({LLAMA_EXAMPLE_FINETUNE})); + + add_opt(common_arg( + {"--preview"}, + string_format("read and print metadata and first sequence from the output GGUF file (enables preview)"), + [](common_params & params) { + params.do_preview = true; + } + ).set_examples({LLAMA_EXAMPLE_FINETUNE})); + + add_opt(common_arg( + {"--dataset-column"}, "", + string_format("column name for data in dataset files"), + [](common_params & params, const std::string &dataset_column) { + params.dataset_column = dataset_column; + } + ).set_examples({LLAMA_EXAMPLE_FINETUNE})); + return ctx_arg; } diff --git a/common/common.h b/common/common.h index a5abe32859fdd..570ab10f68ecb 100644 --- a/common/common.h +++ b/common/common.h @@ -4,12 +4,13 @@ #include "llama-cpp.h" +#include +#include #include +#include #include #include #include -#include -#include #ifdef _WIN32 #define DIRECTORY_SEPARATOR '\\' @@ -81,6 +82,7 @@ enum llama_example { LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_PARALLEL, LLAMA_EXAMPLE_TTS, + LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_COUNT, }; @@ -282,6 +284,7 @@ struct common_params { std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT std::string logits_file = ""; // file for saving *all* logits // NOLINT + std::string dataset_format = "text"; // "text" | "parquet" std::vector in_files; // all input files std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) @@ -443,6 +446,10 @@ struct common_params { // return false from callback to abort model loading or true to continue llama_progress_callback load_progress_callback = NULL; void * load_progress_callback_user_data = NULL; + int32_t max_seq_len = 2048; + bool do_preview = false; + bool pre_tokenized = false; + std::string dataset_column = "data"; }; // call once at the start of a program if it uses libcommon diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index d64956b843851..2e969f12f8b01 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -36,4 +36,5 @@ else() add_subdirectory(cvector-generator) add_subdirectory(export-lora) endif() + add_subdirectory(dataset-converter) endif() diff --git a/tools/dataset-converter/CMakeLists.txt b/tools/dataset-converter/CMakeLists.txt new file mode 100644 index 0000000000000..521cd7d815dec --- /dev/null +++ b/tools/dataset-converter/CMakeLists.txt @@ -0,0 +1,37 @@ +include_directories(. 
../../common)
+
+if(LLAMA_PARQUET)
+    find_package(Arrow REQUIRED)
+    find_package(Parquet REQUIRED)
+endif()
+
+add_library(dataset-to-gguf-lib STATIC
+    dataset-to-gguf/llama-gguf-writer.cpp
+    dataset-to-gguf/llama-gguf-file.cpp
+    dataset-to-gguf/llama-dataset-reader/llama-text-data-reader.cpp
+    dataset-to-gguf/llama-gguf-converter.cpp
+    dataset-to-gguf/llama-gguf-reader.cpp
+    dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.cpp
+)
+
+target_compile_features(dataset-to-gguf-lib PRIVATE cxx_std_17)
+
+target_link_libraries(dataset-to-gguf-lib common llama ${CMAKE_THREAD_LIBS_INIT})
+if(LLAMA_PARQUET)
+    target_link_libraries(dataset-to-gguf-lib Arrow::arrow_shared Parquet::parquet_shared)
+endif()
+
+add_executable(convert-to-train-gguf convert-to-train-gguf.cpp)
+add_dependencies(convert-to-train-gguf dataset-to-gguf-lib)
+target_link_libraries(convert-to-train-gguf PRIVATE dataset-to-gguf-lib)
+
+set(TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS dataset-to-gguf-unit-tests)
+add_executable(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} dataset-to-gguf/tests/dataset-to-gguf-tests.cpp)
+add_dependencies(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} dataset-to-gguf-lib)
+target_link_libraries(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} PRIVATE common llama dataset-to-gguf-lib)
+
+add_test(
+    NAME ${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS}
+    COMMAND $<TARGET_FILE:${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS}>
+)
+set_tests_properties(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} PROPERTIES LABELS "training")
diff --git a/tools/dataset-converter/README.md b/tools/dataset-converter/README.md
new file mode 100644
index 0000000000000..7033fe13c61b6
--- /dev/null
+++ b/tools/dataset-converter/README.md
@@ -0,0 +1,148 @@
+`convert-to-train-gguf` Utility
+===============================
+
+This utility converts text datasets (or pre-tokenized data) into a GGUF format optimized for training models in `llama.cpp`.
+
+Features
+--------
+
+* **Two-pass processing**: Efficiently handles large datasets that do not fit entirely into RAM, performing a first pass to collect metadata (sequence lengths) and a second pass to write the actual tensor data.
+
+* **Flexible input**: Reads both raw text (tokenized with a provided model) and pre-tokenized data (space-separated token IDs).
+
+* **Modular architecture**: The code is split into several classes (`llama_gguf_file`, `llama_gguf_writer`, `llama_dataset_reader`, `llama_text_dataset_reader`, `llama_gguf_converter`, `llama_gguf_reader`) for modularity, extensibility, and testability.
+
+* **Preview functionality**: Lets you view the metadata and the first few sequences of the generated GGUF file, with optional detokenization.
+
+
+GGUF Structure for Training Data
+--------------------------------
+
+The generated GGUF files follow a specific structure for training data:
+
+* **Metadata (KV pairs)**: All metadata keys are prefixed with `training.` to avoid conflicts with model metadata.
+
+  * `training.format.version`: `string` (e.g., "1.0") - Specification version.
+
+  * `training.dataset.name`: `string` (optional) - Dataset name (e.g., "OpenWebText-ru").
+
+  * `training.dataset.source`: `string` (optional) - URL or description of the data source.
+
+  * `training.file.creation_date`: `string` (ISO 8601) - File creation date.
+
+  * `training.tokenizer.gguf.model`: `string` - Tokenizer model name (e.g., "llama", "gpt2", "bert").
+
+  * `training.tokenizer.gguf.vocab`: `array[string]` - Tokenizer vocabulary.
+
+  * `training.tokenizer.gguf.merges`: `array[string]` (optional) - Tokenizer merges (for BPE).
+
+  * `training.tokenizer.gguf.pre`: `string` (optional) - Architecture of the pre-tokenizer.
+
+  * `training.sequence.count`: `uint64` - Total number of sequences in the file.
+
+* **Tensors**: Each training sequence is stored as a separate tensor.
+
+  * **Naming**: `training.tensor.{index}` (e.g., `training.tensor.0`, `training.tensor.1`, ...). No leading zeros.
+
+  * **Data type**: `GGML_TYPE_I32` (the standard type for tokens in `llama.cpp`).
+
+  * **Shape**: `[sequence_length]` - a one-dimensional array. `sequence_length` varies per tensor and can be obtained from the tensor's shape.
+
+
+Building
+--------
+
+It is assumed that you have already set up the `llama.cpp` build environment (e.g., using CMake) and installed Arrow and Parquet on your system. (Arrow and Parquet are only required when configuring with `-DLLAMA_PARQUET=ON`.)
+
+1. **Clone the `llama.cpp` repository**:
+
+        git clone https://github.com/ggerganov/llama.cpp.git
+        cd llama.cpp
+
+2. **Create a build directory and navigate into it**:
+
+        mkdir build
+        cd build
+
+3. **Configure and build the project using CMake**:
+
+        cmake -DLLAMA_PARQUET=ON ..
+        cmake --build . --config Release
+
+    The `convert-to-train-gguf` utility will be built in the `build/bin` directory.
+
+
+Usage
+-----
+
+    ./bin/convert-to-train-gguf [options]
+
+### Command-line Options
+
+* `-h`, `--help`: Show this help message and exit.
+
+* `-m <path>`, `--model <path>`: Path to the GGUF model used for the tokenizer (default: `models/7B/ggml-model-f16.gguf`).
+
+* `--in-file <path>`: Path to the input dataset file, either a plain text file or a Parquet file (default: `input.txt`).
+
+* `-o <path>`, `--output <path>`: Path to save the output GGUF file to (default: `output.gguf`).
+
+* `--max-seq-len <length>`: Maximum sequence length in tokens (default: `2048`). Sequences exceeding this length are truncated.
+
+* `--pre-tokenized`: The input file contains pre-tokenized data (space-separated token IDs) rather than raw text.
+
+* `--dataset-format <type>`: Type of input data: `text` or `parquet` (default: `text`).
+
+* `--dataset-column <name>`: For `parquet` input, the column to read (default: `data`). In raw-text mode the column must contain strings; with `--pre-tokenized` it must contain lists of `int32` token IDs.
+
+* `--preview`: Print metadata and the first few sequences of the generated GGUF file.
+
+* `--preview-count <N>`: Number of sequences to preview (default: `1`). Requires `--preview`.
+
+* `--detokenize-preview`: Detokenize previewed sequences back into text for better readability. Requires `--preview`.
+
+
+### Usage Examples
+
+1. **Converting a plain text file**:
+
+        ./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file my_dataset.txt -o my_training_data.gguf --max-seq-len 1024
+
+2. **Converting a pre-tokenized file**:
+
+        ./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file pre_tokenized_data.txt -o pre_tokenized_training_data.gguf --pre-tokenized
+
+    (Assumes `pre_tokenized_data.txt` contains lines like: `101 200 300 102 ...`)
+
+3. **Converting a Parquet file with raw text**:
+
+        ./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file my_parquet_dataset.parquet -o my_training_data.gguf --dataset-format parquet --dataset-column "document_text"
+
+4. **Converting a Parquet file with pre-tokenized data**:
+
+        ./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file my_tokenized_parquet.parquet -o my_training_data.gguf --dataset-format parquet --pre-tokenized --dataset-column "token_ids"
+
+5. **Converting with a preview of 5 sequences and detokenization**:
+
+        ./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file my_dataset.txt -o my_training_data.gguf --preview --preview-count 5 --detokenize-preview
+
+
+Future Improvements
+-------------------
+
+* **Improved Error Handling**: More detailed messages and handling of edge cases.
+
+* **Additional Validation**: Data integrity checks at various stages.
+
+* **Dataset Statistics**: Ability to output statistics on sequence lengths, token distribution, etc.
diff --git a/tools/dataset-converter/convert-to-train-gguf.cpp b/tools/dataset-converter/convert-to-train-gguf.cpp
new file mode 100644
index 0000000000000..eaf9638dc5a3e
--- /dev/null
+++ b/tools/dataset-converter/convert-to-train-gguf.cpp
@@ -0,0 +1,183 @@
+// Main utility for converting a text dataset to the GGUF format for training models in llama.cpp.
+//
+// Logic:
+// 1. Parses command line arguments.
+// 2. Loads the tokenizer model.
+// 3. Uses the llama_gguf_converter class to perform the entire conversion process:
+//    - First pass over the input data to collect metadata (sequence lengths).
+//    - Creation of the GGUF file and writing all collected metadata to it.
+//    - Second pass over the input data to add each sequence as a separate tensor to the GGUF file.
+// 4. After successful conversion, uses llama_gguf_reader to read and print
+//    some meta-information and the first record from the created GGUF file.
+//
+// This two-pass approach allows processing datasets significantly larger than
+// available RAM.
+
+#include "log.h"
+
+#include <algorithm>  // For std::min
+#include <array>      // For std::array
+#include <cinttypes>  // For PRIu64
+#include <stdexcept>  // For std::runtime_error
+#include <string>     // For std::string
+#include <vector>     // For std::vector
+
+#include "arg.h"
+#include "common.h"
+#include "dataset-to-gguf/llama-gguf-converter.h"
+#include "dataset-to-gguf/llama-gguf-reader.h"
+#include "llama.h"  // For llama_backend_init, llama_backend_free, llama_model_load_from_file, llama_model_free
+
+#define PREVIEW_COUNT 1
+
+int main(int argc, char ** argv) {
+    common_params params;
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE)) {
+        return 1;
+    }
+
+    // Print parameters for verification
+    LOG_INF("Parameters:\n");
+    LOG_INF("  Model for tokenizer: %s\n", params.model.path.c_str());
+    LOG_INF("  Input files: ");
+    for (auto & i : params.in_files) {
+        LOG_INF("%s ", i.c_str());
+    }
+    LOG_INF("\n  Output file: %s\n", params.out_file.c_str());
+    LOG_INF("  Max sequence length: %d\n", params.max_seq_len);
+    LOG_INF("  Dataset format: %s\n", params.dataset_format.c_str());
+    LOG_INF("  Do preview: %s\n", params.do_preview ?
"Yes" : "No"); + if (params.dataset_format != "text") { + LOG_INF(" Dataset column: %s\n", params.dataset_column.c_str()); + } + LOG_INF("\n"); + + // Initialize llama.cpp + llama_backend_init(); + + // Load the model for its tokenizer + llama_model_params model_params = llama_model_default_params(); + llama_model *model = llama_model_load_from_file(params.model.path.c_str(), model_params); + + if (model == nullptr) { + LOG_ERR("error: failed to load model from %s\n", params.model.path.c_str()); + llama_backend_free(); + return 1; + } + + // --- Diagnostic Test: Reading tokenizer model GGUF file --- + LOG_INF("--- Diagnostic Test: Reading tokenizer model GGUF file ---\n"); + try { + llama_gguf_reader tokenizer_model_reader(params.model.path); + if (tokenizer_model_reader.llama_gguf_reader_is_initialized()) { + LOG_INF(" Tokenizer Model GGUF file opened successfully.\n"); + LOG_INF(" Tokenizer Model Name: %s\n", + tokenizer_model_reader.llama_gguf_reader_get_metadata_str("general.name", "N/A").c_str()); + LOG_INF(" Tokenizer Model Architecture: %s\n", + tokenizer_model_reader.llama_gguf_reader_get_metadata_str("general.architecture", "N/A").c_str()); + LOG_INF(" Tokenizer Model Tensor Count: %llu\n", + static_cast(tokenizer_model_reader.llama_gguf_reader_get_tensor_count())); + LOG_INF(" Diagnostic Test: Tokenizer Model GGUF read successful.\n"); + } else { + LOG_ERR("error: Diagnostic Test: Tokenizer Model GGUF read failed to initialize.\n"); + llama_model_free(model); // Free model before exiting + llama_backend_free(); + return 1; + } + } catch (const std::runtime_error & e) { + LOG_ERR("error: Diagnostic Test: Tokenizer Model GGUF read failed: %s\n", e.what()); + llama_model_free(model); // Free model before exiting + llama_backend_free(); + return 1; + } + LOG_INF("--- End of Diagnostic Test ---\n\n"); + + // Create and run the converter + llama_gguf_converter converter; + bool success = converter.llama_gguf_converter_convert(params, model); + + if (!success) { + LOG_ERR("error: GGUF conversion failed.\n"); + llama_model_free(model); // Free model on conversion failure + llama_backend_free(); + return 1; + } + + LOG_INF("Conversion successful!\n"); + LOG_INF("Output file: %s\n", params.out_file.c_str()); + + // --- Preview generated GGUF file (if requested) --- + if (params.do_preview) { + LOG_INF("\n--- Previewing generated GGUF file ---\n"); + try { + llama_gguf_reader reader(params.out_file); + + if (!reader.llama_gguf_reader_is_initialized()) { + LOG_ERR("error: llama_gguf_reader failed to initialize for preview.\n"); + llama_model_free(model); // Free model before exiting + llama_backend_free(); + return 1; + } + + LOG_INF(" Dataset Name: %s\n", + reader.llama_gguf_reader_get_metadata_str("training.dataset.name", "N/A").c_str()); + LOG_INF(" Sequence Count: %llu\n", static_cast(reader.llama_gguf_reader_get_metadata_u64("training.sequence.count", 0))); + LOG_INF(" Tokenizer Model: %s\n", + reader.llama_gguf_reader_get_metadata_str("training.tokenizer.gguf.model", "N/A").c_str()); + + int64_t tensor_count = reader.llama_gguf_reader_get_tensor_count(); + if (tensor_count > 0) { + // Print N first sequences + for (int64_t i = 0; i < std::min(static_cast(PREVIEW_COUNT), tensor_count); ++i) { + LOG_INF(" Sequence (training.tensor.%" PRId64 "):\n", i); + std::vector sequence_tokens; + if (reader.llama_gguf_reader_read_tensor_data(i, sequence_tokens)) { + LOG_INF(" Length: %zu tokens\n", sequence_tokens.size()); + LOG_INF(" Tokens: ["); + for (size_t j = 0; j < std::min((size_t) 10, 
sequence_tokens.size());
+                         ++j) {  // Print up to 10 tokens
+                        LOG_INF("%d%s", sequence_tokens[j],
+                                (j == std::min((size_t) 10, sequence_tokens.size()) - 1) ? "" : ", ");
+                    }
+                    if (sequence_tokens.size() > 10) {
+                        LOG_INF("...");
+                    }
+                    LOG_INF("]\n");
+                    // Detokenization
+                    std::string detokenized_text = "";
+                    // Buffer for a single token piece (256 bytes is a comfortable upper bound)
+                    std::array<char, 256> piece_buf;
+                    // Ensure model is valid before calling llama_model_get_vocab
+                    if (model != nullptr) {
+                        for (llama_token token : sequence_tokens) {
+                            int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
+                                                               piece_buf.data(), piece_buf.size(), 1, false);
+                            if (n_chars > 0) {
+                                detokenized_text.append(piece_buf.data(), n_chars);
+                            }
+                        }
+                        LOG_INF("    Detokenized: \"%s\"\n", detokenized_text.c_str());
+                    } else {
+                        LOG_ERR("    Warning: Cannot detokenize preview, model is null.\n");
+                    }
+
+                } else {
+                    LOG_ERR("    Error: Could not read data for sequence %" PRId64 ".\n", i);
+                }
+            }
+        } else {
+            LOG_INF("  No sequences found in the GGUF file.\n");
+        }
+
+        } catch (const std::runtime_error & e) {
+            LOG_ERR("error: GGUF preview failed: %s\n", e.what());
+            llama_model_free(model);  // Free model before exiting
+            llama_backend_free();
+            return 1;
+        }
+        LOG_INF("--- End of GGUF file preview ---\n");
+    }
+
+    // Clean up llama model and backend after all usage
+    llama_model_free(model);
+    llama_backend_free();
+
+    return 0;
+}
diff --git a/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-dataset-reader.h b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-dataset-reader.h
new file mode 100644
index 0000000000000..bab5255a7aca1
--- /dev/null
+++ b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-dataset-reader.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <string>  // For std::string
+#include <vector>  // For std::vector
+
+#include "llama.h"  // For llama_token
+
+// Abstract base class for reading a dataset.
+// Defines the interface that all concrete readers must implement.
+struct llama_dataset_reader {
+    // Virtual destructor for correct deletion of derived classes.
+    virtual ~llama_dataset_reader() = default;
+
+    // Opens the data source.
+    // path: path to the file or other data source identifier.
+    // Returns true if the source is successfully opened, otherwise false.
+    virtual bool open(const std::string & path) = 0;
+
+    // Reads the next sequence of tokens.
+    // tokens: vector where the read tokens will be stored.
+    // Returns true if a sequence is successfully read, otherwise false (including end of file).
+    virtual bool read_next_sequence(std::vector<llama_token> & tokens) = 0;
+
+    // Closes the data source.
+    virtual void close() = 0;
+
+    // Resets the reader to the beginning of the data source.
+    // Used for the second pass over the data.
+    virtual bool reset() = 0;
+
+    // Gets the total number of sequences in the dataset.
+    // Can be implemented differently for various data source types.
+    // Returns 0 if the count is unknown or not applicable.
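+    // (The text reader implements this by counting lines in the file; the
+    // Parquet reader returns the row count recorded in the Parquet file's metadata.)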
+ virtual uint64_t total_sequences() const = 0; +}; diff --git a/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.cpp b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.cpp new file mode 100644 index 0000000000000..0d9eb05e0b470 --- /dev/null +++ b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.cpp @@ -0,0 +1,304 @@ +#ifdef LLAMA_PARQUET +#include "llama-parquet-data-reader.h" + +#include // For std::min +#include // For std::cerr + +// Constructor +llama_parquet_dataset_reader::llama_parquet_dataset_reader(const struct llama_model * model, int32_t max_seq_len, + bool pre_tokenized, const std::string & dataset_column_name) : + model_(model), + max_seq_len_(max_seq_len), + pre_tokenized_(pre_tokenized), + current_row_group_index_(0), // Initialize row group index + current_row_in_table_(0), + current_column_index_(-1), // Initialize to -1, will be set in open + dataset_column_name_(dataset_column_name) {} + +// Destructor +llama_parquet_dataset_reader::~llama_parquet_dataset_reader() { + llama_parquet_dataset_reader::close(); +} + +// Opens the Parquet file for reading. +bool llama_parquet_dataset_reader::open(const std::string & path) { + // Close any previously open file + // Note: m_file_path is NOT cleared here, it's preserved for reset() + close(); + + m_file_path = path; // Store the file path for reset() + + // Open the Parquet file + arrow::Status status = arrow::io::ReadableFile::Open(path).Value(&input_file_); + if (!status.ok()) { + std::cerr << "Error (llama_parquet_dataset_reader::open): Failed to open Parquet file '" << path + << "': " << status.ToString() << std::endl; + return false; + } + + // Create a Parquet reader using parquet::arrow::OpenFile + arrow::Result> reader_raw = + parquet::arrow::OpenFile(input_file_, arrow::default_memory_pool()); + + if (!reader_raw.ok()) { + std::cerr << "Error (llama_parquet_dataset_reader::open): Failed to create Parquet file reader for '" << path + << "': " << reader_raw.status().ToString() << std::endl; + close(); + return false; + } + parquet_reader_ = std::move(reader_raw.ValueUnsafe()); + + // Get the schema to determine the correct column index + std::shared_ptr schema; + status = parquet_reader_->GetSchema(&schema); // Corrected: Use GetSchema and pass by address + if (!status.ok() || schema == nullptr) { + std::cerr << "Error (llama_parquet_dataset_reader::open): Failed to get schema from Parquet file: " + << status.ToString() << std::endl; + close(); + return false; + } + + // Determine the column index based on pre_tokenized_ flag + if (pre_tokenized_) { + current_column_index_ = schema->GetFieldIndex(dataset_column_name_); // Use configurable name + if (current_column_index_ == -1) { + std::cerr << "Error (llama_parquet_dataset_reader::open): Pre-tokenized mode selected, but column '" + << dataset_column_name_ << "' not found in Parquet schema." << std::endl; + close(); + return false; + } + // Validate column type: should be List + if (schema->field(current_column_index_)->type()->id() != arrow::Type::LIST) { + std::cerr << "Error (llama_parquet_dataset_reader::open): Column '" << dataset_column_name_ + << "' is not of LIST type as expected for pre-tokenized data. 
Actual type: " + << schema->field(current_column_index_)->type()->ToString() << std::endl; + close(); + return false; + } + auto list_type = std::static_pointer_cast(schema->field(current_column_index_)->type()); + if (list_type->value_type()->id() != arrow::Type::INT32) { + std::cerr << "Error (llama_parquet_dataset_reader::open): List items in column '" << dataset_column_name_ + << "' are not of INT32 type as expected. Actual value type: " + << list_type->value_type()->ToString() << std::endl; + close(); + return false; + } + + } else { + current_column_index_ = schema->GetFieldIndex(dataset_column_name_); // Use configurable name + if (current_column_index_ == -1) { + std::cerr << "Error (llama_parquet_dataset_reader::open): Raw text mode selected, but column '" + << dataset_column_name_ << "' not found in Parquet schema." << std::endl; + close(); + return false; + } + // Validate column type: should be String + if (schema->field(current_column_index_)->type()->id() != arrow::Type::STRING) { + std::cerr << "Error (llama_parquet_dataset_reader::open): Column '" << dataset_column_name_ + << "' is not of STRING type as expected for raw text. Actual type: " + << schema->field(current_column_index_)->type()->ToString() << std::endl; + close(); + return false; + } + } + + // Initialize row group index + current_row_group_index_ = 0; + // Read the first batch (row group) + return llama_parquet_dataset_reader_get_next_batch(); +} + +// Reads the next sequence of tokens from the Parquet file. +bool llama_parquet_dataset_reader::read_next_sequence(std::vector & tokens) { + tokens.clear(); + + // If current_table_ is null or we've processed all rows in the current batch, get the next batch (row group) + if (!current_table_ || current_row_in_table_ >= current_table_->num_rows()) { + if (!llama_parquet_dataset_reader_get_next_batch()) { + return false; // No more batches/row groups or error getting next batch + } + } + + if (!current_table_ || current_table_->num_rows() == 0) { + return false; // Should not happen if get_next_batch was successful, but as a safeguard + } + + // Assuming single chunk for simplicity. For multi-chunk columns, you'd iterate through chunks. + // When reading a column from a row group, it typically returns a single chunk. + std::shared_ptr column_array = + current_table_->column(0)->chunk(0); // column(0) because we read only one column into current_table_ + + if (pre_tokenized_) { + // Pre-tokenized data: read List array + auto list_array = std::static_pointer_cast(column_array); + auto value_array = std::static_pointer_cast(list_array->values()); + + if (list_array->IsNull(current_row_in_table_)) { + // Handle null list (empty sequence) + current_row_in_table_++; + return true; + } + + int32_t start_offset = list_array->value_offset(current_row_in_table_); + int32_t end_offset = list_array->value_offset(current_row_in_table_ + 1); + int32_t num_tokens_in_row = end_offset - start_offset; + + tokens.reserve(std::min((int32_t) max_seq_len_, num_tokens_in_row)); + for (int32_t i = 0; i < num_tokens_in_row && i < max_seq_len_; ++i) { + tokens.push_back(static_cast(value_array->Value(start_offset + i))); + } + + } else { + // Raw text data: read String array and tokenize + if (!model_) { + std::cerr << "Error (llama_parquet_dataset_reader::read_next_sequence): Llama model not provided for " + "tokenization of raw text." 
+ << std::endl; + return false; + } + + auto string_array = std::static_pointer_cast(column_array); + if (string_array->IsNull(current_row_in_table_)) { + // Handle null string (empty sequence) + current_row_in_table_++; + return true; + } + + std::string text = string_array->GetString(current_row_in_table_); + std::vector tokens_buffer(max_seq_len_); // Use a temporary buffer for tokenization + + int n_tokens = llama_tokenize(llama_model_get_vocab(model_), text.c_str(), text.length(), tokens_buffer.data(), + max_seq_len_, false, true); + if (n_tokens < 0) { + std::cerr << "Error (llama_parquet_dataset_reader::read_next_sequence): Tokenization failed for text: '" + << text << "'" << std::endl; + current_row_in_table_++; + return true; // Return true with empty tokens to continue processing + } + tokens.assign(tokens_buffer.begin(), tokens_buffer.begin() + n_tokens); + } + + current_row_in_table_++; + return true; +} + +// Closes the Parquet file. +void llama_parquet_dataset_reader::close() { + parquet_reader_.reset(); + current_row_group_reader_.reset(); // Reset row group reader + current_table_.reset(); + chunked_array_.reset(); // Reset chunked array + if (input_file_) { + arrow::Status status = input_file_->Close(); + if (!status.ok()) { + std::cerr << "Warning (llama_parquet_dataset_reader::close): Failed to close Arrow file: " + << status.ToString() << std::endl; + } + } + input_file_.reset(); + current_row_group_index_ = 0; // Reset row group index + current_row_in_table_ = 0; + current_column_index_ = -1; + // m_file_path is NOT cleared here. It's preserved for reset() +} + +// Resets the reader to the beginning of the Parquet file. +bool llama_parquet_dataset_reader::reset() { + if (m_file_path.empty()) { // Check if path is stored + std::cerr << "Error (llama_parquet_dataset_reader::reset): Cannot reset, file path was not stored." + << std::endl; + return false; + } + // Re-open the file and re-initialize the reader + return open(m_file_path); // Use the stored path +} + +// Private helper to get the next batch of data (now a row group) +bool llama_parquet_dataset_reader::llama_parquet_dataset_reader_get_next_batch() { + current_table_.reset(); // Clear previous table + current_row_in_table_ = 0; // Reset row index for new table + chunked_array_.reset(); // Reset chunked array for new batch + + if (!parquet_reader_) { + std::cerr << "Error (llama_parquet_dataset_reader::llama_parquet_dataset_reader_get_next_batch): Parquet " + "reader is not initialized." 
+ << std::endl; + return false; + } + + if (current_row_group_index_ >= parquet_reader_->num_row_groups()) { + return false; // No more row groups + } + + // Get the reader for the current row group + current_row_group_reader_ = parquet_reader_->RowGroup(current_row_group_index_); + if (!current_row_group_reader_) { + std::cerr << "Error (llama_parquet_dataset_reader::llama_parquet_dataset_reader_get_next_batch): Failed to get " + "row group reader for index " + << current_row_group_index_ << std::endl; + return false; + } + + // Get the ColumnChunkReader for the specific column + std::shared_ptr column_chunk_reader = + current_row_group_reader_->Column(current_column_index_); + if (!column_chunk_reader) { + std::cerr << "Error (llama_parquet_dataset_reader::llama_parquet_dataset_reader_get_next_batch): Failed to get " + "column chunk reader for column " + << current_column_index_ << " in row group " << current_row_group_index_ << std::endl; + return false; + } + + // Read the column data into a ChunkedArray + arrow::Status status = column_chunk_reader->Read(&chunked_array_); // Use member variable + if (!status.ok()) { + std::cerr << "Error (llama_parquet_dataset_reader::llama_parquet_dataset_reader_get_next_batch): Failed to " + "read column " + << current_column_index_ << " from row group " << current_row_group_index_ << ": " + << status.ToString() << std::endl; + return false; + } + + // Get the schema from the parquet_reader_ to construct the table + std::shared_ptr schema; + status = parquet_reader_->GetSchema(&schema); + if (!status.ok() || schema == nullptr) { + std::cerr << "Error (llama_parquet_dataset_reader::llama_parquet_dataset_reader_get_next_batch): Failed to get " + "schema from Parquet reader for column " + << current_column_index_ << std::endl; + return false; + } + + // Get the field for the current column index + std::shared_ptr column_field = schema->field(current_column_index_); + if (column_field == nullptr) { + std::cerr << "Error (llama_parquet_dataset_reader::llama_parquet_dataset_reader_get_next_batch): Column field " + "is null for index " + << current_column_index_ << std::endl; + return false; + } + + current_table_ = arrow::Table::Make(arrow::schema({ column_field }), // Create a schema with just this column + { chunked_array_ } // Pass the chunked array as the column data + ); + + if (!current_table_ || current_table_->num_rows() == 0) { + return false; // No data in this row group + } + + current_row_group_index_++; // Move to the next row group for the next call + return true; +} + +// Method to get the total number of sequences in the dataset. +// For Parquet files, this will be the number of rows obtained from metadata. +uint64_t llama_parquet_dataset_reader::total_sequences() const { + if (!parquet_reader_) { + std::cerr << "Error (llama_parquet_dataset_reader::total_sequences): Parquet reader is not initialized." 
+ << std::endl; + return 0; + } + // Total number of rows in the Parquet file + return parquet_reader_->parquet_reader()->metadata()->num_rows(); +} +#endif diff --git a/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.h b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.h new file mode 100644 index 0000000000000..f00e272722225 --- /dev/null +++ b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.h @@ -0,0 +1,75 @@ +#pragma once +#ifdef LLAMA_PARQUET +#include "llama.h" // For llama_token + +// Include necessary Apache Arrow and Parquet headers +// You will need to link against these libraries (e.g., -larrow -lparquet) +#include +#include +#include + +#include // For std::unique_ptr +#include +#include + +#include "llama-dataset-reader.h" + +// Implementation of DatasetReader for reading Parquet files. +// This class will handle reading tokenized sequences from a Parquet file. +struct llama_parquet_dataset_reader : public llama_dataset_reader { + // Constructor. + // model: Pointer to the llama model for tokenization (can be nullptr if data is pre-tokenized). + // max_seq_len: Maximum sequence length for truncation. + // pre_tokenized: If true, input data is already tokenized (token IDs in a numeric column). + // text_column_name: Name of the column containing raw text data. + // tokens_column_name: Name of the column containing pre-tokenized data (list). + llama_parquet_dataset_reader(const struct llama_model * model, int32_t max_seq_len, bool pre_tokenized, + const std::string & dataset_column_name); + + // Destructor. + ~llama_parquet_dataset_reader(); + + // Opens the Parquet file for reading. + // path: Path to the Parquet file. + // Returns true if the source is successfully opened, otherwise false. + bool open(const std::string & path) override; + + // Reads the next sequence of tokens from the Parquet file. + // tokens: Vector where the read tokens will be stored. + // Returns true if a sequence is successfully read, otherwise false (including end of file). + bool read_next_sequence(std::vector & tokens) override; + + // Closes the Parquet file. + void close() override; + + // Resets the reader to the beginning of the Parquet file. + // Returns true if reset is successful, otherwise false. + bool reset() override; + + // Method to get the total number of sequences in the dataset. + // For Parquet files, this will be the number of rows obtained from metadata. 
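+    // Unlike the text reader, which rescans the whole file line by line, this
+    // reads the row count from the Parquet footer metadata, so it is effectively O(1).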
+ uint64_t total_sequences() const override; + + private: + const struct llama_model * model_; // Llama model for tokenization (if needed) + int32_t max_seq_len_; // Maximum sequence length + bool pre_tokenized_; // Flag for pre-tokenized data + + std::shared_ptr input_file_; // Arrow file handle + std::unique_ptr parquet_reader_; // Parquet reader + std::shared_ptr current_table_; // Current table batch being processed + std::shared_ptr chunked_array_; // Member to store the chunked array + + int current_row_group_index_; // Current row group index + std::shared_ptr current_row_group_reader_; // Reader for the current row group + + int64_t current_row_in_table_; // Current row index within the current_table_ + int current_column_index_; // Index of the column containing text/tokens + std::string m_file_path; // Path to the Parquet file + + std::string dataset_column_name_; // Configurable name for column + + // Private helper to get the next batch of data (now a row group) + bool llama_parquet_dataset_reader_get_next_batch(); +}; +#endif diff --git a/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-text-data-reader.cpp b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-text-data-reader.cpp new file mode 100644 index 0000000000000..57db707eb4de3 --- /dev/null +++ b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-text-data-reader.cpp @@ -0,0 +1,125 @@ +#include "llama-text-data-reader.h" + +#include // For std::min +#include // For std::cerr +#include + +#include "llama.h" // For llama_tokenize, llama_model_get_vocab + +// Constructor +llama_text_dataset_reader::llama_text_dataset_reader(const struct llama_model * model, int32_t max_seq_len, + bool pre_tokenized) : + m_model(model), + m_max_seq_len(max_seq_len), + m_pre_tokenized(pre_tokenized), + m_tokens_buffer(max_seq_len) {} + +// Destructor +llama_text_dataset_reader::~llama_text_dataset_reader() { + llama_text_dataset_reader::close(); +} + +// Opens the text file for reading. +bool llama_text_dataset_reader::open(const std::string & path) { + m_file_path = path; // Store the file path + m_input_file.open(path); + if (!m_input_file.is_open()) { + std::cerr << "Error: Failed to open input file " << path << std::endl; + return false; + } + return true; +} + +// Reads the next sequence of tokens from the file. +bool llama_text_dataset_reader::read_next_sequence(std::vector & tokens) { + std::string line; + if (!std::getline(m_input_file, line)) { + return false; // End of file or read error + } + + tokens.clear(); // Clear the vector for a new sequence + int n_tokens = 0; + + if (line.empty()) { + // Empty line, return an empty sequence + return true; + } + + if (m_pre_tokenized) { + // Pre-tokenized data mode: parse tokens from the string + std::istringstream iss(line); + llama_token token_id; + while (iss >> token_id) { + if (n_tokens < m_max_seq_len) { + tokens.push_back(token_id); + n_tokens++; + } else { + // Truncate if it exceeds m_max_seq_len + break; + } + } + } else { + // Raw text data mode: tokenize the string + if (!m_model) { + std::cerr << "Error: Llama model not provided for tokenization of raw text." 
<< std::endl;
+            return false;
+        }
+        // Ensure the buffer is large enough
+        if (m_tokens_buffer.size() < (size_t) m_max_seq_len) {
+            m_tokens_buffer.resize(m_max_seq_len);
+        }
+        n_tokens = llama_tokenize(llama_model_get_vocab(m_model), line.c_str(), line.length(), m_tokens_buffer.data(),
+                                  m_max_seq_len, false, true);
+        if (n_tokens < 0) {
+            std::cerr << "Error: Tokenization failed for line: " << line << std::endl;
+            // Tokenization failed: abort this sequence (the caller treats false as end of data)
+            return false;
+        }
+        tokens.assign(m_tokens_buffer.begin(), m_tokens_buffer.begin() + n_tokens);
+    }
+    return true;
+}
+
+// Closes the file.
+void llama_text_dataset_reader::close() {
+    if (m_input_file.is_open()) {
+        m_input_file.close();
+    }
+}
+
+// Resets the file pointer to the beginning of the file.
+bool llama_text_dataset_reader::reset() {
+    if (m_input_file.is_open()) {
+        m_input_file.clear();                  // Clear any error flags (e.g., EOF)
+        m_input_file.seekg(0, std::ios::beg);  // Move pointer to the beginning
+        return true;
+    }
+    // If not open, try to open it again using the stored path
+    return open(m_file_path);
+}
+
+// Gets the total number of sequences in the dataset.
+// For text files, this is the number of lines.
+// Note: this is slow for very large files, as it reads the entire file to count lines.
+uint64_t llama_text_dataset_reader::total_sequences() const {
+    if (m_file_path.empty()) {
+        std::cerr << "Error (llama_text_dataset_reader::total_sequences): File path not set." << std::endl;
+        return 0;
+    }
+
+    std::ifstream temp_file(m_file_path);
+    if (!temp_file.is_open()) {
+        std::cerr << "Error (llama_text_dataset_reader::total_sequences): Failed to open file '" << m_file_path
+                  << "' for counting lines." << std::endl;
+        return 0;
+    }
+
+    uint64_t count = 0;
+    std::string line;
+    while (std::getline(temp_file, line)) {
+        count++;
+    }
+    temp_file.close();
+    return count;
+}
diff --git a/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-text-data-reader.h b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-text-data-reader.h
new file mode 100644
index 0000000000000..47f38c9ffb1e9
--- /dev/null
+++ b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-text-data-reader.h
@@ -0,0 +1,47 @@
+#pragma once
+
+#include <fstream>  // For std::ifstream
+#include <string>   // For std::string
+#include <vector>   // For std::vector
+
+#include "llama-dataset-reader.h"
+#include "llama.h"  // For llama_token and llama_model
+
+// Implementation of llama_dataset_reader for reading text files.
+// Supports both raw text and pre-tokenized data.
+struct llama_text_dataset_reader : public llama_dataset_reader {
+    // Constructor.
+    // model: pointer to the llama model for tokenization (can be nullptr if pre_tokenized is true).
+    // max_seq_len: maximum sequence length for truncation.
+    // pre_tokenized: if true, input data is already tokenized (token IDs as numbers).
+    llama_text_dataset_reader(const struct llama_model * model, int32_t max_seq_len, bool pre_tokenized);
+
+    // Destructor.
+    ~llama_text_dataset_reader();
+
+    // Opens the text file for reading.
+    bool open(const std::string & path) override;
+
+    // Reads the next sequence of tokens from the file.
+    // If pre_tokenized is true, parses numbers from the string.
+    // If pre_tokenized is false, tokenizes the string using llama_model.
+    bool read_next_sequence(std::vector<llama_token> & tokens) override;
+
+    // Closes the file.
+    void close() override;
+
+    // Resets the file pointer to the beginning of the file.
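+    // Clears any EOF/error flags and seeks back to offset 0; if the stream is
+    // not currently open, it is re-opened from the stored file path.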
+    bool reset() override;
+
+    // Gets the total number of sequences in the dataset.
+    // For text files, this is the number of lines.
+    uint64_t total_sequences() const override;
+
+  private:
+    const struct llama_model * m_model;          // Model for tokenization
+    int32_t                    m_max_seq_len;    // Maximum sequence length
+    bool                       m_pre_tokenized;  // Flag for pre-tokenized data
+    std::ifstream              m_input_file;     // File stream object
+    std::string                m_file_path;      // File path for reset and total_sequences
+    std::vector<llama_token>   m_tokens_buffer;  // Internal buffer for tokens
+};
diff --git a/tools/dataset-converter/dataset-to-gguf/llama-gguf-converter.cpp b/tools/dataset-converter/dataset-to-gguf/llama-gguf-converter.cpp
new file mode 100644
index 0000000000000..7e853de33d0ed
--- /dev/null
+++ b/tools/dataset-converter/dataset-to-gguf/llama-gguf-converter.cpp
@@ -0,0 +1,164 @@
+// Utility for converting a text dataset to the GGUF format for training models in llama.cpp.
+//
+// Logic:
+// 1. Loads the tokenizer model.
+// 2. Performs a first pass over the input data to collect metadata (sequence lengths).
+// 3. Creates a GGUF file and writes all collected metadata to it.
+// 4. Performs a second pass over the input data to add each sequence as a separate tensor to the GGUF file.
+//
+// This two-pass approach allows processing datasets significantly larger than
+// available RAM.
+
+#include "llama-gguf-converter.h"  // Include the new header name for the converter
+
+#include <cinttypes>  // For PRIu64
+#include <cstdio>     // For fprintf, snprintf
+#include <memory>     // For std::unique_ptr
+#include <stdexcept>  // For std::runtime_error
+#include <vector>     // For std::vector
+
+// Include the refactored GGUF and data reader headers
+#include "common.h"
+#include "llama-dataset-reader/llama-dataset-reader.h"
+#include "llama-dataset-reader/llama-parquet-data-reader.h"
+#include "llama-dataset-reader/llama-text-data-reader.h"
+#include "llama-gguf-file.h"
+#include "llama-gguf-writer.h"
+
+// Method to execute the conversion process.
+bool llama_gguf_converter::llama_gguf_converter_convert(const struct common_params & params,
+                                                        const struct llama_model * model) {
+    // --- Create the data reader based on the dataset format ---
+    std::unique_ptr<llama_dataset_reader> reader;
+    if (params.dataset_format == "text") {
+        reader = std::make_unique<llama_text_dataset_reader>(model, params.max_seq_len, params.pre_tokenized);
+#ifdef LLAMA_PARQUET
+    } else if (params.dataset_format == "parquet") {
+        reader = std::make_unique<llama_parquet_dataset_reader>(model, params.max_seq_len, params.pre_tokenized, params.dataset_column);
+#endif
+    } else {
+        fprintf(stderr, "error: Unsupported input type: %s\n", params.dataset_format.c_str());
+        return false;
+    }
+
+    // Open the data source
+    if (!reader->open(params.in_files[0])) {  // currently only the first input file is used
+        fprintf(stderr, "error: Failed to open data source %s\n", params.in_files[0].c_str());
+        return false;
+    }
+
+    uint64_t total_sequence_count = 0;
+    std::vector<uint32_t> sequence_lengths;  // Will store sequence lengths for text files
+
+    // --- FIRST PASS: Collect sequence lengths or get total count ---
+    printf("First pass: Reading input data and getting sequence lengths...\n");
+
+    if (params.dataset_format == "parquet") {
+        // For Parquet, get total sequence count from metadata
+        total_sequence_count = reader->total_sequences();
+        printf("First pass complete. Found %" PRIu64 " sequences (from Parquet metadata).\n\n", total_sequence_count);
+    } else {  // For text files
+        // For text files, perform a full first pass to count sequences
+        // and their lengths (as this is the only way to know the exact token count).
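+        // Note: the text path therefore tokenizes the entire input twice. The
+        // lengths recorded here are cross-checked on the second pass so that a
+        // non-deterministic tokenizer cannot silently corrupt the output file.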
+        std::vector<llama_token> tokens;
+        while (reader->read_next_sequence(tokens)) {
+            sequence_lengths.push_back(tokens.size());
+        }
+        total_sequence_count = sequence_lengths.size();
+        printf("First pass complete. Found %" PRIu64 " sequences.\n\n", total_sequence_count);
+    }
+
+    // --- WRITE GGUF FILE ---
+    printf("Creating GGUF file...\n");
+    // Create a llama_gguf_file instance, which will manage the GGUF context
+    std::unique_ptr<llama_gguf_file> gguf_file;
+    try {
+        gguf_file = std::make_unique<llama_gguf_file>();
+    } catch (const std::runtime_error & e) {
+        fprintf(stderr, "error: Failed to initialize llama_gguf_file: %s\n", e.what());
+        return false;
+    }
+
+    // Pass the pointer to gguf_file to llama_gguf_writer
+    llama_gguf_writer writer(gguf_file.get());
+
+    // Initialize GGUF file metadata
+    writer.llama_gguf_writer_init_metadata(model, params.in_files[0], total_sequence_count);
+    printf("Metadata written.\n");
+
+    // --- SECOND PASS: Write tensors ---
+    printf("Second pass: Writing tensors to GGUF file...\n");
+    if (!reader->reset()) {
+        fprintf(stderr, "error: Failed to reset data reader for second pass.\n");
+        return false;
+    }
+
+    uint64_t current_sequence_idx = 0;
+    std::vector<llama_token> tokens;  // Reuse the tokens vector
+    while (reader->read_next_sequence(tokens)) {
+        if (current_sequence_idx >= total_sequence_count) {
+            fprintf(stderr,
+                    "error: file ended prematurely on second pass. Expected %" PRIu64
+                    " sequences, but reached end of file at %" PRIu64 ".\n",
+                    total_sequence_count, current_sequence_idx);
+            break;
+        }
+
+        uint32_t expected_n_tokens;
+        if (params.dataset_format == "text") {
+            // For text files, use lengths collected in the first pass
+            expected_n_tokens = sequence_lengths[current_sequence_idx];
+        } else {
+            // For Parquet, the expected length is not known beforehand,
+            // so just use the actual length of the read sequence.
+            // Empty sequences in the Parquet file are handled below.
+            expected_n_tokens = tokens.size();
+        }
+
+        uint32_t actual_n_tokens = tokens.size();
+
+        // If the token count does not match (only detectable for text, where it is known
+        // beforehand), this is a critical error: the metadata collected in the first pass
+        // would be wrong for this tensor. Abort to avoid writing a corrupted GGUF file.
+        if (params.dataset_format == "text" && actual_n_tokens != expected_n_tokens) {
+            fprintf(stderr,
+                    "error: Tokenization mismatch on second pass for sequence %" PRIu64
+                    ". Expected %u tokens, got %u.\n",
+                    current_sequence_idx, expected_n_tokens, actual_n_tokens);
+            fprintf(
+                stderr,
+                "This indicates a non-deterministic tokenizer or an issue with input reading. Aborting conversion.\n");
+            return false;  // Abort conversion
+        }
+
+        // Add the tensor only if there are tokens
+        if (actual_n_tokens > 0) {
+            writer.llama_gguf_writer_add_sequence_tensor(current_sequence_idx, tokens);
+        } else {
+            // If tokens were expected but the second pass produced none, print a warning.
+            // (The `expected_n_tokens != 0` condition is only meaningful for text files,
+            // where the first pass may have recorded a non-zero length for this line.)
+            if (params.dataset_format == "text" && expected_n_tokens != 0) {
+                fprintf(stderr, "warning: sequence %" PRIu64 " resulted in 0 tokens on second pass, but expected %u.\n",
+                        current_sequence_idx, expected_n_tokens);
+                // Continue, as this might be acceptable for some datasets,
+                // but warn about potential inconsistency.
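+                // Note: when a sequence is skipped, current_sequence_idx still advances,
+                // so the training.tensor.{index} numbering will have a gap for it.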
+ } + } + current_sequence_idx++; + } + reader->close(); // Close DataReader after use + printf("Second pass complete.\n\n"); + + // Save file to disk + printf("Writing GGUF data to %s...\n", params.out_file.c_str()); + if (!writer.llama_gguf_writer_write_to_file(params.out_file)) { + fprintf(stderr, "error: Failed to write GGUF file %s\n", params.out_file.c_str()); + return false; + } + + printf("Conversion successful!\n"); + printf("Output file: %s\n", params.out_file.c_str()); + + return true; +} diff --git a/tools/dataset-converter/dataset-to-gguf/llama-gguf-converter.h b/tools/dataset-converter/dataset-to-gguf/llama-gguf-converter.h new file mode 100644 index 0000000000000..43dbb489d610c --- /dev/null +++ b/tools/dataset-converter/dataset-to-gguf/llama-gguf-converter.h @@ -0,0 +1,15 @@ +#pragma once + +#include "llama.h" // For struct llama_model + +// Class encapsulating the high-level logic for converting +// input data to the GGUF format. +struct llama_gguf_converter { + // Default constructor. + llama_gguf_converter() = default; + + // Method to execute the conversion process. + // params: A structure containing all necessary parameters for conversion. + // Returns true on successful conversion, false on error. + bool llama_gguf_converter_convert(const struct common_params & params, const struct llama_model * model); +}; diff --git a/tools/dataset-converter/dataset-to-gguf/llama-gguf-file.cpp b/tools/dataset-converter/dataset-to-gguf/llama-gguf-file.cpp new file mode 100644 index 0000000000000..5571dbfee25db --- /dev/null +++ b/tools/dataset-converter/dataset-to-gguf/llama-gguf-file.cpp @@ -0,0 +1,163 @@ +#include +#include +#include + +#include "llama-gguf-file.h" + +// Default constructor: Initializes an empty GGUF context for writing. +llama_gguf_file::llama_gguf_file() : m_ctx(nullptr) { + m_ctx = gguf_init_empty(); + if (!m_ctx) { + throw std::runtime_error("Failed to initialize empty GGUF context."); + } +} + +// Constructor: Initializes a GGUF context from an existing file for reading. +// path: Path to the GGUF file to open. +llama_gguf_file::llama_gguf_file(const std::string & path) : m_ctx(nullptr) { + struct gguf_init_params params = {}; + // When reading, we do NOT want gguf_init_from_file to allocate a ggml_context + // for tensors, as we will manage data reading manually using file offsets. + params.no_alloc = true; // Ensure no allocation for tensor data by gguf_init_from_file + m_ctx = gguf_init_from_file(path.c_str(), params); + if (!m_ctx) { + throw std::runtime_error("Failed to initialize GGUF context from file: " + path); + } +} + +// Destructor: Frees the GGUF context. +llama_gguf_file::~llama_gguf_file() { + if (m_ctx) { + gguf_free(m_ctx); + m_ctx = nullptr; + } +} + +// Checks if the GGUF context is initialized. +// Returns true if initialized, false otherwise. +bool llama_gguf_file::llama_gguf_file_is_initialized() const { + return m_ctx != nullptr; +} + +// --- Methods for working with metadata (KV-pairs) --- + +// Sets a string value for a given key. +void llama_gguf_file::llama_gguf_file_set_val_str(const std::string & key, const std::string & value) { + if (!m_ctx) { + throw std::runtime_error("GGUF context not initialized."); + } + gguf_set_val_str(m_ctx, key.c_str(), value.c_str()); +} + +// Sets a uint64_t value for a given key. 
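+// (Stored as GGUF_TYPE_UINT64, so values round-trip exactly through
+// llama_gguf_file_get_val_u64() below.)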
diff --git a/tools/dataset-converter/dataset-to-gguf/llama-gguf-file.cpp b/tools/dataset-converter/dataset-to-gguf/llama-gguf-file.cpp
new file mode 100644
index 0000000000000..5571dbfee25db
--- /dev/null
+++ b/tools/dataset-converter/dataset-to-gguf/llama-gguf-file.cpp
@@ -0,0 +1,163 @@
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+#include "llama-gguf-file.h"
+
+// Default constructor: Initializes an empty GGUF context for writing.
+llama_gguf_file::llama_gguf_file() : m_ctx(nullptr) {
+    m_ctx = gguf_init_empty();
+    if (!m_ctx) {
+        throw std::runtime_error("Failed to initialize empty GGUF context.");
+    }
+}
+
+// Constructor: Initializes a GGUF context from an existing file for reading.
+// path: Path to the GGUF file to open.
+llama_gguf_file::llama_gguf_file(const std::string & path) : m_ctx(nullptr) {
+    struct gguf_init_params params = {};
+    // When reading, we do NOT want gguf_init_from_file to allocate a ggml_context
+    // for tensors, as we will manage data reading manually using file offsets.
+    params.no_alloc = true; // Ensure no allocation for tensor data by gguf_init_from_file
+    m_ctx = gguf_init_from_file(path.c_str(), params);
+    if (!m_ctx) {
+        throw std::runtime_error("Failed to initialize GGUF context from file: " + path);
+    }
+}
+
+// Destructor: Frees the GGUF context.
+llama_gguf_file::~llama_gguf_file() {
+    if (m_ctx) {
+        gguf_free(m_ctx);
+        m_ctx = nullptr;
+    }
+}
+
+// Checks if the GGUF context is initialized.
+// Returns true if initialized, false otherwise.
+bool llama_gguf_file::llama_gguf_file_is_initialized() const {
+    return m_ctx != nullptr;
+}
+
+// --- Methods for working with metadata (KV-pairs) ---
+
+// Sets a string value for a given key.
+void llama_gguf_file::llama_gguf_file_set_val_str(const std::string & key, const std::string & value) {
+    if (!m_ctx) {
+        throw std::runtime_error("GGUF context not initialized.");
+    }
+    gguf_set_val_str(m_ctx, key.c_str(), value.c_str());
+}
+
+// Sets a uint64_t value for a given key.
+void llama_gguf_file::llama_gguf_file_set_val_u64(const std::string & key, uint64_t value) {
+    if (!m_ctx) {
+        throw std::runtime_error("GGUF context not initialized.");
+    }
+    gguf_set_val_u64(m_ctx, key.c_str(), value);
+}
+
+// Sets an array of strings for a given key.
+void llama_gguf_file::llama_gguf_file_set_arr_str(const std::string & key, const std::vector<const char *> & values) {
+    if (!m_ctx) {
+        throw std::runtime_error("GGUF context not initialized.");
+    }
+    gguf_set_arr_str(m_ctx, key.c_str(), const_cast<const char **>(values.data()), values.size());
+}
+
+// Sets an array of data of a specified GGUF type for a given key.
+void llama_gguf_file::llama_gguf_file_set_arr_data(const std::string & key, gguf_type type, const void * data,
+                                                   size_t n) {
+    if (!m_ctx) {
+        throw std::runtime_error("GGUF context not initialized.");
+    }
+    gguf_set_arr_data(m_ctx, key.c_str(), type, data, n);
+}
+
+// Gets a string value by key.
+std::string llama_gguf_file::llama_gguf_file_get_val_str(const std::string & key,
+                                                         const std::string & default_value) const {
+    if (!m_ctx) {
+        return default_value;
+    }
+    int64_t key_id = llama_gguf_file_find_key(key);
+    if (key_id == -1 || gguf_get_kv_type(m_ctx, key_id) != GGUF_TYPE_STRING) {
+        return default_value;
+    }
+    return gguf_get_val_str(m_ctx, key_id);
+}
+
+// Gets a uint64_t value by key.
+uint64_t llama_gguf_file::llama_gguf_file_get_val_u64(const std::string & key, uint64_t default_value) const {
+    if (!m_ctx) {
+        return default_value;
+    }
+    int64_t key_id = llama_gguf_file_find_key(key);
+    if (key_id == -1 || gguf_get_kv_type(m_ctx, key_id) != GGUF_TYPE_UINT64) {
+        return default_value;
+    }
+    return gguf_get_val_u64(m_ctx, key_id);
+}
+
+// --- Methods for working with tensors ---
+
+// Adds a ggml_tensor to the GGUF context.
+void llama_gguf_file::llama_gguf_file_add_tensor(struct ggml_tensor * tensor) {
+    if (!m_ctx) {
+        throw std::runtime_error("GGUF context not initialized.");
+    }
+    gguf_add_tensor(m_ctx, tensor);
+}
+
+// Sets the data for a tensor by its name.
+void llama_gguf_file::llama_gguf_file_set_tensor_data(const std::string & name, const void * data) {
+    if (!m_ctx) {
+        throw std::runtime_error("GGUF context not initialized.");
+    }
+    gguf_set_tensor_data(m_ctx, name.c_str(), data);
+}
+
+// Gets the number of tensors in the GGUF file.
+int64_t llama_gguf_file::llama_gguf_file_get_n_tensors() const {
+    if (!m_ctx) {
+        return 0;
+    }
+    return gguf_get_n_tensors(m_ctx);
+}
+
+// Gets the name of a tensor by index.
+std::string llama_gguf_file::llama_gguf_file_get_tensor_name(int64_t idx) const {
+    return gguf_get_tensor_name(m_ctx, idx);
+}
+
+// Gets the type of a tensor by index.
+enum ggml_type llama_gguf_file::llama_gguf_file_get_tensor_type(int64_t idx) const {
+    return gguf_get_tensor_type(m_ctx, idx);
+}
+
+// Gets the size of a tensor in bytes by index.
+size_t llama_gguf_file::llama_gguf_file_get_tensor_size(int64_t idx) const {
+    return gguf_get_tensor_size(m_ctx, idx);
+}
+
+// Writes the entire GGUF context (metadata and, optionally, tensor data) to a file.
+bool llama_gguf_file::llama_gguf_file_write_to_file(const std::string & output_path, bool only_meta) {
+    if (!m_ctx) {
+        std::cerr << "Error: GGUF context is not initialized. Cannot write to file." << std::endl;
+        return false;
+    }
+    if (!gguf_write_to_file(m_ctx, output_path.c_str(), only_meta)) {
+        std::cerr << "Error: Failed to write GGUF file to " << output_path << std::endl;
+        return false;
+    }
+    return true;
+}
+
+// Returns the underlying gguf_context.
+struct gguf_context * llama_gguf_file::get_gguf_context() {
+    return m_ctx;
+}
+
+// Finds a key by name; returns the key ID or -1 if not found.
+int64_t llama_gguf_file::llama_gguf_file_find_key(const std::string & key) const {
+    if (!m_ctx) {
+        return -1;
+    }
+    return gguf_find_key(m_ctx, key.c_str());
+}
diff --git a/tools/dataset-converter/dataset-to-gguf/llama-gguf-file.h b/tools/dataset-converter/dataset-to-gguf/llama-gguf-file.h
new file mode 100644
index 0000000000000..fc8abf0e9677b
--- /dev/null
+++ b/tools/dataset-converter/dataset-to-gguf/llama-gguf-file.h
@@ -0,0 +1,116 @@
+#pragma once
+
+#include <cstdint>   // For uint64_t, int64_t
+#include <stdexcept> // For std::runtime_error
+#include <string>    // For std::string
+#include <vector>    // For std::vector
+
+#include "ggml.h" // For struct ggml_tensor
+#include "gguf.h" // For struct gguf_context, enum gguf_type
+#include "llama.h"
+
+// Opaque type for the GGUF file handler.
+typedef struct llama_gguf_file llama_gguf_file_t;
+
+// Class for encapsulating GGUF file operations.
+// It manages the underlying gguf_context and provides a higher-level API
+// for setting metadata, adding tensors, and reading/writing files.
+struct llama_gguf_file {
+    // Default constructor: Initializes an empty GGUF context for writing.
+    llama_gguf_file();
+
+    // Constructor: Initializes a GGUF context from an existing file for reading.
+    // path: Path to the GGUF file to open.
+    // Throws std::runtime_error if the file cannot be opened or the context cannot be initialized.
+    llama_gguf_file(const std::string & path);
+
+    // Destructor: Frees the GGUF context.
+    ~llama_gguf_file();
+
+    // Checks if the GGUF context is initialized.
+    // Returns true if initialized, false otherwise.
+    bool llama_gguf_file_is_initialized() const;
+
+    // --- Methods for working with metadata (KV-pairs) ---
+
+    // Sets a string value for a given key.
+    // key: The metadata key.
+    // value: The string value to set.
+    void llama_gguf_file_set_val_str(const std::string & key, const std::string & value);
+
+    // Sets a uint64_t value for a given key.
+    // key: The metadata key.
+    // value: The uint64_t value to set.
+    void llama_gguf_file_set_val_u64(const std::string & key, uint64_t value);
+
+    // Sets an array of strings for a given key.
+    // key: The metadata key.
+    // values: A vector of C-style strings (const char *) to set.
+    void llama_gguf_file_set_arr_str(const std::string & key, const std::vector<const char *> & values);
+
+    // Sets an array of data of a specified GGUF type for a given key.
+    // key: The metadata key.
+    // type: The GGUF type of the data (e.g., GGUF_TYPE_INT32).
+    // data: Pointer to the data array.
+    // n: Number of elements in the data array.
+    void llama_gguf_file_set_arr_data(const std::string & key, gguf_type type, const void * data, size_t n);
+
+    // Gets a string value by key.
+    // key: The metadata key.
+    // default_value: The value to return if the key is not found or has a different type.
+    // Returns the string value or the default_value.
+    std::string llama_gguf_file_get_val_str(const std::string & key, const std::string & default_value = "") const;
+
+    // Gets a uint64_t value by key.
+    // key: The metadata key.
+    // default_value: The value to return if the key is not found or has a different type.
+    // Returns the uint64_t value or the default_value.
+    uint64_t llama_gguf_file_get_val_u64(const std::string & key, uint64_t default_value = 0) const;
+
+    // --- Methods for working with tensors ---
+
+    // Adds a ggml_tensor to the GGUF context.
+    // tensor: Pointer to the ggml_tensor to add.
+    void llama_gguf_file_add_tensor(struct ggml_tensor * tensor);
+
+    // Sets the data for a tensor by its name.
+    // name: The name of the tensor.
+    // data: Pointer to the tensor data.
+    void llama_gguf_file_set_tensor_data(const std::string & name, const void * data);
+
+    // Gets the number of tensors in the GGUF file.
+    // Returns the count of tensors.
+    int64_t llama_gguf_file_get_n_tensors(void) const;
+
+    // Gets the name of a tensor by index.
+    // idx: The index of the tensor.
+    // Returns the tensor name or an empty string if not found.
+    std::string llama_gguf_file_get_tensor_name(int64_t idx) const;
+
+    // Gets the type of a tensor by index.
+    // idx: The index of the tensor.
+    // Returns the ggml_type of the tensor.
+    enum ggml_type llama_gguf_file_get_tensor_type(int64_t idx) const;
+
+    // Gets the size of a tensor in bytes by index.
+    // idx: The index of the tensor.
+    // Returns the size of the tensor in bytes.
+    size_t llama_gguf_file_get_tensor_size(int64_t idx) const;
+
+    // --- Methods for saving/loading the file ---
+
+    // Writes the entire GGUF context to a file.
+    // output_path: Path to the output GGUF file.
+    // only_meta: If true, only metadata is written (no tensor data).
+    // Returns true on success, false on error.
+    bool llama_gguf_file_write_to_file(const std::string & output_path, bool only_meta);
+
+    // Returns the underlying gguf_context.
+    struct gguf_context * get_gguf_context();
+
+  private:
+    struct gguf_context * m_ctx; // The underlying GGUF context
+
+    // Private helper function to find a key by name.
+    // key: The key name to find.
+    // Returns the key ID or -1 if not found.
+    int64_t llama_gguf_file_find_key(const std::string & key) const;
+};
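+
+// Usage sketch (illustrative only; "data.gguf" is a hypothetical path):
+//
+//     llama_gguf_file out; // empty context for writing
+//     out.llama_gguf_file_set_val_str("training.format.version", "1.0");
+//     out.llama_gguf_file_write_to_file("data.gguf", /*only_meta=*/false);
+//
+//     llama_gguf_file in("data.gguf"); // read mode (no_alloc)
+//     std::string version = in.llama_gguf_file_get_val_str("training.format.version");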
" + "Cannot retrieve metadata for key '" + << key << "'." << std::endl; + return default_value; + } + return m_gguf_file_ptr->llama_gguf_file_get_val_str(key, default_value); +} + +// Gets a uint64_t metadata value by key. +uint64_t llama_gguf_reader::llama_gguf_reader_get_metadata_u64(const std::string & key, uint64_t default_value) const { + if (!llama_gguf_reader_is_initialized()) { + std::cerr << "Error (llama_gguf_reader::llama_gguf_reader_get_metadata_u64): GGUFReader is not initialized. " + "Cannot retrieve metadata for key '" + << key << "'." << std::endl; + return default_value; + } + return m_gguf_file_ptr->llama_gguf_file_get_val_u64(key, default_value); +} + +// Gets the number of tensors in the file. +int64_t llama_gguf_reader::llama_gguf_reader_get_tensor_count(void) const { + if (!llama_gguf_reader_is_initialized()) { + std::cerr << "Error (llama_gguf_reader::llama_gguf_reader_get_tensor_count): GGUFReader is not initialized. " + "Cannot get tensor count." + << std::endl; + return 0; + } + return m_gguf_file_ptr->llama_gguf_file_get_n_tensors(); +} + +// Gets the name of a tensor by index. +std::string llama_gguf_reader::llama_gguf_reader_get_tensor_name(int64_t index) const { + if (!llama_gguf_reader_is_initialized()) { + std::cerr << "Error (llama_gguf_reader::llama_gguf_reader_get_tensor_name): GGUFReader is not initialized. " + "Cannot get tensor name for index " + << index << "." << std::endl; + return ""; + } + return m_gguf_file_ptr->llama_gguf_file_get_tensor_name(index); +} + +// Gets the type of a tensor by index. +enum ggml_type llama_gguf_reader::llama_gguf_reader_get_tensor_type(int64_t index) const { + if (!llama_gguf_reader_is_initialized()) { + std::cerr << "Error (llama_gguf_reader::llama_gguf_reader_get_tensor_type): GGUFReader is not initialized. " + "Cannot get tensor type for index " + << index << "." << std::endl; + return GGML_TYPE_COUNT; // Unknown type + } + return m_gguf_file_ptr->llama_gguf_file_get_tensor_type(index); +} + +// Gets the size of a tensor in bytes by index. +size_t llama_gguf_reader::llama_gguf_reader_get_tensor_size(int64_t index) const { + if (!llama_gguf_reader_is_initialized()) { + std::cerr << "Error (llama_gguf_reader::llama_gguf_reader_get_tensor_size): GGUFReader is not initialized. " + "Cannot get tensor size for index " + << index << "." << std::endl; + return 0; + } + return m_gguf_file_ptr->llama_gguf_file_get_tensor_size(index); +} + +// Reads tensor data by index into a vector of llama_token. +bool llama_gguf_reader::llama_gguf_reader_read_tensor_data(int64_t index, std::vector & tokens) const { + if (!llama_gguf_reader_is_initialized()) { + std::cerr << "Error (GGUFReader::read_tensor_data): GGUFReader is not initialized. Cannot read tensor data." + << std::endl; + return false; + } + + struct gguf_context * ctx_internal = m_gguf_file_ptr->get_gguf_context(); + if (!ctx_internal) { + std::cerr << "Error (GGUFReader::read_tensor_data): Internal GGUF context is null in GGUFFile." << std::endl; + return false; + } + + if (index < 0 || index >= gguf_get_n_tensors(ctx_internal)) { + std::cerr << "Error (GGUFReader::read_tensor_data): Tensor with index " << index + << " not found or out of bounds." << std::endl; + return false; + } + + ggml_type tensor_ggml_type = gguf_get_tensor_type(ctx_internal, index); + if (tensor_ggml_type != GGML_TYPE_I32) { + std::cerr << "Error (GGUFReader::read_tensor_data): Tensor type for '" + << gguf_get_tensor_name(ctx_internal, index) + << "' is not GGML_TYPE_I32. 
Actual type: " << ggml_type_name(tensor_ggml_type) << std::endl; + return false; + } + + size_t expected_bytes = gguf_get_tensor_size(ctx_internal, index); + if (expected_bytes == 0) { + tokens.clear(); + return true; + } + + size_t num_tokens = expected_bytes / sizeof(llama_token); + if (expected_bytes % sizeof(llama_token) != 0) { + std::cerr << "Warning (GGUFReader::read_tensor_data): Tensor size " << expected_bytes + << " bytes is not a multiple of llama_token size (" << sizeof(llama_token) << " bytes) for tensor '" + << gguf_get_tensor_name(ctx_internal, index) << "'. Data might be corrupted." << std::endl; + } + + tokens.resize(num_tokens); + + size_t data_offset_in_file = gguf_get_data_offset(ctx_internal) + gguf_get_tensor_offset(ctx_internal, index); + + std::ifstream file(m_file_path, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Error (GGUFReader::read_tensor_data): Could not open GGUF file '" << m_file_path + << "' for reading tensor data." << std::endl; + return false; + } + + // Seek to the calculated offset + file.seekg(data_offset_in_file, std::ios::beg); + if (file.fail()) { + std::cerr << "Error (GGUFReader::read_tensor_data): Failed to seek to offset " << data_offset_in_file + << " in file '" << m_file_path << "'. Stream state: good=" << file.good() << " eof=" << file.eof() + << " fail=" << file.fail() << " bad=" << file.bad() << std::endl; + file.close(); + return false; + } + + // Read the tensor data into the tokens vector + file.read(reinterpret_cast(tokens.data()), expected_bytes); + + if (!file) { // Check if the read operation failed or reached EOF before reading all bytes + std::cerr << "Error (GGUFReader::read_tensor_data): Failed to read " << expected_bytes << " bytes for tensor '" + << gguf_get_tensor_name(ctx_internal, index) << "' from file '" << m_file_path << "'." << std::endl; + std::cerr << " Stream state after read: good=" << file.good() << " eof=" << file.eof() + << " fail=" << file.fail() << " bad=" << file.bad() << std::endl; + std::cerr << " Bytes actually read: " << file.gcount() << std::endl; + file.close(); + return false; + } + // Verify that the number of bytes read matches the expected bytes + if (file.gcount() != (std::streamsize) expected_bytes) { + std::cerr << "Error (GGUFReader::read_tensor_data): Mismatch in bytes read for tensor '" + << gguf_get_tensor_name(ctx_internal, index) << "'. Expected " << expected_bytes << ", but read " + << file.gcount() << "." << std::endl; + file.close(); + return false; + } + + file.close(); + return true; +} diff --git a/tools/dataset-converter/dataset-to-gguf/llama-gguf-reader.h b/tools/dataset-converter/dataset-to-gguf/llama-gguf-reader.h new file mode 100644 index 0000000000000..4a502691c2a14 --- /dev/null +++ b/tools/dataset-converter/dataset-to-gguf/llama-gguf-reader.h @@ -0,0 +1,55 @@ +#pragma once + +#include // For int64_t, uint64_t +#include // For std::unique_ptr +#include // For std::string +#include // For std::vector + +#include "ggml.h" // For ggml_type +#include "llama-gguf-file.h" // For llama_gguf_file_t +#include "llama.h" // For llama_token + +// Class for reading GGUF files, providing access to metadata and tensor data. +struct llama_gguf_reader { + // Constructor: Initializes the reader to read from the specified GGUF file. + // path: Path to the GGUF file. + // Throws std::runtime_error if the file cannot be opened or context cannot be initialized. + llama_gguf_reader(const std::string & path); + + // Destructor. 
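+
+// Note on the offset arithmetic in llama_gguf_reader_read_tensor_data (informal):
+// a GGUF file is laid out as [header | KV metadata | tensor infos | padding | data],
+// gguf_get_data_offset() returns the absolute start of the data section, and
+// gguf_get_tensor_offset() is relative to that section. For example, a tensor at
+// relative offset 128 in a file whose data section starts at byte 4096 is read
+// from absolute offset 4096 + 128 = 4224.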
diff --git a/tools/dataset-converter/dataset-to-gguf/llama-gguf-reader.h b/tools/dataset-converter/dataset-to-gguf/llama-gguf-reader.h
new file mode 100644
index 0000000000000..4a502691c2a14
--- /dev/null
+++ b/tools/dataset-converter/dataset-to-gguf/llama-gguf-reader.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#include <cstdint> // For int64_t, uint64_t
+#include <memory>  // For std::unique_ptr
+#include <string>  // For std::string
+#include <vector>  // For std::vector
+
+#include "ggml.h"            // For ggml_type
+#include "llama-gguf-file.h" // For llama_gguf_file_t
+#include "llama.h"           // For llama_token
+
+// Class for reading GGUF files, providing access to metadata and tensor data.
+struct llama_gguf_reader {
+    // Constructor: Initializes the reader to read from the specified GGUF file.
+    // path: Path to the GGUF file.
+    // Throws std::runtime_error if the file cannot be opened or the context cannot be initialized.
+    llama_gguf_reader(const std::string & path);
+
+    // Destructor.
+    ~llama_gguf_reader() = default;
+
+    // Checks if the reader has been successfully initialized.
+    bool llama_gguf_reader_is_initialized(void) const;
+
+    // Gets a string metadata value by key.
+    std::string llama_gguf_reader_get_metadata_str(const std::string & key,
+                                                   const std::string & default_value = "") const;
+
+    // Gets a uint64_t metadata value by key.
+    uint64_t llama_gguf_reader_get_metadata_u64(const std::string & key, uint64_t default_value = 0) const;
+
+    // Gets the number of tensors in the file.
+    int64_t llama_gguf_reader_get_tensor_count(void) const;
+
+    // Gets the name of a tensor by index.
+    std::string llama_gguf_reader_get_tensor_name(int64_t index) const;
+
+    // Gets the type of a tensor by index.
+    // Returns ggml_type.
+    enum ggml_type llama_gguf_reader_get_tensor_type(int64_t index) const;
+
+    // Gets the size of a tensor in bytes by index.
+    size_t llama_gguf_reader_get_tensor_size(int64_t index) const;
+
+    // Reads tensor data by index into a vector of llama_token.
+    // index: Index of the tensor.
+    // tokens: Vector the tokens are read into.
+    // Returns true on success, false on error (e.g., the tensor is not found,
+    // its type is not GGML_TYPE_I32, or the size does not match).
+    bool llama_gguf_reader_read_tensor_data(int64_t index, std::vector<llama_token> & tokens) const;
+
+  private:
+    std::unique_ptr<llama_gguf_file> m_gguf_file_ptr; // Pointer to the llama_gguf_file object
+    std::string                      m_file_path;     // Path to the file the GGUF data is read from
+};
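+
+// Usage sketch (illustrative only; "data.gguf" is a hypothetical path):
+//
+//     llama_gguf_reader reader("data.gguf");
+//     std::vector<llama_token> tokens;
+//     for (int64_t i = 0; i < reader.llama_gguf_reader_get_tensor_count(); ++i) {
+//         if (reader.llama_gguf_reader_read_tensor_data(i, tokens)) {
+//             // process the tokens of sequence i
+//         }
+//     }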
diff --git a/tools/dataset-converter/dataset-to-gguf/llama-gguf-writer.cpp b/tools/dataset-converter/dataset-to-gguf/llama-gguf-writer.cpp
new file mode 100644
index 0000000000000..599b0c137aa2e
--- /dev/null
+++ b/tools/dataset-converter/dataset-to-gguf/llama-gguf-writer.cpp
@@ -0,0 +1,115 @@
+#include "llama-gguf-writer.h"
+#include "llama-gguf-file.h"
+#include "llama.h" // For llama_model_get_vocab, llama_vocab_n_tokens, llama_vocab_get_text, llama_model_meta_val_str
+
+#include <cinttypes> // For PRIu64
+#include <cstdio>    // For snprintf
+#include <cstring>   // For memcpy
+#include <ctime>     // For time, gmtime, strftime
+#include <iostream>  // For std::cerr
+#include <stdexcept> // For std::runtime_error
+#include <vector>    // For std::vector
+
+// Constructor: takes a pointer to a llama_gguf_file object
+llama_gguf_writer::llama_gguf_writer(llama_gguf_file * m_gguf_file_ptr) : m_gguf_file(m_gguf_file_ptr) {
+    if (!m_gguf_file) {
+        throw std::runtime_error("llama_gguf_file pointer provided to llama_gguf_writer is null.");
+    }
+    if (!m_gguf_file->llama_gguf_file_is_initialized()) {
+        throw std::runtime_error("llama_gguf_file provided to llama_gguf_writer is not initialized.");
+    }
+}
+
+// Initializes the GGUF file metadata
+void llama_gguf_writer::llama_gguf_writer_init_metadata(const struct llama_model * model,
+                                                        const std::string & input_path, uint64_t sequence_count) {
+    if (!m_gguf_file) {
+        std::cerr << "Error: llama_gguf_file is not set. Cannot set metadata." << std::endl;
+        return;
+    }
+
+    m_gguf_file->llama_gguf_file_set_val_str("training.format.version", "1.0");
+    m_gguf_file->llama_gguf_file_set_val_str("training.dataset.name", input_path);
+
+    // Set file creation date
+    time_t now = time(0);
+    char   buf[sizeof "2011-10-08T07:07:09Z"];
+    strftime(buf, sizeof buf, "%Y-%m-%dT%H:%M:%SZ", gmtime(&now));
+    m_gguf_file->llama_gguf_file_set_val_str("training.file.creation_date", buf);
+
+    // Set tokenizer information
+    char arch_name_buffer[128];
+    int  res = llama_model_meta_val_str(model, "general.architecture", arch_name_buffer, sizeof(arch_name_buffer));
+    if (res >= 0) {
+        m_gguf_file->llama_gguf_file_set_val_str("training.tokenizer.gguf.model", arch_name_buffer);
+    } else {
+        m_gguf_file->llama_gguf_file_set_val_str("training.tokenizer.gguf.model", "unknown");
+    }
+
+    // Set tokenizer vocabulary
+    const struct llama_vocab * vocab      = llama_model_get_vocab(model);
+    int                        vocab_size = llama_vocab_n_tokens(vocab);
+    std::vector<const char *>  vocab_list;
+    vocab_list.reserve(vocab_size);
+    for (int i = 0; i < vocab_size; ++i) {
+        vocab_list.push_back(llama_vocab_get_text(vocab, i));
+    }
+    m_gguf_file->llama_gguf_file_set_arr_str("training.tokenizer.gguf.vocab", vocab_list);
+
+    // Set total sequence count
+    m_gguf_file->llama_gguf_file_set_val_u64("training.sequence.count", sequence_count);
+}
+
+// Adds a sequence of tokens to the GGUF file as a tensor
+void llama_gguf_writer::llama_gguf_writer_add_sequence_tensor(uint64_t index, const std::vector<llama_token> & tokens) {
+    if (!m_gguf_file) {
+        std::cerr << "Error: llama_gguf_file is not set. Cannot add sequence tensor." << std::endl;
+        return;
+    }
+
+    if (tokens.empty()) {
+        return;
+    }
+
+    char tensor_name[128];
+    snprintf(tensor_name, sizeof(tensor_name), "training.tensor.%" PRIu64, index);
+
+    // Allocate enough memory for a temporary ggml_context to hold the tensor
+    size_t n_tokens        = tokens.size();
+    size_t tensor_mem_size = ggml_tensor_overhead() + n_tokens * sizeof(int32_t);
+
+    struct ggml_init_params ggml_params = {};
+    ggml_params.mem_size   = tensor_mem_size;
+    ggml_params.mem_buffer = nullptr;
+    ggml_params.no_alloc   = false;
+
+    struct ggml_context * ggml_ctx = ggml_init(ggml_params);
+    if (!ggml_ctx) {
+        std::cerr << "Error: Failed to initialize ggml context for tensor " << index << std::endl;
+        return;
+    }
+
+    // Create a 1D tensor of type GGML_TYPE_I32
+    struct ggml_tensor * tensor = ggml_new_tensor_1d(ggml_ctx, GGML_TYPE_I32, n_tokens);
+    ggml_set_name(tensor, tensor_name);
+
+    // Copy token data to the tensor buffer
+    memcpy(tensor->data, tokens.data(), n_tokens * sizeof(int32_t));
+
+    // Add the tensor to the GGUF context via llama_gguf_file
+    m_gguf_file->llama_gguf_file_add_tensor(tensor);
+
+    // Set tensor data in the GGUF context via llama_gguf_file
+    m_gguf_file->llama_gguf_file_set_tensor_data(tensor_name, tokens.data());
+
+    ggml_free(ggml_ctx); // Free the temporary ggml context
+}
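+
+// Sizing note (informal): the temporary context above only needs to hold one
+// tensor header plus the token payload. ggml_tensor_overhead() is typically a
+// few hundred bytes, so a 1000-token sequence costs roughly
+// overhead + 1000 * sizeof(int32_t) = overhead + 4000 bytes, and the
+// per-sequence context stays cheap even for long inputs.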
+
+// Writes the entire GGUF context (metadata and tensors) to the specified file
+bool llama_gguf_writer::llama_gguf_writer_write_to_file(const std::string & output_path) {
+    if (!m_gguf_file) {
+        std::cerr << "Error: llama_gguf_file is not set. Cannot write to file." << std::endl;
+        return false;
+    }
+    return m_gguf_file->llama_gguf_file_write_to_file(output_path, false);
+}
diff --git a/tools/dataset-converter/dataset-to-gguf/llama-gguf-writer.h b/tools/dataset-converter/dataset-to-gguf/llama-gguf-writer.h
new file mode 100644
index 0000000000000..134d06c3091b7
--- /dev/null
+++ b/tools/dataset-converter/dataset-to-gguf/llama-gguf-writer.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <string> // For std::string
+#include <vector> // For std::vector
+
+#include "llama-gguf-file.h"
+#include "llama.h" // For llama_token
+
+// Class for encapsulating the GGUF file writing logic.
+// It uses llama_gguf_file for the low-level operations.
+struct llama_gguf_writer {
+    // Constructor, takes a pointer to a llama_gguf_file object.
+    // m_gguf_file: pointer to an initialized llama_gguf_file object,
+    // which will be used for writing.
+    llama_gguf_writer(llama_gguf_file * m_gguf_file);
+
+    // Destructor (does not free m_gguf_file, as it is managed externally).
+    ~llama_gguf_writer() = default;
+
+    // Initializes the GGUF file metadata.
+    // model: pointer to the loaded llama model, used to obtain tokenizer information.
+    // input_path: path to the input file, used for the dataset name.
+    // sequence_count: total number of sequences.
+    void llama_gguf_writer_init_metadata(const struct llama_model * model, const std::string & input_path,
+                                         uint64_t sequence_count);
+
+    // Adds a sequence of tokens to the GGUF file as a tensor.
+    // index: sequence index (used for the tensor name).
+    // tokens: vector of tokens representing the sequence.
+    void llama_gguf_writer_add_sequence_tensor(uint64_t index, const std::vector<llama_token> & tokens);
+
+    // Writes the entire GGUF context (metadata and tensors) to the specified file.
+    // output_path: path to the output GGUF file.
+    // Returns true on success, false on error.
+    bool llama_gguf_writer_write_to_file(const std::string & output_path);
+
+  private:
+    llama_gguf_file * m_gguf_file; // Pointer to the llama_gguf_file object
+};
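+
+// Usage sketch (illustrative only; it mirrors the CreateTestllama_gguf_file
+// helper in the tests, and assumes `model` is a loaded llama_model):
+//
+//     llama_gguf_file   file; // owned by the caller, outlives the writer
+//     llama_gguf_writer writer(&file);
+//
+//     writer.llama_gguf_writer_init_metadata(model, "input.txt", /*sequence_count=*/1);
+//     std::vector<llama_token> seq = { 1, 2, 3 };
+//     writer.llama_gguf_writer_add_sequence_tensor(0, seq);
+//     writer.llama_gguf_writer_write_to_file("out.gguf");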
diff --git a/tools/dataset-converter/dataset-to-gguf/tests/dataset-to-gguf-tests.cpp b/tools/dataset-converter/dataset-to-gguf/tests/dataset-to-gguf-tests.cpp
new file mode 100644
index 0000000000000..cf23d99666c8f
--- /dev/null
+++ b/tools/dataset-converter/dataset-to-gguf/tests/dataset-to-gguf-tests.cpp
@@ -0,0 +1,563 @@
+#include <cassert>    // For assert
+#include <filesystem> // For working with the file system (creating/deleting temporary files)
+#include <fstream>
+#include <iostream>   // For std::cerr
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "../llama-dataset-reader/llama-text-data-reader.h"
+#include "common.h"
+#include "dataset-to-gguf/llama-gguf-converter.h"
+#include "dataset-to-gguf/llama-gguf-reader.h"
+#include "dataset-to-gguf/llama-gguf-writer.h"
+#include "llama.h" // For llama_backend_init, llama_backend_free, llama_model_load_from_file, llama_model_free
+
+namespace fs = std::filesystem;
+
+// Global variables for tests requiring llama_model
+static llama_model * g_llama_model     = nullptr;
+static std::string   g_test_model_path = "../../gte-small.Q2_K.gguf"; // Specify the actual path to your model
+
+// Helper for assertions
+#define TEST_ASSERT(condition, message) \
+    do { \
+        if (!(condition)) { \
+            std::cerr << "Assertion failed: " << message << " (" << __FILE__ << ":" << __LINE__ << ")" << std::endl; \
+            return false; \
+        } \
+    } while (0)
+
+bool SetUpLlamaBackend();
+bool Testllama_gguf_file_DefaultConstructorInitializesContext();
+bool Testllama_gguf_file_ConstructorFromFileThrowsOnError();
+bool Testllama_gguf_file_SetAndGetMetadataString();
+bool Testllama_gguf_file_SetAndGetMetadataU64();
+bool Testllama_gguf_file_SetAndGetMetadataStringArray();
+bool CreateTestllama_gguf_file(const std::string & path, llama_model * model_ptr);
+bool Testllama_gguf_reader_ConstructorInitializesFromFile();
+bool Testllama_gguf_reader_GetMetadata();
+bool Testllama_gguf_reader_GetTensorCount();
+bool Testllama_gguf_reader_GetTensorNameAndTypeAndSize();
+bool Testllama_gguf_reader_ReadTensorData();
+bool Testllama_gguf_reader_ReadTensorDataInvalidIndex();
+bool TestTextDataReader_OpenFile();
+bool TestTextDataReader_ReadNextSequenceTextMode();
+bool TestTextDataReader_ReadNextSequencePreTokenizedMode();
+bool TestTextDataReader_ResetFunctionality();
+bool TestTextDataReader_GetTotalSequences();
+bool Testllama_gguf_converter_ConvertTextFileSuccess();
+void TearDownLlamaBackend();
+
+// Global setup for the llama.cpp backend
+bool SetUpLlamaBackend() {
+    llama_backend_init();
+    // Load the model for the tokenizer
+    llama_model_params model_params = llama_model_default_params();
+    g_llama_model = llama_model_load_from_file(g_test_model_path.c_str(), model_params);
+    if (g_llama_model == nullptr) {
+        std::cerr << "WARNING: Failed to load llama model for tests from " << g_test_model_path
+                  << ". Some tests may be skipped or fail." << std::endl;
+        // It's okay to continue if model loading fails, but tests relying on it will skip.
+    }
+    return true;
+}
+
+// Global teardown for the llama.cpp backend
+void TearDownLlamaBackend() {
+    if (g_llama_model) {
+        llama_model_free(g_llama_model);
+        g_llama_model = nullptr;
+    }
+    llama_backend_free();
+}
+
+// =============================================================================
+// Tests for llama_gguf_file
+// =============================================================================
+
+bool Testllama_gguf_file_DefaultConstructorInitializesContext() {
+    printf("  Testllama_gguf_file_DefaultConstructorInitializesContext\n");
+    llama_gguf_file gguf_file;
+    TEST_ASSERT(gguf_file.llama_gguf_file_is_initialized(),
+                "llama_gguf_file should be initialized by default constructor");
+    return true;
+}
+
+bool Testllama_gguf_file_ConstructorFromFileThrowsOnError() {
+    printf("  Testllama_gguf_file_ConstructorFromFileThrowsOnError\n");
+    bool threw_exception = false;
+    try {
+        llama_gguf_file("non_existent_file.gguf");
+    } catch (const std::runtime_error &) {
+        threw_exception = true;
+    }
+    TEST_ASSERT(threw_exception, "Constructor should throw for non-existent file");
+    return true;
+}
+
+bool Testllama_gguf_file_SetAndGetMetadataString() {
+    printf("  Testllama_gguf_file_SetAndGetMetadataString\n");
+    llama_gguf_file gguf_file;
+    gguf_file.llama_gguf_file_set_val_str("test.key.string", "test_value");
+    TEST_ASSERT(gguf_file.llama_gguf_file_get_val_str("test.key.string") == "test_value",
+                "Failed to get correct string value");
+    TEST_ASSERT(gguf_file.llama_gguf_file_get_val_str("non.existent.key", "default_value") == "default_value",
+                "Failed to get default string value");
+    return true;
+}
+
+bool Testllama_gguf_file_SetAndGetMetadataU64() {
+    printf("  Testllama_gguf_file_SetAndGetMetadataU64\n");
+    llama_gguf_file gguf_file;
+    gguf_file.llama_gguf_file_set_val_u64("test.key.u64", 12345ULL);
+    TEST_ASSERT(gguf_file.llama_gguf_file_get_val_u64("test.key.u64") == 12345ULL, "Failed to get correct u64 value");
+    TEST_ASSERT(gguf_file.llama_gguf_file_get_val_u64("non.existent.key.u64", 99ULL) == 99ULL,
+                "Failed to get default u64 value");
+    return true;
+}
+
+bool Testllama_gguf_file_SetAndGetMetadataStringArray() {
+    printf("  Testllama_gguf_file_SetAndGetMetadataStringArray\n");
+    llama_gguf_file gguf_file;
+    std::vector<const char *> arr = { "val1", "val2", "val3" };
+    gguf_file.llama_gguf_file_set_arr_str("test.key.array_str", arr);
+    // As noted before, verifying the array content would require more complex logic to read the GGUF file back.
+    // For now, we assert that the operation does not crash.
+    return true;
+}
+
+// =============================================================================
+// Tests for llama_gguf_reader
+// =============================================================================
+
+// Helper to create a temporary GGUF file for llama_gguf_reader tests
+bool CreateTestllama_gguf_file(const std::string & path, llama_model * model_ptr) {
+    llama_gguf_file   writer_file;
+    llama_gguf_writer writer(&writer_file);
+
+    writer.llama_gguf_writer_init_metadata(model_ptr, "dummy_input.txt", 2); // 2 sequences
+
+    std::vector<llama_token> seq1 = { 1, 2, 3, 4, 5 };
+    std::vector<llama_token> seq2 = { 10, 20, 30 };
+    writer.llama_gguf_writer_add_sequence_tensor(0, seq1);
+    writer.llama_gguf_writer_add_sequence_tensor(1, seq2);
+
+    return writer.llama_gguf_writer_write_to_file(path);
+}
+
+bool Testllama_gguf_reader_ConstructorInitializesFromFile() {
+    printf("  Testllama_gguf_reader_ConstructorInitializesFromFile\n");
+    std::string test_gguf_path = "test_output_reader.gguf";
+    TEST_ASSERT(CreateTestllama_gguf_file(test_gguf_path, g_llama_model),
+                "Failed to create test GGUF file for reader test");
+
+    llama_gguf_reader reader(test_gguf_path);
+    TEST_ASSERT(reader.llama_gguf_reader_is_initialized(), "llama_gguf_reader should be initialized from file");
+    fs::remove(test_gguf_path);
+    return true;
+}
+
+bool Testllama_gguf_reader_GetMetadata() {
+    printf("  Testllama_gguf_reader_GetMetadata\n");
+    std::string test_gguf_path = "test_output_reader_meta.gguf";
+    TEST_ASSERT(CreateTestllama_gguf_file(test_gguf_path, g_llama_model),
+                "Failed to create test GGUF file for reader meta test");
+
+    llama_gguf_reader reader(test_gguf_path);
+    TEST_ASSERT(reader.llama_gguf_reader_get_metadata_str("training.dataset.name") == "dummy_input.txt",
+                "Incorrect dataset name");
+    TEST_ASSERT(reader.llama_gguf_reader_get_metadata_u64("training.sequence.count") == 2ULL,
+                "Incorrect sequence count");
+    // The tokenizer model name might vary, so just check it is not empty/default if the model was loaded
+    if (g_llama_model) {
+        TEST_ASSERT(reader.llama_gguf_reader_get_metadata_str("training.tokenizer.gguf.model", "default") != "default",
+                    "Tokenizer model name should not be default");
+    }
+    fs::remove(test_gguf_path);
+    return true;
+}
+
+bool Testllama_gguf_reader_GetTensorCount() {
+    printf("  Testllama_gguf_reader_GetTensorCount\n");
+    std::string test_gguf_path = "test_output_reader_count.gguf";
+    TEST_ASSERT(CreateTestllama_gguf_file(test_gguf_path, g_llama_model),
+                "Failed to create test GGUF file for reader count test");
+
+    llama_gguf_reader reader(test_gguf_path);
+    TEST_ASSERT(reader.llama_gguf_reader_get_tensor_count() == 2, "Incorrect tensor count");
+    fs::remove(test_gguf_path);
+    return true;
+}
+
+bool Testllama_gguf_reader_GetTensorNameAndTypeAndSize() {
+    printf("  Testllama_gguf_reader_GetTensorNameAndTypeAndSize\n");
+    std::string test_gguf_path = "test_output_reader_tensor_info.gguf";
+    TEST_ASSERT(CreateTestllama_gguf_file(test_gguf_path, g_llama_model),
+                "Failed to create test GGUF file for reader tensor info test");
+
+    llama_gguf_reader reader(test_gguf_path);
+    TEST_ASSERT(reader.llama_gguf_reader_get_tensor_name(0) == "training.tensor.0",
+                "Incorrect tensor name for index 0");
+    TEST_ASSERT(reader.llama_gguf_reader_get_tensor_type(0) == GGML_TYPE_I32, "Incorrect tensor type for index 0");
+    TEST_ASSERT(reader.llama_gguf_reader_get_tensor_size(0) == 5 * sizeof(llama_token),
+                "Incorrect tensor size for index 0");
+
+    TEST_ASSERT(reader.llama_gguf_reader_get_tensor_name(1) == "training.tensor.1",
+                "Incorrect tensor name for index 1");
+    TEST_ASSERT(reader.llama_gguf_reader_get_tensor_type(1) == GGML_TYPE_I32, "Incorrect tensor type for index 1");
+    TEST_ASSERT(reader.llama_gguf_reader_get_tensor_size(1) == 3 * sizeof(llama_token),
+                "Incorrect tensor size for index 1");
+    fs::remove(test_gguf_path);
+    return true;
+}
+
+bool Testllama_gguf_reader_ReadTensorData() {
+    printf("  Testllama_gguf_reader_ReadTensorData\n");
+    std::string test_gguf_path = "test_output_reader_data.gguf";
+    TEST_ASSERT(CreateTestllama_gguf_file(test_gguf_path, g_llama_model),
+                "Failed to create test GGUF file for reader data test");
+
+    llama_gguf_reader reader(test_gguf_path);
+    std::vector<llama_token> tokens;
+
+    // Read the first sequence
+    TEST_ASSERT(reader.llama_gguf_reader_read_tensor_data(0, tokens), "Failed to read tensor data for index 0");
+    TEST_ASSERT(tokens.size() == 5, "Incorrect token count for index 0");
+    TEST_ASSERT(tokens[0] == 1, "Incorrect token value at index 0, pos 0");
+    TEST_ASSERT(tokens[4] == 5, "Incorrect token value at index 0, pos 4");
+
+    // Read the second sequence
+    TEST_ASSERT(reader.llama_gguf_reader_read_tensor_data(1, tokens), "Failed to read tensor data for index 1");
+    TEST_ASSERT(tokens.size() == 3, "Incorrect token count for index 1");
+    TEST_ASSERT(tokens[0] == 10, "Incorrect token value at index 1, pos 0");
+    TEST_ASSERT(tokens[2] == 30, "Incorrect token value at index 1, pos 2");
+    fs::remove(test_gguf_path);
+    return true;
+}
+
+bool Testllama_gguf_reader_ReadTensorDataInvalidIndex() {
+    printf("  Testllama_gguf_reader_ReadTensorDataInvalidIndex\n");
+    std::string test_gguf_path = "test_output_reader_invalid_idx.gguf";
+    TEST_ASSERT(CreateTestllama_gguf_file(test_gguf_path, g_llama_model),
+                "Failed to create test GGUF file for reader invalid index test");
+
+    llama_gguf_reader reader(test_gguf_path);
+    std::vector<llama_token> tokens;
+    TEST_ASSERT(!reader.llama_gguf_reader_read_tensor_data(99, tokens), "Reading invalid index should fail");
+    fs::remove(test_gguf_path);
+    return true;
+}
+
+// =============================================================================
+// Tests for TextDataReader
+// =============================================================================
+
+// Helper to set up TextDataReader test files
+struct TextDataReaderTestFixture {
+    std::string   test_text_file         = "test_input.txt";
+    std::string   test_pretokenized_file = "test_pretokenized.txt";
+    llama_model * model_for_reader_test  = nullptr;
+
+    TextDataReaderTestFixture(llama_model * model) : model_for_reader_test(model) {
+        // Create the test text file
+        std::ofstream ofs(test_text_file);
+        ofs << "Hello world\n";
+        ofs << "This is a test line.\n";
+        ofs << "\n"; // Empty line
+        ofs << "Another line";
+        ofs.close();
+
+        // Create the test pre-tokenized file
+        std::ofstream ofs_pretokenized(test_pretokenized_file);
+        ofs_pretokenized << "101 200 300 102\n";
+        ofs_pretokenized << "500 600\n";
+        ofs_pretokenized << "\n"; // Empty line
+        ofs_pretokenized << "700";
+        ofs_pretokenized.close();
+    }
+
+    ~TextDataReaderTestFixture() {
+        fs::remove(test_text_file);
+        fs::remove(test_pretokenized_file);
+    }
+};
+
+bool TestTextDataReader_OpenFile() {
+    printf("  TestTextDataReader_OpenFile\n");
+    TextDataReaderTestFixture fixture(g_llama_model);
+    llama_text_dataset_reader reader(fixture.model_for_reader_test, 128, false);
+    TEST_ASSERT(reader.open(fixture.test_text_file), "Failed to open valid text file");
+    reader.close();
+    TEST_ASSERT(!reader.open("non_existent.txt"), "Opened non-existent file unexpectedly");
+    return true;
+}
+
+bool TestTextDataReader_ReadNextSequenceTextMode() {
+    printf("  TestTextDataReader_ReadNextSequenceTextMode\n");
+    if (g_llama_model == nullptr) {
+        printf("  Skipping: Llama model not loaded.\n");
+        return true; // Skip the test gracefully
+    }
+
+    TextDataReaderTestFixture fixture(g_llama_model);
+    llama_text_dataset_reader reader(fixture.model_for_reader_test, 128, false);
+    TEST_ASSERT(reader.open(fixture.test_text_file), "Failed to open text file for read test");
+
+    std::vector<llama_token> tokens;
+
+    // Read "Hello world"
+    TEST_ASSERT(reader.read_next_sequence(tokens), "Failed to read first sequence");
+    TEST_ASSERT(!tokens.empty(), "First sequence should not be empty");
+
+    // Read "This is a test line."
+    TEST_ASSERT(reader.read_next_sequence(tokens), "Failed to read second sequence");
+    TEST_ASSERT(!tokens.empty(), "Second sequence should not be empty");
+
+    // Read the empty line
+    TEST_ASSERT(reader.read_next_sequence(tokens), "Failed to read empty line");
+    TEST_ASSERT(tokens.empty(), "Empty line should result in 0 tokens");
+
+    // Read "Another line"
+    TEST_ASSERT(reader.read_next_sequence(tokens), "Failed to read third sequence");
+    TEST_ASSERT(!tokens.empty(), "Third sequence should not be empty");
+
+    // End of file
+    TEST_ASSERT(!reader.read_next_sequence(tokens), "Should be end of file");
+    reader.close();
+    return true;
+}
+
+bool TestTextDataReader_ReadNextSequencePreTokenizedMode() {
+    printf("  TestTextDataReader_ReadNextSequencePreTokenizedMode\n");
+    TextDataReaderTestFixture fixture(g_llama_model);
+    llama_text_dataset_reader reader(fixture.model_for_reader_test, 128, true);
+    TEST_ASSERT(reader.open(fixture.test_pretokenized_file), "Failed to open pre-tokenized file for read test");
+
+    std::vector<llama_token> tokens;
+
+    // Read "101 200 300 102"
+    TEST_ASSERT(reader.read_next_sequence(tokens), "Failed to read first pre-tokenized sequence");
+    TEST_ASSERT(tokens.size() == 4, "Incorrect token count for first pre-tokenized sequence");
+    TEST_ASSERT(tokens[0] == 101, "Incorrect token value for first pre-tokenized sequence");
+    TEST_ASSERT(tokens[1] == 200, "Incorrect token value for first pre-tokenized sequence");
+
+    // Read "500 600"
+    TEST_ASSERT(reader.read_next_sequence(tokens), "Failed to read second pre-tokenized sequence");
+    TEST_ASSERT(tokens.size() == 2, "Incorrect token count for second pre-tokenized sequence");
+    TEST_ASSERT(tokens[0] == 500, "Incorrect token value for second pre-tokenized sequence");
+
+    // Read the empty line
+    TEST_ASSERT(reader.read_next_sequence(tokens), "Failed to read empty pre-tokenized line");
+    TEST_ASSERT(tokens.empty(), "Empty pre-tokenized line should result in 0 tokens");
+
+    // Read "700"
+    TEST_ASSERT(reader.read_next_sequence(tokens), "Failed to read third pre-tokenized sequence");
+    TEST_ASSERT(tokens.size() == 1, "Incorrect token count for third pre-tokenized sequence");
+    TEST_ASSERT(tokens[0] == 700, "Incorrect token value for third pre-tokenized sequence");
+
+    // End of file
+    TEST_ASSERT(!reader.read_next_sequence(tokens), "Should be end of pre-tokenized file");
+    reader.close();
+    return true;
+}
+
+bool TestTextDataReader_ResetFunctionality() {
+    printf("  TestTextDataReader_ResetFunctionality\n");
+    TextDataReaderTestFixture fixture(g_llama_model);
+    llama_text_dataset_reader reader(fixture.model_for_reader_test, 128, false);
+    TEST_ASSERT(reader.open(fixture.test_text_file), "Failed to open text file for reset test");
+
+    std::vector<llama_token> tokens;
+    reader.read_next_sequence(tokens); // Read one line
+    reader.read_next_sequence(tokens); // Read another line
+
+    TEST_ASSERT(reader.reset(), "Failed to reset reader"); // Reset to the beginning
+
+    // Should read the first line again
+    TEST_ASSERT(reader.read_next_sequence(tokens), "Failed to read first sequence after reset");
+    // (Add a specific token check if the expected tokens for "Hello world" are known)
+    reader.close();
+    return true;
+}
+
+bool TestTextDataReader_GetTotalSequences() {
+    printf("  TestTextDataReader_GetTotalSequences\n");
+    TextDataReaderTestFixture fixture(g_llama_model);
+
+    llama_text_dataset_reader reader_text(fixture.model_for_reader_test, 128, false);
+    TEST_ASSERT(reader_text.open(fixture.test_text_file), "Failed to open text file for total sequences test");
+    TEST_ASSERT(reader_text.total_sequences() == 4,
+                "Incorrect total sequence count for text file"); // 4 lines in test_input.txt
+    reader_text.close();
+
+    llama_text_dataset_reader reader_pretokenized(fixture.model_for_reader_test, 128, true);
+    TEST_ASSERT(reader_pretokenized.open(fixture.test_pretokenized_file),
+                "Failed to open pre-tokenized file for total sequences test");
+    TEST_ASSERT(reader_pretokenized.total_sequences() == 4,
+                "Incorrect total sequence count for pre-tokenized file"); // 4 lines in test_pretokenized.txt
+    reader_pretokenized.close();
+    return true;
+}
+
+// =============================================================================
+// Tests for llama_gguf_converter (integration)
+// =============================================================================
+
+// Helper to set up llama_gguf_converter test files
+struct llama_gguf_converterTestFixture {
+    std::string   input_text_file  = "converter_input.txt";
+    std::string   output_gguf_file = "converter_output.gguf";
+    llama_model * model_for_converter_test = nullptr;
+
+    llama_gguf_converterTestFixture(llama_model * model) : model_for_converter_test(model) {
+        // Create the test text file
+        std::ofstream ofs(input_text_file);
+        ofs << "The quick brown fox jumps over the lazy dog.\n";
+        ofs << "Hello, GGUF conversion!\n";
+        ofs.close();
+    }
+
+    ~llama_gguf_converterTestFixture() {
+        fs::remove(input_text_file);
+        fs::remove(output_gguf_file);
+    }
+};
+
+bool Testllama_gguf_converter_ConvertTextFileSuccess() {
+    printf("  Testllama_gguf_converter_ConvertTextFileSuccess\n");
+    if (g_llama_model == nullptr) {
+        printf("  Skipping: Llama model not loaded.\n");
+        return true; // Skip the test gracefully
+    }
+
+    llama_gguf_converterTestFixture fixture(g_llama_model);
+
+    common_params params;
+    params.in_files.push_back(fixture.input_text_file);
+    params.out_file       = fixture.output_gguf_file;
+    params.max_seq_len    = 128;
+    params.pre_tokenized  = false;
+    params.dataset_format = "text";
+    params.dataset_column = "data"; // Not used for text, but set for completeness
+
+    llama_gguf_converter converter;
+    TEST_ASSERT(converter.llama_gguf_converter_convert(params, g_llama_model), "GGUF conversion failed");
+
+    // Verify the file was created
+    TEST_ASSERT(fs::exists(fixture.output_gguf_file), "Output GGUF file was not created");
+
+    // Verify the GGUF file content using llama_gguf_reader
+    llama_gguf_reader reader(fixture.output_gguf_file);
+    TEST_ASSERT(reader.llama_gguf_reader_is_initialized(), "llama_gguf_reader failed to initialize for verification");
+    TEST_ASSERT(reader.llama_gguf_reader_get_metadata_u64("training.sequence.count") == 2ULL,
+                "Incorrect sequence count in GGUF metadata");
+    TEST_ASSERT(reader.llama_gguf_reader_get_tensor_count() == 2, "Incorrect tensor count in GGUF file");
+
+    std::vector<llama_token> tokens;
+    TEST_ASSERT(reader.llama_gguf_reader_read_tensor_data(0, tokens), "Failed to read first tensor data");
+    TEST_ASSERT(!tokens.empty(), "First sequence should not be empty");
+
+    TEST_ASSERT(reader.llama_gguf_reader_read_tensor_data(1, tokens), "Failed to read second tensor data");
+    TEST_ASSERT(!tokens.empty(), "Second sequence should not be empty");
+    return true;
+}
+
+// =============================================================================
+// Main function to run all tests
+// =============================================================================
+
+int main() {
+    printf("Running dataset-to-gguf tests...\n\n");
+
+    // Global setup for the llama.cpp backend
+    if (!SetUpLlamaBackend()) {
+        printf("Global setup failed. Exiting tests.\n");
+        return 1;
+    }
+
+    int failed_tests = 0;
+
+    // Run llama_gguf_file tests
+    printf("--- llama_gguf_file Tests ---\n");
+    if (!Testllama_gguf_file_DefaultConstructorInitializesContext()) {
+        failed_tests++;
+    }
+    if (!Testllama_gguf_file_ConstructorFromFileThrowsOnError()) {
+        failed_tests++;
+    }
+    if (!Testllama_gguf_file_SetAndGetMetadataString()) {
+        failed_tests++;
+    }
+    if (!Testllama_gguf_file_SetAndGetMetadataU64()) {
+        failed_tests++;
+    }
+    if (!Testllama_gguf_file_SetAndGetMetadataStringArray()) {
+        failed_tests++;
+    }
+    printf("\n");
+
+    // Run llama_gguf_reader tests
+    printf("--- llama_gguf_reader Tests ---\n");
+    if (!Testllama_gguf_reader_ConstructorInitializesFromFile()) {
+        failed_tests++;
+    }
+    if (!Testllama_gguf_reader_GetMetadata()) {
+        failed_tests++;
+    }
+    if (!Testllama_gguf_reader_GetTensorCount()) {
+        failed_tests++;
+    }
+    if (!Testllama_gguf_reader_GetTensorNameAndTypeAndSize()) {
+        failed_tests++;
+    }
+    if (!Testllama_gguf_reader_ReadTensorData()) {
+        failed_tests++;
+    }
+    if (!Testllama_gguf_reader_ReadTensorDataInvalidIndex()) {
+        failed_tests++;
+    }
+    printf("\n");
+
+    // Run TextDataReader tests
+    printf("--- TextDataReader Tests ---\n");
+    if (!TestTextDataReader_OpenFile()) {
+        failed_tests++;
+    }
+    if (!TestTextDataReader_ReadNextSequenceTextMode()) {
+        failed_tests++;
+    }
+    if (!TestTextDataReader_ReadNextSequencePreTokenizedMode()) {
+        failed_tests++;
+    }
+    if (!TestTextDataReader_ResetFunctionality()) {
+        failed_tests++;
+    }
+    if (!TestTextDataReader_GetTotalSequences()) {
+        failed_tests++;
+    }
+    printf("\n");
+
+    // Run llama_gguf_converter integration tests
+    printf("--- llama_gguf_converter Tests ---\n");
+    if (!Testllama_gguf_converter_ConvertTextFileSuccess()) {
+        failed_tests++;
+    }
+    printf("\n");
+
+    // Add ParquetDataReader tests here once test files and logic are available
+    // printf("--- ParquetDataReader Tests ---\n");
+    // if (!TestParquetDataReader_OpenFile()) failed_tests++;
+    // ...
+
+    // Global teardown for the llama.cpp backend
+    TearDownLlamaBackend();
+
+    if (failed_tests == 0) {
+        printf("All tests passed!\n");
+        return 0;
+    }
+    printf("%d tests failed.\n", failed_tests);
+    return 1;
+}
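+
+// Build/run note (informal; the exact target name and binary path depend on the
+// CMake setup in this patch and are not verified here):
+//
+//     cmake -B build && cmake --build build
+//     ./build/tools/dataset-converter/dataset-to-gguf-tests
+//
+// Tests that need a tokenizer skip themselves when the model at
+// g_test_model_path cannot be loaded.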