tool: add conversion of text/parquet to custom format #14622

Status: Open. Wants to merge 5 commits into base: master
8 changes: 8 additions & 0 deletions CMakeLists.txt
Expand Up @@ -12,6 +12,8 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()

message("CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")

# Add path to modules
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

Expand Down Expand Up @@ -84,6 +86,12 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
# 3rd party libs
option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
option(LLAMA_PARQUET "Enable Parquet dataset support via Arrow/Parquet C++" OFF)


if(LLAMA_PARQUET)
add_definitions(-DLLAMA_PARQUET)
endif()

# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
Expand Down
48 changes: 44 additions & 4 deletions common/arg.cpp
Expand Up @@ -1477,7 +1477,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.n_chunks = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RETRIEVAL}));
add_opt(common_arg(
{"-fa", "--flash-attn"},
string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
Expand Down Expand Up @@ -1539,7 +1539,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
params.in_files.push_back(value);
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_FINETUNE}));
add_opt(common_arg(
{"-bf", "--binary-file"}, "FNAME",
"binary file containing the prompt (default: none)",
Expand Down Expand Up @@ -2609,9 +2609,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"-o", "--output", "--output-file"}, "FNAME",
string_format("output file (default: '%s')", params.out_file.c_str()),
[](common_params & params, const std::string & value) {
params.out_file = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
add_opt(common_arg(
{"-ofreq", "--output-frequency"}, "N",
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
Expand Down Expand Up @@ -3423,5 +3423,45 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SERVER}));

add_opt(common_arg(
{"--dataset-format"}, "FORMAT",
string_format("type of input data (e.g., 'text', 'parquet') (default: %s)", params.dataset_format.c_str()),
[](common_params & params, const std::string & format) {
params.dataset_format = format; //TODO ENUM CLASS
}
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
Comment on lines +3426 to +3432

Collaborator:

Can't this be determined automatically?

Contributor Author:

I agree we could add auto as the default dataset format while retaining the parameter. The converter could automatically detect formats (via file extensions/headers) when set to auto, while still allowing explicit overrides. This balances convenience and control.

For the --dataset-format example:

  • Would you prefer an enum class with AUTO instead of raw strings?
  • Should we implement fallback logic when auto-detection fails?
  • What's the priority between auto-detection vs explicit arguments?

Collaborator:

I think an enum is better than a string.

The way I would implement the automatic detection is to try loading GGUF and Parquet first. The order shouldn't matter since the loading will fail if there is a mismatch. If both fail, load as plain text. I don't think fallback logic beyond that is needed.
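A minimal sketch of that detection order (hypothetical helper names, not part of this PR): GGUF files begin with the magic bytes `GGUF` and Parquet files with `PAR1`, so a cheap header check can route to the right reader before falling back to plain text, with no further fallback logic.

```cpp
#include <array>
#include <cstring>
#include <fstream>
#include <string>

// Possible dataset formats; AUTO would be the new default.
enum class dataset_format { AUTO, TEXT, PARQUET, GGUF };

// Route on the file's magic bytes: GGUF files start with "GGUF",
// Parquet files with "PAR1"; everything else is treated as plain text.
static dataset_format detect_dataset_format(const std::string & path) {
    std::ifstream f(path, std::ios::binary);
    std::array<char, 4> magic{};
    if (!f.read(magic.data(), magic.size())) {
        return dataset_format::TEXT; // too short to carry a magic number
    }
    if (std::memcmp(magic.data(), "GGUF", 4) == 0) { return dataset_format::GGUF;    }
    if (std::memcmp(magic.data(), "PAR1", 4) == 0) { return dataset_format::PARQUET; }
    return dataset_format::TEXT;
}
```

In the actual converter the full readers would still validate the file; the magic check only picks which reader to try first.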


add_opt(common_arg(
{"--max-seq-len"}, "N",
string_format("max sequence length (default: %d)", params.max_seq_len),
[](common_params & params, int32_t max_seq_len) {
params.max_seq_len = max_seq_len;
}
).set_examples({LLAMA_EXAMPLE_FINETUNE}));

add_opt(common_arg(
{"--pre-tokenized"},
string_format("input file contains pre-tokenized data (space-separated token IDs)"),
[](common_params & params) {
params.pre_tokenized = true;
}
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
Comment on lines +3442 to +3448

Collaborator:

Is this intended for the training or the data conversion code?

Contributor Author:

--max-seq-len and --pre-tokenized are only for conversion. OK, I will move them to a new converter params parser.


add_opt(common_arg(
{"--preview"},
string_format("read and print metadata and first sequence from the output GGUF file (enables preview)"),
[](common_params & params) {
params.do_preview = true;
}
).set_examples({LLAMA_EXAMPLE_FINETUNE}));

add_opt(common_arg(
{"--dataset-column"}, "<name>",
string_format("column name for data in dataset files"),
[](common_params & params, const std::string &dataset_column) {
params.dataset_column = dataset_column;
}
).set_examples({LLAMA_EXAMPLE_FINETUNE}));

return ctx_arg;
}
11 changes: 9 additions & 2 deletions common/common.h
Expand Up @@ -4,12 +4,13 @@

#include "llama-cpp.h"

#include <cmath>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <string_view>
#include <vector>

#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
Expand Down Expand Up @@ -81,6 +82,7 @@ enum llama_example {
LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,
LLAMA_EXAMPLE_FINETUNE,

LLAMA_EXAMPLE_COUNT,
};
Expand Down Expand Up @@ -282,6 +284,7 @@ struct common_params {
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
std::string logits_file = ""; // file for saving *all* logits // NOLINT
std::string dataset_format = "text"; // "text" | "parquet"

std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
Expand Down Expand Up @@ -443,6 +446,10 @@ struct common_params {
// return false from callback to abort model loading or true to continue
llama_progress_callback load_progress_callback = NULL;
void * load_progress_callback_user_data = NULL;
int32_t max_seq_len = 2048;
bool do_preview = false;
bool pre_tokenized = false;
std::string dataset_column = "data";
};

// call once at the start of a program if it uses libcommon
Expand Down
1 change: 1 addition & 0 deletions tools/CMakeLists.txt
Expand Up @@ -36,4 +36,5 @@ else()
add_subdirectory(cvector-generator)
add_subdirectory(export-lora)
endif()
add_subdirectory(dataset-converter)
endif()
37 changes: 37 additions & 0 deletions tools/dataset-converter/CMakeLists.txt
@@ -0,0 +1,37 @@
include_directories(. ../../common)

if(LLAMA_PARQUET)
find_package(Arrow REQUIRED)
find_package(Parquet REQUIRED)
endif()

add_library(dataset-to-gguf-lib STATIC
dataset-to-gguf/llama-gguf-writer.cpp
dataset-to-gguf/llama-gguf-file.cpp
dataset-to-gguf/llama-dataset-reader/llama-text-data-reader.cpp
dataset-to-gguf/llama-gguf-converter.cpp
dataset-to-gguf/llama-gguf-reader.cpp
dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.cpp
)

target_compile_features(dataset-to-gguf-lib PRIVATE cxx_std_17)

target_link_libraries(dataset-to-gguf-lib common llama ${CMAKE_THREAD_LIBS_INIT})
if(LLAMA_PARQUET)
target_link_libraries(dataset-to-gguf-lib Arrow::arrow_shared Parquet::parquet_shared)
endif()

add_executable(convert-to-train-gguf convert-to-train-gguf.cpp)
add_dependencies(convert-to-train-gguf dataset-to-gguf-lib)
target_link_libraries(convert-to-train-gguf PRIVATE dataset-to-gguf-lib)

set(TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS dataset-to-gguf-unit-tests)
add_executable(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} dataset-to-gguf/tests/dataset-to-gguf-tests.cpp)
add_dependencies(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} dataset-to-gguf-lib)
target_link_libraries(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} PRIVATE common llama dataset-to-gguf-lib)

add_test(
    NAME ${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS}
    COMMAND $<TARGET_FILE:${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS}>
)
set_tests_properties(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} PROPERTIES LABELS "training")
148 changes: 148 additions & 0 deletions tools/dataset-converter/README.md
@@ -0,0 +1,148 @@
`convert-to-train-gguf` Utility
===============================

This utility is designed to convert text datasets (or pre-tokenized data) into the GGUF format, optimized for training models in `llama.cpp`.

Features
--------

* **Two-pass processing**: Efficiently handles large datasets that do not fit entirely into RAM, performing a first pass to collect metadata and a second pass to write the actual tensor data.

* **Flexible input**: Supports reading both raw text (with subsequent tokenization using a provided model) and pre-tokenized data (in the format of space-separated token IDs).

* **Modular architecture**: The code is divided into several classes (`llama_gguf_file`, `llama_gguf_writer`, `llama_dataset_reader`, `llama_text_dataset_reader`, `llama_gguf_converter`, `llama_gguf_reader`) to improve modularity, extensibility, and testability.

* **Preview functionality**: Allows you to view metadata and the first few sequences of the generated GGUF file, including optional detokenization.


GGUF Structure for Training Data
--------------------------------

The generated GGUF files follow a specific structure for training data:

* **Metadata (KV pairs)**: All metadata keys are prefixed with `training.` to avoid conflicts with model metadata.

* `training.format.version`: `string` (e.g., "1.0") - Specification version.
Collaborator:

Preferably use an integer for the version. If you want to specify minor versions my suggestion is to either specify multiple integers or to do what NVIDIA does and specify e.g. v12.34 as something like 12034.
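
The NVIDIA-style packing the reviewer mentions could look like this (hypothetical helpers, not in the PR): encode "major.minor" as `major * 1000 + minor`, so v12.34 becomes 12034 and version comparisons stay plain integer comparisons.

```cpp
#include <cstdint>

// Pack "major.minor" into one integer: v12.34 -> 12034, v1.0 -> 1000.
constexpr uint32_t training_format_version(uint32_t major, uint32_t minor) {
    return major * 1000 + minor;
}
constexpr uint32_t training_format_major(uint32_t v) { return v / 1000; }
constexpr uint32_t training_format_minor(uint32_t v) { return v % 1000; }
```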


* `training.dataset.name`: `string` (optional) - Dataset name (e.g., "OpenWebText-ru").

* `training.dataset.source`: `string` (optional) - URL or description of the data source.

* `training.file.creation_date`: `string` (ISO 8601) - File creation date.

* `training.tokenizer.gguf.model`: `string` - Tokenizer model name (e.g., "llama", "gpt2", "bert").

* `training.tokenizer.gguf.vocab`: `array[string]` - Tokenizer vocabulary.

* `training.tokenizer.gguf.merges`: `array[string]` (optional) - Tokenizer merges (for BPE).

* `training.tokenizer.gguf.pre`: `string` (optional) - Architecture of the pre-tokenizer.

* `training.sequence.count`: `uint64` - Total number of sequences in the file.

* **Tensors**: Each training sequence is stored as a separate tensor.

* **Naming**: `training.tensor.{index}` (e.g., `training.tensor.0`, `training.tensor.1`, ...). No leading zeros.

* **Data type**: `GGML_TYPE_I32` (standard for tokens in `llama.cpp`).

* **Shape**: `[sequence_length]` - One-dimensional array. `sequence_length` will vary for each tensor and can be obtained from the tensor's shape.


Building
--------

It is assumed that you have already set up the `llama.cpp` build environment (e.g., using CMake) and installed Arrow and Parquet on your system.

1. **Clone the `llama.cpp` repository**:

git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp


2. **Create a build directory and navigate into it**:

mkdir build
cd build


3. **Configure and build the project using CMake**:

cmake -DLLAMA_PARQUET=ON ..
cmake --build . --config Release


The `convert-to-train-gguf` utility will be built in the `build/bin` directory.


Usage
-----

./bin/convert-to-train-gguf [options]


### Command-line Options

* `-h`, `--help`: Show this help message and exit.

* `-m <path>`, `--model <path>`: Path to the GGUF model used for the tokenizer (default: `models/7B/ggml-model-f16.gguf`).

* `--in-file <path>`: Path to the input dataset file, either a plain text file or a Parquet file (default: `input.txt`).

* `-o <path>`, `--output <path>`: Path to save the output GGUF file to (default: `output.gguf`).

* `--max-seq-len <length>`: Maximum sequence length in tokens (default: `2048`). Sequences exceeding this length will be truncated.

* `--pre-tokenized`: Specifies that the input file contains pre-tokenized data (space-separated token IDs) rather than raw text.

* `--dataset-format <type>`: Type of input data (`text`, `parquet`). (default: `text`).

* `--parquet-text-column <name>`: For `parquet` input type, the column name containing raw text data (default: `text`).

* `--parquet-tokens-column <name>`: For `parquet` input type, the column name containing pre-tokenized data (list of int32) (default: `tokens`).

* `--preview`: Enables previewing of the generated GGUF file (prints metadata and the first few sequences).

* `--preview-count <N>`: Number of sequences to preview (default: `1`). Requires `--preview`.

* `--detokenize-preview`: Detokenize previewed sequences back into text for better readability. Requires `--preview`.


### Usage Examples

1. **Converting a plain text file**:

./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file my_dataset.txt -o my_training_data.gguf --max-seq-len 1024


2. **Converting a pre-tokenized file**:

./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file pre_tokenized_data.txt -o pre_tokenized_training_data.gguf --pre-tokenized


(Assumes `pre_tokenized_data.txt` contains lines like: `101 200 300 102 ...`)

3. **Converting a Parquet file with raw text**:

./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file my_parquet_dataset.parquet -o my_training_data.gguf --dataset-format parquet --parquet-text-column "document_text"


4. **Converting a Parquet file with pre-tokenized data**:

./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file my_tokenized_parquet.parquet -o my_training_data.gguf --dataset-format parquet --pre-tokenized --parquet-tokens-column "token_ids"


5. **Converting with a preview of 5 sequences and detokenization**:

./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file my_dataset.txt -o my_training_data.gguf --preview --preview-count 5 --detokenize-preview


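For the pre-tokenized path, turning one line of space-separated token IDs into the `int32` values the tensors store might look like this (hypothetical helper name, not taken from the PR):

```cpp
#include <cstdint>
#include <sstream>
#include <string>
#include <vector>

// Parse one line of pre-tokenized input, e.g. "101 200 300 102",
// into the int32 token IDs stored in the GGUF tensors.
static std::vector<int32_t> parse_pre_tokenized_line(const std::string & line) {
    std::vector<int32_t> tokens;
    std::istringstream iss(line);
    int32_t id;
    while (iss >> id) {
        tokens.push_back(id);
    }
    return tokens;
}
```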

Future Improvements
-------------------

* **Improved Error Handling**: More detailed messages and handling of edge cases.

* **Additional Validation**: Data integrity checks at various stages.

* **Dataset Statistics**: Ability to output statistics on sequence lengths, token distribution, etc.