diff --git a/CMakeLists.txt b/CMakeLists.txt index c79ccd09e097c..060d079477aa1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,8 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() +message("CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}") + # Add path to modules list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/") @@ -84,6 +86,12 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) # 3rd party libs option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON) option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF) +option(LLAMA_PARQUET "Enable Parquet dataset support via Arrow/Parquet C++" OFF) + + +if(LLAMA_PARQUET) + add_definitions(-DLLAMA_PARQUET) +endif() # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) diff --git a/common/arg.cpp b/common/arg.cpp index 56827a65908be..ec09c7c22239b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1477,7 +1477,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, int value) { params.n_chunks = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL})); + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"-fa", "--flash-attn"}, string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), @@ -1539,7 +1539,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } params.in_files.push_back(value); } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_FINETUNE})); add_opt(common_arg( {"-bf", "--binary-file"}, "FNAME", "binary file containing the prompt (default: none)", @@ -2609,9 +2609,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-o", "--output", "--output-file"}, "FNAME", string_format("output file (default: '%s')", params.out_file.c_str()), [](common_params & params, const std::string & value) { - params.out_file = value; + params.out_file = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS})); + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE})); add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), @@ -3423,5 +3423,45 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--dataset-format"}, " ", + string_format("type of input data (e.g., 'text', 'parquet') (default: %s)", params.dataset_format.c_str()), + [](common_params & params, const std::string & format) { + params.dataset_format = format; //TODO ENUM CLASS + } + ).set_examples({LLAMA_EXAMPLE_FINETUNE})); + + add_opt(common_arg( + {"--max-seq-len"}, " ", + string_format("max sequence length (default: %d)", params.max_seq_len), + [](common_params & params, int32_t max_seq_len) { + params.max_seq_len = max_seq_len; + } + ).set_examples({LLAMA_EXAMPLE_FINETUNE})); + + add_opt(common_arg( + {"--pre-tokenized"}, + 
string_format("input file contains pre-tokenized data (space-separated token IDs)"), + [](common_params & params) { + params.pre_tokenized = true; + } + ).set_examples({LLAMA_EXAMPLE_FINETUNE})); + + add_opt(common_arg( + {"--preview"}, + string_format("read and print metadata and first sequence from the output GGUF file (enables preview)"), + [](common_params & params) { + params.do_preview = true; + } + ).set_examples({LLAMA_EXAMPLE_FINETUNE})); + + add_opt(common_arg( + {"--dataset-column"}, "", + string_format("column name for data in dataset files"), + [](common_params & params, const std::string &dataset_column) { + params.dataset_column = dataset_column; + } + ).set_examples({LLAMA_EXAMPLE_FINETUNE})); + return ctx_arg; } diff --git a/common/common.h b/common/common.h index a5abe32859fdd..570ab10f68ecb 100644 --- a/common/common.h +++ b/common/common.h @@ -4,12 +4,13 @@ #include "llama-cpp.h" +#include +#include #include +#include #include #include #include -#include -#include #ifdef _WIN32 #define DIRECTORY_SEPARATOR '\\' @@ -81,6 +82,7 @@ enum llama_example { LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_PARALLEL, LLAMA_EXAMPLE_TTS, + LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_COUNT, }; @@ -282,6 +284,7 @@ struct common_params { std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT std::string logits_file = ""; // file for saving *all* logits // NOLINT + std::string dataset_format = "text"; // "text" | "parquet" std::vector in_files; // all input files std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) @@ -443,6 +446,10 @@ struct common_params { // return false from callback to abort model loading or true to continue llama_progress_callback load_progress_callback = NULL; void * load_progress_callback_user_data = NULL; + int32_t max_seq_len = 2048; + bool do_preview = false; + bool pre_tokenized = false; + std::string dataset_column = "data"; }; // call once at the start of a program if it uses libcommon diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index d64956b843851..2e969f12f8b01 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -36,4 +36,5 @@ else() add_subdirectory(cvector-generator) add_subdirectory(export-lora) endif() + add_subdirectory(dataset-converter) endif() diff --git a/tools/dataset-converter/CMakeLists.txt b/tools/dataset-converter/CMakeLists.txt new file mode 100644 index 0000000000000..521cd7d815dec --- /dev/null +++ b/tools/dataset-converter/CMakeLists.txt @@ -0,0 +1,37 @@ +include_directories(. 
../../common)
+
+if(LLAMA_PARQUET)
+    find_package(Arrow REQUIRED)
+    find_package(Parquet REQUIRED)
+endif()
+
+add_library(dataset-to-gguf-lib STATIC
+    dataset-to-gguf/llama-gguf-writer.cpp
+    dataset-to-gguf/llama-gguf-file.cpp
+    dataset-to-gguf/llama-dataset-reader/llama-text-data-reader.cpp
+    dataset-to-gguf/llama-gguf-converter.cpp
+    dataset-to-gguf/llama-gguf-reader.cpp
+    dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.cpp
+)
+
+target_compile_features(dataset-to-gguf-lib PRIVATE cxx_std_17)
+
+target_link_libraries(dataset-to-gguf-lib common llama ${CMAKE_THREAD_LIBS_INIT})
+if(LLAMA_PARQUET)
+    target_link_libraries(dataset-to-gguf-lib Arrow::arrow_shared Parquet::parquet_shared)
+endif()
+
+add_executable(convert-to-train-gguf convert-to-train-gguf.cpp)
+add_dependencies(convert-to-train-gguf dataset-to-gguf-lib)
+target_link_libraries(convert-to-train-gguf PRIVATE dataset-to-gguf-lib)
+
+set(TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS dataset-to-gguf-unit-tests)
+add_executable(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} dataset-to-gguf/tests/dataset-to-gguf-tests.cpp)
+add_dependencies(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} dataset-to-gguf-lib)
+target_link_libraries(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} PRIVATE common llama dataset-to-gguf-lib)
+
+add_test(
+    NAME ${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS}
+    COMMAND $<TARGET_FILE:${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS}>
+)
+set_tests_properties(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} PROPERTIES LABELS "training")
diff --git a/tools/dataset-converter/README.md b/tools/dataset-converter/README.md
new file mode 100644
index 0000000000000..7033fe13c61b6
--- /dev/null
+++ b/tools/dataset-converter/README.md
@@ -0,0 +1,148 @@
+`convert-to-train-gguf` Utility
+===============================
+
+This utility converts text datasets (or pre-tokenized data) into a GGUF format optimized for training models in `llama.cpp`.
+
+Features
+--------
+
+* **Two-pass processing**: Efficiently handles large datasets that do not fit entirely into RAM, performing a first pass to collect metadata (sequence lengths) and a second pass to write the actual tensor data.
+
+* **Flexible input**: Reads both raw text (tokenized with a provided model) and pre-tokenized data (space-separated token IDs).
+
+* **Modular architecture**: The code is split into several classes (`llama_gguf_file`, `llama_gguf_writer`, `llama_dataset_reader`, `llama_text_dataset_reader`, `llama_gguf_converter`, `llama_gguf_reader`) for modularity, extensibility, and testability.
+
+* **Preview functionality**: Lets you view the metadata and the first few sequences of the generated GGUF file, with optional detokenization.
+
+
+GGUF Structure for Training Data
+--------------------------------
+
+The generated GGUF files follow a specific structure for training data:
+
+* **Metadata (KV pairs)**: All metadata keys are prefixed with `training.` to avoid conflicts with model metadata.
+
+  * `training.format.version`: `string` (e.g., "1.0") - Specification version.
+
+  * `training.dataset.name`: `string` (optional) - Dataset name (e.g., "OpenWebText-ru").
+
+  * `training.dataset.source`: `string` (optional) - URL or description of the data source.
+
+  * `training.file.creation_date`: `string` (ISO 8601) - File creation date.
+
+  * `training.tokenizer.gguf.model`: `string` - Tokenizer model name (e.g., "llama", "gpt2", "bert").
+
+  * `training.tokenizer.gguf.vocab`: `array[string]` - Tokenizer vocabulary.
+
+  * `training.tokenizer.gguf.merges`: `array[string]` (optional) - Tokenizer merges (for BPE).
+
+  * `training.tokenizer.gguf.pre`: `string` (optional) - Architecture of the pre-tokenizer.
+
+  * `training.sequence.count`: `uint64` - Total number of sequences in the file.
+
+* **Tensors**: Each training sequence is stored as a separate tensor.
+
+  * **Naming**: `training.tensor.{index}` (e.g., `training.tensor.0`, `training.tensor.1`, ...). No leading zeros.
+
+  * **Data type**: `GGML_TYPE_I32` (the standard type for tokens in `llama.cpp`).
+
+  * **Shape**: `[sequence_length]` - a one-dimensional array. `sequence_length` varies per tensor and can be obtained from the tensor's shape.
+
+
+Building
+--------
+
+It is assumed that you have already set up the `llama.cpp` build environment (e.g., using CMake) and installed Arrow and Parquet on your system. (Arrow and Parquet are only required when configuring with `-DLLAMA_PARQUET=ON`.)
+
+1. **Clone the `llama.cpp` repository**:
+
+        git clone https://github.com/ggerganov/llama.cpp.git
+        cd llama.cpp
+
+2. **Create a build directory and navigate into it**:
+
+        mkdir build
+        cd build
+
+3. **Configure and build the project using CMake**:
+
+        cmake -DLLAMA_PARQUET=ON ..
+        cmake --build . --config Release
+
+    The `convert-to-train-gguf` utility will be built in the `build/bin` directory.
+
+
+Usage
+-----
+
+    ./bin/convert-to-train-gguf [options]
+
+### Command-line Options
+
+* `-h`, `--help`: Show this help message and exit.
+
+* `-m <path>`, `--model <path>`: Path to the GGUF model used for the tokenizer (default: `models/7B/ggml-model-f16.gguf`).
+
+* `--in-file <path>`: Path to the input dataset file, either a plain text file or a Parquet file (default: `input.txt`).
+
+* `-o <path>`, `--output <path>`: Path to save the output GGUF file to (default: `output.gguf`).
+
+* `--max-seq-len <length>`: Maximum sequence length in tokens (default: `2048`). Sequences exceeding this length are truncated.
+
+* `--pre-tokenized`: The input file contains pre-tokenized data (space-separated token IDs) rather than raw text.
+
+* `--dataset-format <type>`: Type of input data: `text` or `parquet` (default: `text`).
+
+* `--dataset-column <name>`: For `parquet` input, the column to read (default: `data`). In raw-text mode the column must contain strings; with `--pre-tokenized` it must contain lists of `int32` token IDs.
+
+* `--preview`: Print metadata and the first few sequences of the generated GGUF file.
+
+* `--preview-count <N>`: Number of sequences to preview (default: `1`). Requires `--preview`.
+
+* `--detokenize-preview`: Detokenize previewed sequences back into text for better readability. Requires `--preview`.
+
+
+### Usage Examples
+
+1. **Converting a plain text file**:
+
+        ./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file my_dataset.txt -o my_training_data.gguf --max-seq-len 1024
+
+2. **Converting a pre-tokenized file**:
+
+        ./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file pre_tokenized_data.txt -o pre_tokenized_training_data.gguf --pre-tokenized
+
+    (Assumes `pre_tokenized_data.txt` contains lines like: `101 200 300 102 ...`)
+
+3. **Converting a Parquet file with raw text**:
+
+        ./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file my_parquet_dataset.parquet -o my_training_data.gguf --dataset-format parquet --dataset-column "document_text"
+
+4. **Converting a Parquet file with pre-tokenized data**:
+
+        ./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file my_tokenized_parquet.parquet -o my_training_data.gguf --dataset-format parquet --pre-tokenized --dataset-column "token_ids"
+
+5. **Converting with a preview of 5 sequences and detokenization**:
+
+        ./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file my_dataset.txt -o my_training_data.gguf --preview --preview-count 5 --detokenize-preview
+
+
+Future Improvements
+-------------------
+
+* **Improved Error Handling**: More detailed messages and handling of edge cases.
+
+* **Additional Validation**: Data integrity checks at various stages.
+
+* **Dataset Statistics**: Ability to output statistics on sequence lengths, token distribution, etc.
diff --git a/tools/dataset-converter/convert-to-train-gguf.cpp b/tools/dataset-converter/convert-to-train-gguf.cpp
new file mode 100644
index 0000000000000..eaf9638dc5a3e
--- /dev/null
+++ b/tools/dataset-converter/convert-to-train-gguf.cpp
@@ -0,0 +1,183 @@
+// Main utility for converting a text dataset to the GGUF format for training models in llama.cpp.
+//
+// Logic:
+// 1. Parses command line arguments.
+// 2. Loads the tokenizer model.
+// 3. Uses the llama_gguf_converter class to perform the entire conversion process:
+//    - First pass over the input data to collect metadata (sequence lengths).
+//    - Creation of the GGUF file and writing all collected metadata to it.
+//    - Second pass over the input data to add each sequence as a separate tensor to the GGUF file.
+// 4. After successful conversion, uses llama_gguf_reader to read and print
+//    some meta-information and the first record from the created GGUF file.
+//
+// This two-pass approach allows processing datasets significantly larger than
+// available RAM.
+
+#include "log.h"
+
+#include <algorithm>  // For std::min
+#include <array>      // For std::array
+#include <cinttypes>  // For PRIu64
+#include <stdexcept>  // For std::runtime_error
+#include <string>     // For std::string
+#include <vector>     // For std::vector
+
+#include "arg.h"
+#include "common.h"
+#include "dataset-to-gguf/llama-gguf-converter.h"
+#include "dataset-to-gguf/llama-gguf-reader.h"
+#include "llama.h"  // For llama_backend_init, llama_backend_free, llama_model_load_from_file, llama_model_free
+
+#define PREVIEW_COUNT 1
+
+int main(int argc, char ** argv) {
+    common_params params;
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE)) {
+        return 1;
+    }
+
+    // Print parameters for verification
+    LOG_INF("Parameters:\n");
+    LOG_INF("  Model for tokenizer: %s\n", params.model.path.c_str());
+    LOG_INF("  Input files: ");
+    for (auto & i : params.in_files) {
+        LOG_INF("%s ", i.c_str());
+    }
+    LOG_INF("\n  Output file: %s\n", params.out_file.c_str());
+    LOG_INF("  Max sequence length: %d\n", params.max_seq_len);
+    LOG_INF("  Dataset format: %s\n", params.dataset_format.c_str());
+    LOG_INF("  Do preview: %s\n", params.do_preview ?
"Yes" : "No"); + if (params.dataset_format != "text") { + LOG_INF(" Dataset column: %s\n", params.dataset_column.c_str()); + } + LOG_INF("\n"); + + // Initialize llama.cpp + llama_backend_init(); + + // Load the model for its tokenizer + llama_model_params model_params = llama_model_default_params(); + llama_model *model = llama_model_load_from_file(params.model.path.c_str(), model_params); + + if (model == nullptr) { + LOG_ERR("error: failed to load model from %s\n", params.model.path.c_str()); + llama_backend_free(); + return 1; + } + + // --- Diagnostic Test: Reading tokenizer model GGUF file --- + LOG_INF("--- Diagnostic Test: Reading tokenizer model GGUF file ---\n"); + try { + llama_gguf_reader tokenizer_model_reader(params.model.path); + if (tokenizer_model_reader.llama_gguf_reader_is_initialized()) { + LOG_INF(" Tokenizer Model GGUF file opened successfully.\n"); + LOG_INF(" Tokenizer Model Name: %s\n", + tokenizer_model_reader.llama_gguf_reader_get_metadata_str("general.name", "N/A").c_str()); + LOG_INF(" Tokenizer Model Architecture: %s\n", + tokenizer_model_reader.llama_gguf_reader_get_metadata_str("general.architecture", "N/A").c_str()); + LOG_INF(" Tokenizer Model Tensor Count: %llu\n", + static_cast(tokenizer_model_reader.llama_gguf_reader_get_tensor_count())); + LOG_INF(" Diagnostic Test: Tokenizer Model GGUF read successful.\n"); + } else { + LOG_ERR("error: Diagnostic Test: Tokenizer Model GGUF read failed to initialize.\n"); + llama_model_free(model); // Free model before exiting + llama_backend_free(); + return 1; + } + } catch (const std::runtime_error & e) { + LOG_ERR("error: Diagnostic Test: Tokenizer Model GGUF read failed: %s\n", e.what()); + llama_model_free(model); // Free model before exiting + llama_backend_free(); + return 1; + } + LOG_INF("--- End of Diagnostic Test ---\n\n"); + + // Create and run the converter + llama_gguf_converter converter; + bool success = converter.llama_gguf_converter_convert(params, model); + + if (!success) { + LOG_ERR("error: GGUF conversion failed.\n"); + llama_model_free(model); // Free model on conversion failure + llama_backend_free(); + return 1; + } + + LOG_INF("Conversion successful!\n"); + LOG_INF("Output file: %s\n", params.out_file.c_str()); + + // --- Preview generated GGUF file (if requested) --- + if (params.do_preview) { + LOG_INF("\n--- Previewing generated GGUF file ---\n"); + try { + llama_gguf_reader reader(params.out_file); + + if (!reader.llama_gguf_reader_is_initialized()) { + LOG_ERR("error: llama_gguf_reader failed to initialize for preview.\n"); + llama_model_free(model); // Free model before exiting + llama_backend_free(); + return 1; + } + + LOG_INF(" Dataset Name: %s\n", + reader.llama_gguf_reader_get_metadata_str("training.dataset.name", "N/A").c_str()); + LOG_INF(" Sequence Count: %llu\n", static_cast(reader.llama_gguf_reader_get_metadata_u64("training.sequence.count", 0))); + LOG_INF(" Tokenizer Model: %s\n", + reader.llama_gguf_reader_get_metadata_str("training.tokenizer.gguf.model", "N/A").c_str()); + + int64_t tensor_count = reader.llama_gguf_reader_get_tensor_count(); + if (tensor_count > 0) { + // Print N first sequences + for (int64_t i = 0; i < std::min(static_cast(PREVIEW_COUNT), tensor_count); ++i) { + LOG_INF(" Sequence (training.tensor.%" PRId64 "):\n", i); + std::vector sequence_tokens; + if (reader.llama_gguf_reader_read_tensor_data(i, sequence_tokens)) { + LOG_INF(" Length: %zu tokens\n", sequence_tokens.size()); + LOG_INF(" Tokens: ["); + for (size_t j = 0; j < std::min((size_t) 10, 
sequence_tokens.size());
+                         ++j) {  // Print up to 10 tokens
+                        LOG_INF("%d%s", sequence_tokens[j],
+                                (j == std::min((size_t) 10, sequence_tokens.size()) - 1) ? "" : ", ");
+                    }
+                    if (sequence_tokens.size() > 10) {
+                        LOG_INF("...");
+                    }
+                    LOG_INF("]\n");
+                    // Detokenization
+                    std::string detokenized_text = "";
+                    // Buffer for a single token piece (256 bytes is a comfortable upper bound)
+                    std::array<char, 256> piece_buf;
+                    // Ensure model is valid before calling llama_model_get_vocab
+                    if (model != nullptr) {
+                        for (llama_token token : sequence_tokens) {
+                            int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
+                                                               piece_buf.data(), piece_buf.size(), 1, false);
+                            if (n_chars > 0) {
+                                detokenized_text.append(piece_buf.data(), n_chars);
+                            }
+                        }
+                        LOG_INF("    Detokenized: \"%s\"\n", detokenized_text.c_str());
+                    } else {
+                        LOG_ERR("    Warning: Cannot detokenize preview, model is null.\n");
+                    }
+
+                } else {
+                    LOG_ERR("    Error: Could not read data for sequence %" PRId64 ".\n", i);
+                }
+            }
+        } else {
+            LOG_INF("  No sequences found in the GGUF file.\n");
+        }
+
+        } catch (const std::runtime_error & e) {
+            LOG_ERR("error: GGUF preview failed: %s\n", e.what());
+            llama_model_free(model);  // Free model before exiting
+            llama_backend_free();
+            return 1;
+        }
+        LOG_INF("--- End of GGUF file preview ---\n");
+    }
+
+    // Clean up llama model and backend after all usage
+    llama_model_free(model);
+    llama_backend_free();
+
+    return 0;
+}
diff --git a/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-dataset-reader.h b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-dataset-reader.h
new file mode 100644
index 0000000000000..bab5255a7aca1
--- /dev/null
+++ b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-dataset-reader.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <string>  // For std::string
+#include <vector>  // For std::vector
+
+#include "llama.h"  // For llama_token
+
+// Abstract base class for reading a dataset.
+// Defines the interface that all concrete readers must implement.
+struct llama_dataset_reader {
+    // Virtual destructor for correct deletion of derived classes.
+    virtual ~llama_dataset_reader() = default;
+
+    // Opens the data source.
+    // path: path to the file or other data source identifier.
+    // Returns true if the source is successfully opened, otherwise false.
+    virtual bool open(const std::string & path) = 0;
+
+    // Reads the next sequence of tokens.
+    // tokens: vector where the read tokens will be stored.
+    // Returns true if a sequence is successfully read, otherwise false (including end of file).
+    virtual bool read_next_sequence(std::vector<llama_token> & tokens) = 0;
+
+    // Closes the data source.
+    virtual void close() = 0;
+
+    // Resets the reader to the beginning of the data source.
+    // Used for the second pass over the data.
+    virtual bool reset() = 0;
+
+    // Gets the total number of sequences in the dataset.
+    // Can be implemented differently for various data source types.
+    // Returns 0 if the count is unknown or not applicable.
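+    // (The text reader implements this by counting lines in the file; the
+    // Parquet reader returns the row count recorded in the Parquet file's metadata.)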
+ virtual uint64_t total_sequences() const = 0; +}; diff --git a/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.cpp b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.cpp new file mode 100644 index 0000000000000..0d9eb05e0b470 --- /dev/null +++ b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.cpp @@ -0,0 +1,304 @@ +#ifdef LLAMA_PARQUET +#include "llama-parquet-data-reader.h" + +#include // For std::min +#include // For std::cerr + +// Constructor +llama_parquet_dataset_reader::llama_parquet_dataset_reader(const struct llama_model * model, int32_t max_seq_len, + bool pre_tokenized, const std::string & dataset_column_name) : + model_(model), + max_seq_len_(max_seq_len), + pre_tokenized_(pre_tokenized), + current_row_group_index_(0), // Initialize row group index + current_row_in_table_(0), + current_column_index_(-1), // Initialize to -1, will be set in open + dataset_column_name_(dataset_column_name) {} + +// Destructor +llama_parquet_dataset_reader::~llama_parquet_dataset_reader() { + llama_parquet_dataset_reader::close(); +} + +// Opens the Parquet file for reading. +bool llama_parquet_dataset_reader::open(const std::string & path) { + // Close any previously open file + // Note: m_file_path is NOT cleared here, it's preserved for reset() + close(); + + m_file_path = path; // Store the file path for reset() + + // Open the Parquet file + arrow::Status status = arrow::io::ReadableFile::Open(path).Value(&input_file_); + if (!status.ok()) { + std::cerr << "Error (llama_parquet_dataset_reader::open): Failed to open Parquet file '" << path + << "': " << status.ToString() << std::endl; + return false; + } + + // Create a Parquet reader using parquet::arrow::OpenFile + arrow::Result> reader_raw = + parquet::arrow::OpenFile(input_file_, arrow::default_memory_pool()); + + if (!reader_raw.ok()) { + std::cerr << "Error (llama_parquet_dataset_reader::open): Failed to create Parquet file reader for '" << path + << "': " << reader_raw.status().ToString() << std::endl; + close(); + return false; + } + parquet_reader_ = std::move(reader_raw.ValueUnsafe()); + + // Get the schema to determine the correct column index + std::shared_ptr schema; + status = parquet_reader_->GetSchema(&schema); // Corrected: Use GetSchema and pass by address + if (!status.ok() || schema == nullptr) { + std::cerr << "Error (llama_parquet_dataset_reader::open): Failed to get schema from Parquet file: " + << status.ToString() << std::endl; + close(); + return false; + } + + // Determine the column index based on pre_tokenized_ flag + if (pre_tokenized_) { + current_column_index_ = schema->GetFieldIndex(dataset_column_name_); // Use configurable name + if (current_column_index_ == -1) { + std::cerr << "Error (llama_parquet_dataset_reader::open): Pre-tokenized mode selected, but column '" + << dataset_column_name_ << "' not found in Parquet schema." << std::endl; + close(); + return false; + } + // Validate column type: should be List + if (schema->field(current_column_index_)->type()->id() != arrow::Type::LIST) { + std::cerr << "Error (llama_parquet_dataset_reader::open): Column '" << dataset_column_name_ + << "' is not of LIST type as expected for pre-tokenized data. 
Actual type: " + << schema->field(current_column_index_)->type()->ToString() << std::endl; + close(); + return false; + } + auto list_type = std::static_pointer_cast(schema->field(current_column_index_)->type()); + if (list_type->value_type()->id() != arrow::Type::INT32) { + std::cerr << "Error (llama_parquet_dataset_reader::open): List items in column '" << dataset_column_name_ + << "' are not of INT32 type as expected. Actual value type: " + << list_type->value_type()->ToString() << std::endl; + close(); + return false; + } + + } else { + current_column_index_ = schema->GetFieldIndex(dataset_column_name_); // Use configurable name + if (current_column_index_ == -1) { + std::cerr << "Error (llama_parquet_dataset_reader::open): Raw text mode selected, but column '" + << dataset_column_name_ << "' not found in Parquet schema." << std::endl; + close(); + return false; + } + // Validate column type: should be String + if (schema->field(current_column_index_)->type()->id() != arrow::Type::STRING) { + std::cerr << "Error (llama_parquet_dataset_reader::open): Column '" << dataset_column_name_ + << "' is not of STRING type as expected for raw text. Actual type: " + << schema->field(current_column_index_)->type()->ToString() << std::endl; + close(); + return false; + } + } + + // Initialize row group index + current_row_group_index_ = 0; + // Read the first batch (row group) + return llama_parquet_dataset_reader_get_next_batch(); +} + +// Reads the next sequence of tokens from the Parquet file. +bool llama_parquet_dataset_reader::read_next_sequence(std::vector & tokens) { + tokens.clear(); + + // If current_table_ is null or we've processed all rows in the current batch, get the next batch (row group) + if (!current_table_ || current_row_in_table_ >= current_table_->num_rows()) { + if (!llama_parquet_dataset_reader_get_next_batch()) { + return false; // No more batches/row groups or error getting next batch + } + } + + if (!current_table_ || current_table_->num_rows() == 0) { + return false; // Should not happen if get_next_batch was successful, but as a safeguard + } + + // Assuming single chunk for simplicity. For multi-chunk columns, you'd iterate through chunks. + // When reading a column from a row group, it typically returns a single chunk. + std::shared_ptr column_array = + current_table_->column(0)->chunk(0); // column(0) because we read only one column into current_table_ + + if (pre_tokenized_) { + // Pre-tokenized data: read List array + auto list_array = std::static_pointer_cast(column_array); + auto value_array = std::static_pointer_cast(list_array->values()); + + if (list_array->IsNull(current_row_in_table_)) { + // Handle null list (empty sequence) + current_row_in_table_++; + return true; + } + + int32_t start_offset = list_array->value_offset(current_row_in_table_); + int32_t end_offset = list_array->value_offset(current_row_in_table_ + 1); + int32_t num_tokens_in_row = end_offset - start_offset; + + tokens.reserve(std::min((int32_t) max_seq_len_, num_tokens_in_row)); + for (int32_t i = 0; i < num_tokens_in_row && i < max_seq_len_; ++i) { + tokens.push_back(static_cast(value_array->Value(start_offset + i))); + } + + } else { + // Raw text data: read String array and tokenize + if (!model_) { + std::cerr << "Error (llama_parquet_dataset_reader::read_next_sequence): Llama model not provided for " + "tokenization of raw text." 
+ << std::endl; + return false; + } + + auto string_array = std::static_pointer_cast(column_array); + if (string_array->IsNull(current_row_in_table_)) { + // Handle null string (empty sequence) + current_row_in_table_++; + return true; + } + + std::string text = string_array->GetString(current_row_in_table_); + std::vector tokens_buffer(max_seq_len_); // Use a temporary buffer for tokenization + + int n_tokens = llama_tokenize(llama_model_get_vocab(model_), text.c_str(), text.length(), tokens_buffer.data(), + max_seq_len_, false, true); + if (n_tokens < 0) { + std::cerr << "Error (llama_parquet_dataset_reader::read_next_sequence): Tokenization failed for text: '" + << text << "'" << std::endl; + current_row_in_table_++; + return true; // Return true with empty tokens to continue processing + } + tokens.assign(tokens_buffer.begin(), tokens_buffer.begin() + n_tokens); + } + + current_row_in_table_++; + return true; +} + +// Closes the Parquet file. +void llama_parquet_dataset_reader::close() { + parquet_reader_.reset(); + current_row_group_reader_.reset(); // Reset row group reader + current_table_.reset(); + chunked_array_.reset(); // Reset chunked array + if (input_file_) { + arrow::Status status = input_file_->Close(); + if (!status.ok()) { + std::cerr << "Warning (llama_parquet_dataset_reader::close): Failed to close Arrow file: " + << status.ToString() << std::endl; + } + } + input_file_.reset(); + current_row_group_index_ = 0; // Reset row group index + current_row_in_table_ = 0; + current_column_index_ = -1; + // m_file_path is NOT cleared here. It's preserved for reset() +} + +// Resets the reader to the beginning of the Parquet file. +bool llama_parquet_dataset_reader::reset() { + if (m_file_path.empty()) { // Check if path is stored + std::cerr << "Error (llama_parquet_dataset_reader::reset): Cannot reset, file path was not stored." + << std::endl; + return false; + } + // Re-open the file and re-initialize the reader + return open(m_file_path); // Use the stored path +} + +// Private helper to get the next batch of data (now a row group) +bool llama_parquet_dataset_reader::llama_parquet_dataset_reader_get_next_batch() { + current_table_.reset(); // Clear previous table + current_row_in_table_ = 0; // Reset row index for new table + chunked_array_.reset(); // Reset chunked array for new batch + + if (!parquet_reader_) { + std::cerr << "Error (llama_parquet_dataset_reader::llama_parquet_dataset_reader_get_next_batch): Parquet " + "reader is not initialized." 
+ << std::endl; + return false; + } + + if (current_row_group_index_ >= parquet_reader_->num_row_groups()) { + return false; // No more row groups + } + + // Get the reader for the current row group + current_row_group_reader_ = parquet_reader_->RowGroup(current_row_group_index_); + if (!current_row_group_reader_) { + std::cerr << "Error (llama_parquet_dataset_reader::llama_parquet_dataset_reader_get_next_batch): Failed to get " + "row group reader for index " + << current_row_group_index_ << std::endl; + return false; + } + + // Get the ColumnChunkReader for the specific column + std::shared_ptr column_chunk_reader = + current_row_group_reader_->Column(current_column_index_); + if (!column_chunk_reader) { + std::cerr << "Error (llama_parquet_dataset_reader::llama_parquet_dataset_reader_get_next_batch): Failed to get " + "column chunk reader for column " + << current_column_index_ << " in row group " << current_row_group_index_ << std::endl; + return false; + } + + // Read the column data into a ChunkedArray + arrow::Status status = column_chunk_reader->Read(&chunked_array_); // Use member variable + if (!status.ok()) { + std::cerr << "Error (llama_parquet_dataset_reader::llama_parquet_dataset_reader_get_next_batch): Failed to " + "read column " + << current_column_index_ << " from row group " << current_row_group_index_ << ": " + << status.ToString() << std::endl; + return false; + } + + // Get the schema from the parquet_reader_ to construct the table + std::shared_ptr schema; + status = parquet_reader_->GetSchema(&schema); + if (!status.ok() || schema == nullptr) { + std::cerr << "Error (llama_parquet_dataset_reader::llama_parquet_dataset_reader_get_next_batch): Failed to get " + "schema from Parquet reader for column " + << current_column_index_ << std::endl; + return false; + } + + // Get the field for the current column index + std::shared_ptr column_field = schema->field(current_column_index_); + if (column_field == nullptr) { + std::cerr << "Error (llama_parquet_dataset_reader::llama_parquet_dataset_reader_get_next_batch): Column field " + "is null for index " + << current_column_index_ << std::endl; + return false; + } + + current_table_ = arrow::Table::Make(arrow::schema({ column_field }), // Create a schema with just this column + { chunked_array_ } // Pass the chunked array as the column data + ); + + if (!current_table_ || current_table_->num_rows() == 0) { + return false; // No data in this row group + } + + current_row_group_index_++; // Move to the next row group for the next call + return true; +} + +// Method to get the total number of sequences in the dataset. +// For Parquet files, this will be the number of rows obtained from metadata. +uint64_t llama_parquet_dataset_reader::total_sequences() const { + if (!parquet_reader_) { + std::cerr << "Error (llama_parquet_dataset_reader::total_sequences): Parquet reader is not initialized." 
+ << std::endl; + return 0; + } + // Total number of rows in the Parquet file + return parquet_reader_->parquet_reader()->metadata()->num_rows(); +} +#endif diff --git a/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.h b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.h new file mode 100644 index 0000000000000..f00e272722225 --- /dev/null +++ b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.h @@ -0,0 +1,75 @@ +#pragma once +#ifdef LLAMA_PARQUET +#include "llama.h" // For llama_token + +// Include necessary Apache Arrow and Parquet headers +// You will need to link against these libraries (e.g., -larrow -lparquet) +#include +#include +#include + +#include // For std::unique_ptr +#include +#include + +#include "llama-dataset-reader.h" + +// Implementation of DatasetReader for reading Parquet files. +// This class will handle reading tokenized sequences from a Parquet file. +struct llama_parquet_dataset_reader : public llama_dataset_reader { + // Constructor. + // model: Pointer to the llama model for tokenization (can be nullptr if data is pre-tokenized). + // max_seq_len: Maximum sequence length for truncation. + // pre_tokenized: If true, input data is already tokenized (token IDs in a numeric column). + // text_column_name: Name of the column containing raw text data. + // tokens_column_name: Name of the column containing pre-tokenized data (list). + llama_parquet_dataset_reader(const struct llama_model * model, int32_t max_seq_len, bool pre_tokenized, + const std::string & dataset_column_name); + + // Destructor. + ~llama_parquet_dataset_reader(); + + // Opens the Parquet file for reading. + // path: Path to the Parquet file. + // Returns true if the source is successfully opened, otherwise false. + bool open(const std::string & path) override; + + // Reads the next sequence of tokens from the Parquet file. + // tokens: Vector where the read tokens will be stored. + // Returns true if a sequence is successfully read, otherwise false (including end of file). + bool read_next_sequence(std::vector & tokens) override; + + // Closes the Parquet file. + void close() override; + + // Resets the reader to the beginning of the Parquet file. + // Returns true if reset is successful, otherwise false. + bool reset() override; + + // Method to get the total number of sequences in the dataset. + // For Parquet files, this will be the number of rows obtained from metadata. 
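+    // Unlike the text reader, which rescans the whole file line by line, this
+    // reads the row count from the Parquet footer metadata, so it is effectively O(1).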
+ uint64_t total_sequences() const override; + + private: + const struct llama_model * model_; // Llama model for tokenization (if needed) + int32_t max_seq_len_; // Maximum sequence length + bool pre_tokenized_; // Flag for pre-tokenized data + + std::shared_ptr input_file_; // Arrow file handle + std::unique_ptr parquet_reader_; // Parquet reader + std::shared_ptr current_table_; // Current table batch being processed + std::shared_ptr chunked_array_; // Member to store the chunked array + + int current_row_group_index_; // Current row group index + std::shared_ptr current_row_group_reader_; // Reader for the current row group + + int64_t current_row_in_table_; // Current row index within the current_table_ + int current_column_index_; // Index of the column containing text/tokens + std::string m_file_path; // Path to the Parquet file + + std::string dataset_column_name_; // Configurable name for column + + // Private helper to get the next batch of data (now a row group) + bool llama_parquet_dataset_reader_get_next_batch(); +}; +#endif diff --git a/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-text-data-reader.cpp b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-text-data-reader.cpp new file mode 100644 index 0000000000000..57db707eb4de3 --- /dev/null +++ b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-text-data-reader.cpp @@ -0,0 +1,125 @@ +#include "llama-text-data-reader.h" + +#include // For std::min +#include // For std::cerr +#include + +#include "llama.h" // For llama_tokenize, llama_model_get_vocab + +// Constructor +llama_text_dataset_reader::llama_text_dataset_reader(const struct llama_model * model, int32_t max_seq_len, + bool pre_tokenized) : + m_model(model), + m_max_seq_len(max_seq_len), + m_pre_tokenized(pre_tokenized), + m_tokens_buffer(max_seq_len) {} + +// Destructor +llama_text_dataset_reader::~llama_text_dataset_reader() { + llama_text_dataset_reader::close(); +} + +// Opens the text file for reading. +bool llama_text_dataset_reader::open(const std::string & path) { + m_file_path = path; // Store the file path + m_input_file.open(path); + if (!m_input_file.is_open()) { + std::cerr << "Error: Failed to open input file " << path << std::endl; + return false; + } + return true; +} + +// Reads the next sequence of tokens from the file. +bool llama_text_dataset_reader::read_next_sequence(std::vector & tokens) { + std::string line; + if (!std::getline(m_input_file, line)) { + return false; // End of file or read error + } + + tokens.clear(); // Clear the vector for a new sequence + int n_tokens = 0; + + if (line.empty()) { + // Empty line, return an empty sequence + return true; + } + + if (m_pre_tokenized) { + // Pre-tokenized data mode: parse tokens from the string + std::istringstream iss(line); + llama_token token_id; + while (iss >> token_id) { + if (n_tokens < m_max_seq_len) { + tokens.push_back(token_id); + n_tokens++; + } else { + // Truncate if it exceeds m_max_seq_len + break; + } + } + } else { + // Raw text data mode: tokenize the string + if (!m_model) { + std::cerr << "Error: Llama model not provided for tokenization of raw text." 
<< std::endl;
+            return false;
+        }
+        // Ensure the buffer is large enough
+        if (m_tokens_buffer.size() < (size_t) m_max_seq_len) {
+            m_tokens_buffer.resize(m_max_seq_len);
+        }
+        n_tokens = llama_tokenize(llama_model_get_vocab(m_model), line.c_str(), line.length(), m_tokens_buffer.data(),
+                                  m_max_seq_len, false, true);
+        if (n_tokens < 0) {
+            std::cerr << "Error: Tokenization failed for line: " << line << std::endl;
+            // Tokenization failed: abort this sequence (the caller treats false as end of data)
+            return false;
+        }
+        tokens.assign(m_tokens_buffer.begin(), m_tokens_buffer.begin() + n_tokens);
+    }
+    return true;
+}
+
+// Closes the file.
+void llama_text_dataset_reader::close() {
+    if (m_input_file.is_open()) {
+        m_input_file.close();
+    }
+}
+
+// Resets the file pointer to the beginning of the file.
+bool llama_text_dataset_reader::reset() {
+    if (m_input_file.is_open()) {
+        m_input_file.clear();                  // Clear any error flags (e.g., EOF)
+        m_input_file.seekg(0, std::ios::beg);  // Move pointer to the beginning
+        return true;
+    }
+    // If not open, try to open it again using the stored path
+    return open(m_file_path);
+}
+
+// Gets the total number of sequences in the dataset.
+// For text files, this is the number of lines.
+// Note: this is slow for very large files, as it reads the entire file to count lines.
+uint64_t llama_text_dataset_reader::total_sequences() const {
+    if (m_file_path.empty()) {
+        std::cerr << "Error (llama_text_dataset_reader::total_sequences): File path not set." << std::endl;
+        return 0;
+    }
+
+    std::ifstream temp_file(m_file_path);
+    if (!temp_file.is_open()) {
+        std::cerr << "Error (llama_text_dataset_reader::total_sequences): Failed to open file '" << m_file_path
+                  << "' for counting lines." << std::endl;
+        return 0;
+    }
+
+    uint64_t count = 0;
+    std::string line;
+    while (std::getline(temp_file, line)) {
+        count++;
+    }
+    temp_file.close();
+    return count;
+}
diff --git a/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-text-data-reader.h b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-text-data-reader.h
new file mode 100644
index 0000000000000..47f38c9ffb1e9
--- /dev/null
+++ b/tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-text-data-reader.h
@@ -0,0 +1,47 @@
+#pragma once
+
+#include <fstream>  // For std::ifstream
+#include <string>   // For std::string
+#include <vector>   // For std::vector
+
+#include "llama-dataset-reader.h"
+#include "llama.h"  // For llama_token and llama_model
+
+// Implementation of llama_dataset_reader for reading text files.
+// Supports both raw text and pre-tokenized data.
+struct llama_text_dataset_reader : public llama_dataset_reader {
+    // Constructor.
+    // model: pointer to the llama model for tokenization (can be nullptr if pre_tokenized is true).
+    // max_seq_len: maximum sequence length for truncation.
+    // pre_tokenized: if true, input data is already tokenized (token IDs as numbers).
+    llama_text_dataset_reader(const struct llama_model * model, int32_t max_seq_len, bool pre_tokenized);
+
+    // Destructor.
+    ~llama_text_dataset_reader();
+
+    // Opens the text file for reading.
+    bool open(const std::string & path) override;
+
+    // Reads the next sequence of tokens from the file.
+    // If pre_tokenized is true, parses numbers from the string.
+    // If pre_tokenized is false, tokenizes the string using llama_model.
+    bool read_next_sequence(std::vector<llama_token> & tokens) override;
+
+    // Closes the file.
+    void close() override;
+
+    // Resets the file pointer to the beginning of the file.
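+    // Clears any EOF/error flags and seeks back to offset 0; if the stream is
+    // not currently open, it is re-opened from the stored file path.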
+    bool reset() override;
+
+    // Gets the total number of sequences in the dataset.
+    // For text files, this is the number of lines.
+    uint64_t total_sequences() const override;
+
+  private:
+    const struct llama_model * m_model;          // Model for tokenization
+    int32_t                    m_max_seq_len;    // Maximum sequence length
+    bool                       m_pre_tokenized;  // Flag for pre-tokenized data
+    std::ifstream              m_input_file;     // File stream object
+    std::string                m_file_path;      // File path for reset and total_sequences
+    std::vector<llama_token>   m_tokens_buffer;  // Internal buffer for tokens
+};
diff --git a/tools/dataset-converter/dataset-to-gguf/llama-gguf-converter.cpp b/tools/dataset-converter/dataset-to-gguf/llama-gguf-converter.cpp
new file mode 100644
index 0000000000000..7e853de33d0ed
--- /dev/null
+++ b/tools/dataset-converter/dataset-to-gguf/llama-gguf-converter.cpp
@@ -0,0 +1,164 @@
+// Utility for converting a text dataset to the GGUF format for training models in llama.cpp.
+//
+// Logic:
+// 1. Loads the tokenizer model.
+// 2. Performs a first pass over the input data to collect metadata (sequence lengths).
+// 3. Creates a GGUF file and writes all collected metadata to it.
+// 4. Performs a second pass over the input data to add each sequence as a separate tensor to the GGUF file.
+//
+// This two-pass approach allows processing datasets significantly larger than
+// available RAM.
+
+#include "llama-gguf-converter.h"  // Include the new header name for the converter
+
+#include <cinttypes>  // For PRIu64
+#include <cstdio>     // For fprintf, snprintf
+#include <memory>     // For std::unique_ptr
+#include <stdexcept>  // For std::runtime_error
+#include <vector>     // For std::vector
+
+// Include the refactored GGUF and data reader headers
+#include "common.h"
+#include "llama-dataset-reader/llama-dataset-reader.h"
+#include "llama-dataset-reader/llama-parquet-data-reader.h"
+#include "llama-dataset-reader/llama-text-data-reader.h"
+#include "llama-gguf-file.h"
+#include "llama-gguf-writer.h"
+
+// Method to execute the conversion process.
+bool llama_gguf_converter::llama_gguf_converter_convert(const struct common_params & params,
+                                                        const struct llama_model * model) {
+    // --- Create the data reader based on the dataset format ---
+    std::unique_ptr<llama_dataset_reader> reader;
+    if (params.dataset_format == "text") {
+        reader = std::make_unique<llama_text_dataset_reader>(model, params.max_seq_len, params.pre_tokenized);
+#ifdef LLAMA_PARQUET
+    } else if (params.dataset_format == "parquet") {
+        reader = std::make_unique<llama_parquet_dataset_reader>(model, params.max_seq_len, params.pre_tokenized, params.dataset_column);
+#endif
+    } else {
+        fprintf(stderr, "error: Unsupported input type: %s\n", params.dataset_format.c_str());
+        return false;
+    }
+
+    // Open the data source
+    if (!reader->open(params.in_files[0])) {  // currently only the first input file is used
+        fprintf(stderr, "error: Failed to open data source %s\n", params.in_files[0].c_str());
+        return false;
+    }
+
+    uint64_t total_sequence_count = 0;
+    std::vector<uint32_t> sequence_lengths;  // Will store sequence lengths for text files
+
+    // --- FIRST PASS: Collect sequence lengths or get total count ---
+    printf("First pass: Reading input data and getting sequence lengths...\n");
+
+    if (params.dataset_format == "parquet") {
+        // For Parquet, get total sequence count from metadata
+        total_sequence_count = reader->total_sequences();
+        printf("First pass complete. Found %" PRIu64 " sequences (from Parquet metadata).\n\n", total_sequence_count);
+    } else {  // For text files
+        // For text files, perform a full first pass to count sequences
+        // and their lengths (as this is the only way to know the exact token count).
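+        // Note: the text path therefore tokenizes the entire input twice. The
+        // lengths recorded here are cross-checked on the second pass so that a
+        // non-deterministic tokenizer cannot silently corrupt the output file.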
+        std::vector<llama_token> tokens;
+        while (reader->read_next_sequence(tokens)) {
+            sequence_lengths.push_back(tokens.size());
+        }
+        total_sequence_count = sequence_lengths.size();
+        printf("First pass complete. Found %" PRIu64 " sequences.\n\n", total_sequence_count);
+    }
+
+    // --- WRITE GGUF FILE ---
+    printf("Creating GGUF file...\n");
+    // Create a llama_gguf_file instance, which will manage the GGUF context
+    std::unique_ptr<llama_gguf_file> gguf_file;
+    try {
+        gguf_file = std::make_unique<llama_gguf_file>();
+    } catch (const std::runtime_error & e) {
+        fprintf(stderr, "error: Failed to initialize llama_gguf_file: %s\n", e.what());
+        return false;
+    }
+
+    // Pass the pointer to gguf_file to llama_gguf_writer
+    llama_gguf_writer writer(gguf_file.get());
+
+    // Initialize GGUF file metadata
+    writer.llama_gguf_writer_init_metadata(model, params.in_files[0], total_sequence_count);
+    printf("Metadata written.\n");
+
+    // --- SECOND PASS: Write tensors ---
+    printf("Second pass: Writing tensors to GGUF file...\n");
+    if (!reader->reset()) {
+        fprintf(stderr, "error: Failed to reset data reader for second pass.\n");
+        return false;
+    }
+
+    uint64_t current_sequence_idx = 0;
+    std::vector<llama_token> tokens;  // Reuse the tokens vector
+    while (reader->read_next_sequence(tokens)) {
+        if (current_sequence_idx >= total_sequence_count) {
+            fprintf(stderr,
+                    "error: file ended prematurely on second pass. Expected %" PRIu64
+                    " sequences, but reached end of file at %" PRIu64 ".\n",
+                    total_sequence_count, current_sequence_idx);
+            break;
+        }
+
+        uint32_t expected_n_tokens;
+        if (params.dataset_format == "text") {
+            // For text files, use lengths collected in the first pass
+            expected_n_tokens = sequence_lengths[current_sequence_idx];
+        } else {
+            // For Parquet, the expected length is not known beforehand,
+            // so just use the actual length of the read sequence.
+            // Empty sequences in the Parquet file are handled below.
+            expected_n_tokens = tokens.size();
+        }
+
+        uint32_t actual_n_tokens = tokens.size();
+
+        // If the token count does not match (only detectable for text, where it is known
+        // beforehand), this is a critical error: the metadata collected in the first pass
+        // would be wrong for this tensor. Abort to avoid writing a corrupted GGUF file.
+        if (params.dataset_format == "text" && actual_n_tokens != expected_n_tokens) {
+            fprintf(stderr,
+                    "error: Tokenization mismatch on second pass for sequence %" PRIu64
+                    ". Expected %u tokens, got %u.\n",
+                    current_sequence_idx, expected_n_tokens, actual_n_tokens);
+            fprintf(
+                stderr,
+                "This indicates a non-deterministic tokenizer or an issue with input reading. Aborting conversion.\n");
+            return false;  // Abort conversion
+        }
+
+        // Add the tensor only if there are tokens
+        if (actual_n_tokens > 0) {
+            writer.llama_gguf_writer_add_sequence_tensor(current_sequence_idx, tokens);
+        } else {
+            // If tokens were expected but the second pass produced none, print a warning.
+            // (The `expected_n_tokens != 0` condition is only meaningful for text files,
+            // where the first pass may have recorded a non-zero length for this line.)
+            if (params.dataset_format == "text" && expected_n_tokens != 0) {
+                fprintf(stderr, "warning: sequence %" PRIu64 " resulted in 0 tokens on second pass, but expected %u.\n",
+                        current_sequence_idx, expected_n_tokens);
+                // Continue, as this might be acceptable for some datasets,
+                // but warn about potential inconsistency.
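+                // Note: when a sequence is skipped, current_sequence_idx still advances,
+                // so the training.tensor.{index} numbering will have a gap for it.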
+ } + } + current_sequence_idx++; + } + reader->close(); // Close DataReader after use + printf("Second pass complete.\n\n"); + + // Save file to disk + printf("Writing GGUF data to %s...\n", params.out_file.c_str()); + if (!writer.llama_gguf_writer_write_to_file(params.out_file)) { + fprintf(stderr, "error: Failed to write GGUF file %s\n", params.out_file.c_str()); + return false; + } + + printf("Conversion successful!\n"); + printf("Output file: %s\n", params.out_file.c_str()); + + return true; +} diff --git a/tools/dataset-converter/dataset-to-gguf/llama-gguf-converter.h b/tools/dataset-converter/dataset-to-gguf/llama-gguf-converter.h new file mode 100644 index 0000000000000..43dbb489d610c --- /dev/null +++ b/tools/dataset-converter/dataset-to-gguf/llama-gguf-converter.h @@ -0,0 +1,15 @@ +#pragma once + +#include "llama.h" // For struct llama_model + +// Class encapsulating the high-level logic for converting +// input data to the GGUF format. +struct llama_gguf_converter { + // Default constructor. + llama_gguf_converter() = default; + + // Method to execute the conversion process. + // params: A structure containing all necessary parameters for conversion. + // Returns true on successful conversion, false on error. + bool llama_gguf_converter_convert(const struct common_params & params, const struct llama_model * model); +}; diff --git a/tools/dataset-converter/dataset-to-gguf/llama-gguf-file.cpp b/tools/dataset-converter/dataset-to-gguf/llama-gguf-file.cpp new file mode 100644 index 0000000000000..5571dbfee25db --- /dev/null +++ b/tools/dataset-converter/dataset-to-gguf/llama-gguf-file.cpp @@ -0,0 +1,163 @@ +#include +#include +#include + +#include "llama-gguf-file.h" + +// Default constructor: Initializes an empty GGUF context for writing. +llama_gguf_file::llama_gguf_file() : m_ctx(nullptr) { + m_ctx = gguf_init_empty(); + if (!m_ctx) { + throw std::runtime_error("Failed to initialize empty GGUF context."); + } +} + +// Constructor: Initializes a GGUF context from an existing file for reading. +// path: Path to the GGUF file to open. +llama_gguf_file::llama_gguf_file(const std::string & path) : m_ctx(nullptr) { + struct gguf_init_params params = {}; + // When reading, we do NOT want gguf_init_from_file to allocate a ggml_context + // for tensors, as we will manage data reading manually using file offsets. + params.no_alloc = true; // Ensure no allocation for tensor data by gguf_init_from_file + m_ctx = gguf_init_from_file(path.c_str(), params); + if (!m_ctx) { + throw std::runtime_error("Failed to initialize GGUF context from file: " + path); + } +} + +// Destructor: Frees the GGUF context. +llama_gguf_file::~llama_gguf_file() { + if (m_ctx) { + gguf_free(m_ctx); + m_ctx = nullptr; + } +} + +// Checks if the GGUF context is initialized. +// Returns true if initialized, false otherwise. +bool llama_gguf_file::llama_gguf_file_is_initialized() const { + return m_ctx != nullptr; +} + +// --- Methods for working with metadata (KV-pairs) --- + +// Sets a string value for a given key. +void llama_gguf_file::llama_gguf_file_set_val_str(const std::string & key, const std::string & value) { + if (!m_ctx) { + throw std::runtime_error("GGUF context not initialized."); + } + gguf_set_val_str(m_ctx, key.c_str(), value.c_str()); +} + +// Sets a uint64_t value for a given key. 
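+// (Stored as GGUF_TYPE_UINT64, so values round-trip exactly through
+// llama_gguf_file_get_val_u64() below.)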
diff --git a/tools/dataset-converter/dataset-to-gguf/llama-gguf-file.cpp b/tools/dataset-converter/dataset-to-gguf/llama-gguf-file.cpp
new file mode 100644
index 0000000000000..5571dbfee25db
--- /dev/null
+++ b/tools/dataset-converter/dataset-to-gguf/llama-gguf-file.cpp
@@ -0,0 +1,163 @@
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+#include "llama-gguf-file.h"
+
+// Default constructor: Initializes an empty GGUF context for writing.
+llama_gguf_file::llama_gguf_file() : m_ctx(nullptr) {
+    m_ctx = gguf_init_empty();
+    if (!m_ctx) {
+        throw std::runtime_error("Failed to initialize empty GGUF context.");
+    }
+}
+
+// Constructor: Initializes a GGUF context from an existing file for reading.
+// path: Path to the GGUF file to open.
+llama_gguf_file::llama_gguf_file(const std::string & path) : m_ctx(nullptr) {
+    struct gguf_init_params params = {};
+    // When reading, we do NOT want gguf_init_from_file to allocate a ggml_context
+    // for tensors, as we will manage data reading manually using file offsets.
+    params.no_alloc = true; // Ensure no allocation for tensor data by gguf_init_from_file
+    m_ctx = gguf_init_from_file(path.c_str(), params);
+    if (!m_ctx) {
+        throw std::runtime_error("Failed to initialize GGUF context from file: " + path);
+    }
+}
+
+// Destructor: Frees the GGUF context.
+llama_gguf_file::~llama_gguf_file() {
+    if (m_ctx) {
+        gguf_free(m_ctx);
+        m_ctx = nullptr;
+    }
+}
+
+// Checks if the GGUF context is initialized.
+// Returns true if initialized, false otherwise.
+bool llama_gguf_file::llama_gguf_file_is_initialized() const {
+    return m_ctx != nullptr;
+}
+
+// --- Methods for working with metadata (KV-pairs) ---
+
+// Sets a string value for a given key.
+void llama_gguf_file::llama_gguf_file_set_val_str(const std::string & key, const std::string & value) {
+    if (!m_ctx) {
+        throw std::runtime_error("GGUF context not initialized.");
+    }
+    gguf_set_val_str(m_ctx, key.c_str(), value.c_str());
+}
+
+// Sets a uint64_t value for a given key.
+void llama_gguf_file::llama_gguf_file_set_val_u64(const std::string & key, uint64_t value) {
+    if (!m_ctx) {
+        throw std::runtime_error("GGUF context not initialized.");
+    }
+    gguf_set_val_u64(m_ctx, key.c_str(), value);
+}
+
+// Sets an array of strings for a given key.
+void llama_gguf_file::llama_gguf_file_set_arr_str(const std::string & key, const std::vector<const char *> & values) {
+    if (!m_ctx) {
+        throw std::runtime_error("GGUF context not initialized.");
+    }
+    gguf_set_arr_str(m_ctx, key.c_str(), const_cast<const char **>(values.data()), values.size());
+}
+
+// Sets an array of data of a specified GGUF type for a given key.
+void llama_gguf_file::llama_gguf_file_set_arr_data(const std::string & key, gguf_type type, const void * data,
+                                                   size_t n) {
+    if (!m_ctx) {
+        throw std::runtime_error("GGUF context not initialized.");
+    }
+    gguf_set_arr_data(m_ctx, key.c_str(), type, data, n);
+}
+
+// Gets a string value by key.
+std::string llama_gguf_file::llama_gguf_file_get_val_str(const std::string & key,
+                                                         const std::string & default_value) const {
+    if (!m_ctx) {
+        return default_value;
+    }
+    int64_t key_id = llama_gguf_file_find_key(key);
+    if (key_id == -1 || gguf_get_kv_type(m_ctx, key_id) != GGUF_TYPE_STRING) {
+        return default_value;
+    }
+    return gguf_get_val_str(m_ctx, key_id);
+}
+
+// Gets a uint64_t value by key.
+uint64_t llama_gguf_file::llama_gguf_file_get_val_u64(const std::string & key, uint64_t default_value) const {
+    if (!m_ctx) {
+        return default_value;
+    }
+    int64_t key_id = llama_gguf_file_find_key(key);
+    if (key_id == -1 || gguf_get_kv_type(m_ctx, key_id) != GGUF_TYPE_UINT64) {
+        return default_value;
+    }
+    return gguf_get_val_u64(m_ctx, key_id);
+}
+
+// --- Methods for working with tensors ---
+
+// Adds a ggml_tensor to the GGUF context.
+void llama_gguf_file::llama_gguf_file_add_tensor(struct ggml_tensor * tensor) {
+    if (!m_ctx) {
+        throw std::runtime_error("GGUF context not initialized.");
+    }
+    gguf_add_tensor(m_ctx, tensor);
+}
+
+// Sets the data for a tensor by its name.
+void llama_gguf_file::llama_gguf_file_set_tensor_data(const std::string & name, const void * data) {
+    if (!m_ctx) {
+        throw std::runtime_error("GGUF context not initialized.");
+    }
+    gguf_set_tensor_data(m_ctx, name.c_str(), data);
+}
+
+// Gets the number of tensors in the GGUF file.
+int64_t llama_gguf_file::llama_gguf_file_get_n_tensors() const {
+    if (!m_ctx) {
+        return 0;
+    }
+    return gguf_get_n_tensors(m_ctx);
+}
+
+// Gets the name of a tensor by index.
+std::string llama_gguf_file::llama_gguf_file_get_tensor_name(int64_t idx) const {
+    return gguf_get_tensor_name(m_ctx, idx);
+}
+
+// Gets the type of a tensor by index.
+enum ggml_type llama_gguf_file::llama_gguf_file_get_tensor_type(int64_t idx) const {
+    return gguf_get_tensor_type(m_ctx, idx);
+}
+
+// Gets the size of a tensor in bytes by index.
+size_t llama_gguf_file::llama_gguf_file_get_tensor_size(int64_t idx) const {
+    return gguf_get_tensor_size(m_ctx, idx);
+}
+
+// Writes the entire GGUF context (metadata and, optionally, tensor data) to a file.
+bool llama_gguf_file::llama_gguf_file_write_to_file(const std::string & output_path, bool only_meta) {
+    if (!m_ctx) {
+        std::cerr << "Error: GGUF context is not initialized. Cannot write to file." << std::endl;
+        return false;
+    }
+    if (!gguf_write_to_file(m_ctx, output_path.c_str(), only_meta)) {
+        std::cerr << "Error: Failed to write GGUF file to " << output_path << std::endl;
+        return false;
+    }
+    return true;
+}
+
+// Returns the underlying gguf_context.
+struct gguf_context * llama_gguf_file::get_gguf_context() {
+    return m_ctx;
+}
+
+// Finds a key by name; returns the key ID or -1 if not found.
+int64_t llama_gguf_file::llama_gguf_file_find_key(const std::string & key) const {
+    if (!m_ctx) {
+        return -1;
+    }
+    return gguf_find_key(m_ctx, key.c_str());
+}
diff --git a/tools/dataset-converter/dataset-to-gguf/llama-gguf-file.h b/tools/dataset-converter/dataset-to-gguf/llama-gguf-file.h
new file mode 100644
index 0000000000000..fc8abf0e9677b
--- /dev/null
+++ b/tools/dataset-converter/dataset-to-gguf/llama-gguf-file.h
@@ -0,0 +1,116 @@
+#pragma once
+
+#include <cstdint>   // For uint64_t, int64_t
+#include <stdexcept> // For std::runtime_error
+#include <string>    // For std::string
+#include <vector>    // For std::vector
+
+#include "ggml.h" // For struct ggml_tensor
+#include "gguf.h" // For struct gguf_context, enum gguf_type
+#include "llama.h"
+
+// Opaque type for the GGUF file handler.
+typedef struct llama_gguf_file llama_gguf_file_t;
+
+// Class for encapsulating GGUF file operations.
+// It manages the underlying gguf_context and provides a higher-level API
+// for setting metadata, adding tensors, and reading/writing files.
+struct llama_gguf_file {
+    // Default constructor: Initializes an empty GGUF context for writing.
+    llama_gguf_file();
+
+    // Constructor: Initializes a GGUF context from an existing file for reading.
+    // path: Path to the GGUF file to open.
+    // Throws std::runtime_error if the file cannot be opened or the context cannot be initialized.
+    llama_gguf_file(const std::string & path);
+
+    // Destructor: Frees the GGUF context.
+    ~llama_gguf_file();
+
+    // Checks if the GGUF context is initialized.
+    // Returns true if initialized, false otherwise.
+    bool llama_gguf_file_is_initialized() const;
+
+    // --- Methods for working with metadata (KV-pairs) ---
+
+    // Sets a string value for a given key.
+    // key: The metadata key.
+    // value: The string value to set.
+    void llama_gguf_file_set_val_str(const std::string & key, const std::string & value);
+
+    // Sets a uint64_t value for a given key.
+    // key: The metadata key.
+    // value: The uint64_t value to set.
+    void llama_gguf_file_set_val_u64(const std::string & key, uint64_t value);
+
+    // Sets an array of strings for a given key.
+    // key: The metadata key.
+    // values: A vector of C-style strings (const char *) to set.
+    void llama_gguf_file_set_arr_str(const std::string & key, const std::vector<const char *> & values);
+
+    // Sets an array of data of a specified GGUF type for a given key.
+    // key: The metadata key.
+    // type: The GGUF type of the data (e.g., GGUF_TYPE_INT32).
+    // data: Pointer to the data array.
+    // n: Number of elements in the data array.
+    void llama_gguf_file_set_arr_data(const std::string & key, gguf_type type, const void * data, size_t n);
+
+    // Gets a string value by key.
+    // key: The metadata key.
+    // default_value: The value to return if the key is not found or has a different type.
+    // Returns the string value or the default_value.
+    std::string llama_gguf_file_get_val_str(const std::string & key, const std::string & default_value = "") const;
+
+    // Gets a uint64_t value by key.
+    // key: The metadata key.
+    // default_value: The value to return if the key is not found or has a different type.
+    // Returns the uint64_t value or the default_value.
+    uint64_t llama_gguf_file_get_val_u64(const std::string & key, uint64_t default_value = 0) const;
+
+    // --- Methods for working with tensors ---
+
+    // Adds a ggml_tensor to the GGUF context.
+    // tensor: Pointer to the ggml_tensor to add.
+    void llama_gguf_file_add_tensor(struct ggml_tensor * tensor);
+
+    // Sets the data for a tensor by its name.
+    // name: The name of the tensor.
+    // data: Pointer to the tensor data.
+    void llama_gguf_file_set_tensor_data(const std::string & name, const void * data);
+
+    // Gets the number of tensors in the GGUF file.
+    // Returns the count of tensors.
+    int64_t llama_gguf_file_get_n_tensors(void) const;
+
+    // Gets the name of a tensor by index.
+    // idx: The index of the tensor.
+    // Returns the tensor name or an empty string if not found.
+    std::string llama_gguf_file_get_tensor_name(int64_t idx) const;
+
+    // Gets the type of a tensor by index.
+    // idx: The index of the tensor.
+    // Returns the ggml_type of the tensor.
+    enum ggml_type llama_gguf_file_get_tensor_type(int64_t idx) const;
+
+    // Gets the size of a tensor in bytes by index.
+    // idx: The index of the tensor.
+    // Returns the size of the tensor in bytes.
+    size_t llama_gguf_file_get_tensor_size(int64_t idx) const;
+
+    // --- Methods for saving/loading the file ---
+
+    // Writes the entire GGUF context to a file.
+    // output_path: Path to the output GGUF file.
+    // only_meta: If true, only metadata is written (no tensor data).
+    // Returns true on success, false on error.
+    bool llama_gguf_file_write_to_file(const std::string & output_path, bool only_meta);
+
+    // Returns the underlying gguf_context.
+    struct gguf_context * get_gguf_context();
+
+  private:
+    struct gguf_context * m_ctx; // The underlying GGUF context
+
+    // Private helper function to find a key by name.
+    // key: The key name to find.
+    // Returns the key ID or -1 if not found.
+    int64_t llama_gguf_file_find_key(const std::string & key) const;
+};
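+
+// Usage sketch (illustrative only; "data.gguf" is a hypothetical path):
+//
+//     llama_gguf_file out; // empty context for writing
+//     out.llama_gguf_file_set_val_str("training.format.version", "1.0");
+//     out.llama_gguf_file_write_to_file("data.gguf", /*only_meta=*/false);
+//
+//     llama_gguf_file in("data.gguf"); // read mode (no_alloc)
+//     std::string version = in.llama_gguf_file_get_val_str("training.format.version");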
" + "Cannot retrieve metadata for key '" + << key << "'." << std::endl; + return default_value; + } + return m_gguf_file_ptr->llama_gguf_file_get_val_str(key, default_value); +} + +// Gets a uint64_t metadata value by key. +uint64_t llama_gguf_reader::llama_gguf_reader_get_metadata_u64(const std::string & key, uint64_t default_value) const { + if (!llama_gguf_reader_is_initialized()) { + std::cerr << "Error (llama_gguf_reader::llama_gguf_reader_get_metadata_u64): GGUFReader is not initialized. " + "Cannot retrieve metadata for key '" + << key << "'." << std::endl; + return default_value; + } + return m_gguf_file_ptr->llama_gguf_file_get_val_u64(key, default_value); +} + +// Gets the number of tensors in the file. +int64_t llama_gguf_reader::llama_gguf_reader_get_tensor_count(void) const { + if (!llama_gguf_reader_is_initialized()) { + std::cerr << "Error (llama_gguf_reader::llama_gguf_reader_get_tensor_count): GGUFReader is not initialized. " + "Cannot get tensor count." + << std::endl; + return 0; + } + return m_gguf_file_ptr->llama_gguf_file_get_n_tensors(); +} + +// Gets the name of a tensor by index. +std::string llama_gguf_reader::llama_gguf_reader_get_tensor_name(int64_t index) const { + if (!llama_gguf_reader_is_initialized()) { + std::cerr << "Error (llama_gguf_reader::llama_gguf_reader_get_tensor_name): GGUFReader is not initialized. " + "Cannot get tensor name for index " + << index << "." << std::endl; + return ""; + } + return m_gguf_file_ptr->llama_gguf_file_get_tensor_name(index); +} + +// Gets the type of a tensor by index. +enum ggml_type llama_gguf_reader::llama_gguf_reader_get_tensor_type(int64_t index) const { + if (!llama_gguf_reader_is_initialized()) { + std::cerr << "Error (llama_gguf_reader::llama_gguf_reader_get_tensor_type): GGUFReader is not initialized. " + "Cannot get tensor type for index " + << index << "." << std::endl; + return GGML_TYPE_COUNT; // Unknown type + } + return m_gguf_file_ptr->llama_gguf_file_get_tensor_type(index); +} + +// Gets the size of a tensor in bytes by index. +size_t llama_gguf_reader::llama_gguf_reader_get_tensor_size(int64_t index) const { + if (!llama_gguf_reader_is_initialized()) { + std::cerr << "Error (llama_gguf_reader::llama_gguf_reader_get_tensor_size): GGUFReader is not initialized. " + "Cannot get tensor size for index " + << index << "." << std::endl; + return 0; + } + return m_gguf_file_ptr->llama_gguf_file_get_tensor_size(index); +} + +// Reads tensor data by index into a vector of llama_token. +bool llama_gguf_reader::llama_gguf_reader_read_tensor_data(int64_t index, std::vector & tokens) const { + if (!llama_gguf_reader_is_initialized()) { + std::cerr << "Error (GGUFReader::read_tensor_data): GGUFReader is not initialized. Cannot read tensor data." + << std::endl; + return false; + } + + struct gguf_context * ctx_internal = m_gguf_file_ptr->get_gguf_context(); + if (!ctx_internal) { + std::cerr << "Error (GGUFReader::read_tensor_data): Internal GGUF context is null in GGUFFile." << std::endl; + return false; + } + + if (index < 0 || index >= gguf_get_n_tensors(ctx_internal)) { + std::cerr << "Error (GGUFReader::read_tensor_data): Tensor with index " << index + << " not found or out of bounds." << std::endl; + return false; + } + + ggml_type tensor_ggml_type = gguf_get_tensor_type(ctx_internal, index); + if (tensor_ggml_type != GGML_TYPE_I32) { + std::cerr << "Error (GGUFReader::read_tensor_data): Tensor type for '" + << gguf_get_tensor_name(ctx_internal, index) + << "' is not GGML_TYPE_I32. 
Actual type: " << ggml_type_name(tensor_ggml_type) << std::endl; + return false; + } + + size_t expected_bytes = gguf_get_tensor_size(ctx_internal, index); + if (expected_bytes == 0) { + tokens.clear(); + return true; + } + + size_t num_tokens = expected_bytes / sizeof(llama_token); + if (expected_bytes % sizeof(llama_token) != 0) { + std::cerr << "Warning (GGUFReader::read_tensor_data): Tensor size " << expected_bytes + << " bytes is not a multiple of llama_token size (" << sizeof(llama_token) << " bytes) for tensor '" + << gguf_get_tensor_name(ctx_internal, index) << "'. Data might be corrupted." << std::endl; + } + + tokens.resize(num_tokens); + + size_t data_offset_in_file = gguf_get_data_offset(ctx_internal) + gguf_get_tensor_offset(ctx_internal, index); + + std::ifstream file(m_file_path, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Error (GGUFReader::read_tensor_data): Could not open GGUF file '" << m_file_path + << "' for reading tensor data." << std::endl; + return false; + } + + // Seek to the calculated offset + file.seekg(data_offset_in_file, std::ios::beg); + if (file.fail()) { + std::cerr << "Error (GGUFReader::read_tensor_data): Failed to seek to offset " << data_offset_in_file + << " in file '" << m_file_path << "'. Stream state: good=" << file.good() << " eof=" << file.eof() + << " fail=" << file.fail() << " bad=" << file.bad() << std::endl; + file.close(); + return false; + } + + // Read the tensor data into the tokens vector + file.read(reinterpret_cast(tokens.data()), expected_bytes); + + if (!file) { // Check if the read operation failed or reached EOF before reading all bytes + std::cerr << "Error (GGUFReader::read_tensor_data): Failed to read " << expected_bytes << " bytes for tensor '" + << gguf_get_tensor_name(ctx_internal, index) << "' from file '" << m_file_path << "'." << std::endl; + std::cerr << " Stream state after read: good=" << file.good() << " eof=" << file.eof() + << " fail=" << file.fail() << " bad=" << file.bad() << std::endl; + std::cerr << " Bytes actually read: " << file.gcount() << std::endl; + file.close(); + return false; + } + // Verify that the number of bytes read matches the expected bytes + if (file.gcount() != (std::streamsize) expected_bytes) { + std::cerr << "Error (GGUFReader::read_tensor_data): Mismatch in bytes read for tensor '" + << gguf_get_tensor_name(ctx_internal, index) << "'. Expected " << expected_bytes << ", but read " + << file.gcount() << "." << std::endl; + file.close(); + return false; + } + + file.close(); + return true; +} diff --git a/tools/dataset-converter/dataset-to-gguf/llama-gguf-reader.h b/tools/dataset-converter/dataset-to-gguf/llama-gguf-reader.h new file mode 100644 index 0000000000000..4a502691c2a14 --- /dev/null +++ b/tools/dataset-converter/dataset-to-gguf/llama-gguf-reader.h @@ -0,0 +1,55 @@ +#pragma once + +#include // For int64_t, uint64_t +#include // For std::unique_ptr +#include // For std::string +#include // For std::vector + +#include "ggml.h" // For ggml_type +#include "llama-gguf-file.h" // For llama_gguf_file_t +#include "llama.h" // For llama_token + +// Class for reading GGUF files, providing access to metadata and tensor data. +struct llama_gguf_reader { + // Constructor: Initializes the reader to read from the specified GGUF file. + // path: Path to the GGUF file. + // Throws std::runtime_error if the file cannot be opened or context cannot be initialized. + llama_gguf_reader(const std::string & path); + + // Destructor. 
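+
+// Note on the offset arithmetic in llama_gguf_reader_read_tensor_data (informal):
+// a GGUF file is laid out as [header | KV metadata | tensor infos | padding | data],
+// gguf_get_data_offset() returns the absolute start of the data section, and
+// gguf_get_tensor_offset() is relative to that section. For example, a tensor at
+// relative offset 128 in a file whose data section starts at byte 4096 is read
+// from absolute offset 4096 + 128 = 4224.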
diff --git a/tools/dataset-converter/dataset-to-gguf/llama-gguf-reader.h b/tools/dataset-converter/dataset-to-gguf/llama-gguf-reader.h
new file mode 100644
index 0000000000000..4a502691c2a14
--- /dev/null
+++ b/tools/dataset-converter/dataset-to-gguf/llama-gguf-reader.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#include <cstdint> // For int64_t, uint64_t
+#include <memory>  // For std::unique_ptr
+#include <string>  // For std::string
+#include <vector>  // For std::vector
+
+#include "ggml.h"            // For ggml_type
+#include "llama-gguf-file.h" // For llama_gguf_file_t
+#include "llama.h"           // For llama_token
+
+// Class for reading GGUF files, providing access to metadata and tensor data.
+struct llama_gguf_reader {
+    // Constructor: Initializes the reader to read from the specified GGUF file.
+    // path: Path to the GGUF file.
+    // Throws std::runtime_error if the file cannot be opened or the context cannot be initialized.
+    llama_gguf_reader(const std::string & path);
+
+    // Destructor.
+    ~llama_gguf_reader() = default;
+
+    // Checks if the reader has been successfully initialized.
+    bool llama_gguf_reader_is_initialized(void) const;
+
+    // Gets a string metadata value by key.
+    std::string llama_gguf_reader_get_metadata_str(const std::string & key,
+                                                   const std::string & default_value = "") const;
+
+    // Gets a uint64_t metadata value by key.
+    uint64_t llama_gguf_reader_get_metadata_u64(const std::string & key, uint64_t default_value = 0) const;
+
+    // Gets the number of tensors in the file.
+    int64_t llama_gguf_reader_get_tensor_count(void) const;
+
+    // Gets the name of a tensor by index.
+    std::string llama_gguf_reader_get_tensor_name(int64_t index) const;
+
+    // Gets the type of a tensor by index.
+    // Returns ggml_type.
+    enum ggml_type llama_gguf_reader_get_tensor_type(int64_t index) const;
+
+    // Gets the size of a tensor in bytes by index.
+    size_t llama_gguf_reader_get_tensor_size(int64_t index) const;
+
+    // Reads tensor data by index into a vector of llama_token.
+    // index: Index of the tensor.
+    // tokens: Vector the tokens are read into.
+    // Returns true on success, false on error (e.g., the tensor is not found,
+    // its type is not GGML_TYPE_I32, or the size does not match).
+    bool llama_gguf_reader_read_tensor_data(int64_t index, std::vector<llama_token> & tokens) const;
+
+  private:
+    std::unique_ptr<llama_gguf_file> m_gguf_file_ptr; // Pointer to the llama_gguf_file object
+    std::string                      m_file_path;     // Path to the file the GGUF data is read from
+};
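+
+// Usage sketch (illustrative only; "data.gguf" is a hypothetical path):
+//
+//     llama_gguf_reader reader("data.gguf");
+//     std::vector<llama_token> tokens;
+//     for (int64_t i = 0; i < reader.llama_gguf_reader_get_tensor_count(); ++i) {
+//         if (reader.llama_gguf_reader_read_tensor_data(i, tokens)) {
+//             // process the tokens of sequence i
+//         }
+//     }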
diff --git a/tools/dataset-converter/dataset-to-gguf/llama-gguf-writer.cpp b/tools/dataset-converter/dataset-to-gguf/llama-gguf-writer.cpp
new file mode 100644
index 0000000000000..599b0c137aa2e
--- /dev/null
+++ b/tools/dataset-converter/dataset-to-gguf/llama-gguf-writer.cpp
@@ -0,0 +1,115 @@
+#include "llama-gguf-writer.h"
+#include "llama-gguf-file.h"
+#include "llama.h" // For llama_model_get_vocab, llama_vocab_n_tokens, llama_vocab_get_text, llama_model_meta_val_str
+
+#include <cinttypes> // For PRIu64
+#include <cstdio>    // For snprintf
+#include <cstring>   // For memcpy
+#include <ctime>     // For time, gmtime, strftime
+#include <iostream>  // For std::cerr
+#include <stdexcept> // For std::runtime_error
+#include <vector>    // For std::vector
+
+// Constructor: takes a pointer to a llama_gguf_file object
+llama_gguf_writer::llama_gguf_writer(llama_gguf_file * m_gguf_file_ptr) : m_gguf_file(m_gguf_file_ptr) {
+    if (!m_gguf_file) {
+        throw std::runtime_error("llama_gguf_file pointer provided to llama_gguf_writer is null.");
+    }
+    if (!m_gguf_file->llama_gguf_file_is_initialized()) {
+        throw std::runtime_error("llama_gguf_file provided to llama_gguf_writer is not initialized.");
+    }
+}
+
+// Initializes the GGUF file metadata
+void llama_gguf_writer::llama_gguf_writer_init_metadata(const struct llama_model * model,
+                                                        const std::string & input_path, uint64_t sequence_count) {
+    if (!m_gguf_file) {
+        std::cerr << "Error: llama_gguf_file is not set. Cannot set metadata." << std::endl;
+        return;
+    }
+
+    m_gguf_file->llama_gguf_file_set_val_str("training.format.version", "1.0");
+    m_gguf_file->llama_gguf_file_set_val_str("training.dataset.name", input_path);
+
+    // Set file creation date
+    time_t now = time(0);
+    char   buf[sizeof "2011-10-08T07:07:09Z"];
+    strftime(buf, sizeof buf, "%Y-%m-%dT%H:%M:%SZ", gmtime(&now));
+    m_gguf_file->llama_gguf_file_set_val_str("training.file.creation_date", buf);
+
+    // Set tokenizer information
+    char arch_name_buffer[128];
+    int  res = llama_model_meta_val_str(model, "general.architecture", arch_name_buffer, sizeof(arch_name_buffer));
+    if (res >= 0) {
+        m_gguf_file->llama_gguf_file_set_val_str("training.tokenizer.gguf.model", arch_name_buffer);
+    } else {
+        m_gguf_file->llama_gguf_file_set_val_str("training.tokenizer.gguf.model", "unknown");
+    }
+
+    // Set tokenizer vocabulary
+    const struct llama_vocab * vocab      = llama_model_get_vocab(model);
+    int                        vocab_size = llama_vocab_n_tokens(vocab);
+    std::vector<const char *>  vocab_list;
+    vocab_list.reserve(vocab_size);
+    for (int i = 0; i < vocab_size; ++i) {
+        vocab_list.push_back(llama_vocab_get_text(vocab, i));
+    }
+    m_gguf_file->llama_gguf_file_set_arr_str("training.tokenizer.gguf.vocab", vocab_list);
+
+    // Set total sequence count
+    m_gguf_file->llama_gguf_file_set_val_u64("training.sequence.count", sequence_count);
+}
+
+// Adds a sequence of tokens to the GGUF file as a tensor
+void llama_gguf_writer::llama_gguf_writer_add_sequence_tensor(uint64_t index, const std::vector<llama_token> & tokens) {
+    if (!m_gguf_file) {
+        std::cerr << "Error: llama_gguf_file is not set. Cannot add sequence tensor." << std::endl;
+        return;
+    }
+
+    if (tokens.empty()) {
+        return;
+    }
+
+    char tensor_name[128];
+    snprintf(tensor_name, sizeof(tensor_name), "training.tensor.%" PRIu64, index);
+
+    // Allocate enough memory for a temporary ggml_context to hold the tensor
+    size_t n_tokens        = tokens.size();
+    size_t tensor_mem_size = ggml_tensor_overhead() + n_tokens * sizeof(int32_t);
+
+    struct ggml_init_params ggml_params = {};
+    ggml_params.mem_size   = tensor_mem_size;
+    ggml_params.mem_buffer = nullptr;
+    ggml_params.no_alloc   = false;
+
+    struct ggml_context * ggml_ctx = ggml_init(ggml_params);
+    if (!ggml_ctx) {
+        std::cerr << "Error: Failed to initialize ggml context for tensor " << index << std::endl;
+        return;
+    }
+
+    // Create a 1D tensor of type GGML_TYPE_I32
+    struct ggml_tensor * tensor = ggml_new_tensor_1d(ggml_ctx, GGML_TYPE_I32, n_tokens);
+    ggml_set_name(tensor, tensor_name);
+
+    // Copy token data to the tensor buffer
+    memcpy(tensor->data, tokens.data(), n_tokens * sizeof(int32_t));
+
+    // Add the tensor to the GGUF context via llama_gguf_file
+    m_gguf_file->llama_gguf_file_add_tensor(tensor);
+
+    // Set tensor data in the GGUF context via llama_gguf_file
+    m_gguf_file->llama_gguf_file_set_tensor_data(tensor_name, tokens.data());
+
+    ggml_free(ggml_ctx); // Free the temporary ggml context
+}
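+
+// Sizing note (informal): the temporary context above only needs to hold one
+// tensor header plus the token payload. ggml_tensor_overhead() is typically a
+// few hundred bytes, so a 1000-token sequence costs roughly
+// overhead + 1000 * sizeof(int32_t) = overhead + 4000 bytes, and the
+// per-sequence context stays cheap even for long inputs.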
+
+// Writes the entire GGUF context (metadata and tensors) to the specified file
+bool llama_gguf_writer::llama_gguf_writer_write_to_file(const std::string & output_path) {
+    if (!m_gguf_file) {
+        std::cerr << "Error: llama_gguf_file is not set. Cannot write to file." << std::endl;
+        return false;
+    }
+    return m_gguf_file->llama_gguf_file_write_to_file(output_path, false);
+}
diff --git a/tools/dataset-converter/dataset-to-gguf/llama-gguf-writer.h b/tools/dataset-converter/dataset-to-gguf/llama-gguf-writer.h
new file mode 100644
index 0000000000000..134d06c3091b7
--- /dev/null
+++ b/tools/dataset-converter/dataset-to-gguf/llama-gguf-writer.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <string> // For std::string
+#include <vector> // For std::vector
+
+#include "llama-gguf-file.h"
+#include "llama.h" // For llama_token
+
+// Class for encapsulating the GGUF file writing logic.
+// It uses llama_gguf_file for the low-level operations.
+struct llama_gguf_writer {
+    // Constructor, takes a pointer to a llama_gguf_file object.
+    // m_gguf_file: pointer to an initialized llama_gguf_file object,
+    // which will be used for writing.
+    llama_gguf_writer(llama_gguf_file * m_gguf_file);
+
+    // Destructor (does not free m_gguf_file, as it is managed externally).
+    ~llama_gguf_writer() = default;
+
+    // Initializes the GGUF file metadata.
+    // model: pointer to the loaded llama model, used to obtain tokenizer information.
+    // input_path: path to the input file, used for the dataset name.
+    // sequence_count: total number of sequences.
+    void llama_gguf_writer_init_metadata(const struct llama_model * model, const std::string & input_path,
+                                         uint64_t sequence_count);
+
+    // Adds a sequence of tokens to the GGUF file as a tensor.
+    // index: sequence index (used for the tensor name).
+    // tokens: vector of tokens representing the sequence.
+    void llama_gguf_writer_add_sequence_tensor(uint64_t index, const std::vector<llama_token> & tokens);
+
+    // Writes the entire GGUF context (metadata and tensors) to the specified file.
+    // output_path: path to the output GGUF file.
+    // Returns true on success, false on error.
+    bool llama_gguf_writer_write_to_file(const std::string & output_path);
+
+  private:
+    llama_gguf_file * m_gguf_file; // Pointer to the llama_gguf_file object
+};
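+
+// Usage sketch (illustrative only; it mirrors the CreateTestllama_gguf_file
+// helper in the tests, and assumes `model` is a loaded llama_model):
+//
+//     llama_gguf_file   file; // owned by the caller, outlives the writer
+//     llama_gguf_writer writer(&file);
+//
+//     writer.llama_gguf_writer_init_metadata(model, "input.txt", /*sequence_count=*/1);
+//     std::vector<llama_token> seq = { 1, 2, 3 };
+//     writer.llama_gguf_writer_add_sequence_tensor(0, seq);
+//     writer.llama_gguf_writer_write_to_file("out.gguf");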
diff --git a/tools/dataset-converter/dataset-to-gguf/tests/dataset-to-gguf-tests.cpp b/tools/dataset-converter/dataset-to-gguf/tests/dataset-to-gguf-tests.cpp
new file mode 100644
index 0000000000000..cf23d99666c8f
--- /dev/null
+++ b/tools/dataset-converter/dataset-to-gguf/tests/dataset-to-gguf-tests.cpp
@@ -0,0 +1,563 @@
+#include <cassert>    // For assert
+#include <filesystem> // For working with the file system (creating/deleting temporary files)
+#include <fstream>
+#include <iostream>   // For std::cerr
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "../llama-dataset-reader/llama-text-data-reader.h"
+#include "common.h"
+#include "dataset-to-gguf/llama-gguf-converter.h"
+#include "dataset-to-gguf/llama-gguf-reader.h"
+#include "dataset-to-gguf/llama-gguf-writer.h"
+#include "llama.h" // For llama_backend_init, llama_backend_free, llama_model_load_from_file, llama_model_free
+
+namespace fs = std::filesystem;
+
+// Global variables for tests requiring llama_model
+static llama_model * g_llama_model     = nullptr;
+static std::string   g_test_model_path = "../../gte-small.Q2_K.gguf"; // Specify the actual path to your model
+
+// Helper for assertions
+#define TEST_ASSERT(condition, message) \
+    do { \
+        if (!(condition)) { \
+            std::cerr << "Assertion failed: " << message << " (" << __FILE__ << ":" << __LINE__ << ")" << std::endl; \
+            return false; \
+        } \
+    } while (0)
+
+bool SetUpLlamaBackend();
+bool Testllama_gguf_file_DefaultConstructorInitializesContext();
+bool Testllama_gguf_file_ConstructorFromFileThrowsOnError();
+bool Testllama_gguf_file_SetAndGetMetadataString();
+bool Testllama_gguf_file_SetAndGetMetadataU64();
+bool Testllama_gguf_file_SetAndGetMetadataStringArray();
+bool CreateTestllama_gguf_file(const std::string & path, llama_model * model_ptr);
+bool Testllama_gguf_reader_ConstructorInitializesFromFile();
+bool Testllama_gguf_reader_GetMetadata();
+bool Testllama_gguf_reader_GetTensorCount();
+bool Testllama_gguf_reader_GetTensorNameAndTypeAndSize();
+bool Testllama_gguf_reader_ReadTensorData();
+bool Testllama_gguf_reader_ReadTensorDataInvalidIndex();
+bool TestTextDataReader_OpenFile();
+bool TestTextDataReader_ReadNextSequenceTextMode();
+bool TestTextDataReader_ReadNextSequencePreTokenizedMode();
+bool TestTextDataReader_ResetFunctionality();
+bool TestTextDataReader_GetTotalSequences();
+bool Testllama_gguf_converter_ConvertTextFileSuccess();
+void TearDownLlamaBackend();
+
+// Global setup for the llama.cpp backend
+bool SetUpLlamaBackend() {
+    llama_backend_init();
+    // Load the model for the tokenizer
+    llama_model_params model_params = llama_model_default_params();
+    g_llama_model = llama_model_load_from_file(g_test_model_path.c_str(), model_params);
+    if (g_llama_model == nullptr) {
+        std::cerr << "WARNING: Failed to load llama model for tests from " << g_test_model_path
+                  << ". Some tests may be skipped or fail." << std::endl;
+        // It's okay to continue if model loading fails, but tests relying on it will skip.
+    }
+    return true;
+}
+
+// Global teardown for the llama.cpp backend
+void TearDownLlamaBackend() {
+    if (g_llama_model) {
+        llama_model_free(g_llama_model);
+        g_llama_model = nullptr;
+    }
+    llama_backend_free();
+}
+
+// =============================================================================
+// Tests for llama_gguf_file
+// =============================================================================
+
+bool Testllama_gguf_file_DefaultConstructorInitializesContext() {
+    printf("  Testllama_gguf_file_DefaultConstructorInitializesContext\n");
+    llama_gguf_file gguf_file;
+    TEST_ASSERT(gguf_file.llama_gguf_file_is_initialized(),
+                "llama_gguf_file should be initialized by default constructor");
+    return true;
+}
+
+bool Testllama_gguf_file_ConstructorFromFileThrowsOnError() {
+    printf("  Testllama_gguf_file_ConstructorFromFileThrowsOnError\n");
+    bool threw_exception = false;
+    try {
+        llama_gguf_file("non_existent_file.gguf");
+    } catch (const std::runtime_error &) {
+        threw_exception = true;
+    }
+    TEST_ASSERT(threw_exception, "Constructor should throw for non-existent file");
+    return true;
+}
+
+bool Testllama_gguf_file_SetAndGetMetadataString() {
+    printf("  Testllama_gguf_file_SetAndGetMetadataString\n");
+    llama_gguf_file gguf_file;
+    gguf_file.llama_gguf_file_set_val_str("test.key.string", "test_value");
+    TEST_ASSERT(gguf_file.llama_gguf_file_get_val_str("test.key.string") == "test_value",
+                "Failed to get correct string value");
+    TEST_ASSERT(gguf_file.llama_gguf_file_get_val_str("non.existent.key", "default_value") == "default_value",
+                "Failed to get default string value");
+    return true;
+}
+
+bool Testllama_gguf_file_SetAndGetMetadataU64() {
+    printf("  Testllama_gguf_file_SetAndGetMetadataU64\n");
+    llama_gguf_file gguf_file;
+    gguf_file.llama_gguf_file_set_val_u64("test.key.u64", 12345ULL);
+    TEST_ASSERT(gguf_file.llama_gguf_file_get_val_u64("test.key.u64") == 12345ULL, "Failed to get correct u64 value");
+    TEST_ASSERT(gguf_file.llama_gguf_file_get_val_u64("non.existent.key.u64", 99ULL) == 99ULL,
+                "Failed to get default u64 value");
+    return true;
+}
+
+bool Testllama_gguf_file_SetAndGetMetadataStringArray() {
+    printf("  Testllama_gguf_file_SetAndGetMetadataStringArray\n");
+    llama_gguf_file gguf_file;
+    std::vector<const char *> arr = { "val1", "val2", "val3" };
+    gguf_file.llama_gguf_file_set_arr_str("test.key.array_str", arr);
+    // As noted before, verifying the array content would require more complex logic to read the GGUF file back.
+    // For now, we assert that the operation does not crash.
+    return true;
+}
+
+// =============================================================================
+// Tests for llama_gguf_reader
+// =============================================================================
+
+// Helper to create a temporary GGUF file for llama_gguf_reader tests
+bool CreateTestllama_gguf_file(const std::string & path, llama_model * model_ptr) {
+    llama_gguf_file   writer_file;
+    llama_gguf_writer writer(&writer_file);
+
+    writer.llama_gguf_writer_init_metadata(model_ptr, "dummy_input.txt", 2); // 2 sequences
+
+    std::vector<llama_token> seq1 = { 1, 2, 3, 4, 5 };
+    std::vector<llama_token> seq2 = { 10, 20, 30 };
+    writer.llama_gguf_writer_add_sequence_tensor(0, seq1);
+    writer.llama_gguf_writer_add_sequence_tensor(1, seq2);
+
+    return writer.llama_gguf_writer_write_to_file(path);
+}
+
+bool Testllama_gguf_reader_ConstructorInitializesFromFile() {
+    printf("  Testllama_gguf_reader_ConstructorInitializesFromFile\n");
+    std::string test_gguf_path = "test_output_reader.gguf";
+    TEST_ASSERT(CreateTestllama_gguf_file(test_gguf_path, g_llama_model),
+                "Failed to create test GGUF file for reader test");
+
+    llama_gguf_reader reader(test_gguf_path);
+    TEST_ASSERT(reader.llama_gguf_reader_is_initialized(), "llama_gguf_reader should be initialized from file");
+    fs::remove(test_gguf_path);
+    return true;
+}
+
+bool Testllama_gguf_reader_GetMetadata() {
+    printf("  Testllama_gguf_reader_GetMetadata\n");
+    std::string test_gguf_path = "test_output_reader_meta.gguf";
+    TEST_ASSERT(CreateTestllama_gguf_file(test_gguf_path, g_llama_model),
+                "Failed to create test GGUF file for reader meta test");
+
+    llama_gguf_reader reader(test_gguf_path);
+    TEST_ASSERT(reader.llama_gguf_reader_get_metadata_str("training.dataset.name") == "dummy_input.txt",
+                "Incorrect dataset name");
+    TEST_ASSERT(reader.llama_gguf_reader_get_metadata_u64("training.sequence.count") == 2ULL,
+                "Incorrect sequence count");
+    // The tokenizer model name might vary, so just check it is not empty/default if the model was loaded
+    if (g_llama_model) {
+        TEST_ASSERT(reader.llama_gguf_reader_get_metadata_str("training.tokenizer.gguf.model", "default") != "default",
+                    "Tokenizer model name should not be default");
+    }
+    fs::remove(test_gguf_path);
+    return true;
+}
+
+bool Testllama_gguf_reader_GetTensorCount() {
+    printf("  Testllama_gguf_reader_GetTensorCount\n");
+    std::string test_gguf_path = "test_output_reader_count.gguf";
+    TEST_ASSERT(CreateTestllama_gguf_file(test_gguf_path, g_llama_model),
+                "Failed to create test GGUF file for reader count test");
+
+    llama_gguf_reader reader(test_gguf_path);
+    TEST_ASSERT(reader.llama_gguf_reader_get_tensor_count() == 2, "Incorrect tensor count");
+    fs::remove(test_gguf_path);
+    return true;
+}
+
+bool Testllama_gguf_reader_GetTensorNameAndTypeAndSize() {
+    printf("  Testllama_gguf_reader_GetTensorNameAndTypeAndSize\n");
+    std::string test_gguf_path = "test_output_reader_tensor_info.gguf";
+    TEST_ASSERT(CreateTestllama_gguf_file(test_gguf_path, g_llama_model),
+                "Failed to create test GGUF file for reader tensor info test");
+
+    llama_gguf_reader reader(test_gguf_path);
+    TEST_ASSERT(reader.llama_gguf_reader_get_tensor_name(0) == "training.tensor.0",
+                "Incorrect tensor name for index 0");
+    TEST_ASSERT(reader.llama_gguf_reader_get_tensor_type(0) == GGML_TYPE_I32, "Incorrect tensor type for index 0");
+    TEST_ASSERT(reader.llama_gguf_reader_get_tensor_size(0) == 5 * sizeof(llama_token),
+                "Incorrect tensor size for index 0");
+
+    TEST_ASSERT(reader.llama_gguf_reader_get_tensor_name(1) == "training.tensor.1",
+                "Incorrect tensor name for index 1");
+    TEST_ASSERT(reader.llama_gguf_reader_get_tensor_type(1) == GGML_TYPE_I32, "Incorrect tensor type for index 1");
+    TEST_ASSERT(reader.llama_gguf_reader_get_tensor_size(1) == 3 * sizeof(llama_token),
+                "Incorrect tensor size for index 1");
+    fs::remove(test_gguf_path);
+    return true;
+}
+
+bool Testllama_gguf_reader_ReadTensorData() {
+    printf("  Testllama_gguf_reader_ReadTensorData\n");
+    std::string test_gguf_path = "test_output_reader_data.gguf";
+    TEST_ASSERT(CreateTestllama_gguf_file(test_gguf_path, g_llama_model),
+                "Failed to create test GGUF file for reader data test");
+
+    llama_gguf_reader reader(test_gguf_path);
+    std::vector<llama_token> tokens;
+
+    // Read the first sequence
+    TEST_ASSERT(reader.llama_gguf_reader_read_tensor_data(0, tokens), "Failed to read tensor data for index 0");
+    TEST_ASSERT(tokens.size() == 5, "Incorrect token count for index 0");
+    TEST_ASSERT(tokens[0] == 1, "Incorrect token value at index 0, pos 0");
+    TEST_ASSERT(tokens[4] == 5, "Incorrect token value at index 0, pos 4");
+
+    // Read the second sequence
+    TEST_ASSERT(reader.llama_gguf_reader_read_tensor_data(1, tokens), "Failed to read tensor data for index 1");
+    TEST_ASSERT(tokens.size() == 3, "Incorrect token count for index 1");
+    TEST_ASSERT(tokens[0] == 10, "Incorrect token value at index 1, pos 0");
+    TEST_ASSERT(tokens[2] == 30, "Incorrect token value at index 1, pos 2");
+    fs::remove(test_gguf_path);
+    return true;
+}
+
+bool Testllama_gguf_reader_ReadTensorDataInvalidIndex() {
+    printf("  Testllama_gguf_reader_ReadTensorDataInvalidIndex\n");
+    std::string test_gguf_path = "test_output_reader_invalid_idx.gguf";
+    TEST_ASSERT(CreateTestllama_gguf_file(test_gguf_path, g_llama_model),
+                "Failed to create test GGUF file for reader invalid index test");
+
+    llama_gguf_reader reader(test_gguf_path);
+    std::vector<llama_token> tokens;
+    TEST_ASSERT(!reader.llama_gguf_reader_read_tensor_data(99, tokens), "Reading invalid index should fail");
+    fs::remove(test_gguf_path);
+    return true;
+}
+
+// =============================================================================
+// Tests for TextDataReader
+// =============================================================================
+
+// Helper to set up TextDataReader test files
+struct TextDataReaderTestFixture {
+    std::string   test_text_file         = "test_input.txt";
+    std::string   test_pretokenized_file = "test_pretokenized.txt";
+    llama_model * model_for_reader_test  = nullptr;
+
+    TextDataReaderTestFixture(llama_model * model) : model_for_reader_test(model) {
+        // Create the test text file
+        std::ofstream ofs(test_text_file);
+        ofs << "Hello world\n";
+        ofs << "This is a test line.\n";
+        ofs << "\n"; // Empty line
+        ofs << "Another line";
+        ofs.close();
+
+        // Create the test pre-tokenized file
+        std::ofstream ofs_pretokenized(test_pretokenized_file);
+        ofs_pretokenized << "101 200 300 102\n";
+        ofs_pretokenized << "500 600\n";
+        ofs_pretokenized << "\n"; // Empty line
+        ofs_pretokenized << "700";
+        ofs_pretokenized.close();
+    }
+
+    ~TextDataReaderTestFixture() {
+        fs::remove(test_text_file);
+        fs::remove(test_pretokenized_file);
+    }
+};
+
+bool TestTextDataReader_OpenFile() {
+    printf("  TestTextDataReader_OpenFile\n");
+    TextDataReaderTestFixture fixture(g_llama_model);
+    llama_text_dataset_reader reader(fixture.model_for_reader_test, 128, false);
+    TEST_ASSERT(reader.open(fixture.test_text_file), "Failed to open valid text file");
+    reader.close();
+    TEST_ASSERT(!reader.open("non_existent.txt"), "Opened non-existent file unexpectedly");
+    return true;
+}
+
+bool TestTextDataReader_ReadNextSequenceTextMode() {
+    printf("  TestTextDataReader_ReadNextSequenceTextMode\n");
+    if (g_llama_model == nullptr) {
+        printf("  Skipping: Llama model not loaded.\n");
+        return true; // Skip the test gracefully
+    }
+
+    TextDataReaderTestFixture fixture(g_llama_model);
+    llama_text_dataset_reader reader(fixture.model_for_reader_test, 128, false);
+    TEST_ASSERT(reader.open(fixture.test_text_file), "Failed to open text file for read test");
+
+    std::vector<llama_token> tokens;
+
+    // Read "Hello world"
+    TEST_ASSERT(reader.read_next_sequence(tokens), "Failed to read first sequence");
+    TEST_ASSERT(!tokens.empty(), "First sequence should not be empty");
+
+    // Read "This is a test line."
+    TEST_ASSERT(reader.read_next_sequence(tokens), "Failed to read second sequence");
+    TEST_ASSERT(!tokens.empty(), "Second sequence should not be empty");
+
+    // Read the empty line
+    TEST_ASSERT(reader.read_next_sequence(tokens), "Failed to read empty line");
+    TEST_ASSERT(tokens.empty(), "Empty line should result in 0 tokens");
+
+    // Read "Another line"
+    TEST_ASSERT(reader.read_next_sequence(tokens), "Failed to read third sequence");
+    TEST_ASSERT(!tokens.empty(), "Third sequence should not be empty");
+
+    // End of file
+    TEST_ASSERT(!reader.read_next_sequence(tokens), "Should be end of file");
+    reader.close();
+    return true;
+}
+
+bool TestTextDataReader_ReadNextSequencePreTokenizedMode() {
+    printf("  TestTextDataReader_ReadNextSequencePreTokenizedMode\n");
+    TextDataReaderTestFixture fixture(g_llama_model);
+    llama_text_dataset_reader reader(fixture.model_for_reader_test, 128, true);
+    TEST_ASSERT(reader.open(fixture.test_pretokenized_file), "Failed to open pre-tokenized file for read test");
+
+    std::vector<llama_token> tokens;
+
+    // Read "101 200 300 102"
+    TEST_ASSERT(reader.read_next_sequence(tokens), "Failed to read first pre-tokenized sequence");
+    TEST_ASSERT(tokens.size() == 4, "Incorrect token count for first pre-tokenized sequence");
+    TEST_ASSERT(tokens[0] == 101, "Incorrect token value for first pre-tokenized sequence");
+    TEST_ASSERT(tokens[1] == 200, "Incorrect token value for first pre-tokenized sequence");
+
+    // Read "500 600"
+    TEST_ASSERT(reader.read_next_sequence(tokens), "Failed to read second pre-tokenized sequence");
+    TEST_ASSERT(tokens.size() == 2, "Incorrect token count for second pre-tokenized sequence");
+    TEST_ASSERT(tokens[0] == 500, "Incorrect token value for second pre-tokenized sequence");
+
+    // Read the empty line
+    TEST_ASSERT(reader.read_next_sequence(tokens), "Failed to read empty pre-tokenized line");
+    TEST_ASSERT(tokens.empty(), "Empty pre-tokenized line should result in 0 tokens");
+
+    // Read "700"
+    TEST_ASSERT(reader.read_next_sequence(tokens), "Failed to read third pre-tokenized sequence");
+    TEST_ASSERT(tokens.size() == 1, "Incorrect token count for third pre-tokenized sequence");
+    TEST_ASSERT(tokens[0] == 700, "Incorrect token value for third pre-tokenized sequence");
+
+    // End of file
+    TEST_ASSERT(!reader.read_next_sequence(tokens), "Should be end of pre-tokenized file");
+    reader.close();
+    return true;
+}
+
+bool TestTextDataReader_ResetFunctionality() {
+    printf("  TestTextDataReader_ResetFunctionality\n");
+    TextDataReaderTestFixture fixture(g_llama_model);
+    llama_text_dataset_reader reader(fixture.model_for_reader_test, 128, false);
+    TEST_ASSERT(reader.open(fixture.test_text_file), "Failed to open text file for reset test");
+
+    std::vector<llama_token> tokens;
+    reader.read_next_sequence(tokens); // Read one line
+    reader.read_next_sequence(tokens); // Read another line
+
+    TEST_ASSERT(reader.reset(), "Failed to reset reader"); // Reset to the beginning
+
+    // Should read the first line again
+    TEST_ASSERT(reader.read_next_sequence(tokens), "Failed to read first sequence after reset");
+    // (Add a specific token check if the expected tokens for "Hello world" are known)
+    reader.close();
+    return true;
+}
+
+bool TestTextDataReader_GetTotalSequences() {
+    printf("  TestTextDataReader_GetTotalSequences\n");
+    TextDataReaderTestFixture fixture(g_llama_model);
+
+    llama_text_dataset_reader reader_text(fixture.model_for_reader_test, 128, false);
+    TEST_ASSERT(reader_text.open(fixture.test_text_file), "Failed to open text file for total sequences test");
+    TEST_ASSERT(reader_text.total_sequences() == 4,
+                "Incorrect total sequence count for text file"); // 4 lines in test_input.txt
+    reader_text.close();
+
+    llama_text_dataset_reader reader_pretokenized(fixture.model_for_reader_test, 128, true);
+    TEST_ASSERT(reader_pretokenized.open(fixture.test_pretokenized_file),
+                "Failed to open pre-tokenized file for total sequences test");
+    TEST_ASSERT(reader_pretokenized.total_sequences() == 4,
+                "Incorrect total sequence count for pre-tokenized file"); // 4 lines in test_pretokenized.txt
+    reader_pretokenized.close();
+    return true;
+}
+
+// =============================================================================
+// Tests for llama_gguf_converter (integration)
+// =============================================================================
+
+// Helper to set up llama_gguf_converter test files
+struct llama_gguf_converterTestFixture {
+    std::string   input_text_file  = "converter_input.txt";
+    std::string   output_gguf_file = "converter_output.gguf";
+    llama_model * model_for_converter_test = nullptr;
+
+    llama_gguf_converterTestFixture(llama_model * model) : model_for_converter_test(model) {
+        // Create the test text file
+        std::ofstream ofs(input_text_file);
+        ofs << "The quick brown fox jumps over the lazy dog.\n";
+        ofs << "Hello, GGUF conversion!\n";
+        ofs.close();
+    }
+
+    ~llama_gguf_converterTestFixture() {
+        fs::remove(input_text_file);
+        fs::remove(output_gguf_file);
+    }
+};
+
+bool Testllama_gguf_converter_ConvertTextFileSuccess() {
+    printf("  Testllama_gguf_converter_ConvertTextFileSuccess\n");
+    if (g_llama_model == nullptr) {
+        printf("  Skipping: Llama model not loaded.\n");
+        return true; // Skip the test gracefully
+    }
+
+    llama_gguf_converterTestFixture fixture(g_llama_model);
+
+    common_params params;
+    params.in_files.push_back(fixture.input_text_file);
+    params.out_file       = fixture.output_gguf_file;
+    params.max_seq_len    = 128;
+    params.pre_tokenized  = false;
+    params.dataset_format = "text";
+    params.dataset_column = "data"; // Not used for text, but set for completeness
+
+    llama_gguf_converter converter;
+    TEST_ASSERT(converter.llama_gguf_converter_convert(params, g_llama_model), "GGUF conversion failed");
+
+    // Verify the file was created
+    TEST_ASSERT(fs::exists(fixture.output_gguf_file), "Output GGUF file was not created");
+
+    // Verify the GGUF file content using llama_gguf_reader
+    llama_gguf_reader reader(fixture.output_gguf_file);
+    TEST_ASSERT(reader.llama_gguf_reader_is_initialized(), "llama_gguf_reader failed to initialize for verification");
+    TEST_ASSERT(reader.llama_gguf_reader_get_metadata_u64("training.sequence.count") == 2ULL,
+                "Incorrect sequence count in GGUF metadata");
+    TEST_ASSERT(reader.llama_gguf_reader_get_tensor_count() == 2, "Incorrect tensor count in GGUF file");
+
+    std::vector<llama_token> tokens;
+    TEST_ASSERT(reader.llama_gguf_reader_read_tensor_data(0, tokens), "Failed to read first tensor data");
+    TEST_ASSERT(!tokens.empty(), "First sequence should not be empty");
+
+    TEST_ASSERT(reader.llama_gguf_reader_read_tensor_data(1, tokens), "Failed to read second tensor data");
+    TEST_ASSERT(!tokens.empty(), "Second sequence should not be empty");
+    return true;
+}
+
+// =============================================================================
+// Main function to run all tests
+// =============================================================================
+
+int main() {
+    printf("Running dataset-to-gguf tests...\n\n");
+
+    // Global setup for the llama.cpp backend
+    if (!SetUpLlamaBackend()) {
+        printf("Global setup failed. Exiting tests.\n");
+        return 1;
+    }
+
+    int failed_tests = 0;
+
+    // Run llama_gguf_file tests
+    printf("--- llama_gguf_file Tests ---\n");
+    if (!Testllama_gguf_file_DefaultConstructorInitializesContext()) {
+        failed_tests++;
+    }
+    if (!Testllama_gguf_file_ConstructorFromFileThrowsOnError()) {
+        failed_tests++;
+    }
+    if (!Testllama_gguf_file_SetAndGetMetadataString()) {
+        failed_tests++;
+    }
+    if (!Testllama_gguf_file_SetAndGetMetadataU64()) {
+        failed_tests++;
+    }
+    if (!Testllama_gguf_file_SetAndGetMetadataStringArray()) {
+        failed_tests++;
+    }
+    printf("\n");
+
+    // Run llama_gguf_reader tests
+    printf("--- llama_gguf_reader Tests ---\n");
+    if (!Testllama_gguf_reader_ConstructorInitializesFromFile()) {
+        failed_tests++;
+    }
+    if (!Testllama_gguf_reader_GetMetadata()) {
+        failed_tests++;
+    }
+    if (!Testllama_gguf_reader_GetTensorCount()) {
+        failed_tests++;
+    }
+    if (!Testllama_gguf_reader_GetTensorNameAndTypeAndSize()) {
+        failed_tests++;
+    }
+    if (!Testllama_gguf_reader_ReadTensorData()) {
+        failed_tests++;
+    }
+    if (!Testllama_gguf_reader_ReadTensorDataInvalidIndex()) {
+        failed_tests++;
+    }
+    printf("\n");
+
+    // Run TextDataReader tests
+    printf("--- TextDataReader Tests ---\n");
+    if (!TestTextDataReader_OpenFile()) {
+        failed_tests++;
+    }
+    if (!TestTextDataReader_ReadNextSequenceTextMode()) {
+        failed_tests++;
+    }
+    if (!TestTextDataReader_ReadNextSequencePreTokenizedMode()) {
+        failed_tests++;
+    }
+    if (!TestTextDataReader_ResetFunctionality()) {
+        failed_tests++;
+    }
+    if (!TestTextDataReader_GetTotalSequences()) {
+        failed_tests++;
+    }
+    printf("\n");
+
+    // Run llama_gguf_converter integration tests
+    printf("--- llama_gguf_converter Tests ---\n");
+    if (!Testllama_gguf_converter_ConvertTextFileSuccess()) {
+        failed_tests++;
+    }
+    printf("\n");
+
+    // Add ParquetDataReader tests here once test files and logic are available
+    // printf("--- ParquetDataReader Tests ---\n");
+    // if (!TestParquetDataReader_OpenFile()) failed_tests++;
+    // ...
+
+    // Global teardown for the llama.cpp backend
+    TearDownLlamaBackend();
+
+    if (failed_tests == 0) {
+        printf("All tests passed!\n");
+        return 0;
+    }
+    printf("%d tests failed.\n", failed_tests);
+    return 1;
+}
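+
+// Build/run note (informal; the exact target name and binary path depend on the
+// CMake setup in this patch and are not verified here):
+//
+//     cmake -B build && cmake --build build
+//     ./build/tools/dataset-converter/dataset-to-gguf-tests
+//
+// Tests that need a tokenizer skip themselves when the model at
+// g_test_model_path cannot be loaded.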