tool: add conversion of text/parquet to custom format #14622
=============================================================
**common/arg.cpp**

```diff
@@ -1477,7 +1477,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_chunks = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
         {"-fa", "--flash-attn"},
         string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
@@ -1539,7 +1539,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
             params.in_files.push_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"-bf", "--binary-file"}, "FNAME",
         "binary file containing the prompt (default: none)",
@@ -2609,9 +2609,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-o", "--output", "--output-file"}, "FNAME",
         string_format("output file (default: '%s')", params.out_file.c_str()),
         [](common_params & params, const std::string & value) {
-            params.out_file = value;
+            params.out_file = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -3423,5 +3423,45 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
+    add_opt(common_arg(
+        {"--dataset-format"}, " ",
+        string_format("type of input data (e.g., 'text', 'parquet') (default: %s)", params.dataset_format.c_str()),
+        [](common_params & params, const std::string & format) {
+            params.dataset_format = format; //TODO ENUM CLASS
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+
+    add_opt(common_arg(
+        {"--max-seq-len"}, " ",
+        string_format("max sequence length (default: %d)", params.max_seq_len),
+        [](common_params & params, int32_t max_seq_len) {
+            params.max_seq_len = max_seq_len;
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+
+    add_opt(common_arg(
+        {"--pre-tokenized"},
+        string_format("input file contains pre-tokenized data (space-separated token IDs)"),
+        [](common_params & params) {
+            params.pre_tokenized = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
```
> **Review comment on lines +3442 to +3448:** Is this intended for the training or the data conversion code?
>
> **Author:** `--max-seq-len` and `--pre-tokenized` are for conversion only. OK, I will move them to a new converter params parser.
```diff
+    add_opt(common_arg(
+        {"--preview"},
+        string_format("read and print metadata and first sequence from the output GGUF file (enables preview)"),
+        [](common_params & params) {
+            params.do_preview = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+
+    add_opt(common_arg(
+        {"--dataset-column"}, "<name>",
+        string_format("column name for data in dataset files"),
+        [](common_params & params, const std::string & dataset_column) {
+            params.dataset_column = dataset_column;
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+
     return ctx_arg;
 }
```
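For context, the lambdas in the new options write to fields that this PR adds to `common_params`. A hedged sketch of what the diff implies (field names are taken from the handlers above; the defaults are inferred from the help strings and README, and the struct name here is hypothetical):

```cpp
#include <cstdint>
#include <string>

// Sketch of the common_params fields the new options assume (illustrative only).
struct common_params_dataset_ext {
    std::string dataset_format = "text"; // --dataset-format: "text" or "parquet"
    int32_t     max_seq_len    = 2048;   // --max-seq-len: truncation length in tokens
    bool        pre_tokenized  = false;  // --pre-tokenized: input is space-separated token IDs
    bool        do_preview     = false;  // --preview: dump metadata + first sequence after writing
    std::string dataset_column;          // --dataset-column: column holding the data
};
```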
**CMakeLists.txt** (new file)

```cmake
include_directories(. ../../common)

if(LLAMA_PARQUET)
    find_package(Arrow REQUIRED)
    find_package(Parquet REQUIRED)
endif()

add_library(dataset-to-gguf-lib STATIC
    dataset-to-gguf/llama-gguf-writer.cpp
    dataset-to-gguf/llama-gguf-file.cpp
    dataset-to-gguf/llama-dataset-reader/llama-text-data-reader.cpp
    dataset-to-gguf/llama-gguf-converter.cpp
    dataset-to-gguf/llama-gguf-reader.cpp
    dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.cpp
)

target_compile_features(dataset-to-gguf-lib PRIVATE cxx_std_17)

target_link_libraries(dataset-to-gguf-lib common llama ${CMAKE_THREAD_LIBS_INIT})
if(LLAMA_PARQUET)
    target_link_libraries(dataset-to-gguf-lib Arrow::arrow_shared Parquet::parquet_shared)
endif()

add_executable(convert-to-train-gguf convert-to-train-gguf.cpp)
add_dependencies(convert-to-train-gguf dataset-to-gguf-lib)
target_link_libraries(convert-to-train-gguf PRIVATE dataset-to-gguf-lib)

set(TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS dataset-to-gguf-unit-tests)
add_executable(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} dataset-to-gguf/tests/dataset-to-gguf-tests.cpp)
add_dependencies(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} dataset-to-gguf-lib)
target_link_libraries(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} PRIVATE common llama dataset-to-gguf-lib)
# add_test() has no LABEL argument; labels are attached as a test property
add_test(
    NAME ${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS}
    COMMAND $<TARGET_FILE:${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS}>
)
set_tests_properties(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} PROPERTIES LABELS "training")
```
**README** (new file)
`convert-to-train-gguf` Utility
===============================

This utility converts text datasets (or pre-tokenized data) into the GGUF format, optimized for training models in `llama.cpp`.

Features
--------

* **Two-pass processing**: Efficiently handles large datasets that do not fit entirely into RAM, performing a first pass to collect metadata and a second pass to write the actual tensor data (see the sketch after this list).

* **Flexible input**: Supports reading both raw text (with subsequent tokenization using a provided model) and pre-tokenized data (space-separated token IDs).

* **Modular architecture**: The code is divided into several classes (`llama_gguf_file`, `llama_gguf_writer`, `llama_dataset_reader`, `llama_text_dataset_reader`, `llama_gguf_converter`, `llama_gguf_reader`) to improve modularity, extensibility, and testability.

* **Preview functionality**: Lets you view the metadata and first few sequences of the generated GGUF file, with optional detokenization.
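The two-pass idea in the first bullet can be sketched as follows. This is illustrative only: the one-sequence-per-line layout and the toy whitespace "tokenizer" are assumptions for the sketch, not the converter's actual `llama_gguf_converter` logic.

```cpp
#include <cstdint>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// toy stand-in for tokenization: counts whitespace-separated items on a line
static int64_t count_tokens(const std::string & line) {
    std::istringstream ss(line);
    std::string tok;
    int64_t n = 0;
    while (ss >> tok) {
        n++;
    }
    return n;
}

int main() {
    const char * path = "input.txt"; // assumed input: one sequence per line

    // pass 1: collect metadata only (sequence count, per-sequence lengths),
    // so the full dataset never has to be resident in RAM
    std::vector<int64_t> lengths;
    {
        std::ifstream in(path);
        for (std::string line; std::getline(in, line); ) {
            lengths.push_back(count_tokens(line));
        }
    }
    std::cout << "training.sequence.count = " << lengths.size() << "\n";

    // pass 2: re-read the input and emit one tensor per sequence
    // (here we only report the shape that would be written)
    std::ifstream in(path);
    uint64_t i = 0;
    for (std::string line; std::getline(in, line); i++) {
        std::cout << "training.tensor." << i << " shape = [" << lengths[i] << "]\n";
    }
    return 0;
}
```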
GGUF Structure for Training Data
--------------------------------

The generated GGUF files follow a specific structure for training data:

* **Metadata (KV pairs)**: All metadata keys are prefixed with `training.` to avoid conflicts with model metadata.

  * `training.format.version`: `string` (e.g., "1.0") - Specification version.
> **Review comment:** Preferably use an integer for the version. If you want to specify minor versions, my suggestion is to either specify multiple integers or to do what NVIDIA does and specify e.g. v12.34 as something like […]
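For reference, the NVIDIA convention the reviewer alludes to packs the whole version into one integer; CUDA, for example, defines `CUDA_VERSION` as `major * 1000 + minor * 10` (so 12.4 becomes 12040). A small illustration of the idea:

```cpp
#include <cstdio>

// CUDA-style packing: major * 1000 + minor * 10 (e.g., 12.4 -> 12040)
constexpr int pack_version(int major, int minor) {
    return major * 1000 + minor * 10;
}

int main() {
    std::printf("%d\n", pack_version(1, 0)); // a "1.0" format version packs to 1000
    return 0;
}
```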
  * `training.dataset.name`: `string` (optional) - Dataset name (e.g., "OpenWebText-ru").

  * `training.dataset.source`: `string` (optional) - URL or description of the data source.

  * `training.file.creation_date`: `string` (ISO 8601) - File creation date.

  * `training.tokenizer.gguf.model`: `string` - Tokenizer model name (e.g., "llama", "gpt2", "bert").

  * `training.tokenizer.gguf.vocab`: `array[string]` - Tokenizer vocabulary.

  * `training.tokenizer.gguf.merges`: `array[string]` (optional) - Tokenizer merges (for BPE).

  * `training.tokenizer.gguf.pre`: `string` (optional) - Architecture of the pre-tokenizer.

  * `training.sequence.count`: `uint64` - Total number of sequences in the file.
* **Tensors**: Each training sequence is stored as a separate tensor.

  * **Naming**: `training.tensor.{index}` (e.g., `training.tensor.0`, `training.tensor.1`, ...). No leading zeros.

  * **Data type**: `GGML_TYPE_I32` (standard for tokens in `llama.cpp`).

  * **Shape**: `[sequence_length]` - A one-dimensional array. `sequence_length` will vary for each tensor and can be obtained from the tensor's shape.
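One way to verify this layout is to walk the generated file with ggml's GGUF API. A minimal sketch, assuming the `gguf.h` header from a recent `llama.cpp` tree and the key/tensor names defined above:

```cpp
#include "gguf.h"

#include <cstdint>
#include <cstdio>

int main() {
    struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file("output.gguf", params);
    if (ctx == nullptr) {
        std::fprintf(stderr, "failed to open output.gguf\n");
        return 1;
    }

    // metadata: all training keys are under the "training." prefix
    const int64_t kid = gguf_find_key(ctx, "training.sequence.count");
    if (kid >= 0) {
        std::printf("sequences: %llu\n", (unsigned long long) gguf_get_val_u64(ctx, kid));
    }

    // tensors: one I32 tensor per sequence, named training.tensor.{index}
    const int64_t n_tensors = gguf_get_n_tensors(ctx);
    for (int64_t i = 0; i < n_tensors && i < 3; i++) {
        std::printf("tensor %lld: %s\n", (long long) i, gguf_get_tensor_name(ctx, i));
    }

    gguf_free(ctx);
    return 0;
}
```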
Building
--------

It is assumed that you have already set up the `llama.cpp` build environment (e.g., using CMake) and installed Arrow and Parquet on your system.

1. **Clone the `llama.cpp` repository**:

   ```sh
   git clone https://github.com/ggerganov/llama.cpp.git
   cd llama.cpp
   ```

2. **Create a build directory and navigate into it**:

   ```sh
   mkdir build
   cd build
   ```

3. **Configure and build the project using CMake**:

   ```sh
   cmake -DLLAMA_PARQUET=ON ..
   cmake --build . --config Release
   ```

The `convert-to-train-gguf` utility will be built in the `build/bin` directory.
Usage
-----

```sh
./bin/convert-to-train-gguf [options]
```

### Command-line Options
* `-h`, `--help`: Show this help message and exit.

* `-m <path>`, `--model <path>`: Path to the GGUF model used for the tokenizer (default: `models/7B/ggml-model-f16.gguf`).

* `--in-file <path>`: Path to the input dataset file, either a plain text file or a Parquet file (default: `input.txt`).

* `-o <path>`, `--output <path>`: Path to save the output GGUF file to (default: `output.gguf`).

* `--max-seq-len <length>`: Maximum sequence length in tokens (default: `2048`). Sequences exceeding this length are truncated.

* `--pre-tokenized`: Specifies that the input file contains pre-tokenized data (space-separated token IDs) rather than raw text.

* `--dataset-format <type>`: Type of input data, `text` or `parquet` (default: `text`).

* `--parquet-text-column <name>`: For the `parquet` input type, the column name containing raw text data (default: `text`).

* `--parquet-tokens-column <name>`: For the `parquet` input type, the column name containing pre-tokenized data (list of int32) (default: `tokens`).

* `--preview`: Enables previewing of the generated GGUF file (prints metadata and the first few sequences).

* `--preview-count <N>`: Number of sequences to preview (default: `1`). Requires `--preview`.

* `--detokenize-preview`: Detokenize previewed sequences back into text for better readability. Requires `--preview`.
### Usage Examples

1. **Converting a plain text file**:

   ```sh
   ./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file my_dataset.txt -o my_training_data.gguf --max-seq-len 1024
   ```

2. **Converting a pre-tokenized file** (assumes `pre_tokenized_data.txt` contains lines like `101 200 300 102 ...`; see the parsing sketch after these examples):

   ```sh
   ./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file pre_tokenized_data.txt -o pre_tokenized_training_data.gguf --pre-tokenized
   ```

3. **Converting a Parquet file with raw text**:

   ```sh
   ./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file my_parquet_dataset.parquet -o my_training_data.gguf --dataset-format parquet --parquet-text-column "document_text"
   ```

4. **Converting a Parquet file with pre-tokenized data**:

   ```sh
   ./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file my_tokenized_parquet.parquet -o my_training_data.gguf --dataset-format parquet --pre-tokenized --parquet-tokens-column "token_ids"
   ```

5. **Converting with a preview of 5 sequences and detokenization**:

   ```sh
   ./bin/convert-to-train-gguf -m models/7B/ggml-model-f16.gguf --in-file my_dataset.txt -o my_training_data.gguf --preview --preview-count 5 --detokenize-preview
   ```
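For reference, a minimal sketch of how a line of pre-tokenized input can be parsed into the `int32` token buffer that the GGUF tensors store (illustrative only; `parse_pre_tokenized_line` is a hypothetical helper, not the converter's actual reader):

```cpp
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// parse one line of space-separated token IDs, e.g. "101 200 300 102"
static std::vector<int32_t> parse_pre_tokenized_line(const std::string & line) {
    std::vector<int32_t> tokens;
    std::istringstream ss(line);
    int32_t id;
    while (ss >> id) {
        tokens.push_back(id);
    }
    return tokens;
}

int main() {
    const std::vector<int32_t> toks = parse_pre_tokenized_line("101 200 300 102");
    std::cout << "parsed " << toks.size() << " tokens\n"; // prints: parsed 4 tokens
    return 0;
}
```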
Future Improvements
-------------------

* **Improved error handling**: More detailed messages and handling of edge cases.

* **Additional validation**: Data integrity checks at various stages.

* **Dataset statistics**: Ability to output statistics on sequence lengths, token distribution, etc.
> **Reviewer** (on `--dataset-format`): Can't this be determined automatically?
>
> **Author:** I agree we could add `auto` as the default dataset format while retaining the parameter. The converter could automatically detect formats (via file extensions/headers) when set to `auto`, while still allowing explicit overrides. This balances convenience and control. For the `--dataset-format` example: an `AUTO` enum value instead of raw strings?
>
> **Reviewer:** I think an enum is better than a string. The way I would implement the automatic detection is to try loading GGUF and Parquet first. The order shouldn't matter, since the loading will fail if there is a mismatch. If both fail, load as plain text. I don't think fallback logic beyond that is needed.
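A minimal sketch of the detection order described above, using an enum rather than raw strings. Checking each format's magic bytes stands in for a full load attempt here, and the names are illustrative, not the PR's actual code (GGUF files begin with the bytes `GGUF`, Parquet files with `PAR1`):

```cpp
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>

enum class dataset_format { GGUF, PARQUET, TEXT };

// try GGUF and Parquet first; the order does not matter because a
// mismatched load fails on the magic bytes. If both fail, fall back
// to plain text, with no further fallback logic.
static dataset_format detect_dataset_format(const std::string & path) {
    char magic[4] = {0};
    std::ifstream in(path, std::ios::binary);
    in.read(magic, sizeof(magic));

    if (in.gcount() == 4 && std::memcmp(magic, "GGUF", 4) == 0) {
        return dataset_format::GGUF;
    }
    if (in.gcount() == 4 && std::memcmp(magic, "PAR1", 4) == 0) {
        return dataset_format::PARQUET;
    }
    return dataset_format::TEXT;
}

int main(int argc, char ** argv) {
    const std::string path = argc > 1 ? argv[1] : "input.txt";
    switch (detect_dataset_format(path)) {
        case dataset_format::GGUF:    std::cout << "gguf\n";    break;
        case dataset_format::PARQUET: std::cout << "parquet\n"; break;
        case dataset_format::TEXT:    std::cout << "text\n";    break;
    }
    return 0;
}
```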