Commit bff2db8
Author: lexasub

tool: add conversion of text/parquet to custom format

1 parent 11ee0fe, commit bff2db8

21 files changed, +2462 -17 lines

CMakeLists.txt

Lines changed: 8 additions & 0 deletions

@@ -12,6 +12,8 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 
+message("CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
+
 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
 
@@ -84,6 +86,12 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
 # 3rd party libs
 option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
 option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
+option(LLAMA_PARQUET "Enable Parquet dataset support via Arrow/Parquet C++" OFF)
+
+
+if(LLAMA_PARQUET)
+    add_definitions(-DLLAMA_PARQUET)
+endif()
 
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)

common/arg.cpp

Lines changed: 83 additions & 15 deletions

@@ -1470,14 +1470,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--chunks"}, "N",
         string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
         [](common_params & params, int value) {
             params.n_chunks = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
         {"-fa", "--flash-attn"},
         string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
@@ -1539,7 +1539,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
             params.in_files.push_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"-bf", "--binary-file"}, "FNAME",
         "binary file containing the prompt (default: none)",
@@ -2115,70 +2115,70 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.hellaswag = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"--hellaswag-tasks"}, "N",
         string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
         [](common_params & params, int value) {
             params.hellaswag_tasks = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"--winogrande"},
         "compute Winogrande score over random tasks from datafile supplied with -f",
         [](common_params & params) {
             params.winogrande = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"--winogrande-tasks"}, "N",
         string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
         [](common_params & params, int value) {
             params.winogrande_tasks = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"--multiple-choice"},
         "compute multiple choice score over random tasks from datafile supplied with -f",
         [](common_params & params) {
             params.multiple_choice = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"--multiple-choice-tasks"}, "N",
         string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
         [](common_params & params, int value) {
             params.multiple_choice_tasks = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"--kl-divergence"},
         "computes KL-divergence to logits provided via --kl-divergence-base",
         [](common_params & params) {
             params.kl_divergence = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"--save-all-logits", "--kl-divergence-base"}, "FNAME",
         "set logits file",
         [](common_params & params, const std::string & value) {
             params.logits_file = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"--ppl-stride"}, "N",
         string_format("stride for perplexity calculation (default: %d)", params.ppl_stride),
         [](common_params & params, int value) {
             params.ppl_stride = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"--ppl-output-type"}, "<0|1>",
         string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
         [](common_params & params, int value) {
             params.ppl_output_type = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"-dt", "--defrag-thold"}, "N",
         string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
@@ -2609,9 +2609,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-o", "--output", "--output-file"}, "FNAME",
         string_format("output file (default: '%s')", params.out_file.c_str()),
         [](common_params & params, const std::string & value) {
-            params.out_file = value;
+            params.out_file = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -3423,5 +3423,73 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
+    add_opt(common_arg(
+        {"--dataset-format"}, " ",
+        string_format("type of input data (e.g., 'text', 'parquet') (default: %s)", params.dataset_format.c_str()),
+        [](common_params & params, const std::string & format) {
+            params.dataset_format = format; //TODO ENUM CLASS
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+
+    add_opt(common_arg(
+        {"--max-seq-len"}, " ",
+        string_format("max sequence length (default: %d)", params.max_seq_len),
+        [](common_params & params, int32_t max_seq_len) {
+            params.max_seq_len = max_seq_len;
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+
+    add_opt(common_arg(
+        {"--pre-tokenized"},
+        string_format("input file contains pre-tokenized data (space-separated token IDs)"),
+        [](common_params & params) {
+            params.pre_tokenized = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+
+    add_opt(common_arg(
+        {"--preview"},
+        string_format("read and print metadata and first sequence from the output GGUF file (enables preview)"),
+        [](common_params & params) {
+            params.do_preview = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+
+    add_opt(common_arg(
+        {"--preview-count"}, "<N>",
+        string_format("input file contains pre-tokenized data (space-separated token IDs)"),
+        [](common_params & params, int preview_count) {
+            params.preview_count = preview_count;
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+
+    add_opt(common_arg(
+        {"--detokenize-preview"},
+        string_format("detokenize previewed sequences (implies --preview)"),
+        [](common_params & params) {
+            params.detokenize_preview = params.do_preview = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+
+#ifdef LLAMA_PARQUET
+
+
+    add_opt(common_arg(
+        {"--parquet-text-column"}, "<name>",
+        string_format("column name for raw text in Parquet files (default: 'text')"),
+        [](common_params & params, const std::string &parquet_text_column) {
+            params.parquet_text_column = parquet_text_column;
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+
+    add_opt(common_arg(
+        {"--parquet-tokens-column"}, "<name>",
+        string_format("column name for pre-tokenized data (list<int32>) in Parquet files (default: 'tokens')"),
+        [](common_params & params, const std::string &parquet_tokens_column) {
+            params.parquet_tokens_column = parquet_tokens_column;
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+
+#endif
     return ctx_arg;
 }
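
Note on the new --dataset-format handler above: it stores the raw string and carries a "//TODO ENUM CLASS" comment. As an illustrative sketch only (none of these names exist in the commit), the validated-enum variant that TODO hints at could look roughly like this:

// Illustrative sketch, not part of this commit: parse --dataset-format once,
// reject unknown values early, and pass a typed enum around instead of a string.
#include <stdexcept>
#include <string>

enum class dataset_format_type { TEXT, PARQUET };

static dataset_format_type dataset_format_from_string(const std::string & s) {
    if (s == "text")    { return dataset_format_type::TEXT; }
    if (s == "parquet") { return dataset_format_type::PARQUET; }
    throw std::invalid_argument("unknown --dataset-format: " + s);
}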

common/common.h

Lines changed: 14 additions & 2 deletions

@@ -4,12 +4,13 @@
 
 #include "llama-cpp.h"
 
+#include <cmath>
+#include <map>
 #include <set>
+#include <sstream>
 #include <string>
 #include <string_view>
 #include <vector>
-#include <map>
-#include <sstream>
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -81,6 +82,7 @@ enum llama_example {
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
+    LLAMA_EXAMPLE_FINETUNE,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -282,6 +284,7 @@ struct common_params {
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file = ""; // file for saving *all* logits // NOLINT
+    std::string dataset_format = "text"; // "text" | "parquet"
 
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
@@ -443,6 +446,15 @@ struct common_params {
     // return false from callback to abort model loading or true to continue
     llama_progress_callback load_progress_callback = NULL;
     void * load_progress_callback_user_data = NULL;
+    int32_t max_seq_len = 2048;
+    bool do_preview = false;
+    bool pre_tokenized = false;
+    bool detokenize_preview = false;
+    int preview_count = 1;
+#ifdef LLAMA_PARQUET
+    std::string parquet_text_column = "text";
+    std::string parquet_tokens_column = "tokens";
+#endif
 };
 
 // call once at the start of a program if it uses libcommon
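
For orientation, a hedged sketch of how a converter tool might consume the new common_params fields above; the actual reader and converter classes live in tools/dataset-converter and are not shown in this diff, so everything below other than the field names is illustrative:

// Hypothetical sketch (not code from this commit): dispatching on the new
// common_params fields. Only the field names are taken from the diff above.
#include "common.h"

#include <cstdio>

static void convert_dataset_sketch(const common_params & params) {
    // --dataset-format selects the input reader ("text" or "parquet")
    const bool use_parquet = params.dataset_format == "parquet";

    for (const auto & path : params.in_files) {
        // --pre-tokenized: the input already holds space-separated token IDs;
        // otherwise text is tokenized and truncated to --max-seq-len tokens
        printf("reading %s as %s (%s, max_seq_len = %d)\n",
               path.c_str(),
               use_parquet ? "parquet" : "text",
               params.pre_tokenized ? "pre-tokenized" : "raw text",
               params.max_seq_len);
    }

    if (params.do_preview) {
        // --preview / --preview-count / --detokenize-preview: re-open the output
        // GGUF file and print the first N sequences, optionally detokenized
        printf("previewing %d sequence(s) from %s\n",
               params.preview_count, params.out_file.c_str());
    }
}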

tools/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -36,4 +36,5 @@ else()
         add_subdirectory(cvector-generator)
         add_subdirectory(export-lora)
     endif()
+    add_subdirectory(dataset-converter)
 endif()

tools/dataset-converter/CMakeLists.txt

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+include_directories(.)
+
+if(LLAMA_PARQUET)
+    find_package(Arrow REQUIRED)
+    find_package(Parquet REQUIRED)
+endif()
+
+add_library(dataset-to-gguf-lib
+    dataset-to-gguf/llama-gguf-writer.cpp
+    dataset-to-gguf/llama-gguf-file.cpp
+    dataset-to-gguf/llama-dataset-reader/llama-text-data-reader.cpp
+    dataset-to-gguf/llama-gguf-converter.cpp
+    dataset-to-gguf/llama-gguf-reader.cpp
+    dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.cpp
+)
+
+# Link libraries for dataset-to-gguf-lib
+target_link_libraries(dataset-to-gguf-lib common llama ${CMAKE_THREAD_LIBS_INIT})
+if(LLAMA_PARQUET)
+    target_link_libraries(dataset-to-gguf-lib Arrow::arrow_shared Parquet::parquet_shared)
+endif()
+target_compile_features(dataset-to-gguf-lib PRIVATE cxx_std_11)
+
+
+add_executable(convert-to-train-gguf convert-to-train-gguf.cpp)
+target_link_libraries(convert-to-train-gguf PRIVATE dataset-to-gguf-lib) # Link to the new library
+target_compile_features(convert-to-train-gguf PRIVATE cxx_std_11) # Apply C++ standard to the executable
+
+# Define the executable for the unit tests
+set(TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS dataset-to-gguf-unit-tests)
+add_executable(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} dataset-to-gguf/tests/dataset-to-gguf-tests.cpp)
+
+# Link necessary libraries for the test executable
+target_link_libraries(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} PRIVATE common llama dataset-to-gguf-lib)
+
+# Ensure C++17 for filesystem usage for the test executable
+target_compile_features(${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} PRIVATE cxx_std_17)
+
+add_test(
+    NAME ${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS} #
+    COMMAND $<TARGET_FILE:${TEST_TARGET_GGUF_DATASET_CONVERTER_TESTS}>
+    LABEL "training"
+)
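
The library built above (llama-gguf-writer, llama-gguf-file, llama-gguf-reader) presumably wraps ggml's public GGUF API on the output side. As a hedged, self-contained sketch of that underlying API, independent of this commit, writing one pre-tokenized sequence to a GGUF file looks roughly like this; the "training.*" key names are made up and are not the converter's actual schema:

// Hedged sketch: metadata plus one int32 token array written with ggml's
// GGUF API (gguf.h). Key names are illustrative, not the converter's schema.
#include "gguf.h"

#include <cstdint>
#include <vector>

int main() {
    gguf_context * ctx = gguf_init_empty();

    gguf_set_val_str(ctx, "training.dataset.name", "example");
    gguf_set_val_u32(ctx, "training.sequence.count", 1);

    // one sequence of token IDs stored as an int32 array
    std::vector<int32_t> tokens = { 1, 15043, 3186, 2 };
    gguf_set_arr_data(ctx, "training.sequence.0", GGUF_TYPE_INT32, tokens.data(), tokens.size());

    gguf_write_to_file(ctx, "dataset.gguf", /*only_meta =*/ false);
    gguf_free(ctx);
    return 0;
}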
