Skip to content

Commit aab45e2

Browse files
author
lexasub
committed
small fixes
1 parent 1004205 commit aab45e2

File tree

10 files changed

+78
-132
lines changed

10 files changed

+78
-132
lines changed

common/arg.cpp

Lines changed: 5 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1470,7 +1470,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
14701470
[](common_params & params) {
14711471
params.ctx_shift = false;
14721472
}
1473-
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
1473+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
14741474
add_opt(common_arg(
14751475
{"--chunks"}, "N",
14761476
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -3455,41 +3455,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34553455
}
34563456
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
34573457

3458-
add_opt(common_arg(
3459-
{"--preview-count"}, "<N>",
3460-
string_format("input file contains pre-tokenized data (space-separated token IDs)"),
3461-
[](common_params & params, int preview_count) {
3462-
params.preview_count = preview_count;
3463-
}
3464-
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
3465-
3466-
add_opt(common_arg(
3467-
{"--detokenize-preview"},
3468-
string_format("detokenize previewed sequences (implies --preview)"),
3469-
[](common_params & params) {
3470-
params.detokenize_preview = params.do_preview = true;
3471-
}
3472-
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
3473-
3474-
#ifdef LLAMA_PARQUET
3475-
3476-
3477-
add_opt(common_arg(
3478-
{"--parquet-text-column"}, "<name>",
3479-
string_format("column name for raw text in Parquet files (default: 'text')"),
3480-
[](common_params & params, const std::string &parquet_text_column) {
3481-
params.parquet_text_column = parquet_text_column;
3482-
}
3483-
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
3484-
34853458
add_opt(common_arg(
3486-
{"--parquet-tokens-column"}, "<name>",
3487-
string_format("column name for pre-tokenized data (list<int32>) in Parquet files (default: 'tokens')"),
3488-
[](common_params & params, const std::string &parquet_tokens_column) {
3489-
params.parquet_tokens_column = parquet_tokens_column;
3459+
{"--dataset-column"}, "<name>",
3460+
string_format("column name for data in dataset files"),
3461+
[](common_params & params, const std::string &dataset_column) {
3462+
params.dataset_column = dataset_column;
34903463
}
34913464
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
34923465

3493-
#endif
34943466
return ctx_arg;
34953467
}

common/common.h

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -449,12 +449,7 @@ struct common_params {
449449
int32_t max_seq_len = 2048;
450450
bool do_preview = false;
451451
bool pre_tokenized = false;
452-
bool detokenize_preview = false;
453-
int preview_count = 1;
454-
#ifdef LLAMA_PARQUET
455-
std::string parquet_text_column = "text";
456-
std::string parquet_tokens_column = "tokens";
457-
#endif
452+
std::string dataset_column = "data";
458453
};
459454

460455
// call once at the start of a program if it uses libcommon

tools/dataset-converter/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
include_directories(.)
1+
include_directories(. ../../common)
22

33
if(LLAMA_PARQUET)
44
find_package(Arrow REQUIRED)

tools/dataset-converter/convert-to-train-gguf.cpp

Lines changed: 56 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
// This two-pass approach allows processing datasets significantly larger than
1414
// available RAM.
1515

16+
#include "log.h"
1617
#include <algorithm> // For std::min
1718
#include <array> // For std::array
1819
#include <cinttypes> // For PRIu64
@@ -25,36 +26,28 @@
2526
#include "dataset-to-gguf/llama-gguf-converter.h"
2627
#include "dataset-to-gguf/llama-gguf-reader.h"
2728
#include "llama.h" // For llama_backend_init, llama_backend_free, llama_model_load_from_file, llama_model_free
28-
29+
#define PREVIEW_COUNT 1
2930
int main(int argc, char ** argv) {
3031
common_params params;
3132
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE)) {
3233
return 1;
3334
}
3435

3536
// Print parameters for verification
36-
printf("Parameters:\n");
37-
printf(" Model for tokenizer: %s\n", params.model.path.c_str());
38-
printf(" Input files: ");
37+
LOG_INF("Parameters:\n");
38+
LOG_INF(" Model for tokenizer: %s\n", params.model.path.c_str());
39+
LOG_INF(" Input files: ");
3940
for (auto & i : params.in_files) {
40-
printf("%s ", i.c_str());
41-
}
42-
printf("\n Output file: %s\n", params.out_file.c_str());
43-
printf(" Max sequence length: %d\n", params.max_seq_len);
44-
printf(" Pre-tokenized input: %s\n", params.pre_tokenized ? "Yes" : "No");
45-
printf(" Input type: %s\n", params.dataset_format.c_str());
46-
printf(" Do preview: %s\n", params.do_preview ? "Yes" : "No");
47-
if (params.do_preview) {
48-
printf(" Preview count: %d\n", params.preview_count);
49-
printf(" Detokenize preview: %s\n", params.detokenize_preview ? "Yes" : "No");
41+
LOG_INF("%s ", i.c_str());
5042
}
51-
#ifdef LLAMA_PARQUET
52-
if (params.dataset_format == "parquet") {
53-
printf(" Parquet text column: %s\n", params.parquet_text_column.c_str());
54-
printf(" Parquet tokens column: %s\n", params.parquet_tokens_column.c_str());
43+
LOG_INF("\n Output file: %s\n", params.out_file.c_str());
44+
LOG_INF(" Max sequence length: %d\n", params.max_seq_len);
45+
LOG_INF(" Input type: %s\n", params.dataset_format.c_str());
46+
LOG_INF(" Do preview: %s\n", params.do_preview ? "Yes" : "No");
47+
if (params.dataset_format != "text") {
48+
LOG_INF(" Dataset column: %s\n", params.dataset_column.c_str());
5549
}
56-
#endif
57-
printf("\n");
50+
LOG_INF("\n");
5851

5952
// Initialize llama.cpp
6053
llama_backend_init();
@@ -64,125 +57,122 @@ int main(int argc, char ** argv) {
6457
llama_model *model = llama_model_load_from_file(params.model.path.c_str(), model_params);
6558

6659
if (model == nullptr) {
67-
fprintf(stderr, "error: failed to load model from %s\n", params.model.path.c_str());
60+
LOG_ERR("error: failed to load model from %s\n", params.model.path.c_str());
6861
llama_backend_free();
6962
return 1;
7063
}
7164

7265
// --- Diagnostic Test: Reading tokenizer model GGUF file ---
73-
printf("--- Diagnostic Test: Reading tokenizer model GGUF file ---\n");
66+
LOG_INF("--- Diagnostic Test: Reading tokenizer model GGUF file ---\n");
7467
try {
7568
llama_gguf_reader tokenizer_model_reader(params.model.path);
7669
if (tokenizer_model_reader.llama_gguf_reader_is_initialized()) {
77-
printf(" Tokenizer Model GGUF file opened successfully.\n");
78-
printf(" Tokenizer Model Name: %s\n",
70+
LOG_INF(" Tokenizer Model GGUF file opened successfully.\n");
71+
LOG_INF(" Tokenizer Model Name: %s\n",
7972
tokenizer_model_reader.llama_gguf_reader_get_metadata_str("general.name", "N/A").c_str());
80-
printf(" Tokenizer Model Architecture: %s\n",
73+
LOG_INF(" Tokenizer Model Architecture: %s\n",
8174
tokenizer_model_reader.llama_gguf_reader_get_metadata_str("general.architecture", "N/A").c_str());
82-
printf(" Tokenizer Model Tensor Count: %llu\n",
75+
LOG_INF(" Tokenizer Model Tensor Count: %llu\n",
8376
static_cast<long long>(tokenizer_model_reader.llama_gguf_reader_get_tensor_count()));
84-
printf(" Diagnostic Test: Tokenizer Model GGUF read successful.\n");
77+
LOG_INF(" Diagnostic Test: Tokenizer Model GGUF read successful.\n");
8578
} else {
86-
fprintf(stderr, "error: Diagnostic Test: Tokenizer Model GGUF read failed to initialize.\n");
79+
LOG_ERR("error: Diagnostic Test: Tokenizer Model GGUF read failed to initialize.\n");
8780
llama_model_free(model); // Free model before exiting
8881
llama_backend_free();
8982
return 1;
9083
}
9184
} catch (const std::runtime_error & e) {
92-
fprintf(stderr, "error: Diagnostic Test: Tokenizer Model GGUF read failed: %s\n", e.what());
85+
LOG_ERR("error: Diagnostic Test: Tokenizer Model GGUF read failed: %s\n", e.what());
9386
llama_model_free(model); // Free model before exiting
9487
llama_backend_free();
9588
return 1;
9689
}
97-
printf("--- End of Diagnostic Test ---\n\n");
90+
LOG_INF("--- End of Diagnostic Test ---\n\n");
9891

9992
// Create and run the converter
10093
llama_gguf_converter converter;
10194
bool success = converter.llama_gguf_converter_convert(params, model);
10295

10396
if (!success) {
104-
fprintf(stderr, "error: GGUF conversion failed.\n");
97+
LOG_ERR("error: GGUF conversion failed.\n");
10598
llama_model_free(model); // Free model on conversion failure
10699
llama_backend_free();
107100
return 1;
108101
}
109102

110-
printf("Conversion successful!\n");
111-
printf("Output file: %s\n", params.out_file.c_str());
103+
LOG_INF("Conversion successful!\n");
104+
LOG_INF("Output file: %s\n", params.out_file.c_str());
112105

113106
// --- Preview generated GGUF file (if requested) ---
114107
if (params.do_preview) {
115-
printf("\n--- Previewing generated GGUF file ---\n");
108+
LOG_INF("\n--- Previewing generated GGUF file ---\n");
116109
try {
117110
llama_gguf_reader reader(params.out_file);
118111

119112
if (!reader.llama_gguf_reader_is_initialized()) {
120-
fprintf(stderr, "error: llama_gguf_reader failed to initialize for preview.\n");
113+
LOG_ERR("error: llama_gguf_reader failed to initialize for preview.\n");
121114
llama_model_free(model); // Free model before exiting
122115
llama_backend_free();
123116
return 1;
124117
}
125118

126-
printf(" Dataset Name: %s\n",
119+
LOG_INF(" Dataset Name: %s\n",
127120
reader.llama_gguf_reader_get_metadata_str("training.dataset.name", "N/A").c_str());
128-
printf(" Sequence Count: %llu\n", static_cast<long long>(reader.llama_gguf_reader_get_metadata_u64("training.sequence.count", 0)));
129-
printf(" Tokenizer Model: %s\n",
121+
LOG_INF(" Sequence Count: %llu\n", static_cast<long long>(reader.llama_gguf_reader_get_metadata_u64("training.sequence.count", 0)));
122+
LOG_INF(" Tokenizer Model: %s\n",
130123
reader.llama_gguf_reader_get_metadata_str("training.tokenizer.gguf.model", "N/A").c_str());
131124

132125
int64_t tensor_count = reader.llama_gguf_reader_get_tensor_count();
133126
if (tensor_count > 0) {
134127
// Print the first N sequences
135-
for (int64_t i = 0; i < std::min((int64_t) params.preview_count, tensor_count); ++i) {
136-
printf(" Sequence (training.tensor.%" PRId64 "):\n", i);
128+
for (int64_t i = 0; i < std::min(static_cast<int64_t>(PREVIEW_COUNT), tensor_count); ++i) {
129+
LOG_INF(" Sequence (training.tensor.%" PRId64 "):\n", i);
137130
std::vector<llama_token> sequence_tokens;
138131
if (reader.llama_gguf_reader_read_tensor_data(i, sequence_tokens)) {
139-
printf(" Length: %zu tokens\n", sequence_tokens.size());
140-
printf(" Tokens: [");
132+
LOG_INF(" Length: %zu tokens\n", sequence_tokens.size());
133+
LOG_INF(" Tokens: [");
141134
for (size_t j = 0; j < std::min((size_t) 10, sequence_tokens.size());
142135
++j) { // Print up to 10 tokens
143-
printf("%d%s", sequence_tokens[j],
136+
LOG_INF("%d%s", sequence_tokens[j],
144137
(j == std::min((size_t) 10, sequence_tokens.size()) - 1) ? "" : ", ");
145138
}
146139
if (sequence_tokens.size() > 10) {
147-
printf("...");
140+
LOG_INF("...");
148141
}
149-
printf("]\n");
150-
151-
if (params.detokenize_preview) {
152-
// Detokenization
153-
std::string detokenized_text = "";
154-
// Buffer for a single token
155-
std::array<char, 256> piece_buf; // Large enough buffer for a single token
156-
// Ensure model is valid before calling llama_model_get_vocab
157-
if (model != nullptr) {
158-
for (llama_token token : sequence_tokens) {
159-
int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
160-
piece_buf.data(), piece_buf.size(), 1, false);
161-
if (n_chars > 0) {
162-
detokenized_text.append(piece_buf.data(), n_chars);
163-
}
142+
LOG_INF("]\n");
143+
// Detokenization
144+
std::string detokenized_text = "";
145+
// Buffer for a single token
146+
std::array<char, 256> piece_buf; // Large enough buffer for a single token
147+
// Ensure model is valid before calling llama_model_get_vocab
148+
if (model != nullptr) {
149+
for (llama_token token : sequence_tokens) {
150+
int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
151+
piece_buf.data(), piece_buf.size(), 1, false);
152+
if (n_chars > 0) {
153+
detokenized_text.append(piece_buf.data(), n_chars);
164154
}
165-
printf(" Detokenized: \"%s\"\n", detokenized_text.c_str());
166-
} else {
167-
fprintf(stderr, " Warning: Cannot detokenize preview, model is null.\n");
168155
}
156+
LOG_INF(" Detokenized: \"%s\"\n", detokenized_text.c_str());
157+
} else {
158+
LOG_ERR(" Warning: Cannot detokenize preview, model is null.\n");
169159
}
170160

171161
} else {
172-
fprintf(stderr, " Error: Could not read data for sequence %" PRId64 ".\n", i);
162+
LOG_ERR(" Error: Could not read data for sequence %" PRId64 ".\n", i);
173163
}
174164
}
175165
} else {
176-
printf(" No sequences found in the GGUF file.\n");
166+
LOG_INF(" No sequences found in the GGUF file.\n");
177167
}
178168

179169
} catch (const std::runtime_error & e) {
180-
fprintf(stderr, "error: GGUF preview failed: %s\n", e.what());
170+
LOG_ERR("error: GGUF preview failed: %s\n", e.what());
181171
llama_model_free(model); // Free model before exiting
182172
llama_backend_free();
183173
return 1;
184174
}
185-
printf("--- End of GGUF file preview ---\n");
175+
LOG_INF("--- End of GGUF file preview ---\n");
186176
}
187177

188178
// Clean up llama model and backend after all usage

tools/dataset-converter/dataset-to-gguf/llama-dataset-reader/llama-parquet-data-reader.cpp

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,18 @@
66

77
// Constructor
88
llama_parquet_dataset_reader::llama_parquet_dataset_reader(const struct llama_model * model, int32_t max_seq_len,
9-
bool pre_tokenized, const std::string & text_column_name,
10-
const std::string & tokens_column_name) :
9+
bool pre_tokenized, const std::string & dataset_column_name) :
1110
model_(model),
1211
max_seq_len_(max_seq_len),
1312
pre_tokenized_(pre_tokenized),
1413
current_row_group_index_(0), // Initialize row group index
1514
current_row_in_table_(0),
1615
current_column_index_(-1), // Initialize to -1, will be set in open
17-
text_column_name_(text_column_name),
18-
tokens_column_name_(tokens_column_name) {}
16+
dataset_column_name_(dataset_column_name) {}
1917

2018
// Destructor
2119
llama_parquet_dataset_reader::~llama_parquet_dataset_reader() {
22-
close();
23-
m_file_path.clear(); // Clear the stored path only on destruction
20+
llama_parquet_dataset_reader::close();
2421
}
2522

2623
// Opens the Parquet file for reading.
@@ -63,41 +60,41 @@ bool llama_parquet_dataset_reader::open(const std::string & path) {
6360

6461
// Determine the column index based on the pre_tokenized_ flag
6562
if (pre_tokenized_) {
66-
current_column_index_ = schema->GetFieldIndex(tokens_column_name_); // Use configurable name
63+
current_column_index_ = schema->GetFieldIndex(dataset_column_name_); // Use configurable name
6764
if (current_column_index_ == -1) {
6865
std::cerr << "Error (llama_parquet_dataset_reader::open): Pre-tokenized mode selected, but column '"
69-
<< tokens_column_name_ << "' not found in Parquet schema." << std::endl;
66+
<< dataset_column_name_ << "' not found in Parquet schema." << std::endl;
7067
close();
7168
return false;
7269
}
7370
// Validate column type: should be List<Int32>
7471
if (schema->field(current_column_index_)->type()->id() != arrow::Type::LIST) {
75-
std::cerr << "Error (llama_parquet_dataset_reader::open): Column '" << tokens_column_name_
72+
std::cerr << "Error (llama_parquet_dataset_reader::open): Column '" << dataset_column_name_
7673
<< "' is not of LIST type as expected for pre-tokenized data. Actual type: "
7774
<< schema->field(current_column_index_)->type()->ToString() << std::endl;
7875
close();
7976
return false;
8077
}
8178
auto list_type = std::static_pointer_cast<arrow::ListType>(schema->field(current_column_index_)->type());
8279
if (list_type->value_type()->id() != arrow::Type::INT32) {
83-
std::cerr << "Error (llama_parquet_dataset_reader::open): List items in column '" << tokens_column_name_
80+
std::cerr << "Error (llama_parquet_dataset_reader::open): List items in column '" << dataset_column_name_
8481
<< "' are not of INT32 type as expected. Actual value type: "
8582
<< list_type->value_type()->ToString() << std::endl;
8683
close();
8784
return false;
8885
}
8986

9087
} else {
91-
current_column_index_ = schema->GetFieldIndex(text_column_name_); // Use configurable name
88+
current_column_index_ = schema->GetFieldIndex(dataset_column_name_); // Use configurable name
9289
if (current_column_index_ == -1) {
9390
std::cerr << "Error (llama_parquet_dataset_reader::open): Raw text mode selected, but column '"
94-
<< text_column_name_ << "' not found in Parquet schema." << std::endl;
91+
<< dataset_column_name_ << "' not found in Parquet schema." << std::endl;
9592
close();
9693
return false;
9794
}
9895
// Validate column type: should be String
9996
if (schema->field(current_column_index_)->type()->id() != arrow::Type::STRING) {
100-
std::cerr << "Error (llama_parquet_dataset_reader::open): Column '" << text_column_name_
97+
std::cerr << "Error (llama_parquet_dataset_reader::open): Column '" << dataset_column_name_
10198
<< "' is not of STRING type as expected for raw text. Actual type: "
10299
<< schema->field(current_column_index_)->type()->ToString() << std::endl;
103100
close();

0 commit comments

Comments
 (0)