 // This two-pass approach allows processing datasets significantly larger than
 // available RAM.

+#include "log.h"
 #include <algorithm>   // For std::min
 #include <array>       // For std::array
 #include <cinttypes>   // For PRIu64
 ...
 #include "dataset-to-gguf/llama-gguf-converter.h"
 #include "dataset-to-gguf/llama-gguf-reader.h"
 #include "llama.h"     // For llama_backend_init, llama_backend_free, llama_model_load_from_file, llama_model_free
-
+#define PREVIEW_COUNT 1
 int main(int argc, char ** argv) {
     common_params params;
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE)) {
         return 1;
     }

     // Print parameters for verification
-    printf("Parameters:\n");
-    printf("Model for tokenizer: %s\n", params.model.path.c_str());
-    printf("Input files: ");
+    LOG_INF("Parameters:\n");
+    LOG_INF("Model for tokenizer: %s\n", params.model.path.c_str());
+    LOG_INF("Input files: ");
     for (auto & i : params.in_files) {
-        printf("%s ", i.c_str());
-    }
-    printf("\nOutput file: %s\n", params.out_file.c_str());
-    printf("Max sequence length: %d\n", params.max_seq_len);
-    printf("Pre-tokenized input: %s\n", params.pre_tokenized ? "Yes" : "No");
-    printf("Input type: %s\n", params.dataset_format.c_str());
-    printf("Do preview: %s\n", params.do_preview ? "Yes" : "No");
-    if (params.do_preview) {
-        printf("Preview count: %d\n", params.preview_count);
-        printf("Detokenize preview: %s\n", params.detokenize_preview ? "Yes" : "No");
+        LOG_INF("%s ", i.c_str());
     }
-#ifdef LLAMA_PARQUET
-    if (params.dataset_format == "parquet") {
-        printf("Parquet text column: %s\n", params.parquet_text_column.c_str());
-        printf("Parquet tokens column: %s\n", params.parquet_tokens_column.c_str());
+    LOG_INF("\nOutput file: %s\n", params.out_file.c_str());
+    LOG_INF("Max sequence length: %d\n", params.max_seq_len);
+    LOG_INF("Input type: %s\n", params.dataset_format.c_str());
+    LOG_INF("Do preview: %s\n", params.do_preview ? "Yes" : "No");
+    if (params.dataset_format != "text") {
+        LOG_INF("Dataset column: %s\n", params.dataset_column.c_str());
     }
-#endif
-    printf("\n");
+    LOG_INF("\n");

     // Initialize llama.cpp
     llama_backend_init();
@@ -64,125 +57,122 @@ int main(int argc, char ** argv) {
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);

     if (model == nullptr) {
-        fprintf(stderr, "error: failed to load model from %s\n", params.model.path.c_str());
+        LOG_ERR("error: failed to load model from %s\n", params.model.path.c_str());
         llama_backend_free();
         return 1;
     }

     // --- Diagnostic Test: Reading tokenizer model GGUF file ---
-    printf("--- Diagnostic Test: Reading tokenizer model GGUF file ---\n");
+    LOG_INF("--- Diagnostic Test: Reading tokenizer model GGUF file ---\n");
     try {
         llama_gguf_reader tokenizer_model_reader(params.model.path);
         if (tokenizer_model_reader.llama_gguf_reader_is_initialized()) {
-            printf("Tokenizer Model GGUF file opened successfully.\n");
-            printf("Tokenizer Model Name: %s\n",
+            LOG_INF("Tokenizer Model GGUF file opened successfully.\n");
+            LOG_INF("Tokenizer Model Name: %s\n",
                     tokenizer_model_reader.llama_gguf_reader_get_metadata_str("general.name", "N/A").c_str());
-            printf("Tokenizer Model Architecture: %s\n",
+            LOG_INF("Tokenizer Model Architecture: %s\n",
                     tokenizer_model_reader.llama_gguf_reader_get_metadata_str("general.architecture", "N/A").c_str());
-            printf("Tokenizer Model Tensor Count: %llu\n",
+            LOG_INF("Tokenizer Model Tensor Count: %llu\n",
                     static_cast<long long>(tokenizer_model_reader.llama_gguf_reader_get_tensor_count()));
-            printf("Diagnostic Test: Tokenizer Model GGUF read successful.\n");
+            LOG_INF("Diagnostic Test: Tokenizer Model GGUF read successful.\n");
         } else {
-            fprintf(stderr, "error: Diagnostic Test: Tokenizer Model GGUF read failed to initialize.\n");
+            LOG_ERR("error: Diagnostic Test: Tokenizer Model GGUF read failed to initialize.\n");
             llama_model_free(model); // Free model before exiting
             llama_backend_free();
             return 1;
         }
     } catch (const std::runtime_error & e) {
-        fprintf(stderr, "error: Diagnostic Test: Tokenizer Model GGUF read failed: %s\n", e.what());
+        LOG_ERR("error: Diagnostic Test: Tokenizer Model GGUF read failed: %s\n", e.what());
         llama_model_free(model); // Free model before exiting
         llama_backend_free();
         return 1;
     }
-    printf("--- End of Diagnostic Test ---\n\n");
+    LOG_INF("--- End of Diagnostic Test ---\n\n");

     // Create and run the converter
     llama_gguf_converter converter;
     bool success = converter.llama_gguf_converter_convert(params, model);

     if (!success) {
-        fprintf(stderr, "error: GGUF conversion failed.\n");
+        LOG_ERR("error: GGUF conversion failed.\n");
         llama_model_free(model); // Free model on conversion failure
         llama_backend_free();
         return 1;
     }

-    printf("Conversion successful!\n");
-    printf("Output file: %s\n", params.out_file.c_str());
+    LOG_INF("Conversion successful!\n");
+    LOG_INF("Output file: %s\n", params.out_file.c_str());

     // --- Preview generated GGUF file (if requested) ---
     if (params.do_preview) {
-        printf("\n--- Previewing generated GGUF file ---\n");
+        LOG_INF("\n--- Previewing generated GGUF file ---\n");
         try {
             llama_gguf_reader reader(params.out_file);

             if (!reader.llama_gguf_reader_is_initialized()) {
-                fprintf(stderr, "error: llama_gguf_reader failed to initialize for preview.\n");
+                LOG_ERR("error: llama_gguf_reader failed to initialize for preview.\n");
                 llama_model_free(model); // Free model before exiting
                 llama_backend_free();
                 return 1;
             }

-            printf("Dataset Name: %s\n",
+            LOG_INF("Dataset Name: %s\n",
                     reader.llama_gguf_reader_get_metadata_str("training.dataset.name", "N/A").c_str());
-            printf("Sequence Count: %llu\n", static_cast<long long>(reader.llama_gguf_reader_get_metadata_u64("training.sequence.count", 0)));
-            printf("Tokenizer Model: %s\n",
+            LOG_INF("Sequence Count: %llu\n", static_cast<long long>(reader.llama_gguf_reader_get_metadata_u64("training.sequence.count", 0)));
+            LOG_INF("Tokenizer Model: %s\n",
                     reader.llama_gguf_reader_get_metadata_str("training.tokenizer.gguf.model", "N/A").c_str());

             int64_t tensor_count = reader.llama_gguf_reader_get_tensor_count();
             if (tensor_count > 0) {
                 // Print N first sequences
-                for (int64_t i = 0; i < std::min((int64_t) params.preview_count, tensor_count); ++i) {
-                    printf("Sequence (training.tensor.%" PRId64 "):\n", i);
+                for (int64_t i = 0; i < std::min(static_cast<int64_t>(PREVIEW_COUNT), tensor_count); ++i) {
+                    LOG_INF("Sequence (training.tensor.%" PRId64 "):\n", i);
                     std::vector<llama_token> sequence_tokens;
                     if (reader.llama_gguf_reader_read_tensor_data(i, sequence_tokens)) {
-                        printf("Length: %zu tokens\n", sequence_tokens.size());
-                        printf("Tokens: [");
+                        LOG_INF("Length: %zu tokens\n", sequence_tokens.size());
+                        LOG_INF("Tokens: [");
                         for (size_t j = 0; j < std::min((size_t) 10, sequence_tokens.size());
                              ++j) { // Print up to 10 tokens
-                            printf("%d%s", sequence_tokens[j],
+                            LOG_INF("%d%s", sequence_tokens[j],
                                     (j == std::min((size_t) 10, sequence_tokens.size()) - 1) ? "" : ", ");
                         }
                         if (sequence_tokens.size() > 10) {
-                            printf("...");
+                            LOG_INF("...");
                         }
-                        printf("]\n");
-
-                        if (params.detokenize_preview) {
-                            // Detokenization
-                            std::string detokenized_text = "";
-                            // Buffer for a single token
-                            std::array<char, 256> piece_buf; // Large enough buffer for a single token
-                            // Ensure model is valid before calling llama_model_get_vocab
-                            if (model != nullptr) {
-                                for (llama_token token : sequence_tokens) {
-                                    int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
-                                                                       piece_buf.data(), piece_buf.size(), 1, false);
-                                    if (n_chars > 0) {
-                                        detokenized_text.append(piece_buf.data(), n_chars);
-                                    }
+                        LOG_INF("]\n");
+                        // Detokenization
+                        std::string detokenized_text = "";
+                        // Buffer for a single token
+                        std::array<char, 256> piece_buf; // Large enough buffer for a single token
+                        // Ensure model is valid before calling llama_model_get_vocab
+                        if (model != nullptr) {
+                            for (llama_token token : sequence_tokens) {
+                                int n_chars = llama_token_to_piece(llama_model_get_vocab(model), token,
+                                                                   piece_buf.data(), piece_buf.size(), 1, false);
+                                if (n_chars > 0) {
+                                    detokenized_text.append(piece_buf.data(), n_chars);
                                 }
-                                printf("Detokenized: \"%s\"\n", detokenized_text.c_str());
-                            } else {
-                                fprintf(stderr, "Warning: Cannot detokenize preview, model is null.\n");
                             }
+                            LOG_INF("Detokenized: \"%s\"\n", detokenized_text.c_str());
+                        } else {
+                            LOG_ERR("Warning: Cannot detokenize preview, model is null.\n");
                         }

                     } else {
-                        fprintf(stderr, "Error: Could not read data for sequence %" PRId64 ".\n", i);
+                        LOG_ERR("Error: Could not read data for sequence %" PRId64 ".\n", i);
                     }
                 }
             } else {
-                printf("No sequences found in the GGUF file.\n");
+                LOG_INF("No sequences found in the GGUF file.\n");
             }

         } catch (const std::runtime_error & e) {
-            fprintf(stderr, "error: GGUF preview failed: %s\n", e.what());
+            LOG_ERR("error: GGUF preview failed: %s\n", e.what());
             llama_model_free(model); // Free model before exiting
             llama_backend_free();
             return 1;
         }
-        printf("--- End of GGUF file preview ---\n");
+        LOG_INF("--- End of GGUF file preview ---\n");
     }

     // Clean up llama model and backend after all usage
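For reference, the preview path above reduces to a small amount of llama_gguf_reader usage. Below is a minimal standalone sketch of reading back a converted dataset with that interface. The reader method names and metadata keys are taken from this diff; the exact signatures in the project headers may differ, and preview_dataset is a hypothetical helper written for illustration, not part of the patch.

// Minimal sketch: preview a converted GGUF dataset with llama_gguf_reader
// (assumes the interface behaves as shown in the diff above).
#include <cinttypes>  // PRIu64 / PRId64
#include <cstdio>
#include <string>
#include <vector>

#include "dataset-to-gguf/llama-gguf-reader.h"
#include "llama.h"  // llama_token

// Hypothetical helper: open a converted dataset and report its contents.
static int preview_dataset(const std::string & path) {
    llama_gguf_reader reader(path);
    if (!reader.llama_gguf_reader_is_initialized()) {
        return 1;  // file missing or not a valid GGUF dataset
    }

    // Dataset-level metadata written by the converter
    printf("dataset:   %s\n", reader.llama_gguf_reader_get_metadata_str("training.dataset.name", "N/A").c_str());
    printf("sequences: %" PRIu64 "\n", reader.llama_gguf_reader_get_metadata_u64("training.sequence.count", 0));

    // Each tensor stores one tokenized training sequence
    const int64_t n_tensors = reader.llama_gguf_reader_get_tensor_count();
    for (int64_t i = 0; i < n_tensors; ++i) {
        std::vector<llama_token> tokens;
        if (reader.llama_gguf_reader_read_tensor_data(i, tokens)) {
            printf("sequence %" PRId64 ": %zu tokens\n", i, tokens.size());
        }
    }
    return 0;
}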