@@ -16,15 +16,22 @@ struct mtmd_context {
16
16
struct clip_ctx * ctx_clip;
17
17
const struct llama_model * text_model;
18
18
std::vector<float > image_embd_v; // image embedding vector
19
+
19
20
bool print_timings;
20
21
int n_threads;
21
22
std::string image_marker;
23
+ bool calc_image_hash;
22
24
23
25
// TODO @ngxson : add timings
24
26
25
27
mtmd_context (const char * mmproj_fname,
26
28
const llama_model * text_model,
27
- const mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
29
+ const mtmd_context_params & ctx_params) :
30
+ print_timings (ctx_params.print_timings),
31
+ n_threads (ctx_params.n_threads),
32
+ image_marker (ctx_params.image_marker),
33
+ calc_image_hash (ctx_params.calc_image_hash)
34
+ {
28
35
clip_context_params ctx_clip_params;
29
36
ctx_clip_params.use_gpu = ctx_params.use_gpu ;
30
37
ctx_clip_params.verbosity = ctx_params.verbosity ;
@@ -49,6 +56,7 @@ struct mtmd_image_tokens {
49
56
uint32_t ny; // number of tokens in y direction
50
57
uint32_t n_tokens () const { return nx * ny; }
51
58
clip_image_f32_batch batch_f32; // preprocessed image patches
59
+ size_t image_hash = 0 ; // hash of the image, useful for KV cache tracking
52
60
};
53
61
54
62
mtmd_context * mtmd_init_from_file (const char * mmproj_fname,
@@ -88,6 +96,16 @@ static std::vector<llama_token> mtmd_tokenize_text_internal(
88
96
return result;
89
97
}
90
98
99
// Computes an order-sensitive 64-bit hash over all elements of `vec`.
//
// The seed starts at the element count and every element is folded in with a
// boost::hash_combine-style mix, so an empty vector hashes to 0.
//
// The per-element hash comes from std::hash<float>, whose values are
// implementation-defined: the result is only meaningful for equality
// comparisons within the same build, and must not be persisted or compared
// across different standard libraries.
static uint64_t hash_vector_float(const std::vector<float> & vec) {
    // 64-bit golden-ratio constant. The 32-bit variant (0x9e3779b9) never
    // touches the upper half of a 64-bit seed, which weakens mixing when
    // std::hash<float> itself only yields a 32-bit-wide value.
    const uint64_t golden_ratio = 0x9e3779b97f4a7c15ULL;

    uint64_t seed = vec.size();
    const std::hash<float> hasher;
    for (const float val : vec) {
        // std::hash returns size_t; widen explicitly before mixing
        const uint64_t h = static_cast<uint64_t>(hasher(val));
        // inspired by boost::hash_combine
        seed ^= h + golden_ratio + (seed << 6) + (seed >> 2);
    }
    return seed;
}
108
+
91
109
mtmd_input_chunks * mtmd_tokenize (mtmd_context * ctx,
92
110
const mtmd_input_text & text,
93
111
const std::vector<mtmd_bitmap> & bitmaps) {
@@ -153,6 +171,11 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
153
171
image_tokens->ny = 1 ; // TODO
154
172
image_tokens->batch_f32 = std::move (batch_f32);
155
173
174
+ // optionally calculate the hash
175
+ if (ctx->calc_image_hash ) {
176
+ image_tokens->image_hash = hash_vector_float (image_tokens->batch_f32 .entries [0 ]->buf );
177
+ }
178
+
156
179
mtmd_input_chunk chunk{
157
180
MTMD_INPUT_CHUNK_TYPE_IMAGE,
158
181
{},
@@ -196,6 +219,10 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
196
219
return image_tokens->ny ;
197
220
}
198
221
222
+ uint64_t mtmd_image_tokens_get_hash (const mtmd_image_tokens * image_tokens) {
223
+ return image_tokens->image_hash ;
224
+ }
225
+
199
226
int32_t mtmd_encode (mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
200
227
int n_mmproj_embd = clip_n_mmproj_embd (ctx->ctx_clip );
201
228
ctx->image_embd_v .resize (image_tokens->n_tokens () * n_mmproj_embd);
0 commit comments