Skip to content

Commit 7ac0b7b

Browse files
committed
mtmd : ability to calc image hash
1 parent a46b6db commit 7ac0b7b

File tree

3 files changed

+37
-5
lines changed

3 files changed

+37
-5
lines changed

examples/llava/gemma3-cli.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ struct gemma3_context {
8989
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
9090
/* use_gpu */ true,
9191
/* timings */ true,
92+
/* hash */ false,
9293
/* n_threads */ params.cpuparams.n_threads,
9394
/* verbosity */ GGML_LOG_LEVEL_INFO,
9495
}));

examples/llava/mtmd.cpp

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,22 @@ struct mtmd_context {
1616
struct clip_ctx * ctx_clip;
1717
const struct llama_model * text_model;
1818
std::vector<float> image_embd_v; // image embedding vector
19+
1920
bool print_timings;
2021
int n_threads;
2122
std::string image_marker;
23+
bool calc_image_hash;
2224

2325
// TODO @ngxson : add timings
2426

2527
mtmd_context(const char * mmproj_fname,
2628
const llama_model * text_model,
27-
const mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
29+
const mtmd_context_params & ctx_params) :
30+
print_timings (ctx_params.print_timings),
31+
n_threads (ctx_params.n_threads),
32+
image_marker (ctx_params.image_marker),
33+
calc_image_hash(ctx_params.calc_image_hash)
34+
{
2835
clip_context_params ctx_clip_params;
2936
ctx_clip_params.use_gpu = ctx_params.use_gpu;
3037
ctx_clip_params.verbosity = ctx_params.verbosity;
@@ -49,6 +56,7 @@ struct mtmd_image_tokens {
4956
uint32_t ny; // number of tokens in y direction
5057
uint32_t n_tokens() const { return nx * ny; }
5158
clip_image_f32_batch batch_f32; // preprocessed image patches
59+
size_t image_hash = 0; // hash of the image, useful for KV cache tracking
5260
};
5361

5462
mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
@@ -88,6 +96,16 @@ static std::vector<llama_token> mtmd_tokenize_text_internal(
8896
return result;
8997
}
9098

99+
// Computes a 64-bit FNV-1a hash over the raw bytes of `vec`.
//
// std::hash<float> is deliberately not used: its result is implementation-defined,
// only as wide as size_t, and only required to be consistent within a single
// execution of the program. FNV-1a produces the same value for the same byte
// sequence on every run, which is what an identifier used for cache tracking needs.
//
// Notes:
//  - values are hashed by bit pattern, so 0.0f and -0.0f hash differently
//  - the result depends on the host's float byte representation (size/endianness)
//  - an empty vector hashes to the FNV offset basis
static uint64_t hash_vector_float(const std::vector<float> & vec) {
    constexpr uint64_t fnv_offset_basis = 0xcbf29ce484222325ULL;
    constexpr uint64_t fnv_prime        = 0x100000001b3ULL;

    // inspecting an object through unsigned char is allowed by the aliasing rules
    const unsigned char * bytes   = reinterpret_cast<const unsigned char *>(vec.data());
    const size_t          n_bytes = vec.size() * sizeof(float);

    uint64_t hash = fnv_offset_basis;
    for (size_t i = 0; i < n_bytes; ++i) {
        hash ^= bytes[i];
        hash *= fnv_prime; // unsigned arithmetic: wraps modulo 2^64 by definition
    }
    return hash;
}
108+
91109
mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
92110
const mtmd_input_text & text,
93111
const std::vector<mtmd_bitmap> & bitmaps) {
@@ -153,6 +171,11 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
153171
image_tokens->ny = 1; // TODO
154172
image_tokens->batch_f32 = std::move(batch_f32);
155173

174+
// optionally calculate the hash
175+
if (ctx->calc_image_hash) {
176+
image_tokens->image_hash = hash_vector_float(image_tokens->batch_f32.entries[0]->buf);
177+
}
178+
156179
mtmd_input_chunk chunk{
157180
MTMD_INPUT_CHUNK_TYPE_IMAGE,
158181
{},
@@ -196,6 +219,10 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
196219
return image_tokens->ny;
197220
}
198221

222+
uint64_t mtmd_image_tokens_get_hash(const mtmd_image_tokens * image_tokens) {
223+
return image_tokens->image_hash;
224+
}
225+
199226
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
200227
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
201228
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);

examples/llava/mtmd.h

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
5252
struct mtmd_context_params {
5353
bool use_gpu = true;
5454
bool print_timings = true;
55+
// calc_image_hash is useful for tracking KV cache
56+
// if not set, mtmd_image_tokens_get_hash will return 0
57+
bool calc_image_hash = false;
5558
int n_threads = 4;
5659
enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
5760
const char * image_marker = "<__image__>";
@@ -91,10 +94,11 @@ MTMD_API mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
9194
MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images);
9295

9396
// access mtmd_image_tokens
94-
MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
95-
MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
96-
MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
97-
MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
97+
MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
98+
MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
99+
MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
100+
MTMD_API uint64_t mtmd_image_tokens_get_hash(const mtmd_image_tokens * image_tokens);
101+
MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
98102

99103
// returns 0 on success
100104
MTMD_API int32_t mtmd_encode(mtmd_context * ctx,

0 commit comments

Comments
 (0)