mtmd : add qwen2vl and qwen2.5vl #13141
Merged
Commits (10, all by ngxson):

6f7a55c  llava : add clip_n_output_tokens, deprecate clip_n_patches
8742f8a  mtmd : add qwen2vl and qwen2.5vl
8646e36  decode_embd_batch::set_position_...
513e9c9  Merge branch 'master' into xsn/mtmd_qwen2vl
b303584  working version
14a6ab1  Merge branch 'master' into xsn/mtmd_qwen2vl
d23fdc2  deprecate llama-qwen2vl-cli
e9bff42  Merge branch 'master' into xsn/mtmd_qwen2vl
496f1ce  correct order W, H of clip_embd_nbytes_by_img
db85de1  edit existing line in hot topics
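The central API change is replacing the fixed clip_n_patches with an image-aware clip_n_output_tokens. A minimal sketch of the resulting clip.h declarations, reconstructed from the call sites in the diff below (the exact header wording and deprecation notes are an assumption, not a quote from the PR):

    // sketch of the updated public API, inferred from the diff below
    CLIP_API int clip_n_patches       (const struct clip_ctx * ctx);                              // deprecated: fixed count per model
    CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img); // deprecated
    CLIP_API int clip_n_output_tokens (const struct clip_ctx * ctx, struct clip_image_f32 * img); // count for this specific image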
Diff (llava.cpp):

@@ -112,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
 }

 // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
-static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
+static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
     struct {
         struct ggml_context * ctx;
     } model;
|
@@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> | |
|
||
model.ctx = ggml_init(params); | ||
|
||
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4 | ||
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4 | ||
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false); | ||
// fill it with the image embeddings, ignoring the base | ||
for (size_t i = 1; i < num_images; i++) { | ||
|
@@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
-    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
-    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
+    memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
+    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));

     // Debug: Test single segments
     // Current findings: sending base image, sending a segment embedding all works similar to python
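The offset arithmetic in this hunk lays out the base (global-context) image embeddings first and the grid patches immediately after. Spelling out the example figures from the diff's own comments (4096-dim projected embeddings, 576 tokens for the base image; values illustrative only):

    // worked example of the grid-patch offset, using the "4096 x 576"
    // figures quoted in the comments above
    const int    n_embd      = 4096;                       // clip_n_mmproj_embd(ctx_clip)
    const int    n_tokens    = 576;                        // clip_n_output_tokens(ctx_clip, img_input)
    const size_t grid_offset = (size_t) n_tokens * n_embd; // 2359296 floats past image_embd_out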
@@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
                 image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                 image_embd_v[i],
                 clip_embd_nbytes_by_img(ctx_clip, nx, ny));
-            n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
+            n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
         }
         *n_img_pos = n_img_pos_out;
         for (size_t i = 0; i < image_embd_v.size(); i++) {

Review comment on the clip_embd_nbytes_by_img(ctx_clip, nx, ny) line: "This is the other call."
@@ -342,8 +342,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     }
     else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
-        *n_img_pos = clip_n_patches(ctx_clip);
+        clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
+        *n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
         bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
         if (!encoded) {
             LOG_ERR("Unable to encode image\n");
@@ -381,7 +381,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);

         int n_img_pos_out;
-        clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
+        clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
+        clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
         *n_img_pos = n_img_pos_out;

         for (size_t i = 0; i < image_embd_v.size(); i++) {
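Taken together, a caller of the updated API sizes its embedding buffer and token count per image instead of per model. A minimal sketch of that pattern, assuming the signatures visible in the hunks above (clip_embd_nbytes_by_img taking width then height follows commit 496f1ce; direct img->nx access mirrors llava.cpp internals; loading, preprocessing, and error handling are elided):

    // caller-side sketch, assuming the API shown above; not code from the PR
    static void encode_one_image_sketch(clip_ctx * ctx_clip, clip_image_f32 * img, int n_threads) {
        // buffer size for this image: nx (width) first, then ny (height)
        size_t nbytes = clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny);
        float * image_embd = (float *) malloc(nbytes);

        clip_image_encode(ctx_clip, n_threads, img, image_embd);

        // token count depends on this image, not just the model
        int n_tokens = clip_n_output_tokens(ctx_clip, img);
        (void) n_tokens; // a real caller would use this to build the decode batch

        free(image_embd);
    }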
Review comment: Rename img_h and img_w to nx and ny and fix calls.

Reply: Fixed in 496f1ce