Implement automatic NGL detection #6502
base: master
@@ -1650,6 +1650,28 @@ static size_t llama_get_device_memory(int device) {
#endif
}

// TODO: implement for other backends to return free memory
static size_t llama_get_available_device_memory(int device) {
#if defined(GGML_USE_CUDA)
    size_t free;
    ggml_backend_cuda_get_free_device_memory(device, &free);
    return free;
#elif defined(GGML_USE_SYCL)
    size_t total;
    size_t free;
    ggml_backend_sycl_get_free_device_memory(device, &total, &free);
    return free;
#elif defined(GGML_USE_VULKAN)
    size_t total;
    size_t free;
    ggml_backend_vk_get_free_device_memory(device, &total, &free);
    return free;
#else
    return 1;
    GGML_UNUSED(device);
#endif
}

//
// globals
//
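The ggml_backend_*_get_free_device_memory helpers called above are presumably added elsewhere in this PR and are not shown in this diff. For the CUDA case, such a helper would typically be a thin wrapper around cudaMemGetInfo; the following is only a sketch under that assumption, matching the single output-parameter signature used above:

```cpp
// Hypothetical sketch of the CUDA-side helper assumed above (not from the PR diff).
#include <cuda_runtime.h>
#include <cstddef>

void ggml_backend_cuda_get_free_device_memory(int device, size_t * free) {
    size_t total = 0;
    cudaSetDevice(device);         // select the queried device
    cudaMemGetInfo(free, &total);  // free/total VRAM of the current device, in bytes
}
```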
@@ -4329,6 +4351,32 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
}

static int llm_determine_max_ngl(const llama_model_loader & ml, const llama_model & model, const int main_gpu) {
    const auto & hparams = model.hparams;

    size_t available_gpu_memory = llama_get_available_device_memory(main_gpu);
Review comment: The program logic here is inconsistent with the …
    // TODO: This is a rough, pretty inaccurate estimate, should implement using existing layer size and not guesstimating
    size_t model_size = ml.n_bytes;
    int32_t model_layers = hparams.n_layer;
    size_t memory_per_layer = model_size / model_layers;

    // TODO: get buffer size dynamically
    int32_t buf_size   = 400 * MiB;
    int32_t buf_size_k = 200 * MiB;
    int32_t buf_size_v = 200 * MiB;

    int32_t total_buf_size = buf_size + buf_size_k + buf_size_v;

    available_gpu_memory = available_gpu_memory - hparams.n_ctx_train; // context size
    available_gpu_memory = available_gpu_memory - total_buf_size;      // buffer size

    // Calculate the maximum number of layers that can fit into the GPU memory
    int32_t max_ngl = std::floor(static_cast<float>(available_gpu_memory) / memory_per_layer);
Review comment: You should leave a small amount of headroom when the number of layers is determined automatically; you want to avoid a scenario where an application OOMs because llama.cpp only left 5 MB of VRAM.

Reply: I thought about doing that; thanks for the reminder.
    return max_ngl;
}

// Returns false if cancelled by progress_callback
static bool llm_load_tensors(
    llama_model_loader & ml,
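Picking up the headroom suggestion from the review thread above, the estimate could reserve a safety margin before dividing by the per-layer size. This is only an illustrative sketch; the 256 MiB floor and 5% fraction are made-up numbers, not values from the PR:

```cpp
#include <algorithm>
#include <cstddef>

// Illustrative only: keep some VRAM free so the desktop, other processes and
// allocator overhead do not OOM right after the model is loaded.
static size_t apply_vram_headroom(size_t available_gpu_memory) {
    const size_t min_headroom = 256u * 1024u * 1024u;       // assumed floor: 256 MiB
    const size_t pct_headroom = available_gpu_memory / 20;  // assumed margin: 5% of free VRAM
    const size_t headroom     = std::max(min_headroom, pct_headroom);
    return available_gpu_memory > headroom ? available_gpu_memory - headroom : 0;
}
```

llm_determine_max_ngl could then apply something like this to available_gpu_memory before computing max_ngl.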
@@ -4344,6 +4392,11 @@ static bool llm_load_tensors(

    auto & hparams = model.hparams;

    if (n_gpu_layers == -2) {
        n_gpu_layers = llm_determine_max_ngl(ml, model, main_gpu);
        LLAMA_LOG_INFO("%s: automatically set n_gpu_layers = %d\n", __func__, n_gpu_layers);
    }

    model.split_mode   = split_mode;
    model.main_gpu     = main_gpu;
    model.n_gpu_layers = n_gpu_layers;
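With this hunk in place, a value of -2 reaching llm_load_tensors triggers the automatic detection. A usage sketch through the public C API, assuming the existing -ngl / n_gpu_layers plumbing passes the value through unchanged (the model path below is a placeholder):

```cpp
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = -2;  // sentinel handled above: offload the max NGL that fits in free VRAM

    // "model.gguf" is a placeholder path, not a file from this PR
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model != nullptr) {
        llama_free_model(model);
    }

    llama_backend_free();
    return 0;
}
```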
Review comment: I agree it can be a breaking change, but I would prefer to have this approach as the default, i.e. if -ngl is not passed, automatically offload the maximum possible number of layers to VRAM.

Reply: Could be. If someone doesn't want that, they could simply pass -ngl 0 or just not compile with GPU support enabled.

Reply: Yes, but this is just my personal point of view. @ggerganov or @slaren would have a better global view.
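If that default were adopted, the change would presumably amount to initializing the parameter behind -ngl to the auto-detect sentinel; a minimal sketch, assuming the field keeps its current name in common/common.h:

```cpp
#include <cstdint>

// Hypothetical sketch, not part of this PR: make automatic offload the default
// when -ngl is not passed by starting from the -2 sentinel.
struct gpt_params_sketch {
    int32_t n_gpu_layers = -2;  // -2 => auto-detect via llm_determine_max_ngl
    // ... remaining fields unchanged ...
};
```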