Implement automatic NGL detection #6502

Draft: wants to merge 7 commits into master

Changes from 3 commits

10 changes: 8 additions & 2 deletions common/common.cpp
@@ -836,7 +836,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true;
return true;
}
params.n_gpu_layers = std::stoi(argv[i]);
std::string argValue = argv[i];
if (argValue == "auto" || argValue == "a") {
Collaborator:

I agree it can be a breaking change, but I would prefer to have this approach as the default: if -ngl is not passed, automatically offload the maximum possible number of layers to VRAM.

Contributor Author:

Could be. If someone doesn't want that, they could simply pass -ngl 0, or just not compile with GPU support enabled.

Collaborator:

Yes, but this is just my personal point of view. @ggerganov or @slaren would have a better global view.

params.n_gpu_layers = -2;
} else {
params.n_gpu_layers = std::stoi(argValue);
}
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
@@ -1407,6 +1412,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
if (llama_supports_gpu_offload()) {
printf(" -ngl N, --n-gpu-layers N\n");
printf(" number of layers to store in VRAM\n");
printf(" set to 'auto' or 'a' to determine max automatically based on VRAM size\n");
printf(" -ngld N, --n-gpu-layers-draft N\n");
printf(" number of layers to store in VRAM for the draft model\n");
printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
@@ -2480,7 +2486,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
fprintf(stream, "n_gpu_layers: %d # default: -1, auto: -2\n", params.n_gpu_layers);
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
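For reference, here is a minimal standalone sketch of the parsing pattern introduced above: "auto" or "a" maps to the -2 sentinel, and anything else is parsed as an explicit layer count. The parse_n_gpu_layers helper and the try/catch around std::stoi are illustrative assumptions and are not part of this PR, which modifies gpt_params_find_arg directly and calls std::stoi without exception handling.

#include <cstdio>
#include <cstdlib>
#include <stdexcept>
#include <string>

// Hypothetical helper mirroring the -ngl parsing above.
static int parse_n_gpu_layers(const std::string & arg_value) {
    if (arg_value == "auto" || arg_value == "a") {
        return -2; // sentinel: determine the max number of layers from free VRAM at load time
    }
    try {
        return std::stoi(arg_value); // explicit layer count
    } catch (const std::exception &) {
        fprintf(stderr, "error: invalid value for -ngl: '%s'\n", arg_value.c_str());
        exit(1);
    }
}

int main() {
    printf("%d\n", parse_n_gpu_layers("auto")); // prints -2
    printf("%d\n", parse_n_gpu_layers("33"));   // prints 33
    return 0;
}

If the reviewers' preference of making automatic offload the default were adopted, the same sentinel could simply become the default value of n_gpu_layers instead of being opt-in via "auto".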
2 changes: 1 addition & 1 deletion common/common.h
@@ -62,7 +62,7 @@ struct gpt_params {
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default, -2 - automatically determine)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
5 changes: 5 additions & 0 deletions ggml-cuda.cu
@@ -2612,6 +2612,11 @@ GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, si
CUDA_CHECK(cudaMemGetInfo(free, total));
}

GGML_CALL void ggml_backend_cuda_get_free_device_memory(int device, size_t * free) {
size_t total;
ggml_backend_cuda_get_device_memory(device, free, &total);
}

GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
return false;
1 change: 1 addition & 0 deletions ggml-cuda.h
@@ -34,6 +34,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type
GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
GGML_API GGML_CALL void ggml_backend_cuda_get_free_device_memory(int device, size_t * free);

GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
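A caller-side sketch of the new query, assuming a CUDA-enabled build. The standalone program (main, the MiB formatting) is illustrative only; within this PR the function is called from llama.cpp's llama_get_available_device_memory.

#include <cstdio>
#include "ggml-cuda.h"

int main() {
    size_t free_vram = 0;
    // Query the amount of currently free VRAM on device 0 using the helper added in this PR.
    ggml_backend_cuda_get_free_device_memory(0, &free_vram);
    printf("free VRAM on device 0: %.1f MiB\n", free_vram / (1024.0 * 1024.0));
    return 0;
}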
11 changes: 11 additions & 0 deletions ggml-sycl.cpp
@@ -16022,6 +16022,17 @@ catch (sycl::exception const &exc) {
std::exit(1);
}

GGML_CALL void ggml_backend_sycl_get_free_device_memory(int device, size_t *free) try {
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_free_device_memory\n");
size_t total;
ggml_backend_sycl_get_device_memory(device, free, &total);
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
<< ", line:" << __LINE__ << std::endl;
std::exit(1);
}

////////////////////////////////////////////////////////////////////////////////

// backend interface
1 change: 1 addition & 0 deletions ggml-sycl.h
@@ -33,6 +33,7 @@ GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
GGML_API GGML_CALL void ggml_backend_sycl_get_free_device_memory(int device, size_t *free);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);

// TODO: these are temporary
15 changes: 15 additions & 0 deletions ggml-vulkan.cpp
@@ -5781,6 +5781,21 @@ GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size
}
}

GGML_CALL void ggml_backend_vk_get_free_device_memory(int device, size_t * free) {
GGML_ASSERT(device < (int) vk_instance.device_indices.size());

vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];

vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();

for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
*free = heap.size;
break;
}
}
}

// backend registry
GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, void * user_data) {
ggml_backend_t vk_backend = ggml_backend_vk_init((int) (intptr_t) user_data);
1 change: 1 addition & 0 deletions ggml-vulkan.h
@@ -19,6 +19,7 @@ GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
GGML_API GGML_CALL void ggml_backend_vk_get_free_device_memory(int device, size_t * free);

GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
53 changes: 53 additions & 0 deletions llama.cpp
@@ -1650,6 +1650,28 @@ static size_t llama_get_device_memory(int device) {
#endif
}

// TODO: implement for other backends to return free memory
static size_t llama_get_available_device_memory(int device) {
#if defined(GGML_USE_CUDA)
size_t free;
ggml_backend_cuda_get_free_device_memory(device, &free);
return free;
#elif defined(GGML_USE_SYCL)
size_t free;
ggml_backend_sycl_get_free_device_memory(device, &free);
return free;
#elif defined(GGML_USE_VULKAN)
size_t free;
ggml_backend_vk_get_free_device_memory(device, &free);
return free;
#else
return 1;
GGML_UNUSED(device);
#endif
}

//
// globals
//
@@ -4329,6 +4351,32 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
}

static int llm_determine_max_ngl(const llama_model_loader & ml, const llama_model & model, const int main_gpu) {
const auto & hparams = model.hparams;

size_t available_gpu_memory = llama_get_available_device_memory(main_gpu);
Collaborator:

The program logic here is inconsistent with the --help text. The --help says "based on VRAM size", which I would interpret as total VRAM, but here you are using the amount of free VRAM.


// TODO: This is a rough, pretty inaccurate estimate, should implement using existing layer size and not guesstimating
size_t model_size = ml.n_bytes;
int32_t model_layers = hparams.n_layer;
size_t memory_per_layer = model_size / model_layers;

// TODO: get buffer size dynamically
int32_t buf_size = 400 * MiB;
int32_t buf_size_k = 200 * MiB;
int32_t buf_size_v = 200 * MiB;

int32_t total_buf_size = buf_size + buf_size_k + buf_size_v;

available_gpu_memory = available_gpu_memory - hparams.n_ctx_train; // context size
available_gpu_memory = available_gpu_memory - total_buf_size; // buffer size

// Calculate the maximum number of layers that can fit into the GPU memory
int32_t max_ngl = std::floor(static_cast<float>(available_gpu_memory) / memory_per_layer);
Collaborator:

You should leave a small amount of headroom when the number of layers is determined automatically; you would want to avoid a scenario where an application ooms because llama.cpp only left 5 MB of VRAM.

Contributor Author:

I thought about doing that; thanks for the reminder.


return max_ngl;
}

// Returns false if cancelled by progress_callback
static bool llm_load_tensors(
llama_model_loader & ml,
@@ -4344,6 +4392,11 @@ static bool llm_load_tensors(

auto & hparams = model.hparams;

if (n_gpu_layers == -2) {
n_gpu_layers = llm_determine_max_ngl(ml, model, main_gpu);
LLAMA_LOG_INFO("%s: automatically set n_gpu_layers = %d\n", __func__, n_gpu_layers);
}

model.split_mode = split_mode;
model.main_gpu = main_gpu;
model.n_gpu_layers = n_gpu_layers;
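Following up on the review suggestion to leave headroom when the layer count is determined automatically, here is a minimal sketch of how the estimate in llm_determine_max_ngl could reserve a safety margin before dividing free VRAM by the per-layer size. The function name, the standalone signature, and the 512 MiB margin are assumptions for illustration and are not part of this PR.

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Illustrative sketch only: estimate how many layers fit in free VRAM while
// reserving a fixed headroom so other applications are not starved of memory.
// Inputs are plain values to keep the sketch self-contained.
static int32_t estimate_max_ngl(size_t free_vram_bytes,   // from the backend free-memory query
                                size_t model_size_bytes,  // total size of the model weights
                                int32_t n_layers,         // number of transformer layers
                                size_t reserved_bytes) {  // assumed headroom, e.g. 512 MiB
    if (n_layers <= 0 || model_size_bytes == 0) {
        return 0;
    }
    const size_t bytes_per_layer = model_size_bytes / (size_t) n_layers;
    if (free_vram_bytes <= reserved_bytes || bytes_per_layer == 0) {
        return 0; // not enough memory to offload anything safely
    }
    const size_t usable = free_vram_bytes - reserved_bytes;
    const int32_t max_ngl = (int32_t) (usable / bytes_per_layer);
    return std::min(max_ngl, n_layers); // never report more layers than the model has
}

// Example: 8 GiB free, a 4 GiB model with 32 layers, and 512 MiB headroom ->
// usable = 7.5 GiB, bytes_per_layer = 128 MiB, so all 32 layers fit.

The same subtraction of context and KV/compute buffer sizes that the PR already performs would happen before calling such a helper; the only addition is the reserved_bytes margin.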