Skip to content

Commit 053b153

Browse files
threading: support for GGML_SCHED_PRIO_LOW, update thread info on Windows to avoid throttling (#12995)
* threading: support for GGML_SCHED_PRIO_LOW, update thread info on Windows to avoid throttling We talked about adding LOW priority for GGML threads in the original threadpool PR. It might be useful for some cases to avoid contention. Latest Windows ARM64 releases started parking (offlining) the CPU cores more aggressively, which results in suboptimal performance with n_threads > 4. To deal with that we now disable Power Throttling for our threads for the NORMAL and higher priorities. Co-authored-by: Diego Devesa <slarengh@gmail.com> * threading: disable SetThreadInfo() calls for older Windows versions * Update tools/llama-bench/llama-bench.cpp Co-authored-by: Diego Devesa <slarengh@gmail.com> --------- Co-authored-by: Diego Devesa <slarengh@gmail.com>
1 parent b3a89c3 commit 053b153

File tree

5 files changed

+29
-3
lines changed

5 files changed

+29
-3
lines changed

common/arg.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1348,9 +1348,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
13481348
));
13491349
add_opt(common_arg(
13501350
{"--prio"}, "N",
1351-
string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
1351+
string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
13521352
[](common_params & params, int prio) {
1353-
if (prio < 0 || prio > 3) {
1353+
if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
13541354
throw std::invalid_argument("invalid value");
13551355
}
13561356
params.cpuparams.priority = (enum ggml_sched_priority) prio;

common/common.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
203203

204204
DWORD p = NORMAL_PRIORITY_CLASS;
205205
switch (prio) {
206+
case GGML_SCHED_PRIO_LOW: p = BELOW_NORMAL_PRIORITY_CLASS; break;
206207
case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
207208
case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
208209
case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
@@ -228,6 +229,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
228229

229230
int p = 0;
230231
switch (prio) {
232+
case GGML_SCHED_PRIO_LOW: p = 5; break;
231233
case GGML_SCHED_PRIO_NORMAL: p = 0; break;
232234
case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
233235
case GGML_SCHED_PRIO_HIGH: p = -10; break;

ggml/include/ggml.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2181,6 +2181,7 @@ extern "C" {
21812181

21822182
// scheduling priorities
21832183
enum ggml_sched_priority {
2184+
GGML_SCHED_PRIO_LOW = -1,
21842185
GGML_SCHED_PRIO_NORMAL,
21852186
GGML_SCHED_PRIO_MEDIUM,
21862187
GGML_SCHED_PRIO_HIGH,

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2418,12 +2418,32 @@ static bool ggml_thread_apply_priority(int32_t prio) {
24182418
// This is up to the applications.
24192419
DWORD p = THREAD_PRIORITY_NORMAL;
24202420
switch (prio) {
2421+
case GGML_SCHED_PRIO_LOW: p = THREAD_PRIORITY_BELOW_NORMAL; break;
24212422
case GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break;
24222423
case GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break;
24232424
case GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break;
24242425
case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
24252426
}
24262427

2428+
if (prio != GGML_SCHED_PRIO_LOW) {
2429+
// Tell Windows that this thread should not be throttled (needs its own CPU core).
2430+
// Newer Windows 11 versions aggressively park (offline) CPU cores and often place
2431+
// all our threads onto the first 4 cores which results in terrible performance with
2432+
// n_threads > 4
2433+
#if _WIN32_WINNT >= 0x0602
2434+
THREAD_POWER_THROTTLING_STATE t;
2435+
ZeroMemory(&t, sizeof(t));
2436+
t.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION;
2437+
t.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED;
2438+
t.StateMask = 0;
2439+
2440+
if (!SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &t, sizeof(t))) {
2441+
GGML_LOG_DEBUG("failed to disable thread power throttling %d : (%d)\n", prio, (int) GetLastError());
2442+
return false;
2443+
}
2444+
#endif
2445+
}
2446+
24272447
if (prio == GGML_SCHED_PRIO_NORMAL) {
24282448
// Keep inherited policy/priority
24292449
return true;
@@ -2451,6 +2471,8 @@ static bool ggml_thread_apply_priority(int32_t prio) {
24512471
struct sched_param p;
24522472
int32_t policy = SCHED_OTHER;
24532473
switch (prio) {
2474+
// TODO: there seems to be no way to set lower prio on Apple platforms
2475+
case GGML_SCHED_PRIO_LOW: policy = SCHED_OTHER; p.sched_priority = 0; break;
24542476
case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
24552477
case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
24562478
case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
@@ -2507,6 +2529,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
25072529
struct sched_param p;
25082530
int32_t policy = SCHED_OTHER;
25092531
switch (prio) {
2532+
case GGML_SCHED_PRIO_LOW: policy = SCHED_BATCH; p.sched_priority = 0; break;
25102533
case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
25112534
case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
25122535
case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;

tools/llama-bench/llama-bench.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,7 @@ static void print_usage(int /* argc */, char ** argv) {
315315
printf(" --numa <distribute|isolate|numactl> numa mode (default: disabled)\n");
316316
printf(" -r, --repetitions <n> number of times to repeat each test (default: %d)\n",
317317
cmd_params_defaults.reps);
318-
printf(" --prio <0|1|2|3> process/thread priority (default: %d)\n",
318+
printf(" --prio <-1|0|1|2|3> process/thread priority (default: %d)\n",
319319
cmd_params_defaults.prio);
320320
printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n",
321321
cmd_params_defaults.delay);

0 commit comments

Comments
 (0)