Skip to content

Commit 4fc4bf6

Browse files
committed
Merge branch 'master' into huydt/mb
2 parents 820cee1 + 201b31d commit 4fc4bf6

File tree

16 files changed

+330
-47
lines changed

16 files changed

+330
-47
lines changed

.github/labeler.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,3 +86,10 @@ nix:
8686
embedding:
8787
- changed-files:
8888
- any-glob-to-any-file: examples/embedding/
89+
90+
Ascend NPU:
91+
- changed-files:
92+
- any-glob-to-any-file:
93+
- ggml/include/ggml-cann.h
94+
- ggml/src/ggml-cann/**
95+
- docs/backend/CANN.md

docs/backend/CANN.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
- [DataType Supports](#datatype-supports)
99
- [Docker](#docker)
1010
- [Linux](#linux)
11+
- [Environment variable setup](#environment-variable-setup)
1112
- [TODO](#todo)
1213

1314

@@ -290,5 +291,24 @@ Authors from Peking University: Bizhao Shi (bshi@pku.edu.cn), Yuxin Yang (yxyang
290291

291292
We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers from Huawei Technologies Co., Ltd for their help during the code development and pull request.
292293

294+
## Environment variable setup
295+
296+
### GGML_CANN_ASYNC_MODE
297+
298+
Enables asynchronous operator submission. Disabled by default.
299+
300+
### GGML_CANN_MEM_POOL
301+
302+
Specifies the memory pool management strategy:
303+
304+
- vmm: Utilizes a virtual memory manager pool. If hardware support for VMM is unavailable, falls back to the legacy (leg) memory pool.
305+
306+
- prio: Employs priority queue-based memory pool management.
307+
- leg: Uses a fixed-size buffer pool.
308+
309+
### GGML_CANN_DISABLE_BUF_POOL_CLEAN
310+
311+
Controls automatic cleanup of the memory pool. This option is only effective when using the prio or leg memory pool strategies.
312+
293313
## TODO
294314
- Support more models and data types.

ggml/src/ggml-cann/common.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include <thread>
3838
#include <unistd.h>
3939
#include <functional>
40+
#include <optional>
4041

4142
#include "../include/ggml-cann.h"
4243
#include "../include/ggml.h"
@@ -103,6 +104,9 @@ const ggml_cann_device_info& ggml_cann_info();
103104
void ggml_cann_set_device(int32_t device);
104105
int32_t ggml_cann_get_device();
105106

107+
std::optional<std::string> get_env(const std::string& name);
108+
bool parse_bool(const std::string& value);
109+
106110
/**
107111
* @brief Abstract base class for memory pools used by CANN.
108112
*/
@@ -354,7 +358,8 @@ struct ggml_backend_cann_context {
354358
: device(device), name("CANN" + std::to_string(device)), task_queue(1024, device) {
355359
ggml_cann_set_device(device);
356360
description = aclrtGetSocName();
357-
async_mode = (getenv("GGML_CANN_ASYNC_MODE") != nullptr);
361+
362+
bool async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
358363
GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
359364
device, async_mode ? "ON" : "OFF");
360365
}

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
#include <mutex>
3232
#include <queue>
3333
#include <chrono>
34+
#include <unordered_set>
35+
#include <optional>
3436

3537
#include "ggml-impl.h"
3638
#include "ggml-backend-impl.h"
@@ -93,6 +95,26 @@ int32_t ggml_cann_get_device() {
9395
return id;
9496
}
9597

98+
/**
 * @brief Get the value of the specified environment variable, lower-cased.
 *
 * @param name Name of the environment variable to read.
 * @return The variable's value converted to lower case, or std::nullopt if
 *         the variable is not set. Lower-casing allows callers (parse_bool,
 *         the GGML_CANN_MEM_POOL selection) to compare case-insensitively.
 */
std::optional<std::string> get_env(const std::string& name) {
    const char* val = std::getenv(name.c_str());
    if (!val) {
        return std::nullopt;
    }
    std::string res = std::string(val);
    // Cast through unsigned char: passing a plain (possibly negative) char to
    // std::tolower is undefined behavior for non-ASCII input.
    std::transform(res.begin(), res.end(), res.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
    return res;
}
109+
110+
/**
 * @brief Parse a (lower-case) string as a boolean switch.
 *
 * @param value String to test; expected to be already lower-cased, which is
 *              what get_env() returns (so "ON" in the environment matches).
 * @return true iff value is one of "on", "1", "yes", "y", "enable", "true".
 */
bool parse_bool(const std::string& value) {
    // static const: build the lookup table once instead of allocating a fresh
    // unordered_set on every call.
    static const std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
    return valid_values.find(value) != valid_values.end();
}
117+
96118
/**
97119
* @brief Initialize the CANN device information.
98120
*
@@ -214,7 +236,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
214236
* @param device The device ID to associate with this buffer pool.
215237
*/
216238
explicit ggml_cann_pool_buf_prio(int device) : device(device) {
217-
disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
239+
disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
218240
}
219241

220242
/**
@@ -410,7 +432,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
410432
* @param device The device ID to associate with this buffer pool.
411433
*/
412434
explicit ggml_cann_pool_buf(int device) : device(device) {
413-
disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
435+
disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
414436
}
415437

416438
/**
@@ -731,16 +753,18 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
731753
*/
732754
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
733755
int device) {
734-
bool disable_vmm = (getenv("GGML_CANN_DISABLE_VMM_POOL") != nullptr);
735-
if (!disable_vmm && ggml_cann_info().devices[device].vmm) {
736-
GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
737-
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
738-
}
739-
bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr);
740-
if (enable_buf_prio) {
756+
std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
757+
758+
if (mem_pool_type == "prio") {
741759
GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
742760
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
743761
}
762+
763+
if (ggml_cann_info().devices[device].vmm && mem_pool_type != "leg") {
764+
GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
765+
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
766+
}
767+
744768
GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
745769
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
746770
}

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1144,7 +1144,6 @@ typedef void (*ggml_cuda_op_mul_mat_t)(
11441144
static cudaError_t ggml_cuda_cpy_tensor_2d(
11451145
void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
11461146

1147-
GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer));
11481147
const char * src_ptr = (const char *) src->data;
11491148
char * dst_ptr = (char *) dst;
11501149

@@ -1427,8 +1426,6 @@ static void ggml_cuda_op_mul_mat(
14271426
const int64_t nb2 = dst->nb[2];
14281427
const int64_t nb3 = dst->nb[3];
14291428

1430-
GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
1431-
GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
14321429
ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context;
14331430
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *) dst->buffer->context;
14341431

@@ -1750,7 +1747,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
17501747
GGML_ASSERT(!ggml_is_transposed(src0));
17511748
GGML_ASSERT(!ggml_is_transposed(src1));
17521749

1753-
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
1750+
GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft));
17541751
GGML_ASSERT(src0->type == GGML_TYPE_F16);
17551752

17561753
// Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.

ggml/src/ggml-sycl/convert.cpp

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,17 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k,
265265
#endif
266266
}
267267

268+
/**
 * @brief Dequantize a row of reorder-layout Q6_K blocks into y on the device.
 *
 * Launches one 64-item work-group per Q6_K block; each work-item writes four
 * output values (see dequantize_block_q6_K_reorder).
 *
 * @param vx     Device pointer to reorder-layout Q6_K data.
 * @param y      Device pointer to the dequantized output (k elements).
 * @param k      Number of elements; assumed to be a multiple of QK_K.
 * @param stream SYCL queue to submit the kernel to.
 */
template <typename dst_t>
static void dequantize_row_q6_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
    const int64_t nb = k / QK_K;  // number of Q6_K super-blocks

    // Kernel reads ggml_half scale factors, so the device must support fp16.
    dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });

    stream->parallel_for(
        sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
        [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K_reorder(vx, y, item_ct1, nb); });
}
278+
268279
template <typename dst_t>
269280
static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int64_t k,
270281
dpct::queue_ptr stream) {
@@ -530,7 +541,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
530541
case GGML_TYPE_Q5_K:
531542
return dequantize_row_q5_K_sycl;
532543
case GGML_TYPE_Q6_K:
533-
return dequantize_row_q6_K_sycl;
544+
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
545+
return dequantize_row_q6_K_sycl_reorder;
546+
} else {
547+
return dequantize_row_q6_K_sycl;
548+
}
534549
case GGML_TYPE_IQ1_S:
535550
return dequantize_row_iq1_s_sycl;
536551
case GGML_TYPE_IQ1_M:
@@ -587,7 +602,11 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
587602
case GGML_TYPE_Q5_K:
588603
return dequantize_row_q5_K_sycl;
589604
case GGML_TYPE_Q6_K:
590-
return dequantize_row_q6_K_sycl;
605+
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
606+
return dequantize_row_q6_K_sycl_reorder;
607+
} else {
608+
return dequantize_row_q6_K_sycl;
609+
}
591610
case GGML_TYPE_IQ1_S:
592611
return dequantize_row_iq1_s_sycl;
593612
case GGML_TYPE_IQ1_M:

ggml/src/ggml-sycl/dequantize.hpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -538,6 +538,38 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
538538
#endif
539539
}
540540

541+
/**
 * @brief Dequantize one Q6_K block stored in the "reorder" layout.
 *
 * The reorder layout is struct-of-arrays across all n_blocks: first all ql
 * arrays (QK_K/2 bytes per block), then all qh arrays (QK_K/4 bytes), then
 * all scales (QK_K/16 bytes), then all ggml_half d values — matching the
 * layout produced by reorder_qw_q6_k(). Each of the 64 work-items in the
 * group writes 4 of the block's QK_K output values.
 *
 * @param vx       Device pointer to the reordered quantized data.
 * @param yy       Device pointer to the dequantized output.
 * @param item_ct1 SYCL nd_item; group index selects the block, local id the lane.
 * @param n_blocks Total number of Q6_K blocks in vx (needed to locate the
 *                 qh/scales/d sections of the reordered buffer).
 */
template <typename dst_t>
static void dequantize_block_q6_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                          const sycl::nd_item<3> & item_ct1, int64_t n_blocks) {
    const int64_t ib = item_ct1.get_group(2);  // block index

    const int64_t tid = item_ct1.get_local_id(2);
    const int64_t ip = tid / 32;       // ip is 0 or 1: lower/upper half of the block
    const int64_t il = tid - 32 * ip;  // 0...32: lane within the half
    const int64_t is = 8 * ip + il / 16;  // scale index for this lane

    // Section offsets within the reordered buffer (ql | qh | scales | d).
    const uint8_t * base_ptr = static_cast<const uint8_t *>(vx);
    const auto ql_offset = ib * (QK_K / 2);
    const auto qh_offset = (QK_K / 2) * n_blocks + (QK_K / 4) * ib;
    const auto base_scales_offset = (QK_K / 2) * n_blocks + (QK_K / 4) * n_blocks + (QK_K / 16) * ib;
    const auto base_d_offset = ((QK_K / 2) + (QK_K / 4) + (QK_K / 16)) * n_blocks;
    const uint8_t * ql_ptr = base_ptr + ql_offset;
    const uint8_t * qh_ptr = base_ptr + qh_offset;
    const uint8_t * scales_ptr = base_ptr + base_scales_offset;
    const ggml_half * d = (const ggml_half *) (base_ptr + base_d_offset) + ib;

    dst_t * y = yy + ib * QK_K + 128 * ip + il;

    const uint8_t * ql = ql_ptr + 64 * ip + il;
    const uint8_t qh = *(qh_ptr + 32 * ip + il);
    const int8_t * sc = reinterpret_cast<const int8_t *>(scales_ptr + is);

    // Reassemble each 6-bit value from 4 low bits (ql) and 2 high bits (qh),
    // recenter by -32, then scale by the per-group scale and block d.
    y[0] = *d * sc[0] * ((int8_t) ((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
    y[32] = *d * sc[2] * ((int8_t) ((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
    y[64] = *d * sc[4] * ((int8_t) ((ql[0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
    y[96] = *d * sc[6] * ((int8_t) ((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
}
572+
541573
template<typename dst_t>
542574
static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
543575
const sycl::nd_item<3> &item_ct1,

ggml/src/ggml-sycl/ggml-sycl.cpp

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -354,7 +354,8 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
354354
assert(tensor->view_src->buffer->buft == buffer->buft);
355355
return GGML_STATUS_SUCCESS;
356356
}
357-
if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K) && !g_ggml_sycl_disable_optimize) {
357+
if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K || tensor->type == GGML_TYPE_Q6_K) &&
358+
!g_ggml_sycl_disable_optimize) {
358359
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
359360
tensor->extra = extra;
360361
ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
@@ -2989,6 +2990,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
29892990
case GGML_TYPE_Q4_0:
29902991
return true;
29912992
case GGML_TYPE_Q4_K:
2993+
case GGML_TYPE_Q6_K:
29922994
return !g_ggml_sycl_prioritize_dmmv;
29932995
default:
29942996
return false;
@@ -3008,6 +3010,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
30083010
switch (type) {
30093011
case GGML_TYPE_Q4_0:
30103012
case GGML_TYPE_Q4_K:
3013+
case GGML_TYPE_Q6_K:
30113014
return true;
30123015
default:
30133016
return false;
@@ -3092,6 +3095,50 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
30923095
sycl::free(tmp_buf, *stream);
30933096
}
30943097

3098+
/**
 * @brief Reorder Q6_K data in place from array-of-structs (block_q6_K[]) to
 *        struct-of-arrays: all ql sections, then all qh, then all scales,
 *        then all d half-floats. This is the layout consumed by
 *        dequantize_block_q6_K_reorder.
 *
 * @param data_device Device pointer to the Q6_K data to reorder (overwritten).
 * @param size        Size in bytes; must be a multiple of sizeof(block_q6_K).
 * @param offset      Byte offset; must be a multiple of sizeof(block_q6_K).
 *                    NOTE(review): offset is validated but otherwise unused
 *                    here — confirm callers always pass 0.
 * @param stream      SYCL queue used for the copy and the reorder kernel.
 */
static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
    GGML_ASSERT(size % sizeof(block_q6_K) == 0);
    GGML_ASSERT(offset % sizeof(block_q6_K) == 0);

    const int nblocks = size / sizeof(block_q6_K);

    // Snapshot the original AoS data: the reorder below overwrites data_device.
    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));

    // Section base pointers of the SoA output, carved out of data_device.
    auto * ql_ptr = data_device;
    auto * qh_ptr = ql_ptr + (QK_K / 2) * nblocks;
    auto * scales_ptr = qh_ptr + (QK_K / 4) * nblocks;
    sycl::half * dm_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * nblocks);

    // One work-item per block: scatter its ql/qh/scales/d into the sections.
    stream
        ->parallel_for(nblocks,
                       [=](auto i) {
                           const block_q6_K * x = (const block_q6_K *) tmp_buf;
                           const int ib = i;

                           const uint8_t * ql = x[ib].ql;
                           const uint8_t * qh = x[ib].qh;
                           uint8_t * base_ql_ptr = ql_ptr + (QK_K / 2) * ib;
                           uint8_t * base_qh_ptr = qh_ptr + (QK_K / 4) * ib;
                           uint8_t * base_scales_ptr = scales_ptr + (QK_K / 16) * ib;

                           for (int j = 0; j < QK_K / 2; ++j) {
                               base_ql_ptr[j] = ql[j];
                           }
                           for (int j = 0; j < QK_K / 4; ++j) {
                               base_qh_ptr[j] = qh[j];
                           }

                           for (int j = 0; j < QK_K / 16; ++j) {
                               base_scales_ptr[j] = x[ib].scales[j];
                           }

                           dm_ptr[ib] = x[ib].d;
                       })
        .wait_and_throw();

    sycl::free(tmp_buf, *stream);
}
3141+
30953142
static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
30963143
uint8_t * data_device = (uint8_t *) src0->data;
30973144
size_t ncols = src0->ne[0];
@@ -3105,6 +3152,9 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
31053152
case GGML_TYPE_Q4_K:
31063153
reorder_qw_q4_k(data_device, size, 0, stream);
31073154
break;
3155+
case GGML_TYPE_Q6_K:
3156+
reorder_qw_q6_k(data_device, size, 0, stream);
3157+
break;
31083158
default:
31093159
GGML_ABORT("reorder_qw() called with unsupported type");
31103160
break;

0 commit comments

Comments
 (0)