@@ -80,7 +80,11 @@ void SQ4BitGemm_CompInt8(
 
     size_t CountN;
 
+#if defined(RISCV64_SPACEMIT_IME1)
     const size_t ComputeBlockCountN = RangeCountM == 1 ? RangeCountN : 16;
+#elif defined(RISCV64_SPACEMIT_IME2)
+    const size_t ComputeBlockCountN = RangeCountM == 1 ? RangeCountN : 32;
+#endif
 
     for (size_t n = 0; n < RangeCountN; n += CountN) {
         CountN = std::min(RangeCountN - n, ComputeBlockCountN);
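A quick toy walk of the column loop above under the IME2 tile width (not part of the patch; the values are made up): with `RangeCountM > 1` the tile is 32 columns, so a 100-column range is processed as 32, 32, 32 and a 4-column tail.

```cpp
// Illustrative only: how ComputeBlockCountN drives the column loop in the hunk above.
#include <algorithm>
#include <cstdio>

int main() {
    const size_t RangeCountN = 100;        // example range width
    const size_t ComputeBlockCountN = 32;  // IME2 build, RangeCountM > 1
    size_t CountN;
    for (size_t n = 0; n < RangeCountN; n += CountN) {
        CountN = std::min(RangeCountN - n, ComputeBlockCountN);
        printf("n=%zu CountN=%zu\n", n, CountN);  // 0/32, 32/32, 64/32, 96/4
    }
    return 0;
}
```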
@@ -279,6 +283,8 @@ struct block {
 };
 
 // control size
+static_assert(sizeof(block<4, 32>) == 32 * sizeof(ggml_half) + QK4_0 * 16, "wrong block<4,32> size/padding");
+static_assert(sizeof(block<8, 32>) == 32 * sizeof(ggml_half) + QK4_0 * 32, "wrong block<8,32> size/padding");
 static_assert(sizeof(block<4, 16>) == 16 * sizeof(ggml_half) + QK4_0 * 8, "wrong block<4,16> size/padding");
 static_assert(sizeof(block<8, 16>) == 16 * sizeof(ggml_half) + QK4_0 * 16, "wrong block<8,16> size/padding");
 
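For reference, a minimal size check behind the new asserts, assuming `QK4_0 == 32` and a 2-byte `ggml_half` (both standard in ggml) and the same layout the existing `block<4,16>`/`block<8,16>` asserts imply: 32 fp16 scales followed by the packed quants of 32 rows. The snippet below is illustration only, not part of the patch.

```cpp
// block<4, 32>: 32 scales * 2 B + 32 rows * 32 weights * 4 bit = 64 + 512 = 576 B.
// block<8, 32>: 32 scales * 2 B + 32 rows * 32 weights * 8 bit = 64 + 1024 = 1088 B.
#include <cstdio>

int main() {
    const int QK4_0 = 32;     // weights per q4_0 block
    const int half_size = 2;  // sizeof(ggml_half)
    int q4x32 = 32 * half_size + 32 * QK4_0 * 4 / 8;  // 576 == 32*2 + QK4_0*16
    int q8x32 = 32 * half_size + 32 * QK4_0 * 8 / 8;  // 1088 == 32*2 + QK4_0*32
    printf("block<4,32>=%d bytes, block<8,32>=%d bytes\n", q4x32, q8x32);
    return 0;
}
```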
@@ -296,24 +302,55 @@ static block_q4_0x16 make_block_q4_0x16(block_q4_0* in, unsigned int blck_size_i
     for (int i = 0; i < 16; i++) {
         // [0, 15], in.d & 0x0F
         for (int j = 0; j < QK4_0 / 4; j++) {
-            // [b0 b16] ......... [b8 b24] ......... [b15 b31]
-            // [b0 b8]  ......... [b7 b15]
+            // src [b0 b16] ......... [b8 b24] ......... [b15 b31]
+            // dst [b0 b8]  ......... [b7 b15]
             out.qs[i * QK4_0 / 4 + j] = (in[i].qs[j] & 0x0F) | ((in[i].qs[j + QK4_0 / 4] & 0x0F) << 4);
         }
     }
 
     for (int i = 0; i < 16; i++) {
         // [16, 31], in.d & 0xF0
         for (int j = 0; j < QK4_0 / 4; j++) {
-            // [b0 b16]  ......... [b8 b24]  ......... [b15 b31]
-            // [b16 b24] ......... [b23 b31]
+            // src [b0 b16]  ......... [b8 b24]  ......... [b15 b31]
+            // dst [b16 b24] ......... [b23 b31]
             out.qs[4 * QK4_0 + i * QK4_0 / 4 + j] = ((in[i].qs[j] & 0xF0) >> 4) | (in[i].qs[j + QK4_0 / 4] & 0xF0);
         }
     }
 
     return out;
 }
 
+using block_q4_0x32 = block<4, 32>;
+using block_q8_0x32 = block<8, 32>;
+static block_q4_0x32 make_block_q4_0x32(block_q4_0* in, unsigned int blck_size_interleave) {
+    block_q4_0x32 out;
+    assert(QK4_0 / blck_size_interleave == 1);
+
+    for (int i = 0; i < 32; i++) {  // zhaolikun [check]
+        out.d[i] = in[i].d;
+    }
+
+    for (int i = 0; i < 32; i++) {
+        // [0, 15], in.d & 0x0F
+        for (int j = 0; j < QK4_0/4; j++) {
+            // src [b0 b16] ......... [b8 b24] ......... [b15 b31]
+            // dst [b0 b1]  ......... [b14 b15]
+            out.qs[i * QK4_0/2 + j] = (in[i].qs[j*2] & 0x0F) | ((in[i].qs[j*2 + 1] & 0x0F) << 4);
+        }
+    }
+
+    for (int i = 0; i < 32; i++) {
+        // [16, 31], in.d & 0xF0
+        for (int j = 0; j < QK4_0/4; j++) {
+            // src [b0 b16]  ......... [b8 b24]  ......... [b15 b31]
+            // dst [b16 b17] ......... [b30 b31]
+            out.qs[i * QK4_0/2 + QK4_0/4 + j] = ((in[i].qs[j*2] & 0xF0) >> 4) | (in[i].qs[j*2 + 1] & 0xF0);
+        }
+    }
+
+    return out;
+}
+
 static int repack_q4_0_to_q4_0_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
     GGML_ASSERT(interleave_block == 16);
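To make the nibble shuffle in `make_block_q4_0x32` easier to follow: in a plain `block_q4_0`, byte `j` holds weight `j` in its low nibble and weight `j+16` in its high nibble; the repack rewrites each row so that byte `k` holds two consecutive weights of the same half (`b0 b1`, `b2 b3`, ..., then `b16 b17`, ...). The standalone sketch below mirrors that per-row transform on synthetic data (weights marked with their index mod 16); it is illustration only, not part of the patch.

```cpp
#include <cassert>
#include <cstdint>

int main() {
    const int QK4_0 = 32;
    // q4_0 packing: src[j] = weight j (low nibble) | weight j+16 (high nibble);
    // for the demo, weight w carries the 4-bit value w % 16.
    uint8_t src[QK4_0 / 2];
    for (int j = 0; j < QK4_0 / 2; ++j) {
        src[j] = (uint8_t)((j & 0x0F) | ((j & 0x0F) << 4));
    }
    // Same transform as one row of make_block_q4_0x32.
    uint8_t dst[QK4_0 / 2];
    for (int j = 0; j < QK4_0 / 4; ++j) {
        dst[j]           = (src[2*j] & 0x0F) | ((src[2*j + 1] & 0x0F) << 4);   // weights 2j,    2j+1
        dst[QK4_0/4 + j] = ((src[2*j] & 0xF0) >> 4) | (src[2*j + 1] & 0xF0);   // weights 2j+16, 2j+17
    }
    // Destination byte k now holds weights 2k and 2k+1 (values wrap mod 16).
    for (int k = 0; k < QK4_0 / 2; ++k) {
        assert((dst[k] & 0x0F) == (2*k) % 16 && (dst[k] >> 4) == (2*k + 1) % 16);
    }
    return 0;
}
```

Note the layout difference from the 16-row variant: `make_block_q4_0x16` groups all low-half bytes of the 16 rows first, while the 32-row variant keeps each row's 16 output bytes contiguous.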
@@ -346,6 +383,38 @@ static int repack_q4_0_to_q4_0_16_bl(struct ggml_tensor* t, int interleave_block
     GGML_UNUSED(data_size);
 }
 
+static int repack_q4_0_to_q4_0_32_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
+    GGML_ASSERT(interleave_block == 32);  // unused
+
+    constexpr int nrows_interleaved = 32;
+
+    block_q4_0x32* dst = (block_q4_0x32*)t->data;
+    const block_q4_0* src = (const block_q4_0*)data;
+    block_q4_0 dst_tmp[32];
+    int nrow = ggml_nrows(t);
+    int nblocks = t->ne[0] / QK4_0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % QK4_0 != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_q4_0x32(dst_tmp, interleave_block);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
 namespace ggml::cpu::riscv64_spacemit {
 
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
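For intuition about the driver above: output super-block number `g*nblocks + x` gathers the `x`-th q4_0 block of rows `g*32 .. g*32+31`, i.e. source blocks `(b+i)*nblocks + x` at stride `nblocks`. The sketch below just prints that index mapping for a small made-up shape; it is illustration only, not part of the patch.

```cpp
#include <cstdio>

int main() {
    const int nrows_interleaved = 32;
    int nrow = 64, nblocks = 4;  // example tensor shape: 64 rows, 4 q4_0 blocks per row
    int out_idx = 0;
    for (int b = 0; b < nrow; b += nrows_interleaved) {
        for (int x = 0; x < nblocks; ++x, ++out_idx) {
            int first = b * nblocks + x;                            // row b,    column-block x
            int last  = (b + nrows_interleaved - 1) * nblocks + x;  // row b+31, column-block x
            printf("dst[%d] <- src[%d], src[%d], ..., src[%d] (stride %d)\n",
                   out_idx, first, first + nblocks, last, nblocks);
        }
    }
    return 0;
}
```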
@@ -355,6 +424,10 @@ template <>
 int repack<block_q4_0, 8, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
     return repack_q4_0_to_q4_0_16_bl(t, 16, data, data_size);
 }
+template <>
+int repack<block_q4_0, 16, 32>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_q4_0_to_q4_0_32_bl(t, 32, data, data_size);
+}
 
 class tensor_traits_base : public ggml::cpu::tensor_traits {
 public:
@@ -707,15 +780,22 @@ class tensor_traits_common : public tensor_traits_base {
 };
 
 static const tensor_traits<block_q4_0, 8, 16> q4_0_16x8_q8_0;
+static const tensor_traits<block_q4_0, 16, 32> q4_0_32x16_q8_0;
 static const tensor_traits_common rvv_impl;
 
 } // namespace ggml::cpu::riscv64_spacemit
 
 static const ggml::cpu::tensor_traits* ggml_riscv64_spacemit_get_optimal_repack_type(const struct ggml_tensor * cur) {
     if (cur->type == GGML_TYPE_Q4_0) {
-        if (cur->ne[1] % 16 == 0) {
-            return &ggml::cpu::riscv64_spacemit::q4_0_16x8_q8_0;
-        }
+#if defined(RISCV64_SPACEMIT_IME1)
+        if (cur->ne[1] % 16 == 0) {
+            return &ggml::cpu::riscv64_spacemit::q4_0_16x8_q8_0;
+        }
+#elif defined(RISCV64_SPACEMIT_IME2)
+        if (cur->ne[1] % 32 == 0) {
+            return &ggml::cpu::riscv64_spacemit::q4_0_32x16_q8_0;
+        }
+#endif
     } else if (cur->type == GGML_TYPE_F32) {
         return &ggml::cpu::riscv64_spacemit::rvv_impl;
     }
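One practical consequence of the gate above: on an IME2 build, only q4_0 tensors whose second dimension is an exact multiple of the 32-row interleave factor get the repacked `q4_0_32x16_q8_0` layout; anything else falls through to the default path. A toy check, illustration only and not part of the patch:

```cpp
#include <cstdio>

int main() {
    const long long ne1_values[] = {4096, 4104, 11008, 100};  // example row counts
    for (long long ne1 : ne1_values) {
        bool repacked = (ne1 % 32 == 0);  // mirrors cur->ne[1] % 32 == 0
        printf("ne[1]=%lld -> %s\n", ne1,
               repacked ? "q4_0_32x16_q8_0 repack" : "default q4_0 path");
    }
    return 0;
}
```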