@@ -10,12 +10,12 @@ static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
10
10
*dsti = *xi;
11
11
}
12
12
13
- static __device__ void cpy_1_f32_bf16 (const char * cxi, char * cdsti) {
14
- const float * xi = (const float *) cxi;
15
- nv_bfloat16 * dsti = (nv_bfloat16 *) cdsti;
13
+ // static __device__ void cpy_1_f32_bf16(const char * cxi, char * cdsti) {
14
+ // const float * xi = (const float *) cxi;
15
+ // nv_bfloat16 * dsti = (nv_bfloat16 *) cdsti;
16
16
17
- *dsti = *xi;
18
- }
17
+ // *dsti = *xi;
18
+ // }
19
19
20
20
static __device__ void cpy_1_f32_f16 (const char * cxi, char * cdsti) {
21
21
const float * xi = (const float *) cxi;
@@ -463,15 +463,15 @@ static void ggml_cpy_f32_f16_cuda(
463
463
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
464
464
}
465
465
466
- static void ggml_cpy_f32_bf16_cuda (
467
- const char * cx, char * cdst, const int ne,
468
- const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
469
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
466
+ // static void ggml_cpy_f32_bf16_cuda(
467
+ // const char * cx, char * cdst, const int ne,
468
+ // const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
469
+ // const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
470
470
471
- const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1 ) / CUDA_CPY_BLOCK_SIZE;
472
- cpy_f32_f16<cpy_1_f32_bf16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0 , stream>>>
473
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
474
- }
471
+ // const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
472
+ // cpy_f32_f16<cpy_1_f32_bf16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
473
+ // (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
474
+ // }
475
475
476
476
static void ggml_cpy_f32_q8_0_cuda (
477
477
const char * cx, char * cdst, const int ne,
@@ -716,8 +716,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
716
716
ggml_cpy_f32_q6_0_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
717
717
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
718
718
ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
719
- } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
720
- ggml_cpy_f32_bf16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
719
+ // } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
720
+ // ggml_cpy_f32_bf16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
721
721
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
722
722
ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
723
723
} else {
@@ -746,8 +746,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
746
746
return (void *) cpy_f32_f16<cpy_1_f32_bf16>;
747
747
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
748
748
return (void *) cpy_f32_f16<cpy_1_f32_f16>;
749
- } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
750
- return (void *) cpy_f32_f16<cpy_1_f32_bf16>;
749
+ // } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
750
+ // return (void*) cpy_f32_f16<cpy_1_f32_bf16>;
751
751
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
752
752
return (void *) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
753
753
} else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
0 commit comments