Commit 95046f7

fix merge
1 parent 90f6637 commit 95046f7

2 files changed (+19, -19 lines)

2 files changed

+19
-19
lines changed

ggml/src/ggml-cuda/cpy.cu

Lines changed: 17 additions & 17 deletions
@@ -10,12 +10,12 @@ static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
     *dsti = *xi;
 }
 
-static __device__ void cpy_1_f32_bf16(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    nv_bfloat16 * dsti = (nv_bfloat16 *) cdsti;
+// static __device__ void cpy_1_f32_bf16(const char * cxi, char * cdsti) {
+//     const float * xi = (const float *) cxi;
+//     nv_bfloat16 * dsti = (nv_bfloat16 *) cdsti;
 
-    *dsti = *xi;
-}
+//     *dsti = *xi;
+// }
 
 static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
     const float * xi = (const float *) cxi;
@@ -463,15 +463,15 @@ static void ggml_cpy_f32_f16_cuda(
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }
 
-static void ggml_cpy_f32_bf16_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+// static void ggml_cpy_f32_bf16_cuda(
+//     const char * cx, char * cdst, const int ne,
+//     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+//     const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    cpy_f32_f16<cpy_1_f32_bf16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
-}
+//     const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+//     cpy_f32_f16<cpy_1_f32_bf16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+//         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+// }
 
 static void ggml_cpy_f32_q8_0_cuda(
     const char * cx, char * cdst, const int ne,
@@ -716,8 +716,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         ggml_cpy_f32_q6_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
-        ggml_cpy_f32_bf16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+    // } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
+    //     ggml_cpy_f32_bf16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else {
@@ -746,8 +746,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
         return (void*) cpy_f32_f16<cpy_1_f32_bf16>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
         return (void*) cpy_f32_f16<cpy_1_f32_f16>;
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
-        return (void*) cpy_f32_f16<cpy_1_f32_bf16>;
+    // } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
+    //     return (void*) cpy_f32_f16<cpy_1_f32_bf16>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
         return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
     } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
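
For context, the path being disabled above is a plain elementwise f32 -> bf16 copy. Below is a minimal standalone sketch of the same idea; the kernel/launcher names and the 256-thread block size are illustrative stand-ins, not ggml identifiers, and ggml's real version additionally handles strided tensor layouts and the CUDA-graph indirection arguments:

#include <cuda_bf16.h>

// Sketch only: one thread per element, converting f32 to bf16.
static __global__ void copy_f32_to_bf16(const float * x, nv_bfloat16 * dst, const int ne) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < ne) {
        dst[i] = __float2bfloat16(x[i]); // same conversion as "*dsti = *xi" above
    }
}

static void copy_f32_to_bf16_cuda(const float * x, nv_bfloat16 * dst, const int ne, cudaStream_t stream) {
    const int block_size = 256; // stand-in for CUDA_CPY_BLOCK_SIZE
    const int num_blocks = (ne + block_size - 1) / block_size; // round up, as in the launcher above
    copy_f32_to_bf16<<<num_blocks, block_size, 0, stream>>>(x, dst, ne);
}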

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 2 additions & 2 deletions
@@ -1349,7 +1349,7 @@ static void ggml_cuda_op_mul_mat_cublas(
             GGML_ASSERT(to_bf16_cuda != nullptr);
             size_t ne = src1_ncols*ne10;
             src1_as_bf16.alloc(ne);
-            to_bf16_cuda(src1_ddf_i, src1_as_bf16.get(), ne, stream);
+            to_bf16_cuda(src1_ddf_i, src1_as_bf16.get(), src1_ncols, ne10, stream);
         }
         const nv_bfloat16 * src1_ptr = src1->type == GGML_TYPE_BF16 ? (const nv_bfloat16 *) src1_ddf_i : src1_as_bf16.get();
         const nv_bfloat16 * src0_ptr = (const nv_bfloat16 *)src0_dd_i;
@@ -1369,7 +1369,7 @@ static void ggml_cuda_op_mul_mat_cublas(
                     CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
-        to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
+        to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff, src1_ncols, stream);
     } else if (((GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) {
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
         ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
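
The two call-site edits above track a signature change in the conversion helpers: where to_bf16_cuda and to_fp32_cuda previously took one flattened element count (src1_ncols*ne10 and row_diff*src1_ncols), they now receive the two extents separately. The new helper definitions are not part of this commit; the sketch below only illustrates the call shape, and the (nrows, n_per_row) reading of the two extents is an assumption, with hypothetical names throughout:

#include <cuda_bf16.h>

// Assumed shape of the updated converter: two extents instead of one flat
// count, so the implementation can index per row. Names are hypothetical.
static __global__ void to_bf16_2d(const float * x, nv_bfloat16 * y,
                                  const int64_t nrows, const int64_t n_per_row) {
    const int64_t i = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
    if (i >= nrows*n_per_row) {
        return;
    }
    const int64_t row = i / n_per_row;
    const int64_t col = i % n_per_row;
    y[row*n_per_row + col] = __float2bfloat16(x[row*n_per_row + col]);
}

For contiguous buffers this reduces to the old flat-count behavior; the split only starts to matter once a row can carry padding or a non-trivial stride, which is presumably what motivated the new parameters.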
