@@ -556,8 +556,8 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer

     if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
         // initialize padding to 0 to avoid possible NaN values
-        const size_t original_size = ggml_nbytes(tensor);
-        const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
+        size_t original_size = ggml_nbytes(tensor);
+        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);

         if (padded_size > original_size) {
             ggml_cuda_set_device(ctx->device);
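The hunk cuts off right after `ggml_cuda_set_device(ctx->device);`, so the code that actually clears the padding is not visible here. As a rough sketch of what the "initialize padding to 0" comment describes, zeroing the bytes between the tensor's real payload and its padded allocation so quantized kernels never read uninitialized (possibly NaN-producing) data, assuming the `CUDA_CHECK`/`cudaMemset` helpers that ggml-cuda uses elsewhere:

    // Sketch only, not the exact lines from the file: zero the padding region
    // that lies past the tensor's real data, on the buffer's device.
    if (padded_size > original_size) {
        ggml_cuda_set_device(ctx->device);
        CUDA_CHECK(cudaMemset((char *) tensor->data + original_size, 0,
                              padded_size - original_size));
    }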
@@ -680,7 +680,6 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t

     if (ggml_is_quantized(tensor->type)) {
         if (ne0 % MATRIX_ROW_PADDING != 0) {
-            GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor));
             size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }
     }
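For reference, the branch above grows the allocation by one partial row whenever `ne0` is not a multiple of `MATRIX_ROW_PADDING`. A small standalone sketch of that arithmetic (the helper name `extra_row_padding_bytes` is illustrative and not part of the file; `ggml_row_size` and `MATRIX_ROW_PADDING` are the symbols already used in the hunk):

    // Illustrative helper mirroring the padding logic above: round a row of
    // ne0 elements up to the next multiple of MATRIX_ROW_PADDING and return
    // the extra bytes the allocation needs on top of ggml_nbytes(tensor).
    static size_t extra_row_padding_bytes(enum ggml_type type, int64_t ne0) {
        if (ne0 % MATRIX_ROW_PADDING == 0) {
            return 0;
        }
        return ggml_row_size(type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
    }

Assuming `MATRIX_ROW_PADDING` is 512, as in ggml-cuda's common header, a row of `ne0 = 1000` elements would be padded by 24 elements' worth of bytes.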
@@ -802,7 +801,6 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff

 static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
-    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
     ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
@@ -854,7 +852,6 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
     // split tensors must always be set in their entirety at once
     GGML_ASSERT(offset == 0);
     GGML_ASSERT(size == ggml_nbytes(tensor));
-    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

     ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;

@@ -893,7 +890,6 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
     // split tensors must always be set in their entirety at once
     GGML_ASSERT(offset == 0);
     GGML_ASSERT(size == ggml_nbytes(tensor));
-    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

     ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;

@@ -975,7 +971,6 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buf

 static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
     ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
-    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

     size_t total_size = 0;

@@ -2071,7 +2066,6 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
         src0_slice.ne[2] = 1;
         src0_slice.nb[3] = src0_slice.nb[2];
         src0_slice.data = (char *) src0->data + i02*nb02;
-        GGML_ASSERT(!ggml_cuda_should_use_mmq(src0->type, cc, ne11) || ne00 % MATRIX_ROW_PADDING == 0);

         ggml_tensor src1_slice;
         memset(&src1_slice, 0, sizeof(src1_slice));