@@ -589,8 +589,8 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer

     if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
         // initialize padding to 0 to avoid possible NaN values
-        size_t original_size = ggml_nbytes(tensor);
-        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
+        const size_t original_size = ggml_nbytes(tensor);
+        const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);

         if (padded_size > original_size) {
             ggml_cuda_set_device(ctx->device);
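As a side note on the hunk above: the bytes being cleared are the gap between the tensor's real payload (original_size) and its padded allocation (padded_size). A minimal host-side sketch of that pattern, not part of the patch; the function name and stream parameter are illustrative only:

// Illustrative sketch (not from the patch): zero only the padding region at the
// end of a padded allocation, so quantized kernels that read whole padded rows
// never see uninitialized (possibly NaN-producing) bytes.
#include <cuda_runtime.h>
#include <cassert>

static void clear_padding(void * data, size_t original_size, size_t padded_size, cudaStream_t stream) {
    assert(padded_size >= original_size);
    if (padded_size > original_size) {
        // skip the payload, clear only [original_size, padded_size)
        cudaMemsetAsync((char *) data + original_size, 0, padded_size - original_size, stream);
    }
}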
@@ -712,6 +712,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t

     if (ggml_is_quantized(tensor->type)) {
         if (ne0 % MATRIX_ROW_PADDING != 0) {
+            GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor));
             size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }
     }
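For reference, a standalone sketch of the row-padding arithmetic used in get_alloc_size above. MATRIX_ROW_PADDING is assumed to be 512 here purely for illustration; the new nb[0] assert is what guarantees the row is densely laid out, so this element-count arithmetic actually corresponds to bytes:

// Illustrative sketch (not from the patch): how many extra elements a quantized
// row needs so its length becomes a multiple of the padding granularity.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t MATRIX_ROW_PADDING = 512;  // assumed value for this example
    const int64_t ne0 = 1000;                // elements in one row
    const int64_t pad = ne0 % MATRIX_ROW_PADDING == 0
                            ? 0
                            : MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING;  // 24 extra elements
    std::printf("row of %lld elements is padded by %lld to %lld\n",
                (long long) ne0, (long long) pad, (long long) (ne0 + pad));
    return 0;
}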
@@ -833,6 +834,7 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff

 static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
     ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
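The contiguity that these new asserts require roughly means "the strides describe one dense block", which is what lets a split buffer carve the tensor up by plain byte offsets. A toy version of such a check, not the actual ggml_is_contiguous implementation; the struct and the block/type-size parameters are assumptions for the sketch:

// Illustrative sketch (not the ggml implementation): a tensor is contiguous when
// each stride equals the total size of the dimension below it, with no gaps.
#include <cstdint>

struct toy_tensor {
    int64_t ne[4]; // number of elements per dimension
    int64_t nb[4]; // stride in bytes per dimension
};

static bool toy_is_contiguous(const toy_tensor & t, int64_t type_size, int64_t blck_size) {
    return t.nb[0] == type_size &&
           t.nb[1] == t.nb[0] * (t.ne[0] / blck_size) &&  // one row = ne[0]/blck_size blocks
           t.nb[2] == t.nb[1] * t.ne[1] &&
           t.nb[3] == t.nb[2] * t.ne[2];
}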
@@ -884,6 +886,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
     // split tensors must always be set in their entirety at once
     GGML_ASSERT(offset == 0);
     GGML_ASSERT(size == ggml_nbytes(tensor));
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

     ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;

@@ -922,6 +925,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
     // split tensors must always be set in their entirety at once
     GGML_ASSERT(offset == 0);
     GGML_ASSERT(size == ggml_nbytes(tensor));
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

     ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;

@@ -1003,6 +1007,7 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buf

 static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
     ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

     size_t total_size = 0;

@@ -2391,6 +2396,12 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
            ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
            CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));

+           ggml_tensor src0_slice = *src0;
+           src0_slice.ne[2] = 1;
+           src0_slice.nb[3] = src0_slice.nb[2];
+           src0_slice.data  = (char *) src0->data + i02*nb02;
+           GGML_ASSERT(!ggml_cuda_should_use_mmq(src0->type, cc, ne11) || ne00 % MATRIX_ROW_PADDING == 0);
+
            {
                dim3 block_dims(std::min((unsigned int)ne10, 768u));
                dim3 grid_dims(ids->ne[1], n_ids);
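The added src0_slice lines build a per-expert view of src0 by hand: the tensor header is copied, the third dimension is collapsed to 1, and data is advanced to the i02-th slice, all without copying weights. A self-contained sketch of the same header-copy trick, with a hypothetical toy_tensor type standing in for ggml_tensor:

// Illustrative sketch (not from the patch): select slice i02 of a 3D tensor as a
// zero-copy view by duplicating the header and adjusting ne/nb/data.
#include <cstdint>
#include <cstddef>

struct toy_tensor {
    int64_t ne[4]; // elements per dimension
    size_t  nb[4]; // strides in bytes
    void *  data;
};

static toy_tensor slice_dim2(const toy_tensor & src, int64_t i02) {
    toy_tensor slice = src;                             // copy the header, share the buffer
    slice.ne[2] = 1;                                    // the view holds a single slice
    slice.nb[3] = slice.nb[2];                          // keep the stride chain self-consistent
    slice.data  = (char *) src.data + i02 * src.nb[2];  // jump to the start of slice i02
    return slice;
}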