@@ -9991,8 +9991,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     // TODO: mmq/mmv support
 #endif
 
-    const size_t nb11 = src1->nb[1];
-    const size_t nb1 = dst->nb[1];
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb1 = dst->nb[1];
 
     const struct ggml_tensor * ids = src0;
     const int32_t id = ((int32_t *) dst->op_params)[0];
@@ -10517,11 +10517,15 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
 
     if (ggml_is_quantized(tensor->type)) {
         // initialize padding to 0 to avoid possible NaN values
-        size_t original_size = ggml_nbytes(tensor);
+        int64_t row_low = 0;
+        int64_t row_high = ggml_nrows(tensor);
+        int64_t nrows_split = row_high - row_low;
+
+        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
         size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
 
         if (padded_size > original_size && tensor->view_src == nullptr) {
-            CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
+            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
         }
     }
 }
@@ -10624,7 +10628,12 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend
 }
 
 GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    size_t size = ggml_nbytes(tensor);
+    int64_t row_low = 0;
+    int64_t row_high = ggml_nrows(tensor);
+    int64_t nrows_split = row_high - row_low;
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+
     int64_t ne0 = tensor->ne[0];
 
     if (ggml_is_quantized(tensor->type)) {