@@ -9991,8 +9991,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     // TODO: mmq/mmv support
 #endif
 
-    const size_t nb11 = src1->nb[1];
-    const size_t nb1 = dst->nb[1];
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb1 = dst->nb[1];
 
     const struct ggml_tensor * ids = src0;
     const int32_t id = ((int32_t *) dst->op_params)[0];
@@ -10517,11 +10517,15 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
 
     if (ggml_is_quantized(tensor->type)) {
         // initialize padding to 0 to avoid possible NaN values
-        size_t original_size = ggml_nbytes(tensor);
+        int64_t row_low = 0;
+        int64_t row_high = ggml_nrows(tensor);
+        int64_t nrows_split = row_high - row_low;
+
+        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
         size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
 
         if (padded_size > original_size && tensor->view_src == nullptr) {
-            CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
+            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
         }
     }
 }
@@ -10624,7 +10628,12 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend
 }
 
 GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    size_t size = ggml_nbytes(tensor);
+    int64_t row_low = 0;
+    int64_t row_high = ggml_nrows(tensor);
+    int64_t nrows_split = row_high - row_low;
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+
     int64_t ne0 = tensor->ne[0];
 
     if (ggml_is_quantized(tensor->type)) {