Skip to content

Commit e9b0d19

Browse files
committed
llama : fix Mamba-2 conv state saving
* ggml : make the ggml_mul fast broadcast path more consistently formatted
1 parent 3326dc4 commit e9b0d19

File tree

2 files changed

+3
-3
lines changed

2 files changed

+3
-3
lines changed

ggml/src/ggml.c

+2-2
@@ -10226,11 +10226,11 @@ static void ggml_compute_forward_mul_f32(
             if (scale == 0.0f) {
                 // NOTE: this also sets NANs to zero, which is not compliant with IEEE754,
                 //       but it is useful when resetting the state of recurrent models.
-                memset((char *)dst->data + ir*nb1, 0, nb1);
+                memset((char *) dst->data + ir*nb1, 0, ne0 * sizeof(float));
             } else {
                 if (dst->data != src0->data) {
                     // src0 is same shape as dst => same indices
-                    memcpy((char *)dst->data + ir*nb1, (char *)src0->data + ir*nb01, ne0 * sizeof(float));
+                    memcpy((char *) dst->data + ir*nb1, (char *) src0->data + ir*nb01, ne0 * sizeof(float));
                 }
                 if (scale != 1.0f) {
                     ggml_vec_scale_f32(ne0, (float *) ((char *) dst->data + ir*nb1), scale);

src/llama.cpp

+1-1
@@ -9335,7 +9335,7 @@ static struct ggml_tensor * llm_build_mamba2(
             ggml_cpy(ctx, last_conv,
                 ggml_view_1d(ctx, conv_states_all,
                     (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
-                    kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
+                    kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));

         // 1D convolution
         // The equivalent is to make a self-overlapping view of conv_x

0 commit comments

Comments
 (0)