
Commit 483bab2

Avoid the transposed X branch in the Z = X * Y matrix multiplication (#439)
Should make results reproducible for different numbers of threads and batch sizes.
1 parent 404e1da commit 483bab2
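
Why the transposed branch hurts reproducibility: floating-point addition is not associative, so a matmul kernel that partitions its reduction differently per thread count or batch size accumulates partial sums in a different order and can round differently. A minimal standalone C demo of the effect (not from the repository; the values are chosen to make the rounding visible):

```c
#include <stdio.h>

// Floating-point addition is not associative: summing the same
// values in a different order can round to a different result.
// A matmul kernel whose reduction order depends on the thread
// count therefore may not be bit-for-bit reproducible.
int main(void) {
    float vals[4] = {1e8f, 1.0f, -1e8f, 1.0f};

    // Left-to-right accumulation, as a single-threaded loop would do.
    // 1e8f + 1.0f rounds back to 1e8f (the 1.0f is absorbed), so the
    // running sum ends at 1.0f.
    float seq = ((vals[0] + vals[1]) + vals[2]) + vals[3];

    // A different partitioning, as a two-way parallel split might do.
    // The large terms cancel exactly before the small ones are added,
    // so the sum ends at 2.0f.
    float par = (vals[0] + vals[2]) + (vals[1] + vals[3]);

    printf("sequential: %.1f\n", seq); // prints 1.0
    printf("split:      %.1f\n", par); // prints 2.0
    return 0;
}
```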

File tree

1 file changed: +7 −5 lines changed


llama.cpp (+7 −5)
```diff
@@ -727,11 +727,13 @@ static bool llama_eval_internal(
 
         // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
         struct ggml_tensor * V_trans =
-            ggml_permute(ctx0,
-                    ggml_reshape_3d(ctx0,
-                        ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                        n_embd/n_head, n_head, n_past + N),
-                    1, 2, 0, 3);
+            ggml_cpy(ctx0,
+                    ggml_permute(ctx0,
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
+                        1, 2, 0, 3),
+                    ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
 
         // KQV = transpose(V) * KQ_soft_max
         struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
```
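
What the change does: ggml_permute only rewrites the tensor's strides, producing a transposed view over the same memory, so ggml_mul_mat previously took its transposed-X code path, the one the commit message says is not reproducible across thread counts and batch sizes. Wrapping the permuted view in ggml_cpy into a freshly allocated tensor materializes it as contiguous F32 data, so ggml_mul_mat always takes the contiguous branch and accumulates in a fixed order. Below is a hypothetical standalone sketch of the same pattern, not code from the repository; it assumes the ggml API of this era (ggml_init, ggml_set_f32, ggml_build_forward, ggml_graph_compute), and the shapes and arena size are invented for illustration:

```c
#include "ggml.h"
#include <stdio.h>

int main(void) {
    // Small scratch arena for tensors and the graph (size is arbitrary here).
    struct ggml_init_params params = {
        .mem_size   = 16*1024*1024,
        .mem_buffer = NULL,
    };
    struct ggml_context * ctx = ggml_init(params);

    // Stand-ins for the attention tensors: V viewed as
    // (head_dim=8, n_head=4, n_tokens=16) and a softmaxed KQ of
    // (n_tokens, n_tokens, n_head). Shapes are made up.
    struct ggml_tensor * V  = ggml_set_f32(ggml_new_tensor_3d(ctx, GGML_TYPE_F32,  8,  4, 16), 1.0f);
    struct ggml_tensor * KQ = ggml_set_f32(ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 16, 16,  4), 0.5f);

    // ggml_permute(1, 2, 0, 3) only rewrites strides: the result is a
    // (n_tokens, head_dim, n_head) *view*, not contiguous memory.
    struct ggml_tensor * V_view = ggml_permute(ctx, V, 1, 2, 0, 3);

    // ggml_cpy materializes the view into a fresh contiguous tensor,
    // which is what lets ggml_mul_mat avoid the transposed-X branch.
    struct ggml_tensor * V_trans = ggml_cpy(ctx, V_view,
            ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 16, 8, 4));

    // KQV = transpose(V) * KQ, per head; V_trans is now contiguous.
    struct ggml_tensor * KQV = ggml_mul_mat(ctx, V_trans, KQ);

    struct ggml_cgraph gf = ggml_build_forward(KQV);
    gf.n_threads = 4; // with the contiguous copy, varying this should not change the result
    ggml_graph_compute(ctx, &gf);

    printf("KQV[0] = %f\n", ggml_get_f32_1d(KQV, 0)); // 16 * 1.0 * 0.5 = 8.0

    ggml_free(ctx);
    return 0;
}
```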
