From bc9e84daca501568a8ca0d0618d532caec598b6e Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 21 Apr 2023 16:31:12 +0200 Subject: [PATCH 01/86] add python wrapper https://gist.github.com/abetlen/2b90e5f153f6efd00931d098de5c73ce --- py/llama_cpp/__init__.py | 0 py/llama_cpp/llama.py | 173 +++++++++++++++++++++++++++++++++++++++ setup.py | 15 ++++ 3 files changed, 188 insertions(+) create mode 100644 py/llama_cpp/__init__.py create mode 100644 py/llama_cpp/llama.py create mode 100644 setup.py diff --git a/py/llama_cpp/__init__.py b/py/llama_cpp/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/py/llama_cpp/llama.py b/py/llama_cpp/llama.py new file mode 100644 index 0000000000000..39da58f17c83e --- /dev/null +++ b/py/llama_cpp/llama.py @@ -0,0 +1,173 @@ +import os +import sys +import glob +import ctypes + +from ctypes import c_int, c_float, c_double, c_char_p, c_void_p, c_bool, POINTER, Structure + + +# Load the library +if sys.platform == 'win32': + lib = ctypes.cdll.LoadLibrary(next(iter(glob.glob(os.path.join(os.path.dirname(__file__), '..', '..', '**', 'llama.dll'), recursive=True)))) +else: + lib = ctypes.cdll.LoadLibrary(next(iter(glob.glob(os.path.join(os.path.dirname(__file__), '..', '..', '**', 'libllama.so'), recursive=True)))) + + +# C types +llama_token = c_int +llama_token_p = POINTER(llama_token) + +class llama_token_data(Structure): + _fields_ = [ + ('id', llama_token), # token id + ('p', c_float), # probability of the token + ('plog', c_float), # log probability of the token + ] + +llama_token_data_p = POINTER(llama_token_data) +llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) +class llama_context_params(Structure): + _fields_ = [ + ('n_ctx', c_int), # text context + ('n_parts', c_int), # -1 for default + ('seed', c_int), # RNG seed, 0 for random + ('f16_kv', c_bool), # use fp16 for KV cache + ('logits_all', c_bool), # the llama_eval() call computes all logits, not just the last one + ('vocab_only', c_bool), # only load the vocabulary, no weights + ('use_mmap', c_bool), # use mmap if possible + ('use_mlock', c_bool), # force system to keep model in RAM + ('embedding', c_bool), # embedding mode only + ('progress_callback', llama_progress_callback), # called with a progress value between 0 and 1, pass NULL to disable + ('progress_callback_user_data', c_void_p), # context pointer passed to the progress callback + ] + +llama_context_params_p = POINTER(llama_context_params) + +llama_context_p = c_void_p + +# C functions +lib.llama_context_default_params.argtypes = [] +lib.llama_context_default_params.restype = llama_context_params + +lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params] +lib.llama_init_from_file.restype = llama_context_p + +lib.llama_free.argtypes = [llama_context_p] +lib.llama_free.restype = None + +lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] +lib.llama_model_quantize.restype = c_int + +lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int] +lib.llama_eval.restype = c_int + +lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool] +lib.llama_tokenize.restype = c_int + +lib.llama_n_vocab.argtypes = [llama_context_p] +lib.llama_n_vocab.restype = c_int + +lib.llama_n_ctx.argtypes = [llama_context_p] +lib.llama_n_ctx.restype = c_int + +lib.llama_get_logits.argtypes = [llama_context_p] +lib.llama_get_logits.restype = POINTER(c_float) + +lib.llama_get_embeddings.argtypes = [llama_context_p] +lib.llama_get_embeddings.restype 
= POINTER(c_float) + +lib.llama_token_to_str.argtypes = [llama_context_p, llama_token] +lib.llama_token_to_str.restype = c_char_p + +lib.llama_token_bos.argtypes = [] +lib.llama_token_bos.restype = llama_token + +lib.llama_token_eos.argtypes = [] +lib.llama_token_eos.restype = llama_token + +lib.llama_sample_top_p_top_k.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_float, c_float, c_float] +lib.llama_sample_top_p_top_k.restype = llama_token + +lib.llama_print_timings.argtypes = [llama_context_p] +lib.llama_print_timings.restype = None + +lib.llama_reset_timings.argtypes = [llama_context_p] +lib.llama_reset_timings.restype = None + +lib.llama_print_system_info.argtypes = [] +lib.llama_print_system_info.restype = c_char_p + +# Python functions +def llama_context_default_params() -> llama_context_params: + params = lib.llama_context_default_params() + return params + +def llama_init_from_file(path_model: str, params: llama_context_params) -> llama_context_p: + """Various functions for loading a ggml llama model. + Allocate (almost) all memory needed for the model. + Return NULL on failure """ + return lib.llama_init_from_file(path_model.encode('utf-8'), params) + +def llama_free(ctx: llama_context_p): + """Free all allocated memory""" + lib.llama_free(ctx) + +def llama_model_quantize(fname_inp: str, fname_out: str, itype: c_int, qk: c_int) -> c_int: + """Returns 0 on success""" + return lib.llama_model_quantize(fname_inp.encode('utf-8'), fname_out.encode('utf-8'), itype, qk) + +def llama_eval(ctx: llama_context_p, tokens: llama_token_p, n_tokens: c_int, n_past: c_int, n_threads: c_int) -> c_int: + """Run the llama inference to obtain the logits and probabilities for the next token. + tokens + n_tokens is the provided batch of new tokens to process + n_past is the number of tokens to use from previous eval calls + Returns 0 on success""" + return lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads) + +def llama_tokenize(ctx: llama_context_p, text: str, tokens: llama_token_p, n_max_tokens: c_int, add_bos: c_bool) -> c_int: + """Convert the provided text into tokens. + The tokens pointer must be large enough to hold the resulting tokens. + Returns the number of tokens on success, no more than n_max_tokens + Returns a negative number on failure - the number of tokens that would have been returned""" + return lib.llama_tokenize(ctx, text.encode('utf-8'), tokens, n_max_tokens, add_bos) + +def llama_n_vocab(ctx: llama_context_p) -> c_int: + return lib.llama_n_vocab(ctx) + +def llama_n_ctx(ctx: llama_context_p) -> c_int: + return lib.llama_n_ctx(ctx) + +def llama_get_logits(ctx: llama_context_p): + """Token logits obtained from the last call to llama_eval() + The logits for the last token are stored in the last row + Can be mutated in order to change the probabilities of the next token + Rows: n_tokens + Cols: n_vocab""" + return lib.llama_get_logits(ctx) + +def llama_get_embeddings(ctx: llama_context_p): + """Get the embeddings for the input + shape: [n_embd] (1-dimensional)""" + return lib.llama_get_embeddings(ctx) + +def llama_token_to_str(ctx: llama_context_p, token: int) -> str: + """Token Id -> String. 
Uses the vocabulary in the provided context""" + return lib.llama_token_to_str(ctx, token).decode('utf-8') + +def llama_token_bos() -> llama_token: + return lib.llama_token_bos() + +def llama_token_eos() -> llama_token: + return lib.llama_token_eos() + +def llama_sample_top_p_top_k(ctx: llama_context_p, last_n_tokens_data: llama_token_p, last_n_tokens_size: c_int, top_k: c_int, top_p: c_float, temp: c_float, repeat_penalty: c_float) -> llama_token: + return lib.llama_sample_top_p_top_k(ctx, last_n_tokens_data, last_n_tokens_size, top_k, top_p, temp, repeat_penalty) + +def llama_print_timings(ctx: llama_context_p): + lib.llama_print_timings(ctx) + +def llama_reset_timings(ctx: llama_context_p): + lib.llama_reset_timings(ctx) + +def llama_print_system_info() -> str: + """Print system informaiton""" + return lib.llama_print_system_info().decode('utf-8') diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000..cc3a23f0739fd --- /dev/null +++ b/setup.py @@ -0,0 +1,15 @@ + +from setuptools import setup, find_packages +import glob, os + +setup( + name='llama_cpp', + version='0.0.1', + author='Anonymous', + author_email='', + license='All rights reserved', + packages=find_packages(where='py'), + package_dir={'': 'py'}, + install_requires=[], + entry_points={'console_scripts': []}, +) From 5f6b7150714e4db2d0804bab4f89d704ba656924 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 14 Apr 2023 14:40:06 +0200 Subject: [PATCH 02/86] fix decoding error. adds errors=ignore parameter --- py/llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py/llama_cpp/llama.py b/py/llama_cpp/llama.py index 39da58f17c83e..a09c8425e1d81 100644 --- a/py/llama_cpp/llama.py +++ b/py/llama_cpp/llama.py @@ -151,7 +151,7 @@ def llama_get_embeddings(ctx: llama_context_p): def llama_token_to_str(ctx: llama_context_p, token: int) -> str: """Token Id -> String. 
Uses the vocabulary in the provided context""" - return lib.llama_token_to_str(ctx, token).decode('utf-8') + return lib.llama_token_to_str(ctx, token).decode('utf-8', errors='ignore') def llama_token_bos() -> llama_token: return lib.llama_token_bos() From ed6b64fb98b16cfbba761d76eb36ce17350ac172 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 14 Apr 2023 03:16:50 +0200 Subject: [PATCH 03/86] add python bindings for functions to get and set the whole llama state (rng, logits, embedding and kv_cache) --- py/llama_cpp/llama.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/py/llama_cpp/llama.py b/py/llama_cpp/llama.py index a09c8425e1d81..3bd6b8efea85d 100644 --- a/py/llama_cpp/llama.py +++ b/py/llama_cpp/llama.py @@ -97,6 +97,15 @@ class llama_context_params(Structure): lib.llama_print_system_info.argtypes = [] lib.llama_print_system_info.restype = c_char_p +lib.llama_get_state_size.argtypes = [llama_context_p] +lib.llama_get_state_size.restype = c_size_t + +lib.llama_copy_state_data.argtypes = [llama_context_p, c_ubyte_p] +lib.llama_copy_state_data.restype = c_size_t + +lib.llama_set_state_data.argtypes = [llama_context_p, c_ubyte_p] +lib.llama_set_state_data.restype = c_size_t + # Python functions def llama_context_default_params() -> llama_context_params: params = lib.llama_context_default_params() @@ -171,3 +180,12 @@ def llama_reset_timings(ctx: llama_context_p): def llama_print_system_info() -> str: """Print system informaiton""" return lib.llama_print_system_info().decode('utf-8') + +def llama_get_state_size(ctx: llama_context_p) -> c_size_t: + return lib.llama_get_state_size(ctx) + +def llama_copy_state_data(ctx: llama_context_p, dst: c_ubyte_p) -> c_size_t: + return lib.llama_copy_state_data(ctx, dst) + +def llama_set_state_data(ctx: llama_context_p, src: c_ubyte_p) -> c_size_t: + return lib.llama_set_state_data(ctx, src) From 6e88dc93bdc002c7870fe0547fc9a9960ac9d1ad Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 13 May 2023 19:05:24 +0200 Subject: [PATCH 04/86] update python bindings --- py/llama_cpp/llama.py | 220 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 178 insertions(+), 42 deletions(-) diff --git a/py/llama_cpp/llama.py b/py/llama_cpp/llama.py index 3bd6b8efea85d..bc0fa8b72855c 100644 --- a/py/llama_cpp/llama.py +++ b/py/llama_cpp/llama.py @@ -3,7 +3,7 @@ import glob import ctypes -from ctypes import c_int, c_float, c_double, c_char_p, c_void_p, c_bool, POINTER, Structure +from ctypes import c_int, c_float, c_double, c_char_p, c_void_p, c_bool, c_size_t, c_ubyte, POINTER, Structure # Load the library @@ -19,36 +19,58 @@ class llama_token_data(Structure): _fields_ = [ - ('id', llama_token), # token id - ('p', c_float), # probability of the token + ('id', llama_token), # token id + ('p', c_float), # probability of the token ('plog', c_float), # log probability of the token ] llama_token_data_p = POINTER(llama_token_data) + +class llama_token_data_array(Structure): + _fields_ = [ + ('data', llama_token_data_p), + ('size', c_size_t), + ('sorted', c_bool), + ] + +llama_token_data_array_p = POINTER(llama_token_data_array) + llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) class llama_context_params(Structure): _fields_ = [ - ('n_ctx', c_int), # text context - ('n_parts', c_int), # -1 for default - ('seed', c_int), # RNG seed, 0 for random - ('f16_kv', c_bool), # use fp16 for KV cache - ('logits_all', c_bool), # the llama_eval() call computes all logits, not just the last one - ('vocab_only', c_bool), # only load the 
vocabulary, no weights - ('use_mmap', c_bool), # use mmap if possible - ('use_mlock', c_bool), # force system to keep model in RAM - ('embedding', c_bool), # embedding mode only - ('progress_callback', llama_progress_callback), # called with a progress value between 0 and 1, pass NULL to disable - ('progress_callback_user_data', c_void_p), # context pointer passed to the progress callback + ('n_ctx', c_int), # text context + ('n_parts', c_int), # -1 for default + ('n_gpu_layers', c_int), # number of layers to store in VRAM + ('seed', c_int), # RNG seed, 0 for random + ('f16_kv', c_bool), # use fp16 for KV cache + ('logits_all', c_bool), # the llama_eval() call computes all logits, not just the last one + ('vocab_only', c_bool), # only load the vocabulary, no weights + ('use_mmap', c_bool), # use mmap if possible + ('use_mlock', c_bool), # force system to keep model in RAM + ('embedding', c_bool), # embedding mode only + ('progress_callback', llama_progress_callback), # called with a progress value between 0 and 1, pass NULL to disable + ('progress_callback_user_data', c_void_p), # context pointer passed to the progress callback ] + llama_context_params_p = POINTER(llama_context_params) llama_context_p = c_void_p +c_size_p = POINTER(c_size_t) +c_ubyte_p = POINTER(c_ubyte) +c_float_p = POINTER(c_float) + # C functions lib.llama_context_default_params.argtypes = [] lib.llama_context_default_params.restype = llama_context_params +lib.llama_mmap_supported.argtypes = [] +lib.llama_mmap_supported.restype = c_bool + +lib.llama_mlock_supported.argtypes = [] +lib.llama_mlock_supported.restype = c_bool + lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params] lib.llama_init_from_file.restype = llama_context_p @@ -58,6 +80,30 @@ class llama_context_params(Structure): lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] lib.llama_model_quantize.restype = c_int +lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, c_int] +lib.llama_apply_lora_from_file.restype = c_int + +lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] +lib.llama_get_kv_cache_token_count.restype = c_int + +lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int] +lib.llama_set_rng_seed.restype = None + +lib.llama_get_state_size.argtypes = [llama_context_p] +lib.llama_get_state_size.restype = c_size_t + +lib.llama_copy_state_data.argtypes = [llama_context_p, c_ubyte_p] +lib.llama_copy_state_data.restype = c_size_t + +lib.llama_set_state_data.argtypes = [llama_context_p, c_ubyte_p] +lib.llama_set_state_data.restype = c_size_t + +lib.llama_load_session_file.argtypes = [llama_context_p, c_char_p, llama_token_p, c_size_t, c_size_p] +lib.llama_load_session_file.restype = c_bool + +lib.llama_save_session_file.argtypes = [llama_context_p, c_char_p, llama_token_p, c_size_t] +lib.llama_save_session_file.restype = c_bool + lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int] lib.llama_eval.restype = c_int @@ -70,11 +116,14 @@ class llama_context_params(Structure): lib.llama_n_ctx.argtypes = [llama_context_p] lib.llama_n_ctx.restype = c_int +lib.llama_n_embd.argtypes = [llama_context_p] +lib.llama_n_embd.restype = c_int + lib.llama_get_logits.argtypes = [llama_context_p] -lib.llama_get_logits.restype = POINTER(c_float) +lib.llama_get_logits.restype = c_float_p lib.llama_get_embeddings.argtypes = [llama_context_p] -lib.llama_get_embeddings.restype = POINTER(c_float) +lib.llama_get_embeddings.restype = c_float_p lib.llama_token_to_str.argtypes = 
[llama_context_p, llama_token] lib.llama_token_to_str.restype = c_char_p @@ -85,8 +134,44 @@ class llama_context_params(Structure): lib.llama_token_eos.argtypes = [] lib.llama_token_eos.restype = llama_token -lib.llama_sample_top_p_top_k.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_float, c_float, c_float] -lib.llama_sample_top_p_top_k.restype = llama_token +lib.llama_token_nl.argtypes = [] +lib.llama_token_nl.restype = llama_token + +lib.llama_sample_repetition_penalty.argtypes = [llama_context_p, llama_token_data_array_p, llama_token_p, c_size_t, c_float] +lib.llama_sample_repetition_penalty.restype = None + +lib.llama_sample_frequency_and_presence_penalties.argtypes = [llama_context_p, llama_token_data_array_p, llama_token_p, c_size_t, c_float, c_float] +lib.llama_sample_frequency_and_presence_penalties.restype = None + +lib.llama_sample_softmax.argtypes = [llama_context_p, llama_token_data_array_p] +lib.llama_sample_softmax.restype = None + +lib.llama_sample_top_k.argtypes = [llama_context_p, llama_token_data_array_p, c_int, c_size_t] +lib.llama_sample_top_k.restype = None + +lib.llama_sample_top_p.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_size_t] +lib.llama_sample_top_p.restype = None + +lib.llama_sample_tail_free.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_size_t] +lib.llama_sample_tail_free.restype = None + +lib.llama_sample_typical.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_size_t] +lib.llama_sample_typical.restype = None + +lib.llama_sample_temperature.argtypes = [llama_context_p, llama_token_data_array_p, c_float] +lib.llama_sample_temperature.restype = None + +lib.llama_sample_token_mirostat.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_float, c_int, c_float_p] +lib.llama_sample_token_mirostat.restype = llama_token + +lib.llama_sample_token_mirostat_v2.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_float, c_float_p] +lib.llama_sample_token_mirostat_v2.restype = llama_token + +lib.llama_sample_token_greedy.argtypes = [llama_context_p, llama_token_data_array_p] +lib.llama_sample_token_greedy.restype = llama_token + +lib.llama_sample_token.argtypes = [llama_context_p, llama_token_data_array_p] +lib.llama_sample_token.restype = llama_token lib.llama_print_timings.argtypes = [llama_context_p] lib.llama_print_timings.restype = None @@ -97,20 +182,18 @@ class llama_context_params(Structure): lib.llama_print_system_info.argtypes = [] lib.llama_print_system_info.restype = c_char_p -lib.llama_get_state_size.argtypes = [llama_context_p] -lib.llama_get_state_size.restype = c_size_t - -lib.llama_copy_state_data.argtypes = [llama_context_p, c_ubyte_p] -lib.llama_copy_state_data.restype = c_size_t - -lib.llama_set_state_data.argtypes = [llama_context_p, c_ubyte_p] -lib.llama_set_state_data.restype = c_size_t # Python functions def llama_context_default_params() -> llama_context_params: params = lib.llama_context_default_params() return params +def llama_mmap_supported() -> bool: + return lib.llama_mmap_supported() + +def llama_mlock_supported() -> bool: + return lib.llama_mlock_supported() + def llama_init_from_file(path_model: str, params: llama_context_params) -> llama_context_p: """Various functions for loading a ggml llama model. Allocate (almost) all memory needed for the model. 
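For orientation, a minimal sketch of how the bindings declared above might be driven end to end (the model path and thread count below are illustrative placeholders, not part of this patch):

from llama_cpp.llama import (
    llama_context_default_params, llama_init_from_file, llama_tokenize,
    llama_eval, llama_get_logits, llama_n_vocab, llama_token,
    llama_token_to_str, llama_free)

# load a model (placeholder path) with default context parameters
params = llama_context_default_params()
ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params)

# tokenize a prompt into a fixed-size ctypes buffer of llama_token
buf = (llama_token * 64)()
n = llama_tokenize(ctx, "Hello", buf, 64, True)

# evaluate the prompt and greedily pick the most likely next token
llama_eval(ctx, buf, n, 0, 4)          # 4 = n_threads, placeholder
logits = llama_get_logits(ctx)         # n_vocab logits for the last token
best = max(range(llama_n_vocab(ctx)), key=lambda i: logits[i])
print(llama_token_to_str(ctx, best))

llama_free(ctx)
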
@@ -125,6 +208,30 @@ def llama_model_quantize(fname_inp: str, fname_out: str, itype: c_int, qk: c_int """Returns 0 on success""" return lib.llama_model_quantize(fname_inp.encode('utf-8'), fname_out.encode('utf-8'), itype, qk) +def llama_apply_lora_from_file(ctx: llama_context_p, path_lora: str, path_base_model: str, n_threads: c_int) -> c_int: + return lib.llama_apply_lora_from_file(ctx, path_lora.encode('utf-8'), path_base_model.encode('utf-8'), n_threads) + +def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: + return lib.llama_get_kv_cache_token_count(ctx) + +def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): + return lib.llama_set_rng_seed(ctx, seed) + +def llama_get_state_size(ctx: llama_context_p) -> c_size_t: + return lib.llama_get_state_size(ctx) + +def llama_copy_state_data(ctx: llama_context_p, dst: c_ubyte_p) -> c_size_t: + return lib.llama_copy_state_data(ctx, dst) + +def llama_set_state_data(ctx: llama_context_p, src: c_ubyte_p) -> c_size_t: + return lib.llama_set_state_data(ctx, src) + +def llama_load_session_file(ctx: llama_context_p, path_session: str, tokens_out: llama_token_p, n_token_capacity: c_size_t, n_token_count_out: c_size_p) -> c_bool: + return lib.llama_load_session_file(ctx, path_session.encode('utf-8'), tokens_out, n_token_capacity, n_token_count_out) + +def llama_save_session_file(ctx: llama_context_p, path_session: str, tokens: llama_token_p, n_token_count: c_size_t) -> c_bool: + return lib.llama_save_session_file(ctx, path_session.encode('utf-8'), tokens, n_token_count) + def llama_eval(ctx: llama_context_p, tokens: llama_token_p, n_tokens: c_int, n_past: c_int, n_threads: c_int) -> c_int: """Run the llama inference to obtain the logits and probabilities for the next token. tokens + n_tokens is the provided batch of new tokens to process @@ -145,7 +252,10 @@ def llama_n_vocab(ctx: llama_context_p) -> c_int: def llama_n_ctx(ctx: llama_context_p) -> c_int: return lib.llama_n_ctx(ctx) -def llama_get_logits(ctx: llama_context_p): +def llama_n_embd(ctx: llama_context_p) -> c_int: + return lib.llama_n_embd(ctx) + +def llama_get_logits(ctx: llama_context_p) -> c_float_p: """Token logits obtained from the last call to llama_eval() The logits for the last token are stored in the last row Can be mutated in order to change the probabilities of the next token @@ -153,7 +263,7 @@ def llama_get_logits(ctx: llama_context_p): Cols: n_vocab""" return lib.llama_get_logits(ctx) -def llama_get_embeddings(ctx: llama_context_p): +def llama_get_embeddings(ctx: llama_context_p) -> c_float_p: """Get the embeddings for the input shape: [n_embd] (1-dimensional)""" return lib.llama_get_embeddings(ctx) @@ -168,8 +278,44 @@ def llama_token_bos() -> llama_token: def llama_token_eos() -> llama_token: return lib.llama_token_eos() -def llama_sample_top_p_top_k(ctx: llama_context_p, last_n_tokens_data: llama_token_p, last_n_tokens_size: c_int, top_k: c_int, top_p: c_float, temp: c_float, repeat_penalty: c_float) -> llama_token: - return lib.llama_sample_top_p_top_k(ctx, last_n_tokens_data, last_n_tokens_size, top_k, top_p, temp, repeat_penalty) +def llama_token_nl() -> llama_token: + return lib.llama_token_nl() + +def llama_sample_repetition_penalty(ctx: llama_context_p, candidates: llama_token_data_array_p, last_tokens: llama_token_p, last_tokens_size: c_size_t, penalty: float): + lib.llama_sample_repetition_penalty(ctx, candidates, last_tokens, last_tokens_size, penalty) + +def llama_sample_frequency_and_presence_penalties(ctx: llama_context_p, candidates: 
llama_token_data_array_p, last_tokens: llama_token_p, last_tokens_size: c_size_t, alpha_frequency: float, alpha_presence: float): + lib.llama_sample_frequency_and_presence_penalties(ctx, candidates, last_tokens, last_tokens_size, alpha_frequency, alpha_presence) + +def llama_sample_softmax(ctx: llama_context_p, candidates: llama_token_data_array_p): + lib.llama_sample_softmax(ctx, candidates) + +def llama_sample_top_k(ctx: llama_context_p, candidates: llama_token_data_array_p, k: c_int, min_keep: c_size_t): + lib.llama_sample_top_k(ctx, candidates, k, min_keep) + +def llama_sample_top_p(ctx: llama_context_p, candidates: llama_token_data_array_p, p: float, min_keep: c_size_t): + lib.llama_sample_top_p(ctx, candidates, c_float(p), c_size_t(min_keep)) + +def llama_sample_tail_free(ctx: llama_context_p, candidates: llama_token_data_array_p, z: float, min_keep: c_size_t): + lib.llama_sample_tail_free(ctx, candidates, z, min_keep) + +def llama_sample_typical(ctx: llama_context_p, candidates: llama_token_data_array_p, p: float, min_keep: c_size_t): + lib.llama_sample_typical(ctx, candidates, p, min_keep) + +def llama_sample_temperature(ctx: llama_context_p, candidates: llama_token_data_array_p, temp: float): + lib.llama_sample_temperature(ctx, candidates, temp) + +def llama_sample_token_mirostat(ctx: llama_context_p, candidates: llama_token_data_array_p, tau: float, eta: float, m: c_int, mu: c_float_p) -> llama_token: + return lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) + +def llama_sample_token_mirostat_v2(ctx: llama_context_p, candidates: llama_token_data_array_p, tau: float, eta: float, mu: c_float_p) -> llama_token: + return lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) + +def llama_sample_token_greedy(ctx: llama_context_p, candidates: llama_token_data_array_p) -> llama_token: + return lib.llama_sample_token_greedy(ctx, candidates) + +def llama_sample_token(ctx: llama_context_p, candidates: llama_token_data_array_p) -> llama_token: + return lib.llama_sample_token(ctx, candidates) def llama_print_timings(ctx: llama_context_p): lib.llama_print_timings(ctx) @@ -177,15 +323,5 @@ def llama_print_timings(ctx: llama_context_p): def llama_reset_timings(ctx: llama_context_p): lib.llama_reset_timings(ctx) -def llama_print_system_info() -> str: - """Print system informaiton""" - return lib.llama_print_system_info().decode('utf-8') - -def llama_get_state_size(ctx: llama_context_p) -> c_size_t: - return lib.llama_get_state_size(ctx) - -def llama_copy_state_data(ctx: llama_context_p, dst: c_ubyte_p) -> c_size_t: - return lib.llama_copy_state_data(ctx, dst) - -def llama_set_state_data(ctx: llama_context_p, src: c_ubyte_p) -> c_size_t: - return lib.llama_set_state_data(ctx, src) +def llama_print_system_info() -> c_char_p: + return lib.llama_print_system_info() \ No newline at end of file From 6e968d22b06c75041e089aabd9d55d9f9a1e3f43 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 14 May 2023 16:07:08 +0200 Subject: [PATCH 05/86] add text generating baby-llama from scratch example --- examples/baby-llama/baby-llama-text.cpp | 1359 +++++++++++++++++++++++ 1 file changed, 1359 insertions(+) create mode 100644 examples/baby-llama/baby-llama-text.cpp diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp new file mode 100644 index 0000000000000..d114d689e05a1 --- /dev/null +++ b/examples/baby-llama/baby-llama-text.cpp @@ -0,0 +1,1359 @@ +#include "ggml.h" +#include "llama.h" +#include +#include +#include +#include +#include 
+#include +#include + + +struct random_normal_distribution { + std::mt19937 gen; + std::normal_distribution rd; + float min; + float max; +}; + + +struct random_uniform_distribution { + std::mt19937 gen; + std::uniform_real_distribution rd; +}; + +void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) { + rnd->gen = std::mt19937(seed); + rnd->rd = std::normal_distribution{mean, std}; + rnd->min = min; + rnd->max = max; +} + +void init_random_uniform_distribution(struct random_uniform_distribution * rnd, int seed, float min, float max) { + rnd->gen = std::mt19937(seed); + rnd->rd = std::uniform_real_distribution{min, max}; +} + +int clamp(const int v, const int min, const int max) { + return ((v < min) ? (min) : (v > max) ? (max) : v); +} + +float fclamp(const float v, const float min, const float max) { + return ((v < min) ? (min) : (v > max) ? (max) : v); +} + +float frand_normal(struct random_normal_distribution * rnd) { + return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); +} + +float frand_uniform(struct random_uniform_distribution * rnd) { + return rnd->rd(rnd->gen); +} + +struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { + switch (tensor->n_dims) { + case 1: + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); + *dst = frand_normal(rnd); + } + break; + case 2: + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *dst = frand_normal(rnd); + } + } + break; + case 3: + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *dst = frand_normal(rnd); + } + } + } + break; + case 4: + for (int i3 = 0; i3 < tensor->ne[3]; i3++) { + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); + *dst = frand_normal(rnd); + } + } + } + } + break; + default: + assert(false); + }; + return tensor; +} + +struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { + switch (tensor->n_dims) { + case 1: + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); + *dst = frand_uniform(rnd); + } + break; + case 2: + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *dst = frand_uniform(rnd); + } + } + break; + case 3: + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *dst = frand_uniform(rnd); + } + } + } + break; + case 4: + for (int i3 = 0; i3 < tensor->ne[3]; i3++) { + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float 
*) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); + *dst = frand_uniform(rnd); + } + } + } + } + break; + default: + assert(false); + }; + return tensor; +} + +struct my_llama_hparams { + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? + uint32_t n_embd = 4096; + uint32_t n_mult = 4; + uint32_t n_head = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + + bool operator!=(const my_llama_hparams& other) const { + return memcmp(this, &other, sizeof(my_llama_hparams)); + } +}; + +uint32_t get_n_ff(const struct my_llama_hparams* hparams) { + const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; + return n_ff; +} + +struct my_llama_layer { + // normalization + struct ggml_tensor * attention_norm; + + // attention + struct ggml_tensor * wq; + struct ggml_tensor * wk; + struct ggml_tensor * wv; + struct ggml_tensor * wo; + + // normalization + struct ggml_tensor * ffn_norm; + + // ff + struct ggml_tensor * w1; + struct ggml_tensor * w2; + struct ggml_tensor * w3; +}; + +struct my_llama_kv_cache { + struct ggml_context * ctx = NULL; + + struct ggml_tensor * k; + struct ggml_tensor * v; + + // llama_ctx_buffer buf; + + int n; // number of tokens currently in the cache +}; + +struct my_llama_model { + struct ggml_context * ctx = NULL; + + my_llama_hparams hparams; + + struct ggml_tensor * tok_embeddings; + + struct ggml_tensor * norm; + struct ggml_tensor * output; + + std::vector layers; +}; + +void init_model(struct my_llama_model * model) { + const auto & hparams = model->hparams; + + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_vocab = hparams.n_vocab; + + const uint32_t n_ff = get_n_ff(&hparams); + + struct ggml_context * ctx = model->ctx; + + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab}); + model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // ("norm.weight", {n_embd}); + model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("output.weight", {n_embd, n_vocab}); + + model->layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + // std::string layers_i = "layers." 
+ std::to_string(i); + + layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd}); + + layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd}); + layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd}); + layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd}); + layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd}); + + layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd}); + + layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}); + layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}); + layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}); + } +} + + +void set_param_model(struct my_llama_model * model) { + const auto& hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct ggml_context* ctx = model->ctx; + + ggml_set_param(ctx, model->tok_embeddings); + ggml_set_param(ctx, model->norm); + ggml_set_param(ctx, model->output); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + ggml_set_param(ctx, layer.attention_norm); + ggml_set_param(ctx, layer.wq); + ggml_set_param(ctx, layer.wk); + ggml_set_param(ctx, layer.wv); + ggml_set_param(ctx, layer.wo); + ggml_set_param(ctx, layer.ffn_norm); + ggml_set_param(ctx, layer.w1); + ggml_set_param(ctx, layer.w2); + ggml_set_param(ctx, layer.w3); + } +} + +void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) { + const auto & hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct random_normal_distribution rnd; + init_random_normal_distribution(&rnd, seed, mean, std, min, max); + + randomize_tensor_normal(model->tok_embeddings, &rnd); + randomize_tensor_normal(model->norm, &rnd); + randomize_tensor_normal(model->output, &rnd); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + randomize_tensor_normal(layer.attention_norm, &rnd); + + randomize_tensor_normal(layer.wq, &rnd); + randomize_tensor_normal(layer.wk, &rnd); + randomize_tensor_normal(layer.wv, &rnd); + randomize_tensor_normal(layer.wo, &rnd); + + randomize_tensor_normal(layer.ffn_norm, &rnd); + + randomize_tensor_normal(layer.w1, &rnd); + randomize_tensor_normal(layer.w2, &rnd); + randomize_tensor_normal(layer.w3, &rnd); + } +} + +bool init_kv_cache(struct my_llama_kv_cache* cache, struct my_llama_model * model, int n_batch) { + const auto & hparams = model->hparams; + + const uint32_t n_ctx = hparams.n_ctx; + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + + const int64_t n_mem = n_layer*n_ctx*n_batch; + const int64_t n_elements = n_embd*n_mem; + + // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + + // struct ggml_init_params params; + // params.mem_size = cache.buf.size; + // params.mem_buffer = cache.buf.addr; + // params.no_alloc = false; + if (!cache->ctx) { + struct ggml_init_params params; + params.mem_size = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024; + 
params.mem_buffer = NULL; + params.no_alloc = false; + + cache->ctx = ggml_init(params); + + if (!cache->ctx) { + fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); + return false; + } + } + + cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); + cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); + + return true; +} + +struct ggml_tensor * forward( + struct my_llama_model * model, + struct my_llama_kv_cache * cache, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_past) { + + const int N = n_tokens; + + struct my_llama_kv_cache& kv_self = *cache; + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + + struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens)); + + struct ggml_tensor * kc = kv_self.k; + struct ggml_tensor * vc = kv_self.v; + + // inpL shape [n_embd,N,1,1] + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * cur; + + // lctx.use_buf(ctx0, 0); + + // norm + { + // cur shape [n_embd,N,1,1] + cur = ggml_rms_norm(ctx0, inpL); + + // cur = attention_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].attention_norm, cur), + cur); + } + + // self-attention + { + // compute Q and K and RoPE them + // wq shape [n_embd, n_embd, 1, 1] + // wk shape [n_embd, n_embd, 1, 1] + // Qcur shape [n_embd/n_head, n_head, N, 1] + // Kcur shape [n_embd/n_head, n_head, N, 1] + struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + + // store key and value to memory + { + // compute the transposed [N, n_embd] V matrix + // wv shape [n_embd, n_embd, 1, 1] + // Vcur shape [n_embd, N, 1, 1] + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N))); + + // kv_self.k shape [n_embd * n_ctx * n_layer, 1] + // kv_self.v shape [n_embd * n_ctx * n_layer, 1] + // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0] + // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0] + + /* { + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + + // important: storing RoPE-ed version of K in the KV cache! 
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } //*/ + + kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + vc = ggml_set_2d(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + } + + // Qcur shape [n_embd/n_head, n_head, N, 1] + // Q shape [n_embd/n_head, N, n_head, 1] + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + + // kv_self.k shape [n_embd * n_ctx * n_layer, 1] + // K shape [n_embd/n_head, n_past + N, n_head, 1] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); + + // K * Q + // KQ shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // KQ_scaled shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + + // KQ_masked = mask_past(KQ_scaled) + // KQ_masked shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + // KQ_soft_max shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + + // split cached V into n_head heads + //// V shape [n_past + N, n_embd/n_head, n_head, 1] + // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1] + struct ggml_tensor * V = + ggml_view_3d(ctx0, vc, + n_past + N, n_embd/n_head, n_head, + n_ctx*ggml_element_size(vc), + n_ctx*ggml_element_size(vc)*n_embd/n_head, + il*n_ctx*ggml_element_size(vc)*n_embd); + + // KQV shape [n_embd/n_head, N, n_head, 1] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // KQV_merged shape [n_embd/n_head, n_head, N, 1] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + // KQV_merged shape + + // cur = KQV_merged.contiguous().view(n_embd, N) + // cur shape [n_embd,N,1,1] + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N); + // cur = ggml_cpy(ctx0, + // KQV_merged, + // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection (no bias) + // cur shape [n_embd,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].wo, + cur); + } + + // lctx.use_buf(ctx0, 1); + + // inpFF shape [n_embd,N,1,1] + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + + // feed-forward network + { + // norm + { + // cur shape [n_embd,N,1,1] + cur = ggml_rms_norm(ctx0, inpFF); + + // cur = ffn_norm*cur + // cur shape [n_embd,N,1,1] + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), + cur); + } + + // tmp shape [n_ff,N,1,1] + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model->layers[il].w3, + cur); + + // cur shape [n_ff,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w1, + cur); + + // SILU activation + // cur shape [n_ff,N,1,1] + cur = ggml_silu(ctx0, cur); + + // cur shape [n_ff,N,1,1] + cur = ggml_mul(ctx0, cur, tmp); + + // cur shape [n_embd,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w2, + cur); + } + + // cur shape [n_embd,N,1,1] + cur = ggml_add(ctx0, cur, inpFF); + + // input for next layer + // inpL shape [n_embd,N,1,1] + inpL 
= cur; + } + + // norm + { + + // inpL shape [n_embd,N,1,1] + inpL = ggml_rms_norm(ctx0, inpL); + + // inpL = norm*inpL + // inpL shape [n_embd,N,1,1] + inpL = ggml_mul(ctx0, + ggml_repeat(ctx0, model->norm, inpL), + inpL); + + //embeddings = inpL; + } + + // lm_head + // inpL shape [n_vocab,N,1,1] + inpL = ggml_mul_mat(ctx0, model->output, inpL); + + // run the computation + ggml_build_forward_expand(gf, inpL); + + return inpL; +} + +void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { + GGML_ASSERT(tensor->n_dims == 1); + GGML_ASSERT(tensor->ne[0] == ne0); +} + +void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { + GGML_ASSERT(tensor->n_dims == 2); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); +} + +void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { + GGML_ASSERT(tensor->n_dims == 3); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == ne2); +} + +void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { + GGML_ASSERT(tensor->n_dims == 4); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == ne2); + GGML_ASSERT(tensor->ne[3] == ne3); +} + +struct ggml_tensor * forward_batch( + struct my_llama_model * model, + struct my_llama_kv_cache * cache, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_past, + const int n_batch) { + + const int N = n_tokens; + + struct my_llama_kv_cache& kv_self = *cache; + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + const int n_ff = get_n_ff(&hparams); + + struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); + memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); + + struct ggml_tensor * kc = kv_self.k; + struct ggml_tensor * vc = kv_self.v; + + // inpL shape [n_embd,N*n_batch,1] + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); + assert_shape_2d(inpL, n_embd, N*n_batch); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * cur; + + // lctx.use_buf(ctx0, 0); + + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = attention_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].attention_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // self-attention + { + // compute Q and K and RoPE them + // wq shape [n_embd, n_embd, 1, 1] + // wk shape [n_embd, n_embd, 1, 1] + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Kcur shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); + assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); + + // store key and value to memory + { + // 
compute the transposed [N, n_embd] V matrix + // wv shape [n_embd, n_embd, 1, 1] + // Vcur shape [N, n_embd, n_batch, 1] + struct ggml_tensor * Vcur = ggml_cont(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_mul_mat(ctx0, + model->layers[il].wv, + cur), + n_embd, N, n_batch), + 1, 0, 2, 3)); + + assert_shape_3d(Vcur, N, n_embd, n_batch); + + // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] + // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] + // k shape [n_embd * N, n_batch] == kv_self.k[:,n_past:n_past+N,:,il] + // v shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il] + + /* { + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + + // important: storing RoPE-ed version of K in the KV cache! + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } //*/ + + kc = ggml_set_2d(ctx0, kc, + ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch), + ggml_element_size(kc)*n_embd*n_ctx, + (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past)); + vc = ggml_set_2d(ctx0, vc, + ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch), + ggml_element_size(vc)*n_ctx*n_embd, + ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx)); + + assert_shape_1d(kc, n_embd * n_ctx * n_batch * n_layer); + assert_shape_1d(vc, n_embd * n_ctx * n_batch * n_layer); + } + + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Q shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); + + // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] + // K shape [n_embd/n_head, n_past + N, n_head, n_batch] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_4d(ctx0, + ggml_view_3d(ctx0, + kc, + n_embd, + (n_past + N), + n_batch, + n_embd*ggml_element_size(kc), + n_ctx*n_embd*ggml_element_size(kc), + il*n_batch*n_ctx*n_embd*ggml_element_size(kc)), + n_embd/n_head, n_head, n_past + N, n_batch), + 0, 2, 1, 3); + assert_shape_4d(K, n_embd/n_head, n_past + N, n_head, n_batch); + + // K * Q + // KQ shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + assert_shape_4d(KQ, n_past + N, N, n_head, n_batch); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // KQ_scaled shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch); + + // KQ_masked = mask_past(KQ_scaled) + // KQ_masked shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch); + + // KQ = soft_max(KQ_masked) + // KQ_soft_max shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch); + + // split cached V into n_head heads + // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] + // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il] + struct ggml_tensor * V = + ggml_view_4d(ctx0, vc, + n_past + N, n_embd/n_head, n_head, n_batch, + 
ggml_element_size(vc)*n_ctx, + ggml_element_size(vc)*n_ctx*n_embd/n_head, + ggml_element_size(vc)*n_ctx*n_embd, + il*n_batch*n_ctx*n_embd*ggml_element_size(vc)); + assert_shape_4d(V, n_past + N, n_embd/n_head, n_head, n_batch); + + // KQV shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); + // KQV_merged shape + + // cur = KQV_merged.contiguous().view(n_embd, N) + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); + assert_shape_2d(cur, n_embd, N*n_batch); + // cur = ggml_cpy(ctx0, + // KQV_merged, + // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection (no bias) + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].wo, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // lctx.use_buf(ctx0, 1); + + // inpFF shape [n_embd,N*n_batch,1,1] + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + assert_shape_2d(inpFF, n_embd, N*n_batch); + + // feed-forward network + { + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = ffn_norm*cur + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // tmp shape [n_ff,N*n_batch,1,1] + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model->layers[il].w3, + cur); + assert_shape_2d(tmp, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w1, + cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // SILU activation + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_silu(ctx0, cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul(ctx0, cur, tmp); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w2, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_add(ctx0, cur, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // input for next layer + // inpL shape [n_embd,N*n_batch,1,1] + inpL = cur; + assert_shape_2d(inpL, n_embd, N*n_batch); + } + + // norm + { + + // inpL shape [n_embd,N*n_batch,1,1] + inpL = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(inpL, n_embd, N*n_batch); + + // inpL = norm*inpL + // inpL shape [n_embd,N*n_batch,1,1] + inpL = ggml_mul(ctx0, + ggml_repeat(ctx0, model->norm, inpL), + inpL); + + assert_shape_2d(inpL, n_embd, N*n_batch); + + //embeddings = inpL; + } + + // lm_head + // inpL shape [n_vocab,N*n_batch,1,1] + inpL = ggml_mul_mat(ctx0, model->output, inpL); + assert_shape_2d(inpL, n_vocab, N*n_batch); + + { + // inpL shape [n_vocab,N,n_batch,1] + inpL = ggml_reshape_3d(ctx0, + inpL, + n_vocab, N, n_batch); + assert_shape_3d(inpL, n_vocab, N, n_batch); + } + + // run the computation + ggml_build_forward_expand(gf, inpL); + + return inpL; +} + +void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { + assert(logits->n_dims == 2); + assert(probs->n_dims == 2); + 
assert(best_samples->n_dims == 1); + assert(logits->ne[1] == best_samples->ne[0]); + assert(logits->ne[0] == probs->ne[0]); + assert(logits->ne[1] == probs->ne[1]); + for (int i = 0; i < logits->ne[1]; ++i) { + float max_logit = ggml_get_f32_1d(logits, i * logits->ne[0]); + ggml_set_i32_1d(best_samples, i, 0); + for (int k = 0; k < logits->ne[0]; ++k) { + float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k); + if (logit > max_logit) { + max_logit = logit; + ggml_set_i32_1d(best_samples, i, k); + } + } + float psum = 0; + for (int k = 0; k < logits->ne[0]; ++k) { + float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k); + float p = (logit == -INFINITY) ? 0 : expf(logit - max_logit); + psum += p; + ggml_set_f32_1d(probs, i * probs->ne[0] + k, p); + } + for (int k = 0; k < logits->ne[0]; ++k) { + float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); + ggml_set_f32_1d(probs, i * probs->ne[0] + k, p / psum); + } + } +} + +void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { + GGML_ASSERT(best_samples->n_dims == 2); + GGML_ASSERT(logits->n_dims == 3); + GGML_ASSERT(probs->n_dims == 3); + int n_tokens = best_samples->ne[0]; + int n_batch = best_samples->ne[1]; + int n_vocab = logits->ne[0]; + GGML_ASSERT(n_tokens == logits->ne[1]); + GGML_ASSERT(n_batch == logits->ne[2]); + GGML_ASSERT(n_vocab == probs->ne[0]); + GGML_ASSERT(n_tokens == probs->ne[1]); + GGML_ASSERT(n_batch == probs->ne[2]); + + for (int k = 0; k < n_batch; ++k) { + struct ggml_tensor * best_samples_k = ggml_view_1d(ctx, + best_samples, + best_samples->ne[0], + k*best_samples->nb[1]); + struct ggml_tensor * logits_k = ggml_view_2d(ctx, + logits, + logits->ne[0], + logits->ne[1], + logits->nb[1], + k*logits->nb[2]); + struct ggml_tensor * probs_k = ggml_view_2d(ctx, + probs, + probs->ne[0], + probs->ne[1], + probs->nb[1], + k*probs->nb[2]); + sample_softmax(logits_k, probs_k, best_samples_k); + } +} + + +void print_row(struct ggml_tensor * probs, int i) { + for (int k = 0; k < probs->ne[0]; ++k) { + float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); + printf(" %.2f", p); + } + printf("\n"); +} + +void print_matrix(struct ggml_tensor * probs) { + assert(probs->n_dims == 2); + for (int i = 0; i < probs->ne[1]; ++i) { + for (int k = 0; k < probs->ne[0]; ++k) { + float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); + printf(" %.2f", p); + } + printf("\n"); + } +} + + +void print_token(struct llama_context * ctx, llama_token token) { + printf("%s", llama_token_to_str(ctx, token)); +} + +void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) { + for (int i=0; ine[0]; ++i) { + int token = ggml_get_i32_1d(tokens, i); + print_token(ctx, token); + } +} + +void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) { + for (int i1=0; i1ne[1]; ++i1) { + for (int i0=0; i0ne[0]; ++i0) { + int token = ggml_get_i32_1d(tokens, i0 + i1*tokens->ne[0]); + print_token(ctx, token); + } + printf("\n--\n"); + } +} + +void get_example_targets(const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { + int n_tokens = tokens_input->ne[0]; + int n_vocab = targets->ne[0]; + + int n_examples = (n_train_data / (size_t) n_tokens); + int begin = (example_id % n_examples) * n_tokens; + GGML_ASSERT(begin+n_tokens-1 < n_train_data); + + ggml_set_f32(targets, -1.0f); + ggml_set_i32_1d(tokens_input, 0, llama_token_bos()); + for (int i=1; 
in_dims == 2); + GGML_ASSERT( targets->n_dims == 3); + int n_tokens = tokens_input->ne[0]; + int n_batch = tokens_input->ne[1]; + GGML_ASSERT(n_tokens == targets->ne[1]); + GGML_ASSERT(n_batch == targets->ne[2]); + + for (int k=0; kne[0], + k*tokens_input->nb[1]); + struct ggml_tensor * targets_k = ggml_view_2d(ctx, + targets, + targets->ne[0], + targets->ne[1], + targets->nb[1], + k*targets->nb[2]); + + get_example_targets(train_data, n_train_data, + example_id*n_batch + k, tokens_input_k, targets_k); + } +} + + +void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) { + int n_tokens = tokens_input->ne[0]; + int n_vocab = targets->ne[0]; + for (int i=0; i= 0 && size < INT_MAX); + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +} + +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, size, 1, fp); + if (ferror(fp)) { + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret != 1) { + throw std::runtime_error(std::string("unexpectedly reached end of file")); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + throw std::runtime_error(format("write error: %s", strerror(errno))); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +int tokenize_file(struct llama_context * lctx, const char * filename, std::vector& out) { + struct llama_file f(filename, "rb"); + + std::vector buf; + buf.resize(f.size); + + f.read_raw(buf.data(), f.size); + + out.resize(buf.size()); + + int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false); + + if (n_tokens >= 0) { + out.resize(n_tokens); + } + + return n_tokens; +} + +int main(int argc, char ** argv) { + const char * default_model = "ggml-vic7b-uncensored-q4_0.bin"; + const char * default_train = "shakespeare.txt"; + const char * default_argv[3] = {argv[0], default_model, default_train}; + + if (argc < 3) { + fprintf(stderr, "usage: %s model training_data\n", argv[0]); + //return 1; + } + + const char * fn_model = (argc >= 2) ? argv[1] : default_argv[1]; + const char * fn_train = (argc >= 3) ? 
argv[2] : default_argv[2]; + + struct llama_context_params llama_params = llama_context_default_params(); + llama_params.vocab_only = true; + + struct llama_context * lctx = llama_init_from_file(fn_model, llama_params); + + std::vector train_tokens; + if (tokenize_file(lctx, fn_train, train_tokens) < 0) { + fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, fn_train); + } + + struct my_llama_model model; + model.hparams.n_vocab = llama_n_vocab(lctx); + model.hparams.n_ctx = 16; + model.hparams.n_embd = 64; + model.hparams.n_mult = 8; + model.hparams.n_head = 8; + model.hparams.n_layer = 1; + model.hparams.n_rot = std::min(16u, model.hparams.n_embd / model.hparams.n_head); + + struct my_llama_kv_cache kv_self; + + int n_batch = 8; + + struct ggml_init_params lcparams; + lcparams.mem_size = 1024ll*1024ll*1024ll; + lcparams.mem_buffer = NULL; + lcparams.no_alloc = false; + + model.ctx = ggml_init(lcparams); + kv_self.ctx = model.ctx; + + printf("init model\n"); + init_model(&model); + set_param_model(&model); + randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f); + init_kv_cache(&kv_self, &model, n_batch); + + size_t compute_size = 1024ll*1024ll*1024ll*32ll; + uint8_t * compute_addr = new uint8_t[compute_size]; + + int n_examples = 256; + int n_tokens = model.hparams.n_ctx; + int n_vocab = model.hparams.n_vocab; + + for (int ex=0; ex Date: Sun, 14 May 2023 17:00:19 +0200 Subject: [PATCH 06/86] fix race condition bug in ggml_compute_forward_diag_mask_f32 --- ggml.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ggml.c b/ggml.c index e5b3528d8a742..3e17dfeb6656d 100644 --- a/ggml.c +++ b/ggml.c @@ -10321,20 +10321,23 @@ static void ggml_compute_forward_diag_mask_f32( assert(src1->type == GGML_TYPE_I32); assert(ggml_nelements(src1) == 2); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - const int ith = params->ith; const int nth = params->nth; const int n_past = ((int32_t *) src1->data)[0]; const bool inplace = (bool)((int32_t *) src1->data)[1]; - if (!inplace) { + + if (!inplace && (params->type == GGML_TASK_INIT)) { + // dup needs to be synchronized across threads to avoid race conditions. 
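
The fix relies on ggml's task phases: work placed in GGML_TASK_INIT runs before the worker threads execute their GGML_TASK_COMPUTE slices, so a write that must happen exactly once for the whole tensor, here duplicating src0 into dst in the non-inplace case, cannot race with the per-row compute work. A simplified sketch of the phase pattern these kernels follow (not an exact copy of any one op):

    static void compute_forward_example_f32(const struct ggml_compute_params * params,
                                            const struct ggml_tensor * src0, struct ggml_tensor * dst) {
        if (params->type == GGML_TASK_INIT) {
            // one-time shared setup, e.g. copying src0 into dst, done before the threaded compute starts
            memcpy(dst->data, src0->data, ggml_nbytes(dst));
            return;
        }
        if (params->type == GGML_TASK_FINALIZE) {
            return;
        }
        // GGML_TASK_COMPUTE: each thread only touches its own range of rows
        const int ith = params->ith;
        const int nth = params->nth;
        const int nr  = ggml_nrows(src0);
        const int dr  = (nr + nth - 1)/nth;   // rows per thread
        const int ir0 = dr*ith;               // row range for this thread
        const int ir1 = MIN(ir0 + dr, nr);
        for (int ir = ir0; ir < ir1; ++ir) {
            // ... per-row work ...
        }
    }
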
+ // => do it in INIT phase ggml_compute_forward_dup_same_cont(params, src0, dst); } + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + // TODO: handle transposed/permuted matrices const int n = ggml_nrows(src0); From ec1aea09ec041ab06ab898ac14bc23145f10ed8d Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 14 May 2023 17:16:26 +0200 Subject: [PATCH 07/86] implement ggml_soft_max_back for more performant backward pass of soft_max avoids creating big intermediate matrices of size n_embd x n_embd for llama layers and n_vocab x n_vocab for cross entropy loss --- ggml.c | 196 ++++++++++++++++++++++++++++++++++++++++++++------------- ggml.h | 12 ++++ 2 files changed, 164 insertions(+), 44 deletions(-) diff --git a/ggml.c b/ggml.c index 3e17dfeb6656d..2cc51fcc0d8d7 100644 --- a/ggml.c +++ b/ggml.c @@ -3325,6 +3325,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "DIAG_MASK_INF", "DIAG_MASK_ZERO", "SOFT_MAX", + "SOFT_MAX_BACK", "ROPE", "ROPE_BACK", "ALIBI", @@ -3338,7 +3339,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "MAP_BINARY", }; -static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50"); +static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3385,6 +3386,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "diag_mask_inf(x)", "diag_mask_zero(x)", "soft_max(x)", + "soft_max_back(x)", "rope(x)", "rope_back(x)", "alibi(x)", @@ -3398,7 +3400,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "f(x,y)", }; -static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50"); +static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -5927,6 +5929,44 @@ struct ggml_tensor * ggml_soft_max_inplace( return ggml_soft_max_impl(ctx, a, true); } + +// ggml_soft_max_back + +struct ggml_tensor * ggml_soft_max_back_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; // TODO : implement backward pass + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SOFT_MAX_BACK; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +struct ggml_tensor * ggml_soft_max_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_soft_max_back_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_soft_max_back_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_soft_max_back_impl(ctx, a, b, true); +} + // ggml_rope struct ggml_tensor * ggml_rope_impl( @@ -10482,6 +10522,103 @@ static void ggml_compute_forward_soft_max( } } +// ggml_compute_forward_soft_max_back + +static void ggml_compute_forward_soft_max_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_are_same_shape(src1, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // TODO: handle transposed/permuted matrices + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float *dy = (float *)((char *) src0->data + i1*src0->nb[1]); + float *y = (float *)((char *) src1->data + i1*src1->nb[1]); + float *dx = (float *)((char *) dst->data + i1*dst->nb[1]); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(s0[i])); + assert(!isnan(s1[i])); + } +#endif + // Jii = yi - yi*yi + // Jij = -yi*yj + // J = diag(y)-y.T*y + // dx = J * dy + // dxk = sum_i(Jki * dyi) + + // quadratic runtime, linear memory + for (int k = 0; k < nc; k++) { + + ggml_float sum = 0.0; + + for (int i = 0; i < k; i++) { + float Jki = -y[k]*y[i]; + sum += (ggml_float) Jki * dy[i]; + } + + float Jkk = y[k] - y[k]*y[k]; + sum += (ggml_float) Jkk * dy[k]; + + for (int i = k+1; i < nc; i++) { + float Jki = -y[k]*y[i]; + sum += (ggml_float) Jki * dy[i]; + } + + dx[k] = (float) sum; + } + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(dx[i])); + assert(!isinf(dx[i])); + } +#endif + } +} + +static void ggml_compute_forward_soft_max_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_soft_max_back_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_alibi static void ggml_compute_forward_alibi_f32( @@ -12529,6 +12666,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_soft_max(params, tensor->src0, tensor); } break; + case GGML_OP_SOFT_MAX_BACK: + { + ggml_compute_forward_soft_max_back(params, tensor->src0, tensor->src1, tensor); + } break; case GGML_OP_ROPE: { ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor); @@ -13146,50 +13287,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - // y = softmax(x) - // - // Jii = yi - yi*yi - // Jij = -yi*yj - // 
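
soft_max_back computes the Jacobian-vector product dx = J * dy for y = softmax(x), with Jii = yi - yi*yi and Jij = -yi*yj as in the comments above. Because J = diag(y) - y*y^T, the product collapses to dxk = yk * (dyk - dot(y, dy)), which is what a later patch in this series uses to drop the inner loop. A stand-alone check of the two forms (a verification sketch, not part of the patch):

    #include <cstdio>
    #include <vector>

    // quadratic form: dx = J * dy, row by row of the explicit Jacobian
    static std::vector<float> softmax_back_ref(const std::vector<float> & y, const std::vector<float> & dy) {
        std::vector<float> dx(y.size(), 0.0f);
        for (size_t k = 0; k < y.size(); ++k) {
            double sum = 0.0;
            for (size_t i = 0; i < y.size(); ++i) {
                const float Jki = (i == k) ? y[k] - y[k]*y[k] : -y[k]*y[i];
                sum += (double) Jki * dy[i];
            }
            dx[k] = (float) sum;
        }
        return dx;
    }

    // linear form: dxk = yk * (dyk - dot(y, dy))
    static std::vector<float> softmax_back_fast(const std::vector<float> & y, const std::vector<float> & dy) {
        double dot_y_dy = 0.0;
        for (size_t i = 0; i < y.size(); ++i) dot_y_dy += (double) y[i]*dy[i];
        std::vector<float> dx(y.size());
        for (size_t k = 0; k < y.size(); ++k) dx[k] = y[k] * (float)(dy[k] - dot_y_dy);
        return dx;
    }

    int main() {
        const std::vector<float> y  = {0.1f, 0.2f, 0.3f, 0.4f};   // any softmax output (non-negative, sums to 1)
        const std::vector<float> dy = {0.5f, -1.0f, 0.25f, 2.0f}; // arbitrary upstream gradient
        const std::vector<float> a = softmax_back_ref(y, dy);
        const std::vector<float> b = softmax_back_fast(y, dy);
        for (size_t k = 0; k < y.size(); ++k) printf("%f %f\n", a[k], b[k]); // the two columns should agree
        return 0;
    }
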
J = diag(y)-y.*y - // dx = J * dy - // dxk = sum(Jkj * dyk) - - int64_t ne2[4] = { - tensor->ne[0], - 1, - tensor->ne[1]*tensor->ne[2], - tensor->ne[3] - }; - struct ggml_tensor * tensor2 = ggml_cont(ctx, - ggml_reshape_4d(ctx, - ggml_cont(ctx, tensor), - ne2[0], ne2[1], ne2[2], ne2[3])); - - struct ggml_tensor * grad2 = ggml_cont(ctx, - ggml_reshape_4d(ctx, - ggml_cont(ctx, tensor->grad), - ne2[0], ne2[1], ne2[2], ne2[3])); - - struct ggml_tensor * tensor2_t = ggml_cont(ctx, // [1,ne0,ne1*ne2,ne3] - ggml_permute(ctx, // [1,ne0,ne1*ne2,ne3] - tensor2, // [ne0,1,ne1*ne2,ne3] - 1, 0, 2, 3)); - src0->grad = - ggml_add_impl(ctx, - src0->grad, // [ne0,ne1,ne2,ne3] - ggml_reshape(ctx, // [ne0,ne1,ne2,ne3] - ggml_mul_mat(ctx, // [ne0,1,ne1*ne2,ne3] - ggml_sub(ctx, // [ne0,ne0,ne1*ne2,ne3] - ggml_diag(ctx, // [ne0,ne0,ne1*ne2,ne3] - tensor2), // [ne0,1,ne1*ne2,ne3] - ggml_mul_mat(ctx, // [ne0,ne0,ne1*ne2,ne3] - tensor2_t, // [1,ne0,ne1*ne2,ne3] - tensor2_t)), // [1,ne0,ne1*ne2,ne3] - grad2), // [ne0,1,ne1*ne2,ne3] - src0->grad), - inplace); + ggml_add_impl(ctx, src0->grad, + ggml_soft_max_back(ctx, tensor->grad, tensor), + inplace); } + + } break; + case GGML_OP_SOFT_MAX_BACK: + { + GGML_ASSERT(false); // TODO: not implemented } break; case GGML_OP_ROPE: { @@ -13718,6 +13825,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX_BACK: case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { diff --git a/ggml.h b/ggml.h index 967ef72d034dd..0a0989516265b 100644 --- a/ggml.h +++ b/ggml.h @@ -307,6 +307,7 @@ extern "C" { GGML_OP_DIAG_MASK_INF, GGML_OP_DIAG_MASK_ZERO, GGML_OP_SOFT_MAX, + GGML_OP_SOFT_MAX_BACK, GGML_OP_ROPE, GGML_OP_ROPE_BACK, GGML_OP_ALIBI, @@ -860,6 +861,17 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_soft_max_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_soft_max_back_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + // rotary position embedding // if mode & 1 == 1, skip n_past elements // if mode & 2 == 1, GPT-NeoX style From 4339f8cf285c95af503a7105b747b29ef7b1d64b Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 14 May 2023 17:55:02 +0200 Subject: [PATCH 08/86] improve softmax backward pass go from quadratic runtime to linear runtime by simplifying the formulas --- ggml.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/ggml.c b/ggml.c index 2cc51fcc0d8d7..0935549a4aacf 100644 --- a/ggml.c +++ b/ggml.c @@ -10571,27 +10571,25 @@ static void ggml_compute_forward_soft_max_back_f32( // J = diag(y)-y.T*y // dx = J * dy // dxk = sum_i(Jki * dyi) - - // quadratic runtime, linear memory - for (int k = 0; k < nc; k++) { - - ggml_float sum = 0.0; - - for (int i = 0; i < k; i++) { - float Jki = -y[k]*y[i]; - sum += (ggml_float) Jki * dy[i]; - } - - float Jkk = y[k] - y[k]*y[k]; - sum += (ggml_float) Jkk * dy[k]; - - for (int i = k+1; i < nc; i++) { - float Jki = -y[k]*y[i]; - sum += (ggml_float) Jki * dy[i]; - } - - dx[k] = (float) sum; - } + // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk + // dxk = sum_i(-yk*yi * dyi) + yk*dyk + // dxk = -yk * sum_i(yi * dyi) + yk*dyk + // dxk = -yk * dot(y, dy) + yk*dyk + // dxk = yk * (- dot(y, dy) + dyk) + // dxk = yk * (dyk - dot(y, dy)) + // + // post-order: + // 
dot_y_dy := dot(y, dy) + // dx := dy + // dx := dx - dot_y_dy + // dx := dx * y + + // linear runtime, no additional memory + float dot_y_dy = 0; + ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy); + ggml_vec_cpy_f32 (nc, dx, dy); + ggml_vec_acc1_f32(nc, dx, -dot_y_dy); + ggml_vec_mul_f32 (nc, dx, dx, y); #ifndef NDEBUG for (int i = 0; i < nc; ++i) { From 69108167cd17ef40f40272defb4dabd106b3f003 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 14 May 2023 20:54:57 +0200 Subject: [PATCH 09/86] fix race condition bug in non-inplace ggml_compute_forward_diag_mask_f32 memcpy needs to be synchronized across threads to avoid race conditions. => do it in INIT phase --- ggml.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/ggml.c b/ggml.c index 0935549a4aacf..2286c615da5f3 100644 --- a/ggml.c +++ b/ggml.c @@ -10358,8 +10358,8 @@ static void ggml_compute_forward_diag_mask_f32( const struct ggml_tensor * src1, struct ggml_tensor * dst, const float value) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 2); + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(src1) == 2); const int ith = params->ith; const int nth = params->nth; @@ -10369,9 +10369,12 @@ static void ggml_compute_forward_diag_mask_f32( if (!inplace && (params->type == GGML_TASK_INIT)) { - // dup needs to be synchronized across threads to avoid race conditions. + // memcpy needs to be synchronized across threads to avoid race conditions. // => do it in INIT phase - ggml_compute_forward_dup_same_cont(params, src0, dst); + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_nbytes(dst)); } if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -10385,8 +10388,8 @@ static void ggml_compute_forward_diag_mask_f32( const int nr = src0->ne[1]; const int nz = n/nr; - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); + GGML_ASSERT( dst->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); for (int k = 0; k < nz; k++) { for (int j = ith; j < nr; j += nth) { From 1f2b76de01ce3da98417518d4fad6f5d1fa89f6f Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 14 May 2023 20:55:24 +0200 Subject: [PATCH 10/86] fix bug in ggml_compute_forward_soft_max_back_f32 on DEBUG build --- ggml.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 2286c615da5f3..06e3feea05424 100644 --- a/ggml.c +++ b/ggml.c @@ -10565,8 +10565,8 @@ static void ggml_compute_forward_soft_max_back_f32( #ifndef NDEBUG for (int i = 0; i < nc; ++i) { //printf("p[%d] = %f\n", i, p[i]); - assert(!isnan(s0[i])); - assert(!isnan(s1[i])); + assert(!isnan(dy[i])); + assert(!isnan(y[i])); } #endif // Jii = yi - yi*yi From c054079fb81a25acf941c9c27c19087c0eaed632 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 14 May 2023 20:56:50 +0200 Subject: [PATCH 11/86] improve performance of mul_mat backward pass avoid transpose by using mul_mat with swapped arguments --- ggml.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/ggml.c b/ggml.c index 06e3feea05424..9a0a07aa57d40 100644 --- a/ggml.c +++ b/ggml.c @@ -13050,15 +13050,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // src1, // [n,p] // tensor->grad), // [m,p] // for now just using A*B==(B.T*A.T).T - ggml_cont(ctx, // [n,m] - ggml_transpose(ctx, // [n,m] - ggml_mul_mat(ctx, // [m,n] - ggml_cont(ctx, // [p,m] - ggml_transpose(ctx, // [p,m] - tensor->grad)), // [m,p] - 
ggml_cont(ctx, // [p,n] - ggml_transpose(ctx, // [p,n] - src1))))), // [n,p] + ggml_mul_mat(ctx, // [n,m] + ggml_cont(ctx, // [p,n] + ggml_transpose(ctx, // [p,n] + src1)), // [n,p] + ggml_cont(ctx, // [p,m] + ggml_transpose(ctx, // [p,m] + tensor->grad))), // [m,p] inplace); } if (src1->grad) { @@ -13070,6 +13068,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_cont(ctx, // [m,n] ggml_transpose(ctx, src0)), // [m,n] tensor->grad), // [m,p] + + // // when src0 is bigger than tensor->grad (this is the case in llama), + // // avoid transpose of src0, rather transpose smaller tensor->grad + // // and then use ggml_out_prod + // ggml_out_prod(ctx, // [n,p] + // src0, // [n,m] + // ggml_cont(ctx, // [p,m] + // ggml_transpose(ctx, // [p,m] + // tensor->grad)), // [m,p] inplace); } } break; From d9b526872880acdd430d14a97ebd9b386c15a8eb Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 14 May 2023 20:57:47 +0200 Subject: [PATCH 12/86] avoid printing too much newlines in baby-llama-text --- examples/baby-llama/baby-llama-text.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index d114d689e05a1..c5abf66a4b686 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -969,9 +969,16 @@ void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) { void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) { for (int i1=0; i1ne[1]; ++i1) { + int num_newline = 0; for (int i0=0; i0ne[0]; ++i0) { int token = ggml_get_i32_1d(tokens, i0 + i1*tokens->ne[0]); - print_token(ctx, token); + bool isnl = (token == llama_token_nl()); + if (isnl) { + ++num_newline; + } + if (!isnl || (num_newline < 2)) { + print_token(ctx, token); + } } printf("\n--\n"); } From a703d7a85f010c1ed25ec02bc6bb0bfd77eb72ba Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 14 May 2023 20:58:43 +0200 Subject: [PATCH 13/86] activate threading in baby-llama-text --- examples/baby-llama/baby-llama-text.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index c5abf66a4b686..9f2ff90340289 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1199,10 +1199,12 @@ int main(int argc, char ** argv) { struct llama_context * lctx = llama_init_from_file(fn_model, llama_params); + printf("%s: tokenize training data\n", __func__); std::vector train_tokens; if (tokenize_file(lctx, fn_train, train_tokens) < 0) { fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, fn_train); } + printf("%s: number of training tokens: %d\n", __func__, train_tokens.size()); struct my_llama_model model; model.hparams.n_vocab = llama_n_vocab(lctx); @@ -1225,7 +1227,7 @@ int main(int argc, char ** argv) { model.ctx = ggml_init(lcparams); kv_self.ctx = model.ctx; - printf("init model\n"); + printf("%s: init model\n", __func__); init_model(&model); set_param_model(&model); randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f); @@ -1238,6 +1240,8 @@ int main(int argc, char ** argv) { int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; + printf("%s: begin training\n", __func__); + for (int ex=0; ex Date: Mon, 15 May 2023 14:17:42 +0200 Subject: [PATCH 14/86] add ggml_out_prod and use it for mul_mat backward pass for improved performance performance stats report 
improvement from 37 seconds to 16 seconds runtime during my training tests --- ggml.c | 264 ++++++++++++++++++++++++++++++++++++++++++++++++++------- ggml.h | 13 ++- 2 files changed, 246 insertions(+), 31 deletions(-) diff --git a/ggml.c b/ggml.c index 9a0a07aa57d40..77b654809972a 100644 --- a/ggml.c +++ b/ggml.c @@ -3310,6 +3310,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "RMS_NORM_BACK", "MUL_MAT", + "OUT_PROD", "SCALE", "SET", @@ -3339,7 +3340,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "MAP_BINARY", }; -static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51"); +static_assert(GGML_OP_COUNT == 52, "GGML_OP_COUNT != 52"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3370,6 +3371,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "rms_norm(x)", "rms_norm_back(x)", + "X*Y", "X*Y", "x*v", @@ -3400,7 +3402,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "f(x,y)", }; -static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51"); +static_assert(GGML_OP_COUNT == 52, "GGML_OP_COUNT != 52"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -3566,6 +3568,15 @@ static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct (t0->ne[3] == t1->ne[3]); } +static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + (t0->ne[1] == t1->ne[1]) && + (t0->ne[2] == t1->ne[2]) && + (t0->ne[3] == t1->ne[3]); +} + bool ggml_is_quantized(enum ggml_type type) { return GGML_IS_QUANTIZED[type]; } @@ -5156,6 +5167,32 @@ struct ggml_tensor * ggml_mul_mat( return result; } +// ggml_out_prod + +struct ggml_tensor * ggml_out_prod( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_out_prod(a, b)); + GGML_ASSERT(!ggml_is_transposed(a)); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + const int64_t ne[4] = { a->ne[0], b->ne[0], a->ne[2], b->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne); + + result->op = GGML_OP_OUT_PROD; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + // ggml_scale struct ggml_tensor * ggml_scale_impl( @@ -9802,6 +9839,178 @@ static void ggml_compute_forward_mul_mat( } } +// ggml_compute_forward_out_prod + + +static void ggml_compute_forward_out_prod_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + const int nb03 = src0->nb[3]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + // GGML_ASSERT(nb0 <= nb1); + // GGML_ASSERT(nb1 <= nb2); + // GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod + // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) + + if (params->type == GGML_TASK_INIT) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by last two dimensions + + // total parallel in src0 + const int64_t np = ne02*ne03; + + // per thread + const int64_t dp = (np + nth - 1)/nth; + + // range for this thread + const int64_t ip0 = dp*ith; + const int64_t ip1 = MIN(ip0 + dp, np); + + // dst[:,:,:,:] = 0 + // for i2,i3: + // for i01: + // for i1: + // for i0: + // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] + + for (int64_t ip = ip0; ip < ip1; ++ip) { + // src0 indices + const int64_t i3 = ip/ne02; + const int64_t i2 = ip - i3*ne02; + + const int64_t i02 = i2; + const int64_t i03 = i3; + + const int64_t i12 = i2; + const int64_t i13 = i3; + + for (int64_t i01 = 0; i01 < ne01; ++i01) { + const int64_t i11 = i01; + + for (int64_t i1 = 0; i1 < ne1; ++i1) { + const int64_t i10 = i1; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + ggml_vec_mad_f32(ne0, d, s0, *s1); + // for (int64_t i0 = 0; i0 < ne0; ++i0) { + // d[i0] += s0[i0] * s1[i1]; + // } + } + } + } + + //int64_t t1 = ggml_perf_time_us(); + //static int64_t acc = 0; + //acc += t1 - t0; + //if (t1 - t0 > 10) { 
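
With ggml's convention that ne[0] is the fastest-varying dimension, out_prod takes a of shape [m, n] and b of shape [p, n] (same number of rows n) and produces dst of shape [m, p] with dst[i0,i1] = sum over j of a[i0,j] * b[i1,j], i.e. it accumulates the outer product of row j of a with row j of b. That is exactly the shape of the mul_mat gradients, which is why the backward pass further below can drop the transpose-and-cont copies. A small contiguous-f32 reference of the same semantics (a sketch, assuming plain row-after-row storage):

    // dst[i0, i1] = sum_j a[i0, j] * b[i1, j]    with a: [m, n], b: [p, n], dst: [m, p]
    static void out_prod_ref(int m, int p, int n, const float * a, const float * b, float * dst) {
        for (int i = 0; i < m*p; ++i) dst[i] = 0.0f;          // zeroed up front, like the INIT phase above
        for (int j = 0; j < n; ++j) {                          // one outer product per shared row j
            for (int i1 = 0; i1 < p; ++i1) {
                const float s1 = b[i1 + j*p];
                for (int i0 = 0; i0 < m; ++i0) {
                    dst[i0 + i1*m] += a[i0 + j*m] * s1;        // the ggml_vec_mad_f32 step, written out per element
                }
            }
        }
    }
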
+ // printf("\n"); + // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); + // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); + // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); + // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13); + + // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); + //} +} + +static void ggml_compute_forward_out_prod( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + { + GGML_ASSERT(false); // todo + // ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(false); // todo + // ggml_compute_forward_out_prod_f16_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_out_prod_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_scale static void ggml_compute_forward_scale_f32( @@ -10380,7 +10589,7 @@ static void ggml_compute_forward_diag_mask_f32( if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - + // TODO: handle transposed/permuted matrices const int n = ggml_nrows(src0); @@ -10541,7 +10750,7 @@ static void ggml_compute_forward_soft_max_back_f32( if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - + // TODO: handle transposed/permuted matrices const int ith = params->ith; @@ -10580,7 +10789,7 @@ static void ggml_compute_forward_soft_max_back_f32( // dxk = -yk * dot(y, dy) + yk*dyk // dxk = yk * (- dot(y, dy) + dyk) // dxk = yk * (dyk - dot(y, dy)) - // + // // post-order: // dot_y_dy := dot(y, dy) // dx := dy @@ -12611,6 +12820,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor); } break; + case GGML_OP_OUT_PROD: + { + ggml_compute_forward_out_prod(params, tensor->src0, tensor->src1, tensor); + } break; case GGML_OP_SCALE: { ggml_compute_forward_scale(params, tensor->src0, tensor->src1, tensor); @@ -13041,45 +13254,37 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { - // TODO: this requires outer product - ggml_out_prod(ctx, src1, tensor->grad); src0->grad = ggml_add_impl(ctx, src0->grad, - // ds0 = dt.dot(s1.T) - // ggml_out_prod(ctx, // [n,m] - // src1, // [n,p] - // tensor->grad), // [m,p] - // for now just using A*B==(B.T*A.T).T - ggml_mul_mat(ctx, // [n,m] - ggml_cont(ctx, // [p,n] - ggml_transpose(ctx, // [p,n] - src1)), // [n,p] - ggml_cont(ctx, // [p,m] - ggml_transpose(ctx, // [p,m] - tensor->grad))), // [m,p] + ggml_out_prod(ctx, // [n,m] + src1, // [n,p] + tensor->grad), // [m,p] inplace); } if (src1->grad) { src1->grad = ggml_add_impl(ctx, src1->grad, - // ds1 = s0.T.dot(dt): - ggml_mul_mat(ctx, // [n,p] - ggml_cont(ctx, // [m,n] - ggml_transpose(ctx, src0)), // [m,n] - tensor->grad), // [m,p] + // ggml_mul_mat(ctx, // [n,p] + // ggml_cont(ctx, // [m,n] + // ggml_transpose(ctx, src0)), // [m,n] + // tensor->grad), // [m,p] - // // when src0 is bigger than 
tensor->grad (this is the case in llama), + // // when src0 is bigger than tensor->grad (this is mostly the case in llama), // // avoid transpose of src0, rather transpose smaller tensor->grad // // and then use ggml_out_prod - // ggml_out_prod(ctx, // [n,p] - // src0, // [n,m] - // ggml_cont(ctx, // [p,m] - // ggml_transpose(ctx, // [p,m] - // tensor->grad)), // [m,p] + ggml_out_prod(ctx, // [n,p] + src0, // [n,m] + ggml_transpose(ctx, // [p,m] + tensor->grad)), // [m,p] inplace); } } break; + case GGML_OP_OUT_PROD: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_SCALE: { // necessary for llama @@ -13757,6 +13962,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) node->n_tasks = n_threads; } break; case GGML_OP_MUL_MAT: + case GGML_OP_OUT_PROD: { node->n_tasks = n_threads; diff --git a/ggml.h b/ggml.h index 0a0989516265b..aa75fd726b18d 100644 --- a/ggml.h +++ b/ggml.h @@ -292,6 +292,7 @@ extern "C" { GGML_OP_RMS_NORM_BACK, GGML_OP_MUL_MAT, + GGML_OP_OUT_PROD, GGML_OP_SCALE, GGML_OP_SET, @@ -643,14 +644,22 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); - // A: m rows, n columns - // B: p rows, n columns (i.e. we transpose it internally) + // A: n columns, m rows + // B: n columns, p rows (i.e. we transpose it internally) // result is m columns, p rows GGML_API struct ggml_tensor * ggml_mul_mat( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b); + // A: m columns, n rows, + // B: p columns, n rows, + // result is m columns, p rows + GGML_API struct ggml_tensor * ggml_out_prod( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + // // operations on tensors without backpropagation // From f3cf7df21fcd95ca618caa6c57e63483cdc3eb12 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 15 May 2023 14:18:57 +0200 Subject: [PATCH 15/86] better weight initialization improves training convergence at start --- examples/baby-llama/baby-llama.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 5573c154b5622..e5639da37e576 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -79,34 +79,39 @@ struct ggml_tensor * randomize_tensor_normal( int ndims, const int64_t ne[], struct random_normal_distribution * rnd) { + float scale = 1.0; // xavier switch (ndims) { case 1: + scale /= sqrtf(ne[0]); for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i0] = frand_normal(rnd); + ((float *)tensor->data)[i0] = scale * frand_normal(rnd); } break; case 2: + scale /= sqrtf(ne[0]+ne[1]); for (int i1 = 0; i1 < ne[1]; i1++) { for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i1*ne[0] + i0] = frand_normal(rnd); + ((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd); } } break; case 3: + scale /= sqrtf(ne[0]+ne[1]); for (int i2 = 0; i2 < ne[2]; i2++) { for (int i1 = 0; i1 < ne[1]; i1++) { for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd); + ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd); } } } break; case 4: + scale /= sqrtf(ne[0]+ne[1]); for (int i3 = 0; i3 < ne[3]; i3++) { for (int i2 = 0; i2 < ne[2]; i2++) { for (int i1 = 0; i1 < ne[1]; i1++) { for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd); + ((float 
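
The "xavier" comment refers to Glorot/Xavier initialization: drawing weights with a scale of roughly 1/sqrt(fan_in + fan_out) keeps activation and gradient magnitudes from growing or shrinking layer by layer, which is why convergence at the start of training improves. For example, a 128x128 weight gets scale 1/sqrt(256) = 0.0625 instead of the unscaled noise used before. (The baby-llama-text variant in the next patch divides by sqrt(ne[0]*ne[1]) in the 2-D case rather than by the square root of the sum.)
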
*)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd); } } } From 19fb91899bad98423a5a14cea8cfa22a3334432d Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 15 May 2023 14:19:38 +0200 Subject: [PATCH 16/86] better weight initialization improves training convergence at start --- examples/baby-llama/baby-llama-text.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 9f2ff90340289..b5177ed5b99dd 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -51,38 +51,43 @@ float frand_uniform(struct random_uniform_distribution * rnd) { } struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { + float scale = 1.0f; // xavier switch (tensor->n_dims) { case 1: + scale /= sqrtf(tensor->ne[0]); for (int i0 = 0; i0 < tensor->ne[0]; i0++) { float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); - *dst = frand_normal(rnd); + *dst = scale * frand_normal(rnd); } break; case 2: + scale /= sqrtf(tensor->ne[0]*tensor->ne[1]); for (int i1 = 0; i1 < tensor->ne[1]; i1++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) { float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *dst = frand_normal(rnd); + *dst = scale * frand_normal(rnd); } } break; case 3: + scale /= sqrtf(tensor->ne[0]*tensor->ne[1]); for (int i2 = 0; i2 < tensor->ne[2]; i2++) { for (int i1 = 0; i1 < tensor->ne[1]; i1++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) { float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *dst = frand_normal(rnd); + *dst = scale * frand_normal(rnd); } } } break; case 4: + scale /= sqrtf(tensor->ne[0]*tensor->ne[1]); for (int i3 = 0; i3 < tensor->ne[3]; i3++) { for (int i2 = 0; i2 < tensor->ne[2]; i2++) { for (int i1 = 0; i1 < tensor->ne[1]; i1++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) { float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); - *dst = frand_normal(rnd); + *dst = scale * frand_normal(rnd); } } } From ec881156f6ad56ed552c06698083f5a263ff0a6d Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 15 May 2023 14:42:24 +0200 Subject: [PATCH 17/86] improve ggml_out_prod performance - change iteration order (>15s -> 10s runtime) - parallelize over one more dimension: over dst matrix rows (10s -> <5s runtime) --- ggml.c | 51 +++++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/ggml.c b/ggml.c index 77b654809972a..52a9c9bcc0abb 100644 --- a/ggml.c +++ b/ggml.c @@ -9917,51 +9917,50 @@ static void ggml_compute_forward_out_prod_f32( return; } - // parallelize by last two dimensions + // parallelize by last three dimensions - // total parallel in src0 - const int64_t np = ne02*ne03; + // total rows in dst + const int64_t nr = ne1*ne2*ne3; - // per thread - const int64_t dp = (np + nth - 1)/nth; + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; - // range for this thread - const int64_t ip0 = dp*ith; - const int64_t ip1 = MIN(ip0 + dp, np); + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); // dst[:,:,:,:] = 0 // for i2,i3: - // for i01: - // for i1: + // for i1: + // for i01: // for i0: // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] - for (int64_t 
ip = ip0; ip < ip1; ++ip) { - // src0 indices - const int64_t i3 = ip/ne02; - const int64_t i2 = ip - i3*ne02; - + for (int64_t ir = ir0; ir < ir1; ++ir) { + // dst indices + const int64_t i3 = ir/(ne2*ne1); + const int64_t i2 = (ir - i3*ne2*ne1)/ne1; + const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); + const int64_t i02 = i2; const int64_t i03 = i3; + const int64_t i10 = i1; const int64_t i12 = i2; const int64_t i13 = i3; + for (int64_t i01 = 0; i01 < ne01; ++i01) { const int64_t i11 = i01; - for (int64_t i1 = 0; i1 < ne1; ++i1) { - const int64_t i10 = i1; - - float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); - float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); - float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); - ggml_vec_mad_f32(ne0, d, s0, *s1); - // for (int64_t i0 = 0; i0 < ne0; ++i0) { - // d[i0] += s0[i0] * s1[i1]; - // } - } + ggml_vec_mad_f32(ne0, d, s0, *s1); + // for (int64_t i0 = 0; i0 < ne0; ++i0) { + // d[i0] += s0[i0] * s1[i1]; + // } } } From e063135d0bcbdec80b54e182faa7954771d9d989 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 15 May 2023 21:12:28 +0200 Subject: [PATCH 18/86] add llama sampler, shuffle samples and constrain sampling to tokens occurring in train data --- examples/baby-llama/baby-llama-text.cpp | 235 ++++++++++++++++++++++-- 1 file changed, 219 insertions(+), 16 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index b5177ed5b99dd..ed7dc9666e4ba 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -7,6 +7,7 @@ #include #include #include +#include struct random_normal_distribution { @@ -42,6 +43,10 @@ float fclamp(const float v, const float min, const float max) { return ((v < min) ? (min) : (v > max) ? 
(max) : v); } +float frand() { + return (float)rand()/(float)RAND_MAX; +} + float frand_normal(struct random_normal_distribution * rnd) { return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); } @@ -162,6 +167,17 @@ uint32_t get_n_ff(const struct my_llama_hparams* hparams) { return n_ff; } +void print_params(struct my_llama_hparams * params) { + printf("%s: n_vocab: %d\n", __func__, params->n_vocab); + printf("%s: n_ctx: %d\n", __func__, params->n_ctx); + printf("%s: n_embd: %d\n", __func__, params->n_embd); + printf("%s: n_mult: %d\n", __func__, params->n_mult); + printf("%s: n_head: %d\n", __func__, params->n_head); + printf("%s: n_ff: %d\n", __func__, get_n_ff(params)); + printf("%s: n_layer: %d\n", __func__, params->n_layer); + printf("%s: n_rot: %d\n", __func__, params->n_rot); +} + struct my_llama_layer { // normalization struct ggml_tensor * attention_norm; @@ -989,18 +1005,17 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) } } -void get_example_targets(const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { +void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { int n_tokens = tokens_input->ne[0]; int n_vocab = targets->ne[0]; - int n_examples = (n_train_data / (size_t) n_tokens); - int begin = (example_id % n_examples) * n_tokens; - GGML_ASSERT(begin+n_tokens-1 < n_train_data); + int sample = train_samples[example_id % n_train_samples]; + GGML_ASSERT(sample+n_tokens-1 < n_train_data); - ggml_set_f32(targets, -1.0f); + ggml_set_f32(targets, -1.0f/n_vocab); ggml_set_i32_1d(tokens_input, 0, llama_token_bos()); for (int i=1; in_dims == 2); GGML_ASSERT( targets->n_dims == 3); int n_tokens = tokens_input->ne[0]; @@ -1028,7 +1043,7 @@ void get_example_targets_batch(struct ggml_context * ctx, const llama_token * tr targets->nb[1], k*targets->nb[2]); - get_example_targets(train_data, n_train_data, + get_example_targets(train_samples, n_train_samples, train_data, n_train_data, example_id*n_batch + k, tokens_input_k, targets_k); } } @@ -1171,10 +1186,11 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto struct llama_file f(filename, "rb"); std::vector buf; - buf.resize(f.size); + buf.resize(f.size+1); f.read_raw(buf.data(), f.size); - + buf[f.size] = '\0'; + out.resize(buf.size()); int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false); @@ -1186,6 +1202,143 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto return n_tokens; } +void shuffle_ints(int * begin, int * end) { + if (end <= begin) return; + int max=begin[0]; + for (int i=1; i max) { + max = begin[i]; + } + } + std::vector vals; + vals.resize(max+1); + for (int i=0; i candidates; + llama_token_data_array candidates_p; + +}; + +void init_sampler(struct my_llama_sampler * sampler, struct llama_context * ctx) { + sampler->ctx = ctx; + sampler->n_vocab = llama_n_vocab(sampler->ctx); + sampler->n_ctx = llama_n_ctx(sampler->ctx); + sampler->mirostat_mu = 2.0f * sampler->params.mirostat_tau; +} + +llama_token sample(struct my_llama_sampler * sampler, float * logits, const llama_token * last_tokens, int n_last_tokens) { + GGML_ASSERT(sampler->ctx != NULL); + + struct llama_context * ctx = sampler->ctx; + + sampler->candidates.resize(sampler->n_vocab); + for (llama_token token_id = 0; 
token_id < sampler->n_vocab; ++token_id) { + sampler->candidates[token_id].id = token_id; + sampler->candidates[token_id].logit = logits[token_id]; + sampler->candidates[token_id].p = 0.0; + } + + llama_token_data_array * candidates_p = & sampler->candidates_p; + + candidates_p->data = sampler->candidates.data(); + candidates_p->size = sampler->candidates.size(); + candidates_p->sorted = false; + + const auto params = sampler->params; + + // Apply penalties + const float nl_logit = logits[llama_token_nl()]; + + const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx); + + llama_sample_repetition_penalty( + ctx, + candidates_p, + last_tokens + n_last_tokens - n_last, + n_last, + params.repeat_penalty); + llama_sample_frequency_and_presence_penalties( + ctx, + candidates_p, + last_tokens + n_last_tokens - n_last, + n_last, + params.alpha_frequency, + params.alpha_presence); + + if (!params.penalize_nl) { + logits[llama_token_nl()] = nl_logit; + } + + llama_token token = 0; + if (params.temp <= 0) { + // Greedy sampling + token = llama_sample_token_greedy(ctx, candidates_p); + } else { + if (params.mirostat == 1) { + int mirostat_m = 100; + llama_sample_temperature(ctx, candidates_p, params.temp); + token = llama_sample_token_mirostat(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, mirostat_m, &sampler->mirostat_mu); + } else if (params.mirostat == 2) { + llama_sample_temperature(ctx, candidates_p, params.temp); + token = llama_sample_token_mirostat_v2(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, &sampler->mirostat_mu); + } else { + // Temperature sampling + llama_sample_top_k (ctx, candidates_p, params.top_k, 1); + llama_sample_tail_free (ctx, candidates_p, params.tfs_z, 1); + llama_sample_typical (ctx, candidates_p, params.typical_p, 1); + + llama_sample_top_p (ctx, candidates_p, params.top_p, 1); + llama_sample_temperature (ctx, candidates_p, params.temp); + token = llama_sample_token(ctx, candidates_p); + } + } + return token; +} + +void set_logits_masked(struct ggml_tensor * logits, std::vector& mask, float value) { + GGML_ASSERT(logits->ne[0] == mask.size()); + for (int i2 = 0; i2 < logits->ne[2]; ++i2) { + for (int i1 = 0; i1 < logits->ne[1]; ++i1) { + for (int i0 = 0; i0 < logits->ne[0]; ++i0) { + if (!mask[i0]) continue; + float * ptr = (float *) ((char *) logits->data + i2*logits->nb[2] + i1*logits->nb[1] + i0*logits->nb[0]); + *ptr = value; + } + } + } +} + int main(int argc, char ** argv) { const char * default_model = "ggml-vic7b-uncensored-q4_0.bin"; const char * default_train = "shakespeare.txt"; @@ -1220,6 +1373,17 @@ int main(int argc, char ** argv) { model.hparams.n_layer = 1; model.hparams.n_rot = std::min(16u, model.hparams.n_embd / model.hparams.n_head); + print_params(&model.hparams); + + std::vector token_occurs; + std::vector token_notavail; + token_occurs.resize(model.hparams.n_vocab, false); + token_notavail.resize(model.hparams.n_vocab, true); + for (int i=0; i train_samples; + for (int i=0; i= train_samples.size()) { + shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); + for (int i=0; idata + i*logits->nb[2] + k*logits->nb[1]), + (llama_token *) ((char *) tokens_input->data + i*tokens_input->nb[1]), + k); + * ((int32_t *) ((char *) after_opt_best_samples->data + i*after_opt_best_samples->nb[1] + k*after_opt_best_samples->nb[0])) = token; + } + } + + // sample_softmax_batch(ctx0, logits, after_opt_probs, after_opt_best_samples); // printf("probabilities after 
optimization:\n"); // print_matrix(after_opt_probs); printf("Example:\n---\n"); print_tokens_batch(lctx, tokens_input); printf("\n---\n"); - printf("best samples after optimization:\n---\n"); + // printf("best samples after optimization:\n---\n"); + printf("samples after optimization:\n---\n"); print_tokens_batch(lctx, after_opt_best_samples); printf("\n---\n"); } @@ -1320,13 +1517,15 @@ int main(int argc, char ** argv) { { int n_gen = 128; int sample_ctx = n_tokens - n_tokens/8; + + init_sampler(&sampler, lctx); printf("Generating %d tokens.\n", n_gen); struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens); struct ggml_tensor * targets = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); - get_example_targets(train_tokens.data(), train_tokens.size(), 137, tokens_input, targets); + get_example_targets(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), 137, tokens_input, targets); for (int i=sample_ctx; idata + (sample_ctx-1)*logits->nb[1]), + (llama_token *) tokens_input->data, + sample_ctx-1); + // sample_softmax(logits, probs, best_samples); + //int token = ggml_get_i32_1d(best_samples, sample_ctx-1); // print_row(probs, sample_at); print_token(lctx, token); From d328472f16b55ea206b1a30fa043c0e5df444bc6 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 17 May 2023 12:52:10 +0200 Subject: [PATCH 19/86] fix get_samples call, add model tensor names, increase model size, start training samples after newline --- examples/baby-llama/baby-llama-text.cpp | 117 +++++++++++++++--------- 1 file changed, 72 insertions(+), 45 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index ed7dc9666e4ba..2de3171f1c675 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -8,6 +8,7 @@ #include #include #include +#include struct random_normal_distribution { @@ -162,22 +163,6 @@ struct my_llama_hparams { } }; -uint32_t get_n_ff(const struct my_llama_hparams* hparams) { - const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; - return n_ff; -} - -void print_params(struct my_llama_hparams * params) { - printf("%s: n_vocab: %d\n", __func__, params->n_vocab); - printf("%s: n_ctx: %d\n", __func__, params->n_ctx); - printf("%s: n_embd: %d\n", __func__, params->n_embd); - printf("%s: n_mult: %d\n", __func__, params->n_mult); - printf("%s: n_head: %d\n", __func__, params->n_head); - printf("%s: n_ff: %d\n", __func__, get_n_ff(params)); - printf("%s: n_layer: %d\n", __func__, params->n_layer); - printf("%s: n_rot: %d\n", __func__, params->n_rot); -} - struct my_llama_layer { // normalization struct ggml_tensor * attention_norm; @@ -221,6 +206,22 @@ struct my_llama_model { std::vector layers; }; +uint32_t get_n_ff(const struct my_llama_hparams* hparams) { + const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; + return n_ff; +} + +void print_params(struct my_llama_hparams * params) { + printf("%s: n_vocab: %d\n", __func__, params->n_vocab); + printf("%s: n_ctx: %d\n", __func__, params->n_ctx); + printf("%s: n_embd: %d\n", __func__, params->n_embd); + printf("%s: n_mult: %d\n", __func__, params->n_mult); + printf("%s: n_head: %d\n", __func__, params->n_head); + printf("%s: n_ff: %d\n", __func__, get_n_ff(params)); + printf("%s: n_layer: %d\n", __func__, params->n_layer); + printf("%s: n_rot: %d\n", __func__, params->n_rot); +} + void 
init_model(struct my_llama_model * model) { const auto & hparams = model->hparams; @@ -232,32 +233,48 @@ void init_model(struct my_llama_model * model) { struct ggml_context * ctx = model->ctx; - model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab}); - model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // ("norm.weight", {n_embd}); - model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("output.weight", {n_embd, n_vocab}); + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); + model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); + + ggml_set_name(model->tok_embeddings, "tok_embeddings.weight"); + ggml_set_name(model->norm, "norm.weight"); + ggml_set_name(model->output, "output.weight"); model->layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model->layers[i]; - // std::string layers_i = "layers." + std::to_string(i); + std::string layers_i = "layers." + std::to_string(i); + + layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + + layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd}); + layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); + layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); + layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); - layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd}); - layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd}); - layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd}); - layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd}); + ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str()); + + ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str()); + ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str()); + ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str()); + ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str()); - layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd}); + ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str()); - layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}); - layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}); - layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}); + ggml_set_name(layer.w1, (layers_i + ".feed_forward.w1.weight").c_str()); + ggml_set_name(layer.w2, (layers_i + ".feed_forward.w2.weight").c_str()); + ggml_set_name(layer.w3, (layers_i + 
".feed_forward.w3.weight").c_str()); } } - void set_param_model(struct my_llama_model * model) { const auto& hparams = model->hparams; @@ -676,7 +693,6 @@ struct ggml_tensor * forward_batch( cur), n_embd, N, n_batch), 1, 0, 2, 3)); - assert_shape_3d(Vcur, N, n_embd, n_batch); // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] @@ -1366,12 +1382,12 @@ int main(int argc, char ** argv) { struct my_llama_model model; model.hparams.n_vocab = llama_n_vocab(lctx); - model.hparams.n_ctx = 16; - model.hparams.n_embd = 64; - model.hparams.n_mult = 8; - model.hparams.n_head = 8; - model.hparams.n_layer = 1; - model.hparams.n_rot = std::min(16u, model.hparams.n_embd / model.hparams.n_head); + model.hparams.n_ctx = 32; + model.hparams.n_embd = 128; + model.hparams.n_mult = 64; + model.hparams.n_head = 16; + model.hparams.n_layer = 4; + model.hparams.n_rot = std::min(64u, model.hparams.n_embd / model.hparams.n_head); print_params(&model.hparams); @@ -1383,13 +1399,18 @@ int main(int argc, char ** argv) { token_occurs[train_tokens[i]] = true; token_notavail[train_tokens[i]] = false; } + int n_unique_tokens = 0; + for (int i=0; i train_samples; - for (int i=0; i Date: Wed, 17 May 2023 13:49:32 +0200 Subject: [PATCH 20/86] save train trained model to checkpoint and load model to be trained from checkpoint --- examples/baby-llama/baby-llama-text.cpp | 149 ++++++++++++++++++++++-- 1 file changed, 140 insertions(+), 9 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 2de3171f1c675..542b5e3866530 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -204,6 +204,9 @@ struct my_llama_model { struct ggml_tensor * output; std::vector layers; + + uint32_t train_its = 0; + uint32_t train_samples = 0; }; uint32_t get_n_ff(const struct my_llama_hparams* hparams) { @@ -1124,11 +1127,12 @@ struct llama_file { llama_file(const char * fname, const char * mode) { fp = std::fopen(fname, mode); if (fp == NULL) { - throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); + size = 0; + } else { + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); } size_t tell() const { @@ -1355,18 +1359,135 @@ void set_logits_masked(struct ggml_tensor * logits, std::vector& mask, flo } } +enum llama_file_version { + LLAMA_FILE_VERSION_GGML, + LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab + LLAMA_FILE_VERSION_GGJT_V1, // added padding + LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format +}; + +void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { + const char * name = ggml_get_name(tensor); + uint32_t name_len = strlen(name); + uint32_t nd = tensor->n_dims; + uint32_t ne[4] = { tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3] }; + file->write_u32(nd); + file->write_u32(name_len); + file->write_u32(tensor->type); + file->write_raw(ne, sizeof(ne[0]) * nd); + file->write_raw(name, name_len); + file->seek(-file->tell() & 31, SEEK_CUR); + file->write_raw(tensor->data, ggml_nbytes(tensor)); +} + +void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { + uint32_t nd = file->read_u32(); + GGML_ASSERT(nd == tensor->n_dims); + uint32_t name_len = file->read_u32(); + enum ggml_type type = (enum ggml_type) file->read_u32(); + GGML_ASSERT(type == tensor->type); + uint32_t ne[4]; + file->read_raw(ne, sizeof(ne[0]) * nd); + for (int i=0; ine[i]); + } + std::string name = 
file->read_string(name_len); + file->seek(-file->tell() & 31, SEEK_CUR); + + GGML_ASSERT(strcmp(ggml_get_name(tensor), name.c_str()) == 0); + file->read_raw(tensor->data, ggml_nbytes(tensor)); +} + +void save_model(struct my_llama_model * model, const char * filename) { + struct llama_file file(filename, "wb"); + if (file.fp == NULL) { + return; + } + file.write_u32(model->train_its); + file.write_u32(model->train_samples); + file.write_u32(model->hparams.n_vocab); + file.write_u32(model->hparams.n_embd); + file.write_u32(model->hparams.n_mult); + file.write_u32(model->hparams.n_head); + file.write_u32(model->hparams.n_layer); + file.write_u32(model->hparams.n_rot); + + write_tensor(&file, model->tok_embeddings); + write_tensor(&file, model->norm); + write_tensor(&file, model->output); + + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + auto & layer = model->layers[i]; + + write_tensor(&file, layer.attention_norm); + write_tensor(&file, layer.wq); + write_tensor(&file, layer.wk); + write_tensor(&file, layer.wv); + write_tensor(&file, layer.wo); + write_tensor(&file, layer.ffn_norm); + write_tensor(&file, layer.w1); + write_tensor(&file, layer.w2); + write_tensor(&file, layer.w3); + } +} + +void load_model(struct my_llama_model * model, const char * filename, bool init) { + struct llama_file file(filename, "rb"); + + if (file.fp) { + printf("%s: Loading model from '%s'.\n", __func__, filename); + model->train_its = file.read_u32(); + model->train_samples = file.read_u32(); + model->hparams.n_vocab = file.read_u32(); + model->hparams.n_embd = file.read_u32(); + model->hparams.n_mult = file.read_u32(); + model->hparams.n_head = file.read_u32(); + model->hparams.n_layer = file.read_u32(); + model->hparams.n_rot = file.read_u32(); + printf("%s: Training iterations: %u.\n", __func__, model->train_its); + printf("%s: Training samples: %u.\n", __func__, model->train_samples); + print_params(&model->hparams); + } + + if (init) { + init_model(model); + } + + if (file.fp) { + read_tensor(&file, model->tok_embeddings); + read_tensor(&file, model->norm); + read_tensor(&file, model->output); + + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + auto & layer = model->layers[i]; + + read_tensor(&file, layer.attention_norm); + read_tensor(&file, layer.wq); + read_tensor(&file, layer.wk); + read_tensor(&file, layer.wv); + read_tensor(&file, layer.wo); + read_tensor(&file, layer.ffn_norm); + read_tensor(&file, layer.w1); + read_tensor(&file, layer.w2); + read_tensor(&file, layer.w3); + } + } +} + int main(int argc, char ** argv) { const char * default_model = "ggml-vic7b-uncensored-q4_0.bin"; const char * default_train = "shakespeare.txt"; - const char * default_argv[3] = {argv[0], default_model, default_train}; + const char * default_checkpoint = "checkpoint.bin"; + const char * default_argv[4] = {argv[0], default_model, default_train, default_checkpoint}; - if (argc < 3) { + if (argc < 4) { fprintf(stderr, "usage: %s model training_data\n", argv[0]); //return 1; } const char * fn_model = (argc >= 2) ? argv[1] : default_argv[1]; const char * fn_train = (argc >= 3) ? argv[2] : default_argv[2]; + const char * fn_chkpt = (argc >= 4) ? 
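
For reference, the checkpoint written here is a flat binary file: a header of eight u32 fields (train_its, train_samples, n_vocab, n_embd, n_mult, n_head, n_layer, n_rot; n_ctx is not stored), followed by one record per tensor in the fixed order used by save_model. Each record, as laid out by write_tensor, is:

    u32 n_dims | u32 name_len | u32 type | u32 ne[n_dims] | name bytes | padding up to the next 32-byte file offset | raw tensor data (ggml_nbytes)

load_model reads the hparams first, allocates matching tensors via init_model, and read_tensor then asserts that dimensions, type and name of each record match the tensor it is read into, so a checkpoint can only be loaded into a model with identical hyperparameters.
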
argv[3] : default_argv[3]; struct llama_context_params llama_params = llama_context_default_params(); llama_params.vocab_only = true; @@ -1420,7 +1541,7 @@ int main(int argc, char ** argv) { my_llama_sampler sampler; printf("%s: init model\n", __func__); - init_model(&model); + load_model(&model, fn_chkpt, true); set_param_model(&model); randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f); init_kv_cache(&kv_self, &model, n_batch); @@ -1498,8 +1619,16 @@ int main(int argc, char ** argv) { opt_params_lbfgs.print_backward_graph = false; opt_params_lbfgs.n_threads = gf.n_threads; opt_params_lbfgs.lbfgs.n_iter = 16; - ggml_opt(ctx0, opt_params_adam, e); - // ggml_opt(ctx0, opt_params_lbfgs, e); + + bool use_adam = true; + if (use_adam) { + ggml_opt(ctx0, opt_params_adam, e); + } else { + ggml_opt(ctx0, opt_params_lbfgs, e); + } + + model.train_its += use_adam ? opt_params_adam.adam.n_iter : opt_params_lbfgs.lbfgs.n_iter; + model.train_samples += n_batch; ggml_build_forward_expand(&gf, e); ggml_graph_compute(ctx0, &gf); @@ -1541,6 +1670,8 @@ int main(int argc, char ** argv) { ggml_free(ctx0); } + save_model(&model, fn_chkpt); + { int n_gen = 128; int sample_ctx = n_tokens - n_tokens/8; From 25fe1c3815eec962d70516993844635d116bf30f Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 19 May 2023 14:53:21 +0200 Subject: [PATCH 21/86] use inplace functions where possible --- examples/baby-llama/baby-llama-text.cpp | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 542b5e3866530..7517e203737e3 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -421,8 +421,8 @@ struct ggml_tensor * forward( // wk shape [n_embd, n_embd, 1, 1] // Qcur shape [n_embd/n_head, n_head, N, 1] // Kcur shape [n_embd/n_head, n_head, N, 1] - struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); - struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); // store key and value to memory { @@ -447,8 +447,8 @@ struct ggml_tensor * forward( ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } //*/ - kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - vc = ggml_set_2d(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v), + kc = ggml_set_1d_inplace(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + vc = ggml_set_2d_inplace(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); } @@ -678,8 +678,8 @@ struct ggml_tensor * forward_batch( // wk shape [n_embd, n_embd, 1, 1] // Qcur shape [n_embd/n_head, n_head, N, n_batch] // Kcur shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, 
n_head, N, n_batch), n_past, n_rot, 0); - struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); @@ -714,11 +714,11 @@ struct ggml_tensor * forward_batch( ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } //*/ - kc = ggml_set_2d(ctx0, kc, + kc = ggml_set_2d_inplace(ctx0, kc, ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch), ggml_element_size(kc)*n_embd*n_ctx, (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past)); - vc = ggml_set_2d(ctx0, vc, + vc = ggml_set_2d_inplace(ctx0, vc, ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch), ggml_element_size(vc)*n_ctx*n_embd, ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx)); @@ -760,19 +760,19 @@ struct ggml_tensor * forward_batch( // KQ_scaled = KQ / sqrt(n_embd/n_head) // KQ_scaled shape [n_past + N, N, n_head, n_batch] struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, + ggml_scale_inplace(ctx0, KQ, ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch); // KQ_masked = mask_past(KQ_scaled) // KQ_masked shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch); // KQ = soft_max(KQ_masked) // KQ_soft_max shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch); // split cached V into n_head heads @@ -816,7 +816,7 @@ struct ggml_tensor * forward_batch( // lctx.use_buf(ctx0, 1); // inpFF shape [n_embd,N*n_batch,1,1] - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); assert_shape_2d(inpFF, n_embd, N*n_batch); // feed-forward network @@ -864,7 +864,7 @@ struct ggml_tensor * forward_batch( } // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_add(ctx0, cur, inpFF); + cur = ggml_add_inplace(ctx0, cur, inpFF); assert_shape_2d(cur, n_embd, N*n_batch); // input for next layer From d8b066642965f738c48594a944c3f63e607b0f70 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 19 May 2023 18:29:47 +0200 Subject: [PATCH 22/86] initialize rng with srand --- examples/baby-llama/baby-llama-text.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 7517e203737e3..aa8c3ace49564 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -3,10 +3,12 @@ #include #include #include -#include #include #include #include +#include +#include +#include #include #include @@ -1485,6 +1487,8 @@ int main(int argc, char ** argv) { //return 1; } + srand(time(NULL)); + const char * fn_model = (argc >= 2) ? 
argv[1] : default_argv[1]; const char * fn_train = (argc >= 3) ? argv[2] : default_argv[2]; const char * fn_chkpt = (argc >= 4) ? argv[3] : default_argv[3]; From 44d83558bc9783831c6353799324ff8d7ff089b8 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 19 May 2023 18:34:18 +0200 Subject: [PATCH 23/86] use different arguments for input and output checkpoint --- examples/baby-llama/baby-llama-text.cpp | 48 +++++++++++++++---------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index aa8c3ace49564..5c019f4bba009 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1433,7 +1433,7 @@ void save_model(struct my_llama_model * model, const char * filename) { } } -void load_model(struct my_llama_model * model, const char * filename, bool init) { +bool load_model(struct my_llama_model * model, const char * filename, bool init) { struct llama_file file(filename, "rb"); if (file.fp) { @@ -1474,24 +1474,28 @@ void load_model(struct my_llama_model * model, const char * filename, bool init) read_tensor(&file, layer.w3); } } + + return (file.fp != NULL); } int main(int argc, char ** argv) { const char * default_model = "ggml-vic7b-uncensored-q4_0.bin"; const char * default_train = "shakespeare.txt"; - const char * default_checkpoint = "checkpoint.bin"; - const char * default_argv[4] = {argv[0], default_model, default_train, default_checkpoint}; + const char * default_chkpt_in = "checkpoint.bin"; + const char * default_chkpt_out = "checkpoint.bin"; + const char * default_argv[5] = {argv[0], default_model, default_train, default_chkpt_in, default_chkpt_out}; - if (argc < 4) { - fprintf(stderr, "usage: %s model training_data\n", argv[0]); + if (argc < 5) { + fprintf(stderr, "usage: %s model training_data chkpt_in chkpt_out\n", argv[0]); //return 1; } srand(time(NULL)); - const char * fn_model = (argc >= 2) ? argv[1] : default_argv[1]; - const char * fn_train = (argc >= 3) ? argv[2] : default_argv[2]; - const char * fn_chkpt = (argc >= 4) ? argv[3] : default_argv[3]; + const char * fn_model = (argc >= 2) ? argv[1] : default_argv[1]; + const char * fn_train = (argc >= 3) ? argv[2] : default_argv[2]; + const char * fn_chkpt_in = (argc >= 4) ? argv[3] : default_argv[3]; + const char * fn_chkpt_out = (argc >= 5) ? argv[4] : default_argv[4]; struct llama_context_params llama_params = llama_context_default_params(); llama_params.vocab_only = true; @@ -1516,17 +1520,20 @@ int main(int argc, char ** argv) { print_params(&model.hparams); - std::vector token_occurs; - std::vector token_notavail; - token_occurs.resize(model.hparams.n_vocab, false); + std::vector token_noccurs; + std::vector token_notavail; + token_noccurs.resize(model.hparams.n_vocab, 0); token_notavail.resize(model.hparams.n_vocab, true); for (int i=0; i token_freq; + token_freq.resize(model.hparams.n_vocab, 0); int n_unique_tokens = 0; - for (int i=0; i 0) ? 
1 : 0; } printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens); @@ -1545,9 +1552,12 @@ int main(int argc, char ** argv) { my_llama_sampler sampler; printf("%s: init model\n", __func__); - load_model(&model, fn_chkpt, true); + bool existed = load_model(&model, fn_chkpt_in, true); + bool from_scratch = !existed; set_param_model(&model); - randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f); + if (from_scratch) { + randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f); + } init_kv_cache(&kv_self, &model, n_batch); init_sampler(&sampler, lctx); @@ -1559,10 +1569,12 @@ int main(int argc, char ** argv) { int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; + bool samples_start_after_nl = false; + std::vector train_samples; train_samples.push_back(0); for (int i=1; i Date: Fri, 19 May 2023 18:35:40 +0200 Subject: [PATCH 24/86] ggml fixes to support backward pass on inplace operations --- ggml.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ggml.c b/ggml.c index 52a9c9bcc0abb..7039a3cec45b7 100644 --- a/ggml.c +++ b/ggml.c @@ -4334,7 +4334,7 @@ struct ggml_tensor * ggml_add_impl( bool is_node = false; - if (!inplace && (a->grad || b->grad)) { + if (a->grad || b->grad) { is_node = true; } @@ -5248,7 +5248,7 @@ struct ggml_tensor * ggml_set_impl( bool is_node = false; - if (!inplace && (a->grad || b->grad)) { + if (a->grad || b->grad) { is_node = true; } @@ -6016,7 +6016,7 @@ struct ggml_tensor * ggml_rope_impl( GGML_ASSERT(n_past >= 0); bool is_node = false; - if (!inplace && a->grad) { + if (a->grad) { is_node = true; } @@ -6065,8 +6065,7 @@ struct ggml_tensor * ggml_rope_back( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; + is_node = false; // TODO: implement backward } struct ggml_tensor * result = ggml_dup_tensor(ctx, a); From 09b304d01540ac2efb00ef3b1d9706da08d0c2bf Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 19 May 2023 18:36:05 +0200 Subject: [PATCH 25/86] remove duplicate include --- examples/baby-llama/baby-llama-text.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 5c019f4bba009..b56441f9a7ec2 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include From da86a1d736f02793ea6a0942d6a5a72427fe55b8 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 19 May 2023 18:39:38 +0200 Subject: [PATCH 26/86] fix cross entropy loss - add target probabilities for each sample which is then used in cross entropy loss --- examples/baby-llama/baby-llama-text.cpp | 111 ++++++++++++++---------- 1 file changed, 64 insertions(+), 47 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index b56441f9a7ec2..e65d2d1867c32 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1025,78 +1025,93 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) } } -void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { +void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct 
ggml_tensor * target_logits, struct ggml_tensor * target_probs) { int n_tokens = tokens_input->ne[0]; - int n_vocab = targets->ne[0]; + int n_vocab = target_logits->ne[0]; + + const float eps = 1e-6f; + const float target_prob = 1.0f; int sample = train_samples[example_id % n_train_samples]; GGML_ASSERT(sample+n_tokens-1 < n_train_data); - ggml_set_f32(targets, -1.0f/n_vocab); + ggml_set_f32(target_logits, -1.0f/n_vocab); + ggml_set_f32(target_probs, 0.0f); ggml_set_i32_1d(tokens_input, 0, llama_token_bos()); for (int i=1; in_dims == 2); - GGML_ASSERT( targets->n_dims == 3); +void get_example_targets_batch(struct ggml_context * ctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { + GGML_ASSERT(tokens_input->n_dims == 2); + GGML_ASSERT(target_logits->n_dims == 3); + GGML_ASSERT(target_probs->n_dims == 3); int n_tokens = tokens_input->ne[0]; int n_batch = tokens_input->ne[1]; - GGML_ASSERT(n_tokens == targets->ne[1]); - GGML_ASSERT(n_batch == targets->ne[2]); + GGML_ASSERT(n_tokens == target_logits->ne[1]); + GGML_ASSERT(n_batch == target_logits->ne[2]); + GGML_ASSERT(n_tokens == target_probs->ne[1]); + GGML_ASSERT(n_batch == target_probs->ne[2]); for (int k=0; kne[0], k*tokens_input->nb[1]); - struct ggml_tensor * targets_k = ggml_view_2d(ctx, - targets, - targets->ne[0], - targets->ne[1], - targets->nb[1], - k*targets->nb[2]); + struct ggml_tensor * target_logits_k = ggml_view_2d(ctx, + target_logits, + target_logits->ne[0], + target_logits->ne[1], + target_logits->nb[1], + k*target_logits->nb[2]); + + struct ggml_tensor * target_probs_k = ggml_view_2d(ctx, + target_probs, + target_probs->ne[0], + target_probs->ne[1], + target_probs->nb[1], + k*target_probs->nb[2]); get_example_targets(train_samples, n_train_samples, train_data, n_train_data, - example_id*n_batch + k, tokens_input_k, targets_k); + example_id*n_batch + k, tokens_input_k, target_logits_k, target_probs_k); } } -void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) { +void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs, int n_shift) { int n_tokens = tokens_input->ne[0]; - int n_vocab = targets->ne[0]; + int n_vocab = target_logits->ne[0]; for (int i=0; idata + (sample_ctx-1)*logits->nb[1]), (llama_token *) tokens_input->data, @@ -1739,7 +1756,7 @@ int main(int argc, char ** argv) { // print_row(probs, sample_at); print_token(lctx, token); - lshift_examples(tokens_input, targets, 1); + lshift_examples(tokens_input, target_logits, target_probs, 1); ggml_set_i32_1d(tokens_input, 0, 0); ggml_set_i32_1d(tokens_input, sample_ctx-1, token); From e19ead6e3f30a8c0c944e238d90caf5902c92415 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 19 May 2023 18:40:20 +0200 Subject: [PATCH 27/86] print used memory before and after optimization --- examples/baby-llama/baby-llama-text.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index e65d2d1867c32..099863bb8ad0c 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1637,6 +1637,8 @@ int main(int argc, char ** argv) { ggml_build_forward_expand(&gf, e); ggml_graph_compute(ctx0, &gf); + size_t used_mem_before_opt = ggml_used_mem(ctx0); + float error_before_opt = 
ggml_get_f32_1d(e, 0); struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); @@ -1645,6 +1647,7 @@ int main(int argc, char ** argv) { opt_params_adam.print_backward_graph = false; opt_params_adam.n_threads = gf.n_threads; opt_params_adam.adam.n_iter = 16; + opt_params_adam.adam.alpha = 1e-4; opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_backward_graph = false; @@ -1658,6 +1661,8 @@ int main(int argc, char ** argv) { ggml_opt(ctx0, opt_params_lbfgs, e); } + size_t used_mem_after_opt = ggml_used_mem(ctx0); + model.train_its += use_adam ? opt_params_adam.adam.n_iter : opt_params_lbfgs.lbfgs.n_iter; model.train_samples += n_batch; @@ -1666,6 +1671,9 @@ int main(int argc, char ** argv) { float error_after_opt = ggml_get_f32_1d(e, 0); + printf("used_mem_before_opt: %zu bytes\n", used_mem_before_opt); + printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); + if (ex % 1 == 0) { printf("Example %d\n", ex); printf("error_before_opt: %.6f\n", error_before_opt); From 332003584eb39bbd4465d69666033155e087ef46 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 19 May 2023 18:41:06 +0200 Subject: [PATCH 28/86] sample with non-greedy sampling parameters at the end of training --- examples/baby-llama/baby-llama-text.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 099863bb8ad0c..beacf46861a96 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1712,9 +1712,12 @@ int main(int argc, char ** argv) { save_model(&model, fn_chkpt_out); { - int n_gen = 128; + int n_gen = 1024; int sample_ctx = n_tokens - n_tokens/8; + sampler.params.temp = 0.2; + sampler.params.repeat_penalty = 1.1; + sampler.params.mirostat = 2; init_sampler(&sampler, lctx); printf("Generating %d tokens.\n", n_gen); From 08a330a13661b1623c866a75fe95faee0074285f Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 19 May 2023 18:41:26 +0200 Subject: [PATCH 29/86] add cmake target for baby-llama-text --- examples/baby-llama/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt index d2ce36367474f..c89dc792b9a0e 100644 --- a/examples/baby-llama/CMakeLists.txt +++ b/examples/baby-llama/CMakeLists.txt @@ -2,3 +2,7 @@ set(TARGET baby-llama) add_executable(${TARGET} baby-llama.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) + +add_executable(${TARGET}-text baby-llama-text.cpp) +target_link_libraries(${TARGET}-text PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET}-text PRIVATE cxx_std_11) From a6aafdd719c7ce0dbcc9a182c6131125039d8ffb Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 20 May 2023 14:47:56 +0200 Subject: [PATCH 30/86] add ggml_add1_inplace to header --- ggml.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ggml.h b/ggml.h index aa75fd726b18d..62da0bd3553dd 100644 --- a/ggml.h +++ b/ggml.h @@ -520,6 +520,11 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_add1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_acc( struct ggml_context * ctx, struct ggml_tensor * a, From f4e9ce79989e8fc62310ca33919346cd0ec79a07 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 20 May 2023 14:49:19 +0200 
Subject: [PATCH 31/86] enable gradient propagation for inplace add1 and scale operations those functions backward passes don't need the original src0, so they also work when forward is inplace --- ggml.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 7039a3cec45b7..7b66846ee73b6 100644 --- a/ggml.c +++ b/ggml.c @@ -4374,7 +4374,7 @@ struct ggml_tensor * ggml_add1_impl( bool is_node = false; - if (!inplace && (a->grad || b->grad)) { + if (a->grad || b->grad) { is_node = true; } @@ -5205,7 +5205,7 @@ struct ggml_tensor * ggml_scale_impl( bool is_node = false; - if (!inplace && (a->grad || b->grad)) { + if (a->grad || b->grad) { is_node = true; } From ef17d99f657aef57b977ce33fc13345f467b1f44 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 20 May 2023 14:54:40 +0200 Subject: [PATCH 32/86] implement AdamW in ggml_opt_adam by adding weight decay parameter (default 0.001f) also add a schedule parameter (default 1.0f) that can be used to scale alpha and decay according to learning schedule. setting the decay parameter to zero disables AdamW resulting in normal Adam optimizer. since the difference between Adam and AdamW is minimal it is not implemented as another optimizer, but integrated into the existing Adam optimizer. --- ggml.c | 11 +++++++++-- ggml.h | 2 ++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 7b66846ee73b6..f259f5605ad6a 100644 --- a/ggml.c +++ b/ggml.c @@ -14603,7 +14603,9 @@ static enum ggml_opt_result ggml_opt_adam( } // constants - const float alpha = params.adam.alpha; + const float sched = params.adam.sched; + const float decay = params.adam.decay * sched; + const float alpha = params.adam.alpha * sched; const float beta1 = params.adam.beta1; const float beta2 = params.adam.beta2; const float eps = params.adam.eps; @@ -14673,7 +14675,11 @@ static enum ggml_opt_result ggml_opt_adam( // m^hat = m_t / (1 - beta1^t) // v^hat = v_t / (1 - beta2^t) - // x_t = x_t-1 - alpha*m^hat/(sqrt(v^hat) + eps) + // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1) + // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1 + // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps) + // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps) + // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay) ggml_vec_cpy_f32 (nx, mh, m); ggml_vec_cpy_f32 (nx, vh, v); @@ -14684,6 +14690,7 @@ static enum ggml_opt_result ggml_opt_adam( ggml_vec_acc1_f32 (nx, vh, eps); ggml_vec_div_f32 (nx, mh, mh, vh); + ggml_vec_scale_f32(nx, x, 1.0f - decay); ggml_vec_sub_f32 (nx, x, x, mh); // update the parameters diff --git a/ggml.h b/ggml.h index 62da0bd3553dd..6ce660c7454d4 100644 --- a/ggml.h +++ b/ggml.h @@ -1055,6 +1055,8 @@ extern "C" { struct { int n_iter; + float sched; // schedule multiplier (fixed, decay or warmup) + float decay; // weight decay for AdamW, use 0.0f to disable float alpha; // learning rate float beta1; float beta2; From 96514971dddaaba3e4424b822224f665c52c13ee Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 21 May 2023 12:17:57 +0200 Subject: [PATCH 33/86] use inplace operations in cross_entropy_loss --- examples/baby-llama/baby-llama-text.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index beacf46861a96..0c13f7fd4382f 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ 
b/examples/baby-llama/baby-llama-text.cpp @@ -1107,8 +1107,8 @@ struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_t ggml_mul(ctx, probs, ggml_log(ctx, - ggml_add1(ctx, - ggml_scale(ctx, + ggml_add1_inplace(ctx, + ggml_scale_inplace(ctx, ggml_soft_max(ctx, a), ggml_new_f32(ctx, 1.0f-eps)), ggml_new_f32(ctx, eps))))); From 57c2f4f9095906711ad7c38e831e1659cbbc2785 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 21 May 2023 12:18:47 +0200 Subject: [PATCH 34/86] fix random weight initialization scale --- examples/baby-llama/baby-llama-text.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 0c13f7fd4382f..4c3cd631b45a8 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -68,7 +68,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct } break; case 2: - scale /= sqrtf(tensor->ne[0]*tensor->ne[1]); + scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); for (int i1 = 0; i1 < tensor->ne[1]; i1++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) { float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); @@ -77,7 +77,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct } break; case 3: - scale /= sqrtf(tensor->ne[0]*tensor->ne[1]); + scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); for (int i2 = 0; i2 < tensor->ne[2]; i2++) { for (int i1 = 0; i1 < tensor->ne[1]; i1++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) { @@ -88,7 +88,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct } break; case 4: - scale /= sqrtf(tensor->ne[0]*tensor->ne[1]); + scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); for (int i3 = 0; i3 < tensor->ne[3]; i3++) { for (int i2 = 0; i2 < tensor->ne[2]; i2++) { for (int i1 = 0; i1 < tensor->ne[1]; i1++) { From 1eee9255e74dcd17ee248a9bb65c0c060fd97454 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 21 May 2023 15:03:51 +0200 Subject: [PATCH 35/86] add missing default parameters for adam optimizer --- ggml.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml.c b/ggml.c index f259f5605ad6a..b8253de9344ea 100644 --- a/ggml.c +++ b/ggml.c @@ -15117,6 +15117,8 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { .adam = { .n_iter = 10000, + .sched = 1.000f, + .decay = 0.001f, .alpha = 0.001f, .beta1 = 0.9f, .beta2 = 0.999f, From ec1783c3e0e61d09eb8777b776ec227b9aae0d5f Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 21 May 2023 15:16:07 +0200 Subject: [PATCH 36/86] add ggml_opt_context, so that we can properly resume training otherwise the optimizer states, tracking statistics about the error function and its derivates, will reset to zero each time ggml_opt is called, hindering convergence on resumed training. now the optimizer context and all its memory is stored in a separate struct. 
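In practice that means allocating one ggml_opt_context up front and reusing it for every optimization call; only the per-batch compute graph is rebuilt. A minimal sketch of that call pattern, assuming the per-batch loss graph is built elsewhere and using the ggml_opt_init/ggml_opt_resume functions added in this patch:

    #include "ggml.h"

    // called once before the first batch; opt lives for the whole training run
    static void train_opt_init(struct ggml_context * ctx, struct ggml_opt_context * opt) {
        struct ggml_opt_params params = ggml_opt_default_params(GGML_OPT_ADAM);
        // nx = 0: the optimizer sizes its state buffers when it first sees the graph
        ggml_opt_init(ctx, opt, params, 0);
    }

    // called once per batch with a freshly built scalar loss tensor
    static enum ggml_opt_result train_opt_step(
            struct ggml_context     * ctx0,  // per-batch compute context
            struct ggml_opt_context * opt,   // persistent: moments, past losses, iteration count
            struct ggml_tensor      * loss) {
        // resuming keeps the accumulated statistics instead of resetting them to zero
        return ggml_opt_resume(ctx0, opt, loss);
    }

Passing 0 for nx mirrors what ggml_opt itself does in the diff below; the Adam and L-BFGS paths re-initialize their buffers to the real parameter count on the first resume while preserving the iteration counter.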
--- ggml.c | 314 +++++++++++++++++++++++++++++++++++++++------------------ ggml.h | 56 ++++++++++ 2 files changed, 270 insertions(+), 100 deletions(-) diff --git a/ggml.c b/ggml.c index b8253de9344ea..cfc9bb455aec2 100644 --- a/ggml.c +++ b/ggml.c @@ -14577,6 +14577,7 @@ static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g static enum ggml_opt_result ggml_opt_adam( struct ggml_context * ctx, + struct ggml_opt_context * opt, struct ggml_opt_params params, struct ggml_tensor * f, struct ggml_cgraph * gf, @@ -14602,6 +14603,12 @@ static enum ggml_opt_result ggml_opt_adam( } } + if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past)) { + int iter = opt->iter; + ggml_opt_init(opt->ctx, opt, params, nx); + opt->iter = iter; + } + // constants const float sched = params.adam.sched; const float decay = params.adam.decay * sched; @@ -14610,19 +14617,15 @@ static enum ggml_opt_result ggml_opt_adam( const float beta2 = params.adam.beta2; const float eps = params.adam.eps; - float * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // view of the parameters - float * g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // gradient - float * g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // gradient squared - float * m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // first moment - float * v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // second moment - float * mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // first moment hat - float * vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // second moment hat + float * x = opt->adam.x->data; // view of the parameters + float * g1 = opt->adam.g1->data; // gradient + float * g2 = opt->adam.g2->data; // gradient squared + float * m = opt->adam.m->data; // first moment + float * v = opt->adam.v->data; // second moment + float * mh = opt->adam.mh->data; // first moment hat + float * vh = opt->adam.vh->data; // second moment hat - float * pf = params.past > 0 ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)->data : NULL; // past function values - - // initialize - ggml_vec_set_f32(nx, m, 0.0f); - ggml_vec_set_f32(nx, v, 0.0f); + float * pf = params.past > 0 ? 
opt->adam.pf->data : NULL; // past function values // update view ggml_opt_get_params(np, ps, x); @@ -14632,16 +14635,27 @@ static enum ggml_opt_result ggml_opt_adam( ggml_set_f32 (f->grad, 1.0f); ggml_graph_compute(ctx, gb); - float fx_prev = ggml_get_f32_1d(f, 0); + opt->adam.fx_prev = ggml_get_f32_1d(f, 0); + opt->adam.fx_best = opt->adam.fx_prev; if (pf) { - pf[0] = fx_prev; + pf[opt->iter % params.past] = opt->adam.fx_prev; + } + + // initialize + if (opt->just_initialized) { + opt->adam.n_no_improvement = 0; + opt->just_initialized = false; } - int n_no_improvement = 0; - float fx_best = fx_prev; + float * fx_best = &opt->adam.fx_best; + float * fx_prev = &opt->adam.fx_prev; + int * n_no_improvement = &opt->adam.n_no_improvement; + + int iter0 = opt->iter; // run the optimizer for (int t = 0; t < params.adam.n_iter; ++t) { + opt->iter = iter0 + t + 1; GGML_PRINT_DEBUG ("=== iter %d ===\n", t); GGML_PRINT_DEBUG ("f = %10.6f\n", ggml_get_f32_1d(f, 0)); @@ -14683,8 +14697,8 @@ static enum ggml_opt_result ggml_opt_adam( ggml_vec_cpy_f32 (nx, mh, m); ggml_vec_cpy_f32 (nx, vh, v); - ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, t + 1))); - ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, t + 1))); + ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter))); + ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter))); ggml_vec_sqrt_f32 (nx, vh, vh); ggml_vec_acc1_f32 (nx, vh, eps); @@ -14704,7 +14718,7 @@ static enum ggml_opt_result ggml_opt_adam( const float fx = ggml_get_f32_1d(f, 0); // check convergence - if (fabsf(fx - fx_prev)/fx < params.adam.eps_f) { + if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { GGML_PRINT_DEBUG("converged\n"); return GGML_OPT_OK; @@ -14713,32 +14727,32 @@ static enum ggml_opt_result ggml_opt_adam( // delta-based convergence test if (pf != NULL) { // need at least params.past iterations to start checking for convergence - if (params.past <= t) { - const float rate = (pf[t%params.past] - fx)/fx; + if (params.past <= iter0 + t) { + const float rate = (pf[(iter0 + t)%params.past] - fx)/fx; if (fabsf(rate) < params.delta) { return GGML_OPT_OK; } } - pf[t%params.past] = fx; + pf[(iter0 + t)%params.past] = fx; } // check for improvement if (params.max_no_improvement > 0) { - if (fx_best > fx) { - fx_best = fx; - n_no_improvement = 0; + if (fx_best[0] > fx) { + fx_best[0] = fx; + n_no_improvement[0] = 0; } else { - ++n_no_improvement; + ++n_no_improvement[0]; - if (n_no_improvement >= params.max_no_improvement) { + if (n_no_improvement[0] >= params.max_no_improvement) { return GGML_OPT_OK; } } } - fx_prev = fx; + fx_prev[0] = fx; { const int64_t t_end_cpu = ggml_cycles(); @@ -14877,6 +14891,7 @@ static enum ggml_opt_result linesearch_backtracking( static enum ggml_opt_result ggml_opt_lbfgs( struct ggml_context * ctx, + struct ggml_opt_context * opt, struct ggml_opt_params params, struct ggml_tensor * f, struct ggml_cgraph * gf, @@ -14909,31 +14924,32 @@ static enum ggml_opt_result ggml_opt_lbfgs( } } - float * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // current parameters - float * xp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // previous parameters - float * g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // current gradient - float * gp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // previous gradient - float * d = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // search direction + if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past) || (opt->params.lbfgs.m 
!= params.lbfgs.m)) { + int iter = opt->iter; + ggml_opt_init(ctx, opt, params, nx); + opt->iter = iter; + } + + float * x = opt->lbfgs.x->data; // current parameters + float * xp = opt->lbfgs.xp->data; // previous parameters + float * g = opt->lbfgs.g->data; // current gradient + float * gp = opt->lbfgs.gp->data; // previous gradient + float * d = opt->lbfgs.d->data; // search direction - float * pf = params.past > 0 ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)->data : NULL; // past function values + float * pf = params.past > 0 ? opt->lbfgs.pf->data : NULL; // past function values float fx = 0.0f; // cost function value float xnorm = 0.0f; // ||x|| float gnorm = 0.0f; // ||g|| - float step = 0.0f; // initialize x from the graph nodes ggml_opt_get_params(np, ps, x); // the L-BFGS memory - struct ggml_lbfgs_iteration_data * lm = alloca(sizeof(struct ggml_lbfgs_iteration_data)*m); - - for (int i = 0; i < m; ++i) { - lm[i].alpha = 0.0f; - lm[i].ys = 0.0f; - lm[i].s = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; - lm[i].y = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; - } + float * lm_alpha = opt->lbfgs.lmal->data; + float * lm_ys = opt->lbfgs.lmys->data; + float * lm_s = opt->lbfgs.lms->data; + float * lm_y = opt->lbfgs.lmy->data; // evaluate the function value and its gradient { @@ -14948,12 +14964,6 @@ static enum ggml_opt_result ggml_opt_lbfgs( fx = ggml_get_f32_1d(f, 0); } - if (pf) { - pf[0] = fx; - } - - float fx_best = fx; - // search direction = -gradient ggml_vec_neg_f32(nx, d, g); @@ -14970,26 +14980,43 @@ static enum ggml_opt_result ggml_opt_lbfgs( return GGML_OPT_OK; } - // initial step - ggml_vec_norm_inv_f32(nx, &step, d); + if (opt->just_initialized) { + if (pf) { + pf[0] = fx; + } + opt->lbfgs.fx_best = fx; + + // initial step + ggml_vec_norm_inv_f32(nx, &opt->lbfgs.step, d); + opt->lbfgs.j = 0; + opt->lbfgs.k = 1; + opt->lbfgs.end = 0; + opt->lbfgs.n_no_improvement = 0; + opt->just_initialized = false; + } + + float * fx_best = &opt->lbfgs.fx_best; + float * step = &opt->lbfgs.step; + int * j = &opt->lbfgs.j; + int * k = &opt->lbfgs.k; + int * end = &opt->lbfgs.end; + int * n_no_improvement = &opt->lbfgs.n_no_improvement; - int j = 0; - int k = 1; - int ls = 0; - int end = 0; - int bound = 0; - int n_no_improvement = 0; + int ls = 0; + int bound = 0; float ys = 0.0f; float yy = 0.0f; float beta = 0.0f; + int it = 0; + while (true) { // store the current position and gradient vectors ggml_vec_cpy_f32(nx, xp, x); ggml_vec_cpy_f32(nx, gp, g); - ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, &step, xp, f, gf, gb, np, ps); + ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps); if (ls < 0) { // linesearch failed - go back to the previous point and return @@ -15015,32 +15042,32 @@ static enum ggml_opt_result ggml_opt_lbfgs( // delta-based convergence test if (pf != NULL) { // need at least params.past iterations to start checking for convergence - if (params.past <= k) { - const float rate = (pf[k%params.past] - fx)/fx; + if (params.past <= k[0]) { + const float rate = (pf[k[0]%params.past] - fx)/fx; if (fabsf(rate) < params.delta) { return GGML_OPT_OK; } } - pf[k%params.past] = fx; + pf[k[0]%params.past] = fx; } // check for improvement if (params.max_no_improvement > 0) { - if (fx < fx_best) { - fx_best = fx; - n_no_improvement = 0; + if (fx < fx_best[0]) { + fx_best[0] = fx; + n_no_improvement[0] = 0; } else { - n_no_improvement++; + n_no_improvement[0]++; - if (n_no_improvement >= params.max_no_improvement) { + if 
(n_no_improvement[0] >= params.max_no_improvement) { return GGML_OPT_OK; } } } - if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < k + 1) { + if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) { // reached the maximum number of iterations return GGML_OPT_DID_NOT_CONVERGE; } @@ -15049,50 +15076,51 @@ static enum ggml_opt_result ggml_opt_lbfgs( // s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}. // y_{k+1} = g_{k+1} - g_{k}. // - ggml_vec_sub_f32(nx, lm[end].s, x, xp); - ggml_vec_sub_f32(nx, lm[end].y, g, gp); + ggml_vec_sub_f32(nx, &lm_s[end[0]*nx], x, xp); + ggml_vec_sub_f32(nx, &lm_y[end[0]*nx], g, gp); // compute scalars ys and yy: // ys = y^t \cdot s -> 1 / \rho. // yy = y^t \cdot y. // - ggml_vec_dot_f32(nx, &ys, lm[end].y, lm[end].s); - ggml_vec_dot_f32(nx, &yy, lm[end].y, lm[end].y); + ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0] *nx]); + ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]); - lm[end].ys = ys; + lm_ys[end[0]] = ys; // find new search direction // ref: https://en.wikipedia.org/wiki/Limited-memory_BFGS - bound = (m <= k) ? m : k; - k++; - end = (end + 1)%m; + bound = (m <= k[0]) ? m : k[0]; + k[0]++; + it++; + end[0] = (end[0] + 1)%m; // initialize search direction with -g ggml_vec_neg_f32(nx, d, g); - j = end; + j[0] = end[0]; for (int i = 0; i < bound; ++i) { - j = (j + m - 1) % m; + j[0] = (j[0] + m - 1) % m; // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1} - ggml_vec_dot_f32(nx, &lm[j].alpha, lm[j].s, d); - lm[j].alpha /= lm[j].ys; + ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d); + lm_alpha[j[0]] /= lm_ys[j[0]]; // q_{i} = q_{i+1} - \alpha_{i} y_{i} - ggml_vec_mad_f32(nx, d, lm[j].y, -lm[j].alpha); + ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]); } ggml_vec_scale_f32(nx, d, ys/yy); for (int i = 0; i < bound; ++i) { // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i} - ggml_vec_dot_f32(nx, &beta, lm[j].y, d); - beta /= lm[j].ys; + ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d); + beta /= lm_ys[j[0]]; // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j} - ggml_vec_mad_f32(nx, d, lm[j].s, lm[j].alpha - beta); - j = (j + 1)%m; + ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta); + j[0] = (j[0] + 1)%m; } - step = 1.0; + step[0] = 1.0; } return GGML_OPT_DID_NOT_CONVERGE; @@ -15161,6 +15189,71 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { return result; } +GGML_API void ggml_opt_init( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + int64_t nx) { + opt->ctx = ctx; + opt->params = params; + opt->iter = 0; + opt->nx = nx; + opt->just_initialized = true; + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + opt->adam.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.pf = params.past > 0 + ? 
ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past) + : NULL; + ggml_set_zero(opt->adam.x); + ggml_set_zero(opt->adam.g1); + ggml_set_zero(opt->adam.g2); + ggml_set_zero(opt->adam.m); + ggml_set_zero(opt->adam.v); + ggml_set_zero(opt->adam.mh); + ggml_set_zero(opt->adam.vh); + if (opt->adam.pf) { + ggml_set_zero(opt->adam.pf); + } + } break; + case GGML_OPT_LBFGS: + { + opt->lbfgs.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.xp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.gp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.d = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.pf = params.past > 0 + ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past) + : NULL; + opt->lbfgs.lmal = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lmys = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lms = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, params.lbfgs.m); + opt->lbfgs.lmy = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, params.lbfgs.m); + ggml_set_zero(opt->lbfgs.x); + ggml_set_zero(opt->lbfgs.xp); + ggml_set_zero(opt->lbfgs.g); + ggml_set_zero(opt->lbfgs.gp); + ggml_set_zero(opt->lbfgs.d); + ggml_set_zero(opt->lbfgs.pf); + if (opt->lbfgs.pf) { + ggml_set_zero(opt->lbfgs.pf); + } + ggml_set_zero(opt->lbfgs.lmal); + ggml_set_zero(opt->lbfgs.lmys); + ggml_set_zero(opt->lbfgs.lms); + ggml_set_zero(opt->lbfgs.lmy); + } break; + } +} + enum ggml_opt_result ggml_opt( struct ggml_context * ctx, struct ggml_opt_params params, @@ -15183,33 +15276,54 @@ enum ggml_opt_result ggml_opt( enum ggml_opt_result result = GGML_OPT_OK; + struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context)); + + ggml_opt_init(ctx, opt, params, 0); + result = ggml_opt_resume(ctx, opt, f); + + if (free_ctx) { + ggml_free(ctx); + } + + return result; +} + +enum ggml_opt_result ggml_opt_resume( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f) { + // build forward + backward compute graphs - struct ggml_cgraph gf = ggml_build_forward (f); - struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, true); + struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0)); + struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 
1 : 0)); + + struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; + struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; + + *gf = ggml_build_forward (f); + *gb = ggml_build_backward(ctx, gf, true); - switch (params.type) { + enum ggml_opt_result result = GGML_OPT_OK; + + switch (opt->params.type) { case GGML_OPT_ADAM: { - result = ggml_opt_adam(ctx, params, f, &gf, &gb); + result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb); } break; case GGML_OPT_LBFGS: { - result = ggml_opt_lbfgs(ctx, params, f, &gf, &gb); + result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb); } break; } - if (params.print_forward_graph) { - ggml_graph_print (&gf); - ggml_graph_dump_dot(&gf, NULL, "opt-forward.dot"); - } - - if (params.print_backward_graph) { - ggml_graph_print (&gb); - ggml_graph_dump_dot(&gb, &gf, "opt-backward.dot"); + if (opt->params.print_forward_graph) { + ggml_graph_print (gf); + ggml_graph_dump_dot(gf, NULL, "opt-forward.dot"); } - if (free_ctx) { - ggml_free(ctx); + if (opt->params.print_backward_graph) { + ggml_graph_print (gb); + ggml_graph_dump_dot(gb, gf, "opt-backward.dot"); } return result; diff --git a/ggml.h b/ggml.h index 6ce660c7454d4..64de9eb3ea76f 100644 --- a/ggml.h +++ b/ggml.h @@ -1081,6 +1081,49 @@ extern "C" { } lbfgs; }; + struct ggml_opt_context { + struct ggml_context * ctx; + struct ggml_opt_params params; + + int iter; + int64_t nx; // number of parameter elements + + bool just_initialized; + + struct { + struct ggml_tensor * x; // view of the parameters + struct ggml_tensor * g1; // gradient + struct ggml_tensor * g2; // gradient squared + struct ggml_tensor * m; // first moment + struct ggml_tensor * v; // second moment + struct ggml_tensor * mh; // first moment hat + struct ggml_tensor * vh; // second moment hat + struct ggml_tensor * pf; // past function values + float fx_best; + float fx_prev; + int n_no_improvement; + } adam; + + struct { + struct ggml_tensor * x; // current parameters + struct ggml_tensor * xp; // previous parameters + struct ggml_tensor * g; // current gradient + struct ggml_tensor * gp; // previous gradient + struct ggml_tensor * d; // search direction + struct ggml_tensor * pf; // past function values + struct ggml_tensor * lmal; // the L-BFGS memory alpha + struct ggml_tensor * lmys; // the L-BFGS memory ys + struct ggml_tensor * lms; // the L-BFGS memory s + struct ggml_tensor * lmy; // the L-BFGS memory y + float fx_best; + float step; + int j; + int k; + int end; + int n_no_improvement; + } lbfgs; + }; + GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); // optimize the function defined by the tensor f @@ -1089,6 +1132,19 @@ extern "C" { struct ggml_opt_params params, struct ggml_tensor * f); + // initialize optimizer context + GGML_API void ggml_opt_init( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + int64_t nx); + + // continue optimizing the function defined by the tensor f + GGML_API enum ggml_opt_result ggml_opt_resume( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f); + // // quantization // From 2afd2184793541f67f660ca1aabe399aaa71719e Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 21 May 2023 21:12:10 +0200 Subject: [PATCH 37/86] fix bug in llama_sample_token_mirostat_v2 when all candidates are filtered out through mu threshold, the following soft_max operation will fail. so keep at least one. 
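The failure mode is easy to reproduce outside llama.cpp: a surprise-based cutoff can discard every candidate, and normalizing an empty range is undefined. A standalone sketch of the same guard, using plain C++ containers rather than the llama_token_data types:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        // probabilities sorted in descending order, as after llama_sample_softmax
        std::vector<float> p = {0.010f, 0.005f, 0.002f};
        float mu = 2.0f;  // unusually low target surprise

        // keep only the leading candidates whose surprise -log2(p) does not exceed mu
        size_t n = (size_t)(std::find_if(p.begin(), p.end(),
                [mu](float x) { return -std::log2(x) > mu; }) - p.begin());

        // every candidate exceeded the threshold: keep the most probable one anyway
        if (n == 0) {
            n = 1;
        }

        // renormalize the survivors (the step that would fail on an empty range)
        float sum = 0.0f;
        for (size_t i = 0; i < n; ++i) sum += p[i];
        for (size_t i = 0; i < n; ++i) p[i] /= sum;

        printf("kept %zu candidate(s), p[0] = %f\n", n, p[0]);
        return 0;
    }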
--- llama.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama.cpp b/llama.cpp index 98f49abd7cf48..ca61a69e0bb17 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1882,6 +1882,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok return -log2f(candidate.p) > *mu; })); + if (candidates->size == 0) { + candidates->size = 1; + } + // Normalize the probabilities of the remaining words llama_sample_softmax(ctx, candidates); From 93eb8f77522a090d7e69bd206ab61bfaa207679f Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 21 May 2023 21:14:49 +0200 Subject: [PATCH 38/86] add forward function without using cache, for more performant training during training on whole samples no cache is required. removing the cache and simplifying the remaining code results in performance and memory usage improvement. --- examples/baby-llama/baby-llama-text.cpp | 234 +++++++++++++++++++++++- 1 file changed, 233 insertions(+), 1 deletion(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 4c3cd631b45a8..cda1edece92a9 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -911,6 +911,234 @@ struct ggml_tensor * forward_batch( return inpL; } +struct ggml_tensor * forward_batch_wo_cache( + struct my_llama_model * model, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_batch) { + + const int n_past = 0; + const int N = n_tokens; + + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + const int n_ff = get_n_ff(&hparams); + + struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); + memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); + + // inpL shape [n_embd,N*n_batch,1] + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); + assert_shape_2d(inpL, n_embd, N*n_batch); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * cur; + + // lctx.use_buf(ctx0, 0); + + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = attention_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].attention_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // self-attention + { + // compute Q and K and RoPE them + // wq shape [n_embd, n_embd, 1, 1] + // wk shape [n_embd, n_embd, 1, 1] + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Kcur shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); + assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); + + // Vcur shape [N, n_batch, n_embd/n_head, n_head] + struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head); + assert_shape_4d(Vcur, N, n_batch, 
n_embd/n_head, n_head); + + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Q shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); + + // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] + // K shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * K = + ggml_permute(ctx0, + Kcur, + 0, 2, 1, 3); + assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch); + + // K * Q + // KQ shape [N, N, n_head, n_batch] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + assert_shape_4d(KQ, N, N, n_head, n_batch); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // KQ_scaled shape [N, N, n_head, n_batch] + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + assert_shape_4d(KQ_scaled, N, N, n_head, n_batch); + + // KQ_masked = mask_past(KQ_scaled) + // KQ_masked shape [N, N, n_head, n_batch] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + assert_shape_4d(KQ_masked, N, N, n_head, n_batch); + + // KQ = soft_max(KQ_masked) + // KQ_soft_max shape [N, N, n_head, n_batch] + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + assert_shape_4d(KQ_soft_max, N, N, n_head, n_batch); + + // Vcur shape [N, n_batch, n_embd/n_head, n_head] + // V shape [N, n_embd/n_head, n_head, n_batch] + struct ggml_tensor * V = + ggml_permute(ctx0, + Vcur, + 0, 3, 1, 2); + assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch); + + // KQV shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); + // KQV_merged shape + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); + assert_shape_2d(cur, n_embd, N*n_batch); + + // projection (no bias) + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].wo, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // lctx.use_buf(ctx0, 1); + + // inpFF shape [n_embd,N*n_batch,1,1] + struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); + assert_shape_2d(inpFF, n_embd, N*n_batch); + + // feed-forward network + { + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = ffn_norm*cur + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // tmp shape [n_ff,N*n_batch,1,1] + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model->layers[il].w3, + cur); + assert_shape_2d(tmp, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w1, + cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // SILU activation + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_silu(ctx0, cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul(ctx0, cur, tmp); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w2, + cur); + 
assert_shape_2d(cur, n_embd, N*n_batch); + } + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_add_inplace(ctx0, cur, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // input for next layer + // inpL shape [n_embd,N*n_batch,1,1] + inpL = cur; + assert_shape_2d(inpL, n_embd, N*n_batch); + } + + // norm + { + + // inpL shape [n_embd,N*n_batch,1,1] + inpL = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(inpL, n_embd, N*n_batch); + + // inpL = norm*inpL + // inpL shape [n_embd,N*n_batch,1,1] + inpL = ggml_mul(ctx0, + ggml_repeat(ctx0, model->norm, inpL), + inpL); + + assert_shape_2d(inpL, n_embd, N*n_batch); + + //embeddings = inpL; + } + + // lm_head + // inpL shape [n_vocab,N*n_batch,1,1] + inpL = ggml_mul_mat(ctx0, model->output, inpL); + assert_shape_2d(inpL, n_vocab, N*n_batch); + + { + // inpL shape [n_vocab,N,n_batch,1] + inpL = ggml_reshape_3d(ctx0, + inpL, + n_vocab, N, n_batch); + assert_shape_3d(inpL, n_vocab, N, n_batch); + } + + // run the computation + ggml_build_forward_expand(gf, inpL); + + return inpL; +} + void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { assert(logits->n_dims == 2); assert(probs->n_dims == 2); @@ -1627,7 +1855,11 @@ int main(int argc, char ** argv) { get_example_targets_batch(ctx0, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); - struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch); + struct ggml_tensor * logits = + (n_past == 0) + ? forward_batch_wo_cache(&model, ctx0, &gf, tokens_input, n_tokens, n_batch) + : forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch); + // struct ggml_tensor * se = square_error_loss(ctx0, logits, target_logits); struct ggml_tensor * ce = cross_entropy_loss(ctx0, logits, target_probs); // struct ggml_tensor * e = ggml_add(ctx0, se, ce); From 37c69435f04f0e827eaccd3988d78ff385206869 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 21 May 2023 21:17:46 +0200 Subject: [PATCH 39/86] print suppressed newline tokens as string "\n" printing too much actual newlines is suppressed to avoid flooding the console. 
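The counting logic already lives in print_tokens_batch; the two-line change below only makes the suppressed tokens visible. A self-contained illustration of the idea on a plain character stream (the function and its details are illustrative, not the exact print_tokens_batch logic):

    #include <stdio.h>

    // emit the first newline of a run verbatim, later ones as a visible "\n" escape
    static void print_with_limited_newlines(const char * s) {
        int num_newline = 0;
        for (; *s; ++s) {
            if (*s == '\n') {
                ++num_newline;
                if (num_newline < 2) {
                    putchar('\n');      // keep the start of the run
                } else {
                    printf("\\n");      // suppressed newline, shown as an escape
                }
            } else {
                num_newline = 0;
                putchar(*s);
            }
        }
    }

    int main(void) {
        print_with_limited_newlines("First line\n\n\n\nSecond line\n");
        putchar('\n');
        return 0;
    }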
--- examples/baby-llama/baby-llama-text.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index cda1edece92a9..e4df2eca595e0 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1247,6 +1247,8 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) } if (!isnl || (num_newline < 2)) { print_token(ctx, token); + } else { + printf("\\n"); } } printf("\n--\n"); From 42d9b4cfc2c13a434e55e2739e255e9af728d842 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 21 May 2023 21:36:04 +0200 Subject: [PATCH 40/86] store optimizer state in training checkpoint and add learning schedule persistent optimizer state allows to resume training without resetting the optimizer learning schedule consists of linear warmup ramp followed by cosine decay with restarts --- examples/baby-llama/baby-llama-text.cpp | 263 ++++++++++++++++++++---- 1 file changed, 226 insertions(+), 37 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index e4df2eca595e0..ff213ea485435 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -208,6 +208,7 @@ struct my_llama_model { uint32_t train_its = 0; uint32_t train_samples = 0; + uint32_t train_tokens = 0; }; uint32_t get_n_ff(const struct my_llama_hparams* hparams) { @@ -237,6 +238,10 @@ void init_model(struct my_llama_model * model) { struct ggml_context * ctx = model->ctx; + model->train_its = 0; + model->train_samples = 0; + model->train_tokens = 0; + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); @@ -1613,6 +1618,13 @@ enum llama_file_version { }; void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { + if (tensor == NULL) { + file->write_u32(0); + file->write_u32(0); + file->write_u32(GGML_TYPE_F32); + file->seek(-file->tell() & 31, SEEK_CUR); + return; + } const char * name = ggml_get_name(tensor); uint32_t name_len = strlen(name); uint32_t nd = tensor->n_dims; @@ -1629,28 +1641,135 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { uint32_t nd = file->read_u32(); GGML_ASSERT(nd == tensor->n_dims); - uint32_t name_len = file->read_u32(); - enum ggml_type type = (enum ggml_type) file->read_u32(); + + uint32_t name_len = file->read_u32(); + enum ggml_type type = (enum ggml_type) file->read_u32(); GGML_ASSERT(type == tensor->type); + uint32_t ne[4]; file->read_raw(ne, sizeof(ne[0]) * nd); for (int i=0; ine[i]); } - std::string name = file->read_string(name_len); - file->seek(-file->tell() & 31, SEEK_CUR); + std::string name = file->read_string(name_len); GGML_ASSERT(strcmp(ggml_get_name(tensor), name.c_str()) == 0); + + file->seek(-file->tell() & 31, SEEK_CUR); file->read_raw(tensor->data, ggml_nbytes(tensor)); } -void save_model(struct my_llama_model * model, const char * filename) { +void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) { + const uint32_t version = 0; + GGML_ASSERT(opt->nx >= 0); + GGML_ASSERT(opt->iter >= 0); + file->write_u32(version); + file->write_raw(&opt->params, sizeof(opt->params)); + file->write_raw(&opt->nx, sizeof(opt->nx)); + file->write_raw(&opt->iter, sizeof(opt->iter)); + 
file->write_u32((uint32_t) opt->just_initialized); + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + GGML_ASSERT(opt->adam.x != NULL); + write_tensor(file, opt->adam.x); + write_tensor(file, opt->adam.g1); + write_tensor(file, opt->adam.g2); + write_tensor(file, opt->adam.m); + write_tensor(file, opt->adam.v); + write_tensor(file, opt->adam.mh); + write_tensor(file, opt->adam.vh); + write_tensor(file, opt->adam.pf); + file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); + file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); + file->write_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); + } break; + case GGML_OPT_LBFGS: + { + GGML_ASSERT(opt->adam.x != NULL); + write_tensor(file, opt->lbfgs.x); + write_tensor(file, opt->lbfgs.xp); + write_tensor(file, opt->lbfgs.g); + write_tensor(file, opt->lbfgs.gp); + write_tensor(file, opt->lbfgs.d); + write_tensor(file, opt->lbfgs.pf); + write_tensor(file, opt->lbfgs.lmal); + write_tensor(file, opt->lbfgs.lmys); + write_tensor(file, opt->lbfgs.lms); + write_tensor(file, opt->lbfgs.lmy); + file->write_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); + file->write_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); + file->write_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); + file->write_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); + file->write_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); + file->write_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); + } break; + } +} + +void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { + uint32_t version = file->read_u32(); + GGML_ASSERT(version == 0); + + file->read_raw(&opt->params, sizeof(opt->params)); + file->read_raw(&opt->nx, sizeof(opt->nx)); + ggml_opt_init(ctx, opt, opt->params, opt->nx); + + file->read_raw(&opt->iter, sizeof(opt->iter)); + opt->just_initialized = (bool) file->read_u32(); + + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + read_tensor(file, opt->adam.x); + read_tensor(file, opt->adam.g1); + read_tensor(file, opt->adam.g2); + read_tensor(file, opt->adam.m); + read_tensor(file, opt->adam.v); + read_tensor(file, opt->adam.mh); + read_tensor(file, opt->adam.vh); + if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } + file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); + file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); + file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); + } break; + case GGML_OPT_LBFGS: + { + GGML_ASSERT(opt->adam.x != NULL); + read_tensor(file, opt->lbfgs.x); + read_tensor(file, opt->lbfgs.xp); + read_tensor(file, opt->lbfgs.g); + read_tensor(file, opt->lbfgs.gp); + read_tensor(file, opt->lbfgs.d); + if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } + read_tensor(file, opt->lbfgs.lmal); + read_tensor(file, opt->lbfgs.lmys); + read_tensor(file, opt->lbfgs.lms); + read_tensor(file, opt->lbfgs.lmy); + file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); + file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); + file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); + file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); + file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); + file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); + } break; + } +} + +void save_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename) { struct llama_file file(filename, "wb"); if (file.fp == NULL) { return; } + + const 
uint32_t magic = 'ggcp'; + const uint32_t version = 0; + + file.write_u32(magic); + file.write_u32(version); file.write_u32(model->train_its); file.write_u32(model->train_samples); + file.write_u32(model->train_tokens); file.write_u32(model->hparams.n_vocab); file.write_u32(model->hparams.n_embd); file.write_u32(model->hparams.n_mult); @@ -1675,23 +1794,35 @@ void save_model(struct my_llama_model * model, const char * filename) { write_tensor(&file, layer.w2); write_tensor(&file, layer.w3); } + + write_opt_context(&file, opt); } -bool load_model(struct my_llama_model * model, const char * filename, bool init) { +bool load_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename, bool init) { struct llama_file file(filename, "rb"); - + + uint32_t magic; + uint32_t version; + + uint32_t train_its = 0; + uint32_t train_samples = 0; + uint32_t train_tokens = 0; + if (file.fp) { printf("%s: Loading model from '%s'.\n", __func__, filename); - model->train_its = file.read_u32(); - model->train_samples = file.read_u32(); + magic = file.read_u32(); + GGML_ASSERT(magic == 'ggcp'); + version = file.read_u32(); + GGML_ASSERT(version == 0); + train_its = file.read_u32(); + train_samples = file.read_u32(); + train_tokens = file.read_u32(); model->hparams.n_vocab = file.read_u32(); model->hparams.n_embd = file.read_u32(); model->hparams.n_mult = file.read_u32(); model->hparams.n_head = file.read_u32(); model->hparams.n_layer = file.read_u32(); model->hparams.n_rot = file.read_u32(); - printf("%s: Training iterations: %u.\n", __func__, model->train_its); - printf("%s: Training samples: %u.\n", __func__, model->train_samples); print_params(&model->hparams); } @@ -1699,6 +1830,16 @@ bool load_model(struct my_llama_model * model, const char * filename, bool init) init_model(model); } + if (file.fp) { + model->train_its = train_its; + model->train_samples = train_samples; + model->train_tokens = train_tokens; + } + + printf("%s: Training iterations: %u.\n", __func__, model->train_its); + printf("%s: Training samples: %u.\n", __func__, model->train_samples); + printf("%s: Training tokens: %u.\n", __func__, model->train_tokens); + if (file.fp) { read_tensor(&file, model->tok_embeddings); read_tensor(&file, model->norm); @@ -1717,11 +1858,30 @@ bool load_model(struct my_llama_model * model, const char * filename, bool init) read_tensor(&file, layer.w2); read_tensor(&file, layer.w3); } + + read_opt_context(&file, model->ctx, opt); } return (file.fp != NULL); } +float cosine_decay(const int decay_steps, const float alpha, int step) { + if (step > decay_steps) { + step = decay_steps; + } + const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps)); + const float decay = (1 - alpha)*cosine_decay + alpha; + return decay; +} + +float cosine_decay_restart(int decay_steps, const float alpha, int step, float restart_step_mult) { + while (step > decay_steps) { + step -= decay_steps; + decay_steps = (int) restart_step_mult * decay_steps; + } + return cosine_decay(decay_steps, alpha, step); +} + int main(int argc, char ** argv) { const char * default_model = "ggml-vic7b-uncensored-q4_0.bin"; const char * default_train = "shakespeare.txt"; @@ -1795,16 +1955,55 @@ int main(int argc, char ** argv) { my_llama_sampler sampler; + + int n_threads = 6; + + bool use_adam = true; + + int warmup = 100; + int cos_decay_steps = 1000; + float cos_decay_restart = 1.1f; + float cos_decay_alpha = 0.0f; + + struct ggml_opt_context * opt = (struct ggml_opt_context *) 
alloca(sizeof(struct ggml_opt_context)); + memset(opt, 0, sizeof(struct ggml_opt_context)); + + struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); + struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); + opt_params_adam.print_forward_graph = false; + opt_params_adam.print_backward_graph = false; + opt_params_adam.n_threads = n_threads; + opt_params_adam.adam.n_iter = 16; + opt_params_adam.adam.sched = 1.0f; + opt_params_adam.adam.alpha = 1e-3; + opt_params_adam.adam.decay = 1e-3; + + opt_params_lbfgs.print_forward_graph = false; + opt_params_lbfgs.print_backward_graph = false; + opt_params_lbfgs.n_threads = n_threads; + opt_params_lbfgs.lbfgs.n_iter = 16; + + opt->ctx = model.ctx; + opt->params = use_adam ? opt_params_adam : opt_params_lbfgs; + printf("%s: init model\n", __func__); - bool existed = load_model(&model, fn_chkpt_in, true); - bool from_scratch = !existed; + bool existed = load_checkpoint(&model, opt, fn_chkpt_in, true); set_param_model(&model); + + opt->iter = model.train_its; + printf("%s: opt iter %d\n", __func__, opt->iter); + + bool from_scratch = !existed; if (from_scratch) { randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f); } - init_kv_cache(&kv_self, &model, n_batch); + + init_kv_cache(&kv_self, &model, 1); + // init_kv_cache(&kv_self, &model, n_batch); init_sampler(&sampler, lctx); + printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx)); + // ggml_print_tensor_objects(model.ctx); size_t compute_size = 1024ll*1024ll*1024ll*32ll; uint8_t * compute_addr = new uint8_t[compute_size]; @@ -1853,7 +2052,7 @@ int main(int argc, char ** argv) { int n_past = 0; ggml_cgraph gf = {}; - gf.n_threads = 6; + gf.n_threads = n_threads; get_example_targets_batch(ctx0, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); @@ -1875,30 +2074,20 @@ int main(int argc, char ** argv) { float error_before_opt = ggml_get_f32_1d(e, 0); - struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); - struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); - opt_params_adam.print_forward_graph = false; - opt_params_adam.print_backward_graph = false; - opt_params_adam.n_threads = gf.n_threads; - opt_params_adam.adam.n_iter = 16; - opt_params_adam.adam.alpha = 1e-4; - - opt_params_lbfgs.print_forward_graph = false; - opt_params_lbfgs.print_backward_graph = false; - opt_params_lbfgs.n_threads = gf.n_threads; - opt_params_lbfgs.lbfgs.n_iter = 16; - - bool use_adam = true; - if (use_adam) { - ggml_opt(ctx0, opt_params_adam, e); - } else { - ggml_opt(ctx0, opt_params_lbfgs, e); - } + opt->params.adam.sched = (opt->iter < warmup) + ? (float) opt->iter / (float) warmup + : cosine_decay_restart(cos_decay_steps, cos_decay_alpha, opt->iter - warmup, cos_decay_restart); + printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); + + // ggml_opt(ctx0, opt->params, e); + ggml_opt_resume(ctx0, opt, e); size_t used_mem_after_opt = ggml_used_mem(ctx0); - model.train_its += use_adam ? opt_params_adam.adam.n_iter : opt_params_lbfgs.lbfgs.n_iter; + model.train_its = opt->iter; + // model.train_its += use_adam ? 
opt_params_adam.adam.n_iter : opt_params_lbfgs.lbfgs.n_iter; model.train_samples += n_batch; + model.train_tokens += n_batch * n_tokens; ggml_build_forward_expand(&gf, e); ggml_graph_compute(ctx0, &gf); @@ -1909,7 +2098,7 @@ int main(int argc, char ** argv) { printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); if (ex % 1 == 0) { - printf("Example %d\n", ex); + printf("Example %d, opt iter %d\n", ex, opt->iter); printf("error_before_opt: %.6f\n", error_before_opt); printf("error_after_opt: %.6f\n", error_after_opt); } @@ -1943,7 +2132,7 @@ int main(int argc, char ** argv) { ggml_free(ctx0); } - save_model(&model, fn_chkpt_out); + save_checkpoint(&model, opt, fn_chkpt_out); { int n_gen = 1024; From b763d6f1f233bedb7fc1c89dba0f3f39a59ba8c7 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 22 May 2023 16:54:21 +0200 Subject: [PATCH 41/86] remove unused functions --- examples/baby-llama/baby-llama-text.cpp | 68 ------------------------- 1 file changed, 68 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index ff213ea485435..b187bfd1728fa 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1144,72 +1144,6 @@ struct ggml_tensor * forward_batch_wo_cache( return inpL; } -void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { - assert(logits->n_dims == 2); - assert(probs->n_dims == 2); - assert(best_samples->n_dims == 1); - assert(logits->ne[1] == best_samples->ne[0]); - assert(logits->ne[0] == probs->ne[0]); - assert(logits->ne[1] == probs->ne[1]); - for (int i = 0; i < logits->ne[1]; ++i) { - float max_logit = ggml_get_f32_1d(logits, i * logits->ne[0]); - ggml_set_i32_1d(best_samples, i, 0); - for (int k = 0; k < logits->ne[0]; ++k) { - float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k); - if (logit > max_logit) { - max_logit = logit; - ggml_set_i32_1d(best_samples, i, k); - } - } - float psum = 0; - for (int k = 0; k < logits->ne[0]; ++k) { - float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k); - float p = (logit == -INFINITY) ? 
0 : expf(logit - max_logit); - psum += p; - ggml_set_f32_1d(probs, i * probs->ne[0] + k, p); - } - for (int k = 0; k < logits->ne[0]; ++k) { - float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); - ggml_set_f32_1d(probs, i * probs->ne[0] + k, p / psum); - } - } -} - -void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { - GGML_ASSERT(best_samples->n_dims == 2); - GGML_ASSERT(logits->n_dims == 3); - GGML_ASSERT(probs->n_dims == 3); - int n_tokens = best_samples->ne[0]; - int n_batch = best_samples->ne[1]; - int n_vocab = logits->ne[0]; - GGML_ASSERT(n_tokens == logits->ne[1]); - GGML_ASSERT(n_batch == logits->ne[2]); - GGML_ASSERT(n_vocab == probs->ne[0]); - GGML_ASSERT(n_tokens == probs->ne[1]); - GGML_ASSERT(n_batch == probs->ne[2]); - - for (int k = 0; k < n_batch; ++k) { - struct ggml_tensor * best_samples_k = ggml_view_1d(ctx, - best_samples, - best_samples->ne[0], - k*best_samples->nb[1]); - struct ggml_tensor * logits_k = ggml_view_2d(ctx, - logits, - logits->ne[0], - logits->ne[1], - logits->nb[1], - k*logits->nb[2]); - struct ggml_tensor * probs_k = ggml_view_2d(ctx, - probs, - probs->ne[0], - probs->ne[1], - probs->nb[1], - k*probs->nb[2]); - sample_softmax(logits_k, probs_k, best_samples_k); - } -} - - void print_row(struct ggml_tensor * probs, int i) { for (int k = 0; k < probs->ne[0]; ++k) { float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); @@ -2116,7 +2050,6 @@ int main(int argc, char ** argv) { } } - // sample_softmax_batch(ctx0, logits, after_opt_probs, after_opt_best_samples); // printf("probabilities after optimization:\n"); // print_matrix(after_opt_probs); printf("Example:\n---\n"); @@ -2184,7 +2117,6 @@ int main(int argc, char ** argv) { (float *) ((char *) logits->data + (sample_ctx-1)*logits->nb[1]), (llama_token *) tokens_input->data, sample_ctx-1); - // sample_softmax(logits, probs, best_samples); //int token = ggml_get_i32_1d(best_samples, sample_ctx-1); // print_row(probs, sample_at); From cc440bd4381bfc9bf2de464e9992cc4511e64969 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 22 May 2023 16:55:52 +0200 Subject: [PATCH 42/86] fix bug in get_samples which corrupted training targets --- examples/baby-llama/baby-llama-text.cpp | 34 +++++++++++++++++++++---- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index b187bfd1728fa..a21403a7733fe 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1184,16 +1184,40 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) if (isnl) { ++num_newline; } - if (!isnl || (num_newline < 2)) { - print_token(ctx, token); + if (isnl) { + if (num_newline < 2) { + print_token(ctx, token); + } else { + printf("\\n"); + } } else { - printf("\\n"); + print_token(ctx, token); } } printf("\n--\n"); } } +void set_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, float value) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *ptr = value; +} + +void set_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int32_t value) { + int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *ptr = value; +} + +float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + return *ptr; +} + 
+int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { + int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + return *ptr; +} + void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { int n_tokens = tokens_input->ne[0]; int n_vocab = target_logits->ne[0]; @@ -1209,8 +1233,8 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons ggml_set_i32_1d(tokens_input, 0, llama_token_bos()); for (int i=1; i Date: Mon, 22 May 2023 16:56:28 +0200 Subject: [PATCH 43/86] save checkpoint only when it was trained --- examples/baby-llama/baby-llama-text.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index a21403a7733fe..84ef911f849d4 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -2089,7 +2089,9 @@ int main(int argc, char ** argv) { ggml_free(ctx0); } - save_checkpoint(&model, opt, fn_chkpt_out); + if (n_examples > 0) { + save_checkpoint(&model, opt, fn_chkpt_out); + } { int n_gen = 1024; From d3acbf644e96fb5dd18d2bc9f4dd119c732a8f17 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 22 May 2023 20:53:57 +0200 Subject: [PATCH 44/86] simplify code --- examples/baby-llama/baby-llama-text.cpp | 190 +++++++++++++----------- 1 file changed, 106 insertions(+), 84 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 84ef911f849d4..9a193b81dd1b5 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1144,9 +1144,34 @@ struct ggml_tensor * forward_batch_wo_cache( return inpL; } +void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *ptr = value; +} + +void set_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, float value) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *ptr = value; +} + +void set_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int32_t value) { + int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *ptr = value; +} + +float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + return *ptr; +} + +int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { + int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + return *ptr; +} + void print_row(struct ggml_tensor * probs, int i) { for (int k = 0; k < probs->ne[0]; ++k) { - float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); + float p = get_f32_2d(probs, k, i); printf(" %.2f", p); } printf("\n"); @@ -1156,7 +1181,7 @@ void print_matrix(struct ggml_tensor * probs) { assert(probs->n_dims == 2); for (int i = 0; i < probs->ne[1]; ++i) { for (int k = 0; k < probs->ne[0]; ++k) { - float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); + float p = get_f32_2d(probs, k, i); printf(" %.2f", p); } printf("\n"); @@ -1179,52 +1204,30 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * 
tokens) for (int i1=0; i1ne[1]; ++i1) { int num_newline = 0; for (int i0=0; i0ne[0]; ++i0) { - int token = ggml_get_i32_1d(tokens, i0 + i1*tokens->ne[0]); - bool isnl = (token == llama_token_nl()); - if (isnl) { - ++num_newline; - } - if (isnl) { - if (num_newline < 2) { - print_token(ctx, token); - } else { - printf("\\n"); - } - } else { - print_token(ctx, token); - } + int token = get_i32_2d(tokens, i0, i1); + print_token(ctx, token); + // bool isnl = (token == llama_token_nl()); + // if (isnl) { + // ++num_newline; + // } + // if (isnl) { + // if (num_newline < 2) { + // print_token(ctx, token); + // } else { + // printf("\\n"); + // } + // } else { + // print_token(ctx, token); + // } } printf("\n--\n"); } } -void set_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, float value) { - float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *ptr = value; -} - -void set_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int32_t value) { - int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *ptr = value; -} - -float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { - float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - return *ptr; -} - -int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { - int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - return *ptr; -} - void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { int n_tokens = tokens_input->ne[0]; int n_vocab = target_logits->ne[0]; - const float eps = 1e-6f; - const float target_prob = 1.0f; - int sample = train_samples[example_id % n_train_samples]; GGML_ASSERT(sample+n_tokens-1 < n_train_data); @@ -1241,38 +1244,42 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons } } -void get_example_targets_batch(struct ggml_context * ctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { +void get_example_targets_batch(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { GGML_ASSERT(tokens_input->n_dims == 2); GGML_ASSERT(target_logits->n_dims == 3); GGML_ASSERT(target_probs->n_dims == 3); + int n_vocab = target_logits->ne[0]; int n_tokens = tokens_input->ne[0]; int n_batch = tokens_input->ne[1]; GGML_ASSERT(n_tokens == target_logits->ne[1]); GGML_ASSERT(n_batch == target_logits->ne[2]); + GGML_ASSERT(n_vocab == target_probs->ne[0]); GGML_ASSERT(n_tokens == target_probs->ne[1]); GGML_ASSERT(n_batch == target_probs->ne[2]); + ggml_set_f32(target_logits, -1.0f/n_vocab); + ggml_set_f32(target_probs, 0.0f); for (int k=0; kne[0], - k*tokens_input->nb[1]); - struct ggml_tensor * target_logits_k = ggml_view_2d(ctx, - target_logits, - target_logits->ne[0], - target_logits->ne[1], - target_logits->nb[1], - k*target_logits->nb[2]); - - struct ggml_tensor * target_probs_k = ggml_view_2d(ctx, - target_probs, - target_probs->ne[0], - target_probs->ne[1], 
- target_probs->nb[1], - k*target_probs->nb[2]); - - get_example_targets(train_samples, n_train_samples, train_data, n_train_data, - example_id*n_batch + k, tokens_input_k, target_logits_k, target_probs_k); + // printf("%s: batch %d\n", __func__, k); + int sample = train_samples[(example_id*n_batch + k) % n_train_samples]; + GGML_ASSERT(sample+n_tokens-1 < n_train_data); + + set_i32_2d(tokens_input, 0, k, llama_token_bos()); + for (int i=1; i= 0) { out.resize(n_tokens); } + bool verify = false; + if (verify) { + const char * in = buf.data(); + const char * end = buf.data() + buf.size(); + for (int i=0; i < out.size(); ++i) { + const char * s = llama_token_to_str(lctx, out[i]); + int len = strlen(s); + if (in >= end) { + printf("%s: unexpected end of original text.\n", __func__); + break; + } + const bool matches = (strncmp(in, s, len) == 0); + if (matches) { + in += len; + } else { + printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s); + } + } + } + return n_tokens; } @@ -1841,9 +1867,9 @@ float cosine_decay_restart(int decay_steps, const float alpha, int step, float r } int main(int argc, char ** argv) { - const char * default_model = "ggml-vic7b-uncensored-q4_0.bin"; - const char * default_train = "shakespeare.txt"; - const char * default_chkpt_in = "checkpoint.bin"; + const char * default_model = "ggml-vic7b-uncensored-q4_0.bin"; + const char * default_train = "shakespeare.txt"; + const char * default_chkpt_in = "checkpoint.bin"; const char * default_chkpt_out = "checkpoint.bin"; const char * default_argv[5] = {argv[0], default_model, default_train, default_chkpt_in, default_chkpt_out}; @@ -1890,6 +1916,7 @@ int main(int argc, char ** argv) { ++token_noccurs[train_tokens[i]]; token_notavail[train_tokens[i]] = false; } + std::vector token_freq; token_freq.resize(model.hparams.n_vocab, 0); int n_unique_tokens = 0; @@ -1901,10 +1928,9 @@ int main(int argc, char ** argv) { struct my_llama_kv_cache kv_self; - int n_batch = 32; struct ggml_init_params lcparams; - lcparams.mem_size = 1024ll*1024ll*1024ll*8ll; + lcparams.mem_size = 1024ll*1024ll*1024ll*2ll; lcparams.mem_buffer = NULL; lcparams.no_alloc = false; @@ -1913,15 +1939,21 @@ int main(int argc, char ** argv) { my_llama_sampler sampler; + int n_threads = 6; + int n_batch = 32; + int n_examples = 32; - int n_threads = 6; - + bool samples_start_after_nl = false; bool use_adam = true; - int warmup = 100; - int cos_decay_steps = 1000; + int warmup = 100; + int cos_decay_steps = 1000; float cos_decay_restart = 1.1f; - float cos_decay_alpha = 0.0f; + float cos_decay_alpha = 0.0f; + + + int n_tokens = model.hparams.n_ctx; + int n_vocab = model.hparams.n_vocab; struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context)); memset(opt, 0, sizeof(struct ggml_opt_context)); @@ -1965,12 +1997,7 @@ int main(int argc, char ** argv) { size_t compute_size = 1024ll*1024ll*1024ll*32ll; uint8_t * compute_addr = new uint8_t[compute_size]; - - int n_examples = 256; - int n_tokens = model.hparams.n_ctx; - int n_vocab = model.hparams.n_vocab; - bool samples_start_after_nl = false; std::vector train_samples; train_samples.push_back(0); @@ -2012,18 +2039,14 @@ int main(int argc, char ** argv) { ggml_cgraph gf = {}; gf.n_threads = n_threads; - get_example_targets_batch(ctx0, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); + get_example_targets_batch(lctx, train_samples.data(), 
train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); struct ggml_tensor * logits = (n_past == 0) ? forward_batch_wo_cache(&model, ctx0, &gf, tokens_input, n_tokens, n_batch) : forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch); - // struct ggml_tensor * se = square_error_loss(ctx0, logits, target_logits); - struct ggml_tensor * ce = cross_entropy_loss(ctx0, logits, target_probs); - // struct ggml_tensor * e = ggml_add(ctx0, se, ce); - struct ggml_tensor * e = ce; - // struct ggml_tensor * e = se; + struct ggml_tensor * e = cross_entropy_loss(ctx0, logits, target_probs); ggml_build_forward_expand(&gf, e); ggml_graph_compute(ctx0, &gf); @@ -2043,9 +2066,8 @@ int main(int argc, char ** argv) { size_t used_mem_after_opt = ggml_used_mem(ctx0); model.train_its = opt->iter; - // model.train_its += use_adam ? opt_params_adam.adam.n_iter : opt_params_lbfgs.lbfgs.n_iter; model.train_samples += n_batch; - model.train_tokens += n_batch * n_tokens; + model.train_tokens += n_batch * n_tokens; ggml_build_forward_expand(&gf, e); ggml_graph_compute(ctx0, &gf); From 6d40cc3a44768d71b1b7f978012c39d9c4ed5186 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 22 May 2023 20:56:35 +0200 Subject: [PATCH 45/86] remove trailing whitespace --- examples/baby-llama/baby-llama-text.cpp | 56 ++++++++++++------------- ggml.c | 4 +- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 9a193b81dd1b5..cf7fdbb0128b4 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -270,7 +270,7 @@ void init_model(struct my_llama_model * model) { layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str()); - + ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str()); ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str()); ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str()); @@ -1019,7 +1019,7 @@ struct ggml_tensor * forward_batch_wo_cache( // Vcur shape [N, n_batch, n_embd/n_head, n_head] // V shape [N, n_embd/n_head, n_head, n_batch] - struct ggml_tensor * V = + struct ggml_tensor * V = ggml_permute(ctx0, Vcur, 0, 3, 1, 2); @@ -1430,7 +1430,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto out.resize(buf.size()); int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false); - if (n_tokens >= 0) { + if (n_tokens >= 0) { out.resize(n_tokens); } @@ -1470,7 +1470,7 @@ void shuffle_ints(int * begin, int * end) { for (int i=0; in_ctx); llama_sample_repetition_penalty( - ctx, + ctx, candidates_p, last_tokens + n_last_tokens - n_last, n_last, params.repeat_penalty); llama_sample_frequency_and_presence_penalties( - ctx, + ctx, candidates_p, last_tokens + n_last_tokens - n_last, - n_last, - params.alpha_frequency, + n_last, + params.alpha_frequency, params.alpha_presence); if (!params.penalize_nl) { @@ -1572,7 +1572,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam llama_sample_top_k (ctx, candidates_p, params.top_k, 1); llama_sample_tail_free (ctx, candidates_p, params.tfs_z, 1); llama_sample_typical (ctx, candidates_p, params.typical_p, 1); - + llama_sample_top_p (ctx, candidates_p, params.top_p, 1); llama_sample_temperature (ctx, candidates_p, params.temp); token = llama_sample_token(ctx, 
candidates_p); @@ -1809,7 +1809,7 @@ bool load_checkpoint(struct my_llama_model * model, struct ggml_opt_context * op model->hparams.n_rot = file.read_u32(); print_params(&model->hparams); } - + if (init) { init_model(model); } @@ -1872,7 +1872,7 @@ int main(int argc, char ** argv) { const char * default_chkpt_in = "checkpoint.bin"; const char * default_chkpt_out = "checkpoint.bin"; const char * default_argv[5] = {argv[0], default_model, default_train, default_chkpt_in, default_chkpt_out}; - + if (argc < 5) { fprintf(stderr, "usage: %s model training_data chkpt_in chkpt_out\n", argv[0]); //return 1; @@ -1979,13 +1979,13 @@ int main(int argc, char ** argv) { printf("%s: init model\n", __func__); bool existed = load_checkpoint(&model, opt, fn_chkpt_in, true); set_param_model(&model); - + opt->iter = model.train_its; printf("%s: opt iter %d\n", __func__, opt->iter); bool from_scratch = !existed; - if (from_scratch) { - randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f); + if (from_scratch) { + randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f); } init_kv_cache(&kv_self, &model, 1); @@ -2041,8 +2041,8 @@ int main(int argc, char ** argv) { get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); - struct ggml_tensor * logits = - (n_past == 0) + struct ggml_tensor * logits = + (n_past == 0) ? forward_batch_wo_cache(&model, ctx0, &gf, tokens_input, n_tokens, n_batch) : forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch); @@ -2054,9 +2054,9 @@ int main(int argc, char ** argv) { size_t used_mem_before_opt = ggml_used_mem(ctx0); float error_before_opt = ggml_get_f32_1d(e, 0); - - opt->params.adam.sched = (opt->iter < warmup) - ? (float) opt->iter / (float) warmup + + opt->params.adam.sched = (opt->iter < warmup) + ? 
(float) opt->iter / (float) warmup : cosine_decay_restart(cos_decay_steps, cos_decay_alpha, opt->iter - warmup, cos_decay_restart); printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); @@ -2088,9 +2088,9 @@ int main(int argc, char ** argv) { for (int i=0; idata + i*logits->nb[2] + k*logits->nb[1]), - (llama_token *) ((char *) tokens_input->data + i*tokens_input->nb[1]), + int32_t token = sample(&sampler, + (float *) ((char *) logits->data + i*logits->nb[2] + k*logits->nb[1]), + (llama_token *) ((char *) tokens_input->data + i*tokens_input->nb[1]), k); * ((int32_t *) ((char *) after_opt_best_samples->data + i*after_opt_best_samples->nb[1] + k*after_opt_best_samples->nb[0])) = token; } @@ -2118,7 +2118,7 @@ int main(int argc, char ** argv) { { int n_gen = 1024; int sample_ctx = n_tokens - n_tokens/8; - + sampler.params.temp = 0.2; sampler.params.repeat_penalty = 1.1; sampler.params.mirostat = 2; @@ -2161,9 +2161,9 @@ int main(int argc, char ** argv) { struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx); // set_logits_masked(logits, token_notavail, -1e9); - int token = sample(&sampler, - (float *) ((char *) logits->data + (sample_ctx-1)*logits->nb[1]), - (llama_token *) tokens_input->data, + int token = sample(&sampler, + (float *) ((char *) logits->data + (sample_ctx-1)*logits->nb[1]), + (llama_token *) tokens_input->data, sample_ctx-1); //int token = ggml_get_i32_1d(best_samples, sample_ctx-1); @@ -2175,7 +2175,7 @@ int main(int argc, char ** argv) { ggml_set_i32_1d(tokens_input, sample_ctx-1, token); ggml_free(ctx0); - } + } } free(compute_addr); diff --git a/ggml.c b/ggml.c index cfc9bb455aec2..1ff5b97c25bce 100644 --- a/ggml.c +++ b/ggml.c @@ -9940,7 +9940,7 @@ static void ggml_compute_forward_out_prod_f32( const int64_t i3 = ir/(ne2*ne1); const int64_t i2 = (ir - i3*ne2*ne1)/ne1; const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); - + const int64_t i02 = i2; const int64_t i03 = i3; @@ -15296,7 +15296,7 @@ enum ggml_opt_result ggml_opt_resume( // build forward + backward compute graphs struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0)); struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 
1 : 0)); - + struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; From c47df098420606f5f85947f886b3b05be3fb9cd7 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 28 May 2023 17:32:01 +0200 Subject: [PATCH 46/86] simplify backward pass for SQRT --- ggml.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml.c b/ggml.c index 1ff5b97c25bce..1a964c5b263af 100644 --- a/ggml.c +++ b/ggml.c @@ -13063,11 +13063,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_impl(ctx, src0->grad, - ggml_mul(ctx, - tensor->grad, // this was not catched by test_grad because in test_grad tensor->grad is 1 + ggml_scale(ctx, ggml_div(ctx, - ggml_repeat(ctx, ggml_new_f32(ctx, 0.5f), tensor), - tensor)), + tensor->grad, + tensor), + ggml_new_f32(ctx, 0.5f)), inplace); } } break; From 05cb629c8efccb06166fd801fe2cad870fa80350 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 28 May 2023 18:00:17 +0200 Subject: [PATCH 47/86] replace inefficient repeat backward pass with dedicated repeat_back operation --- ggml.c | 183 +++++++++++++++++++++++++++++++++++++++++++++------------ ggml.h | 6 ++ 2 files changed, 150 insertions(+), 39 deletions(-) diff --git a/ggml.c b/ggml.c index 1a964c5b263af..0571777d19159 100644 --- a/ggml.c +++ b/ggml.c @@ -3297,6 +3297,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "SUM_ROWS", "MEAN", "REPEAT", + "REPEAT_BACK", "ABS", "SGN", "NEG", @@ -3340,7 +3341,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "MAP_BINARY", }; -static_assert(GGML_OP_COUNT == 52, "GGML_OP_COUNT != 52"); +static_assert(GGML_OP_COUNT == 53, "GGML_OP_COUNT != 53"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3359,6 +3360,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "Σx_k", "Σx/n", "repeat(x)", + "repeat_back(x)", "abs(x)", "sgn(x)", "-x", @@ -3402,7 +3404,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "f(x,y)", }; -static_assert(GGML_OP_COUNT == 52, "GGML_OP_COUNT != 52"); +static_assert(GGML_OP_COUNT == 53, "GGML_OP_COUNT != 53"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -4790,6 +4792,34 @@ struct ggml_tensor * ggml_repeat( return result; } +// ggml_repeat_back + +struct ggml_tensor * ggml_repeat_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_repeat(b, a)); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + if (ggml_are_same_shape(a, b) && !is_node) { + return a; + } + + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); + + result->op = GGML_OP_REPEAT_BACK; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + // ggml_abs struct ggml_tensor * ggml_abs_impl( @@ -8430,6 +8460,99 @@ static void ggml_compute_forward_repeat( } } +// ggml_compute_forward_repeat_back + +static void ggml_compute_forward_repeat_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_can_repeat(dst, src0)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + // guaranteed to be an integer due to the check in ggml_can_repeat + const int nr0 = (int)(ne00/ne0); + const int nr1 = (int)(ne01/ne1); + const int nr2 = (int)(ne02/ne2); + const int nr3 = (int)(ne03/ne3); + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (ggml_is_contiguous(dst)) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + } else { + for (int k3 = 0; k3 < ne3; k3++) { + for (int k2 = 0; k2 < ne2; k2++) { + for (int k1 = 0; k1 < ne1; k1++) { + ggml_vec_set_f32(ne0, + (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3), + 0); + } + } + } + } + + // TODO: maybe this is not optimal? 
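+    // sum the contributions of all repeated tiles in src0 into the corresponding cells of dst (the reverse of ggml_repeat)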
+ for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne3; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne2; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne1; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_vec_acc_f32(ne0, + (float *) ((char *) dst->data + ( k3)*nb3 + ( k2)*nb2 + ( k1)*nb1), + (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00)); + } + } + } + } + } + } + } +} + +static void ggml_compute_forward_repeat_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_repeat_back_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_abs static void ggml_compute_forward_abs_f32( @@ -12770,6 +12893,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_repeat(params, tensor->src0, tensor); } break; + case GGML_OP_REPEAT_BACK: + { + ggml_compute_forward_repeat_back(params, tensor->src0, tensor); + } break; case GGML_OP_ABS: { ggml_compute_forward_abs(params, tensor->src0, tensor); @@ -13113,43 +13240,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - GGML_ASSERT(src0->n_dims == 1 || src0->n_dims == 2); - const int nc = tensor->ne[0]; - const int nr = tensor->ne[1]; - const int nc0 = src0->ne[0]; - const int nr0 = src0->ne[1]; - const int ncr = nc/nc0; // guaranteed to be an integer due to the check in ggml_can_repeat - const int nrr = nr/nr0; // guaranteed to be an integer due to the check in ggml_can_repeat - // tensor->grad [nc,nr,1,1] - // reshape [nc0,nc/nc0,nr0,nr/nr0] - // permute [nc0,nr0,nc/nc0,nr/nr0] - // substitute [nc0,nr0,ncr,nrr] - // reshape [nc0*nr0,ncr*nrr,1,1] - // transpose [ncr*nrr,nc0*nr0,1,1] - // sum rows [1,nc0*nr0,1,1] - // transpose [nc0*nr0,1,1] - // reshape [nc0,nr0,1,1] reshape_1d or reshape_2d - // add to src0->grad - - int64_t ne[4] = {nc0,ncr,nr0,nrr}; - - struct ggml_tensor* F00 = tensor->grad; - struct ggml_tensor* F01 = ggml_reshape (ctx, F00, ggml_new_tensor(ctx,tensor->grad->type,4,ne)); - struct ggml_tensor* F02 = ggml_permute (ctx, F01, 0,2,1,3); - struct ggml_tensor* F03 = ggml_cont (ctx, F02); - struct ggml_tensor* F04 = ggml_reshape_2d(ctx, F03, nc0*nr0, ncr*nrr); - struct ggml_tensor* F05 = ggml_transpose (ctx, F04); - struct ggml_tensor* F06 = ggml_cont (ctx, F05); - struct ggml_tensor* F07 = ggml_sum_rows (ctx, F06); - struct ggml_tensor* F08 = ggml_transpose (ctx, F07); - struct ggml_tensor* F09 = ggml_cont (ctx, F08); - struct ggml_tensor* F10 = ggml_reshape (ctx, F09, src0->grad); - - src0->grad = - ggml_add_impl(ctx, - src0->grad, - F10, - inplace); + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_repeat_back(ctx, tensor->grad, src0->grad), + inplace); + } + } break; + case GGML_OP_REPEAT_BACK: + { + if (src0->grad) { + // TODO: test this + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_repeat(ctx, tensor->grad, src0->grad), + inplace); } } break; case GGML_OP_ABS: @@ -13941,6 +14045,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_SUM_ROWS: case GGML_OP_MEAN: case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: case GGML_OP_ABS: case GGML_OP_SGN: case GGML_OP_NEG: diff --git a/ggml.h b/ggml.h index 64de9eb3ea76f..711d34e78a25a 100644 --- 
a/ggml.h +++ b/ggml.h @@ -279,6 +279,7 @@ extern "C" { GGML_OP_SUM_ROWS, GGML_OP_MEAN, GGML_OP_REPEAT, + GGML_OP_REPEAT_BACK, GGML_OP_ABS, GGML_OP_SGN, GGML_OP_NEG, @@ -596,6 +597,11 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_repeat_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_abs( struct ggml_context * ctx, struct ggml_tensor * a); From 71aaf8dedf0e5e4e427e4251a041d2fe9e4d0656 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 28 May 2023 21:57:38 +0200 Subject: [PATCH 48/86] add ggml_cross_entropy_loss with backward pass for faster training cross entropy loss can also be implemented using softmax and log, but as dedicated operation it is faster and especially avoids unnecessary memory overhead. --- ggml.c | 377 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- ggml.h | 16 +++ 2 files changed, 391 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 0571777d19159..01604ec95ec50 100644 --- a/ggml.c +++ b/ggml.c @@ -3339,9 +3339,12 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "MAP_UNARY", "MAP_BINARY", + + "CROSS_ENTROPY_LOSS", + "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 53, "GGML_OP_COUNT != 53"); +static_assert(GGML_OP_COUNT == 55, "GGML_OP_COUNT != 55"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3402,9 +3405,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "f(x)", "f(x,y)", + + "cross_entropy_loss(x,y)", + "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 53, "GGML_OP_COUNT != 53"); +static_assert(GGML_OP_COUNT == 55, "GGML_OP_COUNT != 55"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -6347,6 +6353,50 @@ struct ggml_tensor * ggml_map_binary_inplace_f32( return ggml_map_binary_impl_f32(ctx, a, b, fun, true); } +// ggml_cross_entropy_loss + +struct ggml_tensor * ggml_cross_entropy_loss( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1); + + result->op = GGML_OP_CROSS_ENTROPY_LOSS; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +// ggml_cross_entropy_loss_back + +struct ggml_tensor * ggml_cross_entropy_loss_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + GGML_ASSERT(ggml_is_scalar(c)); + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK; + result->grad = NULL; + result->src0 = a; + result->src1 = b; + result->opt[0] = c; + + return result; +} + //////////////////////////////////////////////////////////////////////////////// void ggml_set_param( @@ -12831,6 +12881,287 @@ static void ggml_compute_forward_map_binary( } } +// ggml_compute_forward_cross_entropy_loss + +static void ggml_compute_forward_cross_entropy_loss_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_scalar(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, src1)); + + const int ith = params->ith; + const int nth = params->nth; + + float * sums = (float *) params->wdata; + + // TODO: handle transposed/permuted matrices + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + if (params->type == GGML_TASK_INIT) { + if (ith == 0) { + memset(sums, 0, sizeof(float) * (nth + nth * nc)); + } + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + if (ith == 0) { + float * dp = (float *) dst->data; + ggml_vec_sum_f32(nth, dp, sums); + dp[0] *= -1.0f; + } + return; + } + + const float eps = 1e-9f; + + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); + float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); + float * st = (float *) params->wdata + nth + ith*nc; + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(s0[i])); + assert(!isnan(s1[i])); + } +#endif + // soft_max + ggml_float sum = 0.0; + { + float max = -INFINITY; + ggml_vec_max_f32(nc, &max, s0); + + uint16_t scvt; + for (int i = 0; i < nc; i++) { + if (s0[i] == -INFINITY) { + st[i] = 0.0f; + } else { + // const float val = (s0[i] == -INFINITY) ? 
0.0 : exp(s0[i] - max); + ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); + memcpy(&scvt, &s, sizeof(scvt)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); + sum += (ggml_float)val; + st[i] = val; + } + } + + assert(sum > 0.0); + sum = 1.0/sum; + } + // avoid log(0) by rescaling from [0..1] to [eps..1] + sum = sum * (1.0f - eps); + ggml_vec_scale_f32(nc, st, sum); + ggml_vec_add1_f32(nc, st, st, eps); + ggml_vec_log_f32(nc, st, st); + ggml_vec_mul_f32(nc, st, st, s1); + + ggml_vec_sum_f32(nc, sums + ith, st); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(st[i])); + assert(!isinf(st[i])); + } +#endif + } + +} + +static void ggml_compute_forward_cross_entropy_loss( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_cross_entropy_loss_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_cross_entropy_loss_back + +static void ggml_compute_forward_cross_entropy_loss_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(opt0)); + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + const int64_t ith = params->ith; + const int64_t nth = params->nth; + + float * sums = (float *) params->wdata; + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const float eps = 1e-9f; + + // TODO: handle transposed/permuted matrices + const int64_t nc = src0->ne[0]; + const int64_t nr = ggml_nrows(src0); + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + float * d = (float *) opt0->data; + + for (int64_t i1 = ir0; i1 < ir1; i1++) { + float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]); + float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); + float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); + float * sm = (float *) params->wdata + ith*nc; + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(s0[i])); + assert(!isnan(s1[i])); + } +#endif + // step by step explanation: + { + // forward pass with annotated gradients from backward pass + // (built by going in reverse operation order, adding to gradients of current operation args) + // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum + // from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1])) + // ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps) + // ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3] + // ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3 + // ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1 + // ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]] + // ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel] + + // substitute into grad[st1], because we can reuse softmax_back 
from this point on + // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps)) + // postorder: + // grad[st1] := softmax(s0) + // grad[st1] := grad[st1]*(1.0 - eps) + // grad[st1] := grad[st1] + eps + // grad[st1] := s1 / grad[st1] + // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel] + + // src0 gradients by going through softmax_back + // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1])) + // from softmax_back: + // dxk = yk * (dyk - dot(y, dy)) + // dot_y_dy := dot(y, dy) + // dx := dy + // dx := dx - dot_y_dy + // dx := dx * y + // postorder: + // dot_st1_dst1 := dot(st1, grad[st1]) + // grad[s0] := grad[st1] + // grad[s0] := grad[s0] - dot_st1_dst1 + // grad[s0] := grad[s0] * st1 + + // prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1] + // sm := softmax(s0) + // grad[s0] := sm*(1.0 - eps) + // grad[s0] := grad[s0] + eps + // grad[s0] := s1 / grad[s0] + // grad[s0] := grad[s0]*(1.0-eps)*-grad[cel] + // dot_st1_dst1 := dot(sm, grad[s0]) + // grad[s0] := grad[s0] - dot_st1_dst1 + // grad[s0] := grad[s0] * sm + } + + // soft_max + ggml_float sum = 0.0; + { + float max = -INFINITY; + ggml_vec_max_f32(nc, &max, s0); + + uint16_t scvt; + for (int i = 0; i < nc; i++) { + if (s0[i] == -INFINITY) { + sm[i] = 0.0f; + } else { + // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max); + ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); + memcpy(&scvt, &s, sizeof(scvt)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); + sum += (ggml_float)val; + sm[i] = val; + } + } + + assert(sum > 0.0); + sum = 1.0/sum; + } + + float dot_st1_dst1 = 0; + ggml_vec_scale_f32(nc, sm, sum); + ggml_vec_cpy_f32 (nc, ds0, sm); + ggml_vec_scale_f32(nc, ds0, (1.0 - eps)); + ggml_vec_add1_f32 (nc, ds0, ds0, eps); + ggml_vec_div_f32 (nc, ds0, s1, ds0); + ggml_vec_scale_f32(nc, ds0, -(1.0 - eps)*d[0]); + ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0); + ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1); + ggml_vec_mul_f32 (nc, ds0, ds0, sm); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(sm[i])); + assert(!isinf(sm[i])); + assert(!isnan(ds0[i])); + assert(!isinf(ds0[i])); + } +#endif + } +} + +static void ggml_compute_forward_cross_entropy_loss_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_cross_entropy_loss_back_f32(params, src0, src1, opt0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + + ///////////////////////////////// static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { @@ -13052,6 +13383,16 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun); } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor); + } + break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + ggml_compute_forward_cross_entropy_loss_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + } + break; case GGML_OP_NONE: { // nop @@ -13677,6 +14018,22 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // not supported } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, 
+ src0->grad, + ggml_cross_entropy_loss_back(ctx, + src0, + src1, + tensor->grad), + inplace); + } + } break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + GGML_ASSERT(false); // not supported + } break; case GGML_OP_NONE: { // nop @@ -14225,6 +14582,22 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { node->n_tasks = 1; } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + node->n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks); + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + node->n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks; + + work_size = MAX(work_size, cur); + } break; case GGML_OP_NONE: { node->n_tasks = 1; diff --git a/ggml.h b/ggml.h index 711d34e78a25a..ba60588d6b521 100644 --- a/ggml.h +++ b/ggml.h @@ -322,6 +322,9 @@ extern "C" { GGML_OP_MAP_UNARY, GGML_OP_MAP_BINARY, + GGML_OP_CROSS_ENTROPY_LOSS, + GGML_OP_CROSS_ENTROPY_LOSS_BACK, + GGML_OP_COUNT, }; @@ -972,6 +975,19 @@ extern "C" { struct ggml_tensor * b, ggml_binary_op_f32_t fun); + // loss function + + GGML_API struct ggml_tensor * ggml_cross_entropy_loss( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c); + // // automatic differentiation // From f056a04a80127b1d45d91335ebd814b5c7a18a73 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 28 May 2023 21:59:17 +0200 Subject: [PATCH 49/86] add tests for cross_entropy_loss backward pass finite differences regularly results in estimated gradient of zero, despite the backward pass giving non zero gradient. _probably_ the finite differences fails due to numerical issues --- tests/test-grad0.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index ec5059220078d..b7d68cad9fd28 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -1090,6 +1090,25 @@ int main(int argc, const char ** argv) { } } + // cross_entropy_loss + { + const int nargs = 1; + + int64_t ne2[4]; + get_random_dims(ne2, 4); + + for (int ndims = 1; ndims <= 3; ++ndims) { + x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f); + x[1] = get_random_tensor(ctx0, ndims, ne2, 0.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1])); + + check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-1f, 1e-2f, INFINITY); + // finite differences regularly fails! 
+            }
+        }
+
         // rope
         {
             const int nargs = 1;

From 1fbd19abe162d26fa51e16d607b4ed2f3b43109d Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sun, 28 May 2023 22:00:26 +0200
Subject: [PATCH 50/86] use ggml_cross_entropy_loss in text training example

---
 examples/baby-llama/baby-llama-text.cpp | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp
index cf7fdbb0128b4..22f4b56a33043 100644
--- a/examples/baby-llama/baby-llama-text.cpp
+++ b/examples/baby-llama/baby-llama-text.cpp
@@ -1237,7 +1237,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
 for (int i=1; i
Date: Sun, 28 May 2023 22:00:56 +0200
Subject: [PATCH 51/86] remove trailing whitespace

---
 ggml.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml.c b/ggml.c
index 01604ec95ec50..bdd29ac50f389 100644
--- a/ggml.c
+++ b/ggml.c
@@ -8559,8 +8559,8 @@ static void ggml_compute_forward_repeat_back_f32(
     for (int k3 = 0; k3 < ne3; k3++) {
         for (int k2 = 0; k2 < ne2; k2++) {
             for (int k1 = 0; k1 < ne1; k1++) {
-                ggml_vec_set_f32(ne0, 
-                    (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3), 
+                ggml_vec_set_f32(ne0,
+                    (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3),
                     0);
             }
         }

From 89475fb320168e0a82f19e74285748f843106242 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sun, 28 May 2023 22:40:58 +0200
Subject: [PATCH 52/86] slightly improve how cross entropy loss is computed

btw: directly implemented cross entropy loss seems to have way lower
magnitudes than when implemented with softmax and log.
probably the input to log gets closer to zero due to float numerics.
maybe the multiplication by (1.0-eps)/sum is more accurate..
---
 ggml.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml.c b/ggml.c
index bdd29ac50f389..b75d55c3d6a72 100644
--- a/ggml.c
+++ b/ggml.c
@@ -12961,10 +12961,10 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
             }
 
             assert(sum > 0.0);
-            sum = 1.0/sum;
+            // sum = 1.0/sum;
         }
         // avoid log(0) by rescaling from [0..1] to [eps..1]
-        sum = sum * (1.0f - eps);
+        sum = (1.0f - eps) / sum;
         ggml_vec_scale_f32(nc, st, sum);
         ggml_vec_add1_f32(nc, st, st, eps);
         ggml_vec_log_f32(nc, st, st);

From bf4d9b3b812bf34c72258ec841b940911c336bcb Mon Sep 17 00:00:00 2001
From: xaedes
Date: Mon, 29 May 2023 01:20:26 +0200
Subject: [PATCH 53/86] add llama_get_vocab to get the vocabulary as output parameters

---
 llama.cpp | 13 +++++++++++++
 llama.h   |  8 ++++++++
 2 files changed, 21 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index ca61a69e0bb17..3095c71738175 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2852,6 +2852,19 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
+int llama_get_vocab(
+        const struct llama_context * ctx,
+        const char * * strings,
+        float  * scores,
+        int capacity) {
+    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+    for (int i = 0; i < n; i++) {
+        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
+        scores[i]  = ctx->vocab.id_to_token[i].score;
+    }
+    return n;
+}
+
 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }
diff --git a/llama.h b/llama.h
index 21cba8cf61061..33385d0ea4192 100644
--- a/llama.h
+++ b/llama.h
@@ -172,6 +172,14 @@ extern "C" {
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);
 
+    // Get the vocabulary as output parameters.
+    // Returns number of results.
+ LLAMA_API int llama_get_vocab( + const struct llama_context * ctx, + const char * * strings, + float * scores, + int capacity); + // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row // Can be mutated in order to change the probabilities of the next token From 2da5c8cf246518806cb7aea830a0ed0ffa146ed8 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 29 May 2023 01:20:55 +0200 Subject: [PATCH 54/86] set default model.type for unknown models with few layers --- llama.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llama.cpp b/llama.cpp index 3095c71738175..a05a9fc62baa3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -905,6 +905,12 @@ static void llama_model_load_internal( case 40: model.type = e_model::MODEL_13B; break; case 60: model.type = e_model::MODEL_30B; break; case 80: model.type = e_model::MODEL_65B; break; + default: + { + if (hparams.n_layer < 32) { + model.type = e_model::MODEL_7B; + } + } break; } hparams.n_ctx = n_ctx; From 4b81c32d5bafed56e73ae50f33737190e7cb4457 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 29 May 2023 01:27:09 +0200 Subject: [PATCH 55/86] add export of training checkpoint to llama compatible model file --- examples/baby-llama/baby-llama-text.cpp | 97 +++++++++++++++++++++---- 1 file changed, 83 insertions(+), 14 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 22f4b56a33043..34a6d10511bbd 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -150,6 +150,19 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc return tensor; } +struct llama_vocab { + using id = int32_t; + using token = std::string; + + struct token_score { + token tok; + float score; + }; + + std::unordered_map token_to_id; + std::vector id_to_token; +}; + struct my_llama_hparams { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; // this is provided as user input? @@ -278,9 +291,20 @@ void init_model(struct my_llama_model * model) { ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str()); - ggml_set_name(layer.w1, (layers_i + ".feed_forward.w1.weight").c_str()); - ggml_set_name(layer.w2, (layers_i + ".feed_forward.w2.weight").c_str()); - ggml_set_name(layer.w3, (layers_i + ".feed_forward.w3.weight").c_str()); + // 'layers.10.feed_forward.w1.weight' has length of 32. + // ggml_tensor->name only has 32 characters, but we need one more for the '\0' terminator. + // ggml_set_name will set the last character to '\0', so we can only store 'layers.10.feed_forward.w1.weigh'. + // when saving llama compatible model the tensors names will miss a character. 
+ // ggml_set_name(layer.w1, (layers_i + ".feed_forward.w1.weight").c_str()); + // ggml_set_name(layer.w2, (layers_i + ".feed_forward.w2.weight").c_str()); + // ggml_set_name(layer.w3, (layers_i + ".feed_forward.w3.weight").c_str()); + + strncpy(layer.w1->name, (layers_i + ".feed_forward.w1.weight").c_str(), sizeof(layer.w1->name)); + strncpy(layer.w2->name, (layers_i + ".feed_forward.w2.weight").c_str(), sizeof(layer.w2->name)); + strncpy(layer.w3->name, (layers_i + ".feed_forward.w3.weight").c_str(), sizeof(layer.w3->name)); + layer.w1->padding[0] = 0; + layer.w2->padding[0] = 0; + layer.w3->padding[0] = 0; } } @@ -1584,13 +1608,6 @@ void set_logits_masked(struct ggml_tensor * logits, std::vector& mask, flo } } -enum llama_file_version { - LLAMA_FILE_VERSION_GGML, - LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab - LLAMA_FILE_VERSION_GGJT_V1, // added padding - LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format -}; - void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { if (tensor == NULL) { file->write_u32(0); @@ -1627,7 +1644,7 @@ void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { } std::string name = file->read_string(name_len); - GGML_ASSERT(strcmp(ggml_get_name(tensor), name.c_str()) == 0); + GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)) == 0); file->seek(-file->tell() & 31, SEEK_CUR); file->read_raw(tensor->data, ggml_nbytes(tensor)); @@ -1839,6 +1856,50 @@ bool load_checkpoint(struct my_llama_model * model, struct ggml_opt_context * op return (file.fp != NULL); } +void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, const char * filename) { + struct llama_file file(filename, "wb"); + if (file.fp == NULL) { + return; + } + + // write_magic + file.write_u32(LLAMA_FILE_MAGIC); // magic + file.write_u32(LLAMA_FILE_VERSION); // version + // write_hparams + file.write_u32(model->hparams.n_vocab); + file.write_u32(model->hparams.n_embd); + file.write_u32(model->hparams.n_mult); + file.write_u32(model->hparams.n_head); + file.write_u32(model->hparams.n_layer); + file.write_u32(model->hparams.n_rot); + file.write_u32(LLAMA_FTYPE_ALL_F32); + // write_vocab + uint32_t n_vocab = model->hparams.n_vocab; + for (uint32_t i = 0; i < n_vocab; i++) { + const auto & token_score = vocab->id_to_token.at(i); + file.write_u32((uint32_t) token_score.tok.size()); + file.write_raw(token_score.tok.data(), token_score.tok.size()); + file.write_raw(&token_score.score, sizeof(token_score.score)); + } + // write tensors + write_tensor(&file, model->tok_embeddings); + write_tensor(&file, model->norm); + write_tensor(&file, model->output); + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + auto & layer = model->layers[i]; + + write_tensor(&file, layer.attention_norm); + write_tensor(&file, layer.wq); + write_tensor(&file, layer.wk); + write_tensor(&file, layer.wv); + write_tensor(&file, layer.wo); + write_tensor(&file, layer.ffn_norm); + write_tensor(&file, layer.w1); + write_tensor(&file, layer.w2); + write_tensor(&file, layer.w3); + } +} + float cosine_decay(const int decay_steps, const float alpha, int step) { if (step > decay_steps) { step = decay_steps; @@ -1861,10 +1922,11 @@ int main(int argc, char ** argv) { const char * default_train = "shakespeare.txt"; const char * default_chkpt_in = "checkpoint.bin"; const char * default_chkpt_out = "checkpoint.bin"; - const char * default_argv[5] = {argv[0], default_model, default_train, default_chkpt_in, 
default_chkpt_out}; + const char * default_model_out = "ggml-checkpoint-f32.bin"; + const char * default_argv[6] = {argv[0], default_model, default_train, default_chkpt_in, default_chkpt_out, default_model_out}; - if (argc < 5) { - fprintf(stderr, "usage: %s model training_data chkpt_in chkpt_out\n", argv[0]); + if (argc < 6) { + fprintf(stderr, "usage: %s model training_data chkpt_in chkpt_out model_out\n", argv[0]); //return 1; } @@ -1874,6 +1936,7 @@ int main(int argc, char ** argv) { const char * fn_train = (argc >= 3) ? argv[2] : default_argv[2]; const char * fn_chkpt_in = (argc >= 4) ? argv[3] : default_argv[3]; const char * fn_chkpt_out = (argc >= 5) ? argv[4] : default_argv[4]; + const char * fn_model_out = (argc >= 6) ? argv[5] : default_argv[5]; struct llama_context_params llama_params = llama_context_default_params(); llama_params.vocab_only = true; @@ -1970,6 +2033,8 @@ int main(int argc, char ** argv) { bool existed = load_checkpoint(&model, opt, fn_chkpt_in, true); set_param_model(&model); + opt->params = use_adam ? opt_params_adam : opt_params_lbfgs; + opt->iter = model.train_its; printf("%s: opt iter %d\n", __func__, opt->iter); @@ -2105,6 +2170,10 @@ int main(int argc, char ** argv) { save_checkpoint(&model, opt, fn_chkpt_out); } + if (strlen(fn_model_out) > 0) { + save_as_llama_model(&vocab, &model, fn_model_out); + } + { int n_gen = 1024; int sample_ctx = n_tokens - n_tokens/8; From 56895e28f6630457f6a02d82feee62d05f50c134 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 29 May 2023 02:25:18 +0200 Subject: [PATCH 56/86] get vocabulary for exporting training checkpoint to llama compatible model file --- examples/baby-llama/baby-llama-text.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 34a6d10511bbd..267f44321bbef 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1943,6 +1943,25 @@ int main(int argc, char ** argv) { struct llama_context * lctx = llama_init_from_file(fn_model, llama_params); + struct llama_vocab vocab; + { + std::vector strings; + std::vector scores; + int n_vocab = llama_n_vocab(lctx); + strings.resize(n_vocab, NULL); + scores.resize(n_vocab, 0); + n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab); + GGML_ASSERT(n_vocab == llama_n_vocab(lctx)); + vocab.id_to_token.resize(n_vocab); + for (int i=0; i train_tokens; if (tokenize_file(lctx, fn_train, train_tokens) < 0) { From 22a7279ffb2c7669926ab899df57154d144d893f Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 29 May 2023 22:00:40 +0200 Subject: [PATCH 57/86] implement backward pass of flash attention --- ggml.c | 647 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- ggml.h | 9 + 2 files changed, 652 insertions(+), 4 deletions(-) diff --git a/ggml.c b/ggml.c index b75d55c3d6a72..353b42cee2679 100644 --- a/ggml.c +++ b/ggml.c @@ -3336,6 +3336,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "FLASH_ATTN", "FLASH_FF", + "FLASH_ATTN_BACK", "MAP_UNARY", "MAP_BINARY", @@ -3344,7 +3345,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 55, "GGML_OP_COUNT != 55"); +static_assert(GGML_OP_COUNT == 56, "GGML_OP_COUNT != 56"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3402,6 +3403,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "flash_attn(x)", "flash_ff(x)", + "flash_attn_back(x)", "f(x)", "f(x,y)", @@ -3410,7 
+3412,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 55, "GGML_OP_COUNT != 55"); +static_assert(GGML_OP_COUNT == 56, "GGML_OP_COUNT != 56"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -6251,7 +6253,6 @@ struct ggml_tensor * ggml_flash_ff( bool is_node = false; if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) { - GGML_ASSERT(false); // TODO: implement backward is_node = true; } @@ -6269,6 +6270,71 @@ struct ggml_tensor * ggml_flash_ff( return result; } +// ggml_flash_attn_back + +struct ggml_tensor * ggml_flash_attn_back( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * d, + bool masked) { + GGML_ASSERT(ggml_can_mul_mat(k, q)); + // TODO: check if vT can be multiplied by (k*qT) + + // d shape [D,N,ne2,ne3] + // q shape [D,N,ne2,ne3] + // k shape [D,M,ne2,ne3] + // v shape [M,D,ne2,ne3] + + const int64_t D = q->ne[0]; + const int64_t N = q->ne[1]; + const int64_t M = k->ne[1]; + const int64_t ne2 = q->ne[2]; + const int64_t ne3 = q->ne[3]; + + GGML_ASSERT(k->ne[0] == D); + GGML_ASSERT(v->ne[0] == M); + GGML_ASSERT(v->ne[1] == D); + GGML_ASSERT(d->ne[0] == D); + GGML_ASSERT(d->ne[1] == N); + GGML_ASSERT(k->ne[2] == ne2); + GGML_ASSERT(k->ne[3] == ne3); + GGML_ASSERT(v->ne[2] == ne2); + GGML_ASSERT(v->ne[3] == ne3); + GGML_ASSERT(d->ne[2] == ne2); + GGML_ASSERT(d->ne[3] == ne3); + + bool is_node = false; + + if (q->grad || k->grad || v->grad) { + // when using this operation (in backwards pass) these grads are set. + // we don't want to create (big) grad of our result, so is_node is false. + is_node = false; + } + + // store gradients of q, k and v as continuous tensors concatenated in result. + // q shape[D,N,ne2,ne3] ; k shape [D,M,ne2,ne3] ; v shape [M,D,ne2,ne3] + // gradq->data = result->data + // gradk->data = result->data + nb0*D*N*ne2*ne3 + // gradv->data = result->data + nb0*D*N*ne2*ne3 + nb0*D*M*ne2*ne3 + // note: v and gradv are actually transposed, i.e. v->ne[0] != D. + int64_t ne[4] = {D,M+N+M,ne2,ne3}; + + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + result->op = GGML_OP_FLASH_ATTN_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = q; + result->src1 = k; + result->opt[0] = v; + result->opt[1] = d; + result->opt[2] = ggml_new_i32(ctx, masked ? 
1 : 0); + + return result; +} + + // ggml_map_unary struct ggml_tensor * ggml_map_unary_impl_f32( @@ -12788,6 +12854,394 @@ static void ggml_compute_forward_flash_ff( } } +// ggml_compute_forward_flash_attn_back + +static void ggml_compute_forward_flash_attn_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const struct ggml_tensor * d, + const bool masked, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + const int64_t neq0 = q->ne[0]; + const int64_t neq1 = q->ne[1]; + const int64_t neq2 = q->ne[2]; + const int64_t neq3 = q->ne[3]; + + const int64_t nek0 = k->ne[0]; + const int64_t nek1 = k->ne[1]; + //const int64_t nek2 = k->ne[2]; + //const int64_t nek3 = k->ne[3]; + + const int64_t nev0 = v->ne[0]; + const int64_t nev1 = v->ne[1]; + //const int64_t nev2 = v->ne[2]; + //const int64_t nev3 = v->ne[3]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + //const int64_t ne2 = dst->ne[2]; + //const int64_t ne3 = dst->ne[3]; + + const int nbk0 = k->nb[0]; + const int nbk1 = k->nb[1]; + const int nbk2 = k->nb[2]; + const int nbk3 = k->nb[3]; + + const int nbq0 = q->nb[0]; + const int nbq1 = q->nb[1]; + const int nbq2 = q->nb[2]; + const int nbq3 = q->nb[3]; + + const int nbv0 = v->nb[0]; + const int nbv1 = v->nb[1]; + const int nbv2 = v->nb[2]; + const int nbv3 = v->nb[3]; + + const int nbd0 = d->nb[0]; + const int nbd1 = d->nb[1]; + const int nbd2 = d->nb[2]; + const int nbd3 = d->nb[3]; + + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t D = neq0; + const int64_t N = neq1; + const int64_t P = nek1 - N; + const int64_t M = P + N; + + const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); + const int mxDM = MAX(D, Mup); + + GGML_ASSERT(ne0 == D); + GGML_ASSERT(ne1 == N); + GGML_ASSERT(P >= 0); + + GGML_ASSERT(nbq0 == sizeof(float)); + GGML_ASSERT(nbk0 == sizeof(float)); + GGML_ASSERT(nbv0 == sizeof(float)); + + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev1 == D); + + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nek1 == N + P); + GGML_ASSERT(nev1 == D); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by q rows using ggml_vec_dot_f32 + + // total rows in q + const int nr = neq1*neq2*neq3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const float scale = 1.0f/sqrtf(D); + + //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + + for (int ir = ir0; ir < ir1; ++ir) { + // q indices + const int iq3 = ir/(neq2*neq1); + const int iq2 = (ir - iq3*neq2*neq1)/neq1; + const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); + + // not sure about CACHE_LINE_SIZE_F32.. + // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset? 
+ float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32); + float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32); + + for (int i = M; i < Mup; ++i) { + S[i] = -INFINITY; + } + + for (int64_t ic = 0; ic < nek1; ++ic) { + // k indices + const int ik3 = iq3; + const int ik2 = iq2; + const int ik1 = ic; + + // S indices + const int i1 = ik1; + + ggml_vec_dot_f32(neq0, + S + i1, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } + + // scale + ggml_vec_scale_f32(nek1, S, scale); + + if (masked) { + for (int64_t i = P; i < M; i++) { + if (i > P + iq1) { + S[i] = -INFINITY; + } + } + } + + // softmax + { + float max = -INFINITY; + ggml_vec_max_f32(M, &max, S); + + ggml_float sum = 0.0; + { +#ifdef GGML_SOFT_MAX_ACCELERATE + max = -max; + vDSP_vsadd(SM, 1, &max, SM, 1, Mup); + vvexpf(SM, SM, &Mup); + ggml_vec_sum_f32(Mup, &sum, SM); +#else + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; + ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; + + for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { + float * SS = SM + i; + + for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { + if (SS[j] == -INFINITY) { + SS[j] = 0.0f; + } else { + ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); + memcpy(&scvt[j], &s, sizeof(uint16_t)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + sump[j] += (ggml_float)val; + SS[j] = val; + } + } + } + + for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { + sum += sump[i]; + } +#endif + } + + assert(sum > 0.0); + + sum = 1.0/sum; + ggml_vec_scale_f32(M, SM, sum); + + } + + // step-by-step explanation + { + // forward-process shape grads from backward process + // parallel_for iq2,iq3: + // k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,iq2,iq3] += grad[kcur] + // q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur] + // v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iq2,iq3] += grad[vcur] + // for iq1: + // kcur = k[:D,:M,iq2,iq3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur + // qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur + // vcur = v[:M,:D,iq2,iq3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 + // S0 = -Inf [D,1,1,1] + // ~S1[i] = dot(kcur[:D,i], qcur) + // S1 = qcur.T @ kcur [M,1,1,1] grad[S1] = grad[S2] * scale + // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) + // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur + // ~S5[i] = dot(vcur[:,i],S4) + // S5 = S4.T @ vcur [D,1,1,1] grad[S5] = d[:D,iq1,iq2,iq3] + // ~dst[i,iq1,iq2,iq3] = S5[i] ^ + // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,iq1,iq2,iq3] + // dst backward-/ grad[dst] = d + // + // output gradients with their dependencies: + // + // grad[kcur] = grad[S1].T @ qcur + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S4] = grad[S5] @ vcur + // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur + // grad[qcur] = grad[S1] @ kcur + // grad[vcur] = grad[S5].T @ S4 + // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 + // + // in post-order: + // + // S1 = qcur.T @ kcur + // S2 = S1 * scale + // S3 = diag_mask_inf(S2, P) + // S4 = softmax(S3) + // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[qcur] = grad[S1] @ kcur + // grad[kcur] = grad[S1].T @ qcur + // grad[vcur] = 
d[:D,iq1,iq2,iq3].T @ S4 + // + // using less variables (SM=S4): + // + // S = diag_mask_inf(qcur.T @ kcur * scale, P) + // SM = softmax(S) + // S = d[:D,iq1,iq2,iq3] @ vcur + // dot_SM_gradSM = dot(SM, S) + // S = SM * (S - dot(SM, S)) + // S = diag_mask_zero(S, P) * scale + // + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[k][:D,:M,iq2,iq3] += S.T @ qcur + // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM + } + + // S = gradSM = d[:D,iq1,iq2,iq3] @ vcur + // S = d[:D,iq1,iq2,iq3] @ vcur + // S[:M] += vcur[:,ic] * d[ic,iq1,iq2,iq3] + ggml_vec_set_f32(D, S, 0); + for (int64_t ic = 0; ic < D; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + ggml_vec_mad_f32(M, + S, + (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + *(float *) ((char *) d->data + (ic*nbd1 + i1*nbd2 + i2*nbd2 + i3*nbd3))); + } + + // S = SM * (S - dot(SM, S)) + float dot_SM_gradSM = 0; + ggml_vec_dot_f32 (M, &dot_SM_gradSM, SM, S); + ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); + ggml_vec_mul_f32 (M, S, S, SM); + + // S = diag_mask_zero(S, P) * scale + if (masked) { + for (int64_t i = P + iq1 + 1; i < M; i++) { + S[i] = 0; + } + } + ggml_vec_scale_f32(M, S, scale); + + void * grad_q = (char *) dst->data; + void * grad_k = (char *) dst->data + nb0*D*N*neq2*neq3; + void * grad_v = (char *) dst->data + nb0*D*N*neq2*neq3 + nb0*D*M*neq2*neq3; + + const size_t nbgq1 = nb0*neq0; + const size_t nbgq2 = nb0*neq0*neq1; + const size_t nbgq3 = nb0*neq0*neq1*neq2; + + const size_t nbgk1 = nb0*nek0; + const size_t nbgk2 = nb0*nek0*nek1; + const size_t nbgk3 = nb0*nek0*nek1*neq2; + + const size_t nbgv1 = nb0*nev0; + const size_t nbgv2 = nb0*nev0*nev1; + const size_t nbgv3 = nb0*nev0*nev1*neq2; + + // S shape [M,1] + // SM shape [M,1] + // kcur shape [D,M] + // qcur shape [D,1] + // vcur shape [M,D] + // + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] + // grad[q][ic,iq1,iq2,iq3] += dot(kcur[:,ic],S.T) + // grad[q][ic,iq1,iq2,iq3] += dot(k[:D,ic,iq2,iq3],S.T) + for (int64_t ic = 0; ic < M; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + ggml_vec_dot_f32(D, + (float *) ((char *) grad_q + (ic*nb0 + i1*nbgq1 + i2*nbgq2 + i3*nbgq3)), + (float *) ((char *) k->data + ( ic*nbk1 + i2*nbk2 + i3*nbk3)), + S); + } + + // grad[k][:D,:M,iq2,iq3] += S.T @ qcur + // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] + // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] + for (int64_t ic = 0; ic < M; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + ggml_vec_set_f32(D, + (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), + 0); + ggml_vec_mad_f32(D, + (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), + (float *) ((char *) q->data + (i1*nbk1 + i2*nbk2 + i3*nbk3)), + S[ic]); + } + + // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM + // grad[v][:M,ic,iq2,iq3] += d[:D,iq1,iq2,iq3].T[0,ic] * SM[:M] + // grad[v][:M,ic,iq2,iq3] += d[ic,iq1,iq2,iq3] * SM[:M] + for (int64_t ic = 0; ic < D; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + ggml_vec_set_f32(M, + (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), + 0); + ggml_vec_mad_f32(M, + (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), + SM, + *(float *) ((char *) d->data + (ic*nbd1 + i1*nbd2 + i2*nbd2 + i3*nbd3))); + } + } +} + +static void ggml_compute_forward_flash_attn_back( + const struct ggml_compute_params 
* params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const struct ggml_tensor * d, + const bool masked, + struct ggml_tensor * dst) { + switch (q->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_flash_attn_back_f32(params, q, k, v, d, masked, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_map_unary static void ggml_compute_forward_map_unary_f32( @@ -13371,6 +13825,13 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor); } break; + case GGML_OP_FLASH_ATTN_BACK: + { + int32_t t = ggml_get_i32_1d(tensor->opt[2], 0); + GGML_ASSERT(t == 0 || t == 1); + bool masked = t != 0; + ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor); + } break; case GGML_OP_MAP_UNARY: { const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data); @@ -14007,12 +14468,169 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_FLASH_ATTN: { - GGML_ASSERT(false); // not supported + struct ggml_tensor * flash_grad = NULL; + if (src0->grad || src1->grad || tensor->opt[0]->grad) { + int32_t t = ggml_get_i32_1d(tensor->opt[1], 0); + GGML_ASSERT(t == 0 || t == 1); + bool masked = t != 0; + flash_grad = + ggml_flash_attn_back(ctx, + src0->grad, + src1->grad, + tensor->opt[0]->grad, + tensor->grad, + masked); + } + + if (src0->grad) { + struct ggml_tensor * grad_q = NULL; + const size_t nb0 = flash_grad->nb[0]; + const size_t offset = 0; + switch(src0->n_dims) { + case 2: + { + grad_q = ggml_view_2d(ctx, + flash_grad, + src0->ne[0], + src0->ne[1], + nb0*src0->ne[0], + offset); + } break; + case 3: + { + grad_q = ggml_view_3d(ctx, + flash_grad, + src0->ne[0], + src0->ne[1], + src0->ne[2], + nb0*src0->ne[0], + nb0*src0->ne[0]*src0->ne[1], + offset); + } break; + case 4: + { + grad_q = ggml_view_3d(ctx, + flash_grad, + src0->ne[0], + src0->ne[1], + src0->ne[2], + src0->ne[3], + nb0*src0->ne[0], + nb0*src0->ne[0]*src0->ne[1], + nb0*src0->ne[0]*src0->ne[1]*src0->ne[2], + offset); + } break; + } + + src0->grad = ggml_add_impl(ctx, + src0->grad, + grad_q, + inplace); + } + + if (src1->grad) { + struct ggml_tensor * grad_k = NULL; + const size_t nb0 = flash_grad->nb[0]; + const size_t offset = nb0*src0->ne[0]*src0->ne[1]*src0->ne[2]*src0->ne[3]; + switch(src1->n_dims) { + case 2: + { + grad_k = ggml_view_2d(ctx, + flash_grad, + src1->ne[0], + src1->ne[1], + nb0*src1->ne[0], + offset); + } break; + case 3: + { + grad_k = ggml_view_3d(ctx, + flash_grad, + src1->ne[0], + src1->ne[1], + src1->ne[2], + nb0*src1->ne[0], + nb0*src1->ne[0]*src1->ne[1], + offset); + } break; + case 4: + { + grad_k = ggml_view_3d(ctx, + flash_grad, + src1->ne[0], + src1->ne[1], + src1->ne[2], + src1->ne[3], + nb0*src1->ne[0], + nb0*src1->ne[0]*src1->ne[1], + nb0*src1->ne[0]*src1->ne[1]*src1->ne[2], + offset); + } break; + } + + src1->grad = ggml_add_impl(ctx, + src1->grad, + grad_k, + inplace); + } + + struct ggml_tensor * opt0 = tensor->opt[0]; + + if (opt0->grad) { + struct ggml_tensor * grad_v = NULL; + const size_t nb0 = flash_grad->nb[0]; + const size_t offset = nb0*src0->ne[0]*src0->ne[1]*src0->ne[2]*src0->ne[3] + + nb0*src1->ne[0]*src1->ne[1]*src1->ne[2]*src1->ne[3]; + switch(opt0->n_dims) { + case 2: + { + grad_v = ggml_view_2d(ctx, + flash_grad, + 
opt0->ne[0], + opt0->ne[1], + nb0*opt0->ne[0], + offset); + } break; + case 3: + { + grad_v = ggml_view_3d(ctx, + flash_grad, + opt0->ne[0], + opt0->ne[1], + opt0->ne[2], + nb0*opt0->ne[0], + nb0*opt0->ne[0]*opt0->ne[1], + offset); + } break; + case 4: + { + grad_v = ggml_view_3d(ctx, + flash_grad, + opt0->ne[0], + opt0->ne[1], + opt0->ne[2], + opt0->ne[3], + nb0*opt0->ne[0], + nb0*opt0->ne[0]*opt0->ne[1], + nb0*opt0->ne[0]*opt0->ne[1]*opt0->ne[2], + offset); + } break; + } + + opt0->grad = ggml_add_impl(ctx, + opt0->grad, + grad_v, + inplace); + } } break; case GGML_OP_FLASH_FF: { GGML_ASSERT(false); // not supported } break; + case GGML_OP_FLASH_ATTN_BACK: + { + GGML_ASSERT(false); // not supported + } break; case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: { @@ -14575,6 +15193,27 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 } + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + node->n_tasks = n_threads; + + size_t cur = 0; + + const int64_t D = node->src0->ne[0]; + const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 + } + + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 + } + work_size = MAX(work_size, cur); } break; case GGML_OP_MAP_UNARY: diff --git a/ggml.h b/ggml.h index ba60588d6b521..5dc80e74beb1c 100644 --- a/ggml.h +++ b/ggml.h @@ -318,6 +318,7 @@ extern "C" { GGML_OP_FLASH_ATTN, GGML_OP_FLASH_FF, + GGML_OP_FLASH_ATTN_BACK, GGML_OP_MAP_UNARY, GGML_OP_MAP_BINARY, @@ -952,6 +953,14 @@ extern "C" { struct ggml_tensor * v, bool masked); + GGML_API struct ggml_tensor * ggml_flash_attn_back( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * d, + bool masked); + GGML_API struct ggml_tensor * ggml_flash_ff( struct ggml_context * ctx, struct ggml_tensor * a, From 38560b6d51b53c614ff934d1da4e67ad9ed96517 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 29 May 2023 23:45:58 +0200 Subject: [PATCH 58/86] bugfixes for backward pass of flash attention --- ggml.c | 463 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 241 insertions(+), 222 deletions(-) diff --git a/ggml.c b/ggml.c index 353b42cee2679..47a01cea6cc42 100644 --- a/ggml.c +++ b/ggml.c @@ -6221,7 +6221,6 @@ struct ggml_tensor * ggml_flash_attn( bool is_node = false; if (q->grad || k->grad || v->grad) { - GGML_ASSERT(false); // TODO: implement backward is_node = true; } @@ -12882,10 +12881,15 @@ static void ggml_compute_forward_flash_attn_back_f32( //const int64_t nev2 = v->ne[2]; //const int64_t nev3 = v->ne[3]; + const int64_t ned0 = d->ne[0]; + const int64_t ned1 = d->ne[1]; + //const int64_t ned2 = d->ne[2]; + //const int64_t ned3 = d->ne[3]; + const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; const int nbk0 = k->nb[0]; const int nbk1 = k->nb[1]; @@ -12923,8 
+12927,8 @@ static void ggml_compute_forward_flash_attn_back_f32( const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); const int mxDM = MAX(D, Mup); - GGML_ASSERT(ne0 == D); - GGML_ASSERT(ne1 == N); + // GGML_ASSERT(ne0 == D); + // GGML_ASSERT(ne1 == N); GGML_ASSERT(P >= 0); GGML_ASSERT(nbq0 == sizeof(float)); @@ -12934,10 +12938,12 @@ static void ggml_compute_forward_flash_attn_back_f32( GGML_ASSERT(neq0 == D); GGML_ASSERT(nek0 == D); GGML_ASSERT(nev1 == D); + GGML_ASSERT(ned0 == D); GGML_ASSERT(neq1 == N); GGML_ASSERT(nek1 == N + P); GGML_ASSERT(nev1 == D); + GGML_ASSERT(ned1 == N); // dst cannot be transposed or permuted GGML_ASSERT(nb0 == sizeof(float)); @@ -12946,6 +12952,9 @@ static void ggml_compute_forward_flash_attn_back_f32( GGML_ASSERT(nb2 <= nb3); if (params->type == GGML_TASK_INIT) { + if (ith == 0) { + memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3); + } return; } @@ -12956,7 +12965,7 @@ static void ggml_compute_forward_flash_attn_back_f32( // parallelize by q rows using ggml_vec_dot_f32 // total rows in q - const int nr = neq1*neq2*neq3; + const int nr = neq2*neq3; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -12971,253 +12980,263 @@ static void ggml_compute_forward_flash_attn_back_f32( for (int ir = ir0; ir < ir1; ++ir) { // q indices - const int iq3 = ir/(neq2*neq1); - const int iq2 = (ir - iq3*neq2*neq1)/neq1; - const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); + const int iq3 = ir/(neq2); + const int iq2 = (ir - iq3*neq2)/neq2; + for ( int iq1 = 0; iq1 < neq1; ++iq1) { - // not sure about CACHE_LINE_SIZE_F32.. - // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset? - float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32); - float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32); - for (int i = M; i < Mup; ++i) { - S[i] = -INFINITY; - } + // not sure about CACHE_LINE_SIZE_F32.. + // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset? 
+ float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32); + float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32); - for (int64_t ic = 0; ic < nek1; ++ic) { - // k indices - const int ik3 = iq3; - const int ik2 = iq2; - const int ik1 = ic; + for (int i = M; i < Mup; ++i) { + S[i] = -INFINITY; + } - // S indices - const int i1 = ik1; + for (int64_t ic = 0; ic < nek1; ++ic) { + // k indices + const int ik3 = iq3; + const int ik2 = iq2; + const int ik1 = ic; - ggml_vec_dot_f32(neq0, - S + i1, - (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); - } + // S indices + const int i1 = ik1; - // scale - ggml_vec_scale_f32(nek1, S, scale); + ggml_vec_dot_f32(neq0, + S + i1, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } - if (masked) { - for (int64_t i = P; i < M; i++) { - if (i > P + iq1) { - S[i] = -INFINITY; + // scale + ggml_vec_scale_f32(nek1, S, scale); + + if (masked) { + for (int64_t i = P; i < M; i++) { + if (i > P + iq1) { + S[i] = -INFINITY; + } } } - } - // softmax - { - float max = -INFINITY; - ggml_vec_max_f32(M, &max, S); - - ggml_float sum = 0.0; + // softmax { + float max = -INFINITY; + ggml_vec_max_f32(M, &max, S); + + ggml_float sum = 0.0; + { #ifdef GGML_SOFT_MAX_ACCELERATE - max = -max; - vDSP_vsadd(SM, 1, &max, SM, 1, Mup); - vvexpf(SM, SM, &Mup); - ggml_vec_sum_f32(Mup, &sum, SM); + max = -max; + vDSP_vsadd(SM, 1, &max, SM, 1, Mup); + vvexpf(SM, SM, &Mup); + ggml_vec_sum_f32(Mup, &sum, SM); #else - uint16_t scvt[GGML_SOFT_MAX_UNROLL]; - ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; + ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; - for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { - float * SS = SM + i; + for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { + float * SR = S + i; + float * SW = SM + i; - for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { - if (SS[j] == -INFINITY) { - SS[j] = 0.0f; - } else { - ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); - memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); - sump[j] += (ggml_float)val; - SS[j] = val; + for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { + if (SR[j] == -INFINITY) { + SW[j] = 0.0f; + } else { + ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max); + memcpy(&scvt[j], &s, sizeof(uint16_t)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + sump[j] += (ggml_float)val; + SW[j] = val; + } } } - } - for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { - sum += sump[i]; - } + for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { + sum += sump[i]; + } #endif - } + } - assert(sum > 0.0); + assert(sum > 0.0); - sum = 1.0/sum; - ggml_vec_scale_f32(M, SM, sum); + sum = 1.0/sum; + ggml_vec_scale_f32(M, SM, sum); - } + } - // step-by-step explanation - { - // forward-process shape grads from backward process - // parallel_for iq2,iq3: - // k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,iq2,iq3] += grad[kcur] - // q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur] - // v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iq2,iq3] += grad[vcur] - // for iq1: - // kcur = k[:D,:M,iq2,iq3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur - // qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur - // vcur = v[:M,:D,iq2,iq3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 - // S0 = -Inf 
[D,1,1,1] - // ~S1[i] = dot(kcur[:D,i], qcur) - // S1 = qcur.T @ kcur [M,1,1,1] grad[S1] = grad[S2] * scale - // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) - // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) - // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur - // ~S5[i] = dot(vcur[:,i],S4) - // S5 = S4.T @ vcur [D,1,1,1] grad[S5] = d[:D,iq1,iq2,iq3] - // ~dst[i,iq1,iq2,iq3] = S5[i] ^ - // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,iq1,iq2,iq3] - // dst backward-/ grad[dst] = d - // - // output gradients with their dependencies: - // - // grad[kcur] = grad[S1].T @ qcur - // grad[S1] = diag_mask_zero(grad[S3], P) * scale - // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) - // grad[S4] = grad[S5] @ vcur - // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur - // grad[qcur] = grad[S1] @ kcur - // grad[vcur] = grad[S5].T @ S4 - // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 - // - // in post-order: - // - // S1 = qcur.T @ kcur - // S2 = S1 * scale - // S3 = diag_mask_inf(S2, P) - // S4 = softmax(S3) - // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur - // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) - // grad[S1] = diag_mask_zero(grad[S3], P) * scale - // grad[qcur] = grad[S1] @ kcur - // grad[kcur] = grad[S1].T @ qcur - // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 - // - // using less variables (SM=S4): - // - // S = diag_mask_inf(qcur.T @ kcur * scale, P) - // SM = softmax(S) - // S = d[:D,iq1,iq2,iq3] @ vcur - // dot_SM_gradSM = dot(SM, S) - // S = SM * (S - dot(SM, S)) - // S = diag_mask_zero(S, P) * scale - // - // grad[q][:D,iq1,iq2,iq3] += S @ kcur - // grad[k][:D,:M,iq2,iq3] += S.T @ qcur - // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM - } + // step-by-step explanation + { + // forward-process shape grads from backward process + // parallel_for iq2,iq3: + // k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,iq2,iq3] += grad[kcur] + // q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur] + // v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iq2,iq3] += grad[vcur] + // for iq1: + // kcur = k[:D,:M,iq2,iq3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur + // qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur + // vcur = v[:M,:D,iq2,iq3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 + // S0 = -Inf [D,1,1,1] + // ~S1[i] = dot(kcur[:D,i], qcur) + // S1 = qcur.T @ kcur [M,1,1,1] grad[S1] = grad[S2] * scale + // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) + // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur + // ~S5[i] = dot(vcur[:,i],S4) + // S5 = S4.T @ vcur [D,1,1,1] grad[S5] = d[:D,iq1,iq2,iq3] + // ~dst[i,iq1,iq2,iq3] = S5[i] ^ + // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,iq1,iq2,iq3] + // dst backward-/ grad[dst] = d + // + // output gradients with their dependencies: + // + // grad[kcur] = grad[S1].T @ qcur + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S4] = grad[S5] @ vcur + // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur + // grad[qcur] = grad[S1] @ kcur + // grad[vcur] = grad[S5].T @ S4 + // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 + // + // in post-order: + // + // S1 = qcur.T @ kcur + // S2 = S1 * scale + // S3 = diag_mask_inf(S2, P) + // S4 = softmax(S3) + // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[qcur] = grad[S1] @ kcur + // grad[kcur] = 
grad[S1].T @ qcur + // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 + // + // using less variables (SM=S4): + // + // S = diag_mask_inf(qcur.T @ kcur * scale, P) + // SM = softmax(S) + // S = d[:D,iq1,iq2,iq3] @ vcur + // dot_SM_gradSM = dot(SM, S) + // S = SM * (S - dot(SM, S)) + // S = diag_mask_zero(S, P) * scale + // + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[k][:D,:M,iq2,iq3] += S.T @ qcur + // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM + } - // S = gradSM = d[:D,iq1,iq2,iq3] @ vcur - // S = d[:D,iq1,iq2,iq3] @ vcur - // S[:M] += vcur[:,ic] * d[ic,iq1,iq2,iq3] - ggml_vec_set_f32(D, S, 0); - for (int64_t ic = 0; ic < D; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; + // S = gradSM = d[:D,iq1,iq2,iq3] @ vcur + // S = d[:D,iq1,iq2,iq3] @ vcur + // S[:M] += vcur[:M,ic] * d[ic,iq1,iq2,iq3] + ggml_vec_set_f32(M, S, 0); + for (int64_t ic = 0; ic < D; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; - ggml_vec_mad_f32(M, - S, - (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), - *(float *) ((char *) d->data + (ic*nbd1 + i1*nbd2 + i2*nbd2 + i3*nbd3))); - } + ggml_vec_mad_f32(M, + S, + (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + *(float *) ((char *) d->data + (ic*nbd0 + i1*nbd1 + i2*nbd2 + i3*nbd3))); + } - // S = SM * (S - dot(SM, S)) - float dot_SM_gradSM = 0; - ggml_vec_dot_f32 (M, &dot_SM_gradSM, SM, S); - ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); - ggml_vec_mul_f32 (M, S, S, SM); + // S = SM * (S - dot(SM, S)) + float dot_SM_gradSM = 0; + ggml_vec_dot_f32 (M, &dot_SM_gradSM, SM, S); + ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); + ggml_vec_mul_f32 (M, S, S, SM); - // S = diag_mask_zero(S, P) * scale - if (masked) { - for (int64_t i = P + iq1 + 1; i < M; i++) { - S[i] = 0; + // S = diag_mask_zero(S, P) * scale + if (masked) { + // for (int64_t i = P + iq1 + 1; i < M; i++) { + // S[i] = 0; + // } + for (int64_t i = P; i < M; i++) { + if (i > P + iq1) { + S[i] = 0; + } + } } - } - ggml_vec_scale_f32(M, S, scale); + ggml_vec_scale_f32(M, S, scale); - void * grad_q = (char *) dst->data; - void * grad_k = (char *) dst->data + nb0*D*N*neq2*neq3; - void * grad_v = (char *) dst->data + nb0*D*N*neq2*neq3 + nb0*D*M*neq2*neq3; + void * grad_q = (char *) dst->data; + void * grad_k = (char *) dst->data + nb0*D*N*neq2*neq3; + void * grad_v = (char *) dst->data + nb0*D*N*neq2*neq3 + nb0*D*M*neq2*neq3; - const size_t nbgq1 = nb0*neq0; - const size_t nbgq2 = nb0*neq0*neq1; - const size_t nbgq3 = nb0*neq0*neq1*neq2; + const size_t nbgq1 = nb0*neq0; + const size_t nbgq2 = nb0*neq0*neq1; + const size_t nbgq3 = nb0*neq0*neq1*neq2; - const size_t nbgk1 = nb0*nek0; - const size_t nbgk2 = nb0*nek0*nek1; - const size_t nbgk3 = nb0*nek0*nek1*neq2; + const size_t nbgk1 = nb0*nek0; + const size_t nbgk2 = nb0*nek0*nek1; + const size_t nbgk3 = nb0*nek0*nek1*neq2; - const size_t nbgv1 = nb0*nev0; - const size_t nbgv2 = nb0*nev0*nev1; - const size_t nbgv3 = nb0*nev0*nev1*neq2; + const size_t nbgv1 = nb0*nev0; + const size_t nbgv2 = nb0*nev0*nev1; + const size_t nbgv3 = nb0*nev0*nev1*neq2; - // S shape [M,1] - // SM shape [M,1] - // kcur shape [D,M] - // qcur shape [D,1] - // vcur shape [M,D] - // - // grad[q][:D,iq1,iq2,iq3] += S @ kcur - // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] - // grad[q][ic,iq1,iq2,iq3] += dot(kcur[:,ic],S.T) - // grad[q][ic,iq1,iq2,iq3] += dot(k[:D,ic,iq2,iq3],S.T) - for (int64_t ic = 0; ic < M; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - 
const int i3 = iq3; - - ggml_vec_dot_f32(D, - (float *) ((char *) grad_q + (ic*nb0 + i1*nbgq1 + i2*nbgq2 + i3*nbgq3)), - (float *) ((char *) k->data + ( ic*nbk1 + i2*nbk2 + i3*nbk3)), - S); - } + // S shape [M,1] + // SM shape [M,1] + // kcur shape [D,M] + // qcur shape [D,1] + // vcur shape [M,D] + // + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] + // grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic] + // + //// grad[q][ic,iq1,iq2,iq3] += dot(kcur[:,ic],S.T) + //// grad[q][ic,iq1,iq2,iq3] += dot(k[:D,ic,iq2,iq3],S.T) + for (int64_t ic = 0; ic < M; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + ggml_vec_mad_f32(D, + (float *) ((char *) grad_q + (i1*nbgq1 + i2*nbgq2 + i3*nbgq3)), + (float *) ((char *) k->data + (ic*nbk1 + i2*nbk2 + i3*nbk3)), + S[ic]); + } - // grad[k][:D,:M,iq2,iq3] += S.T @ qcur - // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] - // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] - for (int64_t ic = 0; ic < M; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; + // grad[k][:D,:M,iq2,iq3] += S.T @ qcur + // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] + // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] + for (int64_t ic = 0; ic < M; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; - ggml_vec_set_f32(D, - (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), - 0); - ggml_vec_mad_f32(D, - (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), - (float *) ((char *) q->data + (i1*nbk1 + i2*nbk2 + i3*nbk3)), - S[ic]); - } + // ggml_vec_set_f32(D, + // (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), + // 0); + ggml_vec_mad_f32(D, + (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), + (float *) ((char *) q->data + (i1*nbq1 + i2*nbq2 + i3*nbq3)), + S[ic]); + } - // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM - // grad[v][:M,ic,iq2,iq3] += d[:D,iq1,iq2,iq3].T[0,ic] * SM[:M] - // grad[v][:M,ic,iq2,iq3] += d[ic,iq1,iq2,iq3] * SM[:M] - for (int64_t ic = 0; ic < D; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; + // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM + // grad[v][:M,ic,iq2,iq3] += d[:D,iq1,iq2,iq3].T[0,ic] * SM[:M] + // grad[v][:M,ic,iq2,iq3] += d[ic,iq1,iq2,iq3] * SM[:M] + for (int64_t ic = 0; ic < D; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; - ggml_vec_set_f32(M, - (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), - 0); - ggml_vec_mad_f32(M, - (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), - SM, - *(float *) ((char *) d->data + (ic*nbd1 + i1*nbd2 + i2*nbd2 + i3*nbd3))); + // ggml_vec_set_f32(M, + // (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), + // 0); + ggml_vec_mad_f32(M, + (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), + SM, + *(float *) ((char *) d->data + (ic*nbd0 + i1*nbd1 + i2*nbd2 + i3*nbd3))); + } } } } @@ -14475,9 +14494,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor bool masked = t != 0; flash_grad = ggml_flash_attn_back(ctx, - src0->grad, - src1->grad, - tensor->opt[0]->grad, + src0, + src1, + tensor->opt[0], tensor->grad, masked); } @@ -14509,7 +14528,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case 4: { - grad_q = ggml_view_3d(ctx, + grad_q = ggml_view_4d(ctx, flash_grad, 
                             src0->ne[0],
                             src0->ne[1],
@@ -14555,7 +14574,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 } break;
                 case 4:
                     {
-                        grad_k = ggml_view_3d(ctx,
+                        grad_k = ggml_view_4d(ctx,
                             flash_grad,
                             src1->ne[0],
                             src1->ne[1],
@@ -14604,7 +14623,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 } break;
                 case 4:
                     {
-                        grad_v = ggml_view_3d(ctx,
+                        grad_v = ggml_view_4d(ctx,
                             flash_grad,
                             opt0->ne[0],
                             opt0->ne[1],

From 70c08318af062c31e47fd6a914e9f3abf8db385e Mon Sep 17 00:00:00 2001
From: xaedes
Date: Mon, 29 May 2023 23:51:40 +0200
Subject: [PATCH 59/86] test flash attention backward pass

need to set loose error bounds to pass.

the finite differences are close to numeric limits and often return quite
different values than the backward pass.
reducing eps further lets the gradients vanish completely.
likewise, setting eps too big results in even less accurate values.

the softmax in the middle of the function is probably most responsible for
the numeric issues when using finite differences.
---
 tests/test-grad0.c | 41 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/tests/test-grad0.c b/tests/test-grad0.c
index b7d68cad9fd28..c8c2c0f717e32 100644
--- a/tests/test-grad0.c
+++ b/tests/test-grad0.c
@@ -5,7 +5,7 @@
 #include
 #include
 
-#define MAX_NARGS 2
+#define MAX_NARGS 3
 
 #undef MIN
 #undef MAX
@@ -1143,6 +1143,45 @@ int main(int argc, const char ** argv) {
             }
         }
 
+        // flash_attn
+        {
+            const int nargs = 3;
+
+            int64_t ne2[4];
+
+            get_random_dims(ne2, 4);
+            int64_t D = ne2[0];
+            int64_t N = ne2[1];
+            int64_t M = ne2[2] + N;
+            int64_t B = ne2[3];
+
+            for (int masked = 0; masked <= 1; ++masked) {
+                for (int ndims = 2; ndims <= 4; ++ndims) {
+                    int64_t neq[4] = { D, N, B, ne[3] };
+                    int64_t nek[4] = { D, M, B, ne[3] };
+                    int64_t nev[4] = { M, D, B, ne[3] };
+                    if (ndims == 2) {
+                        neq[2] = 1; neq[3] = 1;
+                        nek[2] = 1; nek[3] = 1;
+                        nev[2] = 1; nev[3] = 1;
+                    } else if (ndims == 3) {
+                        neq[3] = 1;
+                        nek[3] = 1;
+                        nev[3] = 1;
+                    }
+                    x[0] = get_random_tensor(ctx0, ndims, neq, -0.1250f, 0.1250f);
+                    x[1] = get_random_tensor(ctx0, ndims, nek, -0.1250f, 0.1250f);
+                    x[2] = get_random_tensor(ctx0, ndims, nev, -0.1250f, 0.1250f);
+                    ggml_set_param(ctx0, x[0]);
+                    ggml_set_param(ctx0, x[1]);
+                    ggml_set_param(ctx0, x[2]);
+
+                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
+
+                    check_gradient("flash_attn", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
+                }
+            }
+        }
 
         ggml_free(ctx0);
     }

From fcbc4457d6a45112659d419104e294f3d0f68be3 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Tue, 30 May 2023 13:17:58 +0200
Subject: [PATCH 60/86] add option to train with flash attention and move options to the top of the main function

training from scratch also works with flash attention

training convergence and generation results after a fixed number of iterations
are worse than when not using flash attention.
maybe there still lingers a bug in the flash attention backward pass?
but training works, just with slower convergence.
flash attention is still worth to use, because it requires way less memory and is faster with high n_ctx --- examples/baby-llama/baby-llama-text.cpp | 318 +++++++++++++++++++++--- 1 file changed, 288 insertions(+), 30 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 267f44321bbef..418cc5fff47aa 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1168,6 +1168,239 @@ struct ggml_tensor * forward_batch_wo_cache( return inpL; } +struct ggml_tensor * forward_batch_wo_cache_flash_attn( + struct my_llama_model * model, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_batch) { + + const int n_past = 0; + const int N = n_tokens; + + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + const int n_ff = get_n_ff(&hparams); + + struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); + memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); + + // inpL shape [n_embd,N*n_batch,1] + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); + assert_shape_2d(inpL, n_embd, N*n_batch); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * cur; + + // lctx.use_buf(ctx0, 0); + + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = attention_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].attention_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // self-attention + { + // compute Q and K and RoPE them + // wq shape [n_embd, n_embd, 1, 1] + // wk shape [n_embd, n_embd, 1, 1] + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Kcur shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); + assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); + + // Vcur shape [N, n_batch, n_embd/n_head, n_head] + struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head); + assert_shape_4d(Vcur, N, n_batch, n_embd/n_head, n_head); + + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Q shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); + + // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] + // K shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * K = + ggml_permute(ctx0, + Kcur, + 0, 2, 1, 3); + assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch); + + // // K * Q + // // KQ shape [N, N, n_head, n_batch] + // struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + // assert_shape_4d(KQ, N, N, n_head, n_batch); + + // // KQ_scaled = KQ / sqrt(n_embd/n_head) + // // 
KQ_scaled shape [N, N, n_head, n_batch] + // struct ggml_tensor * KQ_scaled = + // ggml_scale_inplace(ctx0, + // KQ, + // ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + // assert_shape_4d(KQ_scaled, N, N, n_head, n_batch); + + // // KQ_masked = mask_past(KQ_scaled) + // // KQ_masked shape [N, N, n_head, n_batch] + // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + // assert_shape_4d(KQ_masked, N, N, n_head, n_batch); + + // // KQ = soft_max(KQ_masked) + // // KQ_soft_max shape [N, N, n_head, n_batch] + // struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + // assert_shape_4d(KQ_soft_max, N, N, n_head, n_batch); + + // Vcur shape [N, n_batch, n_embd/n_head, n_head] + // V shape [N, n_embd/n_head, n_head, n_batch] + struct ggml_tensor * V = + ggml_permute(ctx0, + Vcur, + 0, 3, 1, 2); + assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch); + + // // KQV shape [n_embd/n_head, N, n_head, n_batch] + // struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + // assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); + + + bool masked = true; + struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, masked); + assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); + // KQV_merged shape + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); + assert_shape_2d(cur, n_embd, N*n_batch); + + // projection (no bias) + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].wo, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // lctx.use_buf(ctx0, 1); + + // inpFF shape [n_embd,N*n_batch,1,1] + struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); + assert_shape_2d(inpFF, n_embd, N*n_batch); + + // feed-forward network + { + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = ffn_norm*cur + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // tmp shape [n_ff,N*n_batch,1,1] + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model->layers[il].w3, + cur); + assert_shape_2d(tmp, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w1, + cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // SILU activation + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_silu(ctx0, cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul(ctx0, cur, tmp); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w2, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_add_inplace(ctx0, cur, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // input for next layer + // inpL shape [n_embd,N*n_batch,1,1] + inpL = cur; + assert_shape_2d(inpL, n_embd, N*n_batch); + } + + // norm + { + + // inpL shape [n_embd,N*n_batch,1,1] + inpL = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(inpL, n_embd, N*n_batch); + + // inpL = norm*inpL + // inpL shape [n_embd,N*n_batch,1,1] + inpL 
= ggml_mul(ctx0, + ggml_repeat(ctx0, model->norm, inpL), + inpL); + + assert_shape_2d(inpL, n_embd, N*n_batch); + + //embeddings = inpL; + } + + // lm_head + // inpL shape [n_vocab,N*n_batch,1,1] + inpL = ggml_mul_mat(ctx0, model->output, inpL); + assert_shape_2d(inpL, n_vocab, N*n_batch); + + { + // inpL shape [n_vocab,N,n_batch,1] + inpL = ggml_reshape_3d(ctx0, + inpL, + n_vocab, N, n_batch); + assert_shape_3d(inpL, n_vocab, N, n_batch); + } + + // run the computation + ggml_build_forward_expand(gf, inpL); + + return inpL; +} + void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) { float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); *ptr = value; @@ -1644,7 +1877,7 @@ void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { } std::string name = file->read_string(name_len); - GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)) == 0); + GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0); file->seek(-file->tell() & 31, SEEK_CUR); file->read_raw(tensor->data, ggml_nbytes(tensor)); @@ -1930,7 +2163,42 @@ int main(int argc, char ** argv) { //return 1; } - srand(time(NULL)); + int seed = 1; + int n_ctx = 256; + // int n_ctx = 64; + int n_embd = 256; + int n_mult = 256; + int n_head = 8; + int n_layer = 16; + int n_rotmax = 64; + + int n_threads = 6; + int n_batch = 8; + int n_examples = 32; + + int print_info_interval = 1; + int print_details_interval = 2; + + bool samples_start_after_nl = false; + bool use_adam = true; + bool use_flash = false; + + // only adam + int warmup = 100; + int cos_decay_steps = 1000; + float cos_decay_restart = 1.1f; + float cos_decay_alpha = 0.0f; + + int lbfgs_n_iter = 16; + int adam_n_iter = 16; + float adam_alpha = 1e-3; + float adam_decay = 1e-3; + + if (seed < 0) { + srand(time(NULL)); + } else { + srand(seed); + } const char * fn_model = (argc >= 2) ? argv[1] : default_argv[1]; const char * fn_train = (argc >= 3) ? 
argv[2] : default_argv[2]; @@ -1971,12 +2239,12 @@ int main(int argc, char ** argv) { struct my_llama_model model; model.hparams.n_vocab = llama_n_vocab(lctx); - model.hparams.n_ctx = 32; - model.hparams.n_embd = 128; - model.hparams.n_mult = 64; - model.hparams.n_head = 16; - model.hparams.n_layer = 4; - model.hparams.n_rot = std::min(64u, model.hparams.n_embd / model.hparams.n_head); + model.hparams.n_ctx = n_ctx; + model.hparams.n_embd = n_embd; + model.hparams.n_mult = n_mult; + model.hparams.n_head = n_head; + model.hparams.n_layer = n_layer; + model.hparams.n_rot = std::min((uint32_t)n_rotmax, model.hparams.n_embd / model.hparams.n_head); print_params(&model.hparams); @@ -2011,18 +2279,6 @@ int main(int argc, char ** argv) { my_llama_sampler sampler; - int n_threads = 6; - int n_batch = 32; - int n_examples = 32; - - bool samples_start_after_nl = false; - bool use_adam = true; - - int warmup = 100; - int cos_decay_steps = 1000; - float cos_decay_restart = 1.1f; - float cos_decay_alpha = 0.0f; - int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; @@ -2035,15 +2291,15 @@ int main(int argc, char ** argv) { opt_params_adam.print_forward_graph = false; opt_params_adam.print_backward_graph = false; opt_params_adam.n_threads = n_threads; - opt_params_adam.adam.n_iter = 16; + opt_params_adam.adam.n_iter = adam_n_iter; opt_params_adam.adam.sched = 1.0f; - opt_params_adam.adam.alpha = 1e-3; - opt_params_adam.adam.decay = 1e-3; + opt_params_adam.adam.alpha = adam_alpha; + opt_params_adam.adam.decay = adam_decay; opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_backward_graph = false; - opt_params_lbfgs.n_threads = n_threads; - opt_params_lbfgs.lbfgs.n_iter = 16; + opt_params_lbfgs.n_threads = n_threads; + opt_params_lbfgs.lbfgs.n_iter = lbfgs_n_iter; opt->ctx = model.ctx; opt->params = use_adam ? opt_params_adam : opt_params_lbfgs; @@ -2117,7 +2373,9 @@ int main(int argc, char ** argv) { struct ggml_tensor * logits = (n_past == 0) - ? forward_batch_wo_cache(&model, ctx0, &gf, tokens_input, n_tokens, n_batch) + ? (use_flash + ? 
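// When use_flash is set, the new forward_batch_wo_cache_flash_attn path is
// selected. It computes the same attention as forward_batch_wo_cache; the
// explicit chain that the non-flash code builds, roughly
//   KQ          = ggml_mul_mat(ctx0, K, Q);
//   KQ_scaled   = ggml_scale_inplace(ctx0, KQ, ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
//   KQ_masked   = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
//   KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
//   KQV         = ggml_mul_mat(ctx0, V, KQ_soft_max);
// is fused into a single ggml_flash_attn(ctx0, Q, K, V, /*masked=*/ true),
// which avoids materializing the [N, N, n_head, n_batch] attention matrix.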
forward_batch_wo_cache_flash_attn(&model, ctx0, &gf, tokens_input, n_tokens, n_batch) + : forward_batch_wo_cache(&model, ctx0, &gf, tokens_input, n_tokens, n_batch)) : forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch); struct ggml_tensor * e = cross_entropy_loss(ctx0, logits, target_probs); @@ -2148,16 +2406,16 @@ int main(int argc, char ** argv) { float error_after_opt = ggml_get_f32_1d(e, 0); - printf("used_mem_before_opt: %zu bytes\n", used_mem_before_opt); - printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); - if (ex % 1 == 0) { + if (ex % print_info_interval == 0) { printf("Example %d, opt iter %d\n", ex, opt->iter); printf("error_before_opt: %.6f\n", error_before_opt); printf("error_after_opt: %.6f\n", error_after_opt); + printf("used_mem_before_opt: %zu bytes\n", used_mem_before_opt); + printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); } - if (ex % 2 == 0) { + if (ex % print_details_interval == 0) { // set_logits_masked(logits, token_notavail, -1e9); for (int i=0; i Date: Tue, 30 May 2023 15:53:55 +0200 Subject: [PATCH 61/86] add train_params and command line option parser --- examples/baby-llama/baby-llama-text.cpp | 454 +++++++++++++++++++----- 1 file changed, 367 insertions(+), 87 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 418cc5fff47aa..ecdb418bf2285 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -2150,66 +2150,341 @@ float cosine_decay_restart(int decay_steps, const float alpha, int step, float r return cosine_decay(decay_steps, alpha, step); } -int main(int argc, char ** argv) { - const char * default_model = "ggml-vic7b-uncensored-q4_0.bin"; - const char * default_train = "shakespeare.txt"; - const char * default_chkpt_in = "checkpoint.bin"; - const char * default_chkpt_out = "checkpoint.bin"; - const char * default_model_out = "ggml-checkpoint-f32.bin"; - const char * default_argv[6] = {argv[0], default_model, default_train, default_chkpt_in, default_chkpt_out, default_model_out}; - - if (argc < 6) { - fprintf(stderr, "usage: %s model training_data chkpt_in chkpt_out model_out\n", argv[0]); - //return 1; - } - - int seed = 1; - int n_ctx = 256; - // int n_ctx = 64; - int n_embd = 256; - int n_mult = 256; - int n_head = 8; - int n_layer = 16; - int n_rotmax = 64; - - int n_threads = 6; - int n_batch = 8; - int n_examples = 32; +struct train_params { + const char * fn_vocab_model; + const char * fn_train_data; + const char * fn_checkpoint_in; + const char * fn_checkpoint_out; + const char * fn_model_out; + + int seed; + int n_ctx; + int n_embd; + int n_mult; + int n_head; + int n_layer; + int n_rotmax; + + int n_threads; + int n_batch; + int n_examples; + int n_predict; - int print_info_interval = 1; - int print_details_interval = 2; + int print_info_interval; + int print_details_interval; - bool samples_start_after_nl = false; - bool use_adam = true; - bool use_flash = false; + bool samples_start_after_nl; + bool use_adam; + bool use_flash; // only adam - int warmup = 100; - int cos_decay_steps = 1000; - float cos_decay_restart = 1.1f; - float cos_decay_alpha = 0.0f; + int warmup; + int cos_decay_steps; + float cos_decay_restart; + float cos_decay_alpha; + + int lbfgs_n_iter; + int adam_n_iter; + float adam_alpha; + float adam_decay; +}; + +struct train_params get_default_train_params() { + struct train_params params; + params.fn_vocab_model = "ggml-vic7b-uncensored-q4_0.bin"; + 
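// With the defaults set in this function, a hypothetical invocation of the
// training binary (still built as baby-llama-text at this point in the series;
// file names are only illustrative) could look like:
//
//   ./baby-llama-text \
//       --vocab-model ggml-vic7b-uncensored-q4_0.bin \
//       --train-data shakespeare.txt \
//       --checkpoint-in checkpoint.bin --checkpoint-out checkpoint.bin \
//       --model-out ggml-checkpoint-f32.bin \
//       --ctx 128 --embd 256 --mult 256 --head 8 --layer 16 \
//       --batch 8 --examples 8 --use-flash
//
// Every flag corresponds to one train_params field initialized here and parsed
// in train_params_parse below.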
params.fn_train_data = "shakespeare.txt"; + params.fn_checkpoint_in = "checkpoint.bin"; + params.fn_checkpoint_out = "checkpoint.bin"; + params.fn_model_out = "ggml-checkpoint-f32.bin"; + + params.seed = -1; + + params.n_ctx = 128; + params.n_embd = 256; + params.n_mult = 256; + params.n_head = 8; + params.n_layer = 16; + params.n_rotmax = 64; + + params.n_threads = 6; + params.n_batch = 8; + params.n_examples = 8; + params.n_predict = 1024; - int lbfgs_n_iter = 16; - int adam_n_iter = 16; - float adam_alpha = 1e-3; - float adam_decay = 1e-3; + params.print_info_interval = 1; + params.print_details_interval = 2; + + params.samples_start_after_nl = false; + params.use_adam = true; + params.use_flash = true; + + // only adam + params.warmup = 100; + params.cos_decay_steps = 1000; + params.cos_decay_restart = 1.1f; + params.cos_decay_alpha = 0.0f; + + params.lbfgs_n_iter = 16; + params.adam_n_iter = 16; + params.adam_alpha = 1e-3; + params.adam_decay = 1e-3; + + return params; +} + +void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " --vocab-model FNAME model path from which to load vocab (default '%s')\n", params->fn_vocab_model); + fprintf(stderr, " --train-data FNAME path from which to load training data (default '%s')\n", params->fn_train_data); + fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in); + fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out); + fprintf(stderr, " --model-out FNAME path to save ggml model (default '%s')\n", params->fn_model_out); + fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); + fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); + fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd); + fprintf(stderr, " --mult N Mult size used for new models, influences feedforward size. (default %d)\n", params->n_mult); + fprintf(stderr, " --head N Number of heads for new models (default %d)\n", params->n_head); + fprintf(stderr, " --layer N Number of layers for new models (default %d)\n", params->n_layer); + fprintf(stderr, " --rotmax N Maximal number Rope dimensions for new models (default %d)\n", params->n_rotmax); + fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); + fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); + fprintf(stderr, " -n N, --examples N Number of examples to train (default %d)\n", params->n_examples); + fprintf(stderr, " --predict N Number of tokens to generate after training (default %d)\n", params->n_predict); + fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval); + fprintf(stderr, " --print-details-interval N Print details during training each N examples (default %d)\n", params->print_details_interval); + fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? 
"on" : "off"); + fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); + fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); + fprintf(stderr, " --no-flash Don't use flash attention.\n"); + fprintf(stderr, " --use-flash Use flash attention (default)\n"); + fprintf(stderr, " --warmup N Number of warmup steps (default %d)\n", params->warmup); + fprintf(stderr, " --cos-decay-steps N Number of cosine decay steps (default %d)\n", params->cos_decay_steps); + fprintf(stderr, " --cos-decay-restart N Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); + fprintf(stderr, " --cos-decay-alpha N Cosine decay alpha (default %f)\n", params->cos_decay_alpha); + fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); + fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); + fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); + fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); + fprintf(stderr, "\n"); +} - if (seed < 0) { +bool train_params_parse(int argc, char ** argv, struct train_params * params) { + bool invalid_param = false; + std::string arg; + struct train_params default_params = get_default_train_params(); + const std::string arg_prefix = "--"; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + if (arg == "--vocab-model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_vocab_model = argv[i]; + } else if (arg == "--train-data") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_train_data = argv[i]; + } else if (arg == "--checkpoint-in") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_checkpoint_in = argv[i]; + } else if (arg == "--checkpoint-out") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_checkpoint_out = argv[i]; + } else if (arg == "--model-out") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_model_out = argv[i]; + } else if (arg == "-s" || arg == "--seed") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->seed = std::stoi(argv[i]); + } else if (arg == "-c" || arg == "--ctx") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_ctx = std::stoi(argv[i]); + } else if (arg == "--embd") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_embd = std::stoi(argv[i]); + } else if (arg == "--mult") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_mult = std::stoi(argv[i]); + } else if (arg == "--head") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_head = std::stoi(argv[i]); + } else if (arg == "--layer") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_layer = std::stoi(argv[i]); + } else if (arg == "--rotmax") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rotmax = std::stoi(argv[i]); + } else if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_threads = std::stoi(argv[i]); + } else if (arg == "-b" || arg == "--batch") { + if (++i >= argc) { + invalid_param = true; + break; + } + 
params->n_batch = std::stoi(argv[i]); + } else if (arg == "-n" || arg == "--examples") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_examples = std::stoi(argv[i]); + } else if (arg == "--predict") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_predict = std::stoi(argv[i]); + } else if (arg == "--print-info-interval") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->print_info_interval = std::stoi(argv[i]); + } else if (arg == "--print-details-interval") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->print_details_interval = std::stoi(argv[i]); + } else if (arg == "--samples-after-nl") { + params->samples_start_after_nl = true; + } else if (arg == "--use-lbfgs") { + params->use_adam = false; + } else if (arg == "--use-adam") { + params->use_adam = true; + } else if (arg == "--no-flash") { + params->use_flash = false; + } else if (arg == "--use-flash") { + params->use_flash = true; + } else if (arg == "--warmup") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->warmup = std::stoi(argv[i]); + } else if (arg == "--cos-decay-steps") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->cos_decay_steps = std::stof(argv[i]); + } else if (arg == "--cos-decay-restart") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->cos_decay_restart = std::stof(argv[i]); + } else if (arg == "--cos-decay-alpha") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->cos_decay_alpha = std::stof(argv[i]); + } else if (arg == "--lbfgs-iter") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->lbfgs_n_iter = std::stoi(argv[i]); + } else if (arg == "--adam-iter") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_n_iter = std::stoi(argv[i]); + } else if (arg == "--adam-alpha") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_alpha = std::stof(argv[i]); + } else if (arg == "--adam-decay") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_decay = std::stof(argv[i]); + } else if (arg == "-h" || arg == "--help") { + train_print_usage(argc, argv, &default_params); + exit(0); + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + train_print_usage(argc, argv, &default_params); + exit(1); + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + train_print_usage(argc, argv, &default_params); + exit(1); + } + + return true; +} + +int main(int argc, char ** argv) { + struct train_params params = get_default_train_params(); + + if (!train_params_parse(argc, argv, ¶ms)) { + return 1; + } + + + if (params.seed < 0) { srand(time(NULL)); } else { - srand(seed); + srand(params.seed); } - const char * fn_model = (argc >= 2) ? argv[1] : default_argv[1]; - const char * fn_train = (argc >= 3) ? argv[2] : default_argv[2]; - const char * fn_chkpt_in = (argc >= 4) ? argv[3] : default_argv[3]; - const char * fn_chkpt_out = (argc >= 5) ? argv[4] : default_argv[4]; - const char * fn_model_out = (argc >= 6) ? 
argv[5] : default_argv[5]; - struct llama_context_params llama_params = llama_context_default_params(); llama_params.vocab_only = true; - struct llama_context * lctx = llama_init_from_file(fn_model, llama_params); + struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, llama_params); struct llama_vocab vocab; { @@ -2232,19 +2507,19 @@ int main(int argc, char ** argv) { printf("%s: tokenize training data\n", __func__); std::vector train_tokens; - if (tokenize_file(lctx, fn_train, train_tokens) < 0) { - fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, fn_train); + if (tokenize_file(lctx, params.fn_train_data, train_tokens) < 0) { + fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, params.fn_train_data); } printf("%s: number of training tokens: %d\n", __func__, train_tokens.size()); struct my_llama_model model; model.hparams.n_vocab = llama_n_vocab(lctx); - model.hparams.n_ctx = n_ctx; - model.hparams.n_embd = n_embd; - model.hparams.n_mult = n_mult; - model.hparams.n_head = n_head; - model.hparams.n_layer = n_layer; - model.hparams.n_rot = std::min((uint32_t)n_rotmax, model.hparams.n_embd / model.hparams.n_head); + model.hparams.n_ctx = params.n_ctx; + model.hparams.n_embd = params.n_embd; + model.hparams.n_mult = params.n_mult; + model.hparams.n_head = params.n_head; + model.hparams.n_layer = params.n_layer; + model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head); print_params(&model.hparams); @@ -2282,6 +2557,7 @@ int main(int argc, char ** argv) { int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; + int n_batch = params.n_batch; struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context)); memset(opt, 0, sizeof(struct ggml_opt_context)); @@ -2290,32 +2566,32 @@ int main(int argc, char ** argv) { struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); opt_params_adam.print_forward_graph = false; opt_params_adam.print_backward_graph = false; - opt_params_adam.n_threads = n_threads; - opt_params_adam.adam.n_iter = adam_n_iter; - opt_params_adam.adam.sched = 1.0f; - opt_params_adam.adam.alpha = adam_alpha; - opt_params_adam.adam.decay = adam_decay; + opt_params_adam.n_threads = params.n_threads; + opt_params_adam.adam.n_iter = params.adam_n_iter; + opt_params_adam.adam.sched = 1.0f; + opt_params_adam.adam.alpha = params.adam_alpha; + opt_params_adam.adam.decay = params.adam_decay; opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_backward_graph = false; - opt_params_lbfgs.n_threads = n_threads; - opt_params_lbfgs.lbfgs.n_iter = lbfgs_n_iter; + opt_params_lbfgs.n_threads = params.n_threads; + opt_params_lbfgs.lbfgs.n_iter = params.lbfgs_n_iter; opt->ctx = model.ctx; - opt->params = use_adam ? opt_params_adam : opt_params_lbfgs; + opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs; printf("%s: init model\n", __func__); - bool existed = load_checkpoint(&model, opt, fn_chkpt_in, true); + bool existed = load_checkpoint(&model, opt, params.fn_checkpoint_in, true); set_param_model(&model); - opt->params = use_adam ? opt_params_adam : opt_params_lbfgs; + opt->params = params.use_adam ? 
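// params.use_adam selects between the two pre-filled ggml_opt_params structs.
// Only the Adam path uses the learning-rate schedule installed a few hunks
// below:
//   sched = iter / warmup                                           (iter < warmup)
//   sched = cosine_decay_restart(cos_decay_steps, cos_decay_alpha,
//                                iter - warmup, cos_decay_restart)   (afterwards)
// With the defaults (warmup 100, cos_decay_steps 1000, cos_decay_restart 1.1,
// cos_decay_alpha 0.0) this ramps linearly from 0 to 1 over the first 100
// iterations and then decays towards 0, with the decay length growing by 10%
// after each restart (assuming cosine_decay is the usual half-cosine curve).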
opt_params_adam : opt_params_lbfgs; opt->iter = model.train_its; printf("%s: opt iter %d\n", __func__, opt->iter); bool from_scratch = !existed; if (from_scratch) { - randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f); + randomize_model(&model, params.seed, 0.0f, 1.0f, -1.0f, +1.0f); } init_kv_cache(&kv_self, &model, 1); @@ -2328,11 +2604,11 @@ int main(int argc, char ** argv) { size_t compute_size = 1024ll*1024ll*1024ll*32ll; uint8_t * compute_addr = new uint8_t[compute_size]; - + GGML_ASSERT(train_tokens.size() > n_tokens);; std::vector train_samples; train_samples.push_back(0); for (int i=1; i= train_samples.size()) { shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); for (int i=0; iparams.adam.sched = (opt->iter < warmup) - ? (float) opt->iter / (float) warmup - : cosine_decay_restart(cos_decay_steps, cos_decay_alpha, opt->iter - warmup, cos_decay_restart); + opt->params.adam.sched = (opt->iter < params.warmup) + ? (float) opt->iter / (float) params.warmup + : cosine_decay_restart( + params.cos_decay_steps, + params.cos_decay_alpha, + opt->iter - params.warmup, + params.cos_decay_restart); + printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); // ggml_opt(ctx0, opt->params, e); @@ -2406,8 +2687,7 @@ int main(int argc, char ** argv) { float error_after_opt = ggml_get_f32_1d(e, 0); - - if (ex % print_info_interval == 0) { + if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) { printf("Example %d, opt iter %d\n", ex, opt->iter); printf("error_before_opt: %.6f\n", error_before_opt); printf("error_after_opt: %.6f\n", error_after_opt); @@ -2415,7 +2695,7 @@ int main(int argc, char ** argv) { printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); } - if (ex % print_details_interval == 0) { + if (params.print_details_interval > 0 && ex % params.print_details_interval == 0) { // set_logits_masked(logits, token_notavail, -1e9); for (int i=0; i 0) { - save_checkpoint(&model, opt, fn_chkpt_out); + if (params.n_examples > 0) { + save_checkpoint(&model, opt, params.fn_checkpoint_out); } - if (strlen(fn_model_out) > 0) { - save_as_llama_model(&vocab, &model, fn_model_out); + if (strlen(params.fn_model_out) > 0) { + save_as_llama_model(&vocab, &model, params.fn_model_out); } { - int n_gen = 1024; + int n_gen = params.n_predict; int sample_ctx = n_tokens - n_tokens/8; sampler.params.temp = 0.2; @@ -2477,15 +2757,15 @@ int main(int argc, char ** argv) { printf("---\n"); for (int i=0; i Date: Tue, 30 May 2023 15:58:22 +0200 Subject: [PATCH 62/86] remove unnecessary comments --- examples/baby-llama/baby-llama-text.cpp | 65 ------------------------- 1 file changed, 65 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index ecdb418bf2285..5d48b7155f279 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1191,7 +1191,6 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); - // inpL shape [n_embd,N*n_batch,1] struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); assert_shape_2d(inpL, n_embd, N*n_batch); for (int il = 0; il < n_layer; ++il) { @@ -1199,11 +1198,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( struct ggml_tensor * cur; - // lctx.use_buf(ctx0, 0); - // norm { - // cur shape [n_embd,N*n_batch,1,1] 
cur = ggml_rms_norm(ctx0, inpL); assert_shape_2d(cur, n_embd, N*n_batch); @@ -1219,94 +1215,48 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( // compute Q and K and RoPE them // wq shape [n_embd, n_embd, 1, 1] // wk shape [n_embd, n_embd, 1, 1] - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Kcur shape [n_embd/n_head, n_head, N, n_batch] struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); - // Vcur shape [N, n_batch, n_embd/n_head, n_head] struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head); assert_shape_4d(Vcur, N, n_batch, n_embd/n_head, n_head); - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Q shape [n_embd/n_head, N, n_head, n_batch] struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); - // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] - // K shape [n_embd/n_head, N, n_head, n_batch] struct ggml_tensor * K = ggml_permute(ctx0, Kcur, 0, 2, 1, 3); assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch); - // // K * Q - // // KQ shape [N, N, n_head, n_batch] - // struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - // assert_shape_4d(KQ, N, N, n_head, n_batch); - - // // KQ_scaled = KQ / sqrt(n_embd/n_head) - // // KQ_scaled shape [N, N, n_head, n_batch] - // struct ggml_tensor * KQ_scaled = - // ggml_scale_inplace(ctx0, - // KQ, - // ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); - // assert_shape_4d(KQ_scaled, N, N, n_head, n_batch); - - // // KQ_masked = mask_past(KQ_scaled) - // // KQ_masked shape [N, N, n_head, n_batch] - // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - // assert_shape_4d(KQ_masked, N, N, n_head, n_batch); - - // // KQ = soft_max(KQ_masked) - // // KQ_soft_max shape [N, N, n_head, n_batch] - // struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - // assert_shape_4d(KQ_soft_max, N, N, n_head, n_batch); - - // Vcur shape [N, n_batch, n_embd/n_head, n_head] - // V shape [N, n_embd/n_head, n_head, n_batch] struct ggml_tensor * V = ggml_permute(ctx0, Vcur, 0, 3, 1, 2); assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch); - // // KQV shape [n_embd/n_head, N, n_head, n_batch] - // struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - // assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); - - bool masked = true; struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, masked); assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); - // KQV_merged = KQV.permute(0, 2, 1, 3) - // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); - // KQV_merged shape - - // cur shape [n_embd,N*n_batch,1,1] cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); assert_shape_2d(cur, n_embd, N*n_batch); // projection (no bias) - // cur shape [n_embd,N*n_batch,1,1] cur = ggml_mul_mat(ctx0, model->layers[il].wo, cur); assert_shape_2d(cur, n_embd, N*n_batch); } - 
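// The feed-forward block below is the LLaMA SwiGLU form
//   ffn(x) = w2 @ ( silu(w1 @ x) * (w3 @ x) )
// where @ denotes ggml_mul_mat and * the elementwise ggml_mul used for gating;
// n_ff is derived from n_embd and n_mult via get_n_ff().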
// lctx.use_buf(ctx0, 1); - - // inpFF shape [n_embd,N*n_batch,1,1] struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); assert_shape_2d(inpFF, n_embd, N*n_batch); @@ -1314,52 +1264,43 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( { // norm { - // cur shape [n_embd,N*n_batch,1,1] cur = ggml_rms_norm(ctx0, inpFF); assert_shape_2d(cur, n_embd, N*n_batch); // cur = ffn_norm*cur - // cur shape [n_embd,N*n_batch,1,1] cur = ggml_mul(ctx0, ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), cur); assert_shape_2d(cur, n_embd, N*n_batch); } - // tmp shape [n_ff,N*n_batch,1,1] struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model->layers[il].w3, cur); assert_shape_2d(tmp, n_ff, N*n_batch); - // cur shape [n_ff,N*n_batch,1,1] cur = ggml_mul_mat(ctx0, model->layers[il].w1, cur); assert_shape_2d(cur, n_ff, N*n_batch); // SILU activation - // cur shape [n_ff,N*n_batch,1,1] cur = ggml_silu(ctx0, cur); assert_shape_2d(cur, n_ff, N*n_batch); - // cur shape [n_ff,N*n_batch,1,1] cur = ggml_mul(ctx0, cur, tmp); assert_shape_2d(cur, n_ff, N*n_batch); - // cur shape [n_embd,N*n_batch,1,1] cur = ggml_mul_mat(ctx0, model->layers[il].w2, cur); assert_shape_2d(cur, n_embd, N*n_batch); } - // cur shape [n_embd,N*n_batch,1,1] cur = ggml_add_inplace(ctx0, cur, inpFF); assert_shape_2d(cur, n_embd, N*n_batch); // input for next layer - // inpL shape [n_embd,N*n_batch,1,1] inpL = cur; assert_shape_2d(inpL, n_embd, N*n_batch); } @@ -1367,28 +1308,22 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( // norm { - // inpL shape [n_embd,N*n_batch,1,1] inpL = ggml_rms_norm(ctx0, inpL); assert_shape_2d(inpL, n_embd, N*n_batch); // inpL = norm*inpL - // inpL shape [n_embd,N*n_batch,1,1] inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model->norm, inpL), inpL); assert_shape_2d(inpL, n_embd, N*n_batch); - - //embeddings = inpL; } // lm_head - // inpL shape [n_vocab,N*n_batch,1,1] inpL = ggml_mul_mat(ctx0, model->output, inpL); assert_shape_2d(inpL, n_vocab, N*n_batch); { - // inpL shape [n_vocab,N,n_batch,1] inpL = ggml_reshape_3d(ctx0, inpL, n_vocab, N, n_batch); From 1074a81e819b9076599d9da93d6ebd99dcce93b8 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 30 May 2023 16:06:20 +0200 Subject: [PATCH 63/86] add train params to specify memory size --- examples/baby-llama/baby-llama-text.cpp | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 5d48b7155f279..03f93c749e660 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -2122,6 +2122,9 @@ struct train_params { int adam_n_iter; float adam_alpha; float adam_decay; + + int mem_model_gb; + int mem_compute_gb; }; struct train_params get_default_train_params() { @@ -2164,6 +2167,9 @@ struct train_params get_default_train_params() { params.adam_alpha = 1e-3; params.adam_decay = 1e-3; + params.mem_model_gb = 2; + params.mem_compute_gb = 32; + return params; } @@ -2203,6 +2209,8 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. 
(default %f)\n", params->adam_decay); + fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); + fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); fprintf(stderr, "\n"); } @@ -2384,6 +2392,18 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->adam_decay = std::stof(argv[i]); + } else if (arg == "--mem-model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_model_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute_gb = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { train_print_usage(argc, argv, &default_params); exit(0); @@ -2480,7 +2500,7 @@ int main(int argc, char ** argv) { struct ggml_init_params lcparams; - lcparams.mem_size = 1024ll*1024ll*1024ll*2ll; + lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb); lcparams.mem_buffer = NULL; lcparams.no_alloc = false; @@ -2536,7 +2556,7 @@ int main(int argc, char ** argv) { printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx)); // ggml_print_tensor_objects(model.ctx); - size_t compute_size = 1024ll*1024ll*1024ll*32ll; + size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb); uint8_t * compute_addr = new uint8_t[compute_size]; GGML_ASSERT(train_tokens.size() > n_tokens);; From 21b11b55d4a07a68b235228cde1cf4d65d5e5f66 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 30 May 2023 17:03:09 +0200 Subject: [PATCH 64/86] remove python bindings --- py/llama_cpp/__init__.py | 0 py/llama_cpp/llama.py | 327 --------------------------------------- setup.py | 15 -- 3 files changed, 342 deletions(-) delete mode 100644 py/llama_cpp/__init__.py delete mode 100644 py/llama_cpp/llama.py delete mode 100644 setup.py diff --git a/py/llama_cpp/__init__.py b/py/llama_cpp/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/py/llama_cpp/llama.py b/py/llama_cpp/llama.py deleted file mode 100644 index bc0fa8b72855c..0000000000000 --- a/py/llama_cpp/llama.py +++ /dev/null @@ -1,327 +0,0 @@ -import os -import sys -import glob -import ctypes - -from ctypes import c_int, c_float, c_double, c_char_p, c_void_p, c_bool, c_size_t, c_ubyte, POINTER, Structure - - -# Load the library -if sys.platform == 'win32': - lib = ctypes.cdll.LoadLibrary(next(iter(glob.glob(os.path.join(os.path.dirname(__file__), '..', '..', '**', 'llama.dll'), recursive=True)))) -else: - lib = ctypes.cdll.LoadLibrary(next(iter(glob.glob(os.path.join(os.path.dirname(__file__), '..', '..', '**', 'libllama.so'), recursive=True)))) - - -# C types -llama_token = c_int -llama_token_p = POINTER(llama_token) - -class llama_token_data(Structure): - _fields_ = [ - ('id', llama_token), # token id - ('p', c_float), # probability of the token - ('plog', c_float), # log probability of the token - ] - -llama_token_data_p = POINTER(llama_token_data) - -class llama_token_data_array(Structure): - _fields_ = [ - ('data', llama_token_data_p), - ('size', c_size_t), - ('sorted', c_bool), - ] - -llama_token_data_array_p = POINTER(llama_token_data_array) - -llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) -class llama_context_params(Structure): - _fields_ = [ - ('n_ctx', c_int), # text context - ('n_parts', c_int), # -1 for default - ('n_gpu_layers', c_int), # number of layers to store in VRAM - ('seed', 
c_int), # RNG seed, 0 for random - ('f16_kv', c_bool), # use fp16 for KV cache - ('logits_all', c_bool), # the llama_eval() call computes all logits, not just the last one - ('vocab_only', c_bool), # only load the vocabulary, no weights - ('use_mmap', c_bool), # use mmap if possible - ('use_mlock', c_bool), # force system to keep model in RAM - ('embedding', c_bool), # embedding mode only - ('progress_callback', llama_progress_callback), # called with a progress value between 0 and 1, pass NULL to disable - ('progress_callback_user_data', c_void_p), # context pointer passed to the progress callback - ] - - -llama_context_params_p = POINTER(llama_context_params) - -llama_context_p = c_void_p - -c_size_p = POINTER(c_size_t) -c_ubyte_p = POINTER(c_ubyte) -c_float_p = POINTER(c_float) - -# C functions -lib.llama_context_default_params.argtypes = [] -lib.llama_context_default_params.restype = llama_context_params - -lib.llama_mmap_supported.argtypes = [] -lib.llama_mmap_supported.restype = c_bool - -lib.llama_mlock_supported.argtypes = [] -lib.llama_mlock_supported.restype = c_bool - -lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params] -lib.llama_init_from_file.restype = llama_context_p - -lib.llama_free.argtypes = [llama_context_p] -lib.llama_free.restype = None - -lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] -lib.llama_model_quantize.restype = c_int - -lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, c_int] -lib.llama_apply_lora_from_file.restype = c_int - -lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] -lib.llama_get_kv_cache_token_count.restype = c_int - -lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int] -lib.llama_set_rng_seed.restype = None - -lib.llama_get_state_size.argtypes = [llama_context_p] -lib.llama_get_state_size.restype = c_size_t - -lib.llama_copy_state_data.argtypes = [llama_context_p, c_ubyte_p] -lib.llama_copy_state_data.restype = c_size_t - -lib.llama_set_state_data.argtypes = [llama_context_p, c_ubyte_p] -lib.llama_set_state_data.restype = c_size_t - -lib.llama_load_session_file.argtypes = [llama_context_p, c_char_p, llama_token_p, c_size_t, c_size_p] -lib.llama_load_session_file.restype = c_bool - -lib.llama_save_session_file.argtypes = [llama_context_p, c_char_p, llama_token_p, c_size_t] -lib.llama_save_session_file.restype = c_bool - -lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int] -lib.llama_eval.restype = c_int - -lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool] -lib.llama_tokenize.restype = c_int - -lib.llama_n_vocab.argtypes = [llama_context_p] -lib.llama_n_vocab.restype = c_int - -lib.llama_n_ctx.argtypes = [llama_context_p] -lib.llama_n_ctx.restype = c_int - -lib.llama_n_embd.argtypes = [llama_context_p] -lib.llama_n_embd.restype = c_int - -lib.llama_get_logits.argtypes = [llama_context_p] -lib.llama_get_logits.restype = c_float_p - -lib.llama_get_embeddings.argtypes = [llama_context_p] -lib.llama_get_embeddings.restype = c_float_p - -lib.llama_token_to_str.argtypes = [llama_context_p, llama_token] -lib.llama_token_to_str.restype = c_char_p - -lib.llama_token_bos.argtypes = [] -lib.llama_token_bos.restype = llama_token - -lib.llama_token_eos.argtypes = [] -lib.llama_token_eos.restype = llama_token - -lib.llama_token_nl.argtypes = [] -lib.llama_token_nl.restype = llama_token - -lib.llama_sample_repetition_penalty.argtypes = [llama_context_p, llama_token_data_array_p, llama_token_p, c_size_t, 
c_float] -lib.llama_sample_repetition_penalty.restype = None - -lib.llama_sample_frequency_and_presence_penalties.argtypes = [llama_context_p, llama_token_data_array_p, llama_token_p, c_size_t, c_float, c_float] -lib.llama_sample_frequency_and_presence_penalties.restype = None - -lib.llama_sample_softmax.argtypes = [llama_context_p, llama_token_data_array_p] -lib.llama_sample_softmax.restype = None - -lib.llama_sample_top_k.argtypes = [llama_context_p, llama_token_data_array_p, c_int, c_size_t] -lib.llama_sample_top_k.restype = None - -lib.llama_sample_top_p.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_size_t] -lib.llama_sample_top_p.restype = None - -lib.llama_sample_tail_free.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_size_t] -lib.llama_sample_tail_free.restype = None - -lib.llama_sample_typical.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_size_t] -lib.llama_sample_typical.restype = None - -lib.llama_sample_temperature.argtypes = [llama_context_p, llama_token_data_array_p, c_float] -lib.llama_sample_temperature.restype = None - -lib.llama_sample_token_mirostat.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_float, c_int, c_float_p] -lib.llama_sample_token_mirostat.restype = llama_token - -lib.llama_sample_token_mirostat_v2.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_float, c_float_p] -lib.llama_sample_token_mirostat_v2.restype = llama_token - -lib.llama_sample_token_greedy.argtypes = [llama_context_p, llama_token_data_array_p] -lib.llama_sample_token_greedy.restype = llama_token - -lib.llama_sample_token.argtypes = [llama_context_p, llama_token_data_array_p] -lib.llama_sample_token.restype = llama_token - -lib.llama_print_timings.argtypes = [llama_context_p] -lib.llama_print_timings.restype = None - -lib.llama_reset_timings.argtypes = [llama_context_p] -lib.llama_reset_timings.restype = None - -lib.llama_print_system_info.argtypes = [] -lib.llama_print_system_info.restype = c_char_p - - -# Python functions -def llama_context_default_params() -> llama_context_params: - params = lib.llama_context_default_params() - return params - -def llama_mmap_supported() -> bool: - return lib.llama_mmap_supported() - -def llama_mlock_supported() -> bool: - return lib.llama_mlock_supported() - -def llama_init_from_file(path_model: str, params: llama_context_params) -> llama_context_p: - """Various functions for loading a ggml llama model. - Allocate (almost) all memory needed for the model. 
- Return NULL on failure """ - return lib.llama_init_from_file(path_model.encode('utf-8'), params) - -def llama_free(ctx: llama_context_p): - """Free all allocated memory""" - lib.llama_free(ctx) - -def llama_model_quantize(fname_inp: str, fname_out: str, itype: c_int, qk: c_int) -> c_int: - """Returns 0 on success""" - return lib.llama_model_quantize(fname_inp.encode('utf-8'), fname_out.encode('utf-8'), itype, qk) - -def llama_apply_lora_from_file(ctx: llama_context_p, path_lora: str, path_base_model: str, n_threads: c_int) -> c_int: - return lib.llama_apply_lora_from_file(ctx, path_lora.encode('utf-8'), path_base_model.encode('utf-8'), n_threads) - -def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: - return lib.llama_get_kv_cache_token_count(ctx) - -def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): - return lib.llama_set_rng_seed(ctx, seed) - -def llama_get_state_size(ctx: llama_context_p) -> c_size_t: - return lib.llama_get_state_size(ctx) - -def llama_copy_state_data(ctx: llama_context_p, dst: c_ubyte_p) -> c_size_t: - return lib.llama_copy_state_data(ctx, dst) - -def llama_set_state_data(ctx: llama_context_p, src: c_ubyte_p) -> c_size_t: - return lib.llama_set_state_data(ctx, src) - -def llama_load_session_file(ctx: llama_context_p, path_session: str, tokens_out: llama_token_p, n_token_capacity: c_size_t, n_token_count_out: c_size_p) -> c_bool: - return lib.llama_load_session_file(ctx, path_session.encode('utf-8'), tokens_out, n_token_capacity, n_token_count_out) - -def llama_save_session_file(ctx: llama_context_p, path_session: str, tokens: llama_token_p, n_token_count: c_size_t) -> c_bool: - return lib.llama_save_session_file(ctx, path_session.encode('utf-8'), tokens, n_token_count) - -def llama_eval(ctx: llama_context_p, tokens: llama_token_p, n_tokens: c_int, n_past: c_int, n_threads: c_int) -> c_int: - """Run the llama inference to obtain the logits and probabilities for the next token. - tokens + n_tokens is the provided batch of new tokens to process - n_past is the number of tokens to use from previous eval calls - Returns 0 on success""" - return lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads) - -def llama_tokenize(ctx: llama_context_p, text: str, tokens: llama_token_p, n_max_tokens: c_int, add_bos: c_bool) -> c_int: - """Convert the provided text into tokens. - The tokens pointer must be large enough to hold the resulting tokens. - Returns the number of tokens on success, no more than n_max_tokens - Returns a negative number on failure - the number of tokens that would have been returned""" - return lib.llama_tokenize(ctx, text.encode('utf-8'), tokens, n_max_tokens, add_bos) - -def llama_n_vocab(ctx: llama_context_p) -> c_int: - return lib.llama_n_vocab(ctx) - -def llama_n_ctx(ctx: llama_context_p) -> c_int: - return lib.llama_n_ctx(ctx) - -def llama_n_embd(ctx: llama_context_p) -> c_int: - return lib.llama_n_embd(ctx) - -def llama_get_logits(ctx: llama_context_p) -> c_float_p: - """Token logits obtained from the last call to llama_eval() - The logits for the last token are stored in the last row - Can be mutated in order to change the probabilities of the next token - Rows: n_tokens - Cols: n_vocab""" - return lib.llama_get_logits(ctx) - -def llama_get_embeddings(ctx: llama_context_p) -> c_float_p: - """Get the embeddings for the input - shape: [n_embd] (1-dimensional)""" - return lib.llama_get_embeddings(ctx) - -def llama_token_to_str(ctx: llama_context_p, token: int) -> str: - """Token Id -> String. 
Uses the vocabulary in the provided context""" - return lib.llama_token_to_str(ctx, token).decode('utf-8', errors='ignore') - -def llama_token_bos() -> llama_token: - return lib.llama_token_bos() - -def llama_token_eos() -> llama_token: - return lib.llama_token_eos() - -def llama_token_nl() -> llama_token: - return lib.llama_token_nl() - -def llama_sample_repetition_penalty(ctx: llama_context_p, candidates: llama_token_data_array_p, last_tokens: llama_token_p, last_tokens_size: c_size_t, penalty: float): - lib.llama_sample_repetition_penalty(ctx, candidates, last_tokens, last_tokens_size, penalty) - -def llama_sample_frequency_and_presence_penalties(ctx: llama_context_p, candidates: llama_token_data_array_p, last_tokens: llama_token_p, last_tokens_size: c_size_t, alpha_frequency: float, alpha_presence: float): - lib.llama_sample_frequency_and_presence_penalties(ctx, candidates, last_tokens, last_tokens_size, alpha_frequency, alpha_presence) - -def llama_sample_softmax(ctx: llama_context_p, candidates: llama_token_data_array_p): - lib.llama_sample_softmax(ctx, candidates) - -def llama_sample_top_k(ctx: llama_context_p, candidates: llama_token_data_array_p, k: c_int, min_keep: c_size_t): - lib.llama_sample_top_k(ctx, candidates, k, min_keep) - -def llama_sample_top_p(ctx: llama_context_p, candidates: llama_token_data_array_p, p: float, min_keep: c_size_t): - lib.llama_sample_top_p(ctx, candidates, c_float(p), c_size_t(min_keep)) - -def llama_sample_tail_free(ctx: llama_context_p, candidates: llama_token_data_array_p, z: float, min_keep: c_size_t): - lib.llama_sample_tail_free(ctx, candidates, z, min_keep) - -def llama_sample_typical(ctx: llama_context_p, candidates: llama_token_data_array_p, p: float, min_keep: c_size_t): - lib.llama_sample_typical(ctx, candidates, p, min_keep) - -def llama_sample_temperature(ctx: llama_context_p, candidates: llama_token_data_array_p, temp: float): - lib.llama_sample_temperature(ctx, candidates, temp) - -def llama_sample_token_mirostat(ctx: llama_context_p, candidates: llama_token_data_array_p, tau: float, eta: float, m: c_int, mu: c_float_p) -> llama_token: - return lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) - -def llama_sample_token_mirostat_v2(ctx: llama_context_p, candidates: llama_token_data_array_p, tau: float, eta: float, mu: c_float_p) -> llama_token: - return lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) - -def llama_sample_token_greedy(ctx: llama_context_p, candidates: llama_token_data_array_p) -> llama_token: - return lib.llama_sample_token_greedy(ctx, candidates) - -def llama_sample_token(ctx: llama_context_p, candidates: llama_token_data_array_p) -> llama_token: - return lib.llama_sample_token(ctx, candidates) - -def llama_print_timings(ctx: llama_context_p): - lib.llama_print_timings(ctx) - -def llama_reset_timings(ctx: llama_context_p): - lib.llama_reset_timings(ctx) - -def llama_print_system_info() -> c_char_p: - return lib.llama_print_system_info() \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index cc3a23f0739fd..0000000000000 --- a/setup.py +++ /dev/null @@ -1,15 +0,0 @@ - -from setuptools import setup, find_packages -import glob, os - -setup( - name='llama_cpp', - version='0.0.1', - author='Anonymous', - author_email='', - license='All rights reserved', - packages=find_packages(where='py'), - package_dir={'': 'py'}, - install_requires=[], - entry_points={'console_scripts': []}, -) From 8fd8599f613115279c6997d04b5636702c6834da Mon Sep 17 00:00:00 2001 From: 
xaedes Date: Tue, 30 May 2023 17:07:03 +0200 Subject: [PATCH 65/86] rename baby-llama-text to train-text-from-scratch --- examples/CMakeLists.txt | 1 + examples/baby-llama/CMakeLists.txt | 4 ---- examples/train-text-from-scratch/CMakeLists.txt | 4 ++++ .../train-text-from-scratch.cpp} | 0 4 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 examples/train-text-from-scratch/CMakeLists.txt rename examples/{baby-llama/baby-llama-text.cpp => train-text-from-scratch/train-text-from-scratch.cpp} (100%) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e4ce5aca7b98b..e8eab351ded94 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -37,6 +37,7 @@ else() add_subdirectory(save-load-state) add_subdirectory(benchmark) add_subdirectory(baby-llama) + add_subdirectory(train-text-from-scratch) if(LLAMA_BUILD_SERVER) add_subdirectory(server) endif() diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt index c89dc792b9a0e..d2ce36367474f 100644 --- a/examples/baby-llama/CMakeLists.txt +++ b/examples/baby-llama/CMakeLists.txt @@ -2,7 +2,3 @@ set(TARGET baby-llama) add_executable(${TARGET} baby-llama.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) - -add_executable(${TARGET}-text baby-llama-text.cpp) -target_link_libraries(${TARGET}-text PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET}-text PRIVATE cxx_std_11) diff --git a/examples/train-text-from-scratch/CMakeLists.txt b/examples/train-text-from-scratch/CMakeLists.txt new file mode 100644 index 0000000000000..1a44c4961c084 --- /dev/null +++ b/examples/train-text-from-scratch/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET train-text-from-scratch) +add_executable(${TARGET} train-text-from-scratch.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp similarity index 100% rename from examples/baby-llama/baby-llama-text.cpp rename to examples/train-text-from-scratch/train-text-from-scratch.cpp From 7f172c1070d514e450e002e430957773093572ba Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 31 May 2023 00:25:37 +0200 Subject: [PATCH 66/86] replace auto parameters in lambda function --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index aa3b237890a8a..b8caded0047b3 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1652,7 +1652,7 @@ void shuffle_ints(int * begin, int * end) { for (int i=0; i Date: Wed, 31 May 2023 12:38:26 +0200 Subject: [PATCH 67/86] add #include --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index b8caded0047b3..45da62995896c 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include From 
01fc3faf71bc245fa3e31c160397ce915a17d8d3 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 31 May 2023 15:00:54 +0200 Subject: [PATCH 68/86] add explicit cast to fix compile error "error: non-constant-expression cannot be narrowed from type 'int64_t' (aka 'long long') to 'uint32_t' (aka 'unsigned int') in initializer list [-Wc++11-narrowing]" --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 45da62995896c..3dcb7fc16026b 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1788,7 +1788,10 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { const char * name = ggml_get_name(tensor); uint32_t name_len = strlen(name); uint32_t nd = tensor->n_dims; - uint32_t ne[4] = { tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3] }; + uint32_t ne[4] = { (uint32_t)tensor->ne[0], + (uint32_t)tensor->ne[1], + (uint32_t)tensor->ne[2], + (uint32_t)tensor->ne[3] }; file->write_u32(nd); file->write_u32(name_len); file->write_u32(tensor->type); From 83a34444afabc6838542b3f66b8c139c86273665 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 31 May 2023 15:02:38 +0200 Subject: [PATCH 69/86] remove trailing whitespace --- .../train-text-from-scratch/train-text-from-scratch.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 3dcb7fc16026b..7e8d80b940859 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1788,9 +1788,9 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { const char * name = ggml_get_name(tensor); uint32_t name_len = strlen(name); uint32_t nd = tensor->n_dims; - uint32_t ne[4] = { (uint32_t)tensor->ne[0], - (uint32_t)tensor->ne[1], - (uint32_t)tensor->ne[2], + uint32_t ne[4] = { (uint32_t)tensor->ne[0], + (uint32_t)tensor->ne[1], + (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3] }; file->write_u32(nd); file->write_u32(name_len); From 0e269665cd87fa803202fe18941a7478b41e7b92 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 1 Jun 2023 19:41:28 +0200 Subject: [PATCH 70/86] add ggml_opt_resume_g which accepts forward and backward cgraphs --- ggml.c | 11 +++++++++++ ggml.h | 8 ++++++++ 2 files changed, 19 insertions(+) diff --git a/ggml.c b/ggml.c index bde5f96985e5c..b16cd07a981e5 100644 --- a/ggml.c +++ b/ggml.c @@ -17457,6 +17457,17 @@ enum ggml_opt_result ggml_opt_resume( *gf = ggml_build_forward (f); *gb = ggml_build_backward(ctx, gf, true); + return ggml_opt_resume_g(ctx, opt, f, gf, gb); +} + +enum ggml_opt_result ggml_opt_resume_g( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb) { + + // build forward + backward compute graphs enum ggml_opt_result result = GGML_OPT_OK; switch (opt->params.type) { diff --git a/ggml.h b/ggml.h index a9750d89d73bf..1e85b2fb1aa27 100644 --- a/ggml.h +++ b/ggml.h @@ -1208,6 +1208,14 @@ extern "C" { struct ggml_opt_context * opt, struct ggml_tensor * f); + // continue optimizing the function defined by the tensor f + GGML_API enum ggml_opt_result ggml_opt_resume_g( + struct 
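// Hypothetical usage of the new entry point (a sketch mirroring what
// ggml_opt_resume itself now does, not part of this patch): build the graphs
// once, keep them, and call ggml_opt_resume_g repeatedly to continue
// optimizing with the same opt state:
//
//   struct ggml_cgraph gf = ggml_build_forward(f);
//   struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, /*keep=*/ true);
//   enum ggml_opt_result res = ggml_opt_resume_g(ctx, opt, f, &gf, &gb);
//
// This lets callers reuse manually constructed forward/backward graphs instead
// of rebuilding them on every call.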
ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb); + // // quantization // From 3164f9338109b139182def39921ad4131d57c1e1 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 1 Jun 2023 19:41:55 +0200 Subject: [PATCH 71/86] fix formulas in comments --- ggml.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml.c b/ggml.c index b16cd07a981e5..b3ae7f2f9030b 100644 --- a/ggml.c +++ b/ggml.c @@ -13592,12 +13592,12 @@ static void ggml_compute_forward_flash_attn_back_f32( // vcur = v[:M,:D,iq2,iq3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 // S0 = -Inf [D,1,1,1] // ~S1[i] = dot(kcur[:D,i], qcur) - // S1 = qcur.T @ kcur [M,1,1,1] grad[S1] = grad[S2] * scale + // S1 = qcur @ kcur.T [M,1,1,1] grad[S1] = grad[S2] * scale // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur - // ~S5[i] = dot(vcur[:,i],S4) - // S5 = S4.T @ vcur [D,1,1,1] grad[S5] = d[:D,iq1,iq2,iq3] + // ~S5[i] = dot(vcur[:,i], S4) + // S5 = S4 @ vcur.T [D,1,1,1] grad[S5] = d[:D,iq1,iq2,iq3] // ~dst[i,iq1,iq2,iq3] = S5[i] ^ // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,iq1,iq2,iq3] // dst backward-/ grad[dst] = d @@ -13615,7 +13615,7 @@ static void ggml_compute_forward_flash_attn_back_f32( // // in post-order: // - // S1 = qcur.T @ kcur + // S1 = qcur @ kcur.T // S2 = S1 * scale // S3 = diag_mask_inf(S2, P) // S4 = softmax(S3) @@ -13628,7 +13628,7 @@ static void ggml_compute_forward_flash_attn_back_f32( // // using less variables (SM=S4): // - // S = diag_mask_inf(qcur.T @ kcur * scale, P) + // S = diag_mask_inf(qcur @ kcur.T * scale, P) // SM = softmax(S) // S = d[:D,iq1,iq2,iq3] @ vcur // dot_SM_gradSM = dot(SM, S) From 765b290010f544613d97a215dea56a3147fed084 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 1 Jun 2023 19:42:51 +0200 Subject: [PATCH 72/86] bug fix for ggml_compute_forward_get_rows_back_f32 the result should be set to zero, not to whatever data is in opt0 --- ggml.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index b3ae7f2f9030b..77eb43d06be6e 100644 --- a/ggml.c +++ b/ggml.c @@ -11016,7 +11016,11 @@ static void ggml_compute_forward_get_rows_back_f32( GGML_ASSERT(ggml_is_contiguous(opt0)); GGML_ASSERT(ggml_is_contiguous(dst)); - ggml_compute_forward_dup_same_cont(params, opt0, dst); + // ggml_compute_forward_dup_same_cont(params, opt0, dst); + + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + } if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; From 0d4b87de3de6e0d910de5a0a2416ef6b10332fbe Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 1 Jun 2023 19:50:48 +0200 Subject: [PATCH 73/86] improve training memory usage with scratch buffers instead of relying on the automatic backward pass, we manually create the graph for the backward pass. it turns out that all backward pass operations need only temporary memory which can be reused after each layer. 
will compute backward pass for ALL model parameters --- .../train-text-from-scratch.cpp | 595 +++++++++++++++++- 1 file changed, 578 insertions(+), 17 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 7e8d80b940859..ee17bd8e43eb0 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1337,6 +1337,505 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( return inpL; } +struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( + struct my_llama_model * model, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_tensor * * logits, + struct ggml_tensor * tokens_input, + struct ggml_tensor * targets, + void * compute_buf_0, + void * compute_buf_1, + void * compute_buf_2, + size_t size_buf_0, + size_t size_buf_1, + size_t size_buf_2, + const int n_tokens, + const int n_batch) { + + ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + + const int n_past = 0; + const int N = n_tokens; + + gf->n_nodes = 0; + gf->n_leafs = 0; + gf->work_size = 0; + gf->perf_runs = 0; + gf->perf_cycles = 0; + gf->perf_time_us = 0; + gf->work = NULL; + + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + const int n_ff = get_n_ff(&hparams); + const int rope_mode = 0; + + auto expand = [] (struct ggml_cgraph * g, struct ggml_tensor * t) -> struct ggml_tensor * { + ggml_build_forward_expand(g, t); + return t; + }; + + int last_buf = -1; + size_t buf_offs[3] = { 0, 0, 0 }; + size_t buf_size[3] = { size_buf_0, + size_buf_1, + size_buf_2 }; + void * buf_data[3] = { compute_buf_0, + compute_buf_1, + compute_buf_2 }; + auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data] (int buf) { + size_t last_offs = 0; + last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + if (last_buf >= 0) { + buf_offs[last_buf] = last_offs; + } + if (buf >= 0) { + size_t offs = buf_offs[buf]; + size_t size = buf_size[buf]; + void * data = buf_data[buf]; + ggml_set_scratch(ctx0, { offs, size, data, }); + } + last_buf = buf; + }; + + auto clr_buf = [&buf_offs] (int buf) { + if (buf < 0) return; + // size_t last_offs = 0; + // last_offs = ggml_set_scratch(ctx, { 0, 0, nullptr, }); + // if (last_buf >= 0) { + // buf_offs[last_buf] = last_offs; + // } + // buf_max_size[buf] = std::max(buf_max_size[buf], buf_offs[buf]); + buf_offs[buf] = 0; + // if (last_buf >= 0) { + // size_t offs = buf_offs[last_buf]; + // size_t size = buf_size[last_buf]; + // void * data = buf_data[last_buf]; + // ggml_set_scratch(ctx0, { offset, size, data, }); + // } + }; + + auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { + int64_t ne0 = n_embd/n_head; + int64_t ne1 = N; + int64_t ne2 = n_head; + int64_t ne3 = n_batch; + size_t nb0 = ggml_element_size(t); + size_t nb1 = nb0*ne0; + size_t nb2 = nb1*ne1; + size_t nb3 = nb2*ne2; + size_t offset = 0; + return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); + }; + + auto view__k = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { + int64_t ne0 = n_embd/n_head; + int64_t ne1 = N; + int64_t ne2 = n_head; + int64_t ne3 = n_batch; + size_t nb0 = ggml_element_size(t); + 
size_t nb1 = nb0*ne0; + size_t nb2 = nb1*ne1; + size_t nb3 = nb2*ne2; + size_t offset = nb3*ne3; + return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); + }; + + auto view__v = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { + int64_t ne0 = N; + int64_t ne1 = n_embd/n_head; + int64_t ne2 = n_head; + int64_t ne3 = n_batch; + size_t nb0 = ggml_element_size(t); + size_t nb1 = nb0*ne0; + size_t nb2 = nb1*ne1; + size_t nb3 = nb2*ne2; + size_t offset = 2*nb3*ne3; + return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); + }; + + auto add_or_set = [ctx0] (struct ggml_tensor * a, struct ggml_tensor * b) -> struct ggml_tensor * { + if (a == NULL) { + return b; + } else { + return ggml_add_inplace(ctx0, a, b); + } + }; + + use_buf(-1); + + model->tok_embeddings->grad = ggml_dup_tensor(ctx0, model->tok_embeddings->grad); + model->norm->grad = ggml_dup_tensor(ctx0, model->norm->grad); + model->output->grad = ggml_dup_tensor(ctx0, model->output->grad); + + for (int il = 0; il < n_layer; ++il) { + struct my_llama_layer & layer = model->layers[il]; + layer.attention_norm->grad = ggml_dup_tensor(ctx0, layer.attention_norm->grad); + layer.wq->grad = ggml_dup_tensor(ctx0, layer.wq->grad); + layer.wk->grad = ggml_dup_tensor(ctx0, layer.wk->grad); + layer.wv->grad = ggml_dup_tensor(ctx0, layer.wv->grad); + layer.wo->grad = ggml_dup_tensor(ctx0, layer.wo->grad); + layer.ffn_norm->grad = ggml_dup_tensor(ctx0, layer.ffn_norm->grad); + layer.w1->grad = ggml_dup_tensor(ctx0, layer.w1->grad); + layer.w2->grad = ggml_dup_tensor(ctx0, layer.w2->grad); + layer.w3->grad = ggml_dup_tensor(ctx0, layer.w3->grad); + } + + clr_buf(1); + clr_buf(2); + + use_buf(0); + + struct ggml_tensor * t00 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); assert_shape_1d(t00, N*n_batch); + memcpy(t00->data, tokens_input->data, ggml_element_size(t00)*N*n_batch); + + struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch); + + // need to remember these for the backward pass + std::vector t02L; t02L.resize(n_layer, NULL); + std::vector t03L; t03L.resize(n_layer, NULL); + std::vector t04L; t04L.resize(n_layer, NULL); + std::vector t05L; t05L.resize(n_layer, NULL); + std::vector t06L; t06L.resize(n_layer, NULL); + std::vector t07L; t07L.resize(n_layer, NULL); + std::vector t08L; t08L.resize(n_layer, NULL); + std::vector t09L; t09L.resize(n_layer, NULL); + std::vector t10L; t10L.resize(n_layer, NULL); + std::vector t11L; t11L.resize(n_layer, NULL); + std::vector t12L; t12L.resize(n_layer, NULL); + std::vector t13L; t13L.resize(n_layer, NULL); + std::vector t14L; t14L.resize(n_layer, NULL); + std::vector t15L; t15L.resize(n_layer, NULL); + std::vector t16L; t16L.resize(n_layer, NULL); + std::vector t17L; t17L.resize(n_layer, NULL); + std::vector t18L; t18L.resize(n_layer, NULL); + std::vector t19L; t19L.resize(n_layer, NULL); + std::vector t20L; t20L.resize(n_layer, NULL); + std::vector t21L; t21L.resize(n_layer, NULL); + std::vector t22L; t22L.resize(n_layer, NULL); + std::vector t23L; t23L.resize(n_layer, NULL); + std::vector t24L; t24L.resize(n_layer, NULL); + std::vector t25L; t25L.resize(n_layer, NULL); + std::vector t26L; t26L.resize(n_layer, NULL); + std::vector t27L; t27L.resize(n_layer, NULL); + std::vector t28L; t28L.resize(n_layer, NULL); + std::vector t29L; t29L.resize(n_layer, NULL); + std::vector t30L; t30L.resize(n_layer, NULL); + + struct ggml_tensor * cur = t01; + + for (int il = 
0; il < n_layer; ++il) { + clr_buf(1); + struct my_llama_layer & layer = model->layers[il]; + // tensors with values necessary for backward pass are in persistent buf(0) + // other tensors with buf(1) are only temporary needed, and their memory reused after layer is completed. + use_buf(0); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t02, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(1); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); + use_buf(0); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); + use_buf(1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); + use_buf(1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + use_buf(1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); + use_buf(1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); + use_buf(1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + use_buf(1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); + use_buf(1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); + use_buf(0); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); // n_embd/n_head, N, n_head, n_batch + use_buf(0); struct ggml_tensor * t14 = expand(gf, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); // n_embd/n_head, N, n_head, n_batch + use_buf(0); struct ggml_tensor * t15 = expand(gf, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); // N, n_embd/n_head, n_head, n_batch + use_buf(1); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + use_buf(1); struct ggml_tensor * t17 = expand(gf, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); + use_buf(1); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); + use_buf(0); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(1); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); + use_buf(0); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(0); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21)); assert_shape_2d(t22, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(1); struct ggml_tensor * 
t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); + use_buf(0); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(0); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); // n_ff, N*n_batch + use_buf(0); struct ggml_tensor * t26 = expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); // n_ff, N*n_batch + use_buf(0); struct ggml_tensor * t27 = expand(gf, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); // n_ff, N*n_batch + use_buf(0); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); // n_ff, N*n_batch + use_buf(1); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); + use_buf(0); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); // n_embd, N*n_batch + t02L[il] = t02; + t03L[il] = t03; + t04L[il] = t04; + t05L[il] = t05; + t06L[il] = t06; + t07L[il] = t07; + t08L[il] = t08; + t09L[il] = t09; + t10L[il] = t10; + t11L[il] = t11; + t12L[il] = t12; + t13L[il] = t13; + t14L[il] = t14; + t15L[il] = t15; + t16L[il] = t16; + t17L[il] = t17; + t18L[il] = t18; + t19L[il] = t19; + t20L[il] = t20; + t21L[il] = t21; + t22L[il] = t22; + t23L[il] = t23; + t24L[il] = t24; + t25L[il] = t25; + t26L[il] = t26; + t27L[il] = t27; + t28L[il] = t28; + t29L[il] = t29; + t30L[il] = t30; + + cur = t30; + } + clr_buf(1); + use_buf(1); + struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t31, n_embd, N*n_batch); + struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch); + struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch); + struct ggml_tensor * t34 = expand(gf, ggml_mul_mat (ctx0, model->output, t33)); assert_shape_2d(t34, n_vocab, N*n_batch); + struct ggml_tensor * t35 = expand(gf, ggml_reshape_3d(ctx0, t34, n_vocab, N, n_batch)); assert_shape_3d(t35, n_vocab, N, n_batch); + struct ggml_tensor * t36 = expand(gf, ggml_cross_entropy_loss(ctx0, t35, targets)); assert_shape_1d(t36, 1); + + { + /* + tok_embeddings | grad_tok_embeddings = ggml_get_rows_back(grad_t01, t00) + L0_att_norm | grad_L0_att_norm = ggml_repeat_back(grad_t03L0, L0_att_norm.shape) + L0_wq | grad_L0_wq = ggml_out_prod(t04L0, grad_t05L0) + L0_wk | grad_L0_wk = ggml_out_prod(t04L0, grad_t08L0) + L0_wv | grad_L0_wv = ggml_out_prod(t04L0, ggml_transpose(grad_t11L0)) + L0_wo | grad_L0_wo = ggml_out_prod(t19L0, grad_t20L0) + L0_ffn_norm | grad_L0_ffn_norm = ggml_repeat_back(grad_t23L0, L0_ffn_norm.shape) + L0_w1 | grad_L0_w1 = ggml_out_prod(t24L0, grad_t26L0) + L0_w2 | grad_L0_w2 = ggml_out_prod(t28L0, grad_t29L0) + L0_w3 | grad_L0_w3 = ggml_out_prod(t24L0, grad_t25L0) + L1_att_norm | grad_L1_att_norm = ggml_repeat_back(grad_t03L1, L1_att_norm.shape) + L1_wq | grad_L1_wq = ggml_out_prod(t04L1, grad_t05L1) + L1_wk | grad_L1_wk = ggml_out_prod(t04L1, grad_t08L1) + L1_wv | grad_L1_wv = ggml_out_prod(t04L1, ggml_transpose(grad_t11L1)) + L1_wo | grad_L1_wo = ggml_out_prod(t19L1, grad_t20L1) + L1_ffn_norm | grad_L1_ffn_norm = ggml_repeat_back(grad_t23L1, L1_ffn_norm.shape) + L1_w1 | grad_L1_w1 = ggml_out_prod(t24L1, grad_t26L1) + L1_w2 | grad_L1_w2 = ggml_out_prod(t28L1, grad_t29L1) + L1_w3 | grad_L1_w3 
= ggml_out_prod(t24L1, grad_t25L1) + norm | grad_norm = ggml_repeat_back(grad_t32, norm.shape) + output | grad_output = ggml_out_prod(t33, grad_t34) + | + t01 = ggml_get_rows(tok_embeddings, t00) | grad_t01 = grad_t21L0 + ggml_rms_norm_back(t01, grad_t02L0) + for layer: | + t02L0*= ggml_rms_norm (t01) | grad_t02L0 = ggml_mul(grad_t04L0, t03L0) + t03L0 = ggml_repeat (L0_att_norm, t02L0_shape) | grad_t03L0 = ggml_mul(grad_t04L0, t02L0) + t04L0*= ggml_mul (t02L0, t03L0) | grad_t04L0 = ggml_out_prod(L0_wv, grad_t11L0) + ggml_out_prod(L0_wk, ggml_transpose(grad_t08L0)) + ggml_out_prod(L0_wq, ggml_transpose(grad_t05L0)) + t05L0 = ggml_mul_mat (L0_wq, t04L0) | grad_t05L0 = ggml_reshape(grad_t06L0, t05L0_shape) + t06L0 = ggml_reshape_4d (t05L0, n_embd/n_head, n_head, N, n_batch) | grad_t06L0 = ggml_rope_back(grad_t07L0) + t07L0 = ggml_rope_inplace (t06L0) | grad_t07L0 = ggml_permute_back(grad_t13L0, 0, 2, 1, 3) = ggml_permute(grad_t13L0, 0, 2, 1, 3) + t08L0 = ggml_mul_mat (L0_wk, t04L0) | grad_t08L0 = ggml_reshape(grad_t09L0, t08L0_shape) + t09L0 = ggml_reshape_4d (t08L0, n_embd/n_head, n_head, N, n_batch) | grad_t09L0 = ggml_rope_back(grad_t10L0) + t10L0 = ggml_rope_inplace (t09L0) | grad_t10L0 = ggml_permute_back(grad_t14L0, 0, 2, 1, 3) = ggml_permute(grad_t14L0, 0, 2, 1, 3) + t11L0 = ggml_mul_mat (t04L0, L0_wv) | grad_t11L0 = ggml_reshape(grad_t12L0, t11L0_shape) + t12L0 = ggml_reshape_4d (t11L0, N, n_batch, n_embd/n_head, n_head) | grad_t12L0 = ggml_permute_back(grad_t15L0, 0, 3, 1, 2) = ggml_permute(grad_t15L0, 0, 2, 3, 1) + t13L0*= ggml_permute (t07L0, 0, 2, 1, 3) | grad_t13L0 = view__q(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) + t14L0*= ggml_permute (t10L0, 0, 2, 1, 3) | grad_t14L0 = view__k(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) + t15L0*= ggml_permute (t12L0, 0, 3, 1, 2) | grad_t15L0 = view__v(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) + t16L0 = ggml_flash_attn (t13L0, t14L0, t15L0) | grad_t16L0 = ggml_permute_back(grad_t17L0, 0, 2, 1, 3) = ggml_permute(grad_t17L0, 0, 2, 1, 3) + t17L0 = ggml_permute (t16L0, 0, 2, 1, 3) | grad_t17L0 = grad_t18L0 + t18L0 = ggml_cont (t17L0) | grad_t18L0 = ggml_reshape(grad_t19L0, t18L0_shape) + t19L0*= ggml_reshape_2d (t18L0, n_embd, N*n_batch) | grad_t19L0 = ggml_out_prod(L0_wo, ggml_transpose(grad_t20L0)) + t20L0 = ggml_mul_mat (L0_wo, t19L0) | grad_t20L0 = grad_t21L0 + t21L0*= ggml_add (t20L0, t01) | grad_t21L0 = grad_t30L0 + ggml_rms_norm_back(t21L0, grad_t22L0) + t22L0*= ggml_rms_norm (t21L0) | grad_t22L0 = ggml_mul(grad_t24L0, t23L0) + t23L0 = ggml_repeat (L0_ffn_norm, t22L0_shape) | grad_t23L0 = ggml_mul(grad_t24L0, t22L0) + t24L0*= ggml_mul (t23L0, t22L0) | grad_t24L0 = ggml_out_prod(L0_w1, ggml_transpose(grad_t26L0)) + ggml_out_prod(L0_w3, ggml_transpose(grad_t25L0)) + t25L0*= ggml_mul_mat (L0_w3, t24L0) | grad_t25L0 = ggml_mul(grad_t28L0, t27L0) + t26L0*= ggml_mul_mat (L0_w1, t24L0) | grad_t26L0 = ggml_silu_back(t26L0, grad_t27L0) + t27L0*= ggml_silu (t26L0) | grad_t27L0 = ggml_mul(grad_t28L0, t25L0) + t28L0*= ggml_mul (t27L0, t25L0) | grad_t28L0 = ggml_out_prod(L0_w2, ggml_transpose(grad_t29L0)) + t29L0 = ggml_mul_mat (L0_w2, t28L0) | grad_t29L0 = grad_t30L0 + t30L0*= ggml_add (t21L0, t29L0) | grad_t30L0 = ggml_rms_norm_back(t30L0, grad_t02L1) + grad_t21L1 + ^ + t02L1*= ggml_rms_norm (t30L0) | grad_t02L1 = ggml_mul(grad_t04L1, t03L1) + t03L1 = ggml_repeat (L1_att_norm, t02L1_shape) | grad_t03L1 = ggml_mul(grad_t04L1, t02L1) + t04L1*= ggml_mul (t02L1, t03L1) | grad_t04L1 = ggml_out_prod(L1_wv, grad_t11L1) + 
ggml_out_prod(L1_wk, ggml_transpose(grad_t08L1)) + ggml_out_prod(L1_wq, ggml_transpose(grad_t05L1)) + t05L1 = ggml_mul_mat (L1_wq, t04L1) | grad_t05L1 = ggml_reshape(grad_t06L1, t05L1_shape) + t06L1 = ggml_reshape_4d (t05L1, n_embd/n_head, n_head, N, n_batch) | grad_t06L1 = ggml_rope_back(grad_t07L1) + t07L1 = ggml_rope_inplace (t06L1) | grad_t07L1 = ggml_permute_back(grad_t13L1, 0, 2, 1, 3) = ggml_permute(grad_t13L1, 0, 2, 1, 3) + t08L1 = ggml_mul_mat (L1_wk, t04L1) | grad_t08L1 = ggml_reshape(grad_t09L1, t08L1_shape) + t09L1 = ggml_reshape_4d (t08L1, n_embd/n_head, n_head, N, n_batch) | grad_t09L1 = ggml_rope_back(grad_t10L1) + t10L1 = ggml_rope_inplace (t09L1) | grad_t10L1 = ggml_permute_back(grad_t14L1, 0, 2, 1, 3) = ggml_permute(grad_t14L1, 0, 2, 1, 3) + t11L1 = ggml_mul_mat (t04L1, L1_wv) | grad_t11L1 = ggml_reshape(grad_t12L1, t11L1_shape) + t12L1 = ggml_reshape_4d (t11L1, N, n_batch, n_embd/n_head, n_head) | grad_t12L1 = ggml_permute_back(grad_t15L1, 0, 3, 1, 2) = ggml_permute(grad_t15L1, 0, 2, 3, 1) + t13L1*= ggml_permute (t07L1, 0, 2, 1, 3) | grad_t13L1 = view__q(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) + t14L1*= ggml_permute (t10L1, 0, 2, 1, 3) | grad_t14L1 = view__k(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) + t15L1*= ggml_permute (t12L1, 0, 3, 1, 2) | grad_t15L1 = view__v(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) + t16L1 = ggml_flash_attn (t13L1, t14L1, t15L1) | grad_t16L1 = ggml_permute_back(grad_t17L1, 0, 2, 1, 3) = ggml_permute(grad_t17L1, 0, 2, 1, 3) + t17L1 = ggml_permute (t16L1, 0, 2, 1, 3) | grad_t17L1 = grad_t18L1 + t18L1 = ggml_cont (t17L1) | grad_t18L1 = ggml_reshape(grad_t19L1, t18L1_shape) + t19L1*= ggml_reshape_2d (t18L1, n_embd, N*n_batch) | grad_t19L1 = ggml_out_prod(L1_wo, ggml_transpose(grad_t20L1)) + t20L1 = ggml_mul_mat (L1_wo, t19L1) | grad_t20L1 = grad_t21L1 + t21L1*= ggml_add (t20L1, t30L0) | grad_t21L1 = grad_t30L1 + ggml_rms_norm_back(t21L1, grad_t22L1) + t22L1*= ggml_rms_norm (t21L1) | grad_t22L1 = ggml_mul(grad_t24L1, t23L1) + t23L1 = ggml_repeat (L1_ffn_norm, t22L1_shape) | grad_t23L1 = ggml_mul(grad_t24L1, t22L1) + t24L1*= ggml_mul (t23L1, t22L1) | grad_t24L1 = ggml_out_prod(L1_w1, ggml_transpose(grad_t26L1)) + ggml_out_prod(L1_w3, ggml_transpose(grad_t25L1)) + t25L1*= ggml_mul_mat (L1_w3, t24L1) | grad_t25L1 = ggml_mul(grad_t28L1, t27L1) + t26L1*= ggml_mul_mat (L1_w1, t24L1) | grad_t26L1 = ggml_silu_back(t26L1, grad_t27L1) + t27L1*= ggml_silu (t26L1) | grad_t27L1 = ggml_mul(grad_t28L1, t25L1) + t28L1*= ggml_mul (t27L1, t25L1) | grad_t28L1 = ggml_out_prod(L1_w2, ggml_transpose(grad_t29L1)) + t29L1 = ggml_mul_mat (L1_w2, t28L1) | grad_t29L1 = grad_t30L1 + t30L1*= ggml_add (t21L1, t29L1) | grad_t30L1 = ggml_rms_norm_back(t30L1, grad_t31) + ^ + t31 = ggml_rms_norm (t30L1) | grad_t31 = ggml_mul(grad_t33, t32) + t32 = ggml_repeat (norm, t31.shape) | grad_t32 = ggml_mul(grad_t33, t31) + t33 = ggml_mul (t32, t31) | grad_t33 = ggml_out_prod(output, ggml_transpose(grad_t34)) + t34 = ggml_mul_mat (output, t33) | grad_t34 = ggml_reshape(grad_t35, t34.shape) + t35 = ggml_reshape_3d (t34, n_vocab, N, n_batch) | grad_t35 = ggml_cross_entropy_loss_back(t35, targets, grad_t36) + t36 = ggml_cross_entropy_loss(t35, targets) | grad_t36 = 1 (optimizer) + tensors marked with * need to be stored until grad computation + tensors during grad computation are all temporary + */ + } + + *gb = *gf; + + use_buf(-1); + // t36->grad gets set to one by optimizer, so we need to create the tensor. + // initialize it with 1.0f to make sure. 
+ t36->grad = ggml_new_f32(ctx0, 1.0f); + + use_buf(1); + t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch); + t34->grad = expand(gb, ggml_reshape_2d (ctx0, t35->grad, n_vocab, N*n_batch)); assert_shape_2d(t34->grad, n_vocab, N*n_batch); + t33->grad = expand(gb, ggml_out_prod (ctx0, model->output, ggml_transpose(ctx0, t34->grad))); assert_shape_2d(t33->grad, n_embd, N*n_batch); + t32->grad = expand(gb, ggml_mul (ctx0, t33->grad, t31)); assert_shape_2d(t32->grad, n_embd, N*n_batch); + + use_buf(-1); + + model->norm->grad = expand(gb, add_or_set(model->norm->grad, ggml_repeat_back(ctx0, t32->grad, model->norm))); assert_shape_1d(model->norm->grad, n_embd); + model->output->grad = expand(gb, add_or_set(model->output->grad, ggml_out_prod(ctx0, t33, t34->grad))); assert_shape_2d(model->output->grad, n_embd, n_vocab); + + clr_buf(2); + use_buf(2); + t31->grad = expand(gb, ggml_mul(ctx0, t33->grad, t32)); assert_shape_2d(t31->grad, n_embd, N*n_batch); + + struct ggml_tensor * back_layer_inp = t31; + struct ggml_tensor * grad_layer_inp = NULL; + + for (int k = 0; k < n_layer; ++k) { + int il = n_layer-1-k; + struct my_llama_layer & layer = model->layers[il]; + + struct ggml_tensor * t02 = t02L[il]; + struct ggml_tensor * t03 = t03L[il]; + struct ggml_tensor * t04 = t04L[il]; + struct ggml_tensor * t05 = t05L[il]; + struct ggml_tensor * t06 = t06L[il]; + struct ggml_tensor * t07 = t07L[il]; + struct ggml_tensor * t08 = t08L[il]; + struct ggml_tensor * t09 = t09L[il]; + struct ggml_tensor * t10 = t10L[il]; + struct ggml_tensor * t11 = t11L[il]; + struct ggml_tensor * t12 = t12L[il]; + struct ggml_tensor * t13 = t13L[il]; + struct ggml_tensor * t14 = t14L[il]; + struct ggml_tensor * t15 = t15L[il]; + struct ggml_tensor * t16 = t16L[il]; + struct ggml_tensor * t17 = t17L[il]; + struct ggml_tensor * t18 = t18L[il]; + struct ggml_tensor * t19 = t19L[il]; + struct ggml_tensor * t20 = t20L[il]; + struct ggml_tensor * t21 = t21L[il]; + struct ggml_tensor * t22 = t22L[il]; + struct ggml_tensor * t23 = t23L[il]; + struct ggml_tensor * t24 = t24L[il]; + struct ggml_tensor * t25 = t25L[il]; + struct ggml_tensor * t26 = t26L[il]; + struct ggml_tensor * t27 = t27L[il]; + struct ggml_tensor * t28 = t28L[il]; + struct ggml_tensor * t29 = t29L[il]; + struct ggml_tensor * t30 = t30L[il]; + + clr_buf(1); + use_buf(1); + t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + if (grad_layer_inp) { + t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + } + clr_buf(2); + t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); + t28->grad = expand(gb, ggml_out_prod(ctx0, layer.w2, ggml_transpose(ctx0, t29->grad))); assert_shape_2d(t28->grad, n_ff, N*n_batch); + t27->grad = expand(gb, ggml_mul(ctx0, t28->grad, t25)); assert_shape_2d(t27->grad, n_ff, N*n_batch); + t26->grad = expand(gb, ggml_silu_back(ctx0, t26, t27->grad)); assert_shape_2d(t26->grad, n_ff, N*n_batch); + t25->grad = expand(gb, ggml_mul(ctx0, t28->grad, t27)); assert_shape_2d(t25->grad, n_ff, N*n_batch); + t24->grad = expand(gb, ggml_add_inplace(ctx0, + ggml_out_prod(ctx0, layer.w1, ggml_transpose(ctx0, t26->grad)), + ggml_out_prod(ctx0, layer.w3, ggml_transpose(ctx0, t25->grad)))); assert_shape_2d(t24->grad, n_embd, N*n_batch); + t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22)); assert_shape_2d(t23->grad, n_embd, N*n_batch); 
+ t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad))); assert_shape_2d(t22->grad, n_embd, N*n_batch); + use_buf(2); + t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad))); assert_shape_2d(t21->grad, n_embd, N*n_batch); + grad_layer_inp = t21; + use_buf(1); + t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); + t19->grad = expand(gb, ggml_out_prod(ctx0, layer.wo, ggml_transpose(ctx0, t20->grad))); assert_shape_2d(t19->grad, n_embd, N*n_batch); + t18->grad = expand(gb, ggml_reshape(ctx0, t19->grad, t18)); assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch); + t17->grad = t18->grad; assert_shape_4d(t17->grad, n_embd/n_head, n_head, N, n_batch); + t16->grad = expand(gb, ggml_permute(ctx0, t17->grad, 0, 2, 1, 3)); assert_shape_4d(t16->grad, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * flash_attn = expand(gb, ggml_flash_attn_back(ctx0, t13, t14, t15, t16->grad, true)); assert_shape_4d(flash_attn, n_embd/n_head, N*3, n_head, n_batch); + t15->grad = expand(gb, view__v(flash_attn)); assert_shape_4d(t15->grad, N, n_embd/n_head, n_head, n_batch); + t14->grad = expand(gb, view__k(flash_attn)); assert_shape_4d(t14->grad, n_embd/n_head, N, n_head, n_batch); + t13->grad = expand(gb, view__q(flash_attn)); assert_shape_4d(t13->grad, n_embd/n_head, N, n_head, n_batch); + t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head); + t11->grad = expand(gb, ggml_reshape(ctx0, ggml_cont(ctx0, t12->grad), t11)); assert_shape_2d(t11->grad, N*n_batch, n_embd); + t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch); + t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); + t08->grad = expand(gb, ggml_reshape(ctx0, t09->grad, t08)); assert_shape_2d(t08->grad, n_embd, N*n_batch); + t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch); + t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch); + t05->grad = expand(gb, ggml_reshape(ctx0, t06->grad, t05)); assert_shape_2d(t05->grad, n_embd, N*n_batch); + t04->grad = expand(gb, ggml_add_inplace(ctx0, + ggml_add_inplace(ctx0, + ggml_out_prod(ctx0, layer.wv, t11->grad), + ggml_out_prod(ctx0, layer.wk, ggml_transpose(ctx0, t08->grad))), + ggml_out_prod(ctx0, layer.wq, ggml_transpose(ctx0, t05->grad)))); assert_shape_2d(t04->grad, n_embd, N*n_batch); + t03->grad = expand(gb, ggml_mul(ctx0, t04->grad, t02)); assert_shape_2d(t04->grad, n_embd, N*n_batch); + use_buf(2); + t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, t03)); assert_shape_2d(t02->grad, n_embd, N*n_batch); + back_layer_inp = t02->grad; + use_buf(1); + + use_buf(-1); + layer.attention_norm->grad = expand(gb, add_or_set(layer.attention_norm->grad, ggml_repeat_back(ctx0, t03->grad, layer.attention_norm))); assert_shape_1d(layer.attention_norm->grad, n_embd); + layer.wq->grad = expand(gb, add_or_set(layer.wq->grad, ggml_out_prod(ctx0, t04, t05->grad))); assert_shape_2d(layer.wq->grad, n_embd, n_embd); + layer.wk->grad = expand(gb, add_or_set(layer.wk->grad, ggml_out_prod(ctx0, t04, t08->grad))); assert_shape_2d(layer.wk->grad, n_embd, n_embd); + layer.wv->grad = expand(gb, 
add_or_set(layer.wv->grad, ggml_out_prod(ctx0, t04, ggml_transpose(ctx0, t11->grad)))); assert_shape_2d(layer.wv->grad, n_embd, n_embd); + layer.wo->grad = expand(gb, add_or_set(layer.wo->grad, ggml_out_prod(ctx0, t19, t20->grad))); assert_shape_2d(layer.wo->grad, n_embd, n_embd); + layer.ffn_norm->grad = expand(gb, add_or_set(layer.ffn_norm->grad, ggml_repeat_back(ctx0, t23->grad, layer.ffn_norm))); assert_shape_1d(layer.ffn_norm->grad, n_embd); + layer.w1->grad = expand(gb, add_or_set(layer.w1->grad, ggml_out_prod(ctx0, t24, t26->grad))); assert_shape_2d(layer.w1->grad, n_embd, n_ff); + layer.w2->grad = expand(gb, add_or_set(layer.w2->grad, ggml_out_prod(ctx0, t28, t29->grad))); assert_shape_2d(layer.w2->grad, n_ff, n_embd); + layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); + use_buf(1); + } + clr_buf(1); + use_buf(1); + t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad))); assert_shape_2d(t01->grad, n_embd, N*n_batch); + use_buf(-1); + model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); + clr_buf(2); + clr_buf(1); + + *logits = t35; + + return t36; +} + void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) { float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); *ptr = value; @@ -2129,6 +2628,9 @@ struct train_params { int mem_model_gb; int mem_compute_gb; + int mem_compute0_gb; + int mem_compute1_gb; + int mem_compute2_gb; }; struct train_params get_default_train_params() { @@ -2172,7 +2674,10 @@ struct train_params get_default_train_params() { params.adam_decay = 1e-3; params.mem_model_gb = 2; - params.mem_compute_gb = 32; + params.mem_compute_gb = 8; + params.mem_compute0_gb = 24; + params.mem_compute1_gb = 8; + params.mem_compute2_gb = 8; return params; } @@ -2215,6 +2720,9 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); + fprintf(stderr, " --mem-compute0 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute0_gb); + fprintf(stderr, " --mem-compute1 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute1_gb); + fprintf(stderr, " --mem-compute2 N Memory to allocate for compute in gigabytes. 
(default %d)\n", params->mem_compute2_gb); fprintf(stderr, "\n"); } @@ -2408,6 +2916,24 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->mem_compute_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute0") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute0_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute1") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute1_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute2") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute2_gb = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { train_print_usage(argc, argv, &default_params); exit(0); @@ -2563,6 +3089,13 @@ int main(int argc, char ** argv) { size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb); uint8_t * compute_addr = new uint8_t[compute_size]; + size_t size_buf_0 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb); + size_t size_buf_1 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute1_gb); + size_t size_buf_2 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute2_gb); + uint8_t * compute_buf_0 = new uint8_t[size_buf_0]; + uint8_t * compute_buf_1 = new uint8_t[size_buf_1]; + uint8_t * compute_buf_2 = new uint8_t[size_buf_2]; + GGML_ASSERT(train_tokens.size() > n_tokens);; std::vector train_samples; train_samples.push_back(0); @@ -2601,22 +3134,46 @@ int main(int argc, char ** argv) { int n_past = 0; - ggml_cgraph gf = {}; - gf.n_threads = params.n_threads; + struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); + struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); - get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); + struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; + struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; - struct ggml_tensor * logits = - (n_past == 0) - ? (params.use_flash - ? forward_batch_wo_cache_flash_attn(&model, ctx0, &gf, tokens_input, n_tokens, n_batch) - : forward_batch_wo_cache(&model, ctx0, &gf, tokens_input, n_tokens, n_batch)) - : forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch); + // ggml_cgraph gf = {}; + gf->n_threads = params.n_threads; + gb->n_threads = params.n_threads; - struct ggml_tensor * e = cross_entropy_loss(ctx0, logits, target_probs); + get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); - ggml_build_forward_expand(&gf, e); - ggml_graph_compute(ctx0, &gf); + // struct ggml_tensor * logits = + // (n_past == 0) + // ? (params.use_flash + // ? 
forward_batch_wo_cache_flash_attn(&model, ctx0, &gf, tokens_input, n_tokens, n_batch) + // : forward_batch_wo_cache(&model, ctx0, &gf, tokens_input, n_tokens, n_batch)) + // : forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch); + + // struct ggml_tensor * e = cross_entropy_loss(ctx0, logits, target_probs); + struct ggml_tensor * logits; + struct ggml_tensor * e = forward_batch_wo_cache_flash_attn_train( + &model, + ctx0, + gf, + gb, + &logits, + tokens_input, + target_probs, + compute_buf_0, + compute_buf_1, + compute_buf_2, + size_buf_0, + size_buf_1, + size_buf_2, + n_tokens, + n_batch); + + // ggml_build_forward_expand(&gf, e); + ggml_graph_compute(ctx0, gf); size_t used_mem_before_opt = ggml_used_mem(ctx0); @@ -2633,7 +3190,8 @@ int main(int argc, char ** argv) { printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); // ggml_opt(ctx0, opt->params, e); - ggml_opt_resume(ctx0, opt, e); + // ggml_opt_resume(ctx0, opt, e); + ggml_opt_resume_g(ctx0, opt, e, gf, gb); size_t used_mem_after_opt = ggml_used_mem(ctx0); @@ -2641,8 +3199,8 @@ int main(int argc, char ** argv) { model.train_samples += n_batch; model.train_tokens += n_batch * n_tokens; - ggml_build_forward_expand(&gf, e); - ggml_graph_compute(ctx0, &gf); + //ggml_build_forward_expand(&gf, e); + ggml_graph_compute(ctx0, gf); float error_after_opt = ggml_get_f32_1d(e, 0); @@ -2753,7 +3311,10 @@ int main(int argc, char ** argv) { } } - free(compute_addr); + delete[] compute_addr; + delete[] compute_buf_0; + delete[] compute_buf_1; + delete[] compute_buf_2; ggml_free(model.ctx); return 0; From d9626743ac8d299f591d0a04fe04129ed58f0fb6 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 1 Jun 2023 20:59:19 +0200 Subject: [PATCH 74/86] add option to use scratch buffers in training or not make it configurable because currently training with scratch buffers implies flash attention and optimization over all parameters. 
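Both this patch and the previous one revolve around the three-buffer scratch scheme. What follows is a condensed, editorial sketch of the use_buf/clr_buf lambdas from forward_batch_wo_cache_flash_attn_train above, not part of the patch itself; it assumes ggml.h, an initialized ggml_context and three caller-provided buffers, and the trailing calls only indicate the intended rotation:

#include "ggml.h"

static void scratch_rotation_sketch(struct ggml_context * ctx,
                                    void * data[3], size_t size[3]) {
    int    last_buf    = -1;
    size_t buf_offs[3] = { 0, 0, 0 };

    // route subsequent tensor allocations into scratch buffer `buf`
    // (-1 = back to the context's own memory); remember how far the
    // previous buffer is filled so it can be resumed later
    auto use_buf = [&](int buf) {
        size_t last_offs = ggml_set_scratch(ctx, { 0, 0, nullptr });
        if (last_buf >= 0) { buf_offs[last_buf] = last_offs; }
        if (buf      >= 0) { ggml_set_scratch(ctx, { buf_offs[buf], size[buf], data[buf] }); }
        last_buf = buf;
    };

    // a buffer whose tensors are no longer needed is simply rewound,
    // so the next layer reuses the same memory
    auto clr_buf = [&](int buf) {
        if (buf >= 0) { buf_offs[buf] = 0; }
    };

    use_buf(0);  // tensors the backward pass will read again
    use_buf(1);  // per-layer temporaries
    clr_buf(1);  // reuse buffer 1 for the next layer
    use_buf(-1); // model parameters and their gradients
}
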
--- .../train-text-from-scratch.cpp | 69 ++++++++++--------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index ee17bd8e43eb0..ff6167da8666b 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2614,6 +2614,7 @@ struct train_params { bool samples_start_after_nl; bool use_adam; bool use_flash; + bool use_scratch; // only adam int warmup; @@ -2661,6 +2662,7 @@ struct train_params get_default_train_params() { params.samples_start_after_nl = false; params.use_adam = true; params.use_flash = true; + params.use_scratch = true; // only adam params.warmup = 100; @@ -2710,6 +2712,8 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); fprintf(stderr, " --no-flash Don't use flash attention.\n"); fprintf(stderr, " --use-flash Use flash attention (default)\n"); + fprintf(stderr, " --no-scratch Don't use scratch buffers\n"); + fprintf(stderr, " --use-scratch Use scratch buffers (default)\n"); fprintf(stderr, " --warmup N Number of warmup steps (default %d)\n", params->warmup); fprintf(stderr, " --cos-decay-steps N Number of cosine decay steps (default %d)\n", params->cos_decay_steps); fprintf(stderr, " --cos-decay-restart N Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); @@ -2856,6 +2860,10 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { params->use_flash = false; } else if (arg == "--use-flash") { params->use_flash = true; + } else if (arg == "--no-scratch") { + params->use_scratch = false; + } else if (arg == "--use-scratch") { + params->use_scratch = true; } else if (arg == "--warmup") { if (++i >= argc) { invalid_param = true; @@ -3146,38 +3154,36 @@ int main(int argc, char ** argv) { get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); - // struct ggml_tensor * logits = - // (n_past == 0) - // ? (params.use_flash - // ? 
forward_batch_wo_cache_flash_attn(&model, ctx0, &gf, tokens_input, n_tokens, n_batch) - // : forward_batch_wo_cache(&model, ctx0, &gf, tokens_input, n_tokens, n_batch)) - // : forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch); - - // struct ggml_tensor * e = cross_entropy_loss(ctx0, logits, target_probs); - struct ggml_tensor * logits; - struct ggml_tensor * e = forward_batch_wo_cache_flash_attn_train( - &model, - ctx0, - gf, - gb, - &logits, - tokens_input, - target_probs, - compute_buf_0, - compute_buf_1, - compute_buf_2, - size_buf_0, - size_buf_1, - size_buf_2, - n_tokens, - n_batch); - - // ggml_build_forward_expand(&gf, e); + GGML_ASSERT(n_past == 0); + + struct ggml_tensor * loss = NULL; + struct ggml_tensor * logits = NULL; + + if (params.use_scratch) { + loss = forward_batch_wo_cache_flash_attn_train( + &model, ctx0, + gf, gb, + &logits, tokens_input, target_probs, + compute_buf_0, compute_buf_1, compute_buf_2, + size_buf_0, size_buf_1, size_buf_2, + n_tokens, n_batch); + } else if (params.use_flash) { + logits = forward_batch_wo_cache_flash_attn(&model, ctx0, gf, tokens_input, n_tokens, n_batch); + loss = cross_entropy_loss(ctx0, logits, target_probs); + ggml_build_forward_expand(gf, loss); + *gb = ggml_build_backward(ctx0, gf, true); + } else { + logits = forward_batch_wo_cache(&model, ctx0, gf, tokens_input, n_tokens, n_batch); + loss = cross_entropy_loss(ctx0, logits, target_probs); + ggml_build_forward_expand(gf, loss); + *gb = ggml_build_backward(ctx0, gf, true); + } + ggml_graph_compute(ctx0, gf); size_t used_mem_before_opt = ggml_used_mem(ctx0); - float error_before_opt = ggml_get_f32_1d(e, 0); + float error_before_opt = ggml_get_f32_1d(loss, 0); opt->params.adam.sched = (opt->iter < params.warmup) ? 
(float) opt->iter / (float) params.warmup @@ -3189,9 +3195,7 @@ int main(int argc, char ** argv) { printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); - // ggml_opt(ctx0, opt->params, e); - // ggml_opt_resume(ctx0, opt, e); - ggml_opt_resume_g(ctx0, opt, e, gf, gb); + ggml_opt_resume_g(ctx0, opt, loss, gf, gb); size_t used_mem_after_opt = ggml_used_mem(ctx0); @@ -3199,10 +3203,9 @@ int main(int argc, char ** argv) { model.train_samples += n_batch; model.train_tokens += n_batch * n_tokens; - //ggml_build_forward_expand(&gf, e); ggml_graph_compute(ctx0, gf); - float error_after_opt = ggml_get_f32_1d(e, 0); + float error_after_opt = ggml_get_f32_1d(loss, 0); if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) { printf("Example %d, opt iter %d\n", ex, opt->iter); From b58d73ca8c5ea1baf42c24db58746b9e763384af Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 29 May 2023 20:57:24 +0300 Subject: [PATCH 75/86] ci : disable temporary --- .github/workflows/editorconfig.yml | 17 ----------------- .github/workflows/tidy-post.yml | 20 -------------------- .github/workflows/tidy-review.yml | 23 ----------------------- 3 files changed, 60 deletions(-) delete mode 100644 .github/workflows/editorconfig.yml delete mode 100644 .github/workflows/tidy-post.yml delete mode 100644 .github/workflows/tidy-review.yml diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml deleted file mode 100644 index b4e535acf1f64..0000000000000 --- a/.github/workflows/editorconfig.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: EditorConfig Checker - -on: - push: - branches: - - master - pull_request: - branches: - - master - -jobs: - editorconfig: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: editorconfig-checker/action-editorconfig-checker@main - - run: editorconfig-checker diff --git a/.github/workflows/tidy-post.yml b/.github/workflows/tidy-post.yml deleted file mode 100644 index a58da0cd6493d..0000000000000 --- a/.github/workflows/tidy-post.yml +++ /dev/null @@ -1,20 +0,0 @@ -name: clang-tidy review post comments - -on: - workflow_run: - workflows: ["clang-tidy-review"] - types: - - completed - -jobs: - build: - runs-on: ubuntu-latest - - steps: - - uses: ZedThree/clang-tidy-review/post@v0.13.0 - # lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup - with: - # adjust options as necessary - lgtm_comment_body: '' - annotations: false - max_comments: 25 diff --git a/.github/workflows/tidy-review.yml b/.github/workflows/tidy-review.yml deleted file mode 100644 index a4bc8d976560e..0000000000000 --- a/.github/workflows/tidy-review.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: clang-tidy-review - -on: - pull_request: - branches: - - master - -jobs: - clang-tidy-review: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - - uses: ZedThree/clang-tidy-review@v0.13.0 - id: review - with: - lgtm_comment_body: '' - build_dir: build - cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on - split_workflow: true - - - uses: ZedThree/clang-tidy-review/upload@v0.13.0 From 6b7487d104fe29e09b3666020d6ad1ae20b8b0c6 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 8 Jun 2023 02:33:57 +0200 Subject: [PATCH 76/86] store view offset and permute axes in opt[0] instead of storing it in padding use memcpy to store offset, because offset is of type size_t. when storing it as int32_t offset would have to be smaller than 2^31 which is not necessarily true. 
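A minimal standalone sketch of the round-trip described above, in plain C with hypothetical values; the two memcpy calls mirror the ones added to ggml_view_*d and ggml_compute_backward in the diff below:

#include <stdint.h>
#include <string.h>
#include <assert.h>

int main(void) {
    size_t  offset = (size_t) 3 << 31;            /* does not fit in an int32_t */
    int32_t storage[2] = { 0, 0 };                /* stands in for the 2-element I32 tensor */

    assert(sizeof(offset) <= sizeof(storage));
    memcpy(storage, &offset, sizeof(offset));     /* store, as in ggml_view_*d */

    size_t restored = 0;
    memcpy(&restored, storage, sizeof(restored)); /* load, as in ggml_compute_backward */
    assert(restored == offset);
    return 0;
}
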
--- ggml.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 64 insertions(+), 13 deletions(-) diff --git a/ggml.c b/ggml.c index 77eb43d06be6e..e64dac6329af6 100644 --- a/ggml.c +++ b/ggml.c @@ -5884,7 +5884,17 @@ struct ggml_tensor * ggml_view_1d( result->src1 = NULL; if (is_node) { - memcpy(result->padding, &offset, sizeof(offset)); + ggml_scratch_save(ctx); + + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + + GGML_ASSERT(sizeof(offset) <= ggml_nbytes(b)); + + memcpy(b->data, &offset, sizeof(offset)); + + ggml_scratch_load(ctx); + + result->opt[0] = b; } return result; @@ -5920,7 +5930,17 @@ struct ggml_tensor * ggml_view_2d( result->src1 = NULL; if (is_node) { - memcpy(result->padding, &offset, sizeof(offset)); + ggml_scratch_save(ctx); + + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + + GGML_ASSERT(sizeof(offset) <= ggml_nbytes(b)); + + memcpy(b->data, &offset, sizeof(offset)); + + ggml_scratch_load(ctx); + + result->opt[0] = b; } return result; @@ -5958,7 +5978,17 @@ struct ggml_tensor * ggml_view_3d( result->src1 = NULL; if (is_node) { - memcpy(result->padding, &offset, sizeof(offset)); + ggml_scratch_save(ctx); + + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + + GGML_ASSERT(sizeof(offset) <= ggml_nbytes(b)); + + memcpy(b->data, &offset, sizeof(offset)); + + ggml_scratch_load(ctx); + + result->opt[0] = b; } return result; @@ -5998,7 +6028,17 @@ struct ggml_tensor * ggml_view_4d( result->src1 = NULL; if (is_node) { - memcpy(result->padding, &offset, sizeof(offset)); + ggml_scratch_save(ctx); + + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + + GGML_ASSERT(sizeof(offset) <= ggml_nbytes(b)); + + memcpy(b->data, &offset, sizeof(offset)); + + ggml_scratch_load(ctx); + + result->opt[0] = b; } return result; @@ -6062,10 +6102,18 @@ struct ggml_tensor * ggml_permute( result->src1 = NULL; if (is_node) { - result->padding[0] = axis0; - result->padding[1] = axis1; - result->padding[2] = axis2; - result->padding[3] = axis3; + ggml_scratch_save(ctx); + + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); + + ((int32_t *) b->data)[0] = axis0; + ((int32_t *) b->data)[1] = axis1; + ((int32_t *) b->data)[2] = axis2; + ((int32_t *) b->data)[3] = axis3; + + ggml_scratch_load(ctx); + + result->opt[0] = b; } return result; @@ -14834,7 +14882,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { size_t offset; - memcpy(&offset, tensor->padding, sizeof(offset)); + + GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->opt[0])); + memcpy(&offset, tensor->opt[0]->data, sizeof(offset)); size_t nb1 = tensor->nb[1]; size_t nb2 = tensor->nb[2]; @@ -14861,10 +14911,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - int axis0 = tensor->padding[0] & 0x3; - int axis1 = tensor->padding[1] & 0x3; - int axis2 = tensor->padding[2] & 0x3; - int axis3 = tensor->padding[3] & 0x3; + int32_t * axes = (int32_t *) tensor->opt[0]->data; + int axis0 = axes[0] & 0x3; + int axis1 = axes[1] & 0x3; + int axis2 = axes[2] & 0x3; + int axis3 = axes[3] & 0x3; int axes_backward[4] = {0,0,0,0}; axes_backward[axis0] = 0; axes_backward[axis1] = 1; From e829421eda7adb5dbbae9f68c1d52dcf6e121cd5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 11 Jun 2023 11:49:01 +0300 Subject: [PATCH 77/86] minor : fix compile warnings + minor style changes --- 
.../train-text-from-scratch.cpp | 56 +++++++++---------- ggml.c | 18 +++--- llama.h | 6 +- 3 files changed, 39 insertions(+), 41 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index ff6167da8666b..f933c0164e54f 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -953,7 +953,7 @@ struct ggml_tensor * forward_batch_wo_cache( const int N = n_tokens; const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; + //const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; @@ -1181,7 +1181,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( const int N = n_tokens; const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; + //const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; @@ -1368,7 +1368,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( gf->work = NULL; const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; + //const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; @@ -1894,7 +1894,7 @@ void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) { void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) { for (int i1=0; i1ne[1]; ++i1) { - int num_newline = 0; + //int num_newline = 0; for (int i0=0; i0ne[0]; ++i0) { int token = get_i32_2d(tokens, i0, i1); print_token(ctx, token); @@ -1920,7 +1920,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons int n_tokens = tokens_input->ne[0]; int n_vocab = target_logits->ne[0]; - int sample = train_samples[example_id % n_train_samples]; + size_t sample = train_samples[example_id % n_train_samples]; GGML_ASSERT(sample+n_tokens-1 < n_train_data); ggml_set_f32(target_logits, -1.0f/n_vocab); @@ -1936,7 +1936,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons } } -void get_example_targets_batch(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { +void get_example_targets_batch(struct llama_context * /*lctx*/, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { GGML_ASSERT(tokens_input->n_dims == 2); GGML_ASSERT(target_logits->n_dims == 3); GGML_ASSERT(target_probs->n_dims == 3); @@ -1953,7 +1953,7 @@ void get_example_targets_batch(struct llama_context * lctx, const int * train_sa ggml_set_f32(target_probs, 0.0f); for (int k=0; k= end) { @@ -2264,7 +2264,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam } void set_logits_masked(struct ggml_tensor * logits, std::vector& mask, float value) { - GGML_ASSERT(logits->ne[0] == mask.size()); + GGML_ASSERT(logits->ne[0] == (int64_t) mask.size()); for (int i2 = 0; i2 < logits->ne[2]; ++i2) { for (int i1 = 0; i1 < logits->ne[1]; ++i1) { for (int i0 = 0; i0 < logits->ne[0]; ++i0) { @@ 
-2301,7 +2301,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { } void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { - uint32_t nd = file->read_u32(); + int32_t nd = file->read_u32(); GGML_ASSERT(nd == tensor->n_dims); uint32_t name_len = file->read_u32(); @@ -3003,7 +3003,7 @@ int main(int argc, char ** argv) { if (tokenize_file(lctx, params.fn_train_data, train_tokens) < 0) { fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, params.fn_train_data); } - printf("%s: number of training tokens: %d\n", __func__, train_tokens.size()); + printf("%s: number of training tokens: %d\n", __func__, (int) train_tokens.size()); struct my_llama_model model; model.hparams.n_vocab = llama_n_vocab(lctx); @@ -3020,7 +3020,7 @@ int main(int argc, char ** argv) { std::vector token_notavail; token_noccurs.resize(model.hparams.n_vocab, 0); token_notavail.resize(model.hparams.n_vocab, true); - for (int i=0; i token_freq; token_freq.resize(model.hparams.n_vocab, 0); int n_unique_tokens = 0; - for (int i=0; i 0) ? 1 : 0; } @@ -3104,26 +3104,26 @@ int main(int argc, char ** argv) { uint8_t * compute_buf_1 = new uint8_t[size_buf_1]; uint8_t * compute_buf_2 = new uint8_t[size_buf_2]; - GGML_ASSERT(train_tokens.size() > n_tokens);; + GGML_ASSERT(n_tokens < (int) train_tokens.size()); std::vector train_samples; train_samples.push_back(0); - for (int i=1; i= train_samples.size()) { + for (int ex = 0; ex < params.n_examples; ++ex) { + if (ex*n_batch >= (int) train_samples.size()) { shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); - for (int i=0; ine[3]; const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; + //const int64_t ne11 = src1->ne[1]; const int64_t ne12 = src1->ne[2]; const int64_t ne13 = src1->ne[3]; @@ -10587,11 +10587,10 @@ static void ggml_compute_forward_out_prod_f32( const int64_t i02 = i2; const int64_t i03 = i3; - const int64_t i10 = i1; + //const int64_t i10 = i1; const int64_t i12 = i2; const int64_t i13 = i3; - for (int64_t i01 = 0; i01 < ne01; ++i01) { const int64_t i11 = i01; @@ -13956,8 +13955,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( return; } - const float eps = 1e-9f; - + const double eps = 1e-9; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -14002,7 +14000,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( // sum = 1.0/sum; } // avoid log(0) by rescaling from [0..1] to [eps..1] - sum = (1.0f - eps) / sum; + sum = (1.0 - eps) / sum; ggml_vec_scale_f32(nc, st, sum); ggml_vec_add1_f32(nc, st, st, eps); ggml_vec_log_f32(nc, st, st); @@ -14054,8 +14052,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( const int64_t ith = params->ith; const int64_t nth = params->nth; - float * sums = (float *) params->wdata; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14090,6 +14086,8 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( #endif // step by step explanation: { + //float * sums = (float *) params->wdata; + // forward pass with annotated gradients from backward pass // (built by going in reverse operation order, adding to gradients of current operation args) // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum @@ -14162,10 +14160,10 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( float dot_st1_dst1 = 0; ggml_vec_scale_f32(nc, sm, sum); ggml_vec_cpy_f32 (nc, ds0, sm); - ggml_vec_scale_f32(nc, ds0, (1.0 - eps)); + ggml_vec_scale_f32(nc, ds0, (1.0f - 
eps)); ggml_vec_add1_f32 (nc, ds0, ds0, eps); ggml_vec_div_f32 (nc, ds0, s1, ds0); - ggml_vec_scale_f32(nc, ds0, -(1.0 - eps)*d[0]); + ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]); ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0); ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1); ggml_vec_mul_f32 (nc, ds0, ds0, sm); diff --git a/llama.h b/llama.h index 3947cf3e2aa17..4694c9c85554c 100644 --- a/llama.h +++ b/llama.h @@ -193,9 +193,9 @@ extern "C" { // Returns number of results. LLAMA_API int llama_get_vocab( const struct llama_context * ctx, - const char * * strings, - float * scores, - int capacity); + const char * * strings, + float * scores, + int capacity); // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row From 7aa10d0518764857df7174d6959873f242e09704 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 11 Jun 2023 16:50:41 +0200 Subject: [PATCH 78/86] fix bug in threaded indices calculation of ggml_compute_forward_flash_attn_back_f32 --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index b5eb9123cdc21..79f26ff2d295f 100644 --- a/ggml.c +++ b/ggml.c @@ -13545,7 +13545,7 @@ static void ggml_compute_forward_flash_attn_back_f32( for (int ir = ir0; ir < ir1; ++ir) { // q indices const int iq3 = ir/(neq2); - const int iq2 = (ir - iq3*neq2)/neq2; + const int iq2 = ir - iq3*neq2; for ( int iq1 = 0; iq1 < neq1; ++iq1) { From edf6fc252a7c6e6c02f37cc76dbc56fcd1ce656c Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 11 Jun 2023 17:07:44 +0200 Subject: [PATCH 79/86] store view offset like in master branch --- ggml.c | 88 +++++++++++++++++++++------------------------------------- 1 file changed, 32 insertions(+), 56 deletions(-) diff --git a/ggml.c b/ggml.c index 79f26ff2d295f..128c41447bcc9 100644 --- a/ggml.c +++ b/ggml.c @@ -5878,24 +5878,18 @@ struct ggml_tensor * ggml_view_1d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset); + ggml_scratch_save(ctx); + + struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + memcpy(offs->data, &offset, 2*sizeof(int32_t)); + + ggml_scratch_load(ctx); + result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; - - if (is_node) { - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - - GGML_ASSERT(sizeof(offset) <= ggml_nbytes(b)); - - memcpy(b->data, &offset, sizeof(offset)); - - ggml_scratch_load(ctx); - - result->opt[0] = b; - } + result->opt[0] = offs; return result; } @@ -5920,6 +5914,13 @@ struct ggml_tensor * ggml_view_2d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset); + ggml_scratch_save(ctx); + + struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + memcpy(offs->data, &offset, 2*sizeof(int32_t)); + + ggml_scratch_load(ctx); + result->nb[1] = nb1; result->nb[2] = result->nb[1]*ne1; result->nb[3] = result->nb[2]; @@ -5928,20 +5929,7 @@ struct ggml_tensor * ggml_view_2d( result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; - - if (is_node) { - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - - GGML_ASSERT(sizeof(offset) <= ggml_nbytes(b)); - - memcpy(b->data, &offset, sizeof(offset)); - - ggml_scratch_load(ctx); - - result->opt[0] = b; - } + result->opt[0] = offs; return result; } @@ -5968,6 +5956,13 @@ struct ggml_tensor * ggml_view_3d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); + ggml_scratch_save(ctx); + + struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + memcpy(offs->data, &offset, 2*sizeof(int32_t)); + + ggml_scratch_load(ctx); + result->nb[1] = nb1; result->nb[2] = nb2; result->nb[3] = result->nb[2]*ne2; @@ -5976,20 +5971,7 @@ struct ggml_tensor * ggml_view_3d( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; - - if (is_node) { - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - - GGML_ASSERT(sizeof(offset) <= ggml_nbytes(b)); - - memcpy(b->data, &offset, sizeof(offset)); - - ggml_scratch_load(ctx); - - result->opt[0] = b; - } + result->opt[0] = offs; return result; } @@ -6018,6 +6000,13 @@ struct ggml_tensor * ggml_view_4d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset); + ggml_scratch_save(ctx); + + struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + memcpy(offs->data, &offset, 2*sizeof(int32_t)); + + ggml_scratch_load(ctx); + result->nb[1] = nb1; result->nb[2] = nb2; result->nb[3] = nb3; @@ -6026,20 +6015,7 @@ struct ggml_tensor * ggml_view_4d( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; - - if (is_node) { - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - - GGML_ASSERT(sizeof(offset) <= ggml_nbytes(b)); - - memcpy(b->data, &offset, sizeof(offset)); - - ggml_scratch_load(ctx); - - result->opt[0] = b; - } + result->opt[0] = offs; return result; } From fdeb99784abb1f6ad399df53aa6a4fae1b977e9d Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 11 Jun 2023 19:58:36 +0200 Subject: [PATCH 80/86] bug fix in forward_batch_wo_cache_flash_attn_train --- .../train-text-from-scratch/train-text-from-scratch.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index f933c0164e54f..9bbeda125c576 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1708,7 +1708,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( use_buf(-1); // t36->grad gets set to one by optimizer, so we need to create the tensor. // initialize it with 1.0f to make sure. 
- t36->grad = ggml_new_f32(ctx0, 1.0f); + GGML_ASSERT(t36->grad != NULL); + // t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); use_buf(1); t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch); @@ -1766,7 +1767,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( use_buf(1); t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); if (grad_layer_inp) { - t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); } clr_buf(2); t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); @@ -1808,7 +1809,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( t03->grad = expand(gb, ggml_mul(ctx0, t04->grad, t02)); assert_shape_2d(t04->grad, n_embd, N*n_batch); use_buf(2); t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, t03)); assert_shape_2d(t02->grad, n_embd, N*n_batch); - back_layer_inp = t02->grad; + back_layer_inp = t02; use_buf(1); use_buf(-1); From efd7314d27ab4f7cfbc6854f657225e8d4634f1e Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 11 Jun 2023 23:10:41 +0200 Subject: [PATCH 81/86] scratch buffer bug fixes in forward_batch_wo_cache_flash_attn_train data of permute and reshape is the same as their input. if we want to preserve the output of permute/reshape, we also need to preserve their inputs. replace reshape(src0, src1) with reshape_nd calls so that we don't need src1. replace (temporary) t03 with ggml_repeat(ctx0, layer.attention_norm, t02). in the future we could also use the new broadcasting ggml_mul to avoid these repeat calls. for this we need backward pass of broadcasting ggml_mul. 
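A minimal sketch of the aliasing issue described in the message above, assuming a ggml context `ctx0` and the usual hyperparameters (`n_embd`, `n_head`, `N`, `n_batch`) are in scope; the tensor names are illustrative, not taken from the patch:

```cpp
// reshape and permute return views: the result's data pointer is the source's data
// pointer, so "preserving" a reshaped/permuted tensor only works if the memory of
// its input is preserved as well (i.e. not handed back to a reused scratch buffer).
struct ggml_tensor * t  = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N*n_batch);
struct ggml_tensor * t4 = ggml_reshape_4d(ctx0, t, n_embd/n_head, n_head, N, n_batch);
struct ggml_tensor * tp = ggml_permute   (ctx0, t4, 0, 2, 1, 3);
// t4->data == t->data and tp->data == t4->data here; overwriting t clobbers both views.
```

This is also why the patch switches from `ggml_reshape(ctx0, a, b)` to the `ggml_reshape_*d` variants: the shape-carrying second tensor `b` is no longer needed, so it does not have to be kept alive either.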
--- .../train-text-from-scratch.cpp | 115 ++++++++++-------- 1 file changed, 66 insertions(+), 49 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 9bbeda125c576..7e9607f5ecca3 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1405,21 +1405,26 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( last_buf = buf; }; - auto clr_buf = [&buf_offs] (int buf) { + bool track_max_mem = false; + size_t buf_maxs[3] = { 0, 0, 0 }; + + auto clr_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs, track_max_mem] (int buf) { if (buf < 0) return; - // size_t last_offs = 0; - // last_offs = ggml_set_scratch(ctx, { 0, 0, nullptr, }); - // if (last_buf >= 0) { - // buf_offs[last_buf] = last_offs; - // } - // buf_max_size[buf] = std::max(buf_max_size[buf], buf_offs[buf]); + if (track_max_mem) { + size_t last_offs = 0; + last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + if (last_buf >= 0) { + buf_offs[last_buf] = last_offs; + buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); + } + } buf_offs[buf] = 0; - // if (last_buf >= 0) { - // size_t offs = buf_offs[last_buf]; - // size_t size = buf_size[last_buf]; - // void * data = buf_data[last_buf]; - // ggml_set_scratch(ctx0, { offset, size, data, }); - // } + if (track_max_mem && last_buf >= 0) { + size_t offs = buf_offs[last_buf]; + size_t size = buf_size[last_buf]; + void * data = buf_data[last_buf]; + ggml_set_scratch(ctx0, { offs, size, data, }); + } }; auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { @@ -1471,6 +1476,13 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( use_buf(-1); + // need to create grads for model parameters, so that expand(..) correctly populates cgraph->leafs & cgraph->grads + // this wastes memory, because unnecessary grad for each op is automatically created: + // the automatically generated grad is unnecessary because we later manually set the grad (e.g. t35->grad = expand(gb, ...) ). + // this discards the automatically generated grad resulting in wasted memory. + // TODO: improve this, possibly by changing expand(..) to not use ggml_build_forward_expand. + // expand should correctly set cgraph->nodes. + // cgraph->leafs & cgraph->grads could be set in another pass after the last expand call. model->tok_embeddings->grad = ggml_dup_tensor(ctx0, model->tok_embeddings->grad); model->norm->grad = ggml_dup_tensor(ctx0, model->norm->grad); model->output->grad = ggml_dup_tensor(ctx0, model->output->grad); @@ -1491,11 +1503,13 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( clr_buf(1); clr_buf(2); - use_buf(0); + use_buf(-1); struct ggml_tensor * t00 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); assert_shape_1d(t00, N*n_batch); memcpy(t00->data, tokens_input->data, ggml_element_size(t00)*N*n_batch); + use_buf(0); + struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch); // need to remember these for the backward pass @@ -1536,35 +1550,35 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( struct my_llama_layer & layer = model->layers[il]; // tensors with values necessary for backward pass are in persistent buf(0) // other tensors with buf(1) are only temporary needed, and their memory reused after layer is completed. 
- use_buf(0); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t02, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(0); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t02, n_embd, N*n_batch); use_buf(1); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); // n_embd, N*n_batch - use_buf(1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); - use_buf(1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - use_buf(1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); - use_buf(1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); - use_buf(1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - use_buf(1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); - use_buf(1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); - use_buf(1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); - use_buf(0); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); // n_embd/n_head, N, n_head, n_batch - use_buf(0); struct ggml_tensor * t14 = expand(gf, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); // n_embd/n_head, N, n_head, n_batch - use_buf(0); struct ggml_tensor * t15 = expand(gf, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); // N, n_embd/n_head, n_head, n_batch - use_buf(1); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + use_buf(0); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); + use_buf(0); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); + use_buf(0); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); + use_buf(0); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + use_buf(0); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); + use_buf(0); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); + use_buf(0); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, 
n_batch); + use_buf(0); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); + use_buf(0); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); + use_buf(0); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); + use_buf(0); struct ggml_tensor * t14 = expand(gf, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); + use_buf(0); struct ggml_tensor * t15 = expand(gf, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); + use_buf(0); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); use_buf(1); struct ggml_tensor * t17 = expand(gf, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); - use_buf(1); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); - use_buf(0); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(0); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); + use_buf(0); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); use_buf(1); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); // n_embd, N*n_batch - use_buf(0); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21)); assert_shape_2d(t22, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(0); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); + use_buf(0); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21)); assert_shape_2d(t22, n_embd, N*n_batch); use_buf(1); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); // n_embd, N*n_batch - use_buf(0); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); // n_ff, N*n_batch - use_buf(0); struct ggml_tensor * t26 = expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); // n_ff, N*n_batch - use_buf(0); struct ggml_tensor * t27 = expand(gf, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); // n_ff, N*n_batch - use_buf(0); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); // n_ff, N*n_batch + use_buf(0); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); + use_buf(0); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); + use_buf(0); struct ggml_tensor * t26 = expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); + use_buf(0); struct ggml_tensor * t27 = expand(gf, 
ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); + use_buf(0); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); use_buf(1); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(0); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); t02L[il] = t02; t03L[il] = t03; t04L[il] = t04; @@ -1602,6 +1616,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t31, n_embd, N*n_batch); struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch); struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t34 = expand(gf, ggml_mul_mat (ctx0, model->output, t33)); assert_shape_2d(t34, n_vocab, N*n_batch); struct ggml_tensor * t35 = expand(gf, ggml_reshape_3d(ctx0, t34, n_vocab, N, n_batch)); assert_shape_3d(t35, n_vocab, N, n_batch); struct ggml_tensor * t36 = expand(gf, ggml_cross_entropy_loss(ctx0, t35, targets)); assert_shape_1d(t36, 1); @@ -1705,10 +1720,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( *gb = *gf; - use_buf(-1); // t36->grad gets set to one by optimizer, so we need to create the tensor. // initialize it with 1.0f to make sure. GGML_ASSERT(t36->grad != NULL); + // use_buf(-1); // t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); use_buf(1); @@ -1770,7 +1785,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); } clr_buf(2); - t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); + t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); t28->grad = expand(gb, ggml_out_prod(ctx0, layer.w2, ggml_transpose(ctx0, t29->grad))); assert_shape_2d(t28->grad, n_ff, N*n_batch); t27->grad = expand(gb, ggml_mul(ctx0, t28->grad, t25)); assert_shape_2d(t27->grad, n_ff, N*n_batch); t26->grad = expand(gb, ggml_silu_back(ctx0, t26, t27->grad)); assert_shape_2d(t26->grad, n_ff, N*n_batch); @@ -1786,7 +1801,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( use_buf(1); t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); t19->grad = expand(gb, ggml_out_prod(ctx0, layer.wo, ggml_transpose(ctx0, t20->grad))); assert_shape_2d(t19->grad, n_embd, N*n_batch); - t18->grad = expand(gb, ggml_reshape(ctx0, t19->grad, t18)); assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch); + t18->grad = expand(gb, ggml_reshape_4d(ctx0, t19->grad, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch); t17->grad = t18->grad; assert_shape_4d(t17->grad, n_embd/n_head, n_head, N, n_batch); t16->grad = expand(gb, ggml_permute(ctx0, t17->grad, 0, 2, 1, 3)); assert_shape_4d(t16->grad, n_embd/n_head, N, n_head, n_batch); struct ggml_tensor * flash_attn = expand(gb, ggml_flash_attn_back(ctx0, t13, t14, t15, t16->grad, true)); assert_shape_4d(flash_attn, n_embd/n_head, N*3, n_head, n_batch); @@ -1794,13 +1809,13 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( t14->grad = expand(gb, 
view__k(flash_attn)); assert_shape_4d(t14->grad, n_embd/n_head, N, n_head, n_batch); t13->grad = expand(gb, view__q(flash_attn)); assert_shape_4d(t13->grad, n_embd/n_head, N, n_head, n_batch); t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head); - t11->grad = expand(gb, ggml_reshape(ctx0, ggml_cont(ctx0, t12->grad), t11)); assert_shape_2d(t11->grad, N*n_batch, n_embd); + t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd); t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch); t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); - t08->grad = expand(gb, ggml_reshape(ctx0, t09->grad, t08)); assert_shape_2d(t08->grad, n_embd, N*n_batch); + t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch); t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch); t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch); - t05->grad = expand(gb, ggml_reshape(ctx0, t06->grad, t05)); assert_shape_2d(t05->grad, n_embd, N*n_batch); + t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch)); assert_shape_2d(t05->grad, n_embd, N*n_batch); t04->grad = expand(gb, ggml_add_inplace(ctx0, ggml_add_inplace(ctx0, ggml_out_prod(ctx0, layer.wv, t11->grad), @@ -1808,9 +1823,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( ggml_out_prod(ctx0, layer.wq, ggml_transpose(ctx0, t05->grad)))); assert_shape_2d(t04->grad, n_embd, N*n_batch); t03->grad = expand(gb, ggml_mul(ctx0, t04->grad, t02)); assert_shape_2d(t04->grad, n_embd, N*n_batch); use_buf(2); - t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, t03)); assert_shape_2d(t02->grad, n_embd, N*n_batch); + t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, ggml_repeat(ctx0, layer.attention_norm, t02))); assert_shape_2d(t02->grad, n_embd, N*n_batch); back_layer_inp = t02; - use_buf(1); use_buf(-1); layer.attention_norm->grad = expand(gb, add_or_set(layer.attention_norm->grad, ggml_repeat_back(ctx0, t03->grad, layer.attention_norm))); assert_shape_1d(layer.attention_norm->grad, n_embd); @@ -1822,18 +1836,21 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( layer.w1->grad = expand(gb, add_or_set(layer.w1->grad, ggml_out_prod(ctx0, t24, t26->grad))); assert_shape_2d(layer.w1->grad, n_embd, n_ff); layer.w2->grad = expand(gb, add_or_set(layer.w2->grad, ggml_out_prod(ctx0, t28, t29->grad))); assert_shape_2d(layer.w2->grad, n_ff, n_embd); layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); - use_buf(1); } clr_buf(1); use_buf(1); t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad))); assert_shape_2d(t01->grad, n_embd, N*n_batch); use_buf(-1); model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); - clr_buf(2); - clr_buf(1); *logits = t35; + if (track_max_mem) { + printf("%s: max size compute buf0: %zu\n", __func__, 
buf_maxs[0]); + printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]); + printf("%s: max size compute buf2: %zu\n", __func__, buf_maxs[2]); + } + return t36; } From 59544f0cdfe252ddef0edca0dbba53902bcbb75f Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 11 Jun 2023 23:23:06 +0200 Subject: [PATCH 82/86] remove unnecessary scratch buffer 0 buf 0 is persistent memory, so we can just disable scratch for this by using buf -1 --- .../train-text-from-scratch.cpp | 141 ++++++++---------- 1 file changed, 64 insertions(+), 77 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 7e9607f5ecca3..9244088dc84da 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1347,10 +1347,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( struct ggml_tensor * targets, void * compute_buf_0, void * compute_buf_1, - void * compute_buf_2, size_t size_buf_0, size_t size_buf_1, - size_t size_buf_2, const int n_tokens, const int n_batch) { @@ -1383,13 +1381,11 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( }; int last_buf = -1; - size_t buf_offs[3] = { 0, 0, 0 }; - size_t buf_size[3] = { size_buf_0, - size_buf_1, - size_buf_2 }; - void * buf_data[3] = { compute_buf_0, - compute_buf_1, - compute_buf_2 }; + size_t buf_offs[2] = { 0, 0 }; + size_t buf_size[2] = { size_buf_0, + size_buf_1 }; + void * buf_data[2] = { compute_buf_0, + compute_buf_1 }; auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data] (int buf) { size_t last_offs = 0; last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); @@ -1406,7 +1402,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( }; bool track_max_mem = false; - size_t buf_maxs[3] = { 0, 0, 0 }; + size_t buf_maxs[2] = { 0, 0 }; auto clr_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs, track_max_mem] (int buf) { if (buf < 0) return; @@ -1500,15 +1496,15 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( layer.w3->grad = ggml_dup_tensor(ctx0, layer.w3->grad); } + clr_buf(0); clr_buf(1); - clr_buf(2); use_buf(-1); struct ggml_tensor * t00 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); assert_shape_1d(t00, N*n_batch); memcpy(t00->data, tokens_input->data, ggml_element_size(t00)*N*n_batch); - use_buf(0); + use_buf(-1); struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch); @@ -1546,39 +1542,39 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( struct ggml_tensor * cur = t01; for (int il = 0; il < n_layer; ++il) { - clr_buf(1); + clr_buf(0); struct my_llama_layer & layer = model->layers[il]; - // tensors with values necessary for backward pass are in persistent buf(0) - // other tensors with buf(1) are only temporary needed, and their memory reused after layer is completed. 
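For readers unfamiliar with the scratch-buffer convention used in this function: `use_buf(-1)` switches allocations back to the context's own persistent memory, while `use_buf(0)`/`use_buf(1)` direct them into reusable scratch buffers. A hedged sketch of the underlying ggml mechanism, reusing the parameter names of this function:

```cpp
// ggml_set_scratch(ctx, {offs, size, data}) redirects subsequent tensor allocations
// into `data`; passing {0, 0, nullptr} disables scratch again. Its return value is
// the offset reached in the previously active scratch buffer, which is what the
// track_max_mem path uses to record high-water marks in buf_maxs[].
ggml_set_scratch(ctx0, { 0, size_buf_0, compute_buf_0 });                   // like use_buf(0)
struct ggml_tensor * tmp  = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1024);  // lives in buf 0
size_t used = ggml_set_scratch(ctx0, { 0, 0, nullptr });                    // like use_buf(-1); bytes used in buf 0
struct ggml_tensor * kept = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1024);  // persistent ctx memory
```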
- use_buf(0); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t02, n_embd, N*n_batch); - use_buf(1); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - use_buf(0); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); - use_buf(0); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - use_buf(0); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); - use_buf(0); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); - use_buf(0); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); - use_buf(0); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); - use_buf(0); struct ggml_tensor * t14 = expand(gf, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); - use_buf(0); struct ggml_tensor * t15 = expand(gf, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); - use_buf(0); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); - use_buf(1); struct ggml_tensor * t17 = expand(gf, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); - use_buf(0); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); - use_buf(0); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); - use_buf(1); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21)); assert_shape_2d(t22, n_embd, N*n_batch); - use_buf(1); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); - use_buf(0); struct ggml_tensor * t26 = expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); - use_buf(0); 
struct ggml_tensor * t27 = expand(gf, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); - use_buf(0); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); - use_buf(1); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); + // tensors with values necessary for backward pass are in persistent buf(-1) + // other tensors with buf(0) and buf(1) are only temporary needed, and their memory reused after layer is completed. + use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t02, n_embd, N*n_batch); + use_buf( 0); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); + use_buf(-1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); + use_buf(-1); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); + use_buf(-1); struct ggml_tensor * t14 = expand(gf, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); + use_buf(-1); struct ggml_tensor * t15 = expand(gf, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); + use_buf(-1); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + use_buf( 0); struct ggml_tensor * t17 = expand(gf, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); + use_buf( 0); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); 
assert_shape_2d(t21, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21)); assert_shape_2d(t22, n_embd, N*n_batch); + use_buf( 0); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); + use_buf(-1); struct ggml_tensor * t26 = expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); + use_buf(-1); struct ggml_tensor * t27 = expand(gf, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); + use_buf(-1); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); + use_buf( 0); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); t02L[il] = t02; t03L[il] = t03; t04L[il] = t04; @@ -1611,8 +1607,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( cur = t30; } - clr_buf(1); - use_buf(1); + clr_buf(0); + use_buf(0); struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t31, n_embd, N*n_batch); struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch); struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch); @@ -1720,13 +1716,13 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( *gb = *gf; - // t36->grad gets set to one by optimizer, so we need to create the tensor. - // initialize it with 1.0f to make sure. + // t36->grad gets set to one by optimizer, so we need the tensor. GGML_ASSERT(t36->grad != NULL); + // initialize it with 1.0f to make sure. 
// use_buf(-1); // t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); - use_buf(1); + use_buf(0); t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch); t34->grad = expand(gb, ggml_reshape_2d (ctx0, t35->grad, n_vocab, N*n_batch)); assert_shape_2d(t34->grad, n_vocab, N*n_batch); t33->grad = expand(gb, ggml_out_prod (ctx0, model->output, ggml_transpose(ctx0, t34->grad))); assert_shape_2d(t33->grad, n_embd, N*n_batch); @@ -1737,8 +1733,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( model->norm->grad = expand(gb, add_or_set(model->norm->grad, ggml_repeat_back(ctx0, t32->grad, model->norm))); assert_shape_1d(model->norm->grad, n_embd); model->output->grad = expand(gb, add_or_set(model->output->grad, ggml_out_prod(ctx0, t33, t34->grad))); assert_shape_2d(model->output->grad, n_embd, n_vocab); - clr_buf(2); - use_buf(2); + clr_buf(1); + use_buf(1); t31->grad = expand(gb, ggml_mul(ctx0, t33->grad, t32)); assert_shape_2d(t31->grad, n_embd, N*n_batch); struct ggml_tensor * back_layer_inp = t31; @@ -1778,13 +1774,13 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( struct ggml_tensor * t29 = t29L[il]; struct ggml_tensor * t30 = t30L[il]; - clr_buf(1); - use_buf(1); + clr_buf(0); + use_buf(0); t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); if (grad_layer_inp) { t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); } - clr_buf(2); + clr_buf(1); t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); t28->grad = expand(gb, ggml_out_prod(ctx0, layer.w2, ggml_transpose(ctx0, t29->grad))); assert_shape_2d(t28->grad, n_ff, N*n_batch); t27->grad = expand(gb, ggml_mul(ctx0, t28->grad, t25)); assert_shape_2d(t27->grad, n_ff, N*n_batch); @@ -1795,10 +1791,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( ggml_out_prod(ctx0, layer.w3, ggml_transpose(ctx0, t25->grad)))); assert_shape_2d(t24->grad, n_embd, N*n_batch); t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22)); assert_shape_2d(t23->grad, n_embd, N*n_batch); t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad))); assert_shape_2d(t22->grad, n_embd, N*n_batch); - use_buf(2); + use_buf(1); t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad))); assert_shape_2d(t21->grad, n_embd, N*n_batch); grad_layer_inp = t21; - use_buf(1); + use_buf(0); t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); t19->grad = expand(gb, ggml_out_prod(ctx0, layer.wo, ggml_transpose(ctx0, t20->grad))); assert_shape_2d(t19->grad, n_embd, N*n_batch); t18->grad = expand(gb, ggml_reshape_4d(ctx0, t19->grad, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch); @@ -1822,9 +1818,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( ggml_out_prod(ctx0, layer.wk, ggml_transpose(ctx0, t08->grad))), ggml_out_prod(ctx0, layer.wq, ggml_transpose(ctx0, t05->grad)))); assert_shape_2d(t04->grad, n_embd, N*n_batch); t03->grad = expand(gb, ggml_mul(ctx0, t04->grad, t02)); assert_shape_2d(t04->grad, n_embd, N*n_batch); - use_buf(2); + use_buf(1); t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, ggml_repeat(ctx0, layer.attention_norm, t02))); assert_shape_2d(t02->grad, n_embd, N*n_batch); back_layer_inp = t02; + // use_buf(0); use_buf(-1); 
layer.attention_norm->grad = expand(gb, add_or_set(layer.attention_norm->grad, ggml_repeat_back(ctx0, t03->grad, layer.attention_norm))); assert_shape_1d(layer.attention_norm->grad, n_embd); @@ -1836,19 +1833,21 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( layer.w1->grad = expand(gb, add_or_set(layer.w1->grad, ggml_out_prod(ctx0, t24, t26->grad))); assert_shape_2d(layer.w1->grad, n_embd, n_ff); layer.w2->grad = expand(gb, add_or_set(layer.w2->grad, ggml_out_prod(ctx0, t28, t29->grad))); assert_shape_2d(layer.w2->grad, n_ff, n_embd); layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); + // use_buf(0); } - clr_buf(1); - use_buf(1); + clr_buf(0); + use_buf(0); t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad))); assert_shape_2d(t01->grad, n_embd, N*n_batch); use_buf(-1); model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); + // clr_buf(1); + // clr_buf(0); *logits = t35; if (track_max_mem) { printf("%s: max size compute buf0: %zu\n", __func__, buf_maxs[0]); printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]); - printf("%s: max size compute buf2: %zu\n", __func__, buf_maxs[2]); } return t36; @@ -2649,7 +2648,6 @@ struct train_params { int mem_compute_gb; int mem_compute0_gb; int mem_compute1_gb; - int mem_compute2_gb; }; struct train_params get_default_train_params() { @@ -2694,10 +2692,9 @@ struct train_params get_default_train_params() { params.adam_decay = 1e-3; params.mem_model_gb = 2; - params.mem_compute_gb = 8; - params.mem_compute0_gb = 24; - params.mem_compute1_gb = 8; - params.mem_compute2_gb = 8; + params.mem_compute_gb = 24; + params.mem_compute0_gb = 8; + params.mem_compute1_gb = 2; return params; } @@ -2744,7 +2741,6 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); fprintf(stderr, " --mem-compute0 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute0_gb); fprintf(stderr, " --mem-compute1 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute1_gb); - fprintf(stderr, " --mem-compute2 N Memory to allocate for compute in gigabytes. 
(default %d)\n", params->mem_compute2_gb); fprintf(stderr, "\n"); } @@ -2954,12 +2950,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->mem_compute1_gb = std::stoi(argv[i]); - } else if (arg == "--mem-compute2") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_compute2_gb = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { train_print_usage(argc, argv, &default_params); exit(0); @@ -3117,10 +3107,8 @@ int main(int argc, char ** argv) { size_t size_buf_0 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb); size_t size_buf_1 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute1_gb); - size_t size_buf_2 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute2_gb); uint8_t * compute_buf_0 = new uint8_t[size_buf_0]; uint8_t * compute_buf_1 = new uint8_t[size_buf_1]; - uint8_t * compute_buf_2 = new uint8_t[size_buf_2]; GGML_ASSERT(n_tokens < (int) train_tokens.size()); std::vector train_samples; @@ -3182,8 +3170,8 @@ int main(int argc, char ** argv) { &model, ctx0, gf, gb, &logits, tokens_input, target_probs, - compute_buf_0, compute_buf_1, compute_buf_2, - size_buf_0, size_buf_1, size_buf_2, + compute_buf_0, compute_buf_1, + size_buf_0, size_buf_1, n_tokens, n_batch); } else if (params.use_flash) { logits = forward_batch_wo_cache_flash_attn(&model, ctx0, gf, tokens_input, n_tokens, n_batch); @@ -3335,7 +3323,6 @@ int main(int argc, char ** argv) { delete[] compute_addr; delete[] compute_buf_0; delete[] compute_buf_1; - delete[] compute_buf_2; ggml_free(model.ctx); return 0; From 7be3222b64ab7ada5ea297fa3841d7af8fdbb911 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 12 Jun 2023 00:01:18 +0200 Subject: [PATCH 83/86] avoid creating unnecessary grad tensors previously we need to create grads for model parameters, so that expand(..) correctly populates cgraph->leafs & cgraph->grads this wasted memory, because unnecessary grad for each op were automatically created: the automatically generated grad was unnecessary because we later manually set the grad (e.g. t35->grad = expand(gb, ...) ). this discarded the automatically generated grad resulting in wasted memory. improved this by changing expand(..) to not use ggml_build_forward_expand. expand set cgraph->nodes but not the leafs. cgraph->leafs & cgraph->grads are set in another pass after the last expand call. --- .../train-text-from-scratch.cpp | 124 ++++++++++++++---- 1 file changed, 96 insertions(+), 28 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 9244088dc84da..63f976f0db3dc 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1337,6 +1337,82 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( return inpL; } +// expand the graph nodes without creating leafs. 
+struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) { + // check if already visited + for (int i = 0; i < g->n_nodes; i++) { + if (g->nodes[i] == t) { + return t; + } + } + + for (int i = 0; i < g->n_leafs; i++) { + if (g->leafs[i] == t) { + return t; + } + } + + if (t->src0) { + expand(g, t->src0); + } + + if (t->src1) { + expand(g, t->src1); + } + + for (int i = 0; i < GGML_MAX_OPT; ++i) { + if (t->opt[i]) { + expand(g, t->opt[i]); + } + } + + GGML_ASSERT(g->n_nodes < GGML_MAX_NODES); + + if (strlen(t->name) == 0) { + snprintf(t->name, sizeof(t->name), "node_%d", g->n_nodes); + } + + g->nodes[g->n_nodes] = t; + g->grads[g->n_nodes] = t->grad; + g->n_nodes++; + return t; +} + +void graph_set_leafs_grads(struct ggml_cgraph * g) { + // moves leaf nodes to g->leafs. + // i.e. g->n_nodes might change. + int n_nodes = 0; + for (int i = 0; i < g->n_nodes; ++i) { + struct ggml_tensor * node = g->nodes[i]; + const bool is_leaf = node->op == GGML_OP_NONE && node->grad == NULL; + if (is_leaf) { + GGML_ASSERT(g->n_leafs < GGML_MAX_NODES); + + if (strlen(node->name) == 0) { + snprintf(node->name, sizeof(node->name), "leaf_%d", g->n_leafs); + } + + g->leafs[g->n_leafs] = node; + g->n_leafs++; + } else { + GGML_ASSERT(n_nodes < GGML_MAX_NODES); + + if (strlen(node->name) == 0) { + snprintf(node->name, sizeof(node->name), "node_%d", n_nodes); + } + + g->nodes[n_nodes] = node; + g->grads[n_nodes] = node->grad; + n_nodes++; + } + } + for (int i=n_nodes; i < g->n_nodes; ++i) { + g->nodes[n_nodes] = NULL; + g->grads[n_nodes] = NULL; + } + g->n_nodes = n_nodes; +} + struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( struct my_llama_model * model, struct ggml_context * ctx0, @@ -1375,11 +1451,6 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( const int n_ff = get_n_ff(&hparams); const int rope_mode = 0; - auto expand = [] (struct ggml_cgraph * g, struct ggml_tensor * t) -> struct ggml_tensor * { - ggml_build_forward_expand(g, t); - return t; - }; - int last_buf = -1; size_t buf_offs[2] = { 0, 0 }; size_t buf_size[2] = { size_buf_0, @@ -1423,6 +1494,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( } }; + auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { int64_t ne0 = n_embd/n_head; int64_t ne1 = N; @@ -1472,28 +1544,21 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( use_buf(-1); - // need to create grads for model parameters, so that expand(..) correctly populates cgraph->leafs & cgraph->grads - // this wastes memory, because unnecessary grad for each op is automatically created: - // the automatically generated grad is unnecessary because we later manually set the grad (e.g. t35->grad = expand(gb, ...) ). - // this discards the automatically generated grad resulting in wasted memory. - // TODO: improve this, possibly by changing expand(..) to not use ggml_build_forward_expand. - // expand should correctly set cgraph->nodes. - // cgraph->leafs & cgraph->grads could be set in another pass after the last expand call. 
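A condensed sketch of the build order that the new `expand()` / `graph_set_leafs_grads()` pair enables; the calls are the ones used later in this function, with the manual backward pass elided:

```cpp
// forward graph: expand() records nodes only, without touching cgraph->leafs/->grads
struct ggml_tensor * t36 = expand(gf, ggml_cross_entropy_loss(ctx0, t35, targets));
*gb = *gf;                                          // backward graph starts as a copy
t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f));   // seed d(loss)/d(loss) = 1
// ... manually assign t35->grad, t34->grad, ... and the parameter grads ...
graph_set_leafs_grads(gf);   // final pass: move op-less, grad-less tensors to ->leafs
graph_set_leafs_grads(gb);   // and compact ->nodes / ->grads accordingly
```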
- model->tok_embeddings->grad = ggml_dup_tensor(ctx0, model->tok_embeddings->grad); - model->norm->grad = ggml_dup_tensor(ctx0, model->norm->grad); - model->output->grad = ggml_dup_tensor(ctx0, model->output->grad); + model->tok_embeddings->grad = NULL; + model->norm->grad = NULL; + model->output->grad = NULL; for (int il = 0; il < n_layer; ++il) { struct my_llama_layer & layer = model->layers[il]; - layer.attention_norm->grad = ggml_dup_tensor(ctx0, layer.attention_norm->grad); - layer.wq->grad = ggml_dup_tensor(ctx0, layer.wq->grad); - layer.wk->grad = ggml_dup_tensor(ctx0, layer.wk->grad); - layer.wv->grad = ggml_dup_tensor(ctx0, layer.wv->grad); - layer.wo->grad = ggml_dup_tensor(ctx0, layer.wo->grad); - layer.ffn_norm->grad = ggml_dup_tensor(ctx0, layer.ffn_norm->grad); - layer.w1->grad = ggml_dup_tensor(ctx0, layer.w1->grad); - layer.w2->grad = ggml_dup_tensor(ctx0, layer.w2->grad); - layer.w3->grad = ggml_dup_tensor(ctx0, layer.w3->grad); + layer.attention_norm->grad = NULL; + layer.wq->grad = NULL; + layer.wk->grad = NULL; + layer.wv->grad = NULL; + layer.wo->grad = NULL; + layer.ffn_norm->grad = NULL; + layer.w1->grad = NULL; + layer.w2->grad = NULL; + layer.w3->grad = NULL; } clr_buf(0); @@ -1717,10 +1782,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( *gb = *gf; // t36->grad gets set to one by optimizer, so we need the tensor. - GGML_ASSERT(t36->grad != NULL); // initialize it with 1.0f to make sure. - // use_buf(-1); - // t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); + use_buf(-1); + t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); use_buf(0); t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch); @@ -1839,7 +1903,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( use_buf(0); t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad))); assert_shape_2d(t01->grad, n_embd, N*n_batch); use_buf(-1); - model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); + model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); // clr_buf(1); // clr_buf(0); @@ -1850,6 +1914,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]); } + // now that all grads are created, set the graph leafs and grads + graph_set_leafs_grads(gf); + graph_set_leafs_grads(gb); + return t36; } From 32dc22728471a645f0ae8020441e17025e3f33b0 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 12 Jun 2023 20:42:44 +0200 Subject: [PATCH 84/86] print used training seed --- .../train-text-from-scratch/train-text-from-scratch.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 63f976f0db3dc..d4d293e232c22 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3043,12 +3043,11 @@ int main(int argc, char ** argv) { return 1; } - if (params.seed < 0) { - srand(time(NULL)); - } else { - srand(params.seed); + params.seed = time(NULL); } + printf("%s: seed: %d\n", __func__, params.seed); + srand(params.seed); struct 
llama_context_params llama_params = llama_context_default_params(); llama_params.vocab_only = true; From cb469f7efb911eabe3e53ece965e1be58ad9ea51 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 12 Jun 2023 20:43:48 +0200 Subject: [PATCH 85/86] zero initialize gfbuf and gbbuf --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index d4d293e232c22..51271b497ffe5 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3218,6 +3218,9 @@ int main(int argc, char ** argv) { struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); + memset(gfbuf->data, 0, ggml_nbytes(gfbuf)); + memset(gbbuf->data, 0, ggml_nbytes(gbbuf)); + struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; From d4b6438708148c36f605a482192887fac5242244 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Jun 2023 21:38:00 +0300 Subject: [PATCH 86/86] ci : re-enable workflows + add README for training --- .github/workflows/editorconfig.yml | 17 ++++++++++++++++ .github/workflows/tidy-review.yml | 23 ++++++++++++++++++++++ examples/train-text-from-scratch/README.md | 22 +++++++++++++++++++++ llama.cpp | 2 ++ 4 files changed, 64 insertions(+) create mode 100644 .github/workflows/editorconfig.yml create mode 100644 .github/workflows/tidy-review.yml create mode 100644 examples/train-text-from-scratch/README.md diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml new file mode 100644 index 0000000000000..b4e535acf1f64 --- /dev/null +++ b/.github/workflows/editorconfig.yml @@ -0,0 +1,17 @@ +name: EditorConfig Checker + +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + editorconfig: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: editorconfig-checker/action-editorconfig-checker@main + - run: editorconfig-checker diff --git a/.github/workflows/tidy-review.yml b/.github/workflows/tidy-review.yml new file mode 100644 index 0000000000000..a4bc8d976560e --- /dev/null +++ b/.github/workflows/tidy-review.yml @@ -0,0 +1,23 @@ +name: clang-tidy-review + +on: + pull_request: + branches: + - master + +jobs: + clang-tidy-review: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - uses: ZedThree/clang-tidy-review@v0.13.0 + id: review + with: + lgtm_comment_body: '' + build_dir: build + cmake_command: cmake . 
-B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on + split_workflow: true + + - uses: ZedThree/clang-tidy-review/upload@v0.13.0 diff --git a/examples/train-text-from-scratch/README.md b/examples/train-text-from-scratch/README.md new file mode 100644 index 0000000000000..5344d1f522a57 --- /dev/null +++ b/examples/train-text-from-scratch/README.md @@ -0,0 +1,22 @@ +# train-text-from-scratch + +Basic usage instructions: + +```bash +# get training data +wget https://github.com/brunoklein99/deep-learning-notes/blob/master/shakespeare.txt + +# train +./bin/train-text-from-scratch \ + --vocab-model ../models/ggml-vocab.bin \ + --ctx 64 --embd 256 --head 8 --layer 16 \ + --checkpoint-in chk-shakespeare-256x16.bin \ + --checkpoint-out chk-shakespeare-256x16.bin \ + --model-out ggml-shakespeare-256x16-f32.bin \ + --train-data "shakespeare.txt" \ + -t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \ + --print-details-interval 0 --predict 16 --use-flash + +# predict +./bin/main -m ggml-shakespeare-256x16-f32.bin +``` diff --git a/llama.cpp b/llama.cpp index dd9725ea3c27c..0dc45bd6dfe3f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1206,6 +1206,7 @@ static void llama_model_load_internal( mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); (void) vram_scratch; + (void) n_batch; #ifdef GGML_USE_CUBLAS vram_scratch = n_batch * MB; ggml_cuda_set_scratch_size(vram_scratch); @@ -1233,6 +1234,7 @@ static void llama_model_load_internal( model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); } + (void) tensor_split; #if defined(GGML_USE_CUBLAS) { ggml_cuda_set_tensor_split(tensor_split);
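A brief note on the two `(void)` casts added to llama.cpp in the last patch: they silence unused-variable/-parameter warnings in builds where the CUDA code path is compiled out. A minimal illustration of the pattern (hypothetical function, not part of the patch):

```cpp
static void set_gpu_options(int n_batch, const float * tensor_split) {
    (void) n_batch;      // only referenced when GGML_USE_CUBLAS is defined
    (void) tensor_split; // likewise; the casts keep -Wunused warnings quiet otherwise
#ifdef GGML_USE_CUBLAS
    ggml_cuda_set_scratch_size((size_t) n_batch * 1024 * 1024);
    ggml_cuda_set_tensor_split(tensor_split);
#endif
}
```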