From bc9e84daca501568a8ca0d0618d532caec598b6e Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 21 Apr 2023 16:31:12 +0200 Subject: [PATCH 01/86] add python wrapper https://gist.github.com/abetlen/2b90e5f153f6efd00931d098de5c73ce --- py/llama_cpp/__init__.py | 0 py/llama_cpp/llama.py | 173 +++++++++++++++++++++++++++++++++++++++ setup.py | 15 ++++ 3 files changed, 188 insertions(+) create mode 100644 py/llama_cpp/__init__.py create mode 100644 py/llama_cpp/llama.py create mode 100644 setup.py diff --git a/py/llama_cpp/__init__.py b/py/llama_cpp/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/py/llama_cpp/llama.py b/py/llama_cpp/llama.py new file mode 100644 index 0000000000000..39da58f17c83e --- /dev/null +++ b/py/llama_cpp/llama.py @@ -0,0 +1,173 @@ +import os +import sys +import glob +import ctypes + +from ctypes import c_int, c_float, c_double, c_char_p, c_void_p, c_bool, POINTER, Structure + + +# Load the library +if sys.platform == 'win32': + lib = ctypes.cdll.LoadLibrary(next(iter(glob.glob(os.path.join(os.path.dirname(__file__), '..', '..', '**', 'llama.dll'), recursive=True)))) +else: + lib = ctypes.cdll.LoadLibrary(next(iter(glob.glob(os.path.join(os.path.dirname(__file__), '..', '..', '**', 'libllama.so'), recursive=True)))) + + +# C types +llama_token = c_int +llama_token_p = POINTER(llama_token) + +class llama_token_data(Structure): + _fields_ = [ + ('id', llama_token), # token id + ('p', c_float), # probability of the token + ('plog', c_float), # log probability of the token + ] + +llama_token_data_p = POINTER(llama_token_data) +llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) +class llama_context_params(Structure): + _fields_ = [ + ('n_ctx', c_int), # text context + ('n_parts', c_int), # -1 for default + ('seed', c_int), # RNG seed, 0 for random + ('f16_kv', c_bool), # use fp16 for KV cache + ('logits_all', c_bool), # the llama_eval() call computes all logits, not just the last one + ('vocab_only', c_bool), # only load the vocabulary, no weights + ('use_mmap', c_bool), # use mmap if possible + ('use_mlock', c_bool), # force system to keep model in RAM + ('embedding', c_bool), # embedding mode only + ('progress_callback', llama_progress_callback), # called with a progress value between 0 and 1, pass NULL to disable + ('progress_callback_user_data', c_void_p), # context pointer passed to the progress callback + ] + +llama_context_params_p = POINTER(llama_context_params) + +llama_context_p = c_void_p + +# C functions +lib.llama_context_default_params.argtypes = [] +lib.llama_context_default_params.restype = llama_context_params + +lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params] +lib.llama_init_from_file.restype = llama_context_p + +lib.llama_free.argtypes = [llama_context_p] +lib.llama_free.restype = None + +lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] +lib.llama_model_quantize.restype = c_int + +lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int] +lib.llama_eval.restype = c_int + +lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool] +lib.llama_tokenize.restype = c_int + +lib.llama_n_vocab.argtypes = [llama_context_p] +lib.llama_n_vocab.restype = c_int + +lib.llama_n_ctx.argtypes = [llama_context_p] +lib.llama_n_ctx.restype = c_int + +lib.llama_get_logits.argtypes = [llama_context_p] +lib.llama_get_logits.restype = POINTER(c_float) + +lib.llama_get_embeddings.argtypes = [llama_context_p] +lib.llama_get_embeddings.restype 
= POINTER(c_float) + +lib.llama_token_to_str.argtypes = [llama_context_p, llama_token] +lib.llama_token_to_str.restype = c_char_p + +lib.llama_token_bos.argtypes = [] +lib.llama_token_bos.restype = llama_token + +lib.llama_token_eos.argtypes = [] +lib.llama_token_eos.restype = llama_token + +lib.llama_sample_top_p_top_k.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_float, c_float, c_float] +lib.llama_sample_top_p_top_k.restype = llama_token + +lib.llama_print_timings.argtypes = [llama_context_p] +lib.llama_print_timings.restype = None + +lib.llama_reset_timings.argtypes = [llama_context_p] +lib.llama_reset_timings.restype = None + +lib.llama_print_system_info.argtypes = [] +lib.llama_print_system_info.restype = c_char_p + +# Python functions +def llama_context_default_params() -> llama_context_params: + params = lib.llama_context_default_params() + return params + +def llama_init_from_file(path_model: str, params: llama_context_params) -> llama_context_p: + """Various functions for loading a ggml llama model. + Allocate (almost) all memory needed for the model. + Return NULL on failure """ + return lib.llama_init_from_file(path_model.encode('utf-8'), params) + +def llama_free(ctx: llama_context_p): + """Free all allocated memory""" + lib.llama_free(ctx) + +def llama_model_quantize(fname_inp: str, fname_out: str, itype: c_int, qk: c_int) -> c_int: + """Returns 0 on success""" + return lib.llama_model_quantize(fname_inp.encode('utf-8'), fname_out.encode('utf-8'), itype, qk) + +def llama_eval(ctx: llama_context_p, tokens: llama_token_p, n_tokens: c_int, n_past: c_int, n_threads: c_int) -> c_int: + """Run the llama inference to obtain the logits and probabilities for the next token. + tokens + n_tokens is the provided batch of new tokens to process + n_past is the number of tokens to use from previous eval calls + Returns 0 on success""" + return lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads) + +def llama_tokenize(ctx: llama_context_p, text: str, tokens: llama_token_p, n_max_tokens: c_int, add_bos: c_bool) -> c_int: + """Convert the provided text into tokens. + The tokens pointer must be large enough to hold the resulting tokens. + Returns the number of tokens on success, no more than n_max_tokens + Returns a negative number on failure - the number of tokens that would have been returned""" + return lib.llama_tokenize(ctx, text.encode('utf-8'), tokens, n_max_tokens, add_bos) + +def llama_n_vocab(ctx: llama_context_p) -> c_int: + return lib.llama_n_vocab(ctx) + +def llama_n_ctx(ctx: llama_context_p) -> c_int: + return lib.llama_n_ctx(ctx) + +def llama_get_logits(ctx: llama_context_p): + """Token logits obtained from the last call to llama_eval() + The logits for the last token are stored in the last row + Can be mutated in order to change the probabilities of the next token + Rows: n_tokens + Cols: n_vocab""" + return lib.llama_get_logits(ctx) + +def llama_get_embeddings(ctx: llama_context_p): + """Get the embeddings for the input + shape: [n_embd] (1-dimensional)""" + return lib.llama_get_embeddings(ctx) + +def llama_token_to_str(ctx: llama_context_p, token: int) -> str: + """Token Id -> String. 
Uses the vocabulary in the provided context""" + return lib.llama_token_to_str(ctx, token).decode('utf-8') + +def llama_token_bos() -> llama_token: + return lib.llama_token_bos() + +def llama_token_eos() -> llama_token: + return lib.llama_token_eos() + +def llama_sample_top_p_top_k(ctx: llama_context_p, last_n_tokens_data: llama_token_p, last_n_tokens_size: c_int, top_k: c_int, top_p: c_float, temp: c_float, repeat_penalty: c_float) -> llama_token: + return lib.llama_sample_top_p_top_k(ctx, last_n_tokens_data, last_n_tokens_size, top_k, top_p, temp, repeat_penalty) + +def llama_print_timings(ctx: llama_context_p): + lib.llama_print_timings(ctx) + +def llama_reset_timings(ctx: llama_context_p): + lib.llama_reset_timings(ctx) + +def llama_print_system_info() -> str: + """Print system informaiton""" + return lib.llama_print_system_info().decode('utf-8') diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000..cc3a23f0739fd --- /dev/null +++ b/setup.py @@ -0,0 +1,15 @@ + +from setuptools import setup, find_packages +import glob, os + +setup( + name='llama_cpp', + version='0.0.1', + author='Anonymous', + author_email='', + license='All rights reserved', + packages=find_packages(where='py'), + package_dir={'': 'py'}, + install_requires=[], + entry_points={'console_scripts': []}, +) From 5f6b7150714e4db2d0804bab4f89d704ba656924 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 14 Apr 2023 14:40:06 +0200 Subject: [PATCH 02/86] fix decoding error. adds errors=ignore parameter --- py/llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py/llama_cpp/llama.py b/py/llama_cpp/llama.py index 39da58f17c83e..a09c8425e1d81 100644 --- a/py/llama_cpp/llama.py +++ b/py/llama_cpp/llama.py @@ -151,7 +151,7 @@ def llama_get_embeddings(ctx: llama_context_p): def llama_token_to_str(ctx: llama_context_p, token: int) -> str: """Token Id -> String. 
Uses the vocabulary in the provided context""" - return lib.llama_token_to_str(ctx, token).decode('utf-8') + return lib.llama_token_to_str(ctx, token).decode('utf-8', errors='ignore') def llama_token_bos() -> llama_token: return lib.llama_token_bos() From ed6b64fb98b16cfbba761d76eb36ce17350ac172 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 14 Apr 2023 03:16:50 +0200 Subject: [PATCH 03/86] add python bindings for functions to get and set the whole llama state (rng, logits, embedding and kv_cache) --- py/llama_cpp/llama.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/py/llama_cpp/llama.py b/py/llama_cpp/llama.py index a09c8425e1d81..3bd6b8efea85d 100644 --- a/py/llama_cpp/llama.py +++ b/py/llama_cpp/llama.py @@ -97,6 +97,15 @@ class llama_context_params(Structure): lib.llama_print_system_info.argtypes = [] lib.llama_print_system_info.restype = c_char_p +lib.llama_get_state_size.argtypes = [llama_context_p] +lib.llama_get_state_size.restype = c_size_t + +lib.llama_copy_state_data.argtypes = [llama_context_p, c_ubyte_p] +lib.llama_copy_state_data.restype = c_size_t + +lib.llama_set_state_data.argtypes = [llama_context_p, c_ubyte_p] +lib.llama_set_state_data.restype = c_size_t + # Python functions def llama_context_default_params() -> llama_context_params: params = lib.llama_context_default_params() @@ -171,3 +180,12 @@ def llama_reset_timings(ctx: llama_context_p): def llama_print_system_info() -> str: """Print system informaiton""" return lib.llama_print_system_info().decode('utf-8') + +def llama_get_state_size(ctx: llama_context_p) -> c_size_t: + return lib.llama_get_state_size(ctx) + +def llama_copy_state_data(ctx: llama_context_p, dst: c_ubyte_p) -> c_size_t: + return lib.llama_copy_state_data(ctx, dst) + +def llama_set_state_data(ctx: llama_context_p, src: c_ubyte_p) -> c_size_t: + return lib.llama_set_state_data(ctx, src) From 6e88dc93bdc002c7870fe0547fc9a9960ac9d1ad Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 13 May 2023 19:05:24 +0200 Subject: [PATCH 04/86] update python bindings --- py/llama_cpp/llama.py | 220 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 178 insertions(+), 42 deletions(-) diff --git a/py/llama_cpp/llama.py b/py/llama_cpp/llama.py index 3bd6b8efea85d..bc0fa8b72855c 100644 --- a/py/llama_cpp/llama.py +++ b/py/llama_cpp/llama.py @@ -3,7 +3,7 @@ import glob import ctypes -from ctypes import c_int, c_float, c_double, c_char_p, c_void_p, c_bool, POINTER, Structure +from ctypes import c_int, c_float, c_double, c_char_p, c_void_p, c_bool, c_size_t, c_ubyte, POINTER, Structure # Load the library @@ -19,36 +19,58 @@ class llama_token_data(Structure): _fields_ = [ - ('id', llama_token), # token id - ('p', c_float), # probability of the token + ('id', llama_token), # token id + ('p', c_float), # probability of the token ('plog', c_float), # log probability of the token ] llama_token_data_p = POINTER(llama_token_data) + +class llama_token_data_array(Structure): + _fields_ = [ + ('data', llama_token_data_p), + ('size', c_size_t), + ('sorted', c_bool), + ] + +llama_token_data_array_p = POINTER(llama_token_data_array) + llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) class llama_context_params(Structure): _fields_ = [ - ('n_ctx', c_int), # text context - ('n_parts', c_int), # -1 for default - ('seed', c_int), # RNG seed, 0 for random - ('f16_kv', c_bool), # use fp16 for KV cache - ('logits_all', c_bool), # the llama_eval() call computes all logits, not just the last one - ('vocab_only', c_bool), # only load the 
vocabulary, no weights - ('use_mmap', c_bool), # use mmap if possible - ('use_mlock', c_bool), # force system to keep model in RAM - ('embedding', c_bool), # embedding mode only - ('progress_callback', llama_progress_callback), # called with a progress value between 0 and 1, pass NULL to disable - ('progress_callback_user_data', c_void_p), # context pointer passed to the progress callback + ('n_ctx', c_int), # text context + ('n_parts', c_int), # -1 for default + ('n_gpu_layers', c_int), # number of layers to store in VRAM + ('seed', c_int), # RNG seed, 0 for random + ('f16_kv', c_bool), # use fp16 for KV cache + ('logits_all', c_bool), # the llama_eval() call computes all logits, not just the last one + ('vocab_only', c_bool), # only load the vocabulary, no weights + ('use_mmap', c_bool), # use mmap if possible + ('use_mlock', c_bool), # force system to keep model in RAM + ('embedding', c_bool), # embedding mode only + ('progress_callback', llama_progress_callback), # called with a progress value between 0 and 1, pass NULL to disable + ('progress_callback_user_data', c_void_p), # context pointer passed to the progress callback ] + llama_context_params_p = POINTER(llama_context_params) llama_context_p = c_void_p +c_size_p = POINTER(c_size_t) +c_ubyte_p = POINTER(c_ubyte) +c_float_p = POINTER(c_float) + # C functions lib.llama_context_default_params.argtypes = [] lib.llama_context_default_params.restype = llama_context_params +lib.llama_mmap_supported.argtypes = [] +lib.llama_mmap_supported.restype = c_bool + +lib.llama_mlock_supported.argtypes = [] +lib.llama_mlock_supported.restype = c_bool + lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params] lib.llama_init_from_file.restype = llama_context_p @@ -58,6 +80,30 @@ class llama_context_params(Structure): lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] lib.llama_model_quantize.restype = c_int +lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, c_int] +lib.llama_apply_lora_from_file.restype = c_int + +lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] +lib.llama_get_kv_cache_token_count.restype = c_int + +lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int] +lib.llama_set_rng_seed.restype = None + +lib.llama_get_state_size.argtypes = [llama_context_p] +lib.llama_get_state_size.restype = c_size_t + +lib.llama_copy_state_data.argtypes = [llama_context_p, c_ubyte_p] +lib.llama_copy_state_data.restype = c_size_t + +lib.llama_set_state_data.argtypes = [llama_context_p, c_ubyte_p] +lib.llama_set_state_data.restype = c_size_t + +lib.llama_load_session_file.argtypes = [llama_context_p, c_char_p, llama_token_p, c_size_t, c_size_p] +lib.llama_load_session_file.restype = c_bool + +lib.llama_save_session_file.argtypes = [llama_context_p, c_char_p, llama_token_p, c_size_t] +lib.llama_save_session_file.restype = c_bool + lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int] lib.llama_eval.restype = c_int @@ -70,11 +116,14 @@ class llama_context_params(Structure): lib.llama_n_ctx.argtypes = [llama_context_p] lib.llama_n_ctx.restype = c_int +lib.llama_n_embd.argtypes = [llama_context_p] +lib.llama_n_embd.restype = c_int + lib.llama_get_logits.argtypes = [llama_context_p] -lib.llama_get_logits.restype = POINTER(c_float) +lib.llama_get_logits.restype = c_float_p lib.llama_get_embeddings.argtypes = [llama_context_p] -lib.llama_get_embeddings.restype = POINTER(c_float) +lib.llama_get_embeddings.restype = c_float_p lib.llama_token_to_str.argtypes = 
[llama_context_p, llama_token] lib.llama_token_to_str.restype = c_char_p @@ -85,8 +134,44 @@ class llama_context_params(Structure): lib.llama_token_eos.argtypes = [] lib.llama_token_eos.restype = llama_token -lib.llama_sample_top_p_top_k.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_float, c_float, c_float] -lib.llama_sample_top_p_top_k.restype = llama_token +lib.llama_token_nl.argtypes = [] +lib.llama_token_nl.restype = llama_token + +lib.llama_sample_repetition_penalty.argtypes = [llama_context_p, llama_token_data_array_p, llama_token_p, c_size_t, c_float] +lib.llama_sample_repetition_penalty.restype = None + +lib.llama_sample_frequency_and_presence_penalties.argtypes = [llama_context_p, llama_token_data_array_p, llama_token_p, c_size_t, c_float, c_float] +lib.llama_sample_frequency_and_presence_penalties.restype = None + +lib.llama_sample_softmax.argtypes = [llama_context_p, llama_token_data_array_p] +lib.llama_sample_softmax.restype = None + +lib.llama_sample_top_k.argtypes = [llama_context_p, llama_token_data_array_p, c_int, c_size_t] +lib.llama_sample_top_k.restype = None + +lib.llama_sample_top_p.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_size_t] +lib.llama_sample_top_p.restype = None + +lib.llama_sample_tail_free.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_size_t] +lib.llama_sample_tail_free.restype = None + +lib.llama_sample_typical.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_size_t] +lib.llama_sample_typical.restype = None + +lib.llama_sample_temperature.argtypes = [llama_context_p, llama_token_data_array_p, c_float] +lib.llama_sample_temperature.restype = None + +lib.llama_sample_token_mirostat.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_float, c_int, c_float_p] +lib.llama_sample_token_mirostat.restype = llama_token + +lib.llama_sample_token_mirostat_v2.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_float, c_float_p] +lib.llama_sample_token_mirostat_v2.restype = llama_token + +lib.llama_sample_token_greedy.argtypes = [llama_context_p, llama_token_data_array_p] +lib.llama_sample_token_greedy.restype = llama_token + +lib.llama_sample_token.argtypes = [llama_context_p, llama_token_data_array_p] +lib.llama_sample_token.restype = llama_token lib.llama_print_timings.argtypes = [llama_context_p] lib.llama_print_timings.restype = None @@ -97,20 +182,18 @@ class llama_context_params(Structure): lib.llama_print_system_info.argtypes = [] lib.llama_print_system_info.restype = c_char_p -lib.llama_get_state_size.argtypes = [llama_context_p] -lib.llama_get_state_size.restype = c_size_t - -lib.llama_copy_state_data.argtypes = [llama_context_p, c_ubyte_p] -lib.llama_copy_state_data.restype = c_size_t - -lib.llama_set_state_data.argtypes = [llama_context_p, c_ubyte_p] -lib.llama_set_state_data.restype = c_size_t # Python functions def llama_context_default_params() -> llama_context_params: params = lib.llama_context_default_params() return params +def llama_mmap_supported() -> bool: + return lib.llama_mmap_supported() + +def llama_mlock_supported() -> bool: + return lib.llama_mlock_supported() + def llama_init_from_file(path_model: str, params: llama_context_params) -> llama_context_p: """Various functions for loading a ggml llama model. Allocate (almost) all memory needed for the model. 
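For orientation, a minimal sketch of how the bindings declared above might be driven end to end (the model path and thread count below are illustrative placeholders, not part of this patch):

from llama_cpp.llama import (
    llama_context_default_params, llama_init_from_file, llama_tokenize,
    llama_eval, llama_get_logits, llama_n_vocab, llama_token,
    llama_token_to_str, llama_free)

# load a model (placeholder path) with default context parameters
params = llama_context_default_params()
ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params)

# tokenize a prompt into a fixed-size ctypes buffer of llama_token
buf = (llama_token * 64)()
n = llama_tokenize(ctx, "Hello", buf, 64, True)

# evaluate the prompt and greedily pick the most likely next token
llama_eval(ctx, buf, n, 0, 4)          # 4 = n_threads, placeholder
logits = llama_get_logits(ctx)         # n_vocab logits for the last token
best = max(range(llama_n_vocab(ctx)), key=lambda i: logits[i])
print(llama_token_to_str(ctx, best))

llama_free(ctx)
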
@@ -125,6 +208,30 @@ def llama_model_quantize(fname_inp: str, fname_out: str, itype: c_int, qk: c_int """Returns 0 on success""" return lib.llama_model_quantize(fname_inp.encode('utf-8'), fname_out.encode('utf-8'), itype, qk) +def llama_apply_lora_from_file(ctx: llama_context_p, path_lora: str, path_base_model: str, n_threads: c_int) -> c_int: + return lib.llama_apply_lora_from_file(ctx, path_lora.encode('utf-8'), path_base_model.encode('utf-8'), n_threads) + +def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: + return lib.llama_get_kv_cache_token_count(ctx) + +def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): + return lib.llama_set_rng_seed(ctx, seed) + +def llama_get_state_size(ctx: llama_context_p) -> c_size_t: + return lib.llama_get_state_size(ctx) + +def llama_copy_state_data(ctx: llama_context_p, dst: c_ubyte_p) -> c_size_t: + return lib.llama_copy_state_data(ctx, dst) + +def llama_set_state_data(ctx: llama_context_p, src: c_ubyte_p) -> c_size_t: + return lib.llama_set_state_data(ctx, src) + +def llama_load_session_file(ctx: llama_context_p, path_session: str, tokens_out: llama_token_p, n_token_capacity: c_size_t, n_token_count_out: c_size_p) -> c_bool: + return lib.llama_load_session_file(ctx, path_session.encode('utf-8'), tokens_out, n_token_capacity, n_token_count_out) + +def llama_save_session_file(ctx: llama_context_p, path_session: str, tokens: llama_token_p, n_token_count: c_size_t) -> c_bool: + return lib.llama_save_session_file(ctx, path_session.encode('utf-8'), tokens, n_token_count) + def llama_eval(ctx: llama_context_p, tokens: llama_token_p, n_tokens: c_int, n_past: c_int, n_threads: c_int) -> c_int: """Run the llama inference to obtain the logits and probabilities for the next token. tokens + n_tokens is the provided batch of new tokens to process @@ -145,7 +252,10 @@ def llama_n_vocab(ctx: llama_context_p) -> c_int: def llama_n_ctx(ctx: llama_context_p) -> c_int: return lib.llama_n_ctx(ctx) -def llama_get_logits(ctx: llama_context_p): +def llama_n_embd(ctx: llama_context_p) -> c_int: + return lib.llama_n_embd(ctx) + +def llama_get_logits(ctx: llama_context_p) -> c_float_p: """Token logits obtained from the last call to llama_eval() The logits for the last token are stored in the last row Can be mutated in order to change the probabilities of the next token @@ -153,7 +263,7 @@ def llama_get_logits(ctx: llama_context_p): Cols: n_vocab""" return lib.llama_get_logits(ctx) -def llama_get_embeddings(ctx: llama_context_p): +def llama_get_embeddings(ctx: llama_context_p) -> c_float_p: """Get the embeddings for the input shape: [n_embd] (1-dimensional)""" return lib.llama_get_embeddings(ctx) @@ -168,8 +278,44 @@ def llama_token_bos() -> llama_token: def llama_token_eos() -> llama_token: return lib.llama_token_eos() -def llama_sample_top_p_top_k(ctx: llama_context_p, last_n_tokens_data: llama_token_p, last_n_tokens_size: c_int, top_k: c_int, top_p: c_float, temp: c_float, repeat_penalty: c_float) -> llama_token: - return lib.llama_sample_top_p_top_k(ctx, last_n_tokens_data, last_n_tokens_size, top_k, top_p, temp, repeat_penalty) +def llama_token_nl() -> llama_token: + return lib.llama_token_nl() + +def llama_sample_repetition_penalty(ctx: llama_context_p, candidates: llama_token_data_array_p, last_tokens: llama_token_p, last_tokens_size: c_size_t, penalty: float): + lib.llama_sample_repetition_penalty(ctx, candidates, last_tokens, last_tokens_size, penalty) + +def llama_sample_frequency_and_presence_penalties(ctx: llama_context_p, candidates: 
llama_token_data_array_p, last_tokens: llama_token_p, last_tokens_size: c_size_t, alpha_frequency: float, alpha_presence: float): + lib.llama_sample_frequency_and_presence_penalties(ctx, candidates, last_tokens, last_tokens_size, alpha_frequency, alpha_presence) + +def llama_sample_softmax(ctx: llama_context_p, candidates: llama_token_data_array_p): + lib.llama_sample_softmax(ctx, candidates) + +def llama_sample_top_k(ctx: llama_context_p, candidates: llama_token_data_array_p, k: c_int, min_keep: c_size_t): + lib.llama_sample_top_k(ctx, candidates, k, min_keep) + +def llama_sample_top_p(ctx: llama_context_p, candidates: llama_token_data_array_p, p: float, min_keep: c_size_t): + lib.llama_sample_top_p(ctx, candidates, c_float(p), c_size_t(min_keep)) + +def llama_sample_tail_free(ctx: llama_context_p, candidates: llama_token_data_array_p, z: float, min_keep: c_size_t): + lib.llama_sample_tail_free(ctx, candidates, z, min_keep) + +def llama_sample_typical(ctx: llama_context_p, candidates: llama_token_data_array_p, p: float, min_keep: c_size_t): + lib.llama_sample_typical(ctx, candidates, p, min_keep) + +def llama_sample_temperature(ctx: llama_context_p, candidates: llama_token_data_array_p, temp: float): + lib.llama_sample_temperature(ctx, candidates, temp) + +def llama_sample_token_mirostat(ctx: llama_context_p, candidates: llama_token_data_array_p, tau: float, eta: float, m: c_int, mu: c_float_p) -> llama_token: + return lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) + +def llama_sample_token_mirostat_v2(ctx: llama_context_p, candidates: llama_token_data_array_p, tau: float, eta: float, mu: c_float_p) -> llama_token: + return lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) + +def llama_sample_token_greedy(ctx: llama_context_p, candidates: llama_token_data_array_p) -> llama_token: + return lib.llama_sample_token_greedy(ctx, candidates) + +def llama_sample_token(ctx: llama_context_p, candidates: llama_token_data_array_p) -> llama_token: + return lib.llama_sample_token(ctx, candidates) def llama_print_timings(ctx: llama_context_p): lib.llama_print_timings(ctx) @@ -177,15 +323,5 @@ def llama_print_timings(ctx: llama_context_p): def llama_reset_timings(ctx: llama_context_p): lib.llama_reset_timings(ctx) -def llama_print_system_info() -> str: - """Print system informaiton""" - return lib.llama_print_system_info().decode('utf-8') - -def llama_get_state_size(ctx: llama_context_p) -> c_size_t: - return lib.llama_get_state_size(ctx) - -def llama_copy_state_data(ctx: llama_context_p, dst: c_ubyte_p) -> c_size_t: - return lib.llama_copy_state_data(ctx, dst) - -def llama_set_state_data(ctx: llama_context_p, src: c_ubyte_p) -> c_size_t: - return lib.llama_set_state_data(ctx, src) +def llama_print_system_info() -> c_char_p: + return lib.llama_print_system_info() \ No newline at end of file From 6e968d22b06c75041e089aabd9d55d9f9a1e3f43 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 14 May 2023 16:07:08 +0200 Subject: [PATCH 05/86] add text generating baby-llama from scratch example --- examples/baby-llama/baby-llama-text.cpp | 1359 +++++++++++++++++++++++ 1 file changed, 1359 insertions(+) create mode 100644 examples/baby-llama/baby-llama-text.cpp diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp new file mode 100644 index 0000000000000..d114d689e05a1 --- /dev/null +++ b/examples/baby-llama/baby-llama-text.cpp @@ -0,0 +1,1359 @@ +#include "ggml.h" +#include "llama.h" +#include +#include +#include +#include +#include 
+#include +#include + + +struct random_normal_distribution { + std::mt19937 gen; + std::normal_distribution rd; + float min; + float max; +}; + + +struct random_uniform_distribution { + std::mt19937 gen; + std::uniform_real_distribution rd; +}; + +void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) { + rnd->gen = std::mt19937(seed); + rnd->rd = std::normal_distribution{mean, std}; + rnd->min = min; + rnd->max = max; +} + +void init_random_uniform_distribution(struct random_uniform_distribution * rnd, int seed, float min, float max) { + rnd->gen = std::mt19937(seed); + rnd->rd = std::uniform_real_distribution{min, max}; +} + +int clamp(const int v, const int min, const int max) { + return ((v < min) ? (min) : (v > max) ? (max) : v); +} + +float fclamp(const float v, const float min, const float max) { + return ((v < min) ? (min) : (v > max) ? (max) : v); +} + +float frand_normal(struct random_normal_distribution * rnd) { + return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); +} + +float frand_uniform(struct random_uniform_distribution * rnd) { + return rnd->rd(rnd->gen); +} + +struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { + switch (tensor->n_dims) { + case 1: + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); + *dst = frand_normal(rnd); + } + break; + case 2: + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *dst = frand_normal(rnd); + } + } + break; + case 3: + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *dst = frand_normal(rnd); + } + } + } + break; + case 4: + for (int i3 = 0; i3 < tensor->ne[3]; i3++) { + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); + *dst = frand_normal(rnd); + } + } + } + } + break; + default: + assert(false); + }; + return tensor; +} + +struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { + switch (tensor->n_dims) { + case 1: + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); + *dst = frand_uniform(rnd); + } + break; + case 2: + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *dst = frand_uniform(rnd); + } + } + break; + case 3: + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *dst = frand_uniform(rnd); + } + } + } + break; + case 4: + for (int i3 = 0; i3 < tensor->ne[3]; i3++) { + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float 
*) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); + *dst = frand_uniform(rnd); + } + } + } + } + break; + default: + assert(false); + }; + return tensor; +} + +struct my_llama_hparams { + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? + uint32_t n_embd = 4096; + uint32_t n_mult = 4; + uint32_t n_head = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + + bool operator!=(const my_llama_hparams& other) const { + return memcmp(this, &other, sizeof(my_llama_hparams)); + } +}; + +uint32_t get_n_ff(const struct my_llama_hparams* hparams) { + const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; + return n_ff; +} + +struct my_llama_layer { + // normalization + struct ggml_tensor * attention_norm; + + // attention + struct ggml_tensor * wq; + struct ggml_tensor * wk; + struct ggml_tensor * wv; + struct ggml_tensor * wo; + + // normalization + struct ggml_tensor * ffn_norm; + + // ff + struct ggml_tensor * w1; + struct ggml_tensor * w2; + struct ggml_tensor * w3; +}; + +struct my_llama_kv_cache { + struct ggml_context * ctx = NULL; + + struct ggml_tensor * k; + struct ggml_tensor * v; + + // llama_ctx_buffer buf; + + int n; // number of tokens currently in the cache +}; + +struct my_llama_model { + struct ggml_context * ctx = NULL; + + my_llama_hparams hparams; + + struct ggml_tensor * tok_embeddings; + + struct ggml_tensor * norm; + struct ggml_tensor * output; + + std::vector layers; +}; + +void init_model(struct my_llama_model * model) { + const auto & hparams = model->hparams; + + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_vocab = hparams.n_vocab; + + const uint32_t n_ff = get_n_ff(&hparams); + + struct ggml_context * ctx = model->ctx; + + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab}); + model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // ("norm.weight", {n_embd}); + model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("output.weight", {n_embd, n_vocab}); + + model->layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + // std::string layers_i = "layers." 
+ std::to_string(i); + + layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd}); + + layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd}); + layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd}); + layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd}); + layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd}); + + layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd}); + + layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}); + layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}); + layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}); + } +} + + +void set_param_model(struct my_llama_model * model) { + const auto& hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct ggml_context* ctx = model->ctx; + + ggml_set_param(ctx, model->tok_embeddings); + ggml_set_param(ctx, model->norm); + ggml_set_param(ctx, model->output); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + ggml_set_param(ctx, layer.attention_norm); + ggml_set_param(ctx, layer.wq); + ggml_set_param(ctx, layer.wk); + ggml_set_param(ctx, layer.wv); + ggml_set_param(ctx, layer.wo); + ggml_set_param(ctx, layer.ffn_norm); + ggml_set_param(ctx, layer.w1); + ggml_set_param(ctx, layer.w2); + ggml_set_param(ctx, layer.w3); + } +} + +void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) { + const auto & hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct random_normal_distribution rnd; + init_random_normal_distribution(&rnd, seed, mean, std, min, max); + + randomize_tensor_normal(model->tok_embeddings, &rnd); + randomize_tensor_normal(model->norm, &rnd); + randomize_tensor_normal(model->output, &rnd); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + randomize_tensor_normal(layer.attention_norm, &rnd); + + randomize_tensor_normal(layer.wq, &rnd); + randomize_tensor_normal(layer.wk, &rnd); + randomize_tensor_normal(layer.wv, &rnd); + randomize_tensor_normal(layer.wo, &rnd); + + randomize_tensor_normal(layer.ffn_norm, &rnd); + + randomize_tensor_normal(layer.w1, &rnd); + randomize_tensor_normal(layer.w2, &rnd); + randomize_tensor_normal(layer.w3, &rnd); + } +} + +bool init_kv_cache(struct my_llama_kv_cache* cache, struct my_llama_model * model, int n_batch) { + const auto & hparams = model->hparams; + + const uint32_t n_ctx = hparams.n_ctx; + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + + const int64_t n_mem = n_layer*n_ctx*n_batch; + const int64_t n_elements = n_embd*n_mem; + + // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + + // struct ggml_init_params params; + // params.mem_size = cache.buf.size; + // params.mem_buffer = cache.buf.addr; + // params.no_alloc = false; + if (!cache->ctx) { + struct ggml_init_params params; + params.mem_size = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024; + 
params.mem_buffer = NULL; + params.no_alloc = false; + + cache->ctx = ggml_init(params); + + if (!cache->ctx) { + fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); + return false; + } + } + + cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); + cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); + + return true; +} + +struct ggml_tensor * forward( + struct my_llama_model * model, + struct my_llama_kv_cache * cache, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_past) { + + const int N = n_tokens; + + struct my_llama_kv_cache& kv_self = *cache; + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + + struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens)); + + struct ggml_tensor * kc = kv_self.k; + struct ggml_tensor * vc = kv_self.v; + + // inpL shape [n_embd,N,1,1] + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * cur; + + // lctx.use_buf(ctx0, 0); + + // norm + { + // cur shape [n_embd,N,1,1] + cur = ggml_rms_norm(ctx0, inpL); + + // cur = attention_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].attention_norm, cur), + cur); + } + + // self-attention + { + // compute Q and K and RoPE them + // wq shape [n_embd, n_embd, 1, 1] + // wk shape [n_embd, n_embd, 1, 1] + // Qcur shape [n_embd/n_head, n_head, N, 1] + // Kcur shape [n_embd/n_head, n_head, N, 1] + struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + + // store key and value to memory + { + // compute the transposed [N, n_embd] V matrix + // wv shape [n_embd, n_embd, 1, 1] + // Vcur shape [n_embd, N, 1, 1] + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N))); + + // kv_self.k shape [n_embd * n_ctx * n_layer, 1] + // kv_self.v shape [n_embd * n_ctx * n_layer, 1] + // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0] + // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0] + + /* { + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + + // important: storing RoPE-ed version of K in the KV cache! 
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } //*/ + + kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + vc = ggml_set_2d(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + } + + // Qcur shape [n_embd/n_head, n_head, N, 1] + // Q shape [n_embd/n_head, N, n_head, 1] + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + + // kv_self.k shape [n_embd * n_ctx * n_layer, 1] + // K shape [n_embd/n_head, n_past + N, n_head, 1] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); + + // K * Q + // KQ shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // KQ_scaled shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + + // KQ_masked = mask_past(KQ_scaled) + // KQ_masked shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + // KQ_soft_max shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + + // split cached V into n_head heads + //// V shape [n_past + N, n_embd/n_head, n_head, 1] + // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1] + struct ggml_tensor * V = + ggml_view_3d(ctx0, vc, + n_past + N, n_embd/n_head, n_head, + n_ctx*ggml_element_size(vc), + n_ctx*ggml_element_size(vc)*n_embd/n_head, + il*n_ctx*ggml_element_size(vc)*n_embd); + + // KQV shape [n_embd/n_head, N, n_head, 1] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // KQV_merged shape [n_embd/n_head, n_head, N, 1] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + // KQV_merged shape + + // cur = KQV_merged.contiguous().view(n_embd, N) + // cur shape [n_embd,N,1,1] + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N); + // cur = ggml_cpy(ctx0, + // KQV_merged, + // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection (no bias) + // cur shape [n_embd,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].wo, + cur); + } + + // lctx.use_buf(ctx0, 1); + + // inpFF shape [n_embd,N,1,1] + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + + // feed-forward network + { + // norm + { + // cur shape [n_embd,N,1,1] + cur = ggml_rms_norm(ctx0, inpFF); + + // cur = ffn_norm*cur + // cur shape [n_embd,N,1,1] + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), + cur); + } + + // tmp shape [n_ff,N,1,1] + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model->layers[il].w3, + cur); + + // cur shape [n_ff,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w1, + cur); + + // SILU activation + // cur shape [n_ff,N,1,1] + cur = ggml_silu(ctx0, cur); + + // cur shape [n_ff,N,1,1] + cur = ggml_mul(ctx0, cur, tmp); + + // cur shape [n_embd,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w2, + cur); + } + + // cur shape [n_embd,N,1,1] + cur = ggml_add(ctx0, cur, inpFF); + + // input for next layer + // inpL shape [n_embd,N,1,1] + inpL 
= cur; + } + + // norm + { + + // inpL shape [n_embd,N,1,1] + inpL = ggml_rms_norm(ctx0, inpL); + + // inpL = norm*inpL + // inpL shape [n_embd,N,1,1] + inpL = ggml_mul(ctx0, + ggml_repeat(ctx0, model->norm, inpL), + inpL); + + //embeddings = inpL; + } + + // lm_head + // inpL shape [n_vocab,N,1,1] + inpL = ggml_mul_mat(ctx0, model->output, inpL); + + // run the computation + ggml_build_forward_expand(gf, inpL); + + return inpL; +} + +void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { + GGML_ASSERT(tensor->n_dims == 1); + GGML_ASSERT(tensor->ne[0] == ne0); +} + +void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { + GGML_ASSERT(tensor->n_dims == 2); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); +} + +void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { + GGML_ASSERT(tensor->n_dims == 3); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == ne2); +} + +void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { + GGML_ASSERT(tensor->n_dims == 4); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == ne2); + GGML_ASSERT(tensor->ne[3] == ne3); +} + +struct ggml_tensor * forward_batch( + struct my_llama_model * model, + struct my_llama_kv_cache * cache, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_past, + const int n_batch) { + + const int N = n_tokens; + + struct my_llama_kv_cache& kv_self = *cache; + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + const int n_ff = get_n_ff(&hparams); + + struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); + memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); + + struct ggml_tensor * kc = kv_self.k; + struct ggml_tensor * vc = kv_self.v; + + // inpL shape [n_embd,N*n_batch,1] + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); + assert_shape_2d(inpL, n_embd, N*n_batch); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * cur; + + // lctx.use_buf(ctx0, 0); + + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = attention_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].attention_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // self-attention + { + // compute Q and K and RoPE them + // wq shape [n_embd, n_embd, 1, 1] + // wk shape [n_embd, n_embd, 1, 1] + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Kcur shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); + assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); + + // store key and value to memory + { + // 
compute the transposed [N, n_embd] V matrix + // wv shape [n_embd, n_embd, 1, 1] + // Vcur shape [N, n_embd, n_batch, 1] + struct ggml_tensor * Vcur = ggml_cont(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_mul_mat(ctx0, + model->layers[il].wv, + cur), + n_embd, N, n_batch), + 1, 0, 2, 3)); + + assert_shape_3d(Vcur, N, n_embd, n_batch); + + // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] + // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] + // k shape [n_embd * N, n_batch] == kv_self.k[:,n_past:n_past+N,:,il] + // v shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il] + + /* { + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + + // important: storing RoPE-ed version of K in the KV cache! + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } //*/ + + kc = ggml_set_2d(ctx0, kc, + ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch), + ggml_element_size(kc)*n_embd*n_ctx, + (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past)); + vc = ggml_set_2d(ctx0, vc, + ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch), + ggml_element_size(vc)*n_ctx*n_embd, + ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx)); + + assert_shape_1d(kc, n_embd * n_ctx * n_batch * n_layer); + assert_shape_1d(vc, n_embd * n_ctx * n_batch * n_layer); + } + + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Q shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); + + // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] + // K shape [n_embd/n_head, n_past + N, n_head, n_batch] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_4d(ctx0, + ggml_view_3d(ctx0, + kc, + n_embd, + (n_past + N), + n_batch, + n_embd*ggml_element_size(kc), + n_ctx*n_embd*ggml_element_size(kc), + il*n_batch*n_ctx*n_embd*ggml_element_size(kc)), + n_embd/n_head, n_head, n_past + N, n_batch), + 0, 2, 1, 3); + assert_shape_4d(K, n_embd/n_head, n_past + N, n_head, n_batch); + + // K * Q + // KQ shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + assert_shape_4d(KQ, n_past + N, N, n_head, n_batch); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // KQ_scaled shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch); + + // KQ_masked = mask_past(KQ_scaled) + // KQ_masked shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch); + + // KQ = soft_max(KQ_masked) + // KQ_soft_max shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch); + + // split cached V into n_head heads + // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] + // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il] + struct ggml_tensor * V = + ggml_view_4d(ctx0, vc, + n_past + N, n_embd/n_head, n_head, n_batch, + 
ggml_element_size(vc)*n_ctx, + ggml_element_size(vc)*n_ctx*n_embd/n_head, + ggml_element_size(vc)*n_ctx*n_embd, + il*n_batch*n_ctx*n_embd*ggml_element_size(vc)); + assert_shape_4d(V, n_past + N, n_embd/n_head, n_head, n_batch); + + // KQV shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); + // KQV_merged shape + + // cur = KQV_merged.contiguous().view(n_embd, N) + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); + assert_shape_2d(cur, n_embd, N*n_batch); + // cur = ggml_cpy(ctx0, + // KQV_merged, + // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection (no bias) + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].wo, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // lctx.use_buf(ctx0, 1); + + // inpFF shape [n_embd,N*n_batch,1,1] + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + assert_shape_2d(inpFF, n_embd, N*n_batch); + + // feed-forward network + { + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = ffn_norm*cur + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // tmp shape [n_ff,N*n_batch,1,1] + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model->layers[il].w3, + cur); + assert_shape_2d(tmp, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w1, + cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // SILU activation + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_silu(ctx0, cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul(ctx0, cur, tmp); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w2, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_add(ctx0, cur, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // input for next layer + // inpL shape [n_embd,N*n_batch,1,1] + inpL = cur; + assert_shape_2d(inpL, n_embd, N*n_batch); + } + + // norm + { + + // inpL shape [n_embd,N*n_batch,1,1] + inpL = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(inpL, n_embd, N*n_batch); + + // inpL = norm*inpL + // inpL shape [n_embd,N*n_batch,1,1] + inpL = ggml_mul(ctx0, + ggml_repeat(ctx0, model->norm, inpL), + inpL); + + assert_shape_2d(inpL, n_embd, N*n_batch); + + //embeddings = inpL; + } + + // lm_head + // inpL shape [n_vocab,N*n_batch,1,1] + inpL = ggml_mul_mat(ctx0, model->output, inpL); + assert_shape_2d(inpL, n_vocab, N*n_batch); + + { + // inpL shape [n_vocab,N,n_batch,1] + inpL = ggml_reshape_3d(ctx0, + inpL, + n_vocab, N, n_batch); + assert_shape_3d(inpL, n_vocab, N, n_batch); + } + + // run the computation + ggml_build_forward_expand(gf, inpL); + + return inpL; +} + +void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { + assert(logits->n_dims == 2); + assert(probs->n_dims == 2); + 
assert(best_samples->n_dims == 1); + assert(logits->ne[1] == best_samples->ne[0]); + assert(logits->ne[0] == probs->ne[0]); + assert(logits->ne[1] == probs->ne[1]); + for (int i = 0; i < logits->ne[1]; ++i) { + float max_logit = ggml_get_f32_1d(logits, i * logits->ne[0]); + ggml_set_i32_1d(best_samples, i, 0); + for (int k = 0; k < logits->ne[0]; ++k) { + float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k); + if (logit > max_logit) { + max_logit = logit; + ggml_set_i32_1d(best_samples, i, k); + } + } + float psum = 0; + for (int k = 0; k < logits->ne[0]; ++k) { + float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k); + float p = (logit == -INFINITY) ? 0 : expf(logit - max_logit); + psum += p; + ggml_set_f32_1d(probs, i * probs->ne[0] + k, p); + } + for (int k = 0; k < logits->ne[0]; ++k) { + float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); + ggml_set_f32_1d(probs, i * probs->ne[0] + k, p / psum); + } + } +} + +void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { + GGML_ASSERT(best_samples->n_dims == 2); + GGML_ASSERT(logits->n_dims == 3); + GGML_ASSERT(probs->n_dims == 3); + int n_tokens = best_samples->ne[0]; + int n_batch = best_samples->ne[1]; + int n_vocab = logits->ne[0]; + GGML_ASSERT(n_tokens == logits->ne[1]); + GGML_ASSERT(n_batch == logits->ne[2]); + GGML_ASSERT(n_vocab == probs->ne[0]); + GGML_ASSERT(n_tokens == probs->ne[1]); + GGML_ASSERT(n_batch == probs->ne[2]); + + for (int k = 0; k < n_batch; ++k) { + struct ggml_tensor * best_samples_k = ggml_view_1d(ctx, + best_samples, + best_samples->ne[0], + k*best_samples->nb[1]); + struct ggml_tensor * logits_k = ggml_view_2d(ctx, + logits, + logits->ne[0], + logits->ne[1], + logits->nb[1], + k*logits->nb[2]); + struct ggml_tensor * probs_k = ggml_view_2d(ctx, + probs, + probs->ne[0], + probs->ne[1], + probs->nb[1], + k*probs->nb[2]); + sample_softmax(logits_k, probs_k, best_samples_k); + } +} + + +void print_row(struct ggml_tensor * probs, int i) { + for (int k = 0; k < probs->ne[0]; ++k) { + float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); + printf(" %.2f", p); + } + printf("\n"); +} + +void print_matrix(struct ggml_tensor * probs) { + assert(probs->n_dims == 2); + for (int i = 0; i < probs->ne[1]; ++i) { + for (int k = 0; k < probs->ne[0]; ++k) { + float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); + printf(" %.2f", p); + } + printf("\n"); + } +} + + +void print_token(struct llama_context * ctx, llama_token token) { + printf("%s", llama_token_to_str(ctx, token)); +} + +void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) { + for (int i=0; ine[0]; ++i) { + int token = ggml_get_i32_1d(tokens, i); + print_token(ctx, token); + } +} + +void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) { + for (int i1=0; i1ne[1]; ++i1) { + for (int i0=0; i0ne[0]; ++i0) { + int token = ggml_get_i32_1d(tokens, i0 + i1*tokens->ne[0]); + print_token(ctx, token); + } + printf("\n--\n"); + } +} + +void get_example_targets(const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { + int n_tokens = tokens_input->ne[0]; + int n_vocab = targets->ne[0]; + + int n_examples = (n_train_data / (size_t) n_tokens); + int begin = (example_id % n_examples) * n_tokens; + GGML_ASSERT(begin+n_tokens-1 < n_train_data); + + ggml_set_f32(targets, -1.0f); + ggml_set_i32_1d(tokens_input, 0, llama_token_bos()); + for (int i=1; 
in_dims == 2); + GGML_ASSERT( targets->n_dims == 3); + int n_tokens = tokens_input->ne[0]; + int n_batch = tokens_input->ne[1]; + GGML_ASSERT(n_tokens == targets->ne[1]); + GGML_ASSERT(n_batch == targets->ne[2]); + + for (int k=0; kne[0], + k*tokens_input->nb[1]); + struct ggml_tensor * targets_k = ggml_view_2d(ctx, + targets, + targets->ne[0], + targets->ne[1], + targets->nb[1], + k*targets->nb[2]); + + get_example_targets(train_data, n_train_data, + example_id*n_batch + k, tokens_input_k, targets_k); + } +} + + +void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) { + int n_tokens = tokens_input->ne[0]; + int n_vocab = targets->ne[0]; + for (int i=0; i= 0 && size < INT_MAX); + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +} + +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, size, 1, fp); + if (ferror(fp)) { + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret != 1) { + throw std::runtime_error(std::string("unexpectedly reached end of file")); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + throw std::runtime_error(format("write error: %s", strerror(errno))); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +int tokenize_file(struct llama_context * lctx, const char * filename, std::vector& out) { + struct llama_file f(filename, "rb"); + + std::vector buf; + buf.resize(f.size); + + f.read_raw(buf.data(), f.size); + + out.resize(buf.size()); + + int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false); + + if (n_tokens >= 0) { + out.resize(n_tokens); + } + + return n_tokens; +} + +int main(int argc, char ** argv) { + const char * default_model = "ggml-vic7b-uncensored-q4_0.bin"; + const char * default_train = "shakespeare.txt"; + const char * default_argv[3] = {argv[0], default_model, default_train}; + + if (argc < 3) { + fprintf(stderr, "usage: %s model training_data\n", argv[0]); + //return 1; + } + + const char * fn_model = (argc >= 2) ? argv[1] : default_argv[1]; + const char * fn_train = (argc >= 3) ? 
argv[2] : default_argv[2]; + + struct llama_context_params llama_params = llama_context_default_params(); + llama_params.vocab_only = true; + + struct llama_context * lctx = llama_init_from_file(fn_model, llama_params); + + std::vector train_tokens; + if (tokenize_file(lctx, fn_train, train_tokens) < 0) { + fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, fn_train); + } + + struct my_llama_model model; + model.hparams.n_vocab = llama_n_vocab(lctx); + model.hparams.n_ctx = 16; + model.hparams.n_embd = 64; + model.hparams.n_mult = 8; + model.hparams.n_head = 8; + model.hparams.n_layer = 1; + model.hparams.n_rot = std::min(16u, model.hparams.n_embd / model.hparams.n_head); + + struct my_llama_kv_cache kv_self; + + int n_batch = 8; + + struct ggml_init_params lcparams; + lcparams.mem_size = 1024ll*1024ll*1024ll; + lcparams.mem_buffer = NULL; + lcparams.no_alloc = false; + + model.ctx = ggml_init(lcparams); + kv_self.ctx = model.ctx; + + printf("init model\n"); + init_model(&model); + set_param_model(&model); + randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f); + init_kv_cache(&kv_self, &model, n_batch); + + size_t compute_size = 1024ll*1024ll*1024ll*32ll; + uint8_t * compute_addr = new uint8_t[compute_size]; + + int n_examples = 256; + int n_tokens = model.hparams.n_ctx; + int n_vocab = model.hparams.n_vocab; + + for (int ex=0; ex Date: Sun, 14 May 2023 17:00:19 +0200 Subject: [PATCH 06/86] fix race condition bug in ggml_compute_forward_diag_mask_f32 --- ggml.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ggml.c b/ggml.c index e5b3528d8a742..3e17dfeb6656d 100644 --- a/ggml.c +++ b/ggml.c @@ -10321,20 +10321,23 @@ static void ggml_compute_forward_diag_mask_f32( assert(src1->type == GGML_TYPE_I32); assert(ggml_nelements(src1) == 2); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - const int ith = params->ith; const int nth = params->nth; const int n_past = ((int32_t *) src1->data)[0]; const bool inplace = (bool)((int32_t *) src1->data)[1]; - if (!inplace) { + + if (!inplace && (params->type == GGML_TASK_INIT)) { + // dup needs to be synchronized across threads to avoid race conditions. 
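
The fix relies on ggml's task phases: work placed in GGML_TASK_INIT runs before the worker threads execute their GGML_TASK_COMPUTE slices, so a write that must happen exactly once for the whole tensor, here duplicating src0 into dst in the non-inplace case, cannot race with the per-row compute work. A simplified sketch of the phase pattern these kernels follow (not an exact copy of any one op):

    static void compute_forward_example_f32(const struct ggml_compute_params * params,
                                            const struct ggml_tensor * src0, struct ggml_tensor * dst) {
        if (params->type == GGML_TASK_INIT) {
            // one-time shared setup, e.g. copying src0 into dst, done before the threaded compute starts
            memcpy(dst->data, src0->data, ggml_nbytes(dst));
            return;
        }
        if (params->type == GGML_TASK_FINALIZE) {
            return;
        }
        // GGML_TASK_COMPUTE: each thread only touches its own range of rows
        const int ith = params->ith;
        const int nth = params->nth;
        const int nr  = ggml_nrows(src0);
        const int dr  = (nr + nth - 1)/nth;   // rows per thread
        const int ir0 = dr*ith;               // row range for this thread
        const int ir1 = MIN(ir0 + dr, nr);
        for (int ir = ir0; ir < ir1; ++ir) {
            // ... per-row work ...
        }
    }
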
+ // => do it in INIT phase ggml_compute_forward_dup_same_cont(params, src0, dst); } + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + // TODO: handle transposed/permuted matrices const int n = ggml_nrows(src0); From ec1aea09ec041ab06ab898ac14bc23145f10ed8d Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 14 May 2023 17:16:26 +0200 Subject: [PATCH 07/86] implement ggml_soft_max_back for more performant backward pass of soft_max avoids creating big intermediate matrices of size n_embd x n_embd for llama layers and n_vocab x n_vocab for cross entropy loss --- ggml.c | 196 ++++++++++++++++++++++++++++++++++++++++++++------------- ggml.h | 12 ++++ 2 files changed, 164 insertions(+), 44 deletions(-) diff --git a/ggml.c b/ggml.c index 3e17dfeb6656d..2cc51fcc0d8d7 100644 --- a/ggml.c +++ b/ggml.c @@ -3325,6 +3325,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "DIAG_MASK_INF", "DIAG_MASK_ZERO", "SOFT_MAX", + "SOFT_MAX_BACK", "ROPE", "ROPE_BACK", "ALIBI", @@ -3338,7 +3339,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "MAP_BINARY", }; -static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50"); +static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3385,6 +3386,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "diag_mask_inf(x)", "diag_mask_zero(x)", "soft_max(x)", + "soft_max_back(x)", "rope(x)", "rope_back(x)", "alibi(x)", @@ -3398,7 +3400,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "f(x,y)", }; -static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50"); +static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -5927,6 +5929,44 @@ struct ggml_tensor * ggml_soft_max_inplace( return ggml_soft_max_impl(ctx, a, true); } + +// ggml_soft_max_back + +struct ggml_tensor * ggml_soft_max_back_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; // TODO : implement backward pass + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SOFT_MAX_BACK; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +struct ggml_tensor * ggml_soft_max_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_soft_max_back_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_soft_max_back_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_soft_max_back_impl(ctx, a, b, true); +} + // ggml_rope struct ggml_tensor * ggml_rope_impl( @@ -10482,6 +10522,103 @@ static void ggml_compute_forward_soft_max( } } +// ggml_compute_forward_soft_max_back + +static void ggml_compute_forward_soft_max_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_are_same_shape(src1, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // TODO: handle transposed/permuted matrices + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float *dy = (float *)((char *) src0->data + i1*src0->nb[1]); + float *y = (float *)((char *) src1->data + i1*src1->nb[1]); + float *dx = (float *)((char *) dst->data + i1*dst->nb[1]); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(s0[i])); + assert(!isnan(s1[i])); + } +#endif + // Jii = yi - yi*yi + // Jij = -yi*yj + // J = diag(y)-y.T*y + // dx = J * dy + // dxk = sum_i(Jki * dyi) + + // quadratic runtime, linear memory + for (int k = 0; k < nc; k++) { + + ggml_float sum = 0.0; + + for (int i = 0; i < k; i++) { + float Jki = -y[k]*y[i]; + sum += (ggml_float) Jki * dy[i]; + } + + float Jkk = y[k] - y[k]*y[k]; + sum += (ggml_float) Jkk * dy[k]; + + for (int i = k+1; i < nc; i++) { + float Jki = -y[k]*y[i]; + sum += (ggml_float) Jki * dy[i]; + } + + dx[k] = (float) sum; + } + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(dx[i])); + assert(!isinf(dx[i])); + } +#endif + } +} + +static void ggml_compute_forward_soft_max_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_soft_max_back_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_alibi static void ggml_compute_forward_alibi_f32( @@ -12529,6 +12666,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_soft_max(params, tensor->src0, tensor); } break; + case GGML_OP_SOFT_MAX_BACK: + { + ggml_compute_forward_soft_max_back(params, tensor->src0, tensor->src1, tensor); + } break; case GGML_OP_ROPE: { ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor); @@ -13146,50 +13287,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - // y = softmax(x) - // - // Jii = yi - yi*yi - // Jij = -yi*yj - // 
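
soft_max_back computes the Jacobian-vector product dx = J * dy for y = softmax(x), with Jii = yi - yi*yi and Jij = -yi*yj as in the comments above. Because J = diag(y) - y*y^T, the product collapses to dxk = yk * (dyk - dot(y, dy)), which is what a later patch in this series uses to drop the inner loop. A stand-alone check of the two forms (a verification sketch, not part of the patch):

    #include <cstdio>
    #include <vector>

    // quadratic form: dx = J * dy, row by row of the explicit Jacobian
    static std::vector<float> softmax_back_ref(const std::vector<float> & y, const std::vector<float> & dy) {
        std::vector<float> dx(y.size(), 0.0f);
        for (size_t k = 0; k < y.size(); ++k) {
            double sum = 0.0;
            for (size_t i = 0; i < y.size(); ++i) {
                const float Jki = (i == k) ? y[k] - y[k]*y[k] : -y[k]*y[i];
                sum += (double) Jki * dy[i];
            }
            dx[k] = (float) sum;
        }
        return dx;
    }

    // linear form: dxk = yk * (dyk - dot(y, dy))
    static std::vector<float> softmax_back_fast(const std::vector<float> & y, const std::vector<float> & dy) {
        double dot_y_dy = 0.0;
        for (size_t i = 0; i < y.size(); ++i) dot_y_dy += (double) y[i]*dy[i];
        std::vector<float> dx(y.size());
        for (size_t k = 0; k < y.size(); ++k) dx[k] = y[k] * (float)(dy[k] - dot_y_dy);
        return dx;
    }

    int main() {
        const std::vector<float> y  = {0.1f, 0.2f, 0.3f, 0.4f};   // any softmax output (non-negative, sums to 1)
        const std::vector<float> dy = {0.5f, -1.0f, 0.25f, 2.0f}; // arbitrary upstream gradient
        const std::vector<float> a = softmax_back_ref(y, dy);
        const std::vector<float> b = softmax_back_fast(y, dy);
        for (size_t k = 0; k < y.size(); ++k) printf("%f %f\n", a[k], b[k]); // the two columns should agree
        return 0;
    }
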
J = diag(y)-y.*y - // dx = J * dy - // dxk = sum(Jkj * dyk) - - int64_t ne2[4] = { - tensor->ne[0], - 1, - tensor->ne[1]*tensor->ne[2], - tensor->ne[3] - }; - struct ggml_tensor * tensor2 = ggml_cont(ctx, - ggml_reshape_4d(ctx, - ggml_cont(ctx, tensor), - ne2[0], ne2[1], ne2[2], ne2[3])); - - struct ggml_tensor * grad2 = ggml_cont(ctx, - ggml_reshape_4d(ctx, - ggml_cont(ctx, tensor->grad), - ne2[0], ne2[1], ne2[2], ne2[3])); - - struct ggml_tensor * tensor2_t = ggml_cont(ctx, // [1,ne0,ne1*ne2,ne3] - ggml_permute(ctx, // [1,ne0,ne1*ne2,ne3] - tensor2, // [ne0,1,ne1*ne2,ne3] - 1, 0, 2, 3)); - src0->grad = - ggml_add_impl(ctx, - src0->grad, // [ne0,ne1,ne2,ne3] - ggml_reshape(ctx, // [ne0,ne1,ne2,ne3] - ggml_mul_mat(ctx, // [ne0,1,ne1*ne2,ne3] - ggml_sub(ctx, // [ne0,ne0,ne1*ne2,ne3] - ggml_diag(ctx, // [ne0,ne0,ne1*ne2,ne3] - tensor2), // [ne0,1,ne1*ne2,ne3] - ggml_mul_mat(ctx, // [ne0,ne0,ne1*ne2,ne3] - tensor2_t, // [1,ne0,ne1*ne2,ne3] - tensor2_t)), // [1,ne0,ne1*ne2,ne3] - grad2), // [ne0,1,ne1*ne2,ne3] - src0->grad), - inplace); + ggml_add_impl(ctx, src0->grad, + ggml_soft_max_back(ctx, tensor->grad, tensor), + inplace); } + + } break; + case GGML_OP_SOFT_MAX_BACK: + { + GGML_ASSERT(false); // TODO: not implemented } break; case GGML_OP_ROPE: { @@ -13718,6 +13825,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX_BACK: case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { diff --git a/ggml.h b/ggml.h index 967ef72d034dd..0a0989516265b 100644 --- a/ggml.h +++ b/ggml.h @@ -307,6 +307,7 @@ extern "C" { GGML_OP_DIAG_MASK_INF, GGML_OP_DIAG_MASK_ZERO, GGML_OP_SOFT_MAX, + GGML_OP_SOFT_MAX_BACK, GGML_OP_ROPE, GGML_OP_ROPE_BACK, GGML_OP_ALIBI, @@ -860,6 +861,17 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_soft_max_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_soft_max_back_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + // rotary position embedding // if mode & 1 == 1, skip n_past elements // if mode & 2 == 1, GPT-NeoX style From 4339f8cf285c95af503a7105b747b29ef7b1d64b Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 14 May 2023 17:55:02 +0200 Subject: [PATCH 08/86] improve softmax backward pass go from quadratic runtime to linear runtime by simplifying the formulas --- ggml.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/ggml.c b/ggml.c index 2cc51fcc0d8d7..0935549a4aacf 100644 --- a/ggml.c +++ b/ggml.c @@ -10571,27 +10571,25 @@ static void ggml_compute_forward_soft_max_back_f32( // J = diag(y)-y.T*y // dx = J * dy // dxk = sum_i(Jki * dyi) - - // quadratic runtime, linear memory - for (int k = 0; k < nc; k++) { - - ggml_float sum = 0.0; - - for (int i = 0; i < k; i++) { - float Jki = -y[k]*y[i]; - sum += (ggml_float) Jki * dy[i]; - } - - float Jkk = y[k] - y[k]*y[k]; - sum += (ggml_float) Jkk * dy[k]; - - for (int i = k+1; i < nc; i++) { - float Jki = -y[k]*y[i]; - sum += (ggml_float) Jki * dy[i]; - } - - dx[k] = (float) sum; - } + // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk + // dxk = sum_i(-yk*yi * dyi) + yk*dyk + // dxk = -yk * sum_i(yi * dyi) + yk*dyk + // dxk = -yk * dot(y, dy) + yk*dyk + // dxk = yk * (- dot(y, dy) + dyk) + // dxk = yk * (dyk - dot(y, dy)) + // + // post-order: + // 
dot_y_dy := dot(y, dy) + // dx := dy + // dx := dx - dot_y_dy + // dx := dx * y + + // linear runtime, no additional memory + float dot_y_dy = 0; + ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy); + ggml_vec_cpy_f32 (nc, dx, dy); + ggml_vec_acc1_f32(nc, dx, -dot_y_dy); + ggml_vec_mul_f32 (nc, dx, dx, y); #ifndef NDEBUG for (int i = 0; i < nc; ++i) { From 69108167cd17ef40f40272defb4dabd106b3f003 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 14 May 2023 20:54:57 +0200 Subject: [PATCH 09/86] fix race condition bug in non-inplace ggml_compute_forward_diag_mask_f32 memcpy needs to be synchronized across threads to avoid race conditions. => do it in INIT phase --- ggml.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/ggml.c b/ggml.c index 0935549a4aacf..2286c615da5f3 100644 --- a/ggml.c +++ b/ggml.c @@ -10358,8 +10358,8 @@ static void ggml_compute_forward_diag_mask_f32( const struct ggml_tensor * src1, struct ggml_tensor * dst, const float value) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 2); + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(src1) == 2); const int ith = params->ith; const int nth = params->nth; @@ -10369,9 +10369,12 @@ static void ggml_compute_forward_diag_mask_f32( if (!inplace && (params->type == GGML_TASK_INIT)) { - // dup needs to be synchronized across threads to avoid race conditions. + // memcpy needs to be synchronized across threads to avoid race conditions. // => do it in INIT phase - ggml_compute_forward_dup_same_cont(params, src0, dst); + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_nbytes(dst)); } if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -10385,8 +10388,8 @@ static void ggml_compute_forward_diag_mask_f32( const int nr = src0->ne[1]; const int nz = n/nr; - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); + GGML_ASSERT( dst->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); for (int k = 0; k < nz; k++) { for (int j = ith; j < nr; j += nth) { From 1f2b76de01ce3da98417518d4fad6f5d1fa89f6f Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 14 May 2023 20:55:24 +0200 Subject: [PATCH 10/86] fix bug in ggml_compute_forward_soft_max_back_f32 on DEBUG build --- ggml.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 2286c615da5f3..06e3feea05424 100644 --- a/ggml.c +++ b/ggml.c @@ -10565,8 +10565,8 @@ static void ggml_compute_forward_soft_max_back_f32( #ifndef NDEBUG for (int i = 0; i < nc; ++i) { //printf("p[%d] = %f\n", i, p[i]); - assert(!isnan(s0[i])); - assert(!isnan(s1[i])); + assert(!isnan(dy[i])); + assert(!isnan(y[i])); } #endif // Jii = yi - yi*yi From c054079fb81a25acf941c9c27c19087c0eaed632 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 14 May 2023 20:56:50 +0200 Subject: [PATCH 11/86] improve performance of mul_mat backward pass avoid transpose by using mul_mat with swapped arguments --- ggml.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/ggml.c b/ggml.c index 06e3feea05424..9a0a07aa57d40 100644 --- a/ggml.c +++ b/ggml.c @@ -13050,15 +13050,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // src1, // [n,p] // tensor->grad), // [m,p] // for now just using A*B==(B.T*A.T).T - ggml_cont(ctx, // [n,m] - ggml_transpose(ctx, // [n,m] - ggml_mul_mat(ctx, // [m,n] - ggml_cont(ctx, // [p,m] - ggml_transpose(ctx, // [p,m] - tensor->grad)), // [m,p] - 
ggml_cont(ctx, // [p,n] - ggml_transpose(ctx, // [p,n] - src1))))), // [n,p] + ggml_mul_mat(ctx, // [n,m] + ggml_cont(ctx, // [p,n] + ggml_transpose(ctx, // [p,n] + src1)), // [n,p] + ggml_cont(ctx, // [p,m] + ggml_transpose(ctx, // [p,m] + tensor->grad))), // [m,p] inplace); } if (src1->grad) { @@ -13070,6 +13068,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_cont(ctx, // [m,n] ggml_transpose(ctx, src0)), // [m,n] tensor->grad), // [m,p] + + // // when src0 is bigger than tensor->grad (this is the case in llama), + // // avoid transpose of src0, rather transpose smaller tensor->grad + // // and then use ggml_out_prod + // ggml_out_prod(ctx, // [n,p] + // src0, // [n,m] + // ggml_cont(ctx, // [p,m] + // ggml_transpose(ctx, // [p,m] + // tensor->grad)), // [m,p] inplace); } } break; From d9b526872880acdd430d14a97ebd9b386c15a8eb Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 14 May 2023 20:57:47 +0200 Subject: [PATCH 12/86] avoid printing too much newlines in baby-llama-text --- examples/baby-llama/baby-llama-text.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index d114d689e05a1..c5abf66a4b686 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -969,9 +969,16 @@ void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) { void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) { for (int i1=0; i1ne[1]; ++i1) { + int num_newline = 0; for (int i0=0; i0ne[0]; ++i0) { int token = ggml_get_i32_1d(tokens, i0 + i1*tokens->ne[0]); - print_token(ctx, token); + bool isnl = (token == llama_token_nl()); + if (isnl) { + ++num_newline; + } + if (!isnl || (num_newline < 2)) { + print_token(ctx, token); + } } printf("\n--\n"); } From a703d7a85f010c1ed25ec02bc6bb0bfd77eb72ba Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 14 May 2023 20:58:43 +0200 Subject: [PATCH 13/86] activate threading in baby-llama-text --- examples/baby-llama/baby-llama-text.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index c5abf66a4b686..9f2ff90340289 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1199,10 +1199,12 @@ int main(int argc, char ** argv) { struct llama_context * lctx = llama_init_from_file(fn_model, llama_params); + printf("%s: tokenize training data\n", __func__); std::vector train_tokens; if (tokenize_file(lctx, fn_train, train_tokens) < 0) { fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, fn_train); } + printf("%s: number of training tokens: %d\n", __func__, train_tokens.size()); struct my_llama_model model; model.hparams.n_vocab = llama_n_vocab(lctx); @@ -1225,7 +1227,7 @@ int main(int argc, char ** argv) { model.ctx = ggml_init(lcparams); kv_self.ctx = model.ctx; - printf("init model\n"); + printf("%s: init model\n", __func__); init_model(&model); set_param_model(&model); randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f); @@ -1238,6 +1240,8 @@ int main(int argc, char ** argv) { int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; + printf("%s: begin training\n", __func__); + for (int ex=0; ex Date: Mon, 15 May 2023 14:17:42 +0200 Subject: [PATCH 14/86] add ggml_out_prod and use it for mul_mat backward pass for improved performance performance stats report 
improvement from 37 seconds to 16 seconds runtime during my training tests --- ggml.c | 264 ++++++++++++++++++++++++++++++++++++++++++++++++++------- ggml.h | 13 ++- 2 files changed, 246 insertions(+), 31 deletions(-) diff --git a/ggml.c b/ggml.c index 9a0a07aa57d40..77b654809972a 100644 --- a/ggml.c +++ b/ggml.c @@ -3310,6 +3310,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "RMS_NORM_BACK", "MUL_MAT", + "OUT_PROD", "SCALE", "SET", @@ -3339,7 +3340,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "MAP_BINARY", }; -static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51"); +static_assert(GGML_OP_COUNT == 52, "GGML_OP_COUNT != 52"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3370,6 +3371,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "rms_norm(x)", "rms_norm_back(x)", + "X*Y", "X*Y", "x*v", @@ -3400,7 +3402,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "f(x,y)", }; -static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51"); +static_assert(GGML_OP_COUNT == 52, "GGML_OP_COUNT != 52"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -3566,6 +3568,15 @@ static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct (t0->ne[3] == t1->ne[3]); } +static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + (t0->ne[1] == t1->ne[1]) && + (t0->ne[2] == t1->ne[2]) && + (t0->ne[3] == t1->ne[3]); +} + bool ggml_is_quantized(enum ggml_type type) { return GGML_IS_QUANTIZED[type]; } @@ -5156,6 +5167,32 @@ struct ggml_tensor * ggml_mul_mat( return result; } +// ggml_out_prod + +struct ggml_tensor * ggml_out_prod( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_out_prod(a, b)); + GGML_ASSERT(!ggml_is_transposed(a)); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + const int64_t ne[4] = { a->ne[0], b->ne[0], a->ne[2], b->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne); + + result->op = GGML_OP_OUT_PROD; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + // ggml_scale struct ggml_tensor * ggml_scale_impl( @@ -9802,6 +9839,178 @@ static void ggml_compute_forward_mul_mat( } } +// ggml_compute_forward_out_prod + + +static void ggml_compute_forward_out_prod_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + const int nb03 = src0->nb[3]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + // GGML_ASSERT(nb0 <= nb1); + // GGML_ASSERT(nb1 <= nb2); + // GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod + // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) + + if (params->type == GGML_TASK_INIT) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by last two dimensions + + // total parallel in src0 + const int64_t np = ne02*ne03; + + // per thread + const int64_t dp = (np + nth - 1)/nth; + + // range for this thread + const int64_t ip0 = dp*ith; + const int64_t ip1 = MIN(ip0 + dp, np); + + // dst[:,:,:,:] = 0 + // for i2,i3: + // for i01: + // for i1: + // for i0: + // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] + + for (int64_t ip = ip0; ip < ip1; ++ip) { + // src0 indices + const int64_t i3 = ip/ne02; + const int64_t i2 = ip - i3*ne02; + + const int64_t i02 = i2; + const int64_t i03 = i3; + + const int64_t i12 = i2; + const int64_t i13 = i3; + + for (int64_t i01 = 0; i01 < ne01; ++i01) { + const int64_t i11 = i01; + + for (int64_t i1 = 0; i1 < ne1; ++i1) { + const int64_t i10 = i1; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + ggml_vec_mad_f32(ne0, d, s0, *s1); + // for (int64_t i0 = 0; i0 < ne0; ++i0) { + // d[i0] += s0[i0] * s1[i1]; + // } + } + } + } + + //int64_t t1 = ggml_perf_time_us(); + //static int64_t acc = 0; + //acc += t1 - t0; + //if (t1 - t0 > 10) { 
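
With ggml's convention that ne[0] is the fastest-varying dimension, out_prod takes a of shape [m, n] and b of shape [p, n] (same number of rows n) and produces dst of shape [m, p] with dst[i0,i1] = sum over j of a[i0,j] * b[i1,j], i.e. it accumulates the outer product of row j of a with row j of b. That is exactly the shape of the mul_mat gradients, which is why the backward pass further below can drop the transpose-and-cont copies. A small contiguous-f32 reference of the same semantics (a sketch, assuming plain row-after-row storage):

    // dst[i0, i1] = sum_j a[i0, j] * b[i1, j]    with a: [m, n], b: [p, n], dst: [m, p]
    static void out_prod_ref(int m, int p, int n, const float * a, const float * b, float * dst) {
        for (int i = 0; i < m*p; ++i) dst[i] = 0.0f;          // zeroed up front, like the INIT phase above
        for (int j = 0; j < n; ++j) {                          // one outer product per shared row j
            for (int i1 = 0; i1 < p; ++i1) {
                const float s1 = b[i1 + j*p];
                for (int i0 = 0; i0 < m; ++i0) {
                    dst[i0 + i1*m] += a[i0 + j*m] * s1;        // the ggml_vec_mad_f32 step, written out per element
                }
            }
        }
    }
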
+ // printf("\n"); + // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); + // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); + // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); + // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13); + + // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); + //} +} + +static void ggml_compute_forward_out_prod( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + { + GGML_ASSERT(false); // todo + // ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(false); // todo + // ggml_compute_forward_out_prod_f16_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_out_prod_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_scale static void ggml_compute_forward_scale_f32( @@ -10380,7 +10589,7 @@ static void ggml_compute_forward_diag_mask_f32( if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - + // TODO: handle transposed/permuted matrices const int n = ggml_nrows(src0); @@ -10541,7 +10750,7 @@ static void ggml_compute_forward_soft_max_back_f32( if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - + // TODO: handle transposed/permuted matrices const int ith = params->ith; @@ -10580,7 +10789,7 @@ static void ggml_compute_forward_soft_max_back_f32( // dxk = -yk * dot(y, dy) + yk*dyk // dxk = yk * (- dot(y, dy) + dyk) // dxk = yk * (dyk - dot(y, dy)) - // + // // post-order: // dot_y_dy := dot(y, dy) // dx := dy @@ -12611,6 +12820,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor); } break; + case GGML_OP_OUT_PROD: + { + ggml_compute_forward_out_prod(params, tensor->src0, tensor->src1, tensor); + } break; case GGML_OP_SCALE: { ggml_compute_forward_scale(params, tensor->src0, tensor->src1, tensor); @@ -13041,45 +13254,37 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { - // TODO: this requires outer product - ggml_out_prod(ctx, src1, tensor->grad); src0->grad = ggml_add_impl(ctx, src0->grad, - // ds0 = dt.dot(s1.T) - // ggml_out_prod(ctx, // [n,m] - // src1, // [n,p] - // tensor->grad), // [m,p] - // for now just using A*B==(B.T*A.T).T - ggml_mul_mat(ctx, // [n,m] - ggml_cont(ctx, // [p,n] - ggml_transpose(ctx, // [p,n] - src1)), // [n,p] - ggml_cont(ctx, // [p,m] - ggml_transpose(ctx, // [p,m] - tensor->grad))), // [m,p] + ggml_out_prod(ctx, // [n,m] + src1, // [n,p] + tensor->grad), // [m,p] inplace); } if (src1->grad) { src1->grad = ggml_add_impl(ctx, src1->grad, - // ds1 = s0.T.dot(dt): - ggml_mul_mat(ctx, // [n,p] - ggml_cont(ctx, // [m,n] - ggml_transpose(ctx, src0)), // [m,n] - tensor->grad), // [m,p] + // ggml_mul_mat(ctx, // [n,p] + // ggml_cont(ctx, // [m,n] + // ggml_transpose(ctx, src0)), // [m,n] + // tensor->grad), // [m,p] - // // when src0 is bigger than 
tensor->grad (this is the case in llama), + // // when src0 is bigger than tensor->grad (this is mostly the case in llama), // // avoid transpose of src0, rather transpose smaller tensor->grad // // and then use ggml_out_prod - // ggml_out_prod(ctx, // [n,p] - // src0, // [n,m] - // ggml_cont(ctx, // [p,m] - // ggml_transpose(ctx, // [p,m] - // tensor->grad)), // [m,p] + ggml_out_prod(ctx, // [n,p] + src0, // [n,m] + ggml_transpose(ctx, // [p,m] + tensor->grad)), // [m,p] inplace); } } break; + case GGML_OP_OUT_PROD: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_SCALE: { // necessary for llama @@ -13757,6 +13962,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) node->n_tasks = n_threads; } break; case GGML_OP_MUL_MAT: + case GGML_OP_OUT_PROD: { node->n_tasks = n_threads; diff --git a/ggml.h b/ggml.h index 0a0989516265b..aa75fd726b18d 100644 --- a/ggml.h +++ b/ggml.h @@ -292,6 +292,7 @@ extern "C" { GGML_OP_RMS_NORM_BACK, GGML_OP_MUL_MAT, + GGML_OP_OUT_PROD, GGML_OP_SCALE, GGML_OP_SET, @@ -643,14 +644,22 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); - // A: m rows, n columns - // B: p rows, n columns (i.e. we transpose it internally) + // A: n columns, m rows + // B: n columns, p rows (i.e. we transpose it internally) // result is m columns, p rows GGML_API struct ggml_tensor * ggml_mul_mat( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b); + // A: m columns, n rows, + // B: p columns, n rows, + // result is m columns, p rows + GGML_API struct ggml_tensor * ggml_out_prod( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + // // operations on tensors without backpropagation // From f3cf7df21fcd95ca618caa6c57e63483cdc3eb12 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 15 May 2023 14:18:57 +0200 Subject: [PATCH 15/86] better weight initialization improves training convergence at start --- examples/baby-llama/baby-llama.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 5573c154b5622..e5639da37e576 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -79,34 +79,39 @@ struct ggml_tensor * randomize_tensor_normal( int ndims, const int64_t ne[], struct random_normal_distribution * rnd) { + float scale = 1.0; // xavier switch (ndims) { case 1: + scale /= sqrtf(ne[0]); for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i0] = frand_normal(rnd); + ((float *)tensor->data)[i0] = scale * frand_normal(rnd); } break; case 2: + scale /= sqrtf(ne[0]+ne[1]); for (int i1 = 0; i1 < ne[1]; i1++) { for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i1*ne[0] + i0] = frand_normal(rnd); + ((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd); } } break; case 3: + scale /= sqrtf(ne[0]+ne[1]); for (int i2 = 0; i2 < ne[2]; i2++) { for (int i1 = 0; i1 < ne[1]; i1++) { for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd); + ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd); } } } break; case 4: + scale /= sqrtf(ne[0]+ne[1]); for (int i3 = 0; i3 < ne[3]; i3++) { for (int i2 = 0; i2 < ne[2]; i2++) { for (int i1 = 0; i1 < ne[1]; i1++) { for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd); + ((float 
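
The "xavier" comment refers to Glorot/Xavier initialization: drawing weights with a scale of roughly 1/sqrt(fan_in + fan_out) keeps activation and gradient magnitudes from growing or shrinking layer by layer, which is why convergence at the start of training improves. For example, a 128x128 weight gets scale 1/sqrt(256) = 0.0625 instead of the unscaled noise used before. (The baby-llama-text variant in the next patch divides by sqrt(ne[0]*ne[1]) in the 2-D case rather than by the square root of the sum.)
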
*)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd); } } } From 19fb91899bad98423a5a14cea8cfa22a3334432d Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 15 May 2023 14:19:38 +0200 Subject: [PATCH 16/86] better weight initialization improves training convergence at start --- examples/baby-llama/baby-llama-text.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 9f2ff90340289..b5177ed5b99dd 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -51,38 +51,43 @@ float frand_uniform(struct random_uniform_distribution * rnd) { } struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { + float scale = 1.0f; // xavier switch (tensor->n_dims) { case 1: + scale /= sqrtf(tensor->ne[0]); for (int i0 = 0; i0 < tensor->ne[0]; i0++) { float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); - *dst = frand_normal(rnd); + *dst = scale * frand_normal(rnd); } break; case 2: + scale /= sqrtf(tensor->ne[0]*tensor->ne[1]); for (int i1 = 0; i1 < tensor->ne[1]; i1++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) { float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *dst = frand_normal(rnd); + *dst = scale * frand_normal(rnd); } } break; case 3: + scale /= sqrtf(tensor->ne[0]*tensor->ne[1]); for (int i2 = 0; i2 < tensor->ne[2]; i2++) { for (int i1 = 0; i1 < tensor->ne[1]; i1++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) { float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *dst = frand_normal(rnd); + *dst = scale * frand_normal(rnd); } } } break; case 4: + scale /= sqrtf(tensor->ne[0]*tensor->ne[1]); for (int i3 = 0; i3 < tensor->ne[3]; i3++) { for (int i2 = 0; i2 < tensor->ne[2]; i2++) { for (int i1 = 0; i1 < tensor->ne[1]; i1++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) { float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); - *dst = frand_normal(rnd); + *dst = scale * frand_normal(rnd); } } } From ec881156f6ad56ed552c06698083f5a263ff0a6d Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 15 May 2023 14:42:24 +0200 Subject: [PATCH 17/86] improve ggml_out_prod performance - change iteration order (>15s -> 10s runtime) - parallelize over one more dimension: over dst matrix rows (10s -> <5s runtime) --- ggml.c | 51 +++++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/ggml.c b/ggml.c index 77b654809972a..52a9c9bcc0abb 100644 --- a/ggml.c +++ b/ggml.c @@ -9917,51 +9917,50 @@ static void ggml_compute_forward_out_prod_f32( return; } - // parallelize by last two dimensions + // parallelize by last three dimensions - // total parallel in src0 - const int64_t np = ne02*ne03; + // total rows in dst + const int64_t nr = ne1*ne2*ne3; - // per thread - const int64_t dp = (np + nth - 1)/nth; + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; - // range for this thread - const int64_t ip0 = dp*ith; - const int64_t ip1 = MIN(ip0 + dp, np); + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); // dst[:,:,:,:] = 0 // for i2,i3: - // for i01: - // for i1: + // for i1: + // for i01: // for i0: // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] - for (int64_t 
ip = ip0; ip < ip1; ++ip) { - // src0 indices - const int64_t i3 = ip/ne02; - const int64_t i2 = ip - i3*ne02; - + for (int64_t ir = ir0; ir < ir1; ++ir) { + // dst indices + const int64_t i3 = ir/(ne2*ne1); + const int64_t i2 = (ir - i3*ne2*ne1)/ne1; + const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); + const int64_t i02 = i2; const int64_t i03 = i3; + const int64_t i10 = i1; const int64_t i12 = i2; const int64_t i13 = i3; + for (int64_t i01 = 0; i01 < ne01; ++i01) { const int64_t i11 = i01; - for (int64_t i1 = 0; i1 < ne1; ++i1) { - const int64_t i10 = i1; - - float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); - float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); - float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); - ggml_vec_mad_f32(ne0, d, s0, *s1); - // for (int64_t i0 = 0; i0 < ne0; ++i0) { - // d[i0] += s0[i0] * s1[i1]; - // } - } + ggml_vec_mad_f32(ne0, d, s0, *s1); + // for (int64_t i0 = 0; i0 < ne0; ++i0) { + // d[i0] += s0[i0] * s1[i1]; + // } } } From e063135d0bcbdec80b54e182faa7954771d9d989 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 15 May 2023 21:12:28 +0200 Subject: [PATCH 18/86] add llama sampler, shuffle samples and constrain sampling to tokens occurring in train data --- examples/baby-llama/baby-llama-text.cpp | 235 ++++++++++++++++++++++-- 1 file changed, 219 insertions(+), 16 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index b5177ed5b99dd..ed7dc9666e4ba 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -7,6 +7,7 @@ #include #include #include +#include struct random_normal_distribution { @@ -42,6 +43,10 @@ float fclamp(const float v, const float min, const float max) { return ((v < min) ? (min) : (v > max) ? 
(max) : v); } +float frand() { + return (float)rand()/(float)RAND_MAX; +} + float frand_normal(struct random_normal_distribution * rnd) { return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); } @@ -162,6 +167,17 @@ uint32_t get_n_ff(const struct my_llama_hparams* hparams) { return n_ff; } +void print_params(struct my_llama_hparams * params) { + printf("%s: n_vocab: %d\n", __func__, params->n_vocab); + printf("%s: n_ctx: %d\n", __func__, params->n_ctx); + printf("%s: n_embd: %d\n", __func__, params->n_embd); + printf("%s: n_mult: %d\n", __func__, params->n_mult); + printf("%s: n_head: %d\n", __func__, params->n_head); + printf("%s: n_ff: %d\n", __func__, get_n_ff(params)); + printf("%s: n_layer: %d\n", __func__, params->n_layer); + printf("%s: n_rot: %d\n", __func__, params->n_rot); +} + struct my_llama_layer { // normalization struct ggml_tensor * attention_norm; @@ -989,18 +1005,17 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) } } -void get_example_targets(const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { +void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { int n_tokens = tokens_input->ne[0]; int n_vocab = targets->ne[0]; - int n_examples = (n_train_data / (size_t) n_tokens); - int begin = (example_id % n_examples) * n_tokens; - GGML_ASSERT(begin+n_tokens-1 < n_train_data); + int sample = train_samples[example_id % n_train_samples]; + GGML_ASSERT(sample+n_tokens-1 < n_train_data); - ggml_set_f32(targets, -1.0f); + ggml_set_f32(targets, -1.0f/n_vocab); ggml_set_i32_1d(tokens_input, 0, llama_token_bos()); for (int i=1; in_dims == 2); GGML_ASSERT( targets->n_dims == 3); int n_tokens = tokens_input->ne[0]; @@ -1028,7 +1043,7 @@ void get_example_targets_batch(struct ggml_context * ctx, const llama_token * tr targets->nb[1], k*targets->nb[2]); - get_example_targets(train_data, n_train_data, + get_example_targets(train_samples, n_train_samples, train_data, n_train_data, example_id*n_batch + k, tokens_input_k, targets_k); } } @@ -1171,10 +1186,11 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto struct llama_file f(filename, "rb"); std::vector buf; - buf.resize(f.size); + buf.resize(f.size+1); f.read_raw(buf.data(), f.size); - + buf[f.size] = '\0'; + out.resize(buf.size()); int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false); @@ -1186,6 +1202,143 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto return n_tokens; } +void shuffle_ints(int * begin, int * end) { + if (end <= begin) return; + int max=begin[0]; + for (int i=1; i max) { + max = begin[i]; + } + } + std::vector vals; + vals.resize(max+1); + for (int i=0; i candidates; + llama_token_data_array candidates_p; + +}; + +void init_sampler(struct my_llama_sampler * sampler, struct llama_context * ctx) { + sampler->ctx = ctx; + sampler->n_vocab = llama_n_vocab(sampler->ctx); + sampler->n_ctx = llama_n_ctx(sampler->ctx); + sampler->mirostat_mu = 2.0f * sampler->params.mirostat_tau; +} + +llama_token sample(struct my_llama_sampler * sampler, float * logits, const llama_token * last_tokens, int n_last_tokens) { + GGML_ASSERT(sampler->ctx != NULL); + + struct llama_context * ctx = sampler->ctx; + + sampler->candidates.resize(sampler->n_vocab); + for (llama_token token_id = 0; 
token_id < sampler->n_vocab; ++token_id) { + sampler->candidates[token_id].id = token_id; + sampler->candidates[token_id].logit = logits[token_id]; + sampler->candidates[token_id].p = 0.0; + } + + llama_token_data_array * candidates_p = & sampler->candidates_p; + + candidates_p->data = sampler->candidates.data(); + candidates_p->size = sampler->candidates.size(); + candidates_p->sorted = false; + + const auto params = sampler->params; + + // Apply penalties + const float nl_logit = logits[llama_token_nl()]; + + const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx); + + llama_sample_repetition_penalty( + ctx, + candidates_p, + last_tokens + n_last_tokens - n_last, + n_last, + params.repeat_penalty); + llama_sample_frequency_and_presence_penalties( + ctx, + candidates_p, + last_tokens + n_last_tokens - n_last, + n_last, + params.alpha_frequency, + params.alpha_presence); + + if (!params.penalize_nl) { + logits[llama_token_nl()] = nl_logit; + } + + llama_token token = 0; + if (params.temp <= 0) { + // Greedy sampling + token = llama_sample_token_greedy(ctx, candidates_p); + } else { + if (params.mirostat == 1) { + int mirostat_m = 100; + llama_sample_temperature(ctx, candidates_p, params.temp); + token = llama_sample_token_mirostat(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, mirostat_m, &sampler->mirostat_mu); + } else if (params.mirostat == 2) { + llama_sample_temperature(ctx, candidates_p, params.temp); + token = llama_sample_token_mirostat_v2(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, &sampler->mirostat_mu); + } else { + // Temperature sampling + llama_sample_top_k (ctx, candidates_p, params.top_k, 1); + llama_sample_tail_free (ctx, candidates_p, params.tfs_z, 1); + llama_sample_typical (ctx, candidates_p, params.typical_p, 1); + + llama_sample_top_p (ctx, candidates_p, params.top_p, 1); + llama_sample_temperature (ctx, candidates_p, params.temp); + token = llama_sample_token(ctx, candidates_p); + } + } + return token; +} + +void set_logits_masked(struct ggml_tensor * logits, std::vector& mask, float value) { + GGML_ASSERT(logits->ne[0] == mask.size()); + for (int i2 = 0; i2 < logits->ne[2]; ++i2) { + for (int i1 = 0; i1 < logits->ne[1]; ++i1) { + for (int i0 = 0; i0 < logits->ne[0]; ++i0) { + if (!mask[i0]) continue; + float * ptr = (float *) ((char *) logits->data + i2*logits->nb[2] + i1*logits->nb[1] + i0*logits->nb[0]); + *ptr = value; + } + } + } +} + int main(int argc, char ** argv) { const char * default_model = "ggml-vic7b-uncensored-q4_0.bin"; const char * default_train = "shakespeare.txt"; @@ -1220,6 +1373,17 @@ int main(int argc, char ** argv) { model.hparams.n_layer = 1; model.hparams.n_rot = std::min(16u, model.hparams.n_embd / model.hparams.n_head); + print_params(&model.hparams); + + std::vector token_occurs; + std::vector token_notavail; + token_occurs.resize(model.hparams.n_vocab, false); + token_notavail.resize(model.hparams.n_vocab, true); + for (int i=0; i train_samples; + for (int i=0; i= train_samples.size()) { + shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); + for (int i=0; idata + i*logits->nb[2] + k*logits->nb[1]), + (llama_token *) ((char *) tokens_input->data + i*tokens_input->nb[1]), + k); + * ((int32_t *) ((char *) after_opt_best_samples->data + i*after_opt_best_samples->nb[1] + k*after_opt_best_samples->nb[0])) = token; + } + } + + // sample_softmax_batch(ctx0, logits, after_opt_probs, after_opt_best_samples); // printf("probabilities after 
optimization:\n"); // print_matrix(after_opt_probs); printf("Example:\n---\n"); print_tokens_batch(lctx, tokens_input); printf("\n---\n"); - printf("best samples after optimization:\n---\n"); + // printf("best samples after optimization:\n---\n"); + printf("samples after optimization:\n---\n"); print_tokens_batch(lctx, after_opt_best_samples); printf("\n---\n"); } @@ -1320,13 +1517,15 @@ int main(int argc, char ** argv) { { int n_gen = 128; int sample_ctx = n_tokens - n_tokens/8; + + init_sampler(&sampler, lctx); printf("Generating %d tokens.\n", n_gen); struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens); struct ggml_tensor * targets = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); - get_example_targets(train_tokens.data(), train_tokens.size(), 137, tokens_input, targets); + get_example_targets(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), 137, tokens_input, targets); for (int i=sample_ctx; idata + (sample_ctx-1)*logits->nb[1]), + (llama_token *) tokens_input->data, + sample_ctx-1); + // sample_softmax(logits, probs, best_samples); + //int token = ggml_get_i32_1d(best_samples, sample_ctx-1); // print_row(probs, sample_at); print_token(lctx, token); From d328472f16b55ea206b1a30fa043c0e5df444bc6 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 17 May 2023 12:52:10 +0200 Subject: [PATCH 19/86] fix get_samples call, add model tensor names, increase model size, start training samples after newline --- examples/baby-llama/baby-llama-text.cpp | 117 +++++++++++++++--------- 1 file changed, 72 insertions(+), 45 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index ed7dc9666e4ba..2de3171f1c675 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -8,6 +8,7 @@ #include #include #include +#include struct random_normal_distribution { @@ -162,22 +163,6 @@ struct my_llama_hparams { } }; -uint32_t get_n_ff(const struct my_llama_hparams* hparams) { - const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; - return n_ff; -} - -void print_params(struct my_llama_hparams * params) { - printf("%s: n_vocab: %d\n", __func__, params->n_vocab); - printf("%s: n_ctx: %d\n", __func__, params->n_ctx); - printf("%s: n_embd: %d\n", __func__, params->n_embd); - printf("%s: n_mult: %d\n", __func__, params->n_mult); - printf("%s: n_head: %d\n", __func__, params->n_head); - printf("%s: n_ff: %d\n", __func__, get_n_ff(params)); - printf("%s: n_layer: %d\n", __func__, params->n_layer); - printf("%s: n_rot: %d\n", __func__, params->n_rot); -} - struct my_llama_layer { // normalization struct ggml_tensor * attention_norm; @@ -221,6 +206,22 @@ struct my_llama_model { std::vector layers; }; +uint32_t get_n_ff(const struct my_llama_hparams* hparams) { + const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; + return n_ff; +} + +void print_params(struct my_llama_hparams * params) { + printf("%s: n_vocab: %d\n", __func__, params->n_vocab); + printf("%s: n_ctx: %d\n", __func__, params->n_ctx); + printf("%s: n_embd: %d\n", __func__, params->n_embd); + printf("%s: n_mult: %d\n", __func__, params->n_mult); + printf("%s: n_head: %d\n", __func__, params->n_head); + printf("%s: n_ff: %d\n", __func__, get_n_ff(params)); + printf("%s: n_layer: %d\n", __func__, params->n_layer); + printf("%s: n_rot: %d\n", __func__, params->n_rot); +} + void 
init_model(struct my_llama_model * model) { const auto & hparams = model->hparams; @@ -232,32 +233,48 @@ void init_model(struct my_llama_model * model) { struct ggml_context * ctx = model->ctx; - model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab}); - model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // ("norm.weight", {n_embd}); - model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("output.weight", {n_embd, n_vocab}); + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); + model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); + + ggml_set_name(model->tok_embeddings, "tok_embeddings.weight"); + ggml_set_name(model->norm, "norm.weight"); + ggml_set_name(model->output, "output.weight"); model->layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model->layers[i]; - // std::string layers_i = "layers." + std::to_string(i); + std::string layers_i = "layers." + std::to_string(i); + + layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + + layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd}); + layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); + layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); + layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); - layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd}); - layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd}); - layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd}); - layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd}); + ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str()); + + ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str()); + ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str()); + ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str()); + ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str()); - layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd}); + ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str()); - layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}); - layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}); - layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}); + ggml_set_name(layer.w1, (layers_i + ".feed_forward.w1.weight").c_str()); + ggml_set_name(layer.w2, (layers_i + ".feed_forward.w2.weight").c_str()); + ggml_set_name(layer.w3, (layers_i + 
".feed_forward.w3.weight").c_str()); } } - void set_param_model(struct my_llama_model * model) { const auto& hparams = model->hparams; @@ -676,7 +693,6 @@ struct ggml_tensor * forward_batch( cur), n_embd, N, n_batch), 1, 0, 2, 3)); - assert_shape_3d(Vcur, N, n_embd, n_batch); // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] @@ -1366,12 +1382,12 @@ int main(int argc, char ** argv) { struct my_llama_model model; model.hparams.n_vocab = llama_n_vocab(lctx); - model.hparams.n_ctx = 16; - model.hparams.n_embd = 64; - model.hparams.n_mult = 8; - model.hparams.n_head = 8; - model.hparams.n_layer = 1; - model.hparams.n_rot = std::min(16u, model.hparams.n_embd / model.hparams.n_head); + model.hparams.n_ctx = 32; + model.hparams.n_embd = 128; + model.hparams.n_mult = 64; + model.hparams.n_head = 16; + model.hparams.n_layer = 4; + model.hparams.n_rot = std::min(64u, model.hparams.n_embd / model.hparams.n_head); print_params(&model.hparams); @@ -1383,13 +1399,18 @@ int main(int argc, char ** argv) { token_occurs[train_tokens[i]] = true; token_notavail[train_tokens[i]] = false; } + int n_unique_tokens = 0; + for (int i=0; i train_samples; - for (int i=0; i Date: Wed, 17 May 2023 13:49:32 +0200 Subject: [PATCH 20/86] save train trained model to checkpoint and load model to be trained from checkpoint --- examples/baby-llama/baby-llama-text.cpp | 149 ++++++++++++++++++++++-- 1 file changed, 140 insertions(+), 9 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 2de3171f1c675..542b5e3866530 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -204,6 +204,9 @@ struct my_llama_model { struct ggml_tensor * output; std::vector layers; + + uint32_t train_its = 0; + uint32_t train_samples = 0; }; uint32_t get_n_ff(const struct my_llama_hparams* hparams) { @@ -1124,11 +1127,12 @@ struct llama_file { llama_file(const char * fname, const char * mode) { fp = std::fopen(fname, mode); if (fp == NULL) { - throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); + size = 0; + } else { + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); } size_t tell() const { @@ -1355,18 +1359,135 @@ void set_logits_masked(struct ggml_tensor * logits, std::vector& mask, flo } } +enum llama_file_version { + LLAMA_FILE_VERSION_GGML, + LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab + LLAMA_FILE_VERSION_GGJT_V1, // added padding + LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format +}; + +void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { + const char * name = ggml_get_name(tensor); + uint32_t name_len = strlen(name); + uint32_t nd = tensor->n_dims; + uint32_t ne[4] = { tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3] }; + file->write_u32(nd); + file->write_u32(name_len); + file->write_u32(tensor->type); + file->write_raw(ne, sizeof(ne[0]) * nd); + file->write_raw(name, name_len); + file->seek(-file->tell() & 31, SEEK_CUR); + file->write_raw(tensor->data, ggml_nbytes(tensor)); +} + +void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { + uint32_t nd = file->read_u32(); + GGML_ASSERT(nd == tensor->n_dims); + uint32_t name_len = file->read_u32(); + enum ggml_type type = (enum ggml_type) file->read_u32(); + GGML_ASSERT(type == tensor->type); + uint32_t ne[4]; + file->read_raw(ne, sizeof(ne[0]) * nd); + for (int i=0; ine[i]); + } + std::string name = 
file->read_string(name_len); + file->seek(-file->tell() & 31, SEEK_CUR); + + GGML_ASSERT(strcmp(ggml_get_name(tensor), name.c_str()) == 0); + file->read_raw(tensor->data, ggml_nbytes(tensor)); +} + +void save_model(struct my_llama_model * model, const char * filename) { + struct llama_file file(filename, "wb"); + if (file.fp == NULL) { + return; + } + file.write_u32(model->train_its); + file.write_u32(model->train_samples); + file.write_u32(model->hparams.n_vocab); + file.write_u32(model->hparams.n_embd); + file.write_u32(model->hparams.n_mult); + file.write_u32(model->hparams.n_head); + file.write_u32(model->hparams.n_layer); + file.write_u32(model->hparams.n_rot); + + write_tensor(&file, model->tok_embeddings); + write_tensor(&file, model->norm); + write_tensor(&file, model->output); + + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + auto & layer = model->layers[i]; + + write_tensor(&file, layer.attention_norm); + write_tensor(&file, layer.wq); + write_tensor(&file, layer.wk); + write_tensor(&file, layer.wv); + write_tensor(&file, layer.wo); + write_tensor(&file, layer.ffn_norm); + write_tensor(&file, layer.w1); + write_tensor(&file, layer.w2); + write_tensor(&file, layer.w3); + } +} + +void load_model(struct my_llama_model * model, const char * filename, bool init) { + struct llama_file file(filename, "rb"); + + if (file.fp) { + printf("%s: Loading model from '%s'.\n", __func__, filename); + model->train_its = file.read_u32(); + model->train_samples = file.read_u32(); + model->hparams.n_vocab = file.read_u32(); + model->hparams.n_embd = file.read_u32(); + model->hparams.n_mult = file.read_u32(); + model->hparams.n_head = file.read_u32(); + model->hparams.n_layer = file.read_u32(); + model->hparams.n_rot = file.read_u32(); + printf("%s: Training iterations: %u.\n", __func__, model->train_its); + printf("%s: Training samples: %u.\n", __func__, model->train_samples); + print_params(&model->hparams); + } + + if (init) { + init_model(model); + } + + if (file.fp) { + read_tensor(&file, model->tok_embeddings); + read_tensor(&file, model->norm); + read_tensor(&file, model->output); + + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + auto & layer = model->layers[i]; + + read_tensor(&file, layer.attention_norm); + read_tensor(&file, layer.wq); + read_tensor(&file, layer.wk); + read_tensor(&file, layer.wv); + read_tensor(&file, layer.wo); + read_tensor(&file, layer.ffn_norm); + read_tensor(&file, layer.w1); + read_tensor(&file, layer.w2); + read_tensor(&file, layer.w3); + } + } +} + int main(int argc, char ** argv) { const char * default_model = "ggml-vic7b-uncensored-q4_0.bin"; const char * default_train = "shakespeare.txt"; - const char * default_argv[3] = {argv[0], default_model, default_train}; + const char * default_checkpoint = "checkpoint.bin"; + const char * default_argv[4] = {argv[0], default_model, default_train, default_checkpoint}; - if (argc < 3) { + if (argc < 4) { fprintf(stderr, "usage: %s model training_data\n", argv[0]); //return 1; } const char * fn_model = (argc >= 2) ? argv[1] : default_argv[1]; const char * fn_train = (argc >= 3) ? argv[2] : default_argv[2]; + const char * fn_chkpt = (argc >= 4) ? 
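
For reference, the checkpoint written here is a flat binary file: a header of eight u32 fields (train_its, train_samples, n_vocab, n_embd, n_mult, n_head, n_layer, n_rot; n_ctx is not stored), followed by one record per tensor in the fixed order used by save_model. Each record, as laid out by write_tensor, is:

    u32 n_dims | u32 name_len | u32 type | u32 ne[n_dims] | name bytes | padding up to the next 32-byte file offset | raw tensor data (ggml_nbytes)

load_model reads the hparams first, allocates matching tensors via init_model, and read_tensor then asserts that dimensions, type and name of each record match the tensor it is read into, so a checkpoint can only be loaded into a model with identical hyperparameters.
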
argv[3] : default_argv[3]; struct llama_context_params llama_params = llama_context_default_params(); llama_params.vocab_only = true; @@ -1420,7 +1541,7 @@ int main(int argc, char ** argv) { my_llama_sampler sampler; printf("%s: init model\n", __func__); - init_model(&model); + load_model(&model, fn_chkpt, true); set_param_model(&model); randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f); init_kv_cache(&kv_self, &model, n_batch); @@ -1498,8 +1619,16 @@ int main(int argc, char ** argv) { opt_params_lbfgs.print_backward_graph = false; opt_params_lbfgs.n_threads = gf.n_threads; opt_params_lbfgs.lbfgs.n_iter = 16; - ggml_opt(ctx0, opt_params_adam, e); - // ggml_opt(ctx0, opt_params_lbfgs, e); + + bool use_adam = true; + if (use_adam) { + ggml_opt(ctx0, opt_params_adam, e); + } else { + ggml_opt(ctx0, opt_params_lbfgs, e); + } + + model.train_its += use_adam ? opt_params_adam.adam.n_iter : opt_params_lbfgs.lbfgs.n_iter; + model.train_samples += n_batch; ggml_build_forward_expand(&gf, e); ggml_graph_compute(ctx0, &gf); @@ -1541,6 +1670,8 @@ int main(int argc, char ** argv) { ggml_free(ctx0); } + save_model(&model, fn_chkpt); + { int n_gen = 128; int sample_ctx = n_tokens - n_tokens/8; From 25fe1c3815eec962d70516993844635d116bf30f Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 19 May 2023 14:53:21 +0200 Subject: [PATCH 21/86] use inplace functions where possible --- examples/baby-llama/baby-llama-text.cpp | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 542b5e3866530..7517e203737e3 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -421,8 +421,8 @@ struct ggml_tensor * forward( // wk shape [n_embd, n_embd, 1, 1] // Qcur shape [n_embd/n_head, n_head, N, 1] // Kcur shape [n_embd/n_head, n_head, N, 1] - struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); - struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); // store key and value to memory { @@ -447,8 +447,8 @@ struct ggml_tensor * forward( ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } //*/ - kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - vc = ggml_set_2d(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v), + kc = ggml_set_1d_inplace(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + vc = ggml_set_2d_inplace(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); } @@ -678,8 +678,8 @@ struct ggml_tensor * forward_batch( // wk shape [n_embd, n_embd, 1, 1] // Qcur shape [n_embd/n_head, n_head, N, n_batch] // Kcur shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, 
n_head, N, n_batch), n_past, n_rot, 0); - struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); @@ -714,11 +714,11 @@ struct ggml_tensor * forward_batch( ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } //*/ - kc = ggml_set_2d(ctx0, kc, + kc = ggml_set_2d_inplace(ctx0, kc, ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch), ggml_element_size(kc)*n_embd*n_ctx, (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past)); - vc = ggml_set_2d(ctx0, vc, + vc = ggml_set_2d_inplace(ctx0, vc, ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch), ggml_element_size(vc)*n_ctx*n_embd, ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx)); @@ -760,19 +760,19 @@ struct ggml_tensor * forward_batch( // KQ_scaled = KQ / sqrt(n_embd/n_head) // KQ_scaled shape [n_past + N, N, n_head, n_batch] struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, + ggml_scale_inplace(ctx0, KQ, ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch); // KQ_masked = mask_past(KQ_scaled) // KQ_masked shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch); // KQ = soft_max(KQ_masked) // KQ_soft_max shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch); // split cached V into n_head heads @@ -816,7 +816,7 @@ struct ggml_tensor * forward_batch( // lctx.use_buf(ctx0, 1); // inpFF shape [n_embd,N*n_batch,1,1] - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); assert_shape_2d(inpFF, n_embd, N*n_batch); // feed-forward network @@ -864,7 +864,7 @@ struct ggml_tensor * forward_batch( } // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_add(ctx0, cur, inpFF); + cur = ggml_add_inplace(ctx0, cur, inpFF); assert_shape_2d(cur, n_embd, N*n_batch); // input for next layer From d8b066642965f738c48594a944c3f63e607b0f70 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 19 May 2023 18:29:47 +0200 Subject: [PATCH 22/86] initialize rng with srand --- examples/baby-llama/baby-llama-text.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 7517e203737e3..aa8c3ace49564 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -3,10 +3,12 @@ #include #include #include -#include #include #include #include +#include +#include +#include #include #include @@ -1485,6 +1487,8 @@ int main(int argc, char ** argv) { //return 1; } + srand(time(NULL)); + const char * fn_model = (argc >= 2) ? 
argv[1] : default_argv[1]; const char * fn_train = (argc >= 3) ? argv[2] : default_argv[2]; const char * fn_chkpt = (argc >= 4) ? argv[3] : default_argv[3]; From 44d83558bc9783831c6353799324ff8d7ff089b8 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 19 May 2023 18:34:18 +0200 Subject: [PATCH 23/86] use different arguments for input and output checkpoint --- examples/baby-llama/baby-llama-text.cpp | 48 +++++++++++++++---------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index aa8c3ace49564..5c019f4bba009 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1433,7 +1433,7 @@ void save_model(struct my_llama_model * model, const char * filename) { } } -void load_model(struct my_llama_model * model, const char * filename, bool init) { +bool load_model(struct my_llama_model * model, const char * filename, bool init) { struct llama_file file(filename, "rb"); if (file.fp) { @@ -1474,24 +1474,28 @@ void load_model(struct my_llama_model * model, const char * filename, bool init) read_tensor(&file, layer.w3); } } + + return (file.fp != NULL); } int main(int argc, char ** argv) { const char * default_model = "ggml-vic7b-uncensored-q4_0.bin"; const char * default_train = "shakespeare.txt"; - const char * default_checkpoint = "checkpoint.bin"; - const char * default_argv[4] = {argv[0], default_model, default_train, default_checkpoint}; + const char * default_chkpt_in = "checkpoint.bin"; + const char * default_chkpt_out = "checkpoint.bin"; + const char * default_argv[5] = {argv[0], default_model, default_train, default_chkpt_in, default_chkpt_out}; - if (argc < 4) { - fprintf(stderr, "usage: %s model training_data\n", argv[0]); + if (argc < 5) { + fprintf(stderr, "usage: %s model training_data chkpt_in chkpt_out\n", argv[0]); //return 1; } srand(time(NULL)); - const char * fn_model = (argc >= 2) ? argv[1] : default_argv[1]; - const char * fn_train = (argc >= 3) ? argv[2] : default_argv[2]; - const char * fn_chkpt = (argc >= 4) ? argv[3] : default_argv[3]; + const char * fn_model = (argc >= 2) ? argv[1] : default_argv[1]; + const char * fn_train = (argc >= 3) ? argv[2] : default_argv[2]; + const char * fn_chkpt_in = (argc >= 4) ? argv[3] : default_argv[3]; + const char * fn_chkpt_out = (argc >= 5) ? argv[4] : default_argv[4]; struct llama_context_params llama_params = llama_context_default_params(); llama_params.vocab_only = true; @@ -1516,17 +1520,20 @@ int main(int argc, char ** argv) { print_params(&model.hparams); - std::vector token_occurs; - std::vector token_notavail; - token_occurs.resize(model.hparams.n_vocab, false); + std::vector token_noccurs; + std::vector token_notavail; + token_noccurs.resize(model.hparams.n_vocab, 0); token_notavail.resize(model.hparams.n_vocab, true); for (int i=0; i token_freq; + token_freq.resize(model.hparams.n_vocab, 0); int n_unique_tokens = 0; - for (int i=0; i 0) ? 
1 : 0; } printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens); @@ -1545,9 +1552,12 @@ int main(int argc, char ** argv) { my_llama_sampler sampler; printf("%s: init model\n", __func__); - load_model(&model, fn_chkpt, true); + bool existed = load_model(&model, fn_chkpt_in, true); + bool from_scratch = !existed; set_param_model(&model); - randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f); + if (from_scratch) { + randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f); + } init_kv_cache(&kv_self, &model, n_batch); init_sampler(&sampler, lctx); @@ -1559,10 +1569,12 @@ int main(int argc, char ** argv) { int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; + bool samples_start_after_nl = false; + std::vector train_samples; train_samples.push_back(0); for (int i=1; i Date: Fri, 19 May 2023 18:35:40 +0200 Subject: [PATCH 24/86] ggml fixes to support backward pass on inplace operations --- ggml.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ggml.c b/ggml.c index 52a9c9bcc0abb..7039a3cec45b7 100644 --- a/ggml.c +++ b/ggml.c @@ -4334,7 +4334,7 @@ struct ggml_tensor * ggml_add_impl( bool is_node = false; - if (!inplace && (a->grad || b->grad)) { + if (a->grad || b->grad) { is_node = true; } @@ -5248,7 +5248,7 @@ struct ggml_tensor * ggml_set_impl( bool is_node = false; - if (!inplace && (a->grad || b->grad)) { + if (a->grad || b->grad) { is_node = true; } @@ -6016,7 +6016,7 @@ struct ggml_tensor * ggml_rope_impl( GGML_ASSERT(n_past >= 0); bool is_node = false; - if (!inplace && a->grad) { + if (a->grad) { is_node = true; } @@ -6065,8 +6065,7 @@ struct ggml_tensor * ggml_rope_back( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; + is_node = false; // TODO: implement backward } struct ggml_tensor * result = ggml_dup_tensor(ctx, a); From 09b304d01540ac2efb00ef3b1d9706da08d0c2bf Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 19 May 2023 18:36:05 +0200 Subject: [PATCH 25/86] remove duplicate include --- examples/baby-llama/baby-llama-text.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 5c019f4bba009..b56441f9a7ec2 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include From da86a1d736f02793ea6a0942d6a5a72427fe55b8 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 19 May 2023 18:39:38 +0200 Subject: [PATCH 26/86] fix cross entropy loss - add target probabilities for each sample which is then used in cross entropy loss --- examples/baby-llama/baby-llama-text.cpp | 111 ++++++++++++++---------- 1 file changed, 64 insertions(+), 47 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index b56441f9a7ec2..e65d2d1867c32 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1025,78 +1025,93 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) } } -void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { +void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct 
ggml_tensor * target_logits, struct ggml_tensor * target_probs) { int n_tokens = tokens_input->ne[0]; - int n_vocab = targets->ne[0]; + int n_vocab = target_logits->ne[0]; + + const float eps = 1e-6f; + const float target_prob = 1.0f; int sample = train_samples[example_id % n_train_samples]; GGML_ASSERT(sample+n_tokens-1 < n_train_data); - ggml_set_f32(targets, -1.0f/n_vocab); + ggml_set_f32(target_logits, -1.0f/n_vocab); + ggml_set_f32(target_probs, 0.0f); ggml_set_i32_1d(tokens_input, 0, llama_token_bos()); for (int i=1; in_dims == 2); - GGML_ASSERT( targets->n_dims == 3); +void get_example_targets_batch(struct ggml_context * ctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { + GGML_ASSERT(tokens_input->n_dims == 2); + GGML_ASSERT(target_logits->n_dims == 3); + GGML_ASSERT(target_probs->n_dims == 3); int n_tokens = tokens_input->ne[0]; int n_batch = tokens_input->ne[1]; - GGML_ASSERT(n_tokens == targets->ne[1]); - GGML_ASSERT(n_batch == targets->ne[2]); + GGML_ASSERT(n_tokens == target_logits->ne[1]); + GGML_ASSERT(n_batch == target_logits->ne[2]); + GGML_ASSERT(n_tokens == target_probs->ne[1]); + GGML_ASSERT(n_batch == target_probs->ne[2]); for (int k=0; kne[0], k*tokens_input->nb[1]); - struct ggml_tensor * targets_k = ggml_view_2d(ctx, - targets, - targets->ne[0], - targets->ne[1], - targets->nb[1], - k*targets->nb[2]); + struct ggml_tensor * target_logits_k = ggml_view_2d(ctx, + target_logits, + target_logits->ne[0], + target_logits->ne[1], + target_logits->nb[1], + k*target_logits->nb[2]); + + struct ggml_tensor * target_probs_k = ggml_view_2d(ctx, + target_probs, + target_probs->ne[0], + target_probs->ne[1], + target_probs->nb[1], + k*target_probs->nb[2]); get_example_targets(train_samples, n_train_samples, train_data, n_train_data, - example_id*n_batch + k, tokens_input_k, targets_k); + example_id*n_batch + k, tokens_input_k, target_logits_k, target_probs_k); } } -void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) { +void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs, int n_shift) { int n_tokens = tokens_input->ne[0]; - int n_vocab = targets->ne[0]; + int n_vocab = target_logits->ne[0]; for (int i=0; idata + (sample_ctx-1)*logits->nb[1]), (llama_token *) tokens_input->data, @@ -1739,7 +1756,7 @@ int main(int argc, char ** argv) { // print_row(probs, sample_at); print_token(lctx, token); - lshift_examples(tokens_input, targets, 1); + lshift_examples(tokens_input, target_logits, target_probs, 1); ggml_set_i32_1d(tokens_input, 0, 0); ggml_set_i32_1d(tokens_input, sample_ctx-1, token); From e19ead6e3f30a8c0c944e238d90caf5902c92415 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 19 May 2023 18:40:20 +0200 Subject: [PATCH 27/86] print used memory before and after optimization --- examples/baby-llama/baby-llama-text.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index e65d2d1867c32..099863bb8ad0c 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1637,6 +1637,8 @@ int main(int argc, char ** argv) { ggml_build_forward_expand(&gf, e); ggml_graph_compute(ctx0, &gf); + size_t used_mem_before_opt = ggml_used_mem(ctx0); + float error_before_opt = 
ggml_get_f32_1d(e, 0); struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); @@ -1645,6 +1647,7 @@ int main(int argc, char ** argv) { opt_params_adam.print_backward_graph = false; opt_params_adam.n_threads = gf.n_threads; opt_params_adam.adam.n_iter = 16; + opt_params_adam.adam.alpha = 1e-4; opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_backward_graph = false; @@ -1658,6 +1661,8 @@ int main(int argc, char ** argv) { ggml_opt(ctx0, opt_params_lbfgs, e); } + size_t used_mem_after_opt = ggml_used_mem(ctx0); + model.train_its += use_adam ? opt_params_adam.adam.n_iter : opt_params_lbfgs.lbfgs.n_iter; model.train_samples += n_batch; @@ -1666,6 +1671,9 @@ int main(int argc, char ** argv) { float error_after_opt = ggml_get_f32_1d(e, 0); + printf("used_mem_before_opt: %zu bytes\n", used_mem_before_opt); + printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); + if (ex % 1 == 0) { printf("Example %d\n", ex); printf("error_before_opt: %.6f\n", error_before_opt); From 332003584eb39bbd4465d69666033155e087ef46 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 19 May 2023 18:41:06 +0200 Subject: [PATCH 28/86] sample with non-greedy sampling parameters at the end of training --- examples/baby-llama/baby-llama-text.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 099863bb8ad0c..beacf46861a96 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1712,9 +1712,12 @@ int main(int argc, char ** argv) { save_model(&model, fn_chkpt_out); { - int n_gen = 128; + int n_gen = 1024; int sample_ctx = n_tokens - n_tokens/8; + sampler.params.temp = 0.2; + sampler.params.repeat_penalty = 1.1; + sampler.params.mirostat = 2; init_sampler(&sampler, lctx); printf("Generating %d tokens.\n", n_gen); From 08a330a13661b1623c866a75fe95faee0074285f Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 19 May 2023 18:41:26 +0200 Subject: [PATCH 29/86] add cmake target for baby-llama-text --- examples/baby-llama/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt index d2ce36367474f..c89dc792b9a0e 100644 --- a/examples/baby-llama/CMakeLists.txt +++ b/examples/baby-llama/CMakeLists.txt @@ -2,3 +2,7 @@ set(TARGET baby-llama) add_executable(${TARGET} baby-llama.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) + +add_executable(${TARGET}-text baby-llama-text.cpp) +target_link_libraries(${TARGET}-text PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET}-text PRIVATE cxx_std_11) From a6aafdd719c7ce0dbcc9a182c6131125039d8ffb Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 20 May 2023 14:47:56 +0200 Subject: [PATCH 30/86] add ggml_add1_inplace to header --- ggml.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ggml.h b/ggml.h index aa75fd726b18d..62da0bd3553dd 100644 --- a/ggml.h +++ b/ggml.h @@ -520,6 +520,11 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_add1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_acc( struct ggml_context * ctx, struct ggml_tensor * a, From f4e9ce79989e8fc62310ca33919346cd0ec79a07 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 20 May 2023 14:49:19 +0200 
Subject: [PATCH 31/86] enable gradient propagation for inplace add1 and scale operations those functions backward passes don't need the original src0, so they also work when forward is inplace --- ggml.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 7039a3cec45b7..7b66846ee73b6 100644 --- a/ggml.c +++ b/ggml.c @@ -4374,7 +4374,7 @@ struct ggml_tensor * ggml_add1_impl( bool is_node = false; - if (!inplace && (a->grad || b->grad)) { + if (a->grad || b->grad) { is_node = true; } @@ -5205,7 +5205,7 @@ struct ggml_tensor * ggml_scale_impl( bool is_node = false; - if (!inplace && (a->grad || b->grad)) { + if (a->grad || b->grad) { is_node = true; } From ef17d99f657aef57b977ce33fc13345f467b1f44 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 20 May 2023 14:54:40 +0200 Subject: [PATCH 32/86] implement AdamW in ggml_opt_adam by adding weight decay parameter (default 0.001f) also add a schedule parameter (default 1.0f) that can be used to scale alpha and decay according to learning schedule. setting the decay parameter to zero disables AdamW resulting in normal Adam optimizer. since the difference between Adam and AdamW is minimal it is not implemented as another optimizer, but integrated into the existing Adam optimizer. --- ggml.c | 11 +++++++++-- ggml.h | 2 ++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 7b66846ee73b6..f259f5605ad6a 100644 --- a/ggml.c +++ b/ggml.c @@ -14603,7 +14603,9 @@ static enum ggml_opt_result ggml_opt_adam( } // constants - const float alpha = params.adam.alpha; + const float sched = params.adam.sched; + const float decay = params.adam.decay * sched; + const float alpha = params.adam.alpha * sched; const float beta1 = params.adam.beta1; const float beta2 = params.adam.beta2; const float eps = params.adam.eps; @@ -14673,7 +14675,11 @@ static enum ggml_opt_result ggml_opt_adam( // m^hat = m_t / (1 - beta1^t) // v^hat = v_t / (1 - beta2^t) - // x_t = x_t-1 - alpha*m^hat/(sqrt(v^hat) + eps) + // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1) + // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1 + // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps) + // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps) + // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay) ggml_vec_cpy_f32 (nx, mh, m); ggml_vec_cpy_f32 (nx, vh, v); @@ -14684,6 +14690,7 @@ static enum ggml_opt_result ggml_opt_adam( ggml_vec_acc1_f32 (nx, vh, eps); ggml_vec_div_f32 (nx, mh, mh, vh); + ggml_vec_scale_f32(nx, x, 1.0f - decay); ggml_vec_sub_f32 (nx, x, x, mh); // update the parameters diff --git a/ggml.h b/ggml.h index 62da0bd3553dd..6ce660c7454d4 100644 --- a/ggml.h +++ b/ggml.h @@ -1055,6 +1055,8 @@ extern "C" { struct { int n_iter; + float sched; // schedule multiplier (fixed, decay or warmup) + float decay; // weight decay for AdamW, use 0.0f to disable float alpha; // learning rate float beta1; float beta2; From 96514971dddaaba3e4424b822224f665c52c13ee Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 21 May 2023 12:17:57 +0200 Subject: [PATCH 33/86] use inplace operations in cross_entropy_loss --- examples/baby-llama/baby-llama-text.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index beacf46861a96..0c13f7fd4382f 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ 
b/examples/baby-llama/baby-llama-text.cpp @@ -1107,8 +1107,8 @@ struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_t ggml_mul(ctx, probs, ggml_log(ctx, - ggml_add1(ctx, - ggml_scale(ctx, + ggml_add1_inplace(ctx, + ggml_scale_inplace(ctx, ggml_soft_max(ctx, a), ggml_new_f32(ctx, 1.0f-eps)), ggml_new_f32(ctx, eps))))); From 57c2f4f9095906711ad7c38e831e1659cbbc2785 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 21 May 2023 12:18:47 +0200 Subject: [PATCH 34/86] fix random weight initialization scale --- examples/baby-llama/baby-llama-text.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 0c13f7fd4382f..4c3cd631b45a8 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -68,7 +68,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct } break; case 2: - scale /= sqrtf(tensor->ne[0]*tensor->ne[1]); + scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); for (int i1 = 0; i1 < tensor->ne[1]; i1++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) { float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); @@ -77,7 +77,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct } break; case 3: - scale /= sqrtf(tensor->ne[0]*tensor->ne[1]); + scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); for (int i2 = 0; i2 < tensor->ne[2]; i2++) { for (int i1 = 0; i1 < tensor->ne[1]; i1++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) { @@ -88,7 +88,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct } break; case 4: - scale /= sqrtf(tensor->ne[0]*tensor->ne[1]); + scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); for (int i3 = 0; i3 < tensor->ne[3]; i3++) { for (int i2 = 0; i2 < tensor->ne[2]; i2++) { for (int i1 = 0; i1 < tensor->ne[1]; i1++) { From 1eee9255e74dcd17ee248a9bb65c0c060fd97454 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 21 May 2023 15:03:51 +0200 Subject: [PATCH 35/86] add missing default parameters for adam optimizer --- ggml.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml.c b/ggml.c index f259f5605ad6a..b8253de9344ea 100644 --- a/ggml.c +++ b/ggml.c @@ -15117,6 +15117,8 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { .adam = { .n_iter = 10000, + .sched = 1.000f, + .decay = 0.001f, .alpha = 0.001f, .beta1 = 0.9f, .beta2 = 0.999f, From ec1783c3e0e61d09eb8777b776ec227b9aae0d5f Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 21 May 2023 15:16:07 +0200 Subject: [PATCH 36/86] add ggml_opt_context, so that we can properly resume training otherwise the optimizer states, tracking statistics about the error function and its derivates, will reset to zero each time ggml_opt is called, hindering convergence on resumed training. now the optimizer context and all its memory is stored in a separate struct. 
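In practice that means allocating one ggml_opt_context up front and reusing it for every optimization call; only the per-batch compute graph is rebuilt. A minimal sketch of that call pattern, assuming the per-batch loss graph is built elsewhere and using the ggml_opt_init/ggml_opt_resume functions added in this patch:

    #include "ggml.h"

    // called once before the first batch; opt lives for the whole training run
    static void train_opt_init(struct ggml_context * ctx, struct ggml_opt_context * opt) {
        struct ggml_opt_params params = ggml_opt_default_params(GGML_OPT_ADAM);
        // nx = 0: the optimizer sizes its state buffers when it first sees the graph
        ggml_opt_init(ctx, opt, params, 0);
    }

    // called once per batch with a freshly built scalar loss tensor
    static enum ggml_opt_result train_opt_step(
            struct ggml_context     * ctx0,  // per-batch compute context
            struct ggml_opt_context * opt,   // persistent: moments, past losses, iteration count
            struct ggml_tensor      * loss) {
        // resuming keeps the accumulated statistics instead of resetting them to zero
        return ggml_opt_resume(ctx0, opt, loss);
    }

Passing 0 for nx mirrors what ggml_opt itself does in the diff below; the Adam and L-BFGS paths re-initialize their buffers to the real parameter count on the first resume while preserving the iteration counter.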
--- ggml.c | 314 +++++++++++++++++++++++++++++++++++++++------------------ ggml.h | 56 ++++++++++ 2 files changed, 270 insertions(+), 100 deletions(-) diff --git a/ggml.c b/ggml.c index b8253de9344ea..cfc9bb455aec2 100644 --- a/ggml.c +++ b/ggml.c @@ -14577,6 +14577,7 @@ static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g static enum ggml_opt_result ggml_opt_adam( struct ggml_context * ctx, + struct ggml_opt_context * opt, struct ggml_opt_params params, struct ggml_tensor * f, struct ggml_cgraph * gf, @@ -14602,6 +14603,12 @@ static enum ggml_opt_result ggml_opt_adam( } } + if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past)) { + int iter = opt->iter; + ggml_opt_init(opt->ctx, opt, params, nx); + opt->iter = iter; + } + // constants const float sched = params.adam.sched; const float decay = params.adam.decay * sched; @@ -14610,19 +14617,15 @@ static enum ggml_opt_result ggml_opt_adam( const float beta2 = params.adam.beta2; const float eps = params.adam.eps; - float * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // view of the parameters - float * g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // gradient - float * g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // gradient squared - float * m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // first moment - float * v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // second moment - float * mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // first moment hat - float * vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // second moment hat + float * x = opt->adam.x->data; // view of the parameters + float * g1 = opt->adam.g1->data; // gradient + float * g2 = opt->adam.g2->data; // gradient squared + float * m = opt->adam.m->data; // first moment + float * v = opt->adam.v->data; // second moment + float * mh = opt->adam.mh->data; // first moment hat + float * vh = opt->adam.vh->data; // second moment hat - float * pf = params.past > 0 ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)->data : NULL; // past function values - - // initialize - ggml_vec_set_f32(nx, m, 0.0f); - ggml_vec_set_f32(nx, v, 0.0f); + float * pf = params.past > 0 ? 
opt->adam.pf->data : NULL; // past function values // update view ggml_opt_get_params(np, ps, x); @@ -14632,16 +14635,27 @@ static enum ggml_opt_result ggml_opt_adam( ggml_set_f32 (f->grad, 1.0f); ggml_graph_compute(ctx, gb); - float fx_prev = ggml_get_f32_1d(f, 0); + opt->adam.fx_prev = ggml_get_f32_1d(f, 0); + opt->adam.fx_best = opt->adam.fx_prev; if (pf) { - pf[0] = fx_prev; + pf[opt->iter % params.past] = opt->adam.fx_prev; + } + + // initialize + if (opt->just_initialized) { + opt->adam.n_no_improvement = 0; + opt->just_initialized = false; } - int n_no_improvement = 0; - float fx_best = fx_prev; + float * fx_best = &opt->adam.fx_best; + float * fx_prev = &opt->adam.fx_prev; + int * n_no_improvement = &opt->adam.n_no_improvement; + + int iter0 = opt->iter; // run the optimizer for (int t = 0; t < params.adam.n_iter; ++t) { + opt->iter = iter0 + t + 1; GGML_PRINT_DEBUG ("=== iter %d ===\n", t); GGML_PRINT_DEBUG ("f = %10.6f\n", ggml_get_f32_1d(f, 0)); @@ -14683,8 +14697,8 @@ static enum ggml_opt_result ggml_opt_adam( ggml_vec_cpy_f32 (nx, mh, m); ggml_vec_cpy_f32 (nx, vh, v); - ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, t + 1))); - ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, t + 1))); + ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter))); + ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter))); ggml_vec_sqrt_f32 (nx, vh, vh); ggml_vec_acc1_f32 (nx, vh, eps); @@ -14704,7 +14718,7 @@ static enum ggml_opt_result ggml_opt_adam( const float fx = ggml_get_f32_1d(f, 0); // check convergence - if (fabsf(fx - fx_prev)/fx < params.adam.eps_f) { + if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { GGML_PRINT_DEBUG("converged\n"); return GGML_OPT_OK; @@ -14713,32 +14727,32 @@ static enum ggml_opt_result ggml_opt_adam( // delta-based convergence test if (pf != NULL) { // need at least params.past iterations to start checking for convergence - if (params.past <= t) { - const float rate = (pf[t%params.past] - fx)/fx; + if (params.past <= iter0 + t) { + const float rate = (pf[(iter0 + t)%params.past] - fx)/fx; if (fabsf(rate) < params.delta) { return GGML_OPT_OK; } } - pf[t%params.past] = fx; + pf[(iter0 + t)%params.past] = fx; } // check for improvement if (params.max_no_improvement > 0) { - if (fx_best > fx) { - fx_best = fx; - n_no_improvement = 0; + if (fx_best[0] > fx) { + fx_best[0] = fx; + n_no_improvement[0] = 0; } else { - ++n_no_improvement; + ++n_no_improvement[0]; - if (n_no_improvement >= params.max_no_improvement) { + if (n_no_improvement[0] >= params.max_no_improvement) { return GGML_OPT_OK; } } } - fx_prev = fx; + fx_prev[0] = fx; { const int64_t t_end_cpu = ggml_cycles(); @@ -14877,6 +14891,7 @@ static enum ggml_opt_result linesearch_backtracking( static enum ggml_opt_result ggml_opt_lbfgs( struct ggml_context * ctx, + struct ggml_opt_context * opt, struct ggml_opt_params params, struct ggml_tensor * f, struct ggml_cgraph * gf, @@ -14909,31 +14924,32 @@ static enum ggml_opt_result ggml_opt_lbfgs( } } - float * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // current parameters - float * xp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // previous parameters - float * g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // current gradient - float * gp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // previous gradient - float * d = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // search direction + if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past) || (opt->params.lbfgs.m 
!= params.lbfgs.m)) { + int iter = opt->iter; + ggml_opt_init(ctx, opt, params, nx); + opt->iter = iter; + } + + float * x = opt->lbfgs.x->data; // current parameters + float * xp = opt->lbfgs.xp->data; // previous parameters + float * g = opt->lbfgs.g->data; // current gradient + float * gp = opt->lbfgs.gp->data; // previous gradient + float * d = opt->lbfgs.d->data; // search direction - float * pf = params.past > 0 ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)->data : NULL; // past function values + float * pf = params.past > 0 ? opt->lbfgs.pf->data : NULL; // past function values float fx = 0.0f; // cost function value float xnorm = 0.0f; // ||x|| float gnorm = 0.0f; // ||g|| - float step = 0.0f; // initialize x from the graph nodes ggml_opt_get_params(np, ps, x); // the L-BFGS memory - struct ggml_lbfgs_iteration_data * lm = alloca(sizeof(struct ggml_lbfgs_iteration_data)*m); - - for (int i = 0; i < m; ++i) { - lm[i].alpha = 0.0f; - lm[i].ys = 0.0f; - lm[i].s = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; - lm[i].y = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; - } + float * lm_alpha = opt->lbfgs.lmal->data; + float * lm_ys = opt->lbfgs.lmys->data; + float * lm_s = opt->lbfgs.lms->data; + float * lm_y = opt->lbfgs.lmy->data; // evaluate the function value and its gradient { @@ -14948,12 +14964,6 @@ static enum ggml_opt_result ggml_opt_lbfgs( fx = ggml_get_f32_1d(f, 0); } - if (pf) { - pf[0] = fx; - } - - float fx_best = fx; - // search direction = -gradient ggml_vec_neg_f32(nx, d, g); @@ -14970,26 +14980,43 @@ static enum ggml_opt_result ggml_opt_lbfgs( return GGML_OPT_OK; } - // initial step - ggml_vec_norm_inv_f32(nx, &step, d); + if (opt->just_initialized) { + if (pf) { + pf[0] = fx; + } + opt->lbfgs.fx_best = fx; + + // initial step + ggml_vec_norm_inv_f32(nx, &opt->lbfgs.step, d); + opt->lbfgs.j = 0; + opt->lbfgs.k = 1; + opt->lbfgs.end = 0; + opt->lbfgs.n_no_improvement = 0; + opt->just_initialized = false; + } + + float * fx_best = &opt->lbfgs.fx_best; + float * step = &opt->lbfgs.step; + int * j = &opt->lbfgs.j; + int * k = &opt->lbfgs.k; + int * end = &opt->lbfgs.end; + int * n_no_improvement = &opt->lbfgs.n_no_improvement; - int j = 0; - int k = 1; - int ls = 0; - int end = 0; - int bound = 0; - int n_no_improvement = 0; + int ls = 0; + int bound = 0; float ys = 0.0f; float yy = 0.0f; float beta = 0.0f; + int it = 0; + while (true) { // store the current position and gradient vectors ggml_vec_cpy_f32(nx, xp, x); ggml_vec_cpy_f32(nx, gp, g); - ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, &step, xp, f, gf, gb, np, ps); + ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps); if (ls < 0) { // linesearch failed - go back to the previous point and return @@ -15015,32 +15042,32 @@ static enum ggml_opt_result ggml_opt_lbfgs( // delta-based convergence test if (pf != NULL) { // need at least params.past iterations to start checking for convergence - if (params.past <= k) { - const float rate = (pf[k%params.past] - fx)/fx; + if (params.past <= k[0]) { + const float rate = (pf[k[0]%params.past] - fx)/fx; if (fabsf(rate) < params.delta) { return GGML_OPT_OK; } } - pf[k%params.past] = fx; + pf[k[0]%params.past] = fx; } // check for improvement if (params.max_no_improvement > 0) { - if (fx < fx_best) { - fx_best = fx; - n_no_improvement = 0; + if (fx < fx_best[0]) { + fx_best[0] = fx; + n_no_improvement[0] = 0; } else { - n_no_improvement++; + n_no_improvement[0]++; - if (n_no_improvement >= params.max_no_improvement) { + if 
(n_no_improvement[0] >= params.max_no_improvement) { return GGML_OPT_OK; } } } - if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < k + 1) { + if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) { // reached the maximum number of iterations return GGML_OPT_DID_NOT_CONVERGE; } @@ -15049,50 +15076,51 @@ static enum ggml_opt_result ggml_opt_lbfgs( // s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}. // y_{k+1} = g_{k+1} - g_{k}. // - ggml_vec_sub_f32(nx, lm[end].s, x, xp); - ggml_vec_sub_f32(nx, lm[end].y, g, gp); + ggml_vec_sub_f32(nx, &lm_s[end[0]*nx], x, xp); + ggml_vec_sub_f32(nx, &lm_y[end[0]*nx], g, gp); // compute scalars ys and yy: // ys = y^t \cdot s -> 1 / \rho. // yy = y^t \cdot y. // - ggml_vec_dot_f32(nx, &ys, lm[end].y, lm[end].s); - ggml_vec_dot_f32(nx, &yy, lm[end].y, lm[end].y); + ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0] *nx]); + ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]); - lm[end].ys = ys; + lm_ys[end[0]] = ys; // find new search direction // ref: https://en.wikipedia.org/wiki/Limited-memory_BFGS - bound = (m <= k) ? m : k; - k++; - end = (end + 1)%m; + bound = (m <= k[0]) ? m : k[0]; + k[0]++; + it++; + end[0] = (end[0] + 1)%m; // initialize search direction with -g ggml_vec_neg_f32(nx, d, g); - j = end; + j[0] = end[0]; for (int i = 0; i < bound; ++i) { - j = (j + m - 1) % m; + j[0] = (j[0] + m - 1) % m; // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1} - ggml_vec_dot_f32(nx, &lm[j].alpha, lm[j].s, d); - lm[j].alpha /= lm[j].ys; + ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d); + lm_alpha[j[0]] /= lm_ys[j[0]]; // q_{i} = q_{i+1} - \alpha_{i} y_{i} - ggml_vec_mad_f32(nx, d, lm[j].y, -lm[j].alpha); + ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]); } ggml_vec_scale_f32(nx, d, ys/yy); for (int i = 0; i < bound; ++i) { // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i} - ggml_vec_dot_f32(nx, &beta, lm[j].y, d); - beta /= lm[j].ys; + ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d); + beta /= lm_ys[j[0]]; // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j} - ggml_vec_mad_f32(nx, d, lm[j].s, lm[j].alpha - beta); - j = (j + 1)%m; + ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta); + j[0] = (j[0] + 1)%m; } - step = 1.0; + step[0] = 1.0; } return GGML_OPT_DID_NOT_CONVERGE; @@ -15161,6 +15189,71 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { return result; } +GGML_API void ggml_opt_init( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + int64_t nx) { + opt->ctx = ctx; + opt->params = params; + opt->iter = 0; + opt->nx = nx; + opt->just_initialized = true; + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + opt->adam.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.pf = params.past > 0 + ? 
ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past) + : NULL; + ggml_set_zero(opt->adam.x); + ggml_set_zero(opt->adam.g1); + ggml_set_zero(opt->adam.g2); + ggml_set_zero(opt->adam.m); + ggml_set_zero(opt->adam.v); + ggml_set_zero(opt->adam.mh); + ggml_set_zero(opt->adam.vh); + if (opt->adam.pf) { + ggml_set_zero(opt->adam.pf); + } + } break; + case GGML_OPT_LBFGS: + { + opt->lbfgs.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.xp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.gp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.d = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.pf = params.past > 0 + ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past) + : NULL; + opt->lbfgs.lmal = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lmys = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lms = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, params.lbfgs.m); + opt->lbfgs.lmy = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, params.lbfgs.m); + ggml_set_zero(opt->lbfgs.x); + ggml_set_zero(opt->lbfgs.xp); + ggml_set_zero(opt->lbfgs.g); + ggml_set_zero(opt->lbfgs.gp); + ggml_set_zero(opt->lbfgs.d); + ggml_set_zero(opt->lbfgs.pf); + if (opt->lbfgs.pf) { + ggml_set_zero(opt->lbfgs.pf); + } + ggml_set_zero(opt->lbfgs.lmal); + ggml_set_zero(opt->lbfgs.lmys); + ggml_set_zero(opt->lbfgs.lms); + ggml_set_zero(opt->lbfgs.lmy); + } break; + } +} + enum ggml_opt_result ggml_opt( struct ggml_context * ctx, struct ggml_opt_params params, @@ -15183,33 +15276,54 @@ enum ggml_opt_result ggml_opt( enum ggml_opt_result result = GGML_OPT_OK; + struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context)); + + ggml_opt_init(ctx, opt, params, 0); + result = ggml_opt_resume(ctx, opt, f); + + if (free_ctx) { + ggml_free(ctx); + } + + return result; +} + +enum ggml_opt_result ggml_opt_resume( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f) { + // build forward + backward compute graphs - struct ggml_cgraph gf = ggml_build_forward (f); - struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, true); + struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0)); + struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 
1 : 0)); + + struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; + struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; + + *gf = ggml_build_forward (f); + *gb = ggml_build_backward(ctx, gf, true); - switch (params.type) { + enum ggml_opt_result result = GGML_OPT_OK; + + switch (opt->params.type) { case GGML_OPT_ADAM: { - result = ggml_opt_adam(ctx, params, f, &gf, &gb); + result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb); } break; case GGML_OPT_LBFGS: { - result = ggml_opt_lbfgs(ctx, params, f, &gf, &gb); + result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb); } break; } - if (params.print_forward_graph) { - ggml_graph_print (&gf); - ggml_graph_dump_dot(&gf, NULL, "opt-forward.dot"); - } - - if (params.print_backward_graph) { - ggml_graph_print (&gb); - ggml_graph_dump_dot(&gb, &gf, "opt-backward.dot"); + if (opt->params.print_forward_graph) { + ggml_graph_print (gf); + ggml_graph_dump_dot(gf, NULL, "opt-forward.dot"); } - if (free_ctx) { - ggml_free(ctx); + if (opt->params.print_backward_graph) { + ggml_graph_print (gb); + ggml_graph_dump_dot(gb, gf, "opt-backward.dot"); } return result; diff --git a/ggml.h b/ggml.h index 6ce660c7454d4..64de9eb3ea76f 100644 --- a/ggml.h +++ b/ggml.h @@ -1081,6 +1081,49 @@ extern "C" { } lbfgs; }; + struct ggml_opt_context { + struct ggml_context * ctx; + struct ggml_opt_params params; + + int iter; + int64_t nx; // number of parameter elements + + bool just_initialized; + + struct { + struct ggml_tensor * x; // view of the parameters + struct ggml_tensor * g1; // gradient + struct ggml_tensor * g2; // gradient squared + struct ggml_tensor * m; // first moment + struct ggml_tensor * v; // second moment + struct ggml_tensor * mh; // first moment hat + struct ggml_tensor * vh; // second moment hat + struct ggml_tensor * pf; // past function values + float fx_best; + float fx_prev; + int n_no_improvement; + } adam; + + struct { + struct ggml_tensor * x; // current parameters + struct ggml_tensor * xp; // previous parameters + struct ggml_tensor * g; // current gradient + struct ggml_tensor * gp; // previous gradient + struct ggml_tensor * d; // search direction + struct ggml_tensor * pf; // past function values + struct ggml_tensor * lmal; // the L-BFGS memory alpha + struct ggml_tensor * lmys; // the L-BFGS memory ys + struct ggml_tensor * lms; // the L-BFGS memory s + struct ggml_tensor * lmy; // the L-BFGS memory y + float fx_best; + float step; + int j; + int k; + int end; + int n_no_improvement; + } lbfgs; + }; + GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); // optimize the function defined by the tensor f @@ -1089,6 +1132,19 @@ extern "C" { struct ggml_opt_params params, struct ggml_tensor * f); + // initialize optimizer context + GGML_API void ggml_opt_init( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + int64_t nx); + + // continue optimizing the function defined by the tensor f + GGML_API enum ggml_opt_result ggml_opt_resume( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f); + // // quantization // From 2afd2184793541f67f660ca1aabe399aaa71719e Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 21 May 2023 21:12:10 +0200 Subject: [PATCH 37/86] fix bug in llama_sample_token_mirostat_v2 when all candidates are filtered out through mu threshold, the following soft_max operation will fail. so keep at least one. 
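The failure mode is easy to reproduce outside llama.cpp: a surprise-based cutoff can discard every candidate, and normalizing an empty range is undefined. A standalone sketch of the same guard, using plain C++ containers rather than the llama_token_data types:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        // probabilities sorted in descending order, as after llama_sample_softmax
        std::vector<float> p = {0.010f, 0.005f, 0.002f};
        float mu = 2.0f;  // unusually low target surprise

        // keep only the leading candidates whose surprise -log2(p) does not exceed mu
        size_t n = (size_t)(std::find_if(p.begin(), p.end(),
                [mu](float x) { return -std::log2(x) > mu; }) - p.begin());

        // every candidate exceeded the threshold: keep the most probable one anyway
        if (n == 0) {
            n = 1;
        }

        // renormalize the survivors (the step that would fail on an empty range)
        float sum = 0.0f;
        for (size_t i = 0; i < n; ++i) sum += p[i];
        for (size_t i = 0; i < n; ++i) p[i] /= sum;

        printf("kept %zu candidate(s), p[0] = %f\n", n, p[0]);
        return 0;
    }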
--- llama.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama.cpp b/llama.cpp index 98f49abd7cf48..ca61a69e0bb17 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1882,6 +1882,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok return -log2f(candidate.p) > *mu; })); + if (candidates->size == 0) { + candidates->size = 1; + } + // Normalize the probabilities of the remaining words llama_sample_softmax(ctx, candidates); From 93eb8f77522a090d7e69bd206ab61bfaa207679f Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 21 May 2023 21:14:49 +0200 Subject: [PATCH 38/86] add forward function without using cache, for more performant training during training on whole samples no cache is required. removing the cache and simplifying the remaining code results in performance and memory usage improvement. --- examples/baby-llama/baby-llama-text.cpp | 234 +++++++++++++++++++++++- 1 file changed, 233 insertions(+), 1 deletion(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 4c3cd631b45a8..cda1edece92a9 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -911,6 +911,234 @@ struct ggml_tensor * forward_batch( return inpL; } +struct ggml_tensor * forward_batch_wo_cache( + struct my_llama_model * model, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_batch) { + + const int n_past = 0; + const int N = n_tokens; + + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + const int n_ff = get_n_ff(&hparams); + + struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); + memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); + + // inpL shape [n_embd,N*n_batch,1] + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); + assert_shape_2d(inpL, n_embd, N*n_batch); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * cur; + + // lctx.use_buf(ctx0, 0); + + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = attention_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].attention_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // self-attention + { + // compute Q and K and RoPE them + // wq shape [n_embd, n_embd, 1, 1] + // wk shape [n_embd, n_embd, 1, 1] + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Kcur shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); + assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); + + // Vcur shape [N, n_batch, n_embd/n_head, n_head] + struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head); + assert_shape_4d(Vcur, N, n_batch, 
n_embd/n_head, n_head); + + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Q shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); + + // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] + // K shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * K = + ggml_permute(ctx0, + Kcur, + 0, 2, 1, 3); + assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch); + + // K * Q + // KQ shape [N, N, n_head, n_batch] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + assert_shape_4d(KQ, N, N, n_head, n_batch); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // KQ_scaled shape [N, N, n_head, n_batch] + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + assert_shape_4d(KQ_scaled, N, N, n_head, n_batch); + + // KQ_masked = mask_past(KQ_scaled) + // KQ_masked shape [N, N, n_head, n_batch] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + assert_shape_4d(KQ_masked, N, N, n_head, n_batch); + + // KQ = soft_max(KQ_masked) + // KQ_soft_max shape [N, N, n_head, n_batch] + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + assert_shape_4d(KQ_soft_max, N, N, n_head, n_batch); + + // Vcur shape [N, n_batch, n_embd/n_head, n_head] + // V shape [N, n_embd/n_head, n_head, n_batch] + struct ggml_tensor * V = + ggml_permute(ctx0, + Vcur, + 0, 3, 1, 2); + assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch); + + // KQV shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); + // KQV_merged shape + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); + assert_shape_2d(cur, n_embd, N*n_batch); + + // projection (no bias) + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].wo, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // lctx.use_buf(ctx0, 1); + + // inpFF shape [n_embd,N*n_batch,1,1] + struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); + assert_shape_2d(inpFF, n_embd, N*n_batch); + + // feed-forward network + { + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = ffn_norm*cur + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // tmp shape [n_ff,N*n_batch,1,1] + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model->layers[il].w3, + cur); + assert_shape_2d(tmp, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w1, + cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // SILU activation + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_silu(ctx0, cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul(ctx0, cur, tmp); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w2, + cur); + 
assert_shape_2d(cur, n_embd, N*n_batch); + } + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_add_inplace(ctx0, cur, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // input for next layer + // inpL shape [n_embd,N*n_batch,1,1] + inpL = cur; + assert_shape_2d(inpL, n_embd, N*n_batch); + } + + // norm + { + + // inpL shape [n_embd,N*n_batch,1,1] + inpL = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(inpL, n_embd, N*n_batch); + + // inpL = norm*inpL + // inpL shape [n_embd,N*n_batch,1,1] + inpL = ggml_mul(ctx0, + ggml_repeat(ctx0, model->norm, inpL), + inpL); + + assert_shape_2d(inpL, n_embd, N*n_batch); + + //embeddings = inpL; + } + + // lm_head + // inpL shape [n_vocab,N*n_batch,1,1] + inpL = ggml_mul_mat(ctx0, model->output, inpL); + assert_shape_2d(inpL, n_vocab, N*n_batch); + + { + // inpL shape [n_vocab,N,n_batch,1] + inpL = ggml_reshape_3d(ctx0, + inpL, + n_vocab, N, n_batch); + assert_shape_3d(inpL, n_vocab, N, n_batch); + } + + // run the computation + ggml_build_forward_expand(gf, inpL); + + return inpL; +} + void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { assert(logits->n_dims == 2); assert(probs->n_dims == 2); @@ -1627,7 +1855,11 @@ int main(int argc, char ** argv) { get_example_targets_batch(ctx0, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); - struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch); + struct ggml_tensor * logits = + (n_past == 0) + ? forward_batch_wo_cache(&model, ctx0, &gf, tokens_input, n_tokens, n_batch) + : forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch); + // struct ggml_tensor * se = square_error_loss(ctx0, logits, target_logits); struct ggml_tensor * ce = cross_entropy_loss(ctx0, logits, target_probs); // struct ggml_tensor * e = ggml_add(ctx0, se, ce); From 37c69435f04f0e827eaccd3988d78ff385206869 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 21 May 2023 21:17:46 +0200 Subject: [PATCH 39/86] print suppressed newline tokens as string "\n" printing too much actual newlines is suppressed to avoid flooding the console. 
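The counting logic already lives in print_tokens_batch; the two-line change below only makes the suppressed tokens visible. A self-contained illustration of the idea on a plain character stream (the function and its details are illustrative, not the exact print_tokens_batch logic):

    #include <stdio.h>

    // emit the first newline of a run verbatim, later ones as a visible "\n" escape
    static void print_with_limited_newlines(const char * s) {
        int num_newline = 0;
        for (; *s; ++s) {
            if (*s == '\n') {
                ++num_newline;
                if (num_newline < 2) {
                    putchar('\n');      // keep the start of the run
                } else {
                    printf("\\n");      // suppressed newline, shown as an escape
                }
            } else {
                num_newline = 0;
                putchar(*s);
            }
        }
    }

    int main(void) {
        print_with_limited_newlines("First line\n\n\n\nSecond line\n");
        putchar('\n');
        return 0;
    }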
--- examples/baby-llama/baby-llama-text.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index cda1edece92a9..e4df2eca595e0 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1247,6 +1247,8 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) } if (!isnl || (num_newline < 2)) { print_token(ctx, token); + } else { + printf("\\n"); } } printf("\n--\n"); From 42d9b4cfc2c13a434e55e2739e255e9af728d842 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 21 May 2023 21:36:04 +0200 Subject: [PATCH 40/86] store optimizer state in training checkpoint and add learning schedule persistent optimizer state allows to resume training without resetting the optimizer learning schedule consists of linear warmup ramp followed by cosine decay with restarts --- examples/baby-llama/baby-llama-text.cpp | 263 ++++++++++++++++++++---- 1 file changed, 226 insertions(+), 37 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index e4df2eca595e0..ff213ea485435 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -208,6 +208,7 @@ struct my_llama_model { uint32_t train_its = 0; uint32_t train_samples = 0; + uint32_t train_tokens = 0; }; uint32_t get_n_ff(const struct my_llama_hparams* hparams) { @@ -237,6 +238,10 @@ void init_model(struct my_llama_model * model) { struct ggml_context * ctx = model->ctx; + model->train_its = 0; + model->train_samples = 0; + model->train_tokens = 0; + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); @@ -1613,6 +1618,13 @@ enum llama_file_version { }; void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { + if (tensor == NULL) { + file->write_u32(0); + file->write_u32(0); + file->write_u32(GGML_TYPE_F32); + file->seek(-file->tell() & 31, SEEK_CUR); + return; + } const char * name = ggml_get_name(tensor); uint32_t name_len = strlen(name); uint32_t nd = tensor->n_dims; @@ -1629,28 +1641,135 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { uint32_t nd = file->read_u32(); GGML_ASSERT(nd == tensor->n_dims); - uint32_t name_len = file->read_u32(); - enum ggml_type type = (enum ggml_type) file->read_u32(); + + uint32_t name_len = file->read_u32(); + enum ggml_type type = (enum ggml_type) file->read_u32(); GGML_ASSERT(type == tensor->type); + uint32_t ne[4]; file->read_raw(ne, sizeof(ne[0]) * nd); for (int i=0; ine[i]); } - std::string name = file->read_string(name_len); - file->seek(-file->tell() & 31, SEEK_CUR); + std::string name = file->read_string(name_len); GGML_ASSERT(strcmp(ggml_get_name(tensor), name.c_str()) == 0); + + file->seek(-file->tell() & 31, SEEK_CUR); file->read_raw(tensor->data, ggml_nbytes(tensor)); } -void save_model(struct my_llama_model * model, const char * filename) { +void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) { + const uint32_t version = 0; + GGML_ASSERT(opt->nx >= 0); + GGML_ASSERT(opt->iter >= 0); + file->write_u32(version); + file->write_raw(&opt->params, sizeof(opt->params)); + file->write_raw(&opt->nx, sizeof(opt->nx)); + file->write_raw(&opt->iter, sizeof(opt->iter)); + 
file->write_u32((uint32_t) opt->just_initialized); + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + GGML_ASSERT(opt->adam.x != NULL); + write_tensor(file, opt->adam.x); + write_tensor(file, opt->adam.g1); + write_tensor(file, opt->adam.g2); + write_tensor(file, opt->adam.m); + write_tensor(file, opt->adam.v); + write_tensor(file, opt->adam.mh); + write_tensor(file, opt->adam.vh); + write_tensor(file, opt->adam.pf); + file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); + file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); + file->write_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); + } break; + case GGML_OPT_LBFGS: + { + GGML_ASSERT(opt->adam.x != NULL); + write_tensor(file, opt->lbfgs.x); + write_tensor(file, opt->lbfgs.xp); + write_tensor(file, opt->lbfgs.g); + write_tensor(file, opt->lbfgs.gp); + write_tensor(file, opt->lbfgs.d); + write_tensor(file, opt->lbfgs.pf); + write_tensor(file, opt->lbfgs.lmal); + write_tensor(file, opt->lbfgs.lmys); + write_tensor(file, opt->lbfgs.lms); + write_tensor(file, opt->lbfgs.lmy); + file->write_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); + file->write_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); + file->write_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); + file->write_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); + file->write_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); + file->write_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); + } break; + } +} + +void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { + uint32_t version = file->read_u32(); + GGML_ASSERT(version == 0); + + file->read_raw(&opt->params, sizeof(opt->params)); + file->read_raw(&opt->nx, sizeof(opt->nx)); + ggml_opt_init(ctx, opt, opt->params, opt->nx); + + file->read_raw(&opt->iter, sizeof(opt->iter)); + opt->just_initialized = (bool) file->read_u32(); + + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + read_tensor(file, opt->adam.x); + read_tensor(file, opt->adam.g1); + read_tensor(file, opt->adam.g2); + read_tensor(file, opt->adam.m); + read_tensor(file, opt->adam.v); + read_tensor(file, opt->adam.mh); + read_tensor(file, opt->adam.vh); + if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } + file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); + file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); + file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); + } break; + case GGML_OPT_LBFGS: + { + GGML_ASSERT(opt->adam.x != NULL); + read_tensor(file, opt->lbfgs.x); + read_tensor(file, opt->lbfgs.xp); + read_tensor(file, opt->lbfgs.g); + read_tensor(file, opt->lbfgs.gp); + read_tensor(file, opt->lbfgs.d); + if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } + read_tensor(file, opt->lbfgs.lmal); + read_tensor(file, opt->lbfgs.lmys); + read_tensor(file, opt->lbfgs.lms); + read_tensor(file, opt->lbfgs.lmy); + file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); + file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); + file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); + file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); + file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); + file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); + } break; + } +} + +void save_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename) { struct llama_file file(filename, "wb"); if (file.fp == NULL) { return; } + + const 
uint32_t magic = 'ggcp'; + const uint32_t version = 0; + + file.write_u32(magic); + file.write_u32(version); file.write_u32(model->train_its); file.write_u32(model->train_samples); + file.write_u32(model->train_tokens); file.write_u32(model->hparams.n_vocab); file.write_u32(model->hparams.n_embd); file.write_u32(model->hparams.n_mult); @@ -1675,23 +1794,35 @@ void save_model(struct my_llama_model * model, const char * filename) { write_tensor(&file, layer.w2); write_tensor(&file, layer.w3); } + + write_opt_context(&file, opt); } -bool load_model(struct my_llama_model * model, const char * filename, bool init) { +bool load_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename, bool init) { struct llama_file file(filename, "rb"); - + + uint32_t magic; + uint32_t version; + + uint32_t train_its = 0; + uint32_t train_samples = 0; + uint32_t train_tokens = 0; + if (file.fp) { printf("%s: Loading model from '%s'.\n", __func__, filename); - model->train_its = file.read_u32(); - model->train_samples = file.read_u32(); + magic = file.read_u32(); + GGML_ASSERT(magic == 'ggcp'); + version = file.read_u32(); + GGML_ASSERT(version == 0); + train_its = file.read_u32(); + train_samples = file.read_u32(); + train_tokens = file.read_u32(); model->hparams.n_vocab = file.read_u32(); model->hparams.n_embd = file.read_u32(); model->hparams.n_mult = file.read_u32(); model->hparams.n_head = file.read_u32(); model->hparams.n_layer = file.read_u32(); model->hparams.n_rot = file.read_u32(); - printf("%s: Training iterations: %u.\n", __func__, model->train_its); - printf("%s: Training samples: %u.\n", __func__, model->train_samples); print_params(&model->hparams); } @@ -1699,6 +1830,16 @@ bool load_model(struct my_llama_model * model, const char * filename, bool init) init_model(model); } + if (file.fp) { + model->train_its = train_its; + model->train_samples = train_samples; + model->train_tokens = train_tokens; + } + + printf("%s: Training iterations: %u.\n", __func__, model->train_its); + printf("%s: Training samples: %u.\n", __func__, model->train_samples); + printf("%s: Training tokens: %u.\n", __func__, model->train_tokens); + if (file.fp) { read_tensor(&file, model->tok_embeddings); read_tensor(&file, model->norm); @@ -1717,11 +1858,30 @@ bool load_model(struct my_llama_model * model, const char * filename, bool init) read_tensor(&file, layer.w2); read_tensor(&file, layer.w3); } + + read_opt_context(&file, model->ctx, opt); } return (file.fp != NULL); } +float cosine_decay(const int decay_steps, const float alpha, int step) { + if (step > decay_steps) { + step = decay_steps; + } + const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps)); + const float decay = (1 - alpha)*cosine_decay + alpha; + return decay; +} + +float cosine_decay_restart(int decay_steps, const float alpha, int step, float restart_step_mult) { + while (step > decay_steps) { + step -= decay_steps; + decay_steps = (int) restart_step_mult * decay_steps; + } + return cosine_decay(decay_steps, alpha, step); +} + int main(int argc, char ** argv) { const char * default_model = "ggml-vic7b-uncensored-q4_0.bin"; const char * default_train = "shakespeare.txt"; @@ -1795,16 +1955,55 @@ int main(int argc, char ** argv) { my_llama_sampler sampler; + + int n_threads = 6; + + bool use_adam = true; + + int warmup = 100; + int cos_decay_steps = 1000; + float cos_decay_restart = 1.1f; + float cos_decay_alpha = 0.0f; + + struct ggml_opt_context * opt = (struct ggml_opt_context *) 
alloca(sizeof(struct ggml_opt_context)); + memset(opt, 0, sizeof(struct ggml_opt_context)); + + struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); + struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); + opt_params_adam.print_forward_graph = false; + opt_params_adam.print_backward_graph = false; + opt_params_adam.n_threads = n_threads; + opt_params_adam.adam.n_iter = 16; + opt_params_adam.adam.sched = 1.0f; + opt_params_adam.adam.alpha = 1e-3; + opt_params_adam.adam.decay = 1e-3; + + opt_params_lbfgs.print_forward_graph = false; + opt_params_lbfgs.print_backward_graph = false; + opt_params_lbfgs.n_threads = n_threads; + opt_params_lbfgs.lbfgs.n_iter = 16; + + opt->ctx = model.ctx; + opt->params = use_adam ? opt_params_adam : opt_params_lbfgs; + printf("%s: init model\n", __func__); - bool existed = load_model(&model, fn_chkpt_in, true); - bool from_scratch = !existed; + bool existed = load_checkpoint(&model, opt, fn_chkpt_in, true); set_param_model(&model); + + opt->iter = model.train_its; + printf("%s: opt iter %d\n", __func__, opt->iter); + + bool from_scratch = !existed; if (from_scratch) { randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f); } - init_kv_cache(&kv_self, &model, n_batch); + + init_kv_cache(&kv_self, &model, 1); + // init_kv_cache(&kv_self, &model, n_batch); init_sampler(&sampler, lctx); + printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx)); + // ggml_print_tensor_objects(model.ctx); size_t compute_size = 1024ll*1024ll*1024ll*32ll; uint8_t * compute_addr = new uint8_t[compute_size]; @@ -1853,7 +2052,7 @@ int main(int argc, char ** argv) { int n_past = 0; ggml_cgraph gf = {}; - gf.n_threads = 6; + gf.n_threads = n_threads; get_example_targets_batch(ctx0, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); @@ -1875,30 +2074,20 @@ int main(int argc, char ** argv) { float error_before_opt = ggml_get_f32_1d(e, 0); - struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); - struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); - opt_params_adam.print_forward_graph = false; - opt_params_adam.print_backward_graph = false; - opt_params_adam.n_threads = gf.n_threads; - opt_params_adam.adam.n_iter = 16; - opt_params_adam.adam.alpha = 1e-4; - - opt_params_lbfgs.print_forward_graph = false; - opt_params_lbfgs.print_backward_graph = false; - opt_params_lbfgs.n_threads = gf.n_threads; - opt_params_lbfgs.lbfgs.n_iter = 16; - - bool use_adam = true; - if (use_adam) { - ggml_opt(ctx0, opt_params_adam, e); - } else { - ggml_opt(ctx0, opt_params_lbfgs, e); - } + opt->params.adam.sched = (opt->iter < warmup) + ? (float) opt->iter / (float) warmup + : cosine_decay_restart(cos_decay_steps, cos_decay_alpha, opt->iter - warmup, cos_decay_restart); + printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); + + // ggml_opt(ctx0, opt->params, e); + ggml_opt_resume(ctx0, opt, e); size_t used_mem_after_opt = ggml_used_mem(ctx0); - model.train_its += use_adam ? opt_params_adam.adam.n_iter : opt_params_lbfgs.lbfgs.n_iter; + model.train_its = opt->iter; + // model.train_its += use_adam ? 
opt_params_adam.adam.n_iter : opt_params_lbfgs.lbfgs.n_iter; model.train_samples += n_batch; + model.train_tokens += n_batch * n_tokens; ggml_build_forward_expand(&gf, e); ggml_graph_compute(ctx0, &gf); @@ -1909,7 +2098,7 @@ int main(int argc, char ** argv) { printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); if (ex % 1 == 0) { - printf("Example %d\n", ex); + printf("Example %d, opt iter %d\n", ex, opt->iter); printf("error_before_opt: %.6f\n", error_before_opt); printf("error_after_opt: %.6f\n", error_after_opt); } @@ -1943,7 +2132,7 @@ int main(int argc, char ** argv) { ggml_free(ctx0); } - save_model(&model, fn_chkpt_out); + save_checkpoint(&model, opt, fn_chkpt_out); { int n_gen = 1024; From b763d6f1f233bedb7fc1c89dba0f3f39a59ba8c7 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 22 May 2023 16:54:21 +0200 Subject: [PATCH 41/86] remove unused functions --- examples/baby-llama/baby-llama-text.cpp | 68 ------------------------- 1 file changed, 68 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index ff213ea485435..b187bfd1728fa 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1144,72 +1144,6 @@ struct ggml_tensor * forward_batch_wo_cache( return inpL; } -void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { - assert(logits->n_dims == 2); - assert(probs->n_dims == 2); - assert(best_samples->n_dims == 1); - assert(logits->ne[1] == best_samples->ne[0]); - assert(logits->ne[0] == probs->ne[0]); - assert(logits->ne[1] == probs->ne[1]); - for (int i = 0; i < logits->ne[1]; ++i) { - float max_logit = ggml_get_f32_1d(logits, i * logits->ne[0]); - ggml_set_i32_1d(best_samples, i, 0); - for (int k = 0; k < logits->ne[0]; ++k) { - float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k); - if (logit > max_logit) { - max_logit = logit; - ggml_set_i32_1d(best_samples, i, k); - } - } - float psum = 0; - for (int k = 0; k < logits->ne[0]; ++k) { - float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k); - float p = (logit == -INFINITY) ? 
0 : expf(logit - max_logit); - psum += p; - ggml_set_f32_1d(probs, i * probs->ne[0] + k, p); - } - for (int k = 0; k < logits->ne[0]; ++k) { - float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); - ggml_set_f32_1d(probs, i * probs->ne[0] + k, p / psum); - } - } -} - -void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { - GGML_ASSERT(best_samples->n_dims == 2); - GGML_ASSERT(logits->n_dims == 3); - GGML_ASSERT(probs->n_dims == 3); - int n_tokens = best_samples->ne[0]; - int n_batch = best_samples->ne[1]; - int n_vocab = logits->ne[0]; - GGML_ASSERT(n_tokens == logits->ne[1]); - GGML_ASSERT(n_batch == logits->ne[2]); - GGML_ASSERT(n_vocab == probs->ne[0]); - GGML_ASSERT(n_tokens == probs->ne[1]); - GGML_ASSERT(n_batch == probs->ne[2]); - - for (int k = 0; k < n_batch; ++k) { - struct ggml_tensor * best_samples_k = ggml_view_1d(ctx, - best_samples, - best_samples->ne[0], - k*best_samples->nb[1]); - struct ggml_tensor * logits_k = ggml_view_2d(ctx, - logits, - logits->ne[0], - logits->ne[1], - logits->nb[1], - k*logits->nb[2]); - struct ggml_tensor * probs_k = ggml_view_2d(ctx, - probs, - probs->ne[0], - probs->ne[1], - probs->nb[1], - k*probs->nb[2]); - sample_softmax(logits_k, probs_k, best_samples_k); - } -} - - void print_row(struct ggml_tensor * probs, int i) { for (int k = 0; k < probs->ne[0]; ++k) { float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); @@ -2116,7 +2050,6 @@ int main(int argc, char ** argv) { } } - // sample_softmax_batch(ctx0, logits, after_opt_probs, after_opt_best_samples); // printf("probabilities after optimization:\n"); // print_matrix(after_opt_probs); printf("Example:\n---\n"); @@ -2184,7 +2117,6 @@ int main(int argc, char ** argv) { (float *) ((char *) logits->data + (sample_ctx-1)*logits->nb[1]), (llama_token *) tokens_input->data, sample_ctx-1); - // sample_softmax(logits, probs, best_samples); //int token = ggml_get_i32_1d(best_samples, sample_ctx-1); // print_row(probs, sample_at); From cc440bd4381bfc9bf2de464e9992cc4511e64969 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 22 May 2023 16:55:52 +0200 Subject: [PATCH 42/86] fix bug in get_samples which corrupted training targets --- examples/baby-llama/baby-llama-text.cpp | 34 +++++++++++++++++++++---- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index b187bfd1728fa..a21403a7733fe 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1184,16 +1184,40 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) if (isnl) { ++num_newline; } - if (!isnl || (num_newline < 2)) { - print_token(ctx, token); + if (isnl) { + if (num_newline < 2) { + print_token(ctx, token); + } else { + printf("\\n"); + } } else { - printf("\\n"); + print_token(ctx, token); } } printf("\n--\n"); } } +void set_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, float value) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *ptr = value; +} + +void set_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int32_t value) { + int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *ptr = value; +} + +float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + return *ptr; +} + 
+int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { + int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + return *ptr; +} + void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { int n_tokens = tokens_input->ne[0]; int n_vocab = target_logits->ne[0]; @@ -1209,8 +1233,8 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons ggml_set_i32_1d(tokens_input, 0, llama_token_bos()); for (int i=1; i Date: Mon, 22 May 2023 16:56:28 +0200 Subject: [PATCH 43/86] save checkpoint only when it was trained --- examples/baby-llama/baby-llama-text.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index a21403a7733fe..84ef911f849d4 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -2089,7 +2089,9 @@ int main(int argc, char ** argv) { ggml_free(ctx0); } - save_checkpoint(&model, opt, fn_chkpt_out); + if (n_examples > 0) { + save_checkpoint(&model, opt, fn_chkpt_out); + } { int n_gen = 1024; From d3acbf644e96fb5dd18d2bc9f4dd119c732a8f17 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 22 May 2023 20:53:57 +0200 Subject: [PATCH 44/86] simplify code --- examples/baby-llama/baby-llama-text.cpp | 190 +++++++++++++----------- 1 file changed, 106 insertions(+), 84 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 84ef911f849d4..9a193b81dd1b5 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1144,9 +1144,34 @@ struct ggml_tensor * forward_batch_wo_cache( return inpL; } +void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *ptr = value; +} + +void set_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, float value) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *ptr = value; +} + +void set_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int32_t value) { + int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *ptr = value; +} + +float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + return *ptr; +} + +int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { + int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + return *ptr; +} + void print_row(struct ggml_tensor * probs, int i) { for (int k = 0; k < probs->ne[0]; ++k) { - float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); + float p = get_f32_2d(probs, k, i); printf(" %.2f", p); } printf("\n"); @@ -1156,7 +1181,7 @@ void print_matrix(struct ggml_tensor * probs) { assert(probs->n_dims == 2); for (int i = 0; i < probs->ne[1]; ++i) { for (int k = 0; k < probs->ne[0]; ++k) { - float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); + float p = get_f32_2d(probs, k, i); printf(" %.2f", p); } printf("\n"); @@ -1179,52 +1204,30 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * 
tokens) for (int i1=0; i1ne[1]; ++i1) { int num_newline = 0; for (int i0=0; i0ne[0]; ++i0) { - int token = ggml_get_i32_1d(tokens, i0 + i1*tokens->ne[0]); - bool isnl = (token == llama_token_nl()); - if (isnl) { - ++num_newline; - } - if (isnl) { - if (num_newline < 2) { - print_token(ctx, token); - } else { - printf("\\n"); - } - } else { - print_token(ctx, token); - } + int token = get_i32_2d(tokens, i0, i1); + print_token(ctx, token); + // bool isnl = (token == llama_token_nl()); + // if (isnl) { + // ++num_newline; + // } + // if (isnl) { + // if (num_newline < 2) { + // print_token(ctx, token); + // } else { + // printf("\\n"); + // } + // } else { + // print_token(ctx, token); + // } } printf("\n--\n"); } } -void set_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, float value) { - float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *ptr = value; -} - -void set_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int32_t value) { - int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *ptr = value; -} - -float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { - float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - return *ptr; -} - -int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { - int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - return *ptr; -} - void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { int n_tokens = tokens_input->ne[0]; int n_vocab = target_logits->ne[0]; - const float eps = 1e-6f; - const float target_prob = 1.0f; - int sample = train_samples[example_id % n_train_samples]; GGML_ASSERT(sample+n_tokens-1 < n_train_data); @@ -1241,38 +1244,42 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons } } -void get_example_targets_batch(struct ggml_context * ctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { +void get_example_targets_batch(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { GGML_ASSERT(tokens_input->n_dims == 2); GGML_ASSERT(target_logits->n_dims == 3); GGML_ASSERT(target_probs->n_dims == 3); + int n_vocab = target_logits->ne[0]; int n_tokens = tokens_input->ne[0]; int n_batch = tokens_input->ne[1]; GGML_ASSERT(n_tokens == target_logits->ne[1]); GGML_ASSERT(n_batch == target_logits->ne[2]); + GGML_ASSERT(n_vocab == target_probs->ne[0]); GGML_ASSERT(n_tokens == target_probs->ne[1]); GGML_ASSERT(n_batch == target_probs->ne[2]); + ggml_set_f32(target_logits, -1.0f/n_vocab); + ggml_set_f32(target_probs, 0.0f); for (int k=0; kne[0], - k*tokens_input->nb[1]); - struct ggml_tensor * target_logits_k = ggml_view_2d(ctx, - target_logits, - target_logits->ne[0], - target_logits->ne[1], - target_logits->nb[1], - k*target_logits->nb[2]); - - struct ggml_tensor * target_probs_k = ggml_view_2d(ctx, - target_probs, - target_probs->ne[0], - target_probs->ne[1], 
- target_probs->nb[1], - k*target_probs->nb[2]); - - get_example_targets(train_samples, n_train_samples, train_data, n_train_data, - example_id*n_batch + k, tokens_input_k, target_logits_k, target_probs_k); + // printf("%s: batch %d\n", __func__, k); + int sample = train_samples[(example_id*n_batch + k) % n_train_samples]; + GGML_ASSERT(sample+n_tokens-1 < n_train_data); + + set_i32_2d(tokens_input, 0, k, llama_token_bos()); + for (int i=1; i= 0) { out.resize(n_tokens); } + bool verify = false; + if (verify) { + const char * in = buf.data(); + const char * end = buf.data() + buf.size(); + for (int i=0; i < out.size(); ++i) { + const char * s = llama_token_to_str(lctx, out[i]); + int len = strlen(s); + if (in >= end) { + printf("%s: unexpected end of original text.\n", __func__); + break; + } + const bool matches = (strncmp(in, s, len) == 0); + if (matches) { + in += len; + } else { + printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s); + } + } + } + return n_tokens; } @@ -1841,9 +1867,9 @@ float cosine_decay_restart(int decay_steps, const float alpha, int step, float r } int main(int argc, char ** argv) { - const char * default_model = "ggml-vic7b-uncensored-q4_0.bin"; - const char * default_train = "shakespeare.txt"; - const char * default_chkpt_in = "checkpoint.bin"; + const char * default_model = "ggml-vic7b-uncensored-q4_0.bin"; + const char * default_train = "shakespeare.txt"; + const char * default_chkpt_in = "checkpoint.bin"; const char * default_chkpt_out = "checkpoint.bin"; const char * default_argv[5] = {argv[0], default_model, default_train, default_chkpt_in, default_chkpt_out}; @@ -1890,6 +1916,7 @@ int main(int argc, char ** argv) { ++token_noccurs[train_tokens[i]]; token_notavail[train_tokens[i]] = false; } + std::vector token_freq; token_freq.resize(model.hparams.n_vocab, 0); int n_unique_tokens = 0; @@ -1901,10 +1928,9 @@ int main(int argc, char ** argv) { struct my_llama_kv_cache kv_self; - int n_batch = 32; struct ggml_init_params lcparams; - lcparams.mem_size = 1024ll*1024ll*1024ll*8ll; + lcparams.mem_size = 1024ll*1024ll*1024ll*2ll; lcparams.mem_buffer = NULL; lcparams.no_alloc = false; @@ -1913,15 +1939,21 @@ int main(int argc, char ** argv) { my_llama_sampler sampler; + int n_threads = 6; + int n_batch = 32; + int n_examples = 32; - int n_threads = 6; - + bool samples_start_after_nl = false; bool use_adam = true; - int warmup = 100; - int cos_decay_steps = 1000; + int warmup = 100; + int cos_decay_steps = 1000; float cos_decay_restart = 1.1f; - float cos_decay_alpha = 0.0f; + float cos_decay_alpha = 0.0f; + + + int n_tokens = model.hparams.n_ctx; + int n_vocab = model.hparams.n_vocab; struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context)); memset(opt, 0, sizeof(struct ggml_opt_context)); @@ -1965,12 +1997,7 @@ int main(int argc, char ** argv) { size_t compute_size = 1024ll*1024ll*1024ll*32ll; uint8_t * compute_addr = new uint8_t[compute_size]; - - int n_examples = 256; - int n_tokens = model.hparams.n_ctx; - int n_vocab = model.hparams.n_vocab; - bool samples_start_after_nl = false; std::vector train_samples; train_samples.push_back(0); @@ -2012,18 +2039,14 @@ int main(int argc, char ** argv) { ggml_cgraph gf = {}; gf.n_threads = n_threads; - get_example_targets_batch(ctx0, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); + get_example_targets_batch(lctx, train_samples.data(), 
train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); struct ggml_tensor * logits = (n_past == 0) ? forward_batch_wo_cache(&model, ctx0, &gf, tokens_input, n_tokens, n_batch) : forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch); - // struct ggml_tensor * se = square_error_loss(ctx0, logits, target_logits); - struct ggml_tensor * ce = cross_entropy_loss(ctx0, logits, target_probs); - // struct ggml_tensor * e = ggml_add(ctx0, se, ce); - struct ggml_tensor * e = ce; - // struct ggml_tensor * e = se; + struct ggml_tensor * e = cross_entropy_loss(ctx0, logits, target_probs); ggml_build_forward_expand(&gf, e); ggml_graph_compute(ctx0, &gf); @@ -2043,9 +2066,8 @@ int main(int argc, char ** argv) { size_t used_mem_after_opt = ggml_used_mem(ctx0); model.train_its = opt->iter; - // model.train_its += use_adam ? opt_params_adam.adam.n_iter : opt_params_lbfgs.lbfgs.n_iter; model.train_samples += n_batch; - model.train_tokens += n_batch * n_tokens; + model.train_tokens += n_batch * n_tokens; ggml_build_forward_expand(&gf, e); ggml_graph_compute(ctx0, &gf); From 6d40cc3a44768d71b1b7f978012c39d9c4ed5186 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 22 May 2023 20:56:35 +0200 Subject: [PATCH 45/86] remove trailing whitespace --- examples/baby-llama/baby-llama-text.cpp | 56 ++++++++++++------------- ggml.c | 4 +- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 9a193b81dd1b5..cf7fdbb0128b4 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -270,7 +270,7 @@ void init_model(struct my_llama_model * model) { layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str()); - + ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str()); ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str()); ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str()); @@ -1019,7 +1019,7 @@ struct ggml_tensor * forward_batch_wo_cache( // Vcur shape [N, n_batch, n_embd/n_head, n_head] // V shape [N, n_embd/n_head, n_head, n_batch] - struct ggml_tensor * V = + struct ggml_tensor * V = ggml_permute(ctx0, Vcur, 0, 3, 1, 2); @@ -1430,7 +1430,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto out.resize(buf.size()); int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false); - if (n_tokens >= 0) { + if (n_tokens >= 0) { out.resize(n_tokens); } @@ -1470,7 +1470,7 @@ void shuffle_ints(int * begin, int * end) { for (int i=0; in_ctx); llama_sample_repetition_penalty( - ctx, + ctx, candidates_p, last_tokens + n_last_tokens - n_last, n_last, params.repeat_penalty); llama_sample_frequency_and_presence_penalties( - ctx, + ctx, candidates_p, last_tokens + n_last_tokens - n_last, - n_last, - params.alpha_frequency, + n_last, + params.alpha_frequency, params.alpha_presence); if (!params.penalize_nl) { @@ -1572,7 +1572,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam llama_sample_top_k (ctx, candidates_p, params.top_k, 1); llama_sample_tail_free (ctx, candidates_p, params.tfs_z, 1); llama_sample_typical (ctx, candidates_p, params.typical_p, 1); - + llama_sample_top_p (ctx, candidates_p, params.top_p, 1); llama_sample_temperature (ctx, candidates_p, params.temp); token = llama_sample_token(ctx, 
candidates_p); @@ -1809,7 +1809,7 @@ bool load_checkpoint(struct my_llama_model * model, struct ggml_opt_context * op model->hparams.n_rot = file.read_u32(); print_params(&model->hparams); } - + if (init) { init_model(model); } @@ -1872,7 +1872,7 @@ int main(int argc, char ** argv) { const char * default_chkpt_in = "checkpoint.bin"; const char * default_chkpt_out = "checkpoint.bin"; const char * default_argv[5] = {argv[0], default_model, default_train, default_chkpt_in, default_chkpt_out}; - + if (argc < 5) { fprintf(stderr, "usage: %s model training_data chkpt_in chkpt_out\n", argv[0]); //return 1; @@ -1979,13 +1979,13 @@ int main(int argc, char ** argv) { printf("%s: init model\n", __func__); bool existed = load_checkpoint(&model, opt, fn_chkpt_in, true); set_param_model(&model); - + opt->iter = model.train_its; printf("%s: opt iter %d\n", __func__, opt->iter); bool from_scratch = !existed; - if (from_scratch) { - randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f); + if (from_scratch) { + randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f); } init_kv_cache(&kv_self, &model, 1); @@ -2041,8 +2041,8 @@ int main(int argc, char ** argv) { get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); - struct ggml_tensor * logits = - (n_past == 0) + struct ggml_tensor * logits = + (n_past == 0) ? forward_batch_wo_cache(&model, ctx0, &gf, tokens_input, n_tokens, n_batch) : forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch); @@ -2054,9 +2054,9 @@ int main(int argc, char ** argv) { size_t used_mem_before_opt = ggml_used_mem(ctx0); float error_before_opt = ggml_get_f32_1d(e, 0); - - opt->params.adam.sched = (opt->iter < warmup) - ? (float) opt->iter / (float) warmup + + opt->params.adam.sched = (opt->iter < warmup) + ? 
(float) opt->iter / (float) warmup : cosine_decay_restart(cos_decay_steps, cos_decay_alpha, opt->iter - warmup, cos_decay_restart); printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); @@ -2088,9 +2088,9 @@ int main(int argc, char ** argv) { for (int i=0; idata + i*logits->nb[2] + k*logits->nb[1]), - (llama_token *) ((char *) tokens_input->data + i*tokens_input->nb[1]), + int32_t token = sample(&sampler, + (float *) ((char *) logits->data + i*logits->nb[2] + k*logits->nb[1]), + (llama_token *) ((char *) tokens_input->data + i*tokens_input->nb[1]), k); * ((int32_t *) ((char *) after_opt_best_samples->data + i*after_opt_best_samples->nb[1] + k*after_opt_best_samples->nb[0])) = token; } @@ -2118,7 +2118,7 @@ int main(int argc, char ** argv) { { int n_gen = 1024; int sample_ctx = n_tokens - n_tokens/8; - + sampler.params.temp = 0.2; sampler.params.repeat_penalty = 1.1; sampler.params.mirostat = 2; @@ -2161,9 +2161,9 @@ int main(int argc, char ** argv) { struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx); // set_logits_masked(logits, token_notavail, -1e9); - int token = sample(&sampler, - (float *) ((char *) logits->data + (sample_ctx-1)*logits->nb[1]), - (llama_token *) tokens_input->data, + int token = sample(&sampler, + (float *) ((char *) logits->data + (sample_ctx-1)*logits->nb[1]), + (llama_token *) tokens_input->data, sample_ctx-1); //int token = ggml_get_i32_1d(best_samples, sample_ctx-1); @@ -2175,7 +2175,7 @@ int main(int argc, char ** argv) { ggml_set_i32_1d(tokens_input, sample_ctx-1, token); ggml_free(ctx0); - } + } } free(compute_addr); diff --git a/ggml.c b/ggml.c index cfc9bb455aec2..1ff5b97c25bce 100644 --- a/ggml.c +++ b/ggml.c @@ -9940,7 +9940,7 @@ static void ggml_compute_forward_out_prod_f32( const int64_t i3 = ir/(ne2*ne1); const int64_t i2 = (ir - i3*ne2*ne1)/ne1; const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); - + const int64_t i02 = i2; const int64_t i03 = i3; @@ -15296,7 +15296,7 @@ enum ggml_opt_result ggml_opt_resume( // build forward + backward compute graphs struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0)); struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 
1 : 0)); - + struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; From c47df098420606f5f85947f886b3b05be3fb9cd7 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 28 May 2023 17:32:01 +0200 Subject: [PATCH 46/86] simplify backward pass for SQRT --- ggml.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml.c b/ggml.c index 1ff5b97c25bce..1a964c5b263af 100644 --- a/ggml.c +++ b/ggml.c @@ -13063,11 +13063,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_impl(ctx, src0->grad, - ggml_mul(ctx, - tensor->grad, // this was not catched by test_grad because in test_grad tensor->grad is 1 + ggml_scale(ctx, ggml_div(ctx, - ggml_repeat(ctx, ggml_new_f32(ctx, 0.5f), tensor), - tensor)), + tensor->grad, + tensor), + ggml_new_f32(ctx, 0.5f)), inplace); } } break; From 05cb629c8efccb06166fd801fe2cad870fa80350 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 28 May 2023 18:00:17 +0200 Subject: [PATCH 47/86] replace inefficient repeat backward pass with dedicated repeat_back operation --- ggml.c | 183 +++++++++++++++++++++++++++++++++++++++++++++------------ ggml.h | 6 ++ 2 files changed, 150 insertions(+), 39 deletions(-) diff --git a/ggml.c b/ggml.c index 1a964c5b263af..0571777d19159 100644 --- a/ggml.c +++ b/ggml.c @@ -3297,6 +3297,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "SUM_ROWS", "MEAN", "REPEAT", + "REPEAT_BACK", "ABS", "SGN", "NEG", @@ -3340,7 +3341,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "MAP_BINARY", }; -static_assert(GGML_OP_COUNT == 52, "GGML_OP_COUNT != 52"); +static_assert(GGML_OP_COUNT == 53, "GGML_OP_COUNT != 53"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3359,6 +3360,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "Σx_k", "Σx/n", "repeat(x)", + "repeat_back(x)", "abs(x)", "sgn(x)", "-x", @@ -3402,7 +3404,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "f(x,y)", }; -static_assert(GGML_OP_COUNT == 52, "GGML_OP_COUNT != 52"); +static_assert(GGML_OP_COUNT == 53, "GGML_OP_COUNT != 53"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -4790,6 +4792,34 @@ struct ggml_tensor * ggml_repeat( return result; } +// ggml_repeat_back + +struct ggml_tensor * ggml_repeat_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_repeat(b, a)); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + if (ggml_are_same_shape(a, b) && !is_node) { + return a; + } + + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); + + result->op = GGML_OP_REPEAT_BACK; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + // ggml_abs struct ggml_tensor * ggml_abs_impl( @@ -8430,6 +8460,99 @@ static void ggml_compute_forward_repeat( } } +// ggml_compute_forward_repeat_back + +static void ggml_compute_forward_repeat_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_can_repeat(dst, src0)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + // guaranteed to be an integer due to the check in ggml_can_repeat + const int nr0 = (int)(ne00/ne0); + const int nr1 = (int)(ne01/ne1); + const int nr2 = (int)(ne02/ne2); + const int nr3 = (int)(ne03/ne3); + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (ggml_is_contiguous(dst)) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + } else { + for (int k3 = 0; k3 < ne3; k3++) { + for (int k2 = 0; k2 < ne2; k2++) { + for (int k1 = 0; k1 < ne1; k1++) { + ggml_vec_set_f32(ne0, + (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3), + 0); + } + } + } + } + + // TODO: maybe this is not optimal? 
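+    // sum the contributions of all repeated tiles in src0 into the corresponding cells of dst (the reverse of ggml_repeat)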
+ for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne3; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne2; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne1; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_vec_acc_f32(ne0, + (float *) ((char *) dst->data + ( k3)*nb3 + ( k2)*nb2 + ( k1)*nb1), + (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00)); + } + } + } + } + } + } + } +} + +static void ggml_compute_forward_repeat_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_repeat_back_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_abs static void ggml_compute_forward_abs_f32( @@ -12770,6 +12893,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_repeat(params, tensor->src0, tensor); } break; + case GGML_OP_REPEAT_BACK: + { + ggml_compute_forward_repeat_back(params, tensor->src0, tensor); + } break; case GGML_OP_ABS: { ggml_compute_forward_abs(params, tensor->src0, tensor); @@ -13113,43 +13240,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - GGML_ASSERT(src0->n_dims == 1 || src0->n_dims == 2); - const int nc = tensor->ne[0]; - const int nr = tensor->ne[1]; - const int nc0 = src0->ne[0]; - const int nr0 = src0->ne[1]; - const int ncr = nc/nc0; // guaranteed to be an integer due to the check in ggml_can_repeat - const int nrr = nr/nr0; // guaranteed to be an integer due to the check in ggml_can_repeat - // tensor->grad [nc,nr,1,1] - // reshape [nc0,nc/nc0,nr0,nr/nr0] - // permute [nc0,nr0,nc/nc0,nr/nr0] - // substitute [nc0,nr0,ncr,nrr] - // reshape [nc0*nr0,ncr*nrr,1,1] - // transpose [ncr*nrr,nc0*nr0,1,1] - // sum rows [1,nc0*nr0,1,1] - // transpose [nc0*nr0,1,1] - // reshape [nc0,nr0,1,1] reshape_1d or reshape_2d - // add to src0->grad - - int64_t ne[4] = {nc0,ncr,nr0,nrr}; - - struct ggml_tensor* F00 = tensor->grad; - struct ggml_tensor* F01 = ggml_reshape (ctx, F00, ggml_new_tensor(ctx,tensor->grad->type,4,ne)); - struct ggml_tensor* F02 = ggml_permute (ctx, F01, 0,2,1,3); - struct ggml_tensor* F03 = ggml_cont (ctx, F02); - struct ggml_tensor* F04 = ggml_reshape_2d(ctx, F03, nc0*nr0, ncr*nrr); - struct ggml_tensor* F05 = ggml_transpose (ctx, F04); - struct ggml_tensor* F06 = ggml_cont (ctx, F05); - struct ggml_tensor* F07 = ggml_sum_rows (ctx, F06); - struct ggml_tensor* F08 = ggml_transpose (ctx, F07); - struct ggml_tensor* F09 = ggml_cont (ctx, F08); - struct ggml_tensor* F10 = ggml_reshape (ctx, F09, src0->grad); - - src0->grad = - ggml_add_impl(ctx, - src0->grad, - F10, - inplace); + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_repeat_back(ctx, tensor->grad, src0->grad), + inplace); + } + } break; + case GGML_OP_REPEAT_BACK: + { + if (src0->grad) { + // TODO: test this + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_repeat(ctx, tensor->grad, src0->grad), + inplace); } } break; case GGML_OP_ABS: @@ -13941,6 +14045,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_SUM_ROWS: case GGML_OP_MEAN: case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: case GGML_OP_ABS: case GGML_OP_SGN: case GGML_OP_NEG: diff --git a/ggml.h b/ggml.h index 64de9eb3ea76f..711d34e78a25a 100644 --- 
a/ggml.h +++ b/ggml.h @@ -279,6 +279,7 @@ extern "C" { GGML_OP_SUM_ROWS, GGML_OP_MEAN, GGML_OP_REPEAT, + GGML_OP_REPEAT_BACK, GGML_OP_ABS, GGML_OP_SGN, GGML_OP_NEG, @@ -596,6 +597,11 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_repeat_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_abs( struct ggml_context * ctx, struct ggml_tensor * a); From 71aaf8dedf0e5e4e427e4251a041d2fe9e4d0656 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 28 May 2023 21:57:38 +0200 Subject: [PATCH 48/86] add ggml_cross_entropy_loss with backward pass for faster training cross entropy loss can also be implemented using softmax and log, but as dedicated operation it is faster and especially avoids unnecessary memory overhead. --- ggml.c | 377 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- ggml.h | 16 +++ 2 files changed, 391 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 0571777d19159..01604ec95ec50 100644 --- a/ggml.c +++ b/ggml.c @@ -3339,9 +3339,12 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "MAP_UNARY", "MAP_BINARY", + + "CROSS_ENTROPY_LOSS", + "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 53, "GGML_OP_COUNT != 53"); +static_assert(GGML_OP_COUNT == 55, "GGML_OP_COUNT != 55"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3402,9 +3405,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "f(x)", "f(x,y)", + + "cross_entropy_loss(x,y)", + "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 53, "GGML_OP_COUNT != 53"); +static_assert(GGML_OP_COUNT == 55, "GGML_OP_COUNT != 55"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -6347,6 +6353,50 @@ struct ggml_tensor * ggml_map_binary_inplace_f32( return ggml_map_binary_impl_f32(ctx, a, b, fun, true); } +// ggml_cross_entropy_loss + +struct ggml_tensor * ggml_cross_entropy_loss( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1); + + result->op = GGML_OP_CROSS_ENTROPY_LOSS; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +// ggml_cross_entropy_loss_back + +struct ggml_tensor * ggml_cross_entropy_loss_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + GGML_ASSERT(ggml_is_scalar(c)); + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK; + result->grad = NULL; + result->src0 = a; + result->src1 = b; + result->opt[0] = c; + + return result; +} + //////////////////////////////////////////////////////////////////////////////// void ggml_set_param( @@ -12831,6 +12881,287 @@ static void ggml_compute_forward_map_binary( } } +// ggml_compute_forward_cross_entropy_loss + +static void ggml_compute_forward_cross_entropy_loss_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_scalar(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, src1)); + + const int ith = params->ith; + const int nth = params->nth; + + float * sums = (float *) params->wdata; + + // TODO: handle transposed/permuted matrices + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + if (params->type == GGML_TASK_INIT) { + if (ith == 0) { + memset(sums, 0, sizeof(float) * (nth + nth * nc)); + } + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + if (ith == 0) { + float * dp = (float *) dst->data; + ggml_vec_sum_f32(nth, dp, sums); + dp[0] *= -1.0f; + } + return; + } + + const float eps = 1e-9f; + + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); + float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); + float * st = (float *) params->wdata + nth + ith*nc; + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(s0[i])); + assert(!isnan(s1[i])); + } +#endif + // soft_max + ggml_float sum = 0.0; + { + float max = -INFINITY; + ggml_vec_max_f32(nc, &max, s0); + + uint16_t scvt; + for (int i = 0; i < nc; i++) { + if (s0[i] == -INFINITY) { + st[i] = 0.0f; + } else { + // const float val = (s0[i] == -INFINITY) ? 
0.0 : exp(s0[i] - max); + ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); + memcpy(&scvt, &s, sizeof(scvt)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); + sum += (ggml_float)val; + st[i] = val; + } + } + + assert(sum > 0.0); + sum = 1.0/sum; + } + // avoid log(0) by rescaling from [0..1] to [eps..1] + sum = sum * (1.0f - eps); + ggml_vec_scale_f32(nc, st, sum); + ggml_vec_add1_f32(nc, st, st, eps); + ggml_vec_log_f32(nc, st, st); + ggml_vec_mul_f32(nc, st, st, s1); + + ggml_vec_sum_f32(nc, sums + ith, st); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(st[i])); + assert(!isinf(st[i])); + } +#endif + } + +} + +static void ggml_compute_forward_cross_entropy_loss( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_cross_entropy_loss_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_cross_entropy_loss_back + +static void ggml_compute_forward_cross_entropy_loss_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(opt0)); + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + const int64_t ith = params->ith; + const int64_t nth = params->nth; + + float * sums = (float *) params->wdata; + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const float eps = 1e-9f; + + // TODO: handle transposed/permuted matrices + const int64_t nc = src0->ne[0]; + const int64_t nr = ggml_nrows(src0); + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + float * d = (float *) opt0->data; + + for (int64_t i1 = ir0; i1 < ir1; i1++) { + float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]); + float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); + float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); + float * sm = (float *) params->wdata + ith*nc; + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(s0[i])); + assert(!isnan(s1[i])); + } +#endif + // step by step explanation: + { + // forward pass with annotated gradients from backward pass + // (built by going in reverse operation order, adding to gradients of current operation args) + // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum + // from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1])) + // ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps) + // ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3] + // ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3 + // ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1 + // ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]] + // ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel] + + // substitute into grad[st1], because we can reuse softmax_back 
from this point on + // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps)) + // postorder: + // grad[st1] := softmax(s0) + // grad[st1] := grad[st1]*(1.0 - eps) + // grad[st1] := grad[st1] + eps + // grad[st1] := s1 / grad[st1] + // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel] + + // src0 gradients by going through softmax_back + // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1])) + // from softmax_back: + // dxk = yk * (dyk - dot(y, dy)) + // dot_y_dy := dot(y, dy) + // dx := dy + // dx := dx - dot_y_dy + // dx := dx * y + // postorder: + // dot_st1_dst1 := dot(st1, grad[st1]) + // grad[s0] := grad[st1] + // grad[s0] := grad[s0] - dot_st1_dst1 + // grad[s0] := grad[s0] * st1 + + // prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1] + // sm := softmax(s0) + // grad[s0] := sm*(1.0 - eps) + // grad[s0] := grad[s0] + eps + // grad[s0] := s1 / grad[s0] + // grad[s0] := grad[s0]*(1.0-eps)*-grad[cel] + // dot_st1_dst1 := dot(sm, grad[s0]) + // grad[s0] := grad[s0] - dot_st1_dst1 + // grad[s0] := grad[s0] * sm + } + + // soft_max + ggml_float sum = 0.0; + { + float max = -INFINITY; + ggml_vec_max_f32(nc, &max, s0); + + uint16_t scvt; + for (int i = 0; i < nc; i++) { + if (s0[i] == -INFINITY) { + sm[i] = 0.0f; + } else { + // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max); + ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); + memcpy(&scvt, &s, sizeof(scvt)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); + sum += (ggml_float)val; + sm[i] = val; + } + } + + assert(sum > 0.0); + sum = 1.0/sum; + } + + float dot_st1_dst1 = 0; + ggml_vec_scale_f32(nc, sm, sum); + ggml_vec_cpy_f32 (nc, ds0, sm); + ggml_vec_scale_f32(nc, ds0, (1.0 - eps)); + ggml_vec_add1_f32 (nc, ds0, ds0, eps); + ggml_vec_div_f32 (nc, ds0, s1, ds0); + ggml_vec_scale_f32(nc, ds0, -(1.0 - eps)*d[0]); + ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0); + ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1); + ggml_vec_mul_f32 (nc, ds0, ds0, sm); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(sm[i])); + assert(!isinf(sm[i])); + assert(!isnan(ds0[i])); + assert(!isinf(ds0[i])); + } +#endif + } +} + +static void ggml_compute_forward_cross_entropy_loss_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_cross_entropy_loss_back_f32(params, src0, src1, opt0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + + ///////////////////////////////// static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { @@ -13052,6 +13383,16 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun); } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor); + } + break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + ggml_compute_forward_cross_entropy_loss_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + } + break; case GGML_OP_NONE: { // nop @@ -13677,6 +14018,22 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // not supported } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, 
+ src0->grad, + ggml_cross_entropy_loss_back(ctx, + src0, + src1, + tensor->grad), + inplace); + } + } break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + GGML_ASSERT(false); // not supported + } break; case GGML_OP_NONE: { // nop @@ -14225,6 +14582,22 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { node->n_tasks = 1; } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + node->n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks); + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + node->n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks; + + work_size = MAX(work_size, cur); + } break; case GGML_OP_NONE: { node->n_tasks = 1; diff --git a/ggml.h b/ggml.h index 711d34e78a25a..ba60588d6b521 100644 --- a/ggml.h +++ b/ggml.h @@ -322,6 +322,9 @@ extern "C" { GGML_OP_MAP_UNARY, GGML_OP_MAP_BINARY, + GGML_OP_CROSS_ENTROPY_LOSS, + GGML_OP_CROSS_ENTROPY_LOSS_BACK, + GGML_OP_COUNT, }; @@ -972,6 +975,19 @@ extern "C" { struct ggml_tensor * b, ggml_binary_op_f32_t fun); + // loss function + + GGML_API struct ggml_tensor * ggml_cross_entropy_loss( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c); + // // automatic differentiation // From f056a04a80127b1d45d91335ebd814b5c7a18a73 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 28 May 2023 21:59:17 +0200 Subject: [PATCH 49/86] add tests for cross_entropy_loss backward pass finite differences regularly results in estimated gradient of zero, despite the backward pass giving non zero gradient. _probably_ the finite differences fails due to numerical issues --- tests/test-grad0.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index ec5059220078d..b7d68cad9fd28 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -1090,6 +1090,25 @@ int main(int argc, const char ** argv) { } } + // cross_entropy_loss + { + const int nargs = 1; + + int64_t ne2[4]; + get_random_dims(ne2, 4); + + for (int ndims = 1; ndims <= 3; ++ndims) { + x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f); + x[1] = get_random_tensor(ctx0, ndims, ne2, 0.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1])); + + check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-1f, 1e-2f, INFINITY); + // finite differences regularly fails! 
+            }
+        }
+
         // rope
         {
             const int nargs = 1;

From 1fbd19abe162d26fa51e16d607b4ed2f3b43109d Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sun, 28 May 2023 22:00:26 +0200
Subject: [PATCH 50/86] use ggml_cross_entropy_loss in text training example

---
 examples/baby-llama/baby-llama-text.cpp | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp
index cf7fdbb0128b4..22f4b56a33043 100644
--- a/examples/baby-llama/baby-llama-text.cpp
+++ b/examples/baby-llama/baby-llama-text.cpp
@@ -1237,7 +1237,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
 for (int i=1; i
Date: Sun, 28 May 2023 22:00:56 +0200
Subject: [PATCH 51/86] remove trailing whitespace

---
 ggml.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml.c b/ggml.c
index 01604ec95ec50..bdd29ac50f389 100644
--- a/ggml.c
+++ b/ggml.c
@@ -8559,8 +8559,8 @@ static void ggml_compute_forward_repeat_back_f32(
     for (int k3 = 0; k3 < ne3; k3++) {
         for (int k2 = 0; k2 < ne2; k2++) {
             for (int k1 = 0; k1 < ne1; k1++) {
-                ggml_vec_set_f32(ne0, 
-                    (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3), 
+                ggml_vec_set_f32(ne0,
+                    (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3),
                     0);
             }
         }

From 89475fb320168e0a82f19e74285748f843106242 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sun, 28 May 2023 22:40:58 +0200
Subject: [PATCH 52/86] slightly improve how cross entropy loss is computed

btw: directly implemented cross entropy loss seems to have way lower
magnitudes than when implemented with softmax and log.
probably the input to log gets closer to zero due to float numerics.
maybe the multiplication by (1.0-eps)/sum is more accurate..
---
 ggml.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml.c b/ggml.c
index bdd29ac50f389..b75d55c3d6a72 100644
--- a/ggml.c
+++ b/ggml.c
@@ -12961,10 +12961,10 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
             }
 
             assert(sum > 0.0);
-            sum = 1.0/sum;
+            // sum = 1.0/sum;
         }
         // avoid log(0) by rescaling from [0..1] to [eps..1]
-        sum = sum * (1.0f - eps);
+        sum = (1.0f - eps) / sum;
         ggml_vec_scale_f32(nc, st, sum);
         ggml_vec_add1_f32(nc, st, st, eps);
         ggml_vec_log_f32(nc, st, st);

From bf4d9b3b812bf34c72258ec841b940911c336bcb Mon Sep 17 00:00:00 2001
From: xaedes
Date: Mon, 29 May 2023 01:20:26 +0200
Subject: [PATCH 53/86] add llama_get_vocab to get the vocabulary as output parameters

---
 llama.cpp | 13 +++++++++++++
 llama.h   |  8 ++++++++
 2 files changed, 21 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index ca61a69e0bb17..3095c71738175 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2852,6 +2852,19 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
+int llama_get_vocab(
+        const struct llama_context * ctx,
+        const char * * strings,
+        float  * scores,
+        int capacity) {
+    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+    for (int i = 0; i < n; i++) {
+        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
+        scores[i]  = ctx->vocab.id_to_token[i].score;
+    }
+    return n;
+}
+
 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }
diff --git a/llama.h b/llama.h
index 21cba8cf61061..33385d0ea4192 100644
--- a/llama.h
+++ b/llama.h
@@ -172,6 +172,14 @@ extern "C" {
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);
 
+    // Get the vocabulary as output parameters.
+    // Returns number of results.
+ LLAMA_API int llama_get_vocab( + const struct llama_context * ctx, + const char * * strings, + float * scores, + int capacity); + // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row // Can be mutated in order to change the probabilities of the next token From 2da5c8cf246518806cb7aea830a0ed0ffa146ed8 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 29 May 2023 01:20:55 +0200 Subject: [PATCH 54/86] set default model.type for unknown models with few layers --- llama.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llama.cpp b/llama.cpp index 3095c71738175..a05a9fc62baa3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -905,6 +905,12 @@ static void llama_model_load_internal( case 40: model.type = e_model::MODEL_13B; break; case 60: model.type = e_model::MODEL_30B; break; case 80: model.type = e_model::MODEL_65B; break; + default: + { + if (hparams.n_layer < 32) { + model.type = e_model::MODEL_7B; + } + } break; } hparams.n_ctx = n_ctx; From 4b81c32d5bafed56e73ae50f33737190e7cb4457 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 29 May 2023 01:27:09 +0200 Subject: [PATCH 55/86] add export of training checkpoint to llama compatible model file --- examples/baby-llama/baby-llama-text.cpp | 97 +++++++++++++++++++++---- 1 file changed, 83 insertions(+), 14 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 22f4b56a33043..34a6d10511bbd 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -150,6 +150,19 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc return tensor; } +struct llama_vocab { + using id = int32_t; + using token = std::string; + + struct token_score { + token tok; + float score; + }; + + std::unordered_map token_to_id; + std::vector id_to_token; +}; + struct my_llama_hparams { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; // this is provided as user input? @@ -278,9 +291,20 @@ void init_model(struct my_llama_model * model) { ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str()); - ggml_set_name(layer.w1, (layers_i + ".feed_forward.w1.weight").c_str()); - ggml_set_name(layer.w2, (layers_i + ".feed_forward.w2.weight").c_str()); - ggml_set_name(layer.w3, (layers_i + ".feed_forward.w3.weight").c_str()); + // 'layers.10.feed_forward.w1.weight' has length of 32. + // ggml_tensor->name only has 32 characters, but we need one more for the '\0' terminator. + // ggml_set_name will set the last character to '\0', so we can only store 'layers.10.feed_forward.w1.weigh'. + // when saving llama compatible model the tensors names will miss a character. 
+ // ggml_set_name(layer.w1, (layers_i + ".feed_forward.w1.weight").c_str()); + // ggml_set_name(layer.w2, (layers_i + ".feed_forward.w2.weight").c_str()); + // ggml_set_name(layer.w3, (layers_i + ".feed_forward.w3.weight").c_str()); + + strncpy(layer.w1->name, (layers_i + ".feed_forward.w1.weight").c_str(), sizeof(layer.w1->name)); + strncpy(layer.w2->name, (layers_i + ".feed_forward.w2.weight").c_str(), sizeof(layer.w2->name)); + strncpy(layer.w3->name, (layers_i + ".feed_forward.w3.weight").c_str(), sizeof(layer.w3->name)); + layer.w1->padding[0] = 0; + layer.w2->padding[0] = 0; + layer.w3->padding[0] = 0; } } @@ -1584,13 +1608,6 @@ void set_logits_masked(struct ggml_tensor * logits, std::vector& mask, flo } } -enum llama_file_version { - LLAMA_FILE_VERSION_GGML, - LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab - LLAMA_FILE_VERSION_GGJT_V1, // added padding - LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format -}; - void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { if (tensor == NULL) { file->write_u32(0); @@ -1627,7 +1644,7 @@ void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { } std::string name = file->read_string(name_len); - GGML_ASSERT(strcmp(ggml_get_name(tensor), name.c_str()) == 0); + GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)) == 0); file->seek(-file->tell() & 31, SEEK_CUR); file->read_raw(tensor->data, ggml_nbytes(tensor)); @@ -1839,6 +1856,50 @@ bool load_checkpoint(struct my_llama_model * model, struct ggml_opt_context * op return (file.fp != NULL); } +void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, const char * filename) { + struct llama_file file(filename, "wb"); + if (file.fp == NULL) { + return; + } + + // write_magic + file.write_u32(LLAMA_FILE_MAGIC); // magic + file.write_u32(LLAMA_FILE_VERSION); // version + // write_hparams + file.write_u32(model->hparams.n_vocab); + file.write_u32(model->hparams.n_embd); + file.write_u32(model->hparams.n_mult); + file.write_u32(model->hparams.n_head); + file.write_u32(model->hparams.n_layer); + file.write_u32(model->hparams.n_rot); + file.write_u32(LLAMA_FTYPE_ALL_F32); + // write_vocab + uint32_t n_vocab = model->hparams.n_vocab; + for (uint32_t i = 0; i < n_vocab; i++) { + const auto & token_score = vocab->id_to_token.at(i); + file.write_u32((uint32_t) token_score.tok.size()); + file.write_raw(token_score.tok.data(), token_score.tok.size()); + file.write_raw(&token_score.score, sizeof(token_score.score)); + } + // write tensors + write_tensor(&file, model->tok_embeddings); + write_tensor(&file, model->norm); + write_tensor(&file, model->output); + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + auto & layer = model->layers[i]; + + write_tensor(&file, layer.attention_norm); + write_tensor(&file, layer.wq); + write_tensor(&file, layer.wk); + write_tensor(&file, layer.wv); + write_tensor(&file, layer.wo); + write_tensor(&file, layer.ffn_norm); + write_tensor(&file, layer.w1); + write_tensor(&file, layer.w2); + write_tensor(&file, layer.w3); + } +} + float cosine_decay(const int decay_steps, const float alpha, int step) { if (step > decay_steps) { step = decay_steps; @@ -1861,10 +1922,11 @@ int main(int argc, char ** argv) { const char * default_train = "shakespeare.txt"; const char * default_chkpt_in = "checkpoint.bin"; const char * default_chkpt_out = "checkpoint.bin"; - const char * default_argv[5] = {argv[0], default_model, default_train, default_chkpt_in, 
default_chkpt_out}; + const char * default_model_out = "ggml-checkpoint-f32.bin"; + const char * default_argv[6] = {argv[0], default_model, default_train, default_chkpt_in, default_chkpt_out, default_model_out}; - if (argc < 5) { - fprintf(stderr, "usage: %s model training_data chkpt_in chkpt_out\n", argv[0]); + if (argc < 6) { + fprintf(stderr, "usage: %s model training_data chkpt_in chkpt_out model_out\n", argv[0]); //return 1; } @@ -1874,6 +1936,7 @@ int main(int argc, char ** argv) { const char * fn_train = (argc >= 3) ? argv[2] : default_argv[2]; const char * fn_chkpt_in = (argc >= 4) ? argv[3] : default_argv[3]; const char * fn_chkpt_out = (argc >= 5) ? argv[4] : default_argv[4]; + const char * fn_model_out = (argc >= 6) ? argv[5] : default_argv[5]; struct llama_context_params llama_params = llama_context_default_params(); llama_params.vocab_only = true; @@ -1970,6 +2033,8 @@ int main(int argc, char ** argv) { bool existed = load_checkpoint(&model, opt, fn_chkpt_in, true); set_param_model(&model); + opt->params = use_adam ? opt_params_adam : opt_params_lbfgs; + opt->iter = model.train_its; printf("%s: opt iter %d\n", __func__, opt->iter); @@ -2105,6 +2170,10 @@ int main(int argc, char ** argv) { save_checkpoint(&model, opt, fn_chkpt_out); } + if (strlen(fn_model_out) > 0) { + save_as_llama_model(&vocab, &model, fn_model_out); + } + { int n_gen = 1024; int sample_ctx = n_tokens - n_tokens/8; From 56895e28f6630457f6a02d82feee62d05f50c134 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 29 May 2023 02:25:18 +0200 Subject: [PATCH 56/86] get vocabulary for exporting training checkpoint to llama compatible model file --- examples/baby-llama/baby-llama-text.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 34a6d10511bbd..267f44321bbef 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1943,6 +1943,25 @@ int main(int argc, char ** argv) { struct llama_context * lctx = llama_init_from_file(fn_model, llama_params); + struct llama_vocab vocab; + { + std::vector strings; + std::vector scores; + int n_vocab = llama_n_vocab(lctx); + strings.resize(n_vocab, NULL); + scores.resize(n_vocab, 0); + n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab); + GGML_ASSERT(n_vocab == llama_n_vocab(lctx)); + vocab.id_to_token.resize(n_vocab); + for (int i=0; i train_tokens; if (tokenize_file(lctx, fn_train, train_tokens) < 0) { From 22a7279ffb2c7669926ab899df57154d144d893f Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 29 May 2023 22:00:40 +0200 Subject: [PATCH 57/86] implement backward pass of flash attention --- ggml.c | 647 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- ggml.h | 9 + 2 files changed, 652 insertions(+), 4 deletions(-) diff --git a/ggml.c b/ggml.c index b75d55c3d6a72..353b42cee2679 100644 --- a/ggml.c +++ b/ggml.c @@ -3336,6 +3336,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "FLASH_ATTN", "FLASH_FF", + "FLASH_ATTN_BACK", "MAP_UNARY", "MAP_BINARY", @@ -3344,7 +3345,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 55, "GGML_OP_COUNT != 55"); +static_assert(GGML_OP_COUNT == 56, "GGML_OP_COUNT != 56"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3402,6 +3403,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "flash_attn(x)", "flash_ff(x)", + "flash_attn_back(x)", "f(x)", "f(x,y)", @@ -3410,7 
+3412,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 55, "GGML_OP_COUNT != 55"); +static_assert(GGML_OP_COUNT == 56, "GGML_OP_COUNT != 56"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -6251,7 +6253,6 @@ struct ggml_tensor * ggml_flash_ff( bool is_node = false; if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) { - GGML_ASSERT(false); // TODO: implement backward is_node = true; } @@ -6269,6 +6270,71 @@ struct ggml_tensor * ggml_flash_ff( return result; } +// ggml_flash_attn_back + +struct ggml_tensor * ggml_flash_attn_back( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * d, + bool masked) { + GGML_ASSERT(ggml_can_mul_mat(k, q)); + // TODO: check if vT can be multiplied by (k*qT) + + // d shape [D,N,ne2,ne3] + // q shape [D,N,ne2,ne3] + // k shape [D,M,ne2,ne3] + // v shape [M,D,ne2,ne3] + + const int64_t D = q->ne[0]; + const int64_t N = q->ne[1]; + const int64_t M = k->ne[1]; + const int64_t ne2 = q->ne[2]; + const int64_t ne3 = q->ne[3]; + + GGML_ASSERT(k->ne[0] == D); + GGML_ASSERT(v->ne[0] == M); + GGML_ASSERT(v->ne[1] == D); + GGML_ASSERT(d->ne[0] == D); + GGML_ASSERT(d->ne[1] == N); + GGML_ASSERT(k->ne[2] == ne2); + GGML_ASSERT(k->ne[3] == ne3); + GGML_ASSERT(v->ne[2] == ne2); + GGML_ASSERT(v->ne[3] == ne3); + GGML_ASSERT(d->ne[2] == ne2); + GGML_ASSERT(d->ne[3] == ne3); + + bool is_node = false; + + if (q->grad || k->grad || v->grad) { + // when using this operation (in backwards pass) these grads are set. + // we don't want to create (big) grad of our result, so is_node is false. + is_node = false; + } + + // store gradients of q, k and v as continuous tensors concatenated in result. + // q shape[D,N,ne2,ne3] ; k shape [D,M,ne2,ne3] ; v shape [M,D,ne2,ne3] + // gradq->data = result->data + // gradk->data = result->data + nb0*D*N*ne2*ne3 + // gradv->data = result->data + nb0*D*N*ne2*ne3 + nb0*D*M*ne2*ne3 + // note: v and gradv are actually transposed, i.e. v->ne[0] != D. + int64_t ne[4] = {D,M+N+M,ne2,ne3}; + + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + result->op = GGML_OP_FLASH_ATTN_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = q; + result->src1 = k; + result->opt[0] = v; + result->opt[1] = d; + result->opt[2] = ggml_new_i32(ctx, masked ? 
1 : 0); + + return result; +} + + // ggml_map_unary struct ggml_tensor * ggml_map_unary_impl_f32( @@ -12788,6 +12854,394 @@ static void ggml_compute_forward_flash_ff( } } +// ggml_compute_forward_flash_attn_back + +static void ggml_compute_forward_flash_attn_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const struct ggml_tensor * d, + const bool masked, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + const int64_t neq0 = q->ne[0]; + const int64_t neq1 = q->ne[1]; + const int64_t neq2 = q->ne[2]; + const int64_t neq3 = q->ne[3]; + + const int64_t nek0 = k->ne[0]; + const int64_t nek1 = k->ne[1]; + //const int64_t nek2 = k->ne[2]; + //const int64_t nek3 = k->ne[3]; + + const int64_t nev0 = v->ne[0]; + const int64_t nev1 = v->ne[1]; + //const int64_t nev2 = v->ne[2]; + //const int64_t nev3 = v->ne[3]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + //const int64_t ne2 = dst->ne[2]; + //const int64_t ne3 = dst->ne[3]; + + const int nbk0 = k->nb[0]; + const int nbk1 = k->nb[1]; + const int nbk2 = k->nb[2]; + const int nbk3 = k->nb[3]; + + const int nbq0 = q->nb[0]; + const int nbq1 = q->nb[1]; + const int nbq2 = q->nb[2]; + const int nbq3 = q->nb[3]; + + const int nbv0 = v->nb[0]; + const int nbv1 = v->nb[1]; + const int nbv2 = v->nb[2]; + const int nbv3 = v->nb[3]; + + const int nbd0 = d->nb[0]; + const int nbd1 = d->nb[1]; + const int nbd2 = d->nb[2]; + const int nbd3 = d->nb[3]; + + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t D = neq0; + const int64_t N = neq1; + const int64_t P = nek1 - N; + const int64_t M = P + N; + + const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); + const int mxDM = MAX(D, Mup); + + GGML_ASSERT(ne0 == D); + GGML_ASSERT(ne1 == N); + GGML_ASSERT(P >= 0); + + GGML_ASSERT(nbq0 == sizeof(float)); + GGML_ASSERT(nbk0 == sizeof(float)); + GGML_ASSERT(nbv0 == sizeof(float)); + + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev1 == D); + + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nek1 == N + P); + GGML_ASSERT(nev1 == D); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by q rows using ggml_vec_dot_f32 + + // total rows in q + const int nr = neq1*neq2*neq3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const float scale = 1.0f/sqrtf(D); + + //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + + for (int ir = ir0; ir < ir1; ++ir) { + // q indices + const int iq3 = ir/(neq2*neq1); + const int iq2 = (ir - iq3*neq2*neq1)/neq1; + const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); + + // not sure about CACHE_LINE_SIZE_F32.. + // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset? 
+ float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32); + float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32); + + for (int i = M; i < Mup; ++i) { + S[i] = -INFINITY; + } + + for (int64_t ic = 0; ic < nek1; ++ic) { + // k indices + const int ik3 = iq3; + const int ik2 = iq2; + const int ik1 = ic; + + // S indices + const int i1 = ik1; + + ggml_vec_dot_f32(neq0, + S + i1, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } + + // scale + ggml_vec_scale_f32(nek1, S, scale); + + if (masked) { + for (int64_t i = P; i < M; i++) { + if (i > P + iq1) { + S[i] = -INFINITY; + } + } + } + + // softmax + { + float max = -INFINITY; + ggml_vec_max_f32(M, &max, S); + + ggml_float sum = 0.0; + { +#ifdef GGML_SOFT_MAX_ACCELERATE + max = -max; + vDSP_vsadd(SM, 1, &max, SM, 1, Mup); + vvexpf(SM, SM, &Mup); + ggml_vec_sum_f32(Mup, &sum, SM); +#else + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; + ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; + + for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { + float * SS = SM + i; + + for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { + if (SS[j] == -INFINITY) { + SS[j] = 0.0f; + } else { + ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); + memcpy(&scvt[j], &s, sizeof(uint16_t)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + sump[j] += (ggml_float)val; + SS[j] = val; + } + } + } + + for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { + sum += sump[i]; + } +#endif + } + + assert(sum > 0.0); + + sum = 1.0/sum; + ggml_vec_scale_f32(M, SM, sum); + + } + + // step-by-step explanation + { + // forward-process shape grads from backward process + // parallel_for iq2,iq3: + // k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,iq2,iq3] += grad[kcur] + // q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur] + // v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iq2,iq3] += grad[vcur] + // for iq1: + // kcur = k[:D,:M,iq2,iq3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur + // qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur + // vcur = v[:M,:D,iq2,iq3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 + // S0 = -Inf [D,1,1,1] + // ~S1[i] = dot(kcur[:D,i], qcur) + // S1 = qcur.T @ kcur [M,1,1,1] grad[S1] = grad[S2] * scale + // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) + // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur + // ~S5[i] = dot(vcur[:,i],S4) + // S5 = S4.T @ vcur [D,1,1,1] grad[S5] = d[:D,iq1,iq2,iq3] + // ~dst[i,iq1,iq2,iq3] = S5[i] ^ + // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,iq1,iq2,iq3] + // dst backward-/ grad[dst] = d + // + // output gradients with their dependencies: + // + // grad[kcur] = grad[S1].T @ qcur + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S4] = grad[S5] @ vcur + // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur + // grad[qcur] = grad[S1] @ kcur + // grad[vcur] = grad[S5].T @ S4 + // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 + // + // in post-order: + // + // S1 = qcur.T @ kcur + // S2 = S1 * scale + // S3 = diag_mask_inf(S2, P) + // S4 = softmax(S3) + // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[qcur] = grad[S1] @ kcur + // grad[kcur] = grad[S1].T @ qcur + // grad[vcur] = 
d[:D,iq1,iq2,iq3].T @ S4 + // + // using less variables (SM=S4): + // + // S = diag_mask_inf(qcur.T @ kcur * scale, P) + // SM = softmax(S) + // S = d[:D,iq1,iq2,iq3] @ vcur + // dot_SM_gradSM = dot(SM, S) + // S = SM * (S - dot(SM, S)) + // S = diag_mask_zero(S, P) * scale + // + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[k][:D,:M,iq2,iq3] += S.T @ qcur + // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM + } + + // S = gradSM = d[:D,iq1,iq2,iq3] @ vcur + // S = d[:D,iq1,iq2,iq3] @ vcur + // S[:M] += vcur[:,ic] * d[ic,iq1,iq2,iq3] + ggml_vec_set_f32(D, S, 0); + for (int64_t ic = 0; ic < D; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + ggml_vec_mad_f32(M, + S, + (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + *(float *) ((char *) d->data + (ic*nbd1 + i1*nbd2 + i2*nbd2 + i3*nbd3))); + } + + // S = SM * (S - dot(SM, S)) + float dot_SM_gradSM = 0; + ggml_vec_dot_f32 (M, &dot_SM_gradSM, SM, S); + ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); + ggml_vec_mul_f32 (M, S, S, SM); + + // S = diag_mask_zero(S, P) * scale + if (masked) { + for (int64_t i = P + iq1 + 1; i < M; i++) { + S[i] = 0; + } + } + ggml_vec_scale_f32(M, S, scale); + + void * grad_q = (char *) dst->data; + void * grad_k = (char *) dst->data + nb0*D*N*neq2*neq3; + void * grad_v = (char *) dst->data + nb0*D*N*neq2*neq3 + nb0*D*M*neq2*neq3; + + const size_t nbgq1 = nb0*neq0; + const size_t nbgq2 = nb0*neq0*neq1; + const size_t nbgq3 = nb0*neq0*neq1*neq2; + + const size_t nbgk1 = nb0*nek0; + const size_t nbgk2 = nb0*nek0*nek1; + const size_t nbgk3 = nb0*nek0*nek1*neq2; + + const size_t nbgv1 = nb0*nev0; + const size_t nbgv2 = nb0*nev0*nev1; + const size_t nbgv3 = nb0*nev0*nev1*neq2; + + // S shape [M,1] + // SM shape [M,1] + // kcur shape [D,M] + // qcur shape [D,1] + // vcur shape [M,D] + // + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] + // grad[q][ic,iq1,iq2,iq3] += dot(kcur[:,ic],S.T) + // grad[q][ic,iq1,iq2,iq3] += dot(k[:D,ic,iq2,iq3],S.T) + for (int64_t ic = 0; ic < M; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + ggml_vec_dot_f32(D, + (float *) ((char *) grad_q + (ic*nb0 + i1*nbgq1 + i2*nbgq2 + i3*nbgq3)), + (float *) ((char *) k->data + ( ic*nbk1 + i2*nbk2 + i3*nbk3)), + S); + } + + // grad[k][:D,:M,iq2,iq3] += S.T @ qcur + // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] + // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] + for (int64_t ic = 0; ic < M; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + ggml_vec_set_f32(D, + (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), + 0); + ggml_vec_mad_f32(D, + (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), + (float *) ((char *) q->data + (i1*nbk1 + i2*nbk2 + i3*nbk3)), + S[ic]); + } + + // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM + // grad[v][:M,ic,iq2,iq3] += d[:D,iq1,iq2,iq3].T[0,ic] * SM[:M] + // grad[v][:M,ic,iq2,iq3] += d[ic,iq1,iq2,iq3] * SM[:M] + for (int64_t ic = 0; ic < D; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + ggml_vec_set_f32(M, + (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), + 0); + ggml_vec_mad_f32(M, + (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), + SM, + *(float *) ((char *) d->data + (ic*nbd1 + i1*nbd2 + i2*nbd2 + i3*nbd3))); + } + } +} + +static void ggml_compute_forward_flash_attn_back( + const struct ggml_compute_params 
* params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const struct ggml_tensor * d, + const bool masked, + struct ggml_tensor * dst) { + switch (q->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_flash_attn_back_f32(params, q, k, v, d, masked, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_map_unary static void ggml_compute_forward_map_unary_f32( @@ -13371,6 +13825,13 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor); } break; + case GGML_OP_FLASH_ATTN_BACK: + { + int32_t t = ggml_get_i32_1d(tensor->opt[2], 0); + GGML_ASSERT(t == 0 || t == 1); + bool masked = t != 0; + ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor); + } break; case GGML_OP_MAP_UNARY: { const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data); @@ -14007,12 +14468,169 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_FLASH_ATTN: { - GGML_ASSERT(false); // not supported + struct ggml_tensor * flash_grad = NULL; + if (src0->grad || src1->grad || tensor->opt[0]->grad) { + int32_t t = ggml_get_i32_1d(tensor->opt[1], 0); + GGML_ASSERT(t == 0 || t == 1); + bool masked = t != 0; + flash_grad = + ggml_flash_attn_back(ctx, + src0->grad, + src1->grad, + tensor->opt[0]->grad, + tensor->grad, + masked); + } + + if (src0->grad) { + struct ggml_tensor * grad_q = NULL; + const size_t nb0 = flash_grad->nb[0]; + const size_t offset = 0; + switch(src0->n_dims) { + case 2: + { + grad_q = ggml_view_2d(ctx, + flash_grad, + src0->ne[0], + src0->ne[1], + nb0*src0->ne[0], + offset); + } break; + case 3: + { + grad_q = ggml_view_3d(ctx, + flash_grad, + src0->ne[0], + src0->ne[1], + src0->ne[2], + nb0*src0->ne[0], + nb0*src0->ne[0]*src0->ne[1], + offset); + } break; + case 4: + { + grad_q = ggml_view_3d(ctx, + flash_grad, + src0->ne[0], + src0->ne[1], + src0->ne[2], + src0->ne[3], + nb0*src0->ne[0], + nb0*src0->ne[0]*src0->ne[1], + nb0*src0->ne[0]*src0->ne[1]*src0->ne[2], + offset); + } break; + } + + src0->grad = ggml_add_impl(ctx, + src0->grad, + grad_q, + inplace); + } + + if (src1->grad) { + struct ggml_tensor * grad_k = NULL; + const size_t nb0 = flash_grad->nb[0]; + const size_t offset = nb0*src0->ne[0]*src0->ne[1]*src0->ne[2]*src0->ne[3]; + switch(src1->n_dims) { + case 2: + { + grad_k = ggml_view_2d(ctx, + flash_grad, + src1->ne[0], + src1->ne[1], + nb0*src1->ne[0], + offset); + } break; + case 3: + { + grad_k = ggml_view_3d(ctx, + flash_grad, + src1->ne[0], + src1->ne[1], + src1->ne[2], + nb0*src1->ne[0], + nb0*src1->ne[0]*src1->ne[1], + offset); + } break; + case 4: + { + grad_k = ggml_view_3d(ctx, + flash_grad, + src1->ne[0], + src1->ne[1], + src1->ne[2], + src1->ne[3], + nb0*src1->ne[0], + nb0*src1->ne[0]*src1->ne[1], + nb0*src1->ne[0]*src1->ne[1]*src1->ne[2], + offset); + } break; + } + + src1->grad = ggml_add_impl(ctx, + src1->grad, + grad_k, + inplace); + } + + struct ggml_tensor * opt0 = tensor->opt[0]; + + if (opt0->grad) { + struct ggml_tensor * grad_v = NULL; + const size_t nb0 = flash_grad->nb[0]; + const size_t offset = nb0*src0->ne[0]*src0->ne[1]*src0->ne[2]*src0->ne[3] + + nb0*src1->ne[0]*src1->ne[1]*src1->ne[2]*src1->ne[3]; + switch(opt0->n_dims) { + case 2: + { + grad_v = ggml_view_2d(ctx, + flash_grad, + 
opt0->ne[0], + opt0->ne[1], + nb0*opt0->ne[0], + offset); + } break; + case 3: + { + grad_v = ggml_view_3d(ctx, + flash_grad, + opt0->ne[0], + opt0->ne[1], + opt0->ne[2], + nb0*opt0->ne[0], + nb0*opt0->ne[0]*opt0->ne[1], + offset); + } break; + case 4: + { + grad_v = ggml_view_3d(ctx, + flash_grad, + opt0->ne[0], + opt0->ne[1], + opt0->ne[2], + opt0->ne[3], + nb0*opt0->ne[0], + nb0*opt0->ne[0]*opt0->ne[1], + nb0*opt0->ne[0]*opt0->ne[1]*opt0->ne[2], + offset); + } break; + } + + opt0->grad = ggml_add_impl(ctx, + opt0->grad, + grad_v, + inplace); + } } break; case GGML_OP_FLASH_FF: { GGML_ASSERT(false); // not supported } break; + case GGML_OP_FLASH_ATTN_BACK: + { + GGML_ASSERT(false); // not supported + } break; case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: { @@ -14575,6 +15193,27 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 } + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + node->n_tasks = n_threads; + + size_t cur = 0; + + const int64_t D = node->src0->ne[0]; + const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 + } + + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 + } + work_size = MAX(work_size, cur); } break; case GGML_OP_MAP_UNARY: diff --git a/ggml.h b/ggml.h index ba60588d6b521..5dc80e74beb1c 100644 --- a/ggml.h +++ b/ggml.h @@ -318,6 +318,7 @@ extern "C" { GGML_OP_FLASH_ATTN, GGML_OP_FLASH_FF, + GGML_OP_FLASH_ATTN_BACK, GGML_OP_MAP_UNARY, GGML_OP_MAP_BINARY, @@ -952,6 +953,14 @@ extern "C" { struct ggml_tensor * v, bool masked); + GGML_API struct ggml_tensor * ggml_flash_attn_back( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * d, + bool masked); + GGML_API struct ggml_tensor * ggml_flash_ff( struct ggml_context * ctx, struct ggml_tensor * a, From 38560b6d51b53c614ff934d1da4e67ad9ed96517 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 29 May 2023 23:45:58 +0200 Subject: [PATCH 58/86] bugfixes for backward pass of flash attention --- ggml.c | 463 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 241 insertions(+), 222 deletions(-) diff --git a/ggml.c b/ggml.c index 353b42cee2679..47a01cea6cc42 100644 --- a/ggml.c +++ b/ggml.c @@ -6221,7 +6221,6 @@ struct ggml_tensor * ggml_flash_attn( bool is_node = false; if (q->grad || k->grad || v->grad) { - GGML_ASSERT(false); // TODO: implement backward is_node = true; } @@ -12882,10 +12881,15 @@ static void ggml_compute_forward_flash_attn_back_f32( //const int64_t nev2 = v->ne[2]; //const int64_t nev3 = v->ne[3]; + const int64_t ned0 = d->ne[0]; + const int64_t ned1 = d->ne[1]; + //const int64_t ned2 = d->ne[2]; + //const int64_t ned3 = d->ne[3]; + const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; const int nbk0 = k->nb[0]; const int nbk1 = k->nb[1]; @@ -12923,8 
+12927,8 @@ static void ggml_compute_forward_flash_attn_back_f32( const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); const int mxDM = MAX(D, Mup); - GGML_ASSERT(ne0 == D); - GGML_ASSERT(ne1 == N); + // GGML_ASSERT(ne0 == D); + // GGML_ASSERT(ne1 == N); GGML_ASSERT(P >= 0); GGML_ASSERT(nbq0 == sizeof(float)); @@ -12934,10 +12938,12 @@ static void ggml_compute_forward_flash_attn_back_f32( GGML_ASSERT(neq0 == D); GGML_ASSERT(nek0 == D); GGML_ASSERT(nev1 == D); + GGML_ASSERT(ned0 == D); GGML_ASSERT(neq1 == N); GGML_ASSERT(nek1 == N + P); GGML_ASSERT(nev1 == D); + GGML_ASSERT(ned1 == N); // dst cannot be transposed or permuted GGML_ASSERT(nb0 == sizeof(float)); @@ -12946,6 +12952,9 @@ static void ggml_compute_forward_flash_attn_back_f32( GGML_ASSERT(nb2 <= nb3); if (params->type == GGML_TASK_INIT) { + if (ith == 0) { + memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3); + } return; } @@ -12956,7 +12965,7 @@ static void ggml_compute_forward_flash_attn_back_f32( // parallelize by q rows using ggml_vec_dot_f32 // total rows in q - const int nr = neq1*neq2*neq3; + const int nr = neq2*neq3; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -12971,253 +12980,263 @@ static void ggml_compute_forward_flash_attn_back_f32( for (int ir = ir0; ir < ir1; ++ir) { // q indices - const int iq3 = ir/(neq2*neq1); - const int iq2 = (ir - iq3*neq2*neq1)/neq1; - const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); + const int iq3 = ir/(neq2); + const int iq2 = (ir - iq3*neq2)/neq2; + for ( int iq1 = 0; iq1 < neq1; ++iq1) { - // not sure about CACHE_LINE_SIZE_F32.. - // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset? - float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32); - float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32); - for (int i = M; i < Mup; ++i) { - S[i] = -INFINITY; - } + // not sure about CACHE_LINE_SIZE_F32.. + // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset? 
+ float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32); + float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32); - for (int64_t ic = 0; ic < nek1; ++ic) { - // k indices - const int ik3 = iq3; - const int ik2 = iq2; - const int ik1 = ic; + for (int i = M; i < Mup; ++i) { + S[i] = -INFINITY; + } - // S indices - const int i1 = ik1; + for (int64_t ic = 0; ic < nek1; ++ic) { + // k indices + const int ik3 = iq3; + const int ik2 = iq2; + const int ik1 = ic; - ggml_vec_dot_f32(neq0, - S + i1, - (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); - } + // S indices + const int i1 = ik1; - // scale - ggml_vec_scale_f32(nek1, S, scale); + ggml_vec_dot_f32(neq0, + S + i1, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } - if (masked) { - for (int64_t i = P; i < M; i++) { - if (i > P + iq1) { - S[i] = -INFINITY; + // scale + ggml_vec_scale_f32(nek1, S, scale); + + if (masked) { + for (int64_t i = P; i < M; i++) { + if (i > P + iq1) { + S[i] = -INFINITY; + } } } - } - // softmax - { - float max = -INFINITY; - ggml_vec_max_f32(M, &max, S); - - ggml_float sum = 0.0; + // softmax { + float max = -INFINITY; + ggml_vec_max_f32(M, &max, S); + + ggml_float sum = 0.0; + { #ifdef GGML_SOFT_MAX_ACCELERATE - max = -max; - vDSP_vsadd(SM, 1, &max, SM, 1, Mup); - vvexpf(SM, SM, &Mup); - ggml_vec_sum_f32(Mup, &sum, SM); + max = -max; + vDSP_vsadd(SM, 1, &max, SM, 1, Mup); + vvexpf(SM, SM, &Mup); + ggml_vec_sum_f32(Mup, &sum, SM); #else - uint16_t scvt[GGML_SOFT_MAX_UNROLL]; - ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; + ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; - for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { - float * SS = SM + i; + for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { + float * SR = S + i; + float * SW = SM + i; - for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { - if (SS[j] == -INFINITY) { - SS[j] = 0.0f; - } else { - ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); - memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); - sump[j] += (ggml_float)val; - SS[j] = val; + for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { + if (SR[j] == -INFINITY) { + SW[j] = 0.0f; + } else { + ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max); + memcpy(&scvt[j], &s, sizeof(uint16_t)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + sump[j] += (ggml_float)val; + SW[j] = val; + } } } - } - for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { - sum += sump[i]; - } + for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { + sum += sump[i]; + } #endif - } + } - assert(sum > 0.0); + assert(sum > 0.0); - sum = 1.0/sum; - ggml_vec_scale_f32(M, SM, sum); + sum = 1.0/sum; + ggml_vec_scale_f32(M, SM, sum); - } + } - // step-by-step explanation - { - // forward-process shape grads from backward process - // parallel_for iq2,iq3: - // k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,iq2,iq3] += grad[kcur] - // q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur] - // v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iq2,iq3] += grad[vcur] - // for iq1: - // kcur = k[:D,:M,iq2,iq3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur - // qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur - // vcur = v[:M,:D,iq2,iq3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 - // S0 = -Inf 
[D,1,1,1] - // ~S1[i] = dot(kcur[:D,i], qcur) - // S1 = qcur.T @ kcur [M,1,1,1] grad[S1] = grad[S2] * scale - // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) - // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) - // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur - // ~S5[i] = dot(vcur[:,i],S4) - // S5 = S4.T @ vcur [D,1,1,1] grad[S5] = d[:D,iq1,iq2,iq3] - // ~dst[i,iq1,iq2,iq3] = S5[i] ^ - // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,iq1,iq2,iq3] - // dst backward-/ grad[dst] = d - // - // output gradients with their dependencies: - // - // grad[kcur] = grad[S1].T @ qcur - // grad[S1] = diag_mask_zero(grad[S3], P) * scale - // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) - // grad[S4] = grad[S5] @ vcur - // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur - // grad[qcur] = grad[S1] @ kcur - // grad[vcur] = grad[S5].T @ S4 - // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 - // - // in post-order: - // - // S1 = qcur.T @ kcur - // S2 = S1 * scale - // S3 = diag_mask_inf(S2, P) - // S4 = softmax(S3) - // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur - // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) - // grad[S1] = diag_mask_zero(grad[S3], P) * scale - // grad[qcur] = grad[S1] @ kcur - // grad[kcur] = grad[S1].T @ qcur - // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 - // - // using less variables (SM=S4): - // - // S = diag_mask_inf(qcur.T @ kcur * scale, P) - // SM = softmax(S) - // S = d[:D,iq1,iq2,iq3] @ vcur - // dot_SM_gradSM = dot(SM, S) - // S = SM * (S - dot(SM, S)) - // S = diag_mask_zero(S, P) * scale - // - // grad[q][:D,iq1,iq2,iq3] += S @ kcur - // grad[k][:D,:M,iq2,iq3] += S.T @ qcur - // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM - } + // step-by-step explanation + { + // forward-process shape grads from backward process + // parallel_for iq2,iq3: + // k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,iq2,iq3] += grad[kcur] + // q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur] + // v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iq2,iq3] += grad[vcur] + // for iq1: + // kcur = k[:D,:M,iq2,iq3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur + // qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur + // vcur = v[:M,:D,iq2,iq3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 + // S0 = -Inf [D,1,1,1] + // ~S1[i] = dot(kcur[:D,i], qcur) + // S1 = qcur.T @ kcur [M,1,1,1] grad[S1] = grad[S2] * scale + // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) + // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur + // ~S5[i] = dot(vcur[:,i],S4) + // S5 = S4.T @ vcur [D,1,1,1] grad[S5] = d[:D,iq1,iq2,iq3] + // ~dst[i,iq1,iq2,iq3] = S5[i] ^ + // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,iq1,iq2,iq3] + // dst backward-/ grad[dst] = d + // + // output gradients with their dependencies: + // + // grad[kcur] = grad[S1].T @ qcur + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S4] = grad[S5] @ vcur + // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur + // grad[qcur] = grad[S1] @ kcur + // grad[vcur] = grad[S5].T @ S4 + // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 + // + // in post-order: + // + // S1 = qcur.T @ kcur + // S2 = S1 * scale + // S3 = diag_mask_inf(S2, P) + // S4 = softmax(S3) + // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[qcur] = grad[S1] @ kcur + // grad[kcur] = 
grad[S1].T @ qcur + // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 + // + // using less variables (SM=S4): + // + // S = diag_mask_inf(qcur.T @ kcur * scale, P) + // SM = softmax(S) + // S = d[:D,iq1,iq2,iq3] @ vcur + // dot_SM_gradSM = dot(SM, S) + // S = SM * (S - dot(SM, S)) + // S = diag_mask_zero(S, P) * scale + // + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[k][:D,:M,iq2,iq3] += S.T @ qcur + // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM + } - // S = gradSM = d[:D,iq1,iq2,iq3] @ vcur - // S = d[:D,iq1,iq2,iq3] @ vcur - // S[:M] += vcur[:,ic] * d[ic,iq1,iq2,iq3] - ggml_vec_set_f32(D, S, 0); - for (int64_t ic = 0; ic < D; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; + // S = gradSM = d[:D,iq1,iq2,iq3] @ vcur + // S = d[:D,iq1,iq2,iq3] @ vcur + // S[:M] += vcur[:M,ic] * d[ic,iq1,iq2,iq3] + ggml_vec_set_f32(M, S, 0); + for (int64_t ic = 0; ic < D; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; - ggml_vec_mad_f32(M, - S, - (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), - *(float *) ((char *) d->data + (ic*nbd1 + i1*nbd2 + i2*nbd2 + i3*nbd3))); - } + ggml_vec_mad_f32(M, + S, + (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + *(float *) ((char *) d->data + (ic*nbd0 + i1*nbd1 + i2*nbd2 + i3*nbd3))); + } - // S = SM * (S - dot(SM, S)) - float dot_SM_gradSM = 0; - ggml_vec_dot_f32 (M, &dot_SM_gradSM, SM, S); - ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); - ggml_vec_mul_f32 (M, S, S, SM); + // S = SM * (S - dot(SM, S)) + float dot_SM_gradSM = 0; + ggml_vec_dot_f32 (M, &dot_SM_gradSM, SM, S); + ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); + ggml_vec_mul_f32 (M, S, S, SM); - // S = diag_mask_zero(S, P) * scale - if (masked) { - for (int64_t i = P + iq1 + 1; i < M; i++) { - S[i] = 0; + // S = diag_mask_zero(S, P) * scale + if (masked) { + // for (int64_t i = P + iq1 + 1; i < M; i++) { + // S[i] = 0; + // } + for (int64_t i = P; i < M; i++) { + if (i > P + iq1) { + S[i] = 0; + } + } } - } - ggml_vec_scale_f32(M, S, scale); + ggml_vec_scale_f32(M, S, scale); - void * grad_q = (char *) dst->data; - void * grad_k = (char *) dst->data + nb0*D*N*neq2*neq3; - void * grad_v = (char *) dst->data + nb0*D*N*neq2*neq3 + nb0*D*M*neq2*neq3; + void * grad_q = (char *) dst->data; + void * grad_k = (char *) dst->data + nb0*D*N*neq2*neq3; + void * grad_v = (char *) dst->data + nb0*D*N*neq2*neq3 + nb0*D*M*neq2*neq3; - const size_t nbgq1 = nb0*neq0; - const size_t nbgq2 = nb0*neq0*neq1; - const size_t nbgq3 = nb0*neq0*neq1*neq2; + const size_t nbgq1 = nb0*neq0; + const size_t nbgq2 = nb0*neq0*neq1; + const size_t nbgq3 = nb0*neq0*neq1*neq2; - const size_t nbgk1 = nb0*nek0; - const size_t nbgk2 = nb0*nek0*nek1; - const size_t nbgk3 = nb0*nek0*nek1*neq2; + const size_t nbgk1 = nb0*nek0; + const size_t nbgk2 = nb0*nek0*nek1; + const size_t nbgk3 = nb0*nek0*nek1*neq2; - const size_t nbgv1 = nb0*nev0; - const size_t nbgv2 = nb0*nev0*nev1; - const size_t nbgv3 = nb0*nev0*nev1*neq2; + const size_t nbgv1 = nb0*nev0; + const size_t nbgv2 = nb0*nev0*nev1; + const size_t nbgv3 = nb0*nev0*nev1*neq2; - // S shape [M,1] - // SM shape [M,1] - // kcur shape [D,M] - // qcur shape [D,1] - // vcur shape [M,D] - // - // grad[q][:D,iq1,iq2,iq3] += S @ kcur - // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] - // grad[q][ic,iq1,iq2,iq3] += dot(kcur[:,ic],S.T) - // grad[q][ic,iq1,iq2,iq3] += dot(k[:D,ic,iq2,iq3],S.T) - for (int64_t ic = 0; ic < M; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - 
const int i3 = iq3; - - ggml_vec_dot_f32(D, - (float *) ((char *) grad_q + (ic*nb0 + i1*nbgq1 + i2*nbgq2 + i3*nbgq3)), - (float *) ((char *) k->data + ( ic*nbk1 + i2*nbk2 + i3*nbk3)), - S); - } + // S shape [M,1] + // SM shape [M,1] + // kcur shape [D,M] + // qcur shape [D,1] + // vcur shape [M,D] + // + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] + // grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic] + // + //// grad[q][ic,iq1,iq2,iq3] += dot(kcur[:,ic],S.T) + //// grad[q][ic,iq1,iq2,iq3] += dot(k[:D,ic,iq2,iq3],S.T) + for (int64_t ic = 0; ic < M; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + ggml_vec_mad_f32(D, + (float *) ((char *) grad_q + (i1*nbgq1 + i2*nbgq2 + i3*nbgq3)), + (float *) ((char *) k->data + (ic*nbk1 + i2*nbk2 + i3*nbk3)), + S[ic]); + } - // grad[k][:D,:M,iq2,iq3] += S.T @ qcur - // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] - // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] - for (int64_t ic = 0; ic < M; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; + // grad[k][:D,:M,iq2,iq3] += S.T @ qcur + // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] + // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] + for (int64_t ic = 0; ic < M; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; - ggml_vec_set_f32(D, - (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), - 0); - ggml_vec_mad_f32(D, - (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), - (float *) ((char *) q->data + (i1*nbk1 + i2*nbk2 + i3*nbk3)), - S[ic]); - } + // ggml_vec_set_f32(D, + // (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), + // 0); + ggml_vec_mad_f32(D, + (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), + (float *) ((char *) q->data + (i1*nbq1 + i2*nbq2 + i3*nbq3)), + S[ic]); + } - // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM - // grad[v][:M,ic,iq2,iq3] += d[:D,iq1,iq2,iq3].T[0,ic] * SM[:M] - // grad[v][:M,ic,iq2,iq3] += d[ic,iq1,iq2,iq3] * SM[:M] - for (int64_t ic = 0; ic < D; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; + // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM + // grad[v][:M,ic,iq2,iq3] += d[:D,iq1,iq2,iq3].T[0,ic] * SM[:M] + // grad[v][:M,ic,iq2,iq3] += d[ic,iq1,iq2,iq3] * SM[:M] + for (int64_t ic = 0; ic < D; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; - ggml_vec_set_f32(M, - (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), - 0); - ggml_vec_mad_f32(M, - (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), - SM, - *(float *) ((char *) d->data + (ic*nbd1 + i1*nbd2 + i2*nbd2 + i3*nbd3))); + // ggml_vec_set_f32(M, + // (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), + // 0); + ggml_vec_mad_f32(M, + (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), + SM, + *(float *) ((char *) d->data + (ic*nbd0 + i1*nbd1 + i2*nbd2 + i3*nbd3))); + } } } } @@ -14475,9 +14494,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor bool masked = t != 0; flash_grad = ggml_flash_attn_back(ctx, - src0->grad, - src1->grad, - tensor->opt[0]->grad, + src0, + src1, + tensor->opt[0], tensor->grad, masked); } @@ -14509,7 +14528,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case 4: { - grad_q = ggml_view_3d(ctx, + grad_q = ggml_view_4d(ctx, flash_grad, 
                             src0->ne[0],
                             src0->ne[1],
@@ -14555,7 +14574,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 } break;
                 case 4:
                     {
-                        grad_k = ggml_view_3d(ctx,
+                        grad_k = ggml_view_4d(ctx,
                             flash_grad,
                             src1->ne[0],
                             src1->ne[1],
@@ -14604,7 +14623,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 } break;
                 case 4:
                     {
-                        grad_v = ggml_view_3d(ctx,
+                        grad_v = ggml_view_4d(ctx,
                             flash_grad,
                             opt0->ne[0],
                             opt0->ne[1],

From 70c08318af062c31e47fd6a914e9f3abf8db385e Mon Sep 17 00:00:00 2001
From: xaedes
Date: Mon, 29 May 2023 23:51:40 +0200
Subject: [PATCH 59/86] test flash attention backward pass

need to set loose error bounds to pass.

the finite differences are close to numeric limits and often return quite
different values than the backward pass.
reducing eps further lets the gradients vanish completely.
likewise, setting eps too big results in even less accurate values.

the softmax in the middle of the function is probably most responsible for
the numeric issues when using finite differences.
---
 tests/test-grad0.c | 41 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/tests/test-grad0.c b/tests/test-grad0.c
index b7d68cad9fd28..c8c2c0f717e32 100644
--- a/tests/test-grad0.c
+++ b/tests/test-grad0.c
@@ -5,7 +5,7 @@
 #include
 #include
 
-#define MAX_NARGS 2
+#define MAX_NARGS 3
 
 #undef MIN
 #undef MAX
@@ -1143,6 +1143,45 @@ int main(int argc, const char ** argv) {
             }
         }
 
+        // flash_attn
+        {
+            const int nargs = 3;
+
+            int64_t ne2[4];
+
+            get_random_dims(ne2, 4);
+            int64_t D = ne2[0];
+            int64_t N = ne2[1];
+            int64_t M = ne2[2] + N;
+            int64_t B = ne2[3];
+
+            for (int masked = 0; masked <= 1; ++masked) {
+                for (int ndims = 2; ndims <= 4; ++ndims) {
+                    int64_t neq[4] = { D, N, B, ne[3] };
+                    int64_t nek[4] = { D, M, B, ne[3] };
+                    int64_t nev[4] = { M, D, B, ne[3] };
+                    if (ndims == 2) {
+                        neq[2] = 1; neq[3] = 1;
+                        nek[2] = 1; nek[3] = 1;
+                        nev[2] = 1; nev[3] = 1;
+                    } else if (ndims == 3) {
+                        neq[3] = 1;
+                        nek[3] = 1;
+                        nev[3] = 1;
+                    }
+                    x[0] = get_random_tensor(ctx0, ndims, neq, -0.1250f, 0.1250f);
+                    x[1] = get_random_tensor(ctx0, ndims, nek, -0.1250f, 0.1250f);
+                    x[2] = get_random_tensor(ctx0, ndims, nev, -0.1250f, 0.1250f);
+                    ggml_set_param(ctx0, x[0]);
+                    ggml_set_param(ctx0, x[1]);
+                    ggml_set_param(ctx0, x[2]);
+
+                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
+
+                    check_gradient("flash_attn", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
+                }
+            }
+        }
 
         ggml_free(ctx0);
     }

From fcbc4457d6a45112659d419104e294f3d0f68be3 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Tue, 30 May 2023 13:17:58 +0200
Subject: [PATCH 60/86] add option to train with flash attention and move options to the top of the main function

training from scratch also works with flash attention

training convergence and generation results after a fixed number of iterations
are worse than when not using flash attention.
maybe there still lingers a bug in the flash attention backward pass?
but training works, just with slower convergence.
flash attention is still worth to use, because it requires way less memory and is faster with high n_ctx --- examples/baby-llama/baby-llama-text.cpp | 318 +++++++++++++++++++++--- 1 file changed, 288 insertions(+), 30 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 267f44321bbef..418cc5fff47aa 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1168,6 +1168,239 @@ struct ggml_tensor * forward_batch_wo_cache( return inpL; } +struct ggml_tensor * forward_batch_wo_cache_flash_attn( + struct my_llama_model * model, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_batch) { + + const int n_past = 0; + const int N = n_tokens; + + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + const int n_ff = get_n_ff(&hparams); + + struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); + memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); + + // inpL shape [n_embd,N*n_batch,1] + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); + assert_shape_2d(inpL, n_embd, N*n_batch); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * cur; + + // lctx.use_buf(ctx0, 0); + + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = attention_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].attention_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // self-attention + { + // compute Q and K and RoPE them + // wq shape [n_embd, n_embd, 1, 1] + // wk shape [n_embd, n_embd, 1, 1] + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Kcur shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); + assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); + + // Vcur shape [N, n_batch, n_embd/n_head, n_head] + struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head); + assert_shape_4d(Vcur, N, n_batch, n_embd/n_head, n_head); + + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Q shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); + + // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] + // K shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * K = + ggml_permute(ctx0, + Kcur, + 0, 2, 1, 3); + assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch); + + // // K * Q + // // KQ shape [N, N, n_head, n_batch] + // struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + // assert_shape_4d(KQ, N, N, n_head, n_batch); + + // // KQ_scaled = KQ / sqrt(n_embd/n_head) + // // 
KQ_scaled shape [N, N, n_head, n_batch] + // struct ggml_tensor * KQ_scaled = + // ggml_scale_inplace(ctx0, + // KQ, + // ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + // assert_shape_4d(KQ_scaled, N, N, n_head, n_batch); + + // // KQ_masked = mask_past(KQ_scaled) + // // KQ_masked shape [N, N, n_head, n_batch] + // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + // assert_shape_4d(KQ_masked, N, N, n_head, n_batch); + + // // KQ = soft_max(KQ_masked) + // // KQ_soft_max shape [N, N, n_head, n_batch] + // struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + // assert_shape_4d(KQ_soft_max, N, N, n_head, n_batch); + + // Vcur shape [N, n_batch, n_embd/n_head, n_head] + // V shape [N, n_embd/n_head, n_head, n_batch] + struct ggml_tensor * V = + ggml_permute(ctx0, + Vcur, + 0, 3, 1, 2); + assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch); + + // // KQV shape [n_embd/n_head, N, n_head, n_batch] + // struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + // assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); + + + bool masked = true; + struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, masked); + assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); + // KQV_merged shape + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); + assert_shape_2d(cur, n_embd, N*n_batch); + + // projection (no bias) + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].wo, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // lctx.use_buf(ctx0, 1); + + // inpFF shape [n_embd,N*n_batch,1,1] + struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); + assert_shape_2d(inpFF, n_embd, N*n_batch); + + // feed-forward network + { + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = ffn_norm*cur + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // tmp shape [n_ff,N*n_batch,1,1] + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model->layers[il].w3, + cur); + assert_shape_2d(tmp, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w1, + cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // SILU activation + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_silu(ctx0, cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul(ctx0, cur, tmp); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w2, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_add_inplace(ctx0, cur, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // input for next layer + // inpL shape [n_embd,N*n_batch,1,1] + inpL = cur; + assert_shape_2d(inpL, n_embd, N*n_batch); + } + + // norm + { + + // inpL shape [n_embd,N*n_batch,1,1] + inpL = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(inpL, n_embd, N*n_batch); + + // inpL = norm*inpL + // inpL shape [n_embd,N*n_batch,1,1] + inpL 
= ggml_mul(ctx0, + ggml_repeat(ctx0, model->norm, inpL), + inpL); + + assert_shape_2d(inpL, n_embd, N*n_batch); + + //embeddings = inpL; + } + + // lm_head + // inpL shape [n_vocab,N*n_batch,1,1] + inpL = ggml_mul_mat(ctx0, model->output, inpL); + assert_shape_2d(inpL, n_vocab, N*n_batch); + + { + // inpL shape [n_vocab,N,n_batch,1] + inpL = ggml_reshape_3d(ctx0, + inpL, + n_vocab, N, n_batch); + assert_shape_3d(inpL, n_vocab, N, n_batch); + } + + // run the computation + ggml_build_forward_expand(gf, inpL); + + return inpL; +} + void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) { float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); *ptr = value; @@ -1644,7 +1877,7 @@ void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { } std::string name = file->read_string(name_len); - GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)) == 0); + GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0); file->seek(-file->tell() & 31, SEEK_CUR); file->read_raw(tensor->data, ggml_nbytes(tensor)); @@ -1930,7 +2163,42 @@ int main(int argc, char ** argv) { //return 1; } - srand(time(NULL)); + int seed = 1; + int n_ctx = 256; + // int n_ctx = 64; + int n_embd = 256; + int n_mult = 256; + int n_head = 8; + int n_layer = 16; + int n_rotmax = 64; + + int n_threads = 6; + int n_batch = 8; + int n_examples = 32; + + int print_info_interval = 1; + int print_details_interval = 2; + + bool samples_start_after_nl = false; + bool use_adam = true; + bool use_flash = false; + + // only adam + int warmup = 100; + int cos_decay_steps = 1000; + float cos_decay_restart = 1.1f; + float cos_decay_alpha = 0.0f; + + int lbfgs_n_iter = 16; + int adam_n_iter = 16; + float adam_alpha = 1e-3; + float adam_decay = 1e-3; + + if (seed < 0) { + srand(time(NULL)); + } else { + srand(seed); + } const char * fn_model = (argc >= 2) ? argv[1] : default_argv[1]; const char * fn_train = (argc >= 3) ? 
argv[2] : default_argv[2]; @@ -1971,12 +2239,12 @@ int main(int argc, char ** argv) { struct my_llama_model model; model.hparams.n_vocab = llama_n_vocab(lctx); - model.hparams.n_ctx = 32; - model.hparams.n_embd = 128; - model.hparams.n_mult = 64; - model.hparams.n_head = 16; - model.hparams.n_layer = 4; - model.hparams.n_rot = std::min(64u, model.hparams.n_embd / model.hparams.n_head); + model.hparams.n_ctx = n_ctx; + model.hparams.n_embd = n_embd; + model.hparams.n_mult = n_mult; + model.hparams.n_head = n_head; + model.hparams.n_layer = n_layer; + model.hparams.n_rot = std::min((uint32_t)n_rotmax, model.hparams.n_embd / model.hparams.n_head); print_params(&model.hparams); @@ -2011,18 +2279,6 @@ int main(int argc, char ** argv) { my_llama_sampler sampler; - int n_threads = 6; - int n_batch = 32; - int n_examples = 32; - - bool samples_start_after_nl = false; - bool use_adam = true; - - int warmup = 100; - int cos_decay_steps = 1000; - float cos_decay_restart = 1.1f; - float cos_decay_alpha = 0.0f; - int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; @@ -2035,15 +2291,15 @@ int main(int argc, char ** argv) { opt_params_adam.print_forward_graph = false; opt_params_adam.print_backward_graph = false; opt_params_adam.n_threads = n_threads; - opt_params_adam.adam.n_iter = 16; + opt_params_adam.adam.n_iter = adam_n_iter; opt_params_adam.adam.sched = 1.0f; - opt_params_adam.adam.alpha = 1e-3; - opt_params_adam.adam.decay = 1e-3; + opt_params_adam.adam.alpha = adam_alpha; + opt_params_adam.adam.decay = adam_decay; opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_backward_graph = false; - opt_params_lbfgs.n_threads = n_threads; - opt_params_lbfgs.lbfgs.n_iter = 16; + opt_params_lbfgs.n_threads = n_threads; + opt_params_lbfgs.lbfgs.n_iter = lbfgs_n_iter; opt->ctx = model.ctx; opt->params = use_adam ? opt_params_adam : opt_params_lbfgs; @@ -2117,7 +2373,9 @@ int main(int argc, char ** argv) { struct ggml_tensor * logits = (n_past == 0) - ? forward_batch_wo_cache(&model, ctx0, &gf, tokens_input, n_tokens, n_batch) + ? (use_flash + ? 
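// When use_flash is set, the new forward_batch_wo_cache_flash_attn path is
// selected. It computes the same attention as forward_batch_wo_cache; the
// explicit chain that the non-flash code builds, roughly
//   KQ          = ggml_mul_mat(ctx0, K, Q);
//   KQ_scaled   = ggml_scale_inplace(ctx0, KQ, ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
//   KQ_masked   = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
//   KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
//   KQV         = ggml_mul_mat(ctx0, V, KQ_soft_max);
// is fused into a single ggml_flash_attn(ctx0, Q, K, V, /*masked=*/ true),
// which avoids materializing the [N, N, n_head, n_batch] attention matrix.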
forward_batch_wo_cache_flash_attn(&model, ctx0, &gf, tokens_input, n_tokens, n_batch) + : forward_batch_wo_cache(&model, ctx0, &gf, tokens_input, n_tokens, n_batch)) : forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch); struct ggml_tensor * e = cross_entropy_loss(ctx0, logits, target_probs); @@ -2148,16 +2406,16 @@ int main(int argc, char ** argv) { float error_after_opt = ggml_get_f32_1d(e, 0); - printf("used_mem_before_opt: %zu bytes\n", used_mem_before_opt); - printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); - if (ex % 1 == 0) { + if (ex % print_info_interval == 0) { printf("Example %d, opt iter %d\n", ex, opt->iter); printf("error_before_opt: %.6f\n", error_before_opt); printf("error_after_opt: %.6f\n", error_after_opt); + printf("used_mem_before_opt: %zu bytes\n", used_mem_before_opt); + printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); } - if (ex % 2 == 0) { + if (ex % print_details_interval == 0) { // set_logits_masked(logits, token_notavail, -1e9); for (int i=0; i Date: Tue, 30 May 2023 15:53:55 +0200 Subject: [PATCH 61/86] add train_params and command line option parser --- examples/baby-llama/baby-llama-text.cpp | 454 +++++++++++++++++++----- 1 file changed, 367 insertions(+), 87 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 418cc5fff47aa..ecdb418bf2285 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -2150,66 +2150,341 @@ float cosine_decay_restart(int decay_steps, const float alpha, int step, float r return cosine_decay(decay_steps, alpha, step); } -int main(int argc, char ** argv) { - const char * default_model = "ggml-vic7b-uncensored-q4_0.bin"; - const char * default_train = "shakespeare.txt"; - const char * default_chkpt_in = "checkpoint.bin"; - const char * default_chkpt_out = "checkpoint.bin"; - const char * default_model_out = "ggml-checkpoint-f32.bin"; - const char * default_argv[6] = {argv[0], default_model, default_train, default_chkpt_in, default_chkpt_out, default_model_out}; - - if (argc < 6) { - fprintf(stderr, "usage: %s model training_data chkpt_in chkpt_out model_out\n", argv[0]); - //return 1; - } - - int seed = 1; - int n_ctx = 256; - // int n_ctx = 64; - int n_embd = 256; - int n_mult = 256; - int n_head = 8; - int n_layer = 16; - int n_rotmax = 64; - - int n_threads = 6; - int n_batch = 8; - int n_examples = 32; +struct train_params { + const char * fn_vocab_model; + const char * fn_train_data; + const char * fn_checkpoint_in; + const char * fn_checkpoint_out; + const char * fn_model_out; + + int seed; + int n_ctx; + int n_embd; + int n_mult; + int n_head; + int n_layer; + int n_rotmax; + + int n_threads; + int n_batch; + int n_examples; + int n_predict; - int print_info_interval = 1; - int print_details_interval = 2; + int print_info_interval; + int print_details_interval; - bool samples_start_after_nl = false; - bool use_adam = true; - bool use_flash = false; + bool samples_start_after_nl; + bool use_adam; + bool use_flash; // only adam - int warmup = 100; - int cos_decay_steps = 1000; - float cos_decay_restart = 1.1f; - float cos_decay_alpha = 0.0f; + int warmup; + int cos_decay_steps; + float cos_decay_restart; + float cos_decay_alpha; + + int lbfgs_n_iter; + int adam_n_iter; + float adam_alpha; + float adam_decay; +}; + +struct train_params get_default_train_params() { + struct train_params params; + params.fn_vocab_model = "ggml-vic7b-uncensored-q4_0.bin"; + 
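// With the defaults set in this function, a hypothetical invocation of the
// training binary (still built as baby-llama-text at this point in the series;
// file names are only illustrative) could look like:
//
//   ./baby-llama-text \
//       --vocab-model ggml-vic7b-uncensored-q4_0.bin \
//       --train-data shakespeare.txt \
//       --checkpoint-in checkpoint.bin --checkpoint-out checkpoint.bin \
//       --model-out ggml-checkpoint-f32.bin \
//       --ctx 128 --embd 256 --mult 256 --head 8 --layer 16 \
//       --batch 8 --examples 8 --use-flash
//
// Every flag corresponds to one train_params field initialized here and parsed
// in train_params_parse below.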
params.fn_train_data = "shakespeare.txt"; + params.fn_checkpoint_in = "checkpoint.bin"; + params.fn_checkpoint_out = "checkpoint.bin"; + params.fn_model_out = "ggml-checkpoint-f32.bin"; + + params.seed = -1; + + params.n_ctx = 128; + params.n_embd = 256; + params.n_mult = 256; + params.n_head = 8; + params.n_layer = 16; + params.n_rotmax = 64; + + params.n_threads = 6; + params.n_batch = 8; + params.n_examples = 8; + params.n_predict = 1024; - int lbfgs_n_iter = 16; - int adam_n_iter = 16; - float adam_alpha = 1e-3; - float adam_decay = 1e-3; + params.print_info_interval = 1; + params.print_details_interval = 2; + + params.samples_start_after_nl = false; + params.use_adam = true; + params.use_flash = true; + + // only adam + params.warmup = 100; + params.cos_decay_steps = 1000; + params.cos_decay_restart = 1.1f; + params.cos_decay_alpha = 0.0f; + + params.lbfgs_n_iter = 16; + params.adam_n_iter = 16; + params.adam_alpha = 1e-3; + params.adam_decay = 1e-3; + + return params; +} + +void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " --vocab-model FNAME model path from which to load vocab (default '%s')\n", params->fn_vocab_model); + fprintf(stderr, " --train-data FNAME path from which to load training data (default '%s')\n", params->fn_train_data); + fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in); + fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out); + fprintf(stderr, " --model-out FNAME path to save ggml model (default '%s')\n", params->fn_model_out); + fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); + fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); + fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd); + fprintf(stderr, " --mult N Mult size used for new models, influences feedforward size. (default %d)\n", params->n_mult); + fprintf(stderr, " --head N Number of heads for new models (default %d)\n", params->n_head); + fprintf(stderr, " --layer N Number of layers for new models (default %d)\n", params->n_layer); + fprintf(stderr, " --rotmax N Maximal number Rope dimensions for new models (default %d)\n", params->n_rotmax); + fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); + fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); + fprintf(stderr, " -n N, --examples N Number of examples to train (default %d)\n", params->n_examples); + fprintf(stderr, " --predict N Number of tokens to generate after training (default %d)\n", params->n_predict); + fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval); + fprintf(stderr, " --print-details-interval N Print details during training each N examples (default %d)\n", params->print_details_interval); + fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? 
"on" : "off"); + fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); + fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); + fprintf(stderr, " --no-flash Don't use flash attention.\n"); + fprintf(stderr, " --use-flash Use flash attention (default)\n"); + fprintf(stderr, " --warmup N Number of warmup steps (default %d)\n", params->warmup); + fprintf(stderr, " --cos-decay-steps N Number of cosine decay steps (default %d)\n", params->cos_decay_steps); + fprintf(stderr, " --cos-decay-restart N Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); + fprintf(stderr, " --cos-decay-alpha N Cosine decay alpha (default %f)\n", params->cos_decay_alpha); + fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); + fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); + fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); + fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); + fprintf(stderr, "\n"); +} - if (seed < 0) { +bool train_params_parse(int argc, char ** argv, struct train_params * params) { + bool invalid_param = false; + std::string arg; + struct train_params default_params = get_default_train_params(); + const std::string arg_prefix = "--"; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + if (arg == "--vocab-model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_vocab_model = argv[i]; + } else if (arg == "--train-data") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_train_data = argv[i]; + } else if (arg == "--checkpoint-in") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_checkpoint_in = argv[i]; + } else if (arg == "--checkpoint-out") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_checkpoint_out = argv[i]; + } else if (arg == "--model-out") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_model_out = argv[i]; + } else if (arg == "-s" || arg == "--seed") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->seed = std::stoi(argv[i]); + } else if (arg == "-c" || arg == "--ctx") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_ctx = std::stoi(argv[i]); + } else if (arg == "--embd") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_embd = std::stoi(argv[i]); + } else if (arg == "--mult") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_mult = std::stoi(argv[i]); + } else if (arg == "--head") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_head = std::stoi(argv[i]); + } else if (arg == "--layer") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_layer = std::stoi(argv[i]); + } else if (arg == "--rotmax") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rotmax = std::stoi(argv[i]); + } else if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_threads = std::stoi(argv[i]); + } else if (arg == "-b" || arg == "--batch") { + if (++i >= argc) { + invalid_param = true; + break; + } + 
params->n_batch = std::stoi(argv[i]); + } else if (arg == "-n" || arg == "--examples") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_examples = std::stoi(argv[i]); + } else if (arg == "--predict") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_predict = std::stoi(argv[i]); + } else if (arg == "--print-info-interval") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->print_info_interval = std::stoi(argv[i]); + } else if (arg == "--print-details-interval") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->print_details_interval = std::stoi(argv[i]); + } else if (arg == "--samples-after-nl") { + params->samples_start_after_nl = true; + } else if (arg == "--use-lbfgs") { + params->use_adam = false; + } else if (arg == "--use-adam") { + params->use_adam = true; + } else if (arg == "--no-flash") { + params->use_flash = false; + } else if (arg == "--use-flash") { + params->use_flash = true; + } else if (arg == "--warmup") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->warmup = std::stoi(argv[i]); + } else if (arg == "--cos-decay-steps") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->cos_decay_steps = std::stof(argv[i]); + } else if (arg == "--cos-decay-restart") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->cos_decay_restart = std::stof(argv[i]); + } else if (arg == "--cos-decay-alpha") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->cos_decay_alpha = std::stof(argv[i]); + } else if (arg == "--lbfgs-iter") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->lbfgs_n_iter = std::stoi(argv[i]); + } else if (arg == "--adam-iter") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_n_iter = std::stoi(argv[i]); + } else if (arg == "--adam-alpha") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_alpha = std::stof(argv[i]); + } else if (arg == "--adam-decay") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_decay = std::stof(argv[i]); + } else if (arg == "-h" || arg == "--help") { + train_print_usage(argc, argv, &default_params); + exit(0); + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + train_print_usage(argc, argv, &default_params); + exit(1); + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + train_print_usage(argc, argv, &default_params); + exit(1); + } + + return true; +} + +int main(int argc, char ** argv) { + struct train_params params = get_default_train_params(); + + if (!train_params_parse(argc, argv, ¶ms)) { + return 1; + } + + + if (params.seed < 0) { srand(time(NULL)); } else { - srand(seed); + srand(params.seed); } - const char * fn_model = (argc >= 2) ? argv[1] : default_argv[1]; - const char * fn_train = (argc >= 3) ? argv[2] : default_argv[2]; - const char * fn_chkpt_in = (argc >= 4) ? argv[3] : default_argv[3]; - const char * fn_chkpt_out = (argc >= 5) ? argv[4] : default_argv[4]; - const char * fn_model_out = (argc >= 6) ? 
argv[5] : default_argv[5]; - struct llama_context_params llama_params = llama_context_default_params(); llama_params.vocab_only = true; - struct llama_context * lctx = llama_init_from_file(fn_model, llama_params); + struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, llama_params); struct llama_vocab vocab; { @@ -2232,19 +2507,19 @@ int main(int argc, char ** argv) { printf("%s: tokenize training data\n", __func__); std::vector train_tokens; - if (tokenize_file(lctx, fn_train, train_tokens) < 0) { - fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, fn_train); + if (tokenize_file(lctx, params.fn_train_data, train_tokens) < 0) { + fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, params.fn_train_data); } printf("%s: number of training tokens: %d\n", __func__, train_tokens.size()); struct my_llama_model model; model.hparams.n_vocab = llama_n_vocab(lctx); - model.hparams.n_ctx = n_ctx; - model.hparams.n_embd = n_embd; - model.hparams.n_mult = n_mult; - model.hparams.n_head = n_head; - model.hparams.n_layer = n_layer; - model.hparams.n_rot = std::min((uint32_t)n_rotmax, model.hparams.n_embd / model.hparams.n_head); + model.hparams.n_ctx = params.n_ctx; + model.hparams.n_embd = params.n_embd; + model.hparams.n_mult = params.n_mult; + model.hparams.n_head = params.n_head; + model.hparams.n_layer = params.n_layer; + model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head); print_params(&model.hparams); @@ -2282,6 +2557,7 @@ int main(int argc, char ** argv) { int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; + int n_batch = params.n_batch; struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context)); memset(opt, 0, sizeof(struct ggml_opt_context)); @@ -2290,32 +2566,32 @@ int main(int argc, char ** argv) { struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); opt_params_adam.print_forward_graph = false; opt_params_adam.print_backward_graph = false; - opt_params_adam.n_threads = n_threads; - opt_params_adam.adam.n_iter = adam_n_iter; - opt_params_adam.adam.sched = 1.0f; - opt_params_adam.adam.alpha = adam_alpha; - opt_params_adam.adam.decay = adam_decay; + opt_params_adam.n_threads = params.n_threads; + opt_params_adam.adam.n_iter = params.adam_n_iter; + opt_params_adam.adam.sched = 1.0f; + opt_params_adam.adam.alpha = params.adam_alpha; + opt_params_adam.adam.decay = params.adam_decay; opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_backward_graph = false; - opt_params_lbfgs.n_threads = n_threads; - opt_params_lbfgs.lbfgs.n_iter = lbfgs_n_iter; + opt_params_lbfgs.n_threads = params.n_threads; + opt_params_lbfgs.lbfgs.n_iter = params.lbfgs_n_iter; opt->ctx = model.ctx; - opt->params = use_adam ? opt_params_adam : opt_params_lbfgs; + opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs; printf("%s: init model\n", __func__); - bool existed = load_checkpoint(&model, opt, fn_chkpt_in, true); + bool existed = load_checkpoint(&model, opt, params.fn_checkpoint_in, true); set_param_model(&model); - opt->params = use_adam ? opt_params_adam : opt_params_lbfgs; + opt->params = params.use_adam ? 
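// params.use_adam selects between the two pre-filled ggml_opt_params structs.
// Only the Adam path uses the learning-rate schedule installed a few hunks
// below:
//   sched = iter / warmup                                           (iter < warmup)
//   sched = cosine_decay_restart(cos_decay_steps, cos_decay_alpha,
//                                iter - warmup, cos_decay_restart)   (afterwards)
// With the defaults (warmup 100, cos_decay_steps 1000, cos_decay_restart 1.1,
// cos_decay_alpha 0.0) this ramps linearly from 0 to 1 over the first 100
// iterations and then decays towards 0, with the decay length growing by 10%
// after each restart (assuming cosine_decay is the usual half-cosine curve).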
opt_params_adam : opt_params_lbfgs; opt->iter = model.train_its; printf("%s: opt iter %d\n", __func__, opt->iter); bool from_scratch = !existed; if (from_scratch) { - randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f); + randomize_model(&model, params.seed, 0.0f, 1.0f, -1.0f, +1.0f); } init_kv_cache(&kv_self, &model, 1); @@ -2328,11 +2604,11 @@ int main(int argc, char ** argv) { size_t compute_size = 1024ll*1024ll*1024ll*32ll; uint8_t * compute_addr = new uint8_t[compute_size]; - + GGML_ASSERT(train_tokens.size() > n_tokens);; std::vector train_samples; train_samples.push_back(0); for (int i=1; i= train_samples.size()) { shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); for (int i=0; iparams.adam.sched = (opt->iter < warmup) - ? (float) opt->iter / (float) warmup - : cosine_decay_restart(cos_decay_steps, cos_decay_alpha, opt->iter - warmup, cos_decay_restart); + opt->params.adam.sched = (opt->iter < params.warmup) + ? (float) opt->iter / (float) params.warmup + : cosine_decay_restart( + params.cos_decay_steps, + params.cos_decay_alpha, + opt->iter - params.warmup, + params.cos_decay_restart); + printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); // ggml_opt(ctx0, opt->params, e); @@ -2406,8 +2687,7 @@ int main(int argc, char ** argv) { float error_after_opt = ggml_get_f32_1d(e, 0); - - if (ex % print_info_interval == 0) { + if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) { printf("Example %d, opt iter %d\n", ex, opt->iter); printf("error_before_opt: %.6f\n", error_before_opt); printf("error_after_opt: %.6f\n", error_after_opt); @@ -2415,7 +2695,7 @@ int main(int argc, char ** argv) { printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); } - if (ex % print_details_interval == 0) { + if (params.print_details_interval > 0 && ex % params.print_details_interval == 0) { // set_logits_masked(logits, token_notavail, -1e9); for (int i=0; i 0) { - save_checkpoint(&model, opt, fn_chkpt_out); + if (params.n_examples > 0) { + save_checkpoint(&model, opt, params.fn_checkpoint_out); } - if (strlen(fn_model_out) > 0) { - save_as_llama_model(&vocab, &model, fn_model_out); + if (strlen(params.fn_model_out) > 0) { + save_as_llama_model(&vocab, &model, params.fn_model_out); } { - int n_gen = 1024; + int n_gen = params.n_predict; int sample_ctx = n_tokens - n_tokens/8; sampler.params.temp = 0.2; @@ -2477,15 +2757,15 @@ int main(int argc, char ** argv) { printf("---\n"); for (int i=0; i Date: Tue, 30 May 2023 15:58:22 +0200 Subject: [PATCH 62/86] remove unnecessary comments --- examples/baby-llama/baby-llama-text.cpp | 65 ------------------------- 1 file changed, 65 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index ecdb418bf2285..5d48b7155f279 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -1191,7 +1191,6 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); - // inpL shape [n_embd,N*n_batch,1] struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); assert_shape_2d(inpL, n_embd, N*n_batch); for (int il = 0; il < n_layer; ++il) { @@ -1199,11 +1198,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( struct ggml_tensor * cur; - // lctx.use_buf(ctx0, 0); - // norm { - // cur shape [n_embd,N*n_batch,1,1] 
cur = ggml_rms_norm(ctx0, inpL); assert_shape_2d(cur, n_embd, N*n_batch); @@ -1219,94 +1215,48 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( // compute Q and K and RoPE them // wq shape [n_embd, n_embd, 1, 1] // wk shape [n_embd, n_embd, 1, 1] - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Kcur shape [n_embd/n_head, n_head, N, n_batch] struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); - // Vcur shape [N, n_batch, n_embd/n_head, n_head] struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head); assert_shape_4d(Vcur, N, n_batch, n_embd/n_head, n_head); - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Q shape [n_embd/n_head, N, n_head, n_batch] struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); - // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] - // K shape [n_embd/n_head, N, n_head, n_batch] struct ggml_tensor * K = ggml_permute(ctx0, Kcur, 0, 2, 1, 3); assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch); - // // K * Q - // // KQ shape [N, N, n_head, n_batch] - // struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - // assert_shape_4d(KQ, N, N, n_head, n_batch); - - // // KQ_scaled = KQ / sqrt(n_embd/n_head) - // // KQ_scaled shape [N, N, n_head, n_batch] - // struct ggml_tensor * KQ_scaled = - // ggml_scale_inplace(ctx0, - // KQ, - // ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); - // assert_shape_4d(KQ_scaled, N, N, n_head, n_batch); - - // // KQ_masked = mask_past(KQ_scaled) - // // KQ_masked shape [N, N, n_head, n_batch] - // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - // assert_shape_4d(KQ_masked, N, N, n_head, n_batch); - - // // KQ = soft_max(KQ_masked) - // // KQ_soft_max shape [N, N, n_head, n_batch] - // struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - // assert_shape_4d(KQ_soft_max, N, N, n_head, n_batch); - - // Vcur shape [N, n_batch, n_embd/n_head, n_head] - // V shape [N, n_embd/n_head, n_head, n_batch] struct ggml_tensor * V = ggml_permute(ctx0, Vcur, 0, 3, 1, 2); assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch); - // // KQV shape [n_embd/n_head, N, n_head, n_batch] - // struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - // assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); - - bool masked = true; struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, masked); assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); - // KQV_merged = KQV.permute(0, 2, 1, 3) - // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); - // KQV_merged shape - - // cur shape [n_embd,N*n_batch,1,1] cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); assert_shape_2d(cur, n_embd, N*n_batch); // projection (no bias) - // cur shape [n_embd,N*n_batch,1,1] cur = ggml_mul_mat(ctx0, model->layers[il].wo, cur); assert_shape_2d(cur, n_embd, N*n_batch); } - 
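// The feed-forward block below is the LLaMA SwiGLU form
//   ffn(x) = w2 @ ( silu(w1 @ x) * (w3 @ x) )
// where @ denotes ggml_mul_mat and * the elementwise ggml_mul used for gating;
// n_ff is derived from n_embd and n_mult via get_n_ff().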
// lctx.use_buf(ctx0, 1); - - // inpFF shape [n_embd,N*n_batch,1,1] struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); assert_shape_2d(inpFF, n_embd, N*n_batch); @@ -1314,52 +1264,43 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( { // norm { - // cur shape [n_embd,N*n_batch,1,1] cur = ggml_rms_norm(ctx0, inpFF); assert_shape_2d(cur, n_embd, N*n_batch); // cur = ffn_norm*cur - // cur shape [n_embd,N*n_batch,1,1] cur = ggml_mul(ctx0, ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), cur); assert_shape_2d(cur, n_embd, N*n_batch); } - // tmp shape [n_ff,N*n_batch,1,1] struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model->layers[il].w3, cur); assert_shape_2d(tmp, n_ff, N*n_batch); - // cur shape [n_ff,N*n_batch,1,1] cur = ggml_mul_mat(ctx0, model->layers[il].w1, cur); assert_shape_2d(cur, n_ff, N*n_batch); // SILU activation - // cur shape [n_ff,N*n_batch,1,1] cur = ggml_silu(ctx0, cur); assert_shape_2d(cur, n_ff, N*n_batch); - // cur shape [n_ff,N*n_batch,1,1] cur = ggml_mul(ctx0, cur, tmp); assert_shape_2d(cur, n_ff, N*n_batch); - // cur shape [n_embd,N*n_batch,1,1] cur = ggml_mul_mat(ctx0, model->layers[il].w2, cur); assert_shape_2d(cur, n_embd, N*n_batch); } - // cur shape [n_embd,N*n_batch,1,1] cur = ggml_add_inplace(ctx0, cur, inpFF); assert_shape_2d(cur, n_embd, N*n_batch); // input for next layer - // inpL shape [n_embd,N*n_batch,1,1] inpL = cur; assert_shape_2d(inpL, n_embd, N*n_batch); } @@ -1367,28 +1308,22 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( // norm { - // inpL shape [n_embd,N*n_batch,1,1] inpL = ggml_rms_norm(ctx0, inpL); assert_shape_2d(inpL, n_embd, N*n_batch); // inpL = norm*inpL - // inpL shape [n_embd,N*n_batch,1,1] inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model->norm, inpL), inpL); assert_shape_2d(inpL, n_embd, N*n_batch); - - //embeddings = inpL; } // lm_head - // inpL shape [n_vocab,N*n_batch,1,1] inpL = ggml_mul_mat(ctx0, model->output, inpL); assert_shape_2d(inpL, n_vocab, N*n_batch); { - // inpL shape [n_vocab,N,n_batch,1] inpL = ggml_reshape_3d(ctx0, inpL, n_vocab, N, n_batch); From 1074a81e819b9076599d9da93d6ebd99dcce93b8 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 30 May 2023 16:06:20 +0200 Subject: [PATCH 63/86] add train params to specify memory size --- examples/baby-llama/baby-llama-text.cpp | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp index 5d48b7155f279..03f93c749e660 100644 --- a/examples/baby-llama/baby-llama-text.cpp +++ b/examples/baby-llama/baby-llama-text.cpp @@ -2122,6 +2122,9 @@ struct train_params { int adam_n_iter; float adam_alpha; float adam_decay; + + int mem_model_gb; + int mem_compute_gb; }; struct train_params get_default_train_params() { @@ -2164,6 +2167,9 @@ struct train_params get_default_train_params() { params.adam_alpha = 1e-3; params.adam_decay = 1e-3; + params.mem_model_gb = 2; + params.mem_compute_gb = 32; + return params; } @@ -2203,6 +2209,8 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. 
(default %f)\n", params->adam_decay); + fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); + fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); fprintf(stderr, "\n"); } @@ -2384,6 +2392,18 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->adam_decay = std::stof(argv[i]); + } else if (arg == "--mem-model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_model_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute_gb = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { train_print_usage(argc, argv, &default_params); exit(0); @@ -2480,7 +2500,7 @@ int main(int argc, char ** argv) { struct ggml_init_params lcparams; - lcparams.mem_size = 1024ll*1024ll*1024ll*2ll; + lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb); lcparams.mem_buffer = NULL; lcparams.no_alloc = false; @@ -2536,7 +2556,7 @@ int main(int argc, char ** argv) { printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx)); // ggml_print_tensor_objects(model.ctx); - size_t compute_size = 1024ll*1024ll*1024ll*32ll; + size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb); uint8_t * compute_addr = new uint8_t[compute_size]; GGML_ASSERT(train_tokens.size() > n_tokens);; From 21b11b55d4a07a68b235228cde1cf4d65d5e5f66 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 30 May 2023 17:03:09 +0200 Subject: [PATCH 64/86] remove python bindings --- py/llama_cpp/__init__.py | 0 py/llama_cpp/llama.py | 327 --------------------------------------- setup.py | 15 -- 3 files changed, 342 deletions(-) delete mode 100644 py/llama_cpp/__init__.py delete mode 100644 py/llama_cpp/llama.py delete mode 100644 setup.py diff --git a/py/llama_cpp/__init__.py b/py/llama_cpp/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/py/llama_cpp/llama.py b/py/llama_cpp/llama.py deleted file mode 100644 index bc0fa8b72855c..0000000000000 --- a/py/llama_cpp/llama.py +++ /dev/null @@ -1,327 +0,0 @@ -import os -import sys -import glob -import ctypes - -from ctypes import c_int, c_float, c_double, c_char_p, c_void_p, c_bool, c_size_t, c_ubyte, POINTER, Structure - - -# Load the library -if sys.platform == 'win32': - lib = ctypes.cdll.LoadLibrary(next(iter(glob.glob(os.path.join(os.path.dirname(__file__), '..', '..', '**', 'llama.dll'), recursive=True)))) -else: - lib = ctypes.cdll.LoadLibrary(next(iter(glob.glob(os.path.join(os.path.dirname(__file__), '..', '..', '**', 'libllama.so'), recursive=True)))) - - -# C types -llama_token = c_int -llama_token_p = POINTER(llama_token) - -class llama_token_data(Structure): - _fields_ = [ - ('id', llama_token), # token id - ('p', c_float), # probability of the token - ('plog', c_float), # log probability of the token - ] - -llama_token_data_p = POINTER(llama_token_data) - -class llama_token_data_array(Structure): - _fields_ = [ - ('data', llama_token_data_p), - ('size', c_size_t), - ('sorted', c_bool), - ] - -llama_token_data_array_p = POINTER(llama_token_data_array) - -llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) -class llama_context_params(Structure): - _fields_ = [ - ('n_ctx', c_int), # text context - ('n_parts', c_int), # -1 for default - ('n_gpu_layers', c_int), # number of layers to store in VRAM - ('seed', 
c_int), # RNG seed, 0 for random - ('f16_kv', c_bool), # use fp16 for KV cache - ('logits_all', c_bool), # the llama_eval() call computes all logits, not just the last one - ('vocab_only', c_bool), # only load the vocabulary, no weights - ('use_mmap', c_bool), # use mmap if possible - ('use_mlock', c_bool), # force system to keep model in RAM - ('embedding', c_bool), # embedding mode only - ('progress_callback', llama_progress_callback), # called with a progress value between 0 and 1, pass NULL to disable - ('progress_callback_user_data', c_void_p), # context pointer passed to the progress callback - ] - - -llama_context_params_p = POINTER(llama_context_params) - -llama_context_p = c_void_p - -c_size_p = POINTER(c_size_t) -c_ubyte_p = POINTER(c_ubyte) -c_float_p = POINTER(c_float) - -# C functions -lib.llama_context_default_params.argtypes = [] -lib.llama_context_default_params.restype = llama_context_params - -lib.llama_mmap_supported.argtypes = [] -lib.llama_mmap_supported.restype = c_bool - -lib.llama_mlock_supported.argtypes = [] -lib.llama_mlock_supported.restype = c_bool - -lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params] -lib.llama_init_from_file.restype = llama_context_p - -lib.llama_free.argtypes = [llama_context_p] -lib.llama_free.restype = None - -lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] -lib.llama_model_quantize.restype = c_int - -lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, c_int] -lib.llama_apply_lora_from_file.restype = c_int - -lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] -lib.llama_get_kv_cache_token_count.restype = c_int - -lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int] -lib.llama_set_rng_seed.restype = None - -lib.llama_get_state_size.argtypes = [llama_context_p] -lib.llama_get_state_size.restype = c_size_t - -lib.llama_copy_state_data.argtypes = [llama_context_p, c_ubyte_p] -lib.llama_copy_state_data.restype = c_size_t - -lib.llama_set_state_data.argtypes = [llama_context_p, c_ubyte_p] -lib.llama_set_state_data.restype = c_size_t - -lib.llama_load_session_file.argtypes = [llama_context_p, c_char_p, llama_token_p, c_size_t, c_size_p] -lib.llama_load_session_file.restype = c_bool - -lib.llama_save_session_file.argtypes = [llama_context_p, c_char_p, llama_token_p, c_size_t] -lib.llama_save_session_file.restype = c_bool - -lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int] -lib.llama_eval.restype = c_int - -lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool] -lib.llama_tokenize.restype = c_int - -lib.llama_n_vocab.argtypes = [llama_context_p] -lib.llama_n_vocab.restype = c_int - -lib.llama_n_ctx.argtypes = [llama_context_p] -lib.llama_n_ctx.restype = c_int - -lib.llama_n_embd.argtypes = [llama_context_p] -lib.llama_n_embd.restype = c_int - -lib.llama_get_logits.argtypes = [llama_context_p] -lib.llama_get_logits.restype = c_float_p - -lib.llama_get_embeddings.argtypes = [llama_context_p] -lib.llama_get_embeddings.restype = c_float_p - -lib.llama_token_to_str.argtypes = [llama_context_p, llama_token] -lib.llama_token_to_str.restype = c_char_p - -lib.llama_token_bos.argtypes = [] -lib.llama_token_bos.restype = llama_token - -lib.llama_token_eos.argtypes = [] -lib.llama_token_eos.restype = llama_token - -lib.llama_token_nl.argtypes = [] -lib.llama_token_nl.restype = llama_token - -lib.llama_sample_repetition_penalty.argtypes = [llama_context_p, llama_token_data_array_p, llama_token_p, c_size_t, 
c_float] -lib.llama_sample_repetition_penalty.restype = None - -lib.llama_sample_frequency_and_presence_penalties.argtypes = [llama_context_p, llama_token_data_array_p, llama_token_p, c_size_t, c_float, c_float] -lib.llama_sample_frequency_and_presence_penalties.restype = None - -lib.llama_sample_softmax.argtypes = [llama_context_p, llama_token_data_array_p] -lib.llama_sample_softmax.restype = None - -lib.llama_sample_top_k.argtypes = [llama_context_p, llama_token_data_array_p, c_int, c_size_t] -lib.llama_sample_top_k.restype = None - -lib.llama_sample_top_p.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_size_t] -lib.llama_sample_top_p.restype = None - -lib.llama_sample_tail_free.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_size_t] -lib.llama_sample_tail_free.restype = None - -lib.llama_sample_typical.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_size_t] -lib.llama_sample_typical.restype = None - -lib.llama_sample_temperature.argtypes = [llama_context_p, llama_token_data_array_p, c_float] -lib.llama_sample_temperature.restype = None - -lib.llama_sample_token_mirostat.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_float, c_int, c_float_p] -lib.llama_sample_token_mirostat.restype = llama_token - -lib.llama_sample_token_mirostat_v2.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_float, c_float_p] -lib.llama_sample_token_mirostat_v2.restype = llama_token - -lib.llama_sample_token_greedy.argtypes = [llama_context_p, llama_token_data_array_p] -lib.llama_sample_token_greedy.restype = llama_token - -lib.llama_sample_token.argtypes = [llama_context_p, llama_token_data_array_p] -lib.llama_sample_token.restype = llama_token - -lib.llama_print_timings.argtypes = [llama_context_p] -lib.llama_print_timings.restype = None - -lib.llama_reset_timings.argtypes = [llama_context_p] -lib.llama_reset_timings.restype = None - -lib.llama_print_system_info.argtypes = [] -lib.llama_print_system_info.restype = c_char_p - - -# Python functions -def llama_context_default_params() -> llama_context_params: - params = lib.llama_context_default_params() - return params - -def llama_mmap_supported() -> bool: - return lib.llama_mmap_supported() - -def llama_mlock_supported() -> bool: - return lib.llama_mlock_supported() - -def llama_init_from_file(path_model: str, params: llama_context_params) -> llama_context_p: - """Various functions for loading a ggml llama model. - Allocate (almost) all memory needed for the model. 
- Return NULL on failure """ - return lib.llama_init_from_file(path_model.encode('utf-8'), params) - -def llama_free(ctx: llama_context_p): - """Free all allocated memory""" - lib.llama_free(ctx) - -def llama_model_quantize(fname_inp: str, fname_out: str, itype: c_int, qk: c_int) -> c_int: - """Returns 0 on success""" - return lib.llama_model_quantize(fname_inp.encode('utf-8'), fname_out.encode('utf-8'), itype, qk) - -def llama_apply_lora_from_file(ctx: llama_context_p, path_lora: str, path_base_model: str, n_threads: c_int) -> c_int: - return lib.llama_apply_lora_from_file(ctx, path_lora.encode('utf-8'), path_base_model.encode('utf-8'), n_threads) - -def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: - return lib.llama_get_kv_cache_token_count(ctx) - -def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): - return lib.llama_set_rng_seed(ctx, seed) - -def llama_get_state_size(ctx: llama_context_p) -> c_size_t: - return lib.llama_get_state_size(ctx) - -def llama_copy_state_data(ctx: llama_context_p, dst: c_ubyte_p) -> c_size_t: - return lib.llama_copy_state_data(ctx, dst) - -def llama_set_state_data(ctx: llama_context_p, src: c_ubyte_p) -> c_size_t: - return lib.llama_set_state_data(ctx, src) - -def llama_load_session_file(ctx: llama_context_p, path_session: str, tokens_out: llama_token_p, n_token_capacity: c_size_t, n_token_count_out: c_size_p) -> c_bool: - return lib.llama_load_session_file(ctx, path_session.encode('utf-8'), tokens_out, n_token_capacity, n_token_count_out) - -def llama_save_session_file(ctx: llama_context_p, path_session: str, tokens: llama_token_p, n_token_count: c_size_t) -> c_bool: - return lib.llama_save_session_file(ctx, path_session.encode('utf-8'), tokens, n_token_count) - -def llama_eval(ctx: llama_context_p, tokens: llama_token_p, n_tokens: c_int, n_past: c_int, n_threads: c_int) -> c_int: - """Run the llama inference to obtain the logits and probabilities for the next token. - tokens + n_tokens is the provided batch of new tokens to process - n_past is the number of tokens to use from previous eval calls - Returns 0 on success""" - return lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads) - -def llama_tokenize(ctx: llama_context_p, text: str, tokens: llama_token_p, n_max_tokens: c_int, add_bos: c_bool) -> c_int: - """Convert the provided text into tokens. - The tokens pointer must be large enough to hold the resulting tokens. - Returns the number of tokens on success, no more than n_max_tokens - Returns a negative number on failure - the number of tokens that would have been returned""" - return lib.llama_tokenize(ctx, text.encode('utf-8'), tokens, n_max_tokens, add_bos) - -def llama_n_vocab(ctx: llama_context_p) -> c_int: - return lib.llama_n_vocab(ctx) - -def llama_n_ctx(ctx: llama_context_p) -> c_int: - return lib.llama_n_ctx(ctx) - -def llama_n_embd(ctx: llama_context_p) -> c_int: - return lib.llama_n_embd(ctx) - -def llama_get_logits(ctx: llama_context_p) -> c_float_p: - """Token logits obtained from the last call to llama_eval() - The logits for the last token are stored in the last row - Can be mutated in order to change the probabilities of the next token - Rows: n_tokens - Cols: n_vocab""" - return lib.llama_get_logits(ctx) - -def llama_get_embeddings(ctx: llama_context_p) -> c_float_p: - """Get the embeddings for the input - shape: [n_embd] (1-dimensional)""" - return lib.llama_get_embeddings(ctx) - -def llama_token_to_str(ctx: llama_context_p, token: int) -> str: - """Token Id -> String. 
Uses the vocabulary in the provided context""" - return lib.llama_token_to_str(ctx, token).decode('utf-8', errors='ignore') - -def llama_token_bos() -> llama_token: - return lib.llama_token_bos() - -def llama_token_eos() -> llama_token: - return lib.llama_token_eos() - -def llama_token_nl() -> llama_token: - return lib.llama_token_nl() - -def llama_sample_repetition_penalty(ctx: llama_context_p, candidates: llama_token_data_array_p, last_tokens: llama_token_p, last_tokens_size: c_size_t, penalty: float): - lib.llama_sample_repetition_penalty(ctx, candidates, last_tokens, last_tokens_size, penalty) - -def llama_sample_frequency_and_presence_penalties(ctx: llama_context_p, candidates: llama_token_data_array_p, last_tokens: llama_token_p, last_tokens_size: c_size_t, alpha_frequency: float, alpha_presence: float): - lib.llama_sample_frequency_and_presence_penalties(ctx, candidates, last_tokens, last_tokens_size, alpha_frequency, alpha_presence) - -def llama_sample_softmax(ctx: llama_context_p, candidates: llama_token_data_array_p): - lib.llama_sample_softmax(ctx, candidates) - -def llama_sample_top_k(ctx: llama_context_p, candidates: llama_token_data_array_p, k: c_int, min_keep: c_size_t): - lib.llama_sample_top_k(ctx, candidates, k, min_keep) - -def llama_sample_top_p(ctx: llama_context_p, candidates: llama_token_data_array_p, p: float, min_keep: c_size_t): - lib.llama_sample_top_p(ctx, candidates, c_float(p), c_size_t(min_keep)) - -def llama_sample_tail_free(ctx: llama_context_p, candidates: llama_token_data_array_p, z: float, min_keep: c_size_t): - lib.llama_sample_tail_free(ctx, candidates, z, min_keep) - -def llama_sample_typical(ctx: llama_context_p, candidates: llama_token_data_array_p, p: float, min_keep: c_size_t): - lib.llama_sample_typical(ctx, candidates, p, min_keep) - -def llama_sample_temperature(ctx: llama_context_p, candidates: llama_token_data_array_p, temp: float): - lib.llama_sample_temperature(ctx, candidates, temp) - -def llama_sample_token_mirostat(ctx: llama_context_p, candidates: llama_token_data_array_p, tau: float, eta: float, m: c_int, mu: c_float_p) -> llama_token: - return lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) - -def llama_sample_token_mirostat_v2(ctx: llama_context_p, candidates: llama_token_data_array_p, tau: float, eta: float, mu: c_float_p) -> llama_token: - return lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) - -def llama_sample_token_greedy(ctx: llama_context_p, candidates: llama_token_data_array_p) -> llama_token: - return lib.llama_sample_token_greedy(ctx, candidates) - -def llama_sample_token(ctx: llama_context_p, candidates: llama_token_data_array_p) -> llama_token: - return lib.llama_sample_token(ctx, candidates) - -def llama_print_timings(ctx: llama_context_p): - lib.llama_print_timings(ctx) - -def llama_reset_timings(ctx: llama_context_p): - lib.llama_reset_timings(ctx) - -def llama_print_system_info() -> c_char_p: - return lib.llama_print_system_info() \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index cc3a23f0739fd..0000000000000 --- a/setup.py +++ /dev/null @@ -1,15 +0,0 @@ - -from setuptools import setup, find_packages -import glob, os - -setup( - name='llama_cpp', - version='0.0.1', - author='Anonymous', - author_email='', - license='All rights reserved', - packages=find_packages(where='py'), - package_dir={'': 'py'}, - install_requires=[], - entry_points={'console_scripts': []}, -) From 8fd8599f613115279c6997d04b5636702c6834da Mon Sep 17 00:00:00 2001 From: 
xaedes Date: Tue, 30 May 2023 17:07:03 +0200 Subject: [PATCH 65/86] rename baby-llama-text to train-text-from-scratch --- examples/CMakeLists.txt | 1 + examples/baby-llama/CMakeLists.txt | 4 ---- examples/train-text-from-scratch/CMakeLists.txt | 4 ++++ .../train-text-from-scratch.cpp} | 0 4 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 examples/train-text-from-scratch/CMakeLists.txt rename examples/{baby-llama/baby-llama-text.cpp => train-text-from-scratch/train-text-from-scratch.cpp} (100%) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e4ce5aca7b98b..e8eab351ded94 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -37,6 +37,7 @@ else() add_subdirectory(save-load-state) add_subdirectory(benchmark) add_subdirectory(baby-llama) + add_subdirectory(train-text-from-scratch) if(LLAMA_BUILD_SERVER) add_subdirectory(server) endif() diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt index c89dc792b9a0e..d2ce36367474f 100644 --- a/examples/baby-llama/CMakeLists.txt +++ b/examples/baby-llama/CMakeLists.txt @@ -2,7 +2,3 @@ set(TARGET baby-llama) add_executable(${TARGET} baby-llama.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) - -add_executable(${TARGET}-text baby-llama-text.cpp) -target_link_libraries(${TARGET}-text PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET}-text PRIVATE cxx_std_11) diff --git a/examples/train-text-from-scratch/CMakeLists.txt b/examples/train-text-from-scratch/CMakeLists.txt new file mode 100644 index 0000000000000..1a44c4961c084 --- /dev/null +++ b/examples/train-text-from-scratch/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET train-text-from-scratch) +add_executable(${TARGET} train-text-from-scratch.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp similarity index 100% rename from examples/baby-llama/baby-llama-text.cpp rename to examples/train-text-from-scratch/train-text-from-scratch.cpp From 7f172c1070d514e450e002e430957773093572ba Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 31 May 2023 00:25:37 +0200 Subject: [PATCH 66/86] replace auto parameters in lambda function --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index aa3b237890a8a..b8caded0047b3 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1652,7 +1652,7 @@ void shuffle_ints(int * begin, int * end) { for (int i=0; i Date: Wed, 31 May 2023 12:38:26 +0200 Subject: [PATCH 67/86] add #include --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index b8caded0047b3..45da62995896c 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include From 
01fc3faf71bc245fa3e31c160397ce915a17d8d3 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 31 May 2023 15:00:54 +0200 Subject: [PATCH 68/86] add explicit cast to fix compile error "error: non-constant-expression cannot be narrowed from type 'int64_t' (aka 'long long') to 'uint32_t' (aka 'unsigned int') in initializer list [-Wc++11-narrowing]" --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 45da62995896c..3dcb7fc16026b 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1788,7 +1788,10 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { const char * name = ggml_get_name(tensor); uint32_t name_len = strlen(name); uint32_t nd = tensor->n_dims; - uint32_t ne[4] = { tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3] }; + uint32_t ne[4] = { (uint32_t)tensor->ne[0], + (uint32_t)tensor->ne[1], + (uint32_t)tensor->ne[2], + (uint32_t)tensor->ne[3] }; file->write_u32(nd); file->write_u32(name_len); file->write_u32(tensor->type); From 83a34444afabc6838542b3f66b8c139c86273665 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 31 May 2023 15:02:38 +0200 Subject: [PATCH 69/86] remove trailing whitespace --- .../train-text-from-scratch/train-text-from-scratch.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 3dcb7fc16026b..7e8d80b940859 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1788,9 +1788,9 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { const char * name = ggml_get_name(tensor); uint32_t name_len = strlen(name); uint32_t nd = tensor->n_dims; - uint32_t ne[4] = { (uint32_t)tensor->ne[0], - (uint32_t)tensor->ne[1], - (uint32_t)tensor->ne[2], + uint32_t ne[4] = { (uint32_t)tensor->ne[0], + (uint32_t)tensor->ne[1], + (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3] }; file->write_u32(nd); file->write_u32(name_len); From 0e269665cd87fa803202fe18941a7478b41e7b92 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 1 Jun 2023 19:41:28 +0200 Subject: [PATCH 70/86] add ggml_opt_resume_g which accepts forward and backward cgraphs --- ggml.c | 11 +++++++++++ ggml.h | 8 ++++++++ 2 files changed, 19 insertions(+) diff --git a/ggml.c b/ggml.c index bde5f96985e5c..b16cd07a981e5 100644 --- a/ggml.c +++ b/ggml.c @@ -17457,6 +17457,17 @@ enum ggml_opt_result ggml_opt_resume( *gf = ggml_build_forward (f); *gb = ggml_build_backward(ctx, gf, true); + return ggml_opt_resume_g(ctx, opt, f, gf, gb); +} + +enum ggml_opt_result ggml_opt_resume_g( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb) { + + // build forward + backward compute graphs enum ggml_opt_result result = GGML_OPT_OK; switch (opt->params.type) { diff --git a/ggml.h b/ggml.h index a9750d89d73bf..1e85b2fb1aa27 100644 --- a/ggml.h +++ b/ggml.h @@ -1208,6 +1208,14 @@ extern "C" { struct ggml_opt_context * opt, struct ggml_tensor * f); + // continue optimizing the function defined by the tensor f + GGML_API enum ggml_opt_result ggml_opt_resume_g( + struct 
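// Hypothetical usage of the new entry point (a sketch mirroring what
// ggml_opt_resume itself now does, not part of this patch): build the graphs
// once, keep them, and call ggml_opt_resume_g repeatedly to continue
// optimizing with the same opt state:
//
//   struct ggml_cgraph gf = ggml_build_forward(f);
//   struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, /*keep=*/ true);
//   enum ggml_opt_result res = ggml_opt_resume_g(ctx, opt, f, &gf, &gb);
//
// This lets callers reuse manually constructed forward/backward graphs instead
// of rebuilding them on every call.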
ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb); + // // quantization // From 3164f9338109b139182def39921ad4131d57c1e1 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 1 Jun 2023 19:41:55 +0200 Subject: [PATCH 71/86] fix formulas in comments --- ggml.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml.c b/ggml.c index b16cd07a981e5..b3ae7f2f9030b 100644 --- a/ggml.c +++ b/ggml.c @@ -13592,12 +13592,12 @@ static void ggml_compute_forward_flash_attn_back_f32( // vcur = v[:M,:D,iq2,iq3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 // S0 = -Inf [D,1,1,1] // ~S1[i] = dot(kcur[:D,i], qcur) - // S1 = qcur.T @ kcur [M,1,1,1] grad[S1] = grad[S2] * scale + // S1 = qcur @ kcur.T [M,1,1,1] grad[S1] = grad[S2] * scale // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur - // ~S5[i] = dot(vcur[:,i],S4) - // S5 = S4.T @ vcur [D,1,1,1] grad[S5] = d[:D,iq1,iq2,iq3] + // ~S5[i] = dot(vcur[:,i], S4) + // S5 = S4 @ vcur.T [D,1,1,1] grad[S5] = d[:D,iq1,iq2,iq3] // ~dst[i,iq1,iq2,iq3] = S5[i] ^ // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,iq1,iq2,iq3] // dst backward-/ grad[dst] = d @@ -13615,7 +13615,7 @@ static void ggml_compute_forward_flash_attn_back_f32( // // in post-order: // - // S1 = qcur.T @ kcur + // S1 = qcur @ kcur.T // S2 = S1 * scale // S3 = diag_mask_inf(S2, P) // S4 = softmax(S3) @@ -13628,7 +13628,7 @@ static void ggml_compute_forward_flash_attn_back_f32( // // using less variables (SM=S4): // - // S = diag_mask_inf(qcur.T @ kcur * scale, P) + // S = diag_mask_inf(qcur @ kcur.T * scale, P) // SM = softmax(S) // S = d[:D,iq1,iq2,iq3] @ vcur // dot_SM_gradSM = dot(SM, S) From 765b290010f544613d97a215dea56a3147fed084 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 1 Jun 2023 19:42:51 +0200 Subject: [PATCH 72/86] bug fix for ggml_compute_forward_get_rows_back_f32 the result should be set to zero, not to whatever data is in opt0 --- ggml.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index b3ae7f2f9030b..77eb43d06be6e 100644 --- a/ggml.c +++ b/ggml.c @@ -11016,7 +11016,11 @@ static void ggml_compute_forward_get_rows_back_f32( GGML_ASSERT(ggml_is_contiguous(opt0)); GGML_ASSERT(ggml_is_contiguous(dst)); - ggml_compute_forward_dup_same_cont(params, opt0, dst); + // ggml_compute_forward_dup_same_cont(params, opt0, dst); + + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + } if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; From 0d4b87de3de6e0d910de5a0a2416ef6b10332fbe Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 1 Jun 2023 19:50:48 +0200 Subject: [PATCH 73/86] improve training memory usage with scratch buffers instead of relying on the automatic backward pass, we manually create the graph for the backward pass. it turns out that all backward pass operations need only temporary memory which can be reused after each layer. 
will compute backward pass for ALL model parameters --- .../train-text-from-scratch.cpp | 595 +++++++++++++++++- 1 file changed, 578 insertions(+), 17 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 7e8d80b940859..ee17bd8e43eb0 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1337,6 +1337,505 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( return inpL; } +struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( + struct my_llama_model * model, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_tensor * * logits, + struct ggml_tensor * tokens_input, + struct ggml_tensor * targets, + void * compute_buf_0, + void * compute_buf_1, + void * compute_buf_2, + size_t size_buf_0, + size_t size_buf_1, + size_t size_buf_2, + const int n_tokens, + const int n_batch) { + + ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + + const int n_past = 0; + const int N = n_tokens; + + gf->n_nodes = 0; + gf->n_leafs = 0; + gf->work_size = 0; + gf->perf_runs = 0; + gf->perf_cycles = 0; + gf->perf_time_us = 0; + gf->work = NULL; + + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + const int n_ff = get_n_ff(&hparams); + const int rope_mode = 0; + + auto expand = [] (struct ggml_cgraph * g, struct ggml_tensor * t) -> struct ggml_tensor * { + ggml_build_forward_expand(g, t); + return t; + }; + + int last_buf = -1; + size_t buf_offs[3] = { 0, 0, 0 }; + size_t buf_size[3] = { size_buf_0, + size_buf_1, + size_buf_2 }; + void * buf_data[3] = { compute_buf_0, + compute_buf_1, + compute_buf_2 }; + auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data] (int buf) { + size_t last_offs = 0; + last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + if (last_buf >= 0) { + buf_offs[last_buf] = last_offs; + } + if (buf >= 0) { + size_t offs = buf_offs[buf]; + size_t size = buf_size[buf]; + void * data = buf_data[buf]; + ggml_set_scratch(ctx0, { offs, size, data, }); + } + last_buf = buf; + }; + + auto clr_buf = [&buf_offs] (int buf) { + if (buf < 0) return; + // size_t last_offs = 0; + // last_offs = ggml_set_scratch(ctx, { 0, 0, nullptr, }); + // if (last_buf >= 0) { + // buf_offs[last_buf] = last_offs; + // } + // buf_max_size[buf] = std::max(buf_max_size[buf], buf_offs[buf]); + buf_offs[buf] = 0; + // if (last_buf >= 0) { + // size_t offs = buf_offs[last_buf]; + // size_t size = buf_size[last_buf]; + // void * data = buf_data[last_buf]; + // ggml_set_scratch(ctx0, { offset, size, data, }); + // } + }; + + auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { + int64_t ne0 = n_embd/n_head; + int64_t ne1 = N; + int64_t ne2 = n_head; + int64_t ne3 = n_batch; + size_t nb0 = ggml_element_size(t); + size_t nb1 = nb0*ne0; + size_t nb2 = nb1*ne1; + size_t nb3 = nb2*ne2; + size_t offset = 0; + return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); + }; + + auto view__k = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { + int64_t ne0 = n_embd/n_head; + int64_t ne1 = N; + int64_t ne2 = n_head; + int64_t ne3 = n_batch; + size_t nb0 = ggml_element_size(t); + 
size_t nb1 = nb0*ne0; + size_t nb2 = nb1*ne1; + size_t nb3 = nb2*ne2; + size_t offset = nb3*ne3; + return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); + }; + + auto view__v = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { + int64_t ne0 = N; + int64_t ne1 = n_embd/n_head; + int64_t ne2 = n_head; + int64_t ne3 = n_batch; + size_t nb0 = ggml_element_size(t); + size_t nb1 = nb0*ne0; + size_t nb2 = nb1*ne1; + size_t nb3 = nb2*ne2; + size_t offset = 2*nb3*ne3; + return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); + }; + + auto add_or_set = [ctx0] (struct ggml_tensor * a, struct ggml_tensor * b) -> struct ggml_tensor * { + if (a == NULL) { + return b; + } else { + return ggml_add_inplace(ctx0, a, b); + } + }; + + use_buf(-1); + + model->tok_embeddings->grad = ggml_dup_tensor(ctx0, model->tok_embeddings->grad); + model->norm->grad = ggml_dup_tensor(ctx0, model->norm->grad); + model->output->grad = ggml_dup_tensor(ctx0, model->output->grad); + + for (int il = 0; il < n_layer; ++il) { + struct my_llama_layer & layer = model->layers[il]; + layer.attention_norm->grad = ggml_dup_tensor(ctx0, layer.attention_norm->grad); + layer.wq->grad = ggml_dup_tensor(ctx0, layer.wq->grad); + layer.wk->grad = ggml_dup_tensor(ctx0, layer.wk->grad); + layer.wv->grad = ggml_dup_tensor(ctx0, layer.wv->grad); + layer.wo->grad = ggml_dup_tensor(ctx0, layer.wo->grad); + layer.ffn_norm->grad = ggml_dup_tensor(ctx0, layer.ffn_norm->grad); + layer.w1->grad = ggml_dup_tensor(ctx0, layer.w1->grad); + layer.w2->grad = ggml_dup_tensor(ctx0, layer.w2->grad); + layer.w3->grad = ggml_dup_tensor(ctx0, layer.w3->grad); + } + + clr_buf(1); + clr_buf(2); + + use_buf(0); + + struct ggml_tensor * t00 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); assert_shape_1d(t00, N*n_batch); + memcpy(t00->data, tokens_input->data, ggml_element_size(t00)*N*n_batch); + + struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch); + + // need to remember these for the backward pass + std::vector t02L; t02L.resize(n_layer, NULL); + std::vector t03L; t03L.resize(n_layer, NULL); + std::vector t04L; t04L.resize(n_layer, NULL); + std::vector t05L; t05L.resize(n_layer, NULL); + std::vector t06L; t06L.resize(n_layer, NULL); + std::vector t07L; t07L.resize(n_layer, NULL); + std::vector t08L; t08L.resize(n_layer, NULL); + std::vector t09L; t09L.resize(n_layer, NULL); + std::vector t10L; t10L.resize(n_layer, NULL); + std::vector t11L; t11L.resize(n_layer, NULL); + std::vector t12L; t12L.resize(n_layer, NULL); + std::vector t13L; t13L.resize(n_layer, NULL); + std::vector t14L; t14L.resize(n_layer, NULL); + std::vector t15L; t15L.resize(n_layer, NULL); + std::vector t16L; t16L.resize(n_layer, NULL); + std::vector t17L; t17L.resize(n_layer, NULL); + std::vector t18L; t18L.resize(n_layer, NULL); + std::vector t19L; t19L.resize(n_layer, NULL); + std::vector t20L; t20L.resize(n_layer, NULL); + std::vector t21L; t21L.resize(n_layer, NULL); + std::vector t22L; t22L.resize(n_layer, NULL); + std::vector t23L; t23L.resize(n_layer, NULL); + std::vector t24L; t24L.resize(n_layer, NULL); + std::vector t25L; t25L.resize(n_layer, NULL); + std::vector t26L; t26L.resize(n_layer, NULL); + std::vector t27L; t27L.resize(n_layer, NULL); + std::vector t28L; t28L.resize(n_layer, NULL); + std::vector t29L; t29L.resize(n_layer, NULL); + std::vector t30L; t30L.resize(n_layer, NULL); + + struct ggml_tensor * cur = t01; + + for (int il = 
0; il < n_layer; ++il) { + clr_buf(1); + struct my_llama_layer & layer = model->layers[il]; + // tensors with values necessary for backward pass are in persistent buf(0) + // other tensors with buf(1) are only temporary needed, and their memory reused after layer is completed. + use_buf(0); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t02, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(1); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); + use_buf(0); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); + use_buf(1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); + use_buf(1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + use_buf(1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); + use_buf(1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); + use_buf(1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + use_buf(1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); + use_buf(1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); + use_buf(0); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); // n_embd/n_head, N, n_head, n_batch + use_buf(0); struct ggml_tensor * t14 = expand(gf, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); // n_embd/n_head, N, n_head, n_batch + use_buf(0); struct ggml_tensor * t15 = expand(gf, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); // N, n_embd/n_head, n_head, n_batch + use_buf(1); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + use_buf(1); struct ggml_tensor * t17 = expand(gf, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); + use_buf(1); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); + use_buf(0); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(1); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); + use_buf(0); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(0); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21)); assert_shape_2d(t22, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(1); struct ggml_tensor * 
t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); + use_buf(0); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(0); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); // n_ff, N*n_batch + use_buf(0); struct ggml_tensor * t26 = expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); // n_ff, N*n_batch + use_buf(0); struct ggml_tensor * t27 = expand(gf, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); // n_ff, N*n_batch + use_buf(0); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); // n_ff, N*n_batch + use_buf(1); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); + use_buf(0); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); // n_embd, N*n_batch + t02L[il] = t02; + t03L[il] = t03; + t04L[il] = t04; + t05L[il] = t05; + t06L[il] = t06; + t07L[il] = t07; + t08L[il] = t08; + t09L[il] = t09; + t10L[il] = t10; + t11L[il] = t11; + t12L[il] = t12; + t13L[il] = t13; + t14L[il] = t14; + t15L[il] = t15; + t16L[il] = t16; + t17L[il] = t17; + t18L[il] = t18; + t19L[il] = t19; + t20L[il] = t20; + t21L[il] = t21; + t22L[il] = t22; + t23L[il] = t23; + t24L[il] = t24; + t25L[il] = t25; + t26L[il] = t26; + t27L[il] = t27; + t28L[il] = t28; + t29L[il] = t29; + t30L[il] = t30; + + cur = t30; + } + clr_buf(1); + use_buf(1); + struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t31, n_embd, N*n_batch); + struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch); + struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch); + struct ggml_tensor * t34 = expand(gf, ggml_mul_mat (ctx0, model->output, t33)); assert_shape_2d(t34, n_vocab, N*n_batch); + struct ggml_tensor * t35 = expand(gf, ggml_reshape_3d(ctx0, t34, n_vocab, N, n_batch)); assert_shape_3d(t35, n_vocab, N, n_batch); + struct ggml_tensor * t36 = expand(gf, ggml_cross_entropy_loss(ctx0, t35, targets)); assert_shape_1d(t36, 1); + + { + /* + tok_embeddings | grad_tok_embeddings = ggml_get_rows_back(grad_t01, t00) + L0_att_norm | grad_L0_att_norm = ggml_repeat_back(grad_t03L0, L0_att_norm.shape) + L0_wq | grad_L0_wq = ggml_out_prod(t04L0, grad_t05L0) + L0_wk | grad_L0_wk = ggml_out_prod(t04L0, grad_t08L0) + L0_wv | grad_L0_wv = ggml_out_prod(t04L0, ggml_transpose(grad_t11L0)) + L0_wo | grad_L0_wo = ggml_out_prod(t19L0, grad_t20L0) + L0_ffn_norm | grad_L0_ffn_norm = ggml_repeat_back(grad_t23L0, L0_ffn_norm.shape) + L0_w1 | grad_L0_w1 = ggml_out_prod(t24L0, grad_t26L0) + L0_w2 | grad_L0_w2 = ggml_out_prod(t28L0, grad_t29L0) + L0_w3 | grad_L0_w3 = ggml_out_prod(t24L0, grad_t25L0) + L1_att_norm | grad_L1_att_norm = ggml_repeat_back(grad_t03L1, L1_att_norm.shape) + L1_wq | grad_L1_wq = ggml_out_prod(t04L1, grad_t05L1) + L1_wk | grad_L1_wk = ggml_out_prod(t04L1, grad_t08L1) + L1_wv | grad_L1_wv = ggml_out_prod(t04L1, ggml_transpose(grad_t11L1)) + L1_wo | grad_L1_wo = ggml_out_prod(t19L1, grad_t20L1) + L1_ffn_norm | grad_L1_ffn_norm = ggml_repeat_back(grad_t23L1, L1_ffn_norm.shape) + L1_w1 | grad_L1_w1 = ggml_out_prod(t24L1, grad_t26L1) + L1_w2 | grad_L1_w2 = ggml_out_prod(t28L1, grad_t29L1) + L1_w3 | grad_L1_w3 
= ggml_out_prod(t24L1, grad_t25L1) + norm | grad_norm = ggml_repeat_back(grad_t32, norm.shape) + output | grad_output = ggml_out_prod(t33, grad_t34) + | + t01 = ggml_get_rows(tok_embeddings, t00) | grad_t01 = grad_t21L0 + ggml_rms_norm_back(t01, grad_t02L0) + for layer: | + t02L0*= ggml_rms_norm (t01) | grad_t02L0 = ggml_mul(grad_t04L0, t03L0) + t03L0 = ggml_repeat (L0_att_norm, t02L0_shape) | grad_t03L0 = ggml_mul(grad_t04L0, t02L0) + t04L0*= ggml_mul (t02L0, t03L0) | grad_t04L0 = ggml_out_prod(L0_wv, grad_t11L0) + ggml_out_prod(L0_wk, ggml_transpose(grad_t08L0)) + ggml_out_prod(L0_wq, ggml_transpose(grad_t05L0)) + t05L0 = ggml_mul_mat (L0_wq, t04L0) | grad_t05L0 = ggml_reshape(grad_t06L0, t05L0_shape) + t06L0 = ggml_reshape_4d (t05L0, n_embd/n_head, n_head, N, n_batch) | grad_t06L0 = ggml_rope_back(grad_t07L0) + t07L0 = ggml_rope_inplace (t06L0) | grad_t07L0 = ggml_permute_back(grad_t13L0, 0, 2, 1, 3) = ggml_permute(grad_t13L0, 0, 2, 1, 3) + t08L0 = ggml_mul_mat (L0_wk, t04L0) | grad_t08L0 = ggml_reshape(grad_t09L0, t08L0_shape) + t09L0 = ggml_reshape_4d (t08L0, n_embd/n_head, n_head, N, n_batch) | grad_t09L0 = ggml_rope_back(grad_t10L0) + t10L0 = ggml_rope_inplace (t09L0) | grad_t10L0 = ggml_permute_back(grad_t14L0, 0, 2, 1, 3) = ggml_permute(grad_t14L0, 0, 2, 1, 3) + t11L0 = ggml_mul_mat (t04L0, L0_wv) | grad_t11L0 = ggml_reshape(grad_t12L0, t11L0_shape) + t12L0 = ggml_reshape_4d (t11L0, N, n_batch, n_embd/n_head, n_head) | grad_t12L0 = ggml_permute_back(grad_t15L0, 0, 3, 1, 2) = ggml_permute(grad_t15L0, 0, 2, 3, 1) + t13L0*= ggml_permute (t07L0, 0, 2, 1, 3) | grad_t13L0 = view__q(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) + t14L0*= ggml_permute (t10L0, 0, 2, 1, 3) | grad_t14L0 = view__k(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) + t15L0*= ggml_permute (t12L0, 0, 3, 1, 2) | grad_t15L0 = view__v(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) + t16L0 = ggml_flash_attn (t13L0, t14L0, t15L0) | grad_t16L0 = ggml_permute_back(grad_t17L0, 0, 2, 1, 3) = ggml_permute(grad_t17L0, 0, 2, 1, 3) + t17L0 = ggml_permute (t16L0, 0, 2, 1, 3) | grad_t17L0 = grad_t18L0 + t18L0 = ggml_cont (t17L0) | grad_t18L0 = ggml_reshape(grad_t19L0, t18L0_shape) + t19L0*= ggml_reshape_2d (t18L0, n_embd, N*n_batch) | grad_t19L0 = ggml_out_prod(L0_wo, ggml_transpose(grad_t20L0)) + t20L0 = ggml_mul_mat (L0_wo, t19L0) | grad_t20L0 = grad_t21L0 + t21L0*= ggml_add (t20L0, t01) | grad_t21L0 = grad_t30L0 + ggml_rms_norm_back(t21L0, grad_t22L0) + t22L0*= ggml_rms_norm (t21L0) | grad_t22L0 = ggml_mul(grad_t24L0, t23L0) + t23L0 = ggml_repeat (L0_ffn_norm, t22L0_shape) | grad_t23L0 = ggml_mul(grad_t24L0, t22L0) + t24L0*= ggml_mul (t23L0, t22L0) | grad_t24L0 = ggml_out_prod(L0_w1, ggml_transpose(grad_t26L0)) + ggml_out_prod(L0_w3, ggml_transpose(grad_t25L0)) + t25L0*= ggml_mul_mat (L0_w3, t24L0) | grad_t25L0 = ggml_mul(grad_t28L0, t27L0) + t26L0*= ggml_mul_mat (L0_w1, t24L0) | grad_t26L0 = ggml_silu_back(t26L0, grad_t27L0) + t27L0*= ggml_silu (t26L0) | grad_t27L0 = ggml_mul(grad_t28L0, t25L0) + t28L0*= ggml_mul (t27L0, t25L0) | grad_t28L0 = ggml_out_prod(L0_w2, ggml_transpose(grad_t29L0)) + t29L0 = ggml_mul_mat (L0_w2, t28L0) | grad_t29L0 = grad_t30L0 + t30L0*= ggml_add (t21L0, t29L0) | grad_t30L0 = ggml_rms_norm_back(t30L0, grad_t02L1) + grad_t21L1 + ^ + t02L1*= ggml_rms_norm (t30L0) | grad_t02L1 = ggml_mul(grad_t04L1, t03L1) + t03L1 = ggml_repeat (L1_att_norm, t02L1_shape) | grad_t03L1 = ggml_mul(grad_t04L1, t02L1) + t04L1*= ggml_mul (t02L1, t03L1) | grad_t04L1 = ggml_out_prod(L1_wv, grad_t11L1) + 
ggml_out_prod(L1_wk, ggml_transpose(grad_t08L1)) + ggml_out_prod(L1_wq, ggml_transpose(grad_t05L1)) + t05L1 = ggml_mul_mat (L1_wq, t04L1) | grad_t05L1 = ggml_reshape(grad_t06L1, t05L1_shape) + t06L1 = ggml_reshape_4d (t05L1, n_embd/n_head, n_head, N, n_batch) | grad_t06L1 = ggml_rope_back(grad_t07L1) + t07L1 = ggml_rope_inplace (t06L1) | grad_t07L1 = ggml_permute_back(grad_t13L1, 0, 2, 1, 3) = ggml_permute(grad_t13L1, 0, 2, 1, 3) + t08L1 = ggml_mul_mat (L1_wk, t04L1) | grad_t08L1 = ggml_reshape(grad_t09L1, t08L1_shape) + t09L1 = ggml_reshape_4d (t08L1, n_embd/n_head, n_head, N, n_batch) | grad_t09L1 = ggml_rope_back(grad_t10L1) + t10L1 = ggml_rope_inplace (t09L1) | grad_t10L1 = ggml_permute_back(grad_t14L1, 0, 2, 1, 3) = ggml_permute(grad_t14L1, 0, 2, 1, 3) + t11L1 = ggml_mul_mat (t04L1, L1_wv) | grad_t11L1 = ggml_reshape(grad_t12L1, t11L1_shape) + t12L1 = ggml_reshape_4d (t11L1, N, n_batch, n_embd/n_head, n_head) | grad_t12L1 = ggml_permute_back(grad_t15L1, 0, 3, 1, 2) = ggml_permute(grad_t15L1, 0, 2, 3, 1) + t13L1*= ggml_permute (t07L1, 0, 2, 1, 3) | grad_t13L1 = view__q(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) + t14L1*= ggml_permute (t10L1, 0, 2, 1, 3) | grad_t14L1 = view__k(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) + t15L1*= ggml_permute (t12L1, 0, 3, 1, 2) | grad_t15L1 = view__v(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) + t16L1 = ggml_flash_attn (t13L1, t14L1, t15L1) | grad_t16L1 = ggml_permute_back(grad_t17L1, 0, 2, 1, 3) = ggml_permute(grad_t17L1, 0, 2, 1, 3) + t17L1 = ggml_permute (t16L1, 0, 2, 1, 3) | grad_t17L1 = grad_t18L1 + t18L1 = ggml_cont (t17L1) | grad_t18L1 = ggml_reshape(grad_t19L1, t18L1_shape) + t19L1*= ggml_reshape_2d (t18L1, n_embd, N*n_batch) | grad_t19L1 = ggml_out_prod(L1_wo, ggml_transpose(grad_t20L1)) + t20L1 = ggml_mul_mat (L1_wo, t19L1) | grad_t20L1 = grad_t21L1 + t21L1*= ggml_add (t20L1, t30L0) | grad_t21L1 = grad_t30L1 + ggml_rms_norm_back(t21L1, grad_t22L1) + t22L1*= ggml_rms_norm (t21L1) | grad_t22L1 = ggml_mul(grad_t24L1, t23L1) + t23L1 = ggml_repeat (L1_ffn_norm, t22L1_shape) | grad_t23L1 = ggml_mul(grad_t24L1, t22L1) + t24L1*= ggml_mul (t23L1, t22L1) | grad_t24L1 = ggml_out_prod(L1_w1, ggml_transpose(grad_t26L1)) + ggml_out_prod(L1_w3, ggml_transpose(grad_t25L1)) + t25L1*= ggml_mul_mat (L1_w3, t24L1) | grad_t25L1 = ggml_mul(grad_t28L1, t27L1) + t26L1*= ggml_mul_mat (L1_w1, t24L1) | grad_t26L1 = ggml_silu_back(t26L1, grad_t27L1) + t27L1*= ggml_silu (t26L1) | grad_t27L1 = ggml_mul(grad_t28L1, t25L1) + t28L1*= ggml_mul (t27L1, t25L1) | grad_t28L1 = ggml_out_prod(L1_w2, ggml_transpose(grad_t29L1)) + t29L1 = ggml_mul_mat (L1_w2, t28L1) | grad_t29L1 = grad_t30L1 + t30L1*= ggml_add (t21L1, t29L1) | grad_t30L1 = ggml_rms_norm_back(t30L1, grad_t31) + ^ + t31 = ggml_rms_norm (t30L1) | grad_t31 = ggml_mul(grad_t33, t32) + t32 = ggml_repeat (norm, t31.shape) | grad_t32 = ggml_mul(grad_t33, t31) + t33 = ggml_mul (t32, t31) | grad_t33 = ggml_out_prod(output, ggml_transpose(grad_t34)) + t34 = ggml_mul_mat (output, t33) | grad_t34 = ggml_reshape(grad_t35, t34.shape) + t35 = ggml_reshape_3d (t34, n_vocab, N, n_batch) | grad_t35 = ggml_cross_entropy_loss_back(t35, targets, grad_t36) + t36 = ggml_cross_entropy_loss(t35, targets) | grad_t36 = 1 (optimizer) + tensors marked with * need to be stored until grad computation + tensors during grad computation are all temporary + */ + } + + *gb = *gf; + + use_buf(-1); + // t36->grad gets set to one by optimizer, so we need to create the tensor. + // initialize it with 1.0f to make sure. 
+ t36->grad = ggml_new_f32(ctx0, 1.0f); + + use_buf(1); + t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch); + t34->grad = expand(gb, ggml_reshape_2d (ctx0, t35->grad, n_vocab, N*n_batch)); assert_shape_2d(t34->grad, n_vocab, N*n_batch); + t33->grad = expand(gb, ggml_out_prod (ctx0, model->output, ggml_transpose(ctx0, t34->grad))); assert_shape_2d(t33->grad, n_embd, N*n_batch); + t32->grad = expand(gb, ggml_mul (ctx0, t33->grad, t31)); assert_shape_2d(t32->grad, n_embd, N*n_batch); + + use_buf(-1); + + model->norm->grad = expand(gb, add_or_set(model->norm->grad, ggml_repeat_back(ctx0, t32->grad, model->norm))); assert_shape_1d(model->norm->grad, n_embd); + model->output->grad = expand(gb, add_or_set(model->output->grad, ggml_out_prod(ctx0, t33, t34->grad))); assert_shape_2d(model->output->grad, n_embd, n_vocab); + + clr_buf(2); + use_buf(2); + t31->grad = expand(gb, ggml_mul(ctx0, t33->grad, t32)); assert_shape_2d(t31->grad, n_embd, N*n_batch); + + struct ggml_tensor * back_layer_inp = t31; + struct ggml_tensor * grad_layer_inp = NULL; + + for (int k = 0; k < n_layer; ++k) { + int il = n_layer-1-k; + struct my_llama_layer & layer = model->layers[il]; + + struct ggml_tensor * t02 = t02L[il]; + struct ggml_tensor * t03 = t03L[il]; + struct ggml_tensor * t04 = t04L[il]; + struct ggml_tensor * t05 = t05L[il]; + struct ggml_tensor * t06 = t06L[il]; + struct ggml_tensor * t07 = t07L[il]; + struct ggml_tensor * t08 = t08L[il]; + struct ggml_tensor * t09 = t09L[il]; + struct ggml_tensor * t10 = t10L[il]; + struct ggml_tensor * t11 = t11L[il]; + struct ggml_tensor * t12 = t12L[il]; + struct ggml_tensor * t13 = t13L[il]; + struct ggml_tensor * t14 = t14L[il]; + struct ggml_tensor * t15 = t15L[il]; + struct ggml_tensor * t16 = t16L[il]; + struct ggml_tensor * t17 = t17L[il]; + struct ggml_tensor * t18 = t18L[il]; + struct ggml_tensor * t19 = t19L[il]; + struct ggml_tensor * t20 = t20L[il]; + struct ggml_tensor * t21 = t21L[il]; + struct ggml_tensor * t22 = t22L[il]; + struct ggml_tensor * t23 = t23L[il]; + struct ggml_tensor * t24 = t24L[il]; + struct ggml_tensor * t25 = t25L[il]; + struct ggml_tensor * t26 = t26L[il]; + struct ggml_tensor * t27 = t27L[il]; + struct ggml_tensor * t28 = t28L[il]; + struct ggml_tensor * t29 = t29L[il]; + struct ggml_tensor * t30 = t30L[il]; + + clr_buf(1); + use_buf(1); + t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + if (grad_layer_inp) { + t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + } + clr_buf(2); + t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); + t28->grad = expand(gb, ggml_out_prod(ctx0, layer.w2, ggml_transpose(ctx0, t29->grad))); assert_shape_2d(t28->grad, n_ff, N*n_batch); + t27->grad = expand(gb, ggml_mul(ctx0, t28->grad, t25)); assert_shape_2d(t27->grad, n_ff, N*n_batch); + t26->grad = expand(gb, ggml_silu_back(ctx0, t26, t27->grad)); assert_shape_2d(t26->grad, n_ff, N*n_batch); + t25->grad = expand(gb, ggml_mul(ctx0, t28->grad, t27)); assert_shape_2d(t25->grad, n_ff, N*n_batch); + t24->grad = expand(gb, ggml_add_inplace(ctx0, + ggml_out_prod(ctx0, layer.w1, ggml_transpose(ctx0, t26->grad)), + ggml_out_prod(ctx0, layer.w3, ggml_transpose(ctx0, t25->grad)))); assert_shape_2d(t24->grad, n_embd, N*n_batch); + t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22)); assert_shape_2d(t23->grad, n_embd, N*n_batch); 
+ t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad))); assert_shape_2d(t22->grad, n_embd, N*n_batch); + use_buf(2); + t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad))); assert_shape_2d(t21->grad, n_embd, N*n_batch); + grad_layer_inp = t21; + use_buf(1); + t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); + t19->grad = expand(gb, ggml_out_prod(ctx0, layer.wo, ggml_transpose(ctx0, t20->grad))); assert_shape_2d(t19->grad, n_embd, N*n_batch); + t18->grad = expand(gb, ggml_reshape(ctx0, t19->grad, t18)); assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch); + t17->grad = t18->grad; assert_shape_4d(t17->grad, n_embd/n_head, n_head, N, n_batch); + t16->grad = expand(gb, ggml_permute(ctx0, t17->grad, 0, 2, 1, 3)); assert_shape_4d(t16->grad, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * flash_attn = expand(gb, ggml_flash_attn_back(ctx0, t13, t14, t15, t16->grad, true)); assert_shape_4d(flash_attn, n_embd/n_head, N*3, n_head, n_batch); + t15->grad = expand(gb, view__v(flash_attn)); assert_shape_4d(t15->grad, N, n_embd/n_head, n_head, n_batch); + t14->grad = expand(gb, view__k(flash_attn)); assert_shape_4d(t14->grad, n_embd/n_head, N, n_head, n_batch); + t13->grad = expand(gb, view__q(flash_attn)); assert_shape_4d(t13->grad, n_embd/n_head, N, n_head, n_batch); + t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head); + t11->grad = expand(gb, ggml_reshape(ctx0, ggml_cont(ctx0, t12->grad), t11)); assert_shape_2d(t11->grad, N*n_batch, n_embd); + t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch); + t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); + t08->grad = expand(gb, ggml_reshape(ctx0, t09->grad, t08)); assert_shape_2d(t08->grad, n_embd, N*n_batch); + t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch); + t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch); + t05->grad = expand(gb, ggml_reshape(ctx0, t06->grad, t05)); assert_shape_2d(t05->grad, n_embd, N*n_batch); + t04->grad = expand(gb, ggml_add_inplace(ctx0, + ggml_add_inplace(ctx0, + ggml_out_prod(ctx0, layer.wv, t11->grad), + ggml_out_prod(ctx0, layer.wk, ggml_transpose(ctx0, t08->grad))), + ggml_out_prod(ctx0, layer.wq, ggml_transpose(ctx0, t05->grad)))); assert_shape_2d(t04->grad, n_embd, N*n_batch); + t03->grad = expand(gb, ggml_mul(ctx0, t04->grad, t02)); assert_shape_2d(t04->grad, n_embd, N*n_batch); + use_buf(2); + t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, t03)); assert_shape_2d(t02->grad, n_embd, N*n_batch); + back_layer_inp = t02->grad; + use_buf(1); + + use_buf(-1); + layer.attention_norm->grad = expand(gb, add_or_set(layer.attention_norm->grad, ggml_repeat_back(ctx0, t03->grad, layer.attention_norm))); assert_shape_1d(layer.attention_norm->grad, n_embd); + layer.wq->grad = expand(gb, add_or_set(layer.wq->grad, ggml_out_prod(ctx0, t04, t05->grad))); assert_shape_2d(layer.wq->grad, n_embd, n_embd); + layer.wk->grad = expand(gb, add_or_set(layer.wk->grad, ggml_out_prod(ctx0, t04, t08->grad))); assert_shape_2d(layer.wk->grad, n_embd, n_embd); + layer.wv->grad = expand(gb, 
add_or_set(layer.wv->grad, ggml_out_prod(ctx0, t04, ggml_transpose(ctx0, t11->grad)))); assert_shape_2d(layer.wv->grad, n_embd, n_embd); + layer.wo->grad = expand(gb, add_or_set(layer.wo->grad, ggml_out_prod(ctx0, t19, t20->grad))); assert_shape_2d(layer.wo->grad, n_embd, n_embd); + layer.ffn_norm->grad = expand(gb, add_or_set(layer.ffn_norm->grad, ggml_repeat_back(ctx0, t23->grad, layer.ffn_norm))); assert_shape_1d(layer.ffn_norm->grad, n_embd); + layer.w1->grad = expand(gb, add_or_set(layer.w1->grad, ggml_out_prod(ctx0, t24, t26->grad))); assert_shape_2d(layer.w1->grad, n_embd, n_ff); + layer.w2->grad = expand(gb, add_or_set(layer.w2->grad, ggml_out_prod(ctx0, t28, t29->grad))); assert_shape_2d(layer.w2->grad, n_ff, n_embd); + layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); + use_buf(1); + } + clr_buf(1); + use_buf(1); + t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad))); assert_shape_2d(t01->grad, n_embd, N*n_batch); + use_buf(-1); + model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); + clr_buf(2); + clr_buf(1); + + *logits = t35; + + return t36; +} + void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) { float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); *ptr = value; @@ -2129,6 +2628,9 @@ struct train_params { int mem_model_gb; int mem_compute_gb; + int mem_compute0_gb; + int mem_compute1_gb; + int mem_compute2_gb; }; struct train_params get_default_train_params() { @@ -2172,7 +2674,10 @@ struct train_params get_default_train_params() { params.adam_decay = 1e-3; params.mem_model_gb = 2; - params.mem_compute_gb = 32; + params.mem_compute_gb = 8; + params.mem_compute0_gb = 24; + params.mem_compute1_gb = 8; + params.mem_compute2_gb = 8; return params; } @@ -2215,6 +2720,9 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); + fprintf(stderr, " --mem-compute0 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute0_gb); + fprintf(stderr, " --mem-compute1 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute1_gb); + fprintf(stderr, " --mem-compute2 N Memory to allocate for compute in gigabytes. 
(default %d)\n", params->mem_compute2_gb); fprintf(stderr, "\n"); } @@ -2408,6 +2916,24 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->mem_compute_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute0") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute0_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute1") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute1_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute2") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute2_gb = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { train_print_usage(argc, argv, &default_params); exit(0); @@ -2563,6 +3089,13 @@ int main(int argc, char ** argv) { size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb); uint8_t * compute_addr = new uint8_t[compute_size]; + size_t size_buf_0 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb); + size_t size_buf_1 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute1_gb); + size_t size_buf_2 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute2_gb); + uint8_t * compute_buf_0 = new uint8_t[size_buf_0]; + uint8_t * compute_buf_1 = new uint8_t[size_buf_1]; + uint8_t * compute_buf_2 = new uint8_t[size_buf_2]; + GGML_ASSERT(train_tokens.size() > n_tokens);; std::vector train_samples; train_samples.push_back(0); @@ -2601,22 +3134,46 @@ int main(int argc, char ** argv) { int n_past = 0; - ggml_cgraph gf = {}; - gf.n_threads = params.n_threads; + struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); + struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); - get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); + struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; + struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; - struct ggml_tensor * logits = - (n_past == 0) - ? (params.use_flash - ? forward_batch_wo_cache_flash_attn(&model, ctx0, &gf, tokens_input, n_tokens, n_batch) - : forward_batch_wo_cache(&model, ctx0, &gf, tokens_input, n_tokens, n_batch)) - : forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch); + // ggml_cgraph gf = {}; + gf->n_threads = params.n_threads; + gb->n_threads = params.n_threads; - struct ggml_tensor * e = cross_entropy_loss(ctx0, logits, target_probs); + get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); - ggml_build_forward_expand(&gf, e); - ggml_graph_compute(ctx0, &gf); + // struct ggml_tensor * logits = + // (n_past == 0) + // ? (params.use_flash + // ? 
forward_batch_wo_cache_flash_attn(&model, ctx0, &gf, tokens_input, n_tokens, n_batch) + // : forward_batch_wo_cache(&model, ctx0, &gf, tokens_input, n_tokens, n_batch)) + // : forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch); + + // struct ggml_tensor * e = cross_entropy_loss(ctx0, logits, target_probs); + struct ggml_tensor * logits; + struct ggml_tensor * e = forward_batch_wo_cache_flash_attn_train( + &model, + ctx0, + gf, + gb, + &logits, + tokens_input, + target_probs, + compute_buf_0, + compute_buf_1, + compute_buf_2, + size_buf_0, + size_buf_1, + size_buf_2, + n_tokens, + n_batch); + + // ggml_build_forward_expand(&gf, e); + ggml_graph_compute(ctx0, gf); size_t used_mem_before_opt = ggml_used_mem(ctx0); @@ -2633,7 +3190,8 @@ int main(int argc, char ** argv) { printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); // ggml_opt(ctx0, opt->params, e); - ggml_opt_resume(ctx0, opt, e); + // ggml_opt_resume(ctx0, opt, e); + ggml_opt_resume_g(ctx0, opt, e, gf, gb); size_t used_mem_after_opt = ggml_used_mem(ctx0); @@ -2641,8 +3199,8 @@ int main(int argc, char ** argv) { model.train_samples += n_batch; model.train_tokens += n_batch * n_tokens; - ggml_build_forward_expand(&gf, e); - ggml_graph_compute(ctx0, &gf); + //ggml_build_forward_expand(&gf, e); + ggml_graph_compute(ctx0, gf); float error_after_opt = ggml_get_f32_1d(e, 0); @@ -2753,7 +3311,10 @@ int main(int argc, char ** argv) { } } - free(compute_addr); + delete[] compute_addr; + delete[] compute_buf_0; + delete[] compute_buf_1; + delete[] compute_buf_2; ggml_free(model.ctx); return 0; From d9626743ac8d299f591d0a04fe04129ed58f0fb6 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 1 Jun 2023 20:59:19 +0200 Subject: [PATCH 74/86] add option to use scratch buffers in training or not make it configurable because currently training with scratch buffers implies flash attention and optimization over all parameters. 
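Both this patch and the previous one revolve around the three-buffer scratch scheme. What follows is a condensed, editorial sketch of the use_buf/clr_buf lambdas from forward_batch_wo_cache_flash_attn_train above, not part of the patch itself; it assumes ggml.h, an initialized ggml_context and three caller-provided buffers, and the trailing calls only indicate the intended rotation:

#include "ggml.h"

static void scratch_rotation_sketch(struct ggml_context * ctx,
                                    void * data[3], size_t size[3]) {
    int    last_buf    = -1;
    size_t buf_offs[3] = { 0, 0, 0 };

    // route subsequent tensor allocations into scratch buffer `buf`
    // (-1 = back to the context's own memory); remember how far the
    // previous buffer is filled so it can be resumed later
    auto use_buf = [&](int buf) {
        size_t last_offs = ggml_set_scratch(ctx, { 0, 0, nullptr });
        if (last_buf >= 0) { buf_offs[last_buf] = last_offs; }
        if (buf      >= 0) { ggml_set_scratch(ctx, { buf_offs[buf], size[buf], data[buf] }); }
        last_buf = buf;
    };

    // a buffer whose tensors are no longer needed is simply rewound,
    // so the next layer reuses the same memory
    auto clr_buf = [&](int buf) {
        if (buf >= 0) { buf_offs[buf] = 0; }
    };

    use_buf(0);  // tensors the backward pass will read again
    use_buf(1);  // per-layer temporaries
    clr_buf(1);  // reuse buffer 1 for the next layer
    use_buf(-1); // model parameters and their gradients
}
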
--- .../train-text-from-scratch.cpp | 69 ++++++++++--------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index ee17bd8e43eb0..ff6167da8666b 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2614,6 +2614,7 @@ struct train_params { bool samples_start_after_nl; bool use_adam; bool use_flash; + bool use_scratch; // only adam int warmup; @@ -2661,6 +2662,7 @@ struct train_params get_default_train_params() { params.samples_start_after_nl = false; params.use_adam = true; params.use_flash = true; + params.use_scratch = true; // only adam params.warmup = 100; @@ -2710,6 +2712,8 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); fprintf(stderr, " --no-flash Don't use flash attention.\n"); fprintf(stderr, " --use-flash Use flash attention (default)\n"); + fprintf(stderr, " --no-scratch Don't use scratch buffers\n"); + fprintf(stderr, " --use-scratch Use scratch buffers (default)\n"); fprintf(stderr, " --warmup N Number of warmup steps (default %d)\n", params->warmup); fprintf(stderr, " --cos-decay-steps N Number of cosine decay steps (default %d)\n", params->cos_decay_steps); fprintf(stderr, " --cos-decay-restart N Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); @@ -2856,6 +2860,10 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { params->use_flash = false; } else if (arg == "--use-flash") { params->use_flash = true; + } else if (arg == "--no-scratch") { + params->use_scratch = false; + } else if (arg == "--use-scratch") { + params->use_scratch = true; } else if (arg == "--warmup") { if (++i >= argc) { invalid_param = true; @@ -3146,38 +3154,36 @@ int main(int argc, char ** argv) { get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); - // struct ggml_tensor * logits = - // (n_past == 0) - // ? (params.use_flash - // ? 
forward_batch_wo_cache_flash_attn(&model, ctx0, &gf, tokens_input, n_tokens, n_batch) - // : forward_batch_wo_cache(&model, ctx0, &gf, tokens_input, n_tokens, n_batch)) - // : forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch); - - // struct ggml_tensor * e = cross_entropy_loss(ctx0, logits, target_probs); - struct ggml_tensor * logits; - struct ggml_tensor * e = forward_batch_wo_cache_flash_attn_train( - &model, - ctx0, - gf, - gb, - &logits, - tokens_input, - target_probs, - compute_buf_0, - compute_buf_1, - compute_buf_2, - size_buf_0, - size_buf_1, - size_buf_2, - n_tokens, - n_batch); - - // ggml_build_forward_expand(&gf, e); + GGML_ASSERT(n_past == 0); + + struct ggml_tensor * loss = NULL; + struct ggml_tensor * logits = NULL; + + if (params.use_scratch) { + loss = forward_batch_wo_cache_flash_attn_train( + &model, ctx0, + gf, gb, + &logits, tokens_input, target_probs, + compute_buf_0, compute_buf_1, compute_buf_2, + size_buf_0, size_buf_1, size_buf_2, + n_tokens, n_batch); + } else if (params.use_flash) { + logits = forward_batch_wo_cache_flash_attn(&model, ctx0, gf, tokens_input, n_tokens, n_batch); + loss = cross_entropy_loss(ctx0, logits, target_probs); + ggml_build_forward_expand(gf, loss); + *gb = ggml_build_backward(ctx0, gf, true); + } else { + logits = forward_batch_wo_cache(&model, ctx0, gf, tokens_input, n_tokens, n_batch); + loss = cross_entropy_loss(ctx0, logits, target_probs); + ggml_build_forward_expand(gf, loss); + *gb = ggml_build_backward(ctx0, gf, true); + } + ggml_graph_compute(ctx0, gf); size_t used_mem_before_opt = ggml_used_mem(ctx0); - float error_before_opt = ggml_get_f32_1d(e, 0); + float error_before_opt = ggml_get_f32_1d(loss, 0); opt->params.adam.sched = (opt->iter < params.warmup) ? 
(float) opt->iter / (float) params.warmup @@ -3189,9 +3195,7 @@ int main(int argc, char ** argv) { printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); - // ggml_opt(ctx0, opt->params, e); - // ggml_opt_resume(ctx0, opt, e); - ggml_opt_resume_g(ctx0, opt, e, gf, gb); + ggml_opt_resume_g(ctx0, opt, loss, gf, gb); size_t used_mem_after_opt = ggml_used_mem(ctx0); @@ -3199,10 +3203,9 @@ int main(int argc, char ** argv) { model.train_samples += n_batch; model.train_tokens += n_batch * n_tokens; - //ggml_build_forward_expand(&gf, e); ggml_graph_compute(ctx0, gf); - float error_after_opt = ggml_get_f32_1d(e, 0); + float error_after_opt = ggml_get_f32_1d(loss, 0); if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) { printf("Example %d, opt iter %d\n", ex, opt->iter); From b58d73ca8c5ea1baf42c24db58746b9e763384af Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 29 May 2023 20:57:24 +0300 Subject: [PATCH 75/86] ci : disable temporary --- .github/workflows/editorconfig.yml | 17 ----------------- .github/workflows/tidy-post.yml | 20 -------------------- .github/workflows/tidy-review.yml | 23 ----------------------- 3 files changed, 60 deletions(-) delete mode 100644 .github/workflows/editorconfig.yml delete mode 100644 .github/workflows/tidy-post.yml delete mode 100644 .github/workflows/tidy-review.yml diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml deleted file mode 100644 index b4e535acf1f64..0000000000000 --- a/.github/workflows/editorconfig.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: EditorConfig Checker - -on: - push: - branches: - - master - pull_request: - branches: - - master - -jobs: - editorconfig: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: editorconfig-checker/action-editorconfig-checker@main - - run: editorconfig-checker diff --git a/.github/workflows/tidy-post.yml b/.github/workflows/tidy-post.yml deleted file mode 100644 index a58da0cd6493d..0000000000000 --- a/.github/workflows/tidy-post.yml +++ /dev/null @@ -1,20 +0,0 @@ -name: clang-tidy review post comments - -on: - workflow_run: - workflows: ["clang-tidy-review"] - types: - - completed - -jobs: - build: - runs-on: ubuntu-latest - - steps: - - uses: ZedThree/clang-tidy-review/post@v0.13.0 - # lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup - with: - # adjust options as necessary - lgtm_comment_body: '' - annotations: false - max_comments: 25 diff --git a/.github/workflows/tidy-review.yml b/.github/workflows/tidy-review.yml deleted file mode 100644 index a4bc8d976560e..0000000000000 --- a/.github/workflows/tidy-review.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: clang-tidy-review - -on: - pull_request: - branches: - - master - -jobs: - clang-tidy-review: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - - uses: ZedThree/clang-tidy-review@v0.13.0 - id: review - with: - lgtm_comment_body: '' - build_dir: build - cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on - split_workflow: true - - - uses: ZedThree/clang-tidy-review/upload@v0.13.0 From 6b7487d104fe29e09b3666020d6ad1ae20b8b0c6 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 8 Jun 2023 02:33:57 +0200 Subject: [PATCH 76/86] store view offset and permute axes in opt[0] instead of storing it in padding use memcpy to store offset, because offset is of type size_t. when storing it as int32_t offset would have to be smaller than 2^31 which is not necessarily true. 
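A minimal standalone sketch of the round-trip described above, in plain C with hypothetical values; the two memcpy calls mirror the ones added to ggml_view_*d and ggml_compute_backward in the diff below:

#include <stdint.h>
#include <string.h>
#include <assert.h>

int main(void) {
    size_t  offset = (size_t) 3 << 31;            /* does not fit in an int32_t */
    int32_t storage[2] = { 0, 0 };                /* stands in for the 2-element I32 tensor */

    assert(sizeof(offset) <= sizeof(storage));
    memcpy(storage, &offset, sizeof(offset));     /* store, as in ggml_view_*d */

    size_t restored = 0;
    memcpy(&restored, storage, sizeof(restored)); /* load, as in ggml_compute_backward */
    assert(restored == offset);
    return 0;
}
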
--- ggml.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 64 insertions(+), 13 deletions(-) diff --git a/ggml.c b/ggml.c index 77eb43d06be6e..e64dac6329af6 100644 --- a/ggml.c +++ b/ggml.c @@ -5884,7 +5884,17 @@ struct ggml_tensor * ggml_view_1d( result->src1 = NULL; if (is_node) { - memcpy(result->padding, &offset, sizeof(offset)); + ggml_scratch_save(ctx); + + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + + GGML_ASSERT(sizeof(offset) <= ggml_nbytes(b)); + + memcpy(b->data, &offset, sizeof(offset)); + + ggml_scratch_load(ctx); + + result->opt[0] = b; } return result; @@ -5920,7 +5930,17 @@ struct ggml_tensor * ggml_view_2d( result->src1 = NULL; if (is_node) { - memcpy(result->padding, &offset, sizeof(offset)); + ggml_scratch_save(ctx); + + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + + GGML_ASSERT(sizeof(offset) <= ggml_nbytes(b)); + + memcpy(b->data, &offset, sizeof(offset)); + + ggml_scratch_load(ctx); + + result->opt[0] = b; } return result; @@ -5958,7 +5978,17 @@ struct ggml_tensor * ggml_view_3d( result->src1 = NULL; if (is_node) { - memcpy(result->padding, &offset, sizeof(offset)); + ggml_scratch_save(ctx); + + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + + GGML_ASSERT(sizeof(offset) <= ggml_nbytes(b)); + + memcpy(b->data, &offset, sizeof(offset)); + + ggml_scratch_load(ctx); + + result->opt[0] = b; } return result; @@ -5998,7 +6028,17 @@ struct ggml_tensor * ggml_view_4d( result->src1 = NULL; if (is_node) { - memcpy(result->padding, &offset, sizeof(offset)); + ggml_scratch_save(ctx); + + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + + GGML_ASSERT(sizeof(offset) <= ggml_nbytes(b)); + + memcpy(b->data, &offset, sizeof(offset)); + + ggml_scratch_load(ctx); + + result->opt[0] = b; } return result; @@ -6062,10 +6102,18 @@ struct ggml_tensor * ggml_permute( result->src1 = NULL; if (is_node) { - result->padding[0] = axis0; - result->padding[1] = axis1; - result->padding[2] = axis2; - result->padding[3] = axis3; + ggml_scratch_save(ctx); + + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); + + ((int32_t *) b->data)[0] = axis0; + ((int32_t *) b->data)[1] = axis1; + ((int32_t *) b->data)[2] = axis2; + ((int32_t *) b->data)[3] = axis3; + + ggml_scratch_load(ctx); + + result->opt[0] = b; } return result; @@ -14834,7 +14882,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { size_t offset; - memcpy(&offset, tensor->padding, sizeof(offset)); + + GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->opt[0])); + memcpy(&offset, tensor->opt[0]->data, sizeof(offset)); size_t nb1 = tensor->nb[1]; size_t nb2 = tensor->nb[2]; @@ -14861,10 +14911,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - int axis0 = tensor->padding[0] & 0x3; - int axis1 = tensor->padding[1] & 0x3; - int axis2 = tensor->padding[2] & 0x3; - int axis3 = tensor->padding[3] & 0x3; + int32_t * axes = (int32_t *) tensor->opt[0]->data; + int axis0 = axes[0] & 0x3; + int axis1 = axes[1] & 0x3; + int axis2 = axes[2] & 0x3; + int axis3 = axes[3] & 0x3; int axes_backward[4] = {0,0,0,0}; axes_backward[axis0] = 0; axes_backward[axis1] = 1; From e829421eda7adb5dbbae9f68c1d52dcf6e121cd5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 11 Jun 2023 11:49:01 +0300 Subject: [PATCH 77/86] minor : fix compile warnings + minor style changes --- 
.../train-text-from-scratch.cpp | 56 +++++++++---------- ggml.c | 18 +++--- llama.h | 6 +- 3 files changed, 39 insertions(+), 41 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index ff6167da8666b..f933c0164e54f 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -953,7 +953,7 @@ struct ggml_tensor * forward_batch_wo_cache( const int N = n_tokens; const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; + //const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; @@ -1181,7 +1181,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( const int N = n_tokens; const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; + //const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; @@ -1368,7 +1368,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( gf->work = NULL; const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; + //const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; @@ -1894,7 +1894,7 @@ void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) { void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) { for (int i1=0; i1ne[1]; ++i1) { - int num_newline = 0; + //int num_newline = 0; for (int i0=0; i0ne[0]; ++i0) { int token = get_i32_2d(tokens, i0, i1); print_token(ctx, token); @@ -1920,7 +1920,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons int n_tokens = tokens_input->ne[0]; int n_vocab = target_logits->ne[0]; - int sample = train_samples[example_id % n_train_samples]; + size_t sample = train_samples[example_id % n_train_samples]; GGML_ASSERT(sample+n_tokens-1 < n_train_data); ggml_set_f32(target_logits, -1.0f/n_vocab); @@ -1936,7 +1936,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons } } -void get_example_targets_batch(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { +void get_example_targets_batch(struct llama_context * /*lctx*/, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { GGML_ASSERT(tokens_input->n_dims == 2); GGML_ASSERT(target_logits->n_dims == 3); GGML_ASSERT(target_probs->n_dims == 3); @@ -1953,7 +1953,7 @@ void get_example_targets_batch(struct llama_context * lctx, const int * train_sa ggml_set_f32(target_probs, 0.0f); for (int k=0; k= end) { @@ -2264,7 +2264,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam } void set_logits_masked(struct ggml_tensor * logits, std::vector& mask, float value) { - GGML_ASSERT(logits->ne[0] == mask.size()); + GGML_ASSERT(logits->ne[0] == (int64_t) mask.size()); for (int i2 = 0; i2 < logits->ne[2]; ++i2) { for (int i1 = 0; i1 < logits->ne[1]; ++i1) { for (int i0 = 0; i0 < logits->ne[0]; ++i0) { @@ 
-2301,7 +2301,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { } void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { - uint32_t nd = file->read_u32(); + int32_t nd = file->read_u32(); GGML_ASSERT(nd == tensor->n_dims); uint32_t name_len = file->read_u32(); @@ -3003,7 +3003,7 @@ int main(int argc, char ** argv) { if (tokenize_file(lctx, params.fn_train_data, train_tokens) < 0) { fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, params.fn_train_data); } - printf("%s: number of training tokens: %d\n", __func__, train_tokens.size()); + printf("%s: number of training tokens: %d\n", __func__, (int) train_tokens.size()); struct my_llama_model model; model.hparams.n_vocab = llama_n_vocab(lctx); @@ -3020,7 +3020,7 @@ int main(int argc, char ** argv) { std::vector token_notavail; token_noccurs.resize(model.hparams.n_vocab, 0); token_notavail.resize(model.hparams.n_vocab, true); - for (int i=0; i token_freq; token_freq.resize(model.hparams.n_vocab, 0); int n_unique_tokens = 0; - for (int i=0; i 0) ? 1 : 0; } @@ -3104,26 +3104,26 @@ int main(int argc, char ** argv) { uint8_t * compute_buf_1 = new uint8_t[size_buf_1]; uint8_t * compute_buf_2 = new uint8_t[size_buf_2]; - GGML_ASSERT(train_tokens.size() > n_tokens);; + GGML_ASSERT(n_tokens < (int) train_tokens.size()); std::vector train_samples; train_samples.push_back(0); - for (int i=1; i= train_samples.size()) { + for (int ex = 0; ex < params.n_examples; ++ex) { + if (ex*n_batch >= (int) train_samples.size()) { shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); - for (int i=0; ine[3]; const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; + //const int64_t ne11 = src1->ne[1]; const int64_t ne12 = src1->ne[2]; const int64_t ne13 = src1->ne[3]; @@ -10587,11 +10587,10 @@ static void ggml_compute_forward_out_prod_f32( const int64_t i02 = i2; const int64_t i03 = i3; - const int64_t i10 = i1; + //const int64_t i10 = i1; const int64_t i12 = i2; const int64_t i13 = i3; - for (int64_t i01 = 0; i01 < ne01; ++i01) { const int64_t i11 = i01; @@ -13956,8 +13955,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( return; } - const float eps = 1e-9f; - + const double eps = 1e-9; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -14002,7 +14000,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( // sum = 1.0/sum; } // avoid log(0) by rescaling from [0..1] to [eps..1] - sum = (1.0f - eps) / sum; + sum = (1.0 - eps) / sum; ggml_vec_scale_f32(nc, st, sum); ggml_vec_add1_f32(nc, st, st, eps); ggml_vec_log_f32(nc, st, st); @@ -14054,8 +14052,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( const int64_t ith = params->ith; const int64_t nth = params->nth; - float * sums = (float *) params->wdata; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14090,6 +14086,8 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( #endif // step by step explanation: { + //float * sums = (float *) params->wdata; + // forward pass with annotated gradients from backward pass // (built by going in reverse operation order, adding to gradients of current operation args) // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum @@ -14162,10 +14160,10 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( float dot_st1_dst1 = 0; ggml_vec_scale_f32(nc, sm, sum); ggml_vec_cpy_f32 (nc, ds0, sm); - ggml_vec_scale_f32(nc, ds0, (1.0 - eps)); + ggml_vec_scale_f32(nc, ds0, (1.0f - 
eps)); ggml_vec_add1_f32 (nc, ds0, ds0, eps); ggml_vec_div_f32 (nc, ds0, s1, ds0); - ggml_vec_scale_f32(nc, ds0, -(1.0 - eps)*d[0]); + ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]); ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0); ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1); ggml_vec_mul_f32 (nc, ds0, ds0, sm); diff --git a/llama.h b/llama.h index 3947cf3e2aa17..4694c9c85554c 100644 --- a/llama.h +++ b/llama.h @@ -193,9 +193,9 @@ extern "C" { // Returns number of results. LLAMA_API int llama_get_vocab( const struct llama_context * ctx, - const char * * strings, - float * scores, - int capacity); + const char * * strings, + float * scores, + int capacity); // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row From 7aa10d0518764857df7174d6959873f242e09704 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 11 Jun 2023 16:50:41 +0200 Subject: [PATCH 78/86] fix bug in threaded indices calculation of ggml_compute_forward_flash_attn_back_f32 --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index b5eb9123cdc21..79f26ff2d295f 100644 --- a/ggml.c +++ b/ggml.c @@ -13545,7 +13545,7 @@ static void ggml_compute_forward_flash_attn_back_f32( for (int ir = ir0; ir < ir1; ++ir) { // q indices const int iq3 = ir/(neq2); - const int iq2 = (ir - iq3*neq2)/neq2; + const int iq2 = ir - iq3*neq2; for ( int iq1 = 0; iq1 < neq1; ++iq1) { From edf6fc252a7c6e6c02f37cc76dbc56fcd1ce656c Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 11 Jun 2023 17:07:44 +0200 Subject: [PATCH 79/86] store view offset like in master branch --- ggml.c | 88 +++++++++++++++++++++------------------------------------- 1 file changed, 32 insertions(+), 56 deletions(-) diff --git a/ggml.c b/ggml.c index 79f26ff2d295f..128c41447bcc9 100644 --- a/ggml.c +++ b/ggml.c @@ -5878,24 +5878,18 @@ struct ggml_tensor * ggml_view_1d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset); + ggml_scratch_save(ctx); + + struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + memcpy(offs->data, &offset, 2*sizeof(int32_t)); + + ggml_scratch_load(ctx); + result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; - - if (is_node) { - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - - GGML_ASSERT(sizeof(offset) <= ggml_nbytes(b)); - - memcpy(b->data, &offset, sizeof(offset)); - - ggml_scratch_load(ctx); - - result->opt[0] = b; - } + result->opt[0] = offs; return result; } @@ -5920,6 +5914,13 @@ struct ggml_tensor * ggml_view_2d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset); + ggml_scratch_save(ctx); + + struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + memcpy(offs->data, &offset, 2*sizeof(int32_t)); + + ggml_scratch_load(ctx); + result->nb[1] = nb1; result->nb[2] = result->nb[1]*ne1; result->nb[3] = result->nb[2]; @@ -5928,20 +5929,7 @@ struct ggml_tensor * ggml_view_2d( result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; - - if (is_node) { - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - - GGML_ASSERT(sizeof(offset) <= ggml_nbytes(b)); - - memcpy(b->data, &offset, sizeof(offset)); - - ggml_scratch_load(ctx); - - result->opt[0] = b; - } + result->opt[0] = offs; return result; } @@ -5968,6 +5956,13 @@ struct ggml_tensor * ggml_view_3d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); + ggml_scratch_save(ctx); + + struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + memcpy(offs->data, &offset, 2*sizeof(int32_t)); + + ggml_scratch_load(ctx); + result->nb[1] = nb1; result->nb[2] = nb2; result->nb[3] = result->nb[2]*ne2; @@ -5976,20 +5971,7 @@ struct ggml_tensor * ggml_view_3d( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; - - if (is_node) { - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - - GGML_ASSERT(sizeof(offset) <= ggml_nbytes(b)); - - memcpy(b->data, &offset, sizeof(offset)); - - ggml_scratch_load(ctx); - - result->opt[0] = b; - } + result->opt[0] = offs; return result; } @@ -6018,6 +6000,13 @@ struct ggml_tensor * ggml_view_4d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset); + ggml_scratch_save(ctx); + + struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + memcpy(offs->data, &offset, 2*sizeof(int32_t)); + + ggml_scratch_load(ctx); + result->nb[1] = nb1; result->nb[2] = nb2; result->nb[3] = nb3; @@ -6026,20 +6015,7 @@ struct ggml_tensor * ggml_view_4d( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; - - if (is_node) { - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - - GGML_ASSERT(sizeof(offset) <= ggml_nbytes(b)); - - memcpy(b->data, &offset, sizeof(offset)); - - ggml_scratch_load(ctx); - - result->opt[0] = b; - } + result->opt[0] = offs; return result; } From fdeb99784abb1f6ad399df53aa6a4fae1b977e9d Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 11 Jun 2023 19:58:36 +0200 Subject: [PATCH 80/86] bug fix in forward_batch_wo_cache_flash_attn_train --- .../train-text-from-scratch/train-text-from-scratch.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index f933c0164e54f..9bbeda125c576 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1708,7 +1708,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( use_buf(-1); // t36->grad gets set to one by optimizer, so we need to create the tensor. // initialize it with 1.0f to make sure. 
- t36->grad = ggml_new_f32(ctx0, 1.0f); + GGML_ASSERT(t36->grad != NULL); + // t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); use_buf(1); t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch); @@ -1766,7 +1767,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( use_buf(1); t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); if (grad_layer_inp) { - t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); } clr_buf(2); t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); @@ -1808,7 +1809,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( t03->grad = expand(gb, ggml_mul(ctx0, t04->grad, t02)); assert_shape_2d(t04->grad, n_embd, N*n_batch); use_buf(2); t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, t03)); assert_shape_2d(t02->grad, n_embd, N*n_batch); - back_layer_inp = t02->grad; + back_layer_inp = t02; use_buf(1); use_buf(-1); From efd7314d27ab4f7cfbc6854f657225e8d4634f1e Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 11 Jun 2023 23:10:41 +0200 Subject: [PATCH 81/86] scratch buffer bug fixes in forward_batch_wo_cache_flash_attn_train data of permute and reshape is the same as their input. if we want to preserve the output of permute/reshape, we also need to preserve their inputs. replace reshape(src0, src1) with reshape_nd calls so that we don't need src1. replace (temporary) t03 with ggml_repeat(ctx0, layer.attention_norm, t02). in the future we could also use the new broadcasting ggml_mul to avoid these repeat calls. for this we need backward pass of broadcasting ggml_mul. 
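A minimal sketch of the aliasing issue described in the message above, assuming a ggml context `ctx0` and the usual hyperparameters (`n_embd`, `n_head`, `N`, `n_batch`) are in scope; the tensor names are illustrative, not taken from the patch:

```cpp
// reshape and permute return views: the result's data pointer is the source's data
// pointer, so "preserving" a reshaped/permuted tensor only works if the memory of
// its input is preserved as well (i.e. not handed back to a reused scratch buffer).
struct ggml_tensor * t  = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N*n_batch);
struct ggml_tensor * t4 = ggml_reshape_4d(ctx0, t, n_embd/n_head, n_head, N, n_batch);
struct ggml_tensor * tp = ggml_permute   (ctx0, t4, 0, 2, 1, 3);
// t4->data == t->data and tp->data == t4->data here; overwriting t clobbers both views.
```

This is also why the patch switches from `ggml_reshape(ctx0, a, b)` to the `ggml_reshape_*d` variants: the shape-carrying second tensor `b` is no longer needed, so it does not have to be kept alive either.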
--- .../train-text-from-scratch.cpp | 115 ++++++++++-------- 1 file changed, 66 insertions(+), 49 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 9bbeda125c576..7e9607f5ecca3 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1405,21 +1405,26 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( last_buf = buf; }; - auto clr_buf = [&buf_offs] (int buf) { + bool track_max_mem = false; + size_t buf_maxs[3] = { 0, 0, 0 }; + + auto clr_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs, track_max_mem] (int buf) { if (buf < 0) return; - // size_t last_offs = 0; - // last_offs = ggml_set_scratch(ctx, { 0, 0, nullptr, }); - // if (last_buf >= 0) { - // buf_offs[last_buf] = last_offs; - // } - // buf_max_size[buf] = std::max(buf_max_size[buf], buf_offs[buf]); + if (track_max_mem) { + size_t last_offs = 0; + last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + if (last_buf >= 0) { + buf_offs[last_buf] = last_offs; + buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); + } + } buf_offs[buf] = 0; - // if (last_buf >= 0) { - // size_t offs = buf_offs[last_buf]; - // size_t size = buf_size[last_buf]; - // void * data = buf_data[last_buf]; - // ggml_set_scratch(ctx0, { offset, size, data, }); - // } + if (track_max_mem && last_buf >= 0) { + size_t offs = buf_offs[last_buf]; + size_t size = buf_size[last_buf]; + void * data = buf_data[last_buf]; + ggml_set_scratch(ctx0, { offs, size, data, }); + } }; auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { @@ -1471,6 +1476,13 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( use_buf(-1); + // need to create grads for model parameters, so that expand(..) correctly populates cgraph->leafs & cgraph->grads + // this wastes memory, because unnecessary grad for each op is automatically created: + // the automatically generated grad is unnecessary because we later manually set the grad (e.g. t35->grad = expand(gb, ...) ). + // this discards the automatically generated grad resulting in wasted memory. + // TODO: improve this, possibly by changing expand(..) to not use ggml_build_forward_expand. + // expand should correctly set cgraph->nodes. + // cgraph->leafs & cgraph->grads could be set in another pass after the last expand call. model->tok_embeddings->grad = ggml_dup_tensor(ctx0, model->tok_embeddings->grad); model->norm->grad = ggml_dup_tensor(ctx0, model->norm->grad); model->output->grad = ggml_dup_tensor(ctx0, model->output->grad); @@ -1491,11 +1503,13 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( clr_buf(1); clr_buf(2); - use_buf(0); + use_buf(-1); struct ggml_tensor * t00 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); assert_shape_1d(t00, N*n_batch); memcpy(t00->data, tokens_input->data, ggml_element_size(t00)*N*n_batch); + use_buf(0); + struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch); // need to remember these for the backward pass @@ -1536,35 +1550,35 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( struct my_llama_layer & layer = model->layers[il]; // tensors with values necessary for backward pass are in persistent buf(0) // other tensors with buf(1) are only temporary needed, and their memory reused after layer is completed. 
- use_buf(0); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t02, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(0); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t02, n_embd, N*n_batch); use_buf(1); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); // n_embd, N*n_batch - use_buf(1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); - use_buf(1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - use_buf(1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); - use_buf(1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); - use_buf(1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - use_buf(1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); - use_buf(1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); - use_buf(1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); - use_buf(0); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); // n_embd/n_head, N, n_head, n_batch - use_buf(0); struct ggml_tensor * t14 = expand(gf, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); // n_embd/n_head, N, n_head, n_batch - use_buf(0); struct ggml_tensor * t15 = expand(gf, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); // N, n_embd/n_head, n_head, n_batch - use_buf(1); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + use_buf(0); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); + use_buf(0); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); + use_buf(0); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); + use_buf(0); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + use_buf(0); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); + use_buf(0); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); + use_buf(0); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, 
n_batch); + use_buf(0); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); + use_buf(0); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); + use_buf(0); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); + use_buf(0); struct ggml_tensor * t14 = expand(gf, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); + use_buf(0); struct ggml_tensor * t15 = expand(gf, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); + use_buf(0); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); use_buf(1); struct ggml_tensor * t17 = expand(gf, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); - use_buf(1); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); - use_buf(0); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(0); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); + use_buf(0); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); use_buf(1); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); // n_embd, N*n_batch - use_buf(0); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21)); assert_shape_2d(t22, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(0); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); + use_buf(0); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21)); assert_shape_2d(t22, n_embd, N*n_batch); use_buf(1); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); // n_embd, N*n_batch - use_buf(0); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); // n_ff, N*n_batch - use_buf(0); struct ggml_tensor * t26 = expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); // n_ff, N*n_batch - use_buf(0); struct ggml_tensor * t27 = expand(gf, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); // n_ff, N*n_batch - use_buf(0); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); // n_ff, N*n_batch + use_buf(0); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); + use_buf(0); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); + use_buf(0); struct ggml_tensor * t26 = expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); + use_buf(0); struct ggml_tensor * t27 = expand(gf, 
ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); + use_buf(0); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); use_buf(1); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); // n_embd, N*n_batch + use_buf(0); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); t02L[il] = t02; t03L[il] = t03; t04L[il] = t04; @@ -1602,6 +1616,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t31, n_embd, N*n_batch); struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch); struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t34 = expand(gf, ggml_mul_mat (ctx0, model->output, t33)); assert_shape_2d(t34, n_vocab, N*n_batch); struct ggml_tensor * t35 = expand(gf, ggml_reshape_3d(ctx0, t34, n_vocab, N, n_batch)); assert_shape_3d(t35, n_vocab, N, n_batch); struct ggml_tensor * t36 = expand(gf, ggml_cross_entropy_loss(ctx0, t35, targets)); assert_shape_1d(t36, 1); @@ -1705,10 +1720,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( *gb = *gf; - use_buf(-1); // t36->grad gets set to one by optimizer, so we need to create the tensor. // initialize it with 1.0f to make sure. GGML_ASSERT(t36->grad != NULL); + // use_buf(-1); // t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); use_buf(1); @@ -1770,7 +1785,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); } clr_buf(2); - t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); + t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); t28->grad = expand(gb, ggml_out_prod(ctx0, layer.w2, ggml_transpose(ctx0, t29->grad))); assert_shape_2d(t28->grad, n_ff, N*n_batch); t27->grad = expand(gb, ggml_mul(ctx0, t28->grad, t25)); assert_shape_2d(t27->grad, n_ff, N*n_batch); t26->grad = expand(gb, ggml_silu_back(ctx0, t26, t27->grad)); assert_shape_2d(t26->grad, n_ff, N*n_batch); @@ -1786,7 +1801,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( use_buf(1); t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); t19->grad = expand(gb, ggml_out_prod(ctx0, layer.wo, ggml_transpose(ctx0, t20->grad))); assert_shape_2d(t19->grad, n_embd, N*n_batch); - t18->grad = expand(gb, ggml_reshape(ctx0, t19->grad, t18)); assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch); + t18->grad = expand(gb, ggml_reshape_4d(ctx0, t19->grad, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch); t17->grad = t18->grad; assert_shape_4d(t17->grad, n_embd/n_head, n_head, N, n_batch); t16->grad = expand(gb, ggml_permute(ctx0, t17->grad, 0, 2, 1, 3)); assert_shape_4d(t16->grad, n_embd/n_head, N, n_head, n_batch); struct ggml_tensor * flash_attn = expand(gb, ggml_flash_attn_back(ctx0, t13, t14, t15, t16->grad, true)); assert_shape_4d(flash_attn, n_embd/n_head, N*3, n_head, n_batch); @@ -1794,13 +1809,13 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( t14->grad = expand(gb, 
view__k(flash_attn)); assert_shape_4d(t14->grad, n_embd/n_head, N, n_head, n_batch); t13->grad = expand(gb, view__q(flash_attn)); assert_shape_4d(t13->grad, n_embd/n_head, N, n_head, n_batch); t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head); - t11->grad = expand(gb, ggml_reshape(ctx0, ggml_cont(ctx0, t12->grad), t11)); assert_shape_2d(t11->grad, N*n_batch, n_embd); + t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd); t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch); t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); - t08->grad = expand(gb, ggml_reshape(ctx0, t09->grad, t08)); assert_shape_2d(t08->grad, n_embd, N*n_batch); + t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch); t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch); t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch); - t05->grad = expand(gb, ggml_reshape(ctx0, t06->grad, t05)); assert_shape_2d(t05->grad, n_embd, N*n_batch); + t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch)); assert_shape_2d(t05->grad, n_embd, N*n_batch); t04->grad = expand(gb, ggml_add_inplace(ctx0, ggml_add_inplace(ctx0, ggml_out_prod(ctx0, layer.wv, t11->grad), @@ -1808,9 +1823,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( ggml_out_prod(ctx0, layer.wq, ggml_transpose(ctx0, t05->grad)))); assert_shape_2d(t04->grad, n_embd, N*n_batch); t03->grad = expand(gb, ggml_mul(ctx0, t04->grad, t02)); assert_shape_2d(t04->grad, n_embd, N*n_batch); use_buf(2); - t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, t03)); assert_shape_2d(t02->grad, n_embd, N*n_batch); + t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, ggml_repeat(ctx0, layer.attention_norm, t02))); assert_shape_2d(t02->grad, n_embd, N*n_batch); back_layer_inp = t02; - use_buf(1); use_buf(-1); layer.attention_norm->grad = expand(gb, add_or_set(layer.attention_norm->grad, ggml_repeat_back(ctx0, t03->grad, layer.attention_norm))); assert_shape_1d(layer.attention_norm->grad, n_embd); @@ -1822,18 +1836,21 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( layer.w1->grad = expand(gb, add_or_set(layer.w1->grad, ggml_out_prod(ctx0, t24, t26->grad))); assert_shape_2d(layer.w1->grad, n_embd, n_ff); layer.w2->grad = expand(gb, add_or_set(layer.w2->grad, ggml_out_prod(ctx0, t28, t29->grad))); assert_shape_2d(layer.w2->grad, n_ff, n_embd); layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); - use_buf(1); } clr_buf(1); use_buf(1); t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad))); assert_shape_2d(t01->grad, n_embd, N*n_batch); use_buf(-1); model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); - clr_buf(2); - clr_buf(1); *logits = t35; + if (track_max_mem) { + printf("%s: max size compute buf0: %zu\n", __func__, 
buf_maxs[0]); + printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]); + printf("%s: max size compute buf2: %zu\n", __func__, buf_maxs[2]); + } + return t36; } From 59544f0cdfe252ddef0edca0dbba53902bcbb75f Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 11 Jun 2023 23:23:06 +0200 Subject: [PATCH 82/86] remove unnecessary scratch buffer 0 buf 0 is persistent memory, so we can just disable scratch for this by using buf -1 --- .../train-text-from-scratch.cpp | 141 ++++++++---------- 1 file changed, 64 insertions(+), 77 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 7e9607f5ecca3..9244088dc84da 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1347,10 +1347,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( struct ggml_tensor * targets, void * compute_buf_0, void * compute_buf_1, - void * compute_buf_2, size_t size_buf_0, size_t size_buf_1, - size_t size_buf_2, const int n_tokens, const int n_batch) { @@ -1383,13 +1381,11 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( }; int last_buf = -1; - size_t buf_offs[3] = { 0, 0, 0 }; - size_t buf_size[3] = { size_buf_0, - size_buf_1, - size_buf_2 }; - void * buf_data[3] = { compute_buf_0, - compute_buf_1, - compute_buf_2 }; + size_t buf_offs[2] = { 0, 0 }; + size_t buf_size[2] = { size_buf_0, + size_buf_1 }; + void * buf_data[2] = { compute_buf_0, + compute_buf_1 }; auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data] (int buf) { size_t last_offs = 0; last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); @@ -1406,7 +1402,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( }; bool track_max_mem = false; - size_t buf_maxs[3] = { 0, 0, 0 }; + size_t buf_maxs[2] = { 0, 0 }; auto clr_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs, track_max_mem] (int buf) { if (buf < 0) return; @@ -1500,15 +1496,15 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( layer.w3->grad = ggml_dup_tensor(ctx0, layer.w3->grad); } + clr_buf(0); clr_buf(1); - clr_buf(2); use_buf(-1); struct ggml_tensor * t00 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); assert_shape_1d(t00, N*n_batch); memcpy(t00->data, tokens_input->data, ggml_element_size(t00)*N*n_batch); - use_buf(0); + use_buf(-1); struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch); @@ -1546,39 +1542,39 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( struct ggml_tensor * cur = t01; for (int il = 0; il < n_layer; ++il) { - clr_buf(1); + clr_buf(0); struct my_llama_layer & layer = model->layers[il]; - // tensors with values necessary for backward pass are in persistent buf(0) - // other tensors with buf(1) are only temporary needed, and their memory reused after layer is completed. 
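For readers unfamiliar with the scratch-buffer convention used in this function: `use_buf(-1)` switches allocations back to the context's own persistent memory, while `use_buf(0)`/`use_buf(1)` direct them into reusable scratch buffers. A hedged sketch of the underlying ggml mechanism, reusing the parameter names of this function:

```cpp
// ggml_set_scratch(ctx, {offs, size, data}) redirects subsequent tensor allocations
// into `data`; passing {0, 0, nullptr} disables scratch again. Its return value is
// the offset reached in the previously active scratch buffer, which is what the
// track_max_mem path uses to record high-water marks in buf_maxs[].
ggml_set_scratch(ctx0, { 0, size_buf_0, compute_buf_0 });                   // like use_buf(0)
struct ggml_tensor * tmp  = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1024);  // lives in buf 0
size_t used = ggml_set_scratch(ctx0, { 0, 0, nullptr });                    // like use_buf(-1); bytes used in buf 0
struct ggml_tensor * kept = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1024);  // persistent ctx memory
```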
- use_buf(0); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t02, n_embd, N*n_batch); - use_buf(1); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - use_buf(0); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); - use_buf(0); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - use_buf(0); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); - use_buf(0); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); - use_buf(0); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); - use_buf(0); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); - use_buf(0); struct ggml_tensor * t14 = expand(gf, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); - use_buf(0); struct ggml_tensor * t15 = expand(gf, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); - use_buf(0); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); - use_buf(1); struct ggml_tensor * t17 = expand(gf, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); - use_buf(0); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); - use_buf(0); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); - use_buf(1); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21)); assert_shape_2d(t22, n_embd, N*n_batch); - use_buf(1); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); - use_buf(0); struct ggml_tensor * t26 = expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); - use_buf(0); 
struct ggml_tensor * t27 = expand(gf, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); - use_buf(0); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); - use_buf(1); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); - use_buf(0); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); + // tensors with values necessary for backward pass are in persistent buf(-1) + // other tensors with buf(0) and buf(1) are only temporary needed, and their memory reused after layer is completed. + use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t02, n_embd, N*n_batch); + use_buf( 0); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); + use_buf(-1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); + use_buf(-1); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); + use_buf(-1); struct ggml_tensor * t14 = expand(gf, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); + use_buf(-1); struct ggml_tensor * t15 = expand(gf, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); + use_buf(-1); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + use_buf( 0); struct ggml_tensor * t17 = expand(gf, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); + use_buf( 0); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); 
assert_shape_2d(t21, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21)); assert_shape_2d(t22, n_embd, N*n_batch); + use_buf( 0); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); + use_buf(-1); struct ggml_tensor * t26 = expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); + use_buf(-1); struct ggml_tensor * t27 = expand(gf, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); + use_buf(-1); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); + use_buf( 0); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); t02L[il] = t02; t03L[il] = t03; t04L[il] = t04; @@ -1611,8 +1607,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( cur = t30; } - clr_buf(1); - use_buf(1); + clr_buf(0); + use_buf(0); struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t31, n_embd, N*n_batch); struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch); struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch); @@ -1720,13 +1716,13 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( *gb = *gf; - // t36->grad gets set to one by optimizer, so we need to create the tensor. - // initialize it with 1.0f to make sure. + // t36->grad gets set to one by optimizer, so we need the tensor. GGML_ASSERT(t36->grad != NULL); + // initialize it with 1.0f to make sure. 
// use_buf(-1); // t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); - use_buf(1); + use_buf(0); t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch); t34->grad = expand(gb, ggml_reshape_2d (ctx0, t35->grad, n_vocab, N*n_batch)); assert_shape_2d(t34->grad, n_vocab, N*n_batch); t33->grad = expand(gb, ggml_out_prod (ctx0, model->output, ggml_transpose(ctx0, t34->grad))); assert_shape_2d(t33->grad, n_embd, N*n_batch); @@ -1737,8 +1733,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( model->norm->grad = expand(gb, add_or_set(model->norm->grad, ggml_repeat_back(ctx0, t32->grad, model->norm))); assert_shape_1d(model->norm->grad, n_embd); model->output->grad = expand(gb, add_or_set(model->output->grad, ggml_out_prod(ctx0, t33, t34->grad))); assert_shape_2d(model->output->grad, n_embd, n_vocab); - clr_buf(2); - use_buf(2); + clr_buf(1); + use_buf(1); t31->grad = expand(gb, ggml_mul(ctx0, t33->grad, t32)); assert_shape_2d(t31->grad, n_embd, N*n_batch); struct ggml_tensor * back_layer_inp = t31; @@ -1778,13 +1774,13 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( struct ggml_tensor * t29 = t29L[il]; struct ggml_tensor * t30 = t30L[il]; - clr_buf(1); - use_buf(1); + clr_buf(0); + use_buf(0); t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); if (grad_layer_inp) { t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); } - clr_buf(2); + clr_buf(1); t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); t28->grad = expand(gb, ggml_out_prod(ctx0, layer.w2, ggml_transpose(ctx0, t29->grad))); assert_shape_2d(t28->grad, n_ff, N*n_batch); t27->grad = expand(gb, ggml_mul(ctx0, t28->grad, t25)); assert_shape_2d(t27->grad, n_ff, N*n_batch); @@ -1795,10 +1791,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( ggml_out_prod(ctx0, layer.w3, ggml_transpose(ctx0, t25->grad)))); assert_shape_2d(t24->grad, n_embd, N*n_batch); t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22)); assert_shape_2d(t23->grad, n_embd, N*n_batch); t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad))); assert_shape_2d(t22->grad, n_embd, N*n_batch); - use_buf(2); + use_buf(1); t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad))); assert_shape_2d(t21->grad, n_embd, N*n_batch); grad_layer_inp = t21; - use_buf(1); + use_buf(0); t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); t19->grad = expand(gb, ggml_out_prod(ctx0, layer.wo, ggml_transpose(ctx0, t20->grad))); assert_shape_2d(t19->grad, n_embd, N*n_batch); t18->grad = expand(gb, ggml_reshape_4d(ctx0, t19->grad, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch); @@ -1822,9 +1818,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( ggml_out_prod(ctx0, layer.wk, ggml_transpose(ctx0, t08->grad))), ggml_out_prod(ctx0, layer.wq, ggml_transpose(ctx0, t05->grad)))); assert_shape_2d(t04->grad, n_embd, N*n_batch); t03->grad = expand(gb, ggml_mul(ctx0, t04->grad, t02)); assert_shape_2d(t04->grad, n_embd, N*n_batch); - use_buf(2); + use_buf(1); t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, ggml_repeat(ctx0, layer.attention_norm, t02))); assert_shape_2d(t02->grad, n_embd, N*n_batch); back_layer_inp = t02; + // use_buf(0); use_buf(-1); 
layer.attention_norm->grad = expand(gb, add_or_set(layer.attention_norm->grad, ggml_repeat_back(ctx0, t03->grad, layer.attention_norm))); assert_shape_1d(layer.attention_norm->grad, n_embd); @@ -1836,19 +1833,21 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( layer.w1->grad = expand(gb, add_or_set(layer.w1->grad, ggml_out_prod(ctx0, t24, t26->grad))); assert_shape_2d(layer.w1->grad, n_embd, n_ff); layer.w2->grad = expand(gb, add_or_set(layer.w2->grad, ggml_out_prod(ctx0, t28, t29->grad))); assert_shape_2d(layer.w2->grad, n_ff, n_embd); layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); + // use_buf(0); } - clr_buf(1); - use_buf(1); + clr_buf(0); + use_buf(0); t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad))); assert_shape_2d(t01->grad, n_embd, N*n_batch); use_buf(-1); model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); + // clr_buf(1); + // clr_buf(0); *logits = t35; if (track_max_mem) { printf("%s: max size compute buf0: %zu\n", __func__, buf_maxs[0]); printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]); - printf("%s: max size compute buf2: %zu\n", __func__, buf_maxs[2]); } return t36; @@ -2649,7 +2648,6 @@ struct train_params { int mem_compute_gb; int mem_compute0_gb; int mem_compute1_gb; - int mem_compute2_gb; }; struct train_params get_default_train_params() { @@ -2694,10 +2692,9 @@ struct train_params get_default_train_params() { params.adam_decay = 1e-3; params.mem_model_gb = 2; - params.mem_compute_gb = 8; - params.mem_compute0_gb = 24; - params.mem_compute1_gb = 8; - params.mem_compute2_gb = 8; + params.mem_compute_gb = 24; + params.mem_compute0_gb = 8; + params.mem_compute1_gb = 2; return params; } @@ -2744,7 +2741,6 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); fprintf(stderr, " --mem-compute0 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute0_gb); fprintf(stderr, " --mem-compute1 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute1_gb); - fprintf(stderr, " --mem-compute2 N Memory to allocate for compute in gigabytes. 
(default %d)\n", params->mem_compute2_gb); fprintf(stderr, "\n"); } @@ -2954,12 +2950,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->mem_compute1_gb = std::stoi(argv[i]); - } else if (arg == "--mem-compute2") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_compute2_gb = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { train_print_usage(argc, argv, &default_params); exit(0); @@ -3117,10 +3107,8 @@ int main(int argc, char ** argv) { size_t size_buf_0 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb); size_t size_buf_1 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute1_gb); - size_t size_buf_2 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute2_gb); uint8_t * compute_buf_0 = new uint8_t[size_buf_0]; uint8_t * compute_buf_1 = new uint8_t[size_buf_1]; - uint8_t * compute_buf_2 = new uint8_t[size_buf_2]; GGML_ASSERT(n_tokens < (int) train_tokens.size()); std::vector train_samples; @@ -3182,8 +3170,8 @@ int main(int argc, char ** argv) { &model, ctx0, gf, gb, &logits, tokens_input, target_probs, - compute_buf_0, compute_buf_1, compute_buf_2, - size_buf_0, size_buf_1, size_buf_2, + compute_buf_0, compute_buf_1, + size_buf_0, size_buf_1, n_tokens, n_batch); } else if (params.use_flash) { logits = forward_batch_wo_cache_flash_attn(&model, ctx0, gf, tokens_input, n_tokens, n_batch); @@ -3335,7 +3323,6 @@ int main(int argc, char ** argv) { delete[] compute_addr; delete[] compute_buf_0; delete[] compute_buf_1; - delete[] compute_buf_2; ggml_free(model.ctx); return 0; From 7be3222b64ab7ada5ea297fa3841d7af8fdbb911 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 12 Jun 2023 00:01:18 +0200 Subject: [PATCH 83/86] avoid creating unnecessary grad tensors previously we need to create grads for model parameters, so that expand(..) correctly populates cgraph->leafs & cgraph->grads this wasted memory, because unnecessary grad for each op were automatically created: the automatically generated grad was unnecessary because we later manually set the grad (e.g. t35->grad = expand(gb, ...) ). this discarded the automatically generated grad resulting in wasted memory. improved this by changing expand(..) to not use ggml_build_forward_expand. expand set cgraph->nodes but not the leafs. cgraph->leafs & cgraph->grads are set in another pass after the last expand call. --- .../train-text-from-scratch.cpp | 124 ++++++++++++++---- 1 file changed, 96 insertions(+), 28 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 9244088dc84da..63f976f0db3dc 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1337,6 +1337,82 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( return inpL; } +// expand the graph nodes without creating leafs. 
+struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) { + // check if already visited + for (int i = 0; i < g->n_nodes; i++) { + if (g->nodes[i] == t) { + return t; + } + } + + for (int i = 0; i < g->n_leafs; i++) { + if (g->leafs[i] == t) { + return t; + } + } + + if (t->src0) { + expand(g, t->src0); + } + + if (t->src1) { + expand(g, t->src1); + } + + for (int i = 0; i < GGML_MAX_OPT; ++i) { + if (t->opt[i]) { + expand(g, t->opt[i]); + } + } + + GGML_ASSERT(g->n_nodes < GGML_MAX_NODES); + + if (strlen(t->name) == 0) { + snprintf(t->name, sizeof(t->name), "node_%d", g->n_nodes); + } + + g->nodes[g->n_nodes] = t; + g->grads[g->n_nodes] = t->grad; + g->n_nodes++; + return t; +} + +void graph_set_leafs_grads(struct ggml_cgraph * g) { + // moves leaf nodes to g->leafs. + // i.e. g->n_nodes might change. + int n_nodes = 0; + for (int i = 0; i < g->n_nodes; ++i) { + struct ggml_tensor * node = g->nodes[i]; + const bool is_leaf = node->op == GGML_OP_NONE && node->grad == NULL; + if (is_leaf) { + GGML_ASSERT(g->n_leafs < GGML_MAX_NODES); + + if (strlen(node->name) == 0) { + snprintf(node->name, sizeof(node->name), "leaf_%d", g->n_leafs); + } + + g->leafs[g->n_leafs] = node; + g->n_leafs++; + } else { + GGML_ASSERT(n_nodes < GGML_MAX_NODES); + + if (strlen(node->name) == 0) { + snprintf(node->name, sizeof(node->name), "node_%d", n_nodes); + } + + g->nodes[n_nodes] = node; + g->grads[n_nodes] = node->grad; + n_nodes++; + } + } + for (int i=n_nodes; i < g->n_nodes; ++i) { + g->nodes[n_nodes] = NULL; + g->grads[n_nodes] = NULL; + } + g->n_nodes = n_nodes; +} + struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( struct my_llama_model * model, struct ggml_context * ctx0, @@ -1375,11 +1451,6 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( const int n_ff = get_n_ff(&hparams); const int rope_mode = 0; - auto expand = [] (struct ggml_cgraph * g, struct ggml_tensor * t) -> struct ggml_tensor * { - ggml_build_forward_expand(g, t); - return t; - }; - int last_buf = -1; size_t buf_offs[2] = { 0, 0 }; size_t buf_size[2] = { size_buf_0, @@ -1423,6 +1494,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( } }; + auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { int64_t ne0 = n_embd/n_head; int64_t ne1 = N; @@ -1472,28 +1544,21 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( use_buf(-1); - // need to create grads for model parameters, so that expand(..) correctly populates cgraph->leafs & cgraph->grads - // this wastes memory, because unnecessary grad for each op is automatically created: - // the automatically generated grad is unnecessary because we later manually set the grad (e.g. t35->grad = expand(gb, ...) ). - // this discards the automatically generated grad resulting in wasted memory. - // TODO: improve this, possibly by changing expand(..) to not use ggml_build_forward_expand. - // expand should correctly set cgraph->nodes. - // cgraph->leafs & cgraph->grads could be set in another pass after the last expand call. 
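A condensed sketch of the build order that the new `expand()` / `graph_set_leafs_grads()` pair enables; the calls are the ones used later in this function, with the manual backward pass elided:

```cpp
// forward graph: expand() records nodes only, without touching cgraph->leafs/->grads
struct ggml_tensor * t36 = expand(gf, ggml_cross_entropy_loss(ctx0, t35, targets));
*gb = *gf;                                          // backward graph starts as a copy
t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f));   // seed d(loss)/d(loss) = 1
// ... manually assign t35->grad, t34->grad, ... and the parameter grads ...
graph_set_leafs_grads(gf);   // final pass: move op-less, grad-less tensors to ->leafs
graph_set_leafs_grads(gb);   // and compact ->nodes / ->grads accordingly
```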
- model->tok_embeddings->grad = ggml_dup_tensor(ctx0, model->tok_embeddings->grad); - model->norm->grad = ggml_dup_tensor(ctx0, model->norm->grad); - model->output->grad = ggml_dup_tensor(ctx0, model->output->grad); + model->tok_embeddings->grad = NULL; + model->norm->grad = NULL; + model->output->grad = NULL; for (int il = 0; il < n_layer; ++il) { struct my_llama_layer & layer = model->layers[il]; - layer.attention_norm->grad = ggml_dup_tensor(ctx0, layer.attention_norm->grad); - layer.wq->grad = ggml_dup_tensor(ctx0, layer.wq->grad); - layer.wk->grad = ggml_dup_tensor(ctx0, layer.wk->grad); - layer.wv->grad = ggml_dup_tensor(ctx0, layer.wv->grad); - layer.wo->grad = ggml_dup_tensor(ctx0, layer.wo->grad); - layer.ffn_norm->grad = ggml_dup_tensor(ctx0, layer.ffn_norm->grad); - layer.w1->grad = ggml_dup_tensor(ctx0, layer.w1->grad); - layer.w2->grad = ggml_dup_tensor(ctx0, layer.w2->grad); - layer.w3->grad = ggml_dup_tensor(ctx0, layer.w3->grad); + layer.attention_norm->grad = NULL; + layer.wq->grad = NULL; + layer.wk->grad = NULL; + layer.wv->grad = NULL; + layer.wo->grad = NULL; + layer.ffn_norm->grad = NULL; + layer.w1->grad = NULL; + layer.w2->grad = NULL; + layer.w3->grad = NULL; } clr_buf(0); @@ -1717,10 +1782,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( *gb = *gf; // t36->grad gets set to one by optimizer, so we need the tensor. - GGML_ASSERT(t36->grad != NULL); // initialize it with 1.0f to make sure. - // use_buf(-1); - // t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); + use_buf(-1); + t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); use_buf(0); t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch); @@ -1839,7 +1903,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( use_buf(0); t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad))); assert_shape_2d(t01->grad, n_embd, N*n_batch); use_buf(-1); - model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); + model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); // clr_buf(1); // clr_buf(0); @@ -1850,6 +1914,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]); } + // now that all grads are created, set the graph leafs and grads + graph_set_leafs_grads(gf); + graph_set_leafs_grads(gb); + return t36; } From 32dc22728471a645f0ae8020441e17025e3f33b0 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 12 Jun 2023 20:42:44 +0200 Subject: [PATCH 84/86] print used training seed --- .../train-text-from-scratch/train-text-from-scratch.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 63f976f0db3dc..d4d293e232c22 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3043,12 +3043,11 @@ int main(int argc, char ** argv) { return 1; } - if (params.seed < 0) { - srand(time(NULL)); - } else { - srand(params.seed); + params.seed = time(NULL); } + printf("%s: seed: %d\n", __func__, params.seed); + srand(params.seed); struct 
llama_context_params llama_params = llama_context_default_params(); llama_params.vocab_only = true; From cb469f7efb911eabe3e53ece965e1be58ad9ea51 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 12 Jun 2023 20:43:48 +0200 Subject: [PATCH 85/86] zero initialize gfbuf and gbbuf --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index d4d293e232c22..51271b497ffe5 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3218,6 +3218,9 @@ int main(int argc, char ** argv) { struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); + memset(gfbuf->data, 0, ggml_nbytes(gfbuf)); + memset(gbbuf->data, 0, ggml_nbytes(gbbuf)); + struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; From d4b6438708148c36f605a482192887fac5242244 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Jun 2023 21:38:00 +0300 Subject: [PATCH 86/86] ci : re-enable workflows + add README for training --- .github/workflows/editorconfig.yml | 17 ++++++++++++++++ .github/workflows/tidy-review.yml | 23 ++++++++++++++++++++++ examples/train-text-from-scratch/README.md | 22 +++++++++++++++++++++ llama.cpp | 2 ++ 4 files changed, 64 insertions(+) create mode 100644 .github/workflows/editorconfig.yml create mode 100644 .github/workflows/tidy-review.yml create mode 100644 examples/train-text-from-scratch/README.md diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml new file mode 100644 index 0000000000000..b4e535acf1f64 --- /dev/null +++ b/.github/workflows/editorconfig.yml @@ -0,0 +1,17 @@ +name: EditorConfig Checker + +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + editorconfig: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: editorconfig-checker/action-editorconfig-checker@main + - run: editorconfig-checker diff --git a/.github/workflows/tidy-review.yml b/.github/workflows/tidy-review.yml new file mode 100644 index 0000000000000..a4bc8d976560e --- /dev/null +++ b/.github/workflows/tidy-review.yml @@ -0,0 +1,23 @@ +name: clang-tidy-review + +on: + pull_request: + branches: + - master + +jobs: + clang-tidy-review: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - uses: ZedThree/clang-tidy-review@v0.13.0 + id: review + with: + lgtm_comment_body: '' + build_dir: build + cmake_command: cmake . 
-B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on + split_workflow: true + + - uses: ZedThree/clang-tidy-review/upload@v0.13.0 diff --git a/examples/train-text-from-scratch/README.md b/examples/train-text-from-scratch/README.md new file mode 100644 index 0000000000000..5344d1f522a57 --- /dev/null +++ b/examples/train-text-from-scratch/README.md @@ -0,0 +1,22 @@ +# train-text-from-scratch + +Basic usage instructions: + +```bash +# get training data +wget https://github.com/brunoklein99/deep-learning-notes/blob/master/shakespeare.txt + +# train +./bin/train-text-from-scratch \ + --vocab-model ../models/ggml-vocab.bin \ + --ctx 64 --embd 256 --head 8 --layer 16 \ + --checkpoint-in chk-shakespeare-256x16.bin \ + --checkpoint-out chk-shakespeare-256x16.bin \ + --model-out ggml-shakespeare-256x16-f32.bin \ + --train-data "shakespeare.txt" \ + -t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \ + --print-details-interval 0 --predict 16 --use-flash + +# predict +./bin/main -m ggml-shakespeare-256x16-f32.bin +``` diff --git a/llama.cpp b/llama.cpp index dd9725ea3c27c..0dc45bd6dfe3f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1206,6 +1206,7 @@ static void llama_model_load_internal( mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); (void) vram_scratch; + (void) n_batch; #ifdef GGML_USE_CUBLAS vram_scratch = n_batch * MB; ggml_cuda_set_scratch_size(vram_scratch); @@ -1233,6 +1234,7 @@ static void llama_model_load_internal( model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); } + (void) tensor_split; #if defined(GGML_USE_CUBLAS) { ggml_cuda_set_tensor_split(tensor_split);
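A brief note on the two `(void)` casts added to llama.cpp in the last patch: they silence unused-variable/-parameter warnings in builds where the CUDA code path is compiled out. A minimal illustration of the pattern (hypothetical function, not part of the patch):

```cpp
static void set_gpu_options(int n_batch, const float * tensor_split) {
    (void) n_batch;      // only referenced when GGML_USE_CUBLAS is defined
    (void) tensor_split; // likewise; the casts keep -Wunused warnings quiet otherwise
#ifdef GGML_USE_CUBLAS
    ggml_cuda_set_scratch_size((size_t) n_batch * 1024 * 1024);
    ggml_cuda_set_tensor_split(tensor_split);
#endif
}
```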