From 1659d775156773b457d1e3ecdac2438fac7ae614 Mon Sep 17 00:00:00 2001 From: SuperUserNameMan Date: Tue, 13 Jun 2023 19:08:37 +0200 Subject: [PATCH 1/6] Create `simple.cpp` --- examples/simple/simple.cpp | 177 +++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 examples/simple/simple.cpp diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp new file mode 100644 index 0000000000000..6593fdbc8b796 --- /dev/null +++ b/examples/simple/simple.cpp @@ -0,0 +1,177 @@ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include "common.h" +#include "llama.h" +#include "build-info.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include +#include +#endif + + + +int main(int argc, char ** argv) +{ + gpt_params params; + + //--------------------------------- + // Print help : + //--------------------------------- + + if ( argc == 1 || argv[1][0] == '-' ) + { + printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] ); + return 1 ; + } + + //--------------------------------- + // Load parameters : + //--------------------------------- + + if ( argc >= 2 ) + { + params.model = argv[1]; + } + + if ( argc >= 3 ) + { + params.prompt = argv[2]; + } + + if ( params.prompt.empty() ) + { + params.prompt = "Hello my name is"; + } + + //--------------------------------- + // Init LLM : + //--------------------------------- + + llama_init_backend(); + + llama_context * ctx ; + + ctx = llama_init_from_gpt_params( params ); + + if ( ctx == NULL ) + { + fprintf( stderr , "%s: error: unable to load model\n" , __func__ ); + return 1; + } + + //--------------------------------- + // Tokenize the prompt + //--------------------------------- + + std::vector tokens_list; + tokens_list = ::llama_tokenize( ctx , params.prompt , true ); + + const int max_context_size = llama_n_ctx( ctx ); + const int max_tokens_list_size = max_context_size - 4 ; + + if ( (int)tokens_list.size() > max_tokens_list_size ) + { + fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" , + __func__ , (int)tokens_list.size() , max_tokens_list_size ); + return 1; + } + + fprintf( stderr, "\n\n" ); + + // Print the tokens from the prompt : + + for( auto id : tokens_list ) + { + printf( "%s" , llama_token_to_str( ctx , id ) ); + } + fflush(stdout); + + + //--------------------------------- + // Main prediction loop : + //--------------------------------- + + // The LLM keeps a contextual cache memory of previous token evaluation. + // Usually, once this cache is full, it is required to recompute a compressed context based on previous + // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist + // example, we will just going to stop the loop. 
+ + while ( llama_get_kv_cache_token_count( ctx ) < max_context_size ) + { + + //--------------------------------- + // Evaluate the tokens : + //--------------------------------- + + if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) ) + { + fprintf( stderr, "%s : failed to eval\n" , __func__ ); + return 1; + } + + tokens_list.clear(); + + //--------------------------------- + // Select the best prediction : + //--------------------------------- + + llama_token new_token_id = 0; + + auto logits = llama_get_logits( ctx ); + auto n_vocab = llama_n_vocab( ctx ); + + std::vector candidates; + candidates.reserve( n_vocab ); + + for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ ) + { + candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } ); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // Select it using the "Greedy sampling" method : + new_token_id = llama_sample_token_greedy(ctx, &candidates_p); + + + // is it an end of stream ? + if ( new_token_id == llama_token_eos() ) + { + fprintf(stderr, " [end of text]\n"); + break; + } + + // Print the new token : + printf( "%s" , llama_token_to_str( ctx , new_token_id ) ); + fflush(stdout); + + // Push this new token for next evaluation : + tokens_list.push_back( new_token_id ); + + } // wend of main loop + + llama_free(ctx); + + return 0; +} + +// EOF From ba636acb1fc2e9c152f8fe1efb736faa362e3dff Mon Sep 17 00:00:00 2001 From: SuperUserNameMan Date: Tue, 13 Jun 2023 19:09:44 +0200 Subject: [PATCH 2/6] minimalist example `CMakeLists.txt` --- examples/simple/CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 examples/simple/CMakeLists.txt diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt new file mode 100644 index 0000000000000..1568f7364184a --- /dev/null +++ b/examples/simple/CMakeLists.txt @@ -0,0 +1,7 @@ +set(TARGET simple) +add_executable(${TARGET} simple.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() From bbe9c59618295f9f33df12138ca760352d6f527e Mon Sep 17 00:00:00 2001 From: SuperUserNameMan Date: Tue, 13 Jun 2023 19:12:45 +0200 Subject: [PATCH 3/6] Update Makefile for minimalist example --- Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9a08d610b2207..55d5c2e52d4d7 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot simple ifdef LLAMA_BUILD_SERVER BUILD_TARGETS += server @@ -270,6 +270,12 @@ main: examples/main/main.cpp build-info.h ggml. @echo @echo '==== Run ./main -h for help. ====' @echo + +simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + @echo + @echo '==== Run ./simple -h for help. 
====' + @echo quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) From c369d11905f88291d5fa04d81bb98d90af59f13d Mon Sep 17 00:00:00 2001 From: SuperUserNameMan Date: Tue, 13 Jun 2023 19:36:27 +0200 Subject: [PATCH 4/6] remove 273: Trailing whitespace --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 55d5c2e52d4d7..b6d6aeb0a8e59 100644 --- a/Makefile +++ b/Makefile @@ -270,8 +270,8 @@ main: examples/main/main.cpp build-info.h ggml. @echo @echo '==== Run ./main -h for help. ====' @echo - -simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS) + +simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @echo @echo '==== Run ./simple -h for help. ====' From 7a4f712a29a2e65648b3eed20582ee73173a3181 Mon Sep 17 00:00:00 2001 From: SuperUserNameMan Date: Wed, 14 Jun 2023 08:58:18 +0200 Subject: [PATCH 5/6] removed trailing white spaces simple.cpp --- examples/simple/simple.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 6593fdbc8b796..45ed4b8b9f68c 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -54,9 +54,9 @@ int main(int argc, char ** argv) if ( argc >= 3 ) { - params.prompt = argv[2]; + params.prompt = argv[2]; } - + if ( params.prompt.empty() ) { params.prompt = "Hello my name is"; @@ -71,8 +71,8 @@ int main(int argc, char ** argv) llama_context * ctx ; ctx = llama_init_from_gpt_params( params ); - - if ( ctx == NULL ) + + if ( ctx == NULL ) { fprintf( stderr , "%s: error: unable to load model\n" , __func__ ); return 1; @@ -84,13 +84,13 @@ int main(int argc, char ** argv) std::vector tokens_list; tokens_list = ::llama_tokenize( ctx , params.prompt , true ); - + const int max_context_size = llama_n_ctx( ctx ); const int max_tokens_list_size = max_context_size - 4 ; - if ( (int)tokens_list.size() > max_tokens_list_size ) + if ( (int)tokens_list.size() > max_tokens_list_size ) { - fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" , + fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" , __func__ , (int)tokens_list.size() , max_tokens_list_size ); return 1; } @@ -99,7 +99,7 @@ int main(int argc, char ** argv) // Print the tokens from the prompt : - for( auto id : tokens_list ) + for( auto id : tokens_list ) { printf( "%s" , llama_token_to_str( ctx , id ) ); } @@ -115,19 +115,19 @@ int main(int argc, char ** argv) // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist // example, we will just going to stop the loop. 
- while ( llama_get_kv_cache_token_count( ctx ) < max_context_size ) + while ( llama_get_kv_cache_token_count( ctx ) < max_context_size ) { //--------------------------------- // Evaluate the tokens : //--------------------------------- - if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) ) + if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) ) { fprintf( stderr, "%s : failed to eval\n" , __func__ ); return 1; } - + tokens_list.clear(); //--------------------------------- @@ -135,9 +135,9 @@ int main(int argc, char ** argv) //--------------------------------- llama_token new_token_id = 0; - + auto logits = llama_get_logits( ctx ); - auto n_vocab = llama_n_vocab( ctx ); + auto n_vocab = llama_n_vocab( ctx ); std::vector candidates; candidates.reserve( n_vocab ); From 8f65eecf20383f716dcc36f558c6924a5f27e21b Mon Sep 17 00:00:00 2001 From: SuperUserNameMan Date: Wed, 14 Jun 2023 09:33:31 +0200 Subject: [PATCH 6/6] typo and comments simple.cpp --- examples/simple/simple.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 45ed4b8b9f68c..76f991cdc028f 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -79,7 +79,7 @@ int main(int argc, char ** argv) } //--------------------------------- - // Tokenize the prompt + // Tokenize the prompt : //--------------------------------- std::vector tokens_list; @@ -103,6 +103,7 @@ int main(int argc, char ** argv) { printf( "%s" , llama_token_to_str( ctx , id ) ); } + fflush(stdout); @@ -113,11 +114,10 @@ int main(int argc, char ** argv) // The LLM keeps a contextual cache memory of previous token evaluation. // Usually, once this cache is full, it is required to recompute a compressed context based on previous // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist - // example, we will just going to stop the loop. + // example, we will just stop the loop once this cache is full or once an end of stream is detected. while ( llama_get_kv_cache_token_count( ctx ) < max_context_size ) { - //--------------------------------- // Evaluate the tokens : //--------------------------------- @@ -137,7 +137,7 @@ int main(int argc, char ** argv) llama_token new_token_id = 0; auto logits = llama_get_logits( ctx ); - auto n_vocab = llama_n_vocab( ctx ); + auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens) std::vector candidates; candidates.reserve( n_vocab ); @@ -150,7 +150,7 @@ int main(int argc, char ** argv) llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; // Select it using the "Greedy sampling" method : - new_token_id = llama_sample_token_greedy(ctx, &candidates_p); + new_token_id = llama_sample_token_greedy( ctx , &candidates_p ); // is it an end of stream ? @@ -162,14 +162,14 @@ int main(int argc, char ** argv) // Print the new token : printf( "%s" , llama_token_to_str( ctx , new_token_id ) ); - fflush(stdout); + fflush( stdout ); // Push this new token for next evaluation : tokens_list.push_back( new_token_id ); } // wend of main loop - llama_free(ctx); + llama_free( ctx ); return 0; }
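Two details of the generation loop added in examples/simple/simple.cpp are easy to miss. The first is how tokens are fed to llama_eval: the first pass submits the whole tokenized prompt, and every later pass submits only the single token sampled in the previous iteration, with the evaluation offset taken from llama_get_kv_cache_token_count(). The toy program below is only a sketch of that feeding pattern; mock_eval is a stand-in, not the llama.cpp API:

    #include <cstdio>
    #include <vector>

    // stand-in for llama_eval : just reports what would be submitted
    static void mock_eval( const std::vector<int> & tokens , int n_past )
    {
        printf( "eval %zu token(s) starting at position %d\n" , tokens.size() , n_past );
    }

    int main()
    {
        std::vector<int> pending = { 10 , 11 , 12 }; // pretend this is the tokenized prompt
        int n_cached = 0;                            // what llama_get_kv_cache_token_count() would report

        for ( int step = 0 ; step < 4 ; step++ )
        {
            mock_eval( pending , n_cached );         // first pass : whole prompt ; later passes : one token
            n_cached += (int) pending.size();

            pending.clear();
            pending.push_back( 100 + step );         // pretend this is the newly sampled token
        }

        return 0;
    }

This is why tokens_list.clear() is immediately followed by a single push_back( new_token_id ) at the end of each iteration.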
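The second detail is the "greedy sampling" step: llama_sample_token_greedy picks the candidate with the highest logit, i.e. the arg-max over the vocabulary. A minimal standalone equivalent over a dummy logits array (again just a sketch, independent of the llama.cpp sampling API) would be:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main()
    {
        // dummy logits for a pretend 5-token vocabulary
        std::vector<float> logits = { -1.2f , 0.3f , 2.7f , 0.3f , -0.5f };

        // greedy selection : the chosen token id is the index of the largest logit
        const int new_token_id = (int) ( std::max_element( logits.begin() , logits.end() ) - logits.begin() );

        printf( "greedy pick : token %d (logit %.2f)\n" , new_token_id , logits[ new_token_id ] );

        return 0;
    }

To try the example once these patches are applied, make simple builds the new target, and running ./simple without arguments prints its usage ( ./simple MODEL_PATH [PROMPT] ).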