From 1659d775156773b457d1e3ecdac2438fac7ae614 Mon Sep 17 00:00:00 2001 From: SuperUserNameMan Date: Tue, 13 Jun 2023 19:08:37 +0200 Subject: [PATCH 1/6] Create `simple.cpp` --- examples/simple/simple.cpp | 177 +++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 examples/simple/simple.cpp diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp new file mode 100644 index 0000000000000..6593fdbc8b796 --- /dev/null +++ b/examples/simple/simple.cpp @@ -0,0 +1,177 @@ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include "common.h" +#include "llama.h" +#include "build-info.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include +#include +#endif + + + +int main(int argc, char ** argv) +{ + gpt_params params; + + //--------------------------------- + // Print help : + //--------------------------------- + + if ( argc == 1 || argv[1][0] == '-' ) + { + printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] ); + return 1 ; + } + + //--------------------------------- + // Load parameters : + //--------------------------------- + + if ( argc >= 2 ) + { + params.model = argv[1]; + } + + if ( argc >= 3 ) + { + params.prompt = argv[2]; + } + + if ( params.prompt.empty() ) + { + params.prompt = "Hello my name is"; + } + + //--------------------------------- + // Init LLM : + //--------------------------------- + + llama_init_backend(); + + llama_context * ctx ; + + ctx = llama_init_from_gpt_params( params ); + + if ( ctx == NULL ) + { + fprintf( stderr , "%s: error: unable to load model\n" , __func__ ); + return 1; + } + + //--------------------------------- + // Tokenize the prompt + //--------------------------------- + + std::vector tokens_list; + tokens_list = ::llama_tokenize( ctx , params.prompt , true ); + + const int max_context_size = llama_n_ctx( ctx ); + const int max_tokens_list_size = max_context_size - 4 ; + + if ( (int)tokens_list.size() > max_tokens_list_size ) + { + fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" , + __func__ , (int)tokens_list.size() , max_tokens_list_size ); + return 1; + } + + fprintf( stderr, "\n\n" ); + + // Print the tokens from the prompt : + + for( auto id : tokens_list ) + { + printf( "%s" , llama_token_to_str( ctx , id ) ); + } + fflush(stdout); + + + //--------------------------------- + // Main prediction loop : + //--------------------------------- + + // The LLM keeps a contextual cache memory of previous token evaluation. + // Usually, once this cache is full, it is required to recompute a compressed context based on previous + // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist + // example, we will just going to stop the loop. 
+ + while ( llama_get_kv_cache_token_count( ctx ) < max_context_size ) + { + + //--------------------------------- + // Evaluate the tokens : + //--------------------------------- + + if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) ) + { + fprintf( stderr, "%s : failed to eval\n" , __func__ ); + return 1; + } + + tokens_list.clear(); + + //--------------------------------- + // Select the best prediction : + //--------------------------------- + + llama_token new_token_id = 0; + + auto logits = llama_get_logits( ctx ); + auto n_vocab = llama_n_vocab( ctx ); + + std::vector candidates; + candidates.reserve( n_vocab ); + + for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ ) + { + candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } ); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // Select it using the "Greedy sampling" method : + new_token_id = llama_sample_token_greedy(ctx, &candidates_p); + + + // is it an end of stream ? + if ( new_token_id == llama_token_eos() ) + { + fprintf(stderr, " [end of text]\n"); + break; + } + + // Print the new token : + printf( "%s" , llama_token_to_str( ctx , new_token_id ) ); + fflush(stdout); + + // Push this new token for next evaluation : + tokens_list.push_back( new_token_id ); + + } // wend of main loop + + llama_free(ctx); + + return 0; +} + +// EOF From ba636acb1fc2e9c152f8fe1efb736faa362e3dff Mon Sep 17 00:00:00 2001 From: SuperUserNameMan Date: Tue, 13 Jun 2023 19:09:44 +0200 Subject: [PATCH 2/6] minimalist example `CMakeLists.txt` --- examples/simple/CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 examples/simple/CMakeLists.txt diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt new file mode 100644 index 0000000000000..1568f7364184a --- /dev/null +++ b/examples/simple/CMakeLists.txt @@ -0,0 +1,7 @@ +set(TARGET simple) +add_executable(${TARGET} simple.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() From bbe9c59618295f9f33df12138ca760352d6f527e Mon Sep 17 00:00:00 2001 From: SuperUserNameMan Date: Tue, 13 Jun 2023 19:12:45 +0200 Subject: [PATCH 3/6] Update Makefile for minimalist example --- Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9a08d610b2207..55d5c2e52d4d7 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot simple ifdef LLAMA_BUILD_SERVER BUILD_TARGETS += server @@ -270,6 +270,12 @@ main: examples/main/main.cpp build-info.h ggml. @echo @echo '==== Run ./main -h for help. ====' @echo + +simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + @echo + @echo '==== Run ./simple -h for help. 
====' + @echo quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) From c369d11905f88291d5fa04d81bb98d90af59f13d Mon Sep 17 00:00:00 2001 From: SuperUserNameMan Date: Tue, 13 Jun 2023 19:36:27 +0200 Subject: [PATCH 4/6] remove 273: Trailing whitespace --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 55d5c2e52d4d7..b6d6aeb0a8e59 100644 --- a/Makefile +++ b/Makefile @@ -270,8 +270,8 @@ main: examples/main/main.cpp build-info.h ggml. @echo @echo '==== Run ./main -h for help. ====' @echo - -simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS) + +simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @echo @echo '==== Run ./simple -h for help. ====' From 7a4f712a29a2e65648b3eed20582ee73173a3181 Mon Sep 17 00:00:00 2001 From: SuperUserNameMan Date: Wed, 14 Jun 2023 08:58:18 +0200 Subject: [PATCH 5/6] removed trailing white spaces simple.cpp --- examples/simple/simple.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 6593fdbc8b796..45ed4b8b9f68c 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -54,9 +54,9 @@ int main(int argc, char ** argv) if ( argc >= 3 ) { - params.prompt = argv[2]; + params.prompt = argv[2]; } - + if ( params.prompt.empty() ) { params.prompt = "Hello my name is"; @@ -71,8 +71,8 @@ int main(int argc, char ** argv) llama_context * ctx ; ctx = llama_init_from_gpt_params( params ); - - if ( ctx == NULL ) + + if ( ctx == NULL ) { fprintf( stderr , "%s: error: unable to load model\n" , __func__ ); return 1; @@ -84,13 +84,13 @@ int main(int argc, char ** argv) std::vector tokens_list; tokens_list = ::llama_tokenize( ctx , params.prompt , true ); - + const int max_context_size = llama_n_ctx( ctx ); const int max_tokens_list_size = max_context_size - 4 ; - if ( (int)tokens_list.size() > max_tokens_list_size ) + if ( (int)tokens_list.size() > max_tokens_list_size ) { - fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" , + fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" , __func__ , (int)tokens_list.size() , max_tokens_list_size ); return 1; } @@ -99,7 +99,7 @@ int main(int argc, char ** argv) // Print the tokens from the prompt : - for( auto id : tokens_list ) + for( auto id : tokens_list ) { printf( "%s" , llama_token_to_str( ctx , id ) ); } @@ -115,19 +115,19 @@ int main(int argc, char ** argv) // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist // example, we will just going to stop the loop. 
- while ( llama_get_kv_cache_token_count( ctx ) < max_context_size ) + while ( llama_get_kv_cache_token_count( ctx ) < max_context_size ) { //--------------------------------- // Evaluate the tokens : //--------------------------------- - if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) ) + if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) ) { fprintf( stderr, "%s : failed to eval\n" , __func__ ); return 1; } - + tokens_list.clear(); //--------------------------------- @@ -135,9 +135,9 @@ int main(int argc, char ** argv) //--------------------------------- llama_token new_token_id = 0; - + auto logits = llama_get_logits( ctx ); - auto n_vocab = llama_n_vocab( ctx ); + auto n_vocab = llama_n_vocab( ctx ); std::vector candidates; candidates.reserve( n_vocab ); From 8f65eecf20383f716dcc36f558c6924a5f27e21b Mon Sep 17 00:00:00 2001 From: SuperUserNameMan Date: Wed, 14 Jun 2023 09:33:31 +0200 Subject: [PATCH 6/6] typo and comments simple.cpp --- examples/simple/simple.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 45ed4b8b9f68c..76f991cdc028f 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -79,7 +79,7 @@ int main(int argc, char ** argv) } //--------------------------------- - // Tokenize the prompt + // Tokenize the prompt : //--------------------------------- std::vector tokens_list; @@ -103,6 +103,7 @@ int main(int argc, char ** argv) { printf( "%s" , llama_token_to_str( ctx , id ) ); } + fflush(stdout); @@ -113,11 +114,10 @@ int main(int argc, char ** argv) // The LLM keeps a contextual cache memory of previous token evaluation. // Usually, once this cache is full, it is required to recompute a compressed context based on previous // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist - // example, we will just going to stop the loop. + // example, we will just stop the loop once this cache is full or once an end of stream is detected. while ( llama_get_kv_cache_token_count( ctx ) < max_context_size ) { - //--------------------------------- // Evaluate the tokens : //--------------------------------- @@ -137,7 +137,7 @@ int main(int argc, char ** argv) llama_token new_token_id = 0; auto logits = llama_get_logits( ctx ); - auto n_vocab = llama_n_vocab( ctx ); + auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens) std::vector candidates; candidates.reserve( n_vocab ); @@ -150,7 +150,7 @@ int main(int argc, char ** argv) llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; // Select it using the "Greedy sampling" method : - new_token_id = llama_sample_token_greedy(ctx, &candidates_p); + new_token_id = llama_sample_token_greedy( ctx , &candidates_p ); // is it an end of stream ? @@ -162,14 +162,14 @@ int main(int argc, char ** argv) // Print the new token : printf( "%s" , llama_token_to_str( ctx , new_token_id ) ); - fflush(stdout); + fflush( stdout ); // Push this new token for next evaluation : tokens_list.push_back( new_token_id ); } // wend of main loop - llama_free(ctx); + llama_free( ctx ); return 0; }
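Two details of the generation loop added in examples/simple/simple.cpp are easy to miss. The first is how tokens are fed to llama_eval: the first pass submits the whole tokenized prompt, and every later pass submits only the single token sampled in the previous iteration, with the evaluation offset taken from llama_get_kv_cache_token_count(). The toy program below is only a sketch of that feeding pattern; mock_eval is a stand-in, not the llama.cpp API:

    #include <cstdio>
    #include <vector>

    // stand-in for llama_eval : just reports what would be submitted
    static void mock_eval( const std::vector<int> & tokens , int n_past )
    {
        printf( "eval %zu token(s) starting at position %d\n" , tokens.size() , n_past );
    }

    int main()
    {
        std::vector<int> pending = { 10 , 11 , 12 }; // pretend this is the tokenized prompt
        int n_cached = 0;                            // what llama_get_kv_cache_token_count() would report

        for ( int step = 0 ; step < 4 ; step++ )
        {
            mock_eval( pending , n_cached );         // first pass : whole prompt ; later passes : one token
            n_cached += (int) pending.size();

            pending.clear();
            pending.push_back( 100 + step );         // pretend this is the newly sampled token
        }

        return 0;
    }

This is why tokens_list.clear() is immediately followed by a single push_back( new_token_id ) at the end of each iteration.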
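The second detail is the "greedy sampling" step: llama_sample_token_greedy picks the candidate with the highest logit, i.e. the arg-max over the vocabulary. A minimal standalone equivalent over a dummy logits array (again just a sketch, independent of the llama.cpp sampling API) would be:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main()
    {
        // dummy logits for a pretend 5-token vocabulary
        std::vector<float> logits = { -1.2f , 0.3f , 2.7f , 0.3f , -0.5f };

        // greedy selection : the chosen token id is the index of the largest logit
        const int new_token_id = (int) ( std::max_element( logits.begin() , logits.end() ) - logits.begin() );

        printf( "greedy pick : token %d (logit %.2f)\n" , new_token_id , logits[ new_token_id ] );

        return 0;
    }

To try the example once these patches are applied, make simple builds the new target, and running ./simple without arguments prints its usage ( ./simple MODEL_PATH [PROMPT] ).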