diff --git a/chat.cpp b/chat.cpp
index 38b39771ad982..20759315dc6d9 100644
--- a/chat.cpp
+++ b/chat.cpp
@@ -318,7 +318,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
     fin.close();
 
     std::vector tmp;
-
+
     for (int i = 0; i < n_parts; ++i) {
         const int part_id = i;
         //const int part_id = n_parts - i - 1;
@@ -797,14 +797,6 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    params.temp = 0.1f;
-    params.top_p = 0.95f;
-    params.n_ctx = 2048;
-    params.interactive = true;
-    params.interactive_start = true;
-    params.use_color = true;
-    params.model = "ggml-alpaca-7b-q4.bin";
-
     if (gpt_params_parse(argc, argv, params) == false) {
         return 1;
     }
@@ -856,13 +848,26 @@ int main(int argc, char ** argv) {
     // Add a space in front of the first character to match OG llama tokenizer behavior
     // params.prompt.insert(0, 1, ' ');
 
     // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp;// = ::llama_tokenize(vocab, params.prompt, true);
+    std::vector<gpt_vocab::id> embd_inp;
 
     // params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
 
     // // tokenize the reverse prompt
     // std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
 
+
+    std::vector<gpt_vocab::id> instruct_inp = ::llama_tokenize(vocab, " Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n", true);
+    std::vector<gpt_vocab::id> prompt_inp = ::llama_tokenize(vocab, "### Instruction:\n\n", true);
+    std::vector<gpt_vocab::id> response_inp = ::llama_tokenize(vocab, "### Response:\n\n", false);
+    embd_inp.insert(embd_inp.end(), instruct_inp.begin(), instruct_inp.end());
+
+    if(!params.prompt.empty()) {
+        std::vector<gpt_vocab::id> param_inp = ::llama_tokenize(vocab, params.prompt, true);
+        embd_inp.insert(embd_inp.end(), prompt_inp.begin(), prompt_inp.end());
+        embd_inp.insert(embd_inp.end(), param_inp.begin(), param_inp.end());
+        embd_inp.insert(embd_inp.end(), response_inp.begin(), response_inp.end());
+    }
+
     // fprintf(stderr, "\n");
     // fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
     // fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
@@ -871,13 +876,6 @@ int main(int argc, char ** argv) {
     // }
     // fprintf(stderr, "\n");
 
-    std::vector<gpt_vocab::id> instruct_inp = ::llama_tokenize(vocab, " Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n", true);
-    std::vector<gpt_vocab::id> prompt_inp = ::llama_tokenize(vocab, "### Instruction:\n\n", true);
-    std::vector<gpt_vocab::id> response_inp = ::llama_tokenize(vocab, "### Response:\n\n", false);
-
-    embd_inp.insert(embd_inp.end(), instruct_inp.begin(), instruct_inp.end());
-
-
     if (params.interactive) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
         struct sigaction sigint_action;
@@ -1076,9 +1074,14 @@ int main(int argc, char ** argv) {
 
             // end of text token
             if (embd.back() == 2) {
-                // fprintf(stderr, " [end of text]\n");
-                is_interacting = true;
-                continue;
+                if (params.interactive) {
+                    is_interacting = true;
+                    continue;
+                } else {
+                    printf("\n");
+                    fprintf(stderr, " [end of text]\n");
+                    break;
+                }
             }
         }
 
diff --git a/utils.cpp b/utils.cpp
index d739b5d489239..420fc26374307 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -24,9 +24,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "-t" || arg == "--threads") {
             params.n_threads = std::stoi(argv[++i]);
         } else if (arg == "-p" || arg == "--prompt") {
+            params.interactive = false;
+            params.interactive_start = false;
+            params.use_color = false;
+
             params.prompt = argv[++i];
         } else if (arg == "-f" || arg == "--file") {
+            params.interactive = false;
+            params.interactive_start = false;
+            params.use_color = false;
+
             std::ifstream file(argv[++i]);
             std::copy(std::istreambuf_iterator<char>(file),
                       std::istreambuf_iterator<char>(),
                       back_inserter(params.prompt));
diff --git a/utils.h b/utils.h
index 021120b0513c7..2a843371a35e0 100644
--- a/utils.h
+++ b/utils.h
@@ -12,28 +12,29 @@
 // CLI argument parsing
 //
 
+// The default parameters
 struct gpt_params {
     int32_t seed      = -1; // RNG seed
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
     int32_t n_predict = 128; // new tokens to predict
     int32_t repeat_last_n = 64;  // last n tokens to penalize
-    int32_t n_ctx = 512; //context size
+    int32_t n_ctx = 2048; //context size
 
     // sampling parameters
     int32_t top_k = 40;
     float   top_p = 0.95f;
-    float   temp  = 0.80f;
+    float   temp  = 0.10f;
     float   repeat_penalty  = 1.30f;
 
     int32_t n_batch = 8; // batch size for prompt processing
 
-    std::string model = "models/lamma-7B/ggml-model.bin"; // model path
+    std::string model = "ggml-alpaca-7b-q4.bin"; // model path
    std::string prompt;
 
-    bool use_color = false; // use color to distinguish generations and inputs
+    bool use_color = true; // use color to distinguish generations and inputs
 
-    bool interactive = false; // interactive mode
-    bool interactive_start = false; // reverse prompt immediately
+    bool interactive = true; // interactive mode
+    bool interactive_start = true; // reverse prompt immediately
     std::string antiprompt = ""; // string upon seeing which more user input is prompted
 };
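
Note (not part of the patch): the new code in the chat.cpp hunk starting at line 848 always prepends the tokenized Alpaca preamble and, only when a prompt was supplied via -p/--prompt or -f/--file, wraps it between the "### Instruction:" and "### Response:" markers. The standalone sketch below mirrors that concatenation with plain std::string instead of token vectors, so the resulting template can be inspected without loading a model or vocabulary; the file name and argument handling here are illustrative, not part of the diff.

// prompt_template_sketch.cpp -- illustrative only, not part of the patch
#include <cstdio>
#include <string>

int main(int argc, char ** argv) {
    // The three fixed pieces tokenized in chat.cpp (same string literals as the patch).
    const std::string instruct =
        " Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n";
    const std::string instruction_marker = "### Instruction:\n\n";
    const std::string response_marker    = "### Response:\n\n";

    // Stand-in for params.prompt (set by -p/--prompt); empty means interactive use.
    const std::string user_prompt = argc > 1 ? argv[1] : "";

    // Mirror the embd_inp.insert() calls: preamble always, the wrapped prompt
    // only when one was given on the command line.
    std::string full = instruct;
    if (!user_prompt.empty()) {
        full += instruction_marker + user_prompt + response_marker;
    }

    printf("%s", full.c_str());
    return 0;
}

With the defaults moved into gpt_params (utils.h), running the chat binary with no arguments keeps the previous interactive behavior, while passing -p or -f turns interactive, interactive_start and use_color off in gpt_params_parse (utils.cpp), so the run ends at the end-of-text token instead of waiting for more input.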