@@ -718,13 +718,16 @@ int llama_main(
     gpt_vocab vocab,
     llama_model model,
     int64_t t_load_us,
-    int64_t t_main_start_us) {
+    int64_t t_main_start_us,
+    FILE *instream,
+    FILE *outstream,
+    FILE *errstream) {
 
     if (params.seed < 0) {
         params.seed = time(NULL);
     }
 
-    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+    fprintf(errstream, "%s: seed = %d\n", __func__, params.seed);
 
     std::mt19937 rng(params.seed);
     if (params.prompt.empty()) {
@@ -751,13 +754,13 @@ int llama_main(
     // tokenize the reverse prompt
     std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
 
-    fprintf(stderr, "\n");
-    fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-    fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+    fprintf(errstream, "\n");
+    fprintf(errstream, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+    fprintf(errstream, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
     for (int i = 0; i < (int) embd_inp.size(); i++) {
-        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
+        fprintf(errstream, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
     }
-    fprintf(stderr, "\n");
+    fprintf(errstream, "\n");
     if (params.interactive) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
         struct sigaction sigint_action;
@@ -769,19 +772,19 @@ int llama_main(
         signal(SIGINT, sigint_handler);
 #endif
 
-        fprintf(stderr, "%s: interactive mode on.\n", __func__);
+        fprintf(errstream, "%s: interactive mode on.\n", __func__);
 
         if (antiprompt_inp.size()) {
-            fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
-            fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
+            fprintf(errstream, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
+            fprintf(errstream, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
             for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
+                fprintf(errstream, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
             }
-            fprintf(stderr, "\n");
+            fprintf(errstream, "\n");
         }
     }
-    fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
-    fprintf(stderr, "\n\n");
+    fprintf(errstream, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
+    fprintf(errstream, "\n\n");
 
     std::vector<gpt_vocab::id> embd;
 
@@ -795,7 +798,7 @@ int llama_main(
 
 
     if (params.interactive) {
-        fprintf(stderr, "== Running in interactive mode. ==\n"
+        fprintf(errstream, "== Running in interactive mode. ==\n"
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
                " - Press Ctrl+C to interject at any time.\n"
 #endif
@@ -814,7 +817,7 @@ int llama_main(
 
     // set the color for the prompt which will be output initially
     if (params.use_color) {
-        printf(ANSI_COLOR_YELLOW);
+        fprintf(outstream, ANSI_COLOR_YELLOW);
     }
 
     while (remaining_tokens > 0) {
@@ -823,7 +826,7 @@ int llama_main(
             const int64_t t_start_us = ggml_time_us();
 
             if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                fprintf(stderr, "Failed to predict\n");
+                fprintf(errstream, "Failed to predict\n");
                 return 1;
             }
 
@@ -877,16 +880,16 @@ int llama_main(
 
             // reset color to default if we there is no pending user input
             if (!input_noecho && params.use_color && embd_inp.size() == input_consumed) {
-                printf(ANSI_COLOR_RESET);
+                fprintf(outstream, ANSI_COLOR_RESET);
             }
         }
 
         // display text
         if (!input_noecho) {
            for (auto id : embd) {
-                printf("%s", vocab.id_to_token[id].c_str());
+                fprintf(outstream, "%s", vocab.id_to_token[id].c_str());
             }
-            fflush(stdout);
+            fflush(outstream);
         }
 
         // in interactive mode, and not currently processing queued inputs;
@@ -901,16 +904,16 @@ int llama_main(
                 // currently being interactive
                 bool another_line=true;
                 while (another_line) {
-                    fflush(stdout);
+                    fflush(outstream);
                     char buf[256] = {0};
                     int n_read;
-                    if (params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
-                    if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) {
+                    if (params.use_color) fprintf(outstream, ANSI_BOLD ANSI_COLOR_GREEN);
+                    if (fscanf(instream, "%255[^\n]%n%*c", buf, &n_read) <= 0) {
                         // presumable empty line, consume the newline
-                        std::ignore = scanf("%*c");
+                        std::ignore = fscanf(instream, "%*c");
                         n_read=0;
                     }
-                    if (params.use_color) printf(ANSI_COLOR_RESET);
+                    if (params.use_color) fprintf(outstream, ANSI_COLOR_RESET);
 
                     if (n_read > 0 && buf[n_read-1]=='\\') {
                         another_line = true;
@@ -936,7 +939,7 @@ int llama_main(
 
         // end of text token
         if (embd.back() == 2) {
-            fprintf(stderr, " [end of text]\n");
+            fprintf(errstream, " [end of text]\n");
             break;
         }
     }
@@ -949,18 +952,18 @@ int llama_main(
     {
         const int64_t t_main_end_us = ggml_time_us();
 
-        fprintf(stderr, "\n\n");
-        fprintf(stderr, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        fprintf(stderr, "%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        fprintf(stderr, "%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        fprintf(stderr, "%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        fprintf(stderr, "%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+        fprintf(errstream, "\n\n");
+        fprintf(errstream, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
+        fprintf(errstream, "%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+        fprintf(errstream, "%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+        fprintf(errstream, "%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+        fprintf(errstream, "%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
     }
 
     ggml_free(model.ctx);
 
     if (params.use_color) {
-        printf(ANSI_COLOR_RESET);
+        fprintf(outstream, ANSI_COLOR_RESET);
     }
 
     return 0;
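
Note (not part of the diff): after this change, all of llama_main's console I/O flows through the three new FILE* parameters rather than the global stdin/stdout/stderr, so a caller that wants the previous behavior just passes the standard streams. The sketch below is illustrative only; it assumes the project's header declares llama_main, gpt_params, gpt_vocab and llama_model, and that a gpt_params argument precedes the parameters visible in the first hunk (the diff does not show the full signature).

// Hypothetical caller sketch, not taken from this PR.
#include <cstdio>
#include <cstdint>
#include "llama.h"   // assumed project header declaring llama_main and its types

int run_with_standard_streams(gpt_params params, gpt_vocab vocab, llama_model model,
                              int64_t t_load_us, int64_t t_main_start_us) {
    // Passing the standard C streams reproduces the pre-change behavior:
    //   instream  = stdin   -> interactive user input (read with fscanf)
    //   outstream = stdout  -> generated text and ANSI color codes
    //   errstream = stderr  -> diagnostics, prompt echo, timing statistics
    return llama_main(params, vocab, model, t_load_us, t_main_start_us,
                      stdin, stdout, stderr);
}

A wrapper like this also makes it easy to redirect generation into a file or pipe (pass a FILE* from fopen/popen as outstream) without touching the logging that goes to errstream.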