Skip to content

Commit 41aee4d

Browse files
authored
speculative : ensure draft and target model vocab matches (#3812)
* speculative: Ensure draft and target model vocab matches
* Tolerate small differences when checking dft vs tgt vocab
1 parent 6d459cb commit 41aee4d

File tree

1 file changed

+32
-1
lines changed

1 file changed

+32
-1
lines changed

examples/speculative/speculative.cpp

+32-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
#include <string>
99
#include <vector>
1010

11+
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
12+
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
13+
1114
struct seq_draft {
1215
bool active = false;
1316
bool drafting = false;
@@ -64,6 +67,33 @@ int main(int argc, char ** argv) {
6467
params.n_gpu_layers = params.n_gpu_layers_draft;
6568
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
6669

70+
{
71+
const int n_vocab_tgt = llama_n_vocab(model_tgt);
72+
const int n_vocab_dft = llama_n_vocab(model_dft);
73+
const int vocab_diff = n_vocab_tgt > n_vocab_dft
74+
? n_vocab_tgt - n_vocab_dft
75+
: n_vocab_dft - n_vocab_tgt;
76+
77+
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
78+
fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
79+
fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
80+
n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
81+
return 1;
82+
}
83+
84+
for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
85+
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
86+
const char * token_text_dft = llama_token_get_text(model_dft, i);
87+
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
88+
fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
89+
fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
90+
llama_token_to_piece(ctx_tgt, i).c_str(),
91+
llama_token_to_piece(ctx_dft, i).c_str());
92+
return 1;
93+
}
94+
}
95+
}
96+
6797
// tokenize the prompt
6898
std::vector<llama_token> inp;
6999
inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
@@ -227,6 +257,7 @@ int main(int argc, char ** argv) {
227257
llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true);
228258

229259
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
260+
// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
230261
llama_decode (ctx_dft, batch_dft);
231262

232263
++n_past_dft;
@@ -370,7 +401,7 @@ int main(int argc, char ** argv) {
370401
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
371402
}
372403

373-
//LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt));
404+
// LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
374405
llama_decode(ctx_tgt, batch_tgt);
375406
++n_past_tgt;
376407
}

0 commit comments

Comments
 (0)