Skip to content

Commit b3b4b9f

Browse files
jan-wassenberg authored and copybara-github committed
With new matmul, much larger batch sizes are advantageous, default to 256.
Can still override via command line argument. PiperOrigin-RevId: 730502653
1 parent 9a2360d commit b3b4b9f

File tree

2 files changed

+2
-2
lines changed

2 files changed

+2
-2
lines changed

gemma/gemma.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ struct RuntimeConfig {
100100

101101
// These defaults are overridden by InferenceArgs::CopyTo(*this):
102102
// Max tokens per batch during prefill.
103-
size_t prefill_tbatch_size = 32;
103+
size_t prefill_tbatch_size = 256;
104104
// Max queries per batch (one token from each) during decode.
105105
size_t decode_qbatch_size = 16;
106106

util/app.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ struct InferenceArgs : public ArgsBase<InferenceArgs> {
273273
visitor(max_generated_tokens, "max_generated_tokens", size_t{2048},
274274
"Maximum number of tokens to generate.");
275275

276-
visitor(prefill_tbatch_size, "prefill_tbatch", size_t{64},
276+
visitor(prefill_tbatch_size, "prefill_tbatch", size_t{256},
277277
"Prefill: max tokens per batch.");
278278
visitor(decode_qbatch_size, "decode_qbatch", size_t{16},
279279
"Decode: max queries per batch.");

0 commit comments

Comments (0)