
Commit 6bb4439

Merge commit '5c86c9ed3ef1cc7307fdce05f0f0e2e45253cf90' into concedo_experimental

# Conflicts:
#	tools/imatrix/imatrix.cpp
#	tools/mtmd/README.md
#	tools/run/README.md
#	tools/run/run.cpp

2 parents 702e8a6 + 5c86c9e

File tree

7 files changed: +47 −21 lines changed


common/arg.cpp

Lines changed: 7 additions & 0 deletions
@@ -2628,6 +2628,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.i_chunk = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--parse-special"},
+        string_format("parse special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
+        [](common_params & params) {
+            params.parse_special = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"-pps"},
         string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -405,6 +405,7 @@ struct common_params {

     bool process_output = false; // collect data for the output tensor
     bool compute_ppl = true; // whether to compute perplexity
+    bool parse_special = false; // whether to parse special tokens during imatrix tokenization

     // cvector-generator params
     int n_pca_batch = 100;
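Together, the two hunks above wire a new --parse-special flag into the imatrix tool; the consuming side lives in tools/imatrix/imatrix.cpp, which is listed as a merge conflict above and not shown here. A minimal sketch of how the flag is plausibly consumed, assuming the common_tokenize(ctx, text, add_special, parse_special) helper from common:

// Hedged sketch: with parse_special enabled, special-token markup in the
// calibration text (e.g. "<|im_start|>") is mapped to the corresponding
// special-token ids instead of being tokenized as literal text.
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt,
        /*add_special=*/true, /*parse_special=*/params.parse_special);

This matters when the calibration data is chat-formatted: without the flag, template markers are split into ordinary text pieces and skew the collected activation statistics.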

ggml/src/ggml-cuda/getrows.cu

Lines changed: 14 additions & 12 deletions
@@ -10,10 +10,11 @@ static __global__ void k_get_rows(
         /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
         const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {

-    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
-    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
-    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+    // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
+    const int i00 = (blockIdx.y * blockDim.x + threadIdx.x)*2;
+    const int i10 = blockIdx.x;
+    const int i11 = blockIdx.z / ne12;
+    const int i12 = blockIdx.z % ne12;

     if (i00 >= ne00) {
         return;
@@ -46,10 +47,11 @@ static __global__ void k_get_rows_float(
         /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
         const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {

-    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
-    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
-    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+    // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
+    const int i00 = blockIdx.y * blockDim.x + threadIdx.x;
+    const int i10 = blockIdx.x;
+    const int i11 = blockIdx.z / ne12;
+    const int i12 = blockIdx.z % ne12;

     if (i00 >= ne00) {
         return;
@@ -94,8 +96,8 @@ static void get_rows_cuda_q(
         const size_t nb1, const size_t nb2, const size_t nb3,
         cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
-    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+    const int block_num_y = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const dim3 block_nums(ne10, block_num_y, ne11*ne12);

     // strides in elements
     // const size_t s0 = nb0 / sizeof(dst_t);
@@ -127,8 +129,8 @@ static void get_rows_cuda_float(
         const size_t nb1, const size_t nb2, const size_t nb3,
         cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
-    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+    const int block_num_y = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
+    const dim3 block_nums(ne10, block_num_y, ne11*ne12);

     // strides in elements
     // const size_t s0 = nb0 / sizeof(dst_t);
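The swap exploits an asymmetry in CUDA's launch limits: grid.x may hold up to 2^31-1 blocks, while grid.y and grid.z are capped at 65535. ne10 (the number of rows to gather, which scales with batch size) therefore moves into x, and the column blocks move into y. A standalone sketch (not from the repo) that queries the limits in question:

// Prints the per-dimension grid-size limits that motivate the swap.
// On current GPUs this reports roughly x=2147483647, y=65535, z=65535,
// so a get_rows over more than 65535 rows only fits in grid.x.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("max grid: x=%d y=%d z=%d\n",
           prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    return 0;
}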

src/llama-chat.cpp

Lines changed: 8 additions & 6 deletions
@@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "mistral-v3",        LLM_CHAT_TEMPLATE_MISTRAL_V3 },
     { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
     { "mistral-v7",        LLM_CHAT_TEMPLATE_MISTRAL_V7 },
+    { "mistral-v7-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN },
     { "phi3",              LLM_CHAT_TEMPLATE_PHI_3 },
     { "phi4",              LLM_CHAT_TEMPLATE_PHI_4 },
     { "falcon3",           LLM_CHAT_TEMPLATE_FALCON_3 },
@@ -202,19 +203,20 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
         // Official mistral 'v7' template
         // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+        //      https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
+        const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? " " : "";
         for (auto message : chat) {
             std::string role(message->role);
             std::string content(message->content);
             if (role == "system") {
-                ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
+                ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
             } else if (role == "user") {
-                ss << "[INST] " << content << "[/INST]";
-            }
-            else {
-                ss << " " << content << "</s>";
+                ss << "[INST]" << trailing_space << content << "[/INST]";
+            } else {
+                ss << trailing_space << content << "</s>";
             }
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
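Concretely, the only difference between the v7 and v7-tekken renderings is the space after each tag. A self-contained sketch (illustrative names, not the llama.cpp API) reproducing the branch above for a system/user/assistant exchange:

#include <cstdio>
#include <string>

// Mirrors the trailing_space logic from the diff: v7 keeps a space after
// each opening tag and before the assistant reply; v7-tekken drops it.
static std::string render_v7(bool tekken) {
    const std::string sp = tekken ? "" : " ";
    std::string ss;
    ss += "[SYSTEM_PROMPT]" + sp + "You are helpful.[/SYSTEM_PROMPT]";
    ss += "[INST]" + sp + "Hi[/INST]";
    ss += sp + "Hello!</s>";
    return ss;
}

int main() {
    printf("mistral-v7:        %s\n", render_v7(false).c_str());
    printf("mistral-v7-tekken: %s\n", render_v7(true).c_str());
    return 0;
}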

src/llama-chat.h

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_MISTRAL_V3,
     LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
     LLM_CHAT_TEMPLATE_MISTRAL_V7,
+    LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
     LLM_CHAT_TEMPLATE_PHI_3,
     LLM_CHAT_TEMPLATE_PHI_4,
     LLM_CHAT_TEMPLATE_FALCON_3,

src/llama-model.cpp

Lines changed: 8 additions & 0 deletions
@@ -13490,6 +13490,14 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
         : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
     const auto & it = model->gguf_kv.find(key);
     if (it == model->gguf_kv.end()) {
+        // one-off fix for very popular models (so we are not flooded with issues)
+        // do not extend this list unless absolutely necessary
+        // Mistral-Small-2503 does not have built-in chat template
+        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
+        if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
+            return "mistral-v7-tekken";
+        }
+
         return nullptr;
     }
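A hedged sketch of the observable effect at a call site (the heuristic itself, TEKKEN pre-tokenizer plus exactly 40 layers, comes straight from the hunk above):

// Illustrative call: for a Mistral-Small-2503 GGUF exported without a
// tokenizer.chat_template key, the fallback now returns a usable name.
const char * tmpl = llama_model_chat_template(model, /*name=*/nullptr);
// tmpl == "mistral-v7-tekken" when the vocab pre-type is TEKKEN and the
// model has exactly 40 layers; otherwise nullptr, as before.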

tools/mtmd/mtmd.cpp

Lines changed: 8 additions & 3 deletions
@@ -554,14 +554,19 @@ struct decode_embd_batch {
     llama_batch get_view(int offset, int n_tokens) {
         llama_pos * pos_ptr;
         pos_view.clear();
-        pos_view.resize(n_tokens * n_pos_per_embd);
+        pos_view.reserve(n_tokens * n_pos_per_embd);
         if (n_pos_per_embd > 1) {
             // mrope
             // for example, with layout of src: 1234...1234...1234...1234...
             // offset 2 will give us dst: 34...34...34...34...
             for (int i = 0; i < n_pos_per_embd; i++) {
-                auto src = pos.begin() + i * batch.n_tokens + offset;
-                pos_view.insert(pos_view.end(), src, src + n_tokens);
+                // assume n_tokens is less than or equal to batch.n_tokens
+                // batch.n_tokens is the number of **total** tokens
+                // n_tokens is the number of viewed tokens
+                size_t src_idx = i * batch.n_tokens + offset;
+                pos_view.insert(pos_view.end(),
+                                pos.data() + src_idx,
+                                pos.data() + src_idx + n_tokens);
             }
             pos_ptr = pos_view.data();
         } else {
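The resize-to-reserve change is the functional fix: with resize, pos_view already held n_tokens * n_pos_per_embd zero-filled elements, and the inserts then appended after them, leaving zeros at the front of the view. A standalone sketch (hypothetical sizes, mirroring the diff's own "1234... -> 34..." comment) of the corrected mrope view extraction:

#include <cstdio>
#include <vector>

int main() {
    // Four position streams (n_pos_per_embd = 4) over a 4-token batch,
    // stored as consecutive sections: 1234 1234 1234 1234.
    const int n_pos_per_embd = 4, total_tokens = 4;
    const int offset = 2, n_tokens = 2; // view of the last two tokens
    std::vector<int> pos = {1,2,3,4, 1,2,3,4, 1,2,3,4, 1,2,3,4};

    std::vector<int> pos_view;
    pos_view.reserve(n_tokens * n_pos_per_embd); // reserve, not resize
    for (int i = 0; i < n_pos_per_embd; i++) {
        const size_t src_idx = (size_t) i * total_tokens + offset;
        pos_view.insert(pos_view.end(),
                        pos.data() + src_idx,
                        pos.data() + src_idx + n_tokens);
    }
    for (int p : pos_view) printf("%d ", p); // prints: 3 4 3 4 3 4 3 4
    printf("\n");
    return 0;
}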
