@@ -1312,7 +1312,7 @@ struct server_slot {
1312
1312
return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
1313
1313
}
1314
1314
1315
- bool can_batch_with (server_slot & other_slot) {
1315
+ bool can_batch_with (server_slot & other_slot) const {
1316
1316
return is_non_causal () == other_slot.is_non_causal ()
1317
1317
&& are_lora_equal (lora, other_slot.lora );
1318
1318
}
@@ -2157,14 +2157,6 @@ struct server_context {
2157
2157
}
2158
2158
2159
2159
if (slot.has_new_line ) {
2160
- // if we have already seen a new line, we stop after a certain time limit
2161
- if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
2162
- slot.stop = STOP_TYPE_LIMIT;
2163
- slot.has_next_token = false ;
2164
-
2165
- SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
2166
- }
2167
-
2168
2160
// require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent
2169
2161
if (slot.params .n_indent > 0 ) {
2170
2162
// check the current indentation
@@ -2203,6 +2195,14 @@ struct server_context {
2203
2195
// check if there is a new line in the generated text
2204
2196
if (result.text_to_send.find('\n') != std::string::npos) {
2205
2197
slot.has_new_line = true ;
2198
+
2199
+ // if we have seen a new line, we stop after a certain time limit, but only upon another new line
2200
+ if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
2201
+ slot.stop = STOP_TYPE_LIMIT;
2202
+ slot.has_next_token = false ;
2203
+
2204
+ SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
2205
+ }
2206
2206
}
2207
2207
2208
2208
// if context shift is disabled, we stop when it reaches the context limit
0 commit comments