@@ -26,6 +26,5 @@
 #include <cstddef>
 #include <thread>
-#include <mutex>
 #include <chrono>
 #include <condition_variable>
 #include <atomic>
@@ -328,10 +327,8 @@ struct llama_server_context
     // slots / clients
     std::vector<llama_client_slot> slots;
 
-    llama_server_queue<task_server> queue_tasks;
-    llama_server_response_event queue_results;
-    std::vector<task_multi> queue_multitasks;
-    std::mutex mutex_multitasks;
+    llama_server_queue queue_tasks;
+    llama_server_response queue_results;
 
     ~llama_server_context()
     {
@@ -961,30 +958,6 @@ struct llama_server_context
         queue_results.send(res);
     }
 
-    void add_multitask(int id, std::vector<int>& sub_ids)
-    {
-        std::lock_guard<std::mutex> lock(mutex_multitasks);
-        task_multi multi;
-        multi.id = id;
-        std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
-        queue_multitasks.push_back(multi);
-        // TODO @ngxson : Do we need to notify the queue_tasks?
-    }
-
-    void update_multitask(int multitask_id, int subtask_id, task_result& result)
-    {
-        std::lock_guard<std::mutex> lock(mutex_multitasks);
-        for (auto & multitask : queue_multitasks)
-        {
-            if (multitask.id == multitask_id)
-            {
-                multitask.subtasks_remaining.erase(subtask_id);
-                multitask.results.push_back(result);
-                // TODO @ngxson : Do we need to notify the queue_tasks?
-            }
-        }
-    }
-
     json get_model_props()
     {
         return get_formated_generation(slots[0]);
@@ -1120,7 +1093,7 @@ struct llama_server_context
             // parent multitask, if any, needs to be updated
             if (slot.multitask_id != -1)
             {
-                update_multitask(slot.multitask_id, slot.task_id, res);
+                queue_tasks.update_multitask(slot.multitask_id, slot.task_id, res);
             }
         }
 
@@ -1157,7 +1130,6 @@ struct llama_server_context
 
     int request_completion(json data, bool infill, bool embedding, int multitask_id)
     {
-        std::unique_lock<std::mutex> lock(mutex_multitasks);
         task_server task;
         task.target_id = 0;
         task.data = std::move(data);
@@ -1169,7 +1141,6 @@ struct llama_server_context
         // when a completion task's prompt array is not a singleton, we split it into multiple requests
         if (task.data.count("prompt") && task.data.at("prompt").size() > 1)
         {
-            lock.unlock(); // entering new func scope
             return split_multiprompt_task(task);
         }
 
@@ -1270,11 +1241,11 @@ struct llama_server_context
         }
 
         // queue up the multitask so we can track its subtask progression
-        add_multitask(multitask_id, subtask_ids);
+        queue_tasks.add_multitask(multitask_id, subtask_ids);
         return multitask_id;
     }
 
-    void process_single_task(task_server task)
+    void process_single_task(task_server& task)
     {
         switch (task.type)
         {
@@ -1283,7 +1254,7 @@ struct llama_server_context
                 if (slot == nullptr)
                 {
                     // if no slot is available, we defer this task for processing later
-                    LOG_TEE("no slot\n");
+                    LOG_VERBOSE("no slot is available", {});
                     queue_tasks.defer(task);
                     break;
                 }
@@ -1333,42 +1304,23 @@ struct llama_server_context
         }
     }
 
-    void process_multitask()
+    void on_finish_multitask(task_multi& multitask)
     {
-        // remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue
-        std::vector<task_result> agg_results;
-        auto queue_iterator = queue_multitasks.begin();
-        while (queue_iterator != queue_multitasks.end())
-        {
-            if (queue_iterator->subtasks_remaining.empty())
-            {
-                // all subtasks done == multitask is done
-                task_result aggregate_result;
-                aggregate_result.id = queue_iterator->id;
-                aggregate_result.stop = true;
-                aggregate_result.error = false;
-
-                // collect json results into one json result
-                std::vector<json> result_jsons;
-                for (auto & subres : queue_iterator->results)
-                {
-                    result_jsons.push_back(subres.result_json);
-                    aggregate_result.error = aggregate_result.error && subres.error;
-                }
-                aggregate_result.result_json = json{ "results", result_jsons };
-                agg_results.push_back(aggregate_result);
-                queue_iterator = queue_multitasks.erase(queue_iterator);
-            }
-            else
-            {
-                ++queue_iterator;
-            }
-        }
+        // all subtasks done == multitask is done
+        task_result result;
+        result.id = multitask.id;
+        result.stop = true;
+        result.error = false;
 
-        // copy aggregate results of complete multi-tasks to the results queue
-        for (auto & res : agg_results) {
-            queue_results.send(res);
+        // collect json results into one json result
+        std::vector<json> result_jsons;
+        for (auto & subres : multitask.results)
+        {
+            result_jsons.push_back(subres.result_json);
+            result.error = result.error && subres.error;
         }
+        result.result_json = json{ "results", result_jsons };
+        queue_results.send(result);
     }
 
     bool update_slots() {
@@ -1704,7 +1656,6 @@ struct llama_server_context
     }
 
     void run_on_all_tasks_finished() {
-        process_multitask();
         update_slots();
     }
 };
@@ -2861,16 +2812,18 @@ int main(int argc, char **argv)
 
     llama.queue_tasks.on_new_task(std::bind(
         &llama_server_context::process_single_task, &llama, std::placeholders::_1));
+    llama.queue_tasks.on_finish_multitask(std::bind(
+        &llama_server_context::on_finish_multitask, &llama, std::placeholders::_1));
     llama.queue_tasks.on_all_tasks_finished(std::bind(
         &llama_server_context::run_on_all_tasks_finished, &llama));
-    llama.queue_tasks.start_loop();
     llama.queue_results.on_multitask_update(std::bind(
-        &llama_server_context::update_multitask,
-        &llama,
+        &llama_server_queue::update_multitask,
+        &llama.queue_tasks,
         std::placeholders::_1,
         std::placeholders::_2,
         std::placeholders::_3
     ));
+    llama.queue_tasks.start_loop();
 
     t.join();
 
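Note on the refactor: the hunks above call into llama_server_queue and llama_server_response without showing their definitions. The sketch below reconstructs only the interface implied by the call sites (on_new_task, on_finish_multitask, on_all_tasks_finished, add_multitask, update_multitask, defer, start_loop, on_multitask_update, send); the placeholder task structs, member layout, and locking details are assumptions for illustration, not the actual server code. The net effect of the change is that multitask bookkeeping moves out of llama_server_context (which previously held queue_multitasks and mutex_multitasks) and behind the queue's own lock, and finished multitasks are reported through the on_finish_multitask callback instead of being polled by process_multitask().

// Minimal sketch (assumption): the queue/response types implied by the call sites
// in this diff. Everything below is illustrative, not the real server.cpp code.
#include <functional>
#include <mutex>
#include <set>
#include <utility>
#include <vector>

struct task_server { int id = -1; };                                        // placeholder
struct task_result { int id = -1; int multitask_id = -1;                    // placeholder
                     bool stop = false; bool error = false; };
struct task_multi  {                                                        // placeholder
    int id = -1;
    std::set<int>            subtasks_remaining;
    std::vector<task_result> results;
};

struct llama_server_queue {
    // callbacks registered from main()
    void on_new_task(std::function<void(task_server&)> cb)        { callback_new_task          = std::move(cb); }
    void on_finish_multitask(std::function<void(task_multi&)> cb) { callback_finish_multitask  = std::move(cb); }
    void on_all_tasks_finished(std::function<void(void)> cb)      { callback_all_task_finished = std::move(cb); }

    // multitask bookkeeping now lives behind the queue's own mutex, so
    // llama_server_context no longer needs queue_multitasks / mutex_multitasks
    void add_multitask(int id, std::vector<int> & sub_ids) {
        std::lock_guard<std::mutex> lock(mutex_tasks);
        task_multi multi;
        multi.id = id;
        multi.subtasks_remaining.insert(sub_ids.begin(), sub_ids.end());
        queue_multitasks.push_back(multi);
    }

    void update_multitask(int multitask_id, int subtask_id, task_result & result) {
        std::lock_guard<std::mutex> lock(mutex_tasks);
        for (auto & multitask : queue_multitasks) {
            if (multitask.id == multitask_id) {
                multitask.subtasks_remaining.erase(subtask_id);
                multitask.results.push_back(result);
            }
        }
    }

    void defer(task_server & task) {
        // park the task until a slot frees up (details omitted)
        (void) task;
    }

    void start_loop() {
        // pump queued tasks, firing callback_new_task for each, then
        // callback_finish_multitask once subtasks_remaining becomes empty,
        // and callback_all_task_finished when the queue drains (details omitted)
    }

private:
    std::mutex                         mutex_tasks;
    std::vector<task_multi>            queue_multitasks;
    std::function<void(task_server&)>  callback_new_task;
    std::function<void(task_multi&)>   callback_finish_multitask;
    std::function<void(void)>          callback_all_task_finished;
};

struct llama_server_response {
    // lets the task queue be notified whenever a subtask result is sent
    void on_multitask_update(std::function<void(int, int, task_result&)> cb) { callback_update_multitask = std::move(cb); }

    void send(task_result & result) {
        // deliver the result to the waiting HTTP handler (omitted), then notify
        // the owning multitask; this mirrors how main() binds the callback to
        // llama_server_queue::update_multitask
        if (result.multitask_id != -1 && callback_update_multitask) {
            callback_update_multitask(result.multitask_id, result.id, result);
        }
    }

private:
    std::function<void(int, int, task_result&)> callback_update_multitask;
};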