@@ -160,11 +160,13 @@ async def handle_request(self, request: Request):
         chat_request = ChatCompletionRequest.parse_obj(data)
         prompt = self._handle_message(chat_request.messages)
         parameters = LLMParams(
-            max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
+            max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
             top_k=chat_request.top_k if chat_request.top_k else 10,
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
-            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
+            frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
+            presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
+            repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             streaming=stream_opt,
             chat_template=chat_request.chat_template if chat_request.chat_template else None,
         )
@@ -214,11 +216,13 @@ async def handle_request(self, request: Request):
         chat_request = ChatCompletionRequest.parse_obj(data)
         prompt = self._handle_message(chat_request.messages)
         parameters = LLMParams(
-            max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
+            max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
             top_k=chat_request.top_k if chat_request.top_k else 10,
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
-            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
+            frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
+            presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
+            repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             streaming=stream_opt,
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
@@ -350,11 +354,13 @@ async def handle_request(self, request: Request):
         chat_request = ChatCompletionRequest.parse_obj(data)
         prompt = self._handle_message(chat_request.messages)
         parameters = LLMParams(
-            max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
+            max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
             top_k=chat_request.top_k if chat_request.top_k else 10,
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
-            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
+            frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
+            presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
+            repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             streaming=stream_opt,
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
@@ -399,11 +405,13 @@ async def handle_request(self, request: Request):
         chat_request = AudioChatCompletionRequest.parse_obj(data)
         parameters = LLMParams(
             # relatively lower max_tokens for audio conversation
-            max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 128,
+            max_tokens=chat_request.max_tokens if chat_request.max_tokens else 128,
             top_k=chat_request.top_k if chat_request.top_k else 10,
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
-            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
+            frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
+            presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
+            repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             streaming=False,  # TODO add streaming LLM output as input to TTS
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
@@ -428,11 +436,13 @@ async def handle_request(self, request: Request):
         chat_request = ChatCompletionRequest.parse_obj(data)
         prompt = self._handle_message(chat_request.messages)
         parameters = LLMParams(
-            max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
+            max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
             top_k=chat_request.top_k if chat_request.top_k else 10,
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
-            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
+            frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
+            presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
+            repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             streaming=stream_opt,
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
@@ -472,11 +482,13 @@ async def handle_request(self, request: Request):
         chat_request = ChatCompletionRequest.parse_obj(data)
         prompt = self._handle_message(chat_request.messages)
         parameters = LLMParams(
-            max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
+            max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
             top_k=chat_request.top_k if chat_request.top_k else 10,
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
-            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
+            frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
+            presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
+            repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             streaming=stream_opt,
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
@@ -520,7 +532,9 @@ async def handle_request(self, request: Request):
             top_k=chat_request.top_k if chat_request.top_k else 10,
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
-            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
+            frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
+            presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
+            repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             streaming=stream_opt,
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
@@ -569,7 +583,9 @@ async def handle_request(self, request: Request):
             top_k=chat_request.top_k if chat_request.top_k else 10,
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
-            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
+            frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
+            presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
+            repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             streaming=stream_opt,
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
@@ -758,7 +774,9 @@ async def handle_request(self, request: Request):
             top_k=chat_request.top_k if chat_request.top_k else 10,
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
-            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
+            frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
+            presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
+            repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             streaming=stream_opt,
             chat_template=chat_request.chat_template if chat_request.chat_template else None,
         )
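
Every hunk above applies the same pair of fixes: the `LLMParams` field `max_new_tokens` is renamed to `max_tokens`, and the old line that filled `repetition_penalty` from `chat_request.presence_penalty` (conflating two different penalties) is replaced by three fields, each read from its own request attribute. A minimal sketch of the resulting mapping is below; the `build_llm_params` helper and the import paths are illustrative assumptions, not part of the diff:

```python
# Hypothetical import paths, assumed for illustration only.
from comps.cores.proto.api_protocol import ChatCompletionRequest
from comps.cores.proto.docarray import LLMParams


def build_llm_params(chat_request: ChatCompletionRequest, stream_opt: bool) -> LLMParams:
    # Each sampling field is now taken from its own request attribute;
    # before this change, repetition_penalty was wrongly sourced from
    # presence_penalty. Defaults mirror the diff above.
    return LLMParams(
        max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
        top_k=chat_request.top_k if chat_request.top_k else 10,
        top_p=chat_request.top_p if chat_request.top_p else 0.95,
        temperature=chat_request.temperature if chat_request.temperature else 0.01,
        frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
        presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
        repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
        streaming=stream_opt,
    )
```

Note the defaults: the OpenAI-style `frequency_penalty` and `presence_penalty` fall back to 0.0 (no effect), while the TGI-style `repetition_penalty` keeps its existing 1.03 fallback.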