@@ -20,10 +20,6 @@ def translate_request(self, data: Dict, api_key: str) -> ChatCompletionRequest:
         if completion_request is None:
             raise Exception("Couldn't translate the request")
 
-        # Replace n_predict option with max_tokens
-        if 'n_predict' in completion_request:
-            completion_request['max_tokens'] = completion_request['n_predict']
-            del completion_request['n_predict']
         return ChatCompletionRequest(**completion_request)
 
     def translate_streaming_response(
@@ -50,12 +46,20 @@ async def execute_completion(
         stream: bool = False
     ) -> Union[ModelResponse, AsyncIterator[ModelResponse]]:
         """
-        Execute the completion request with LiteLLM's API
+        Execute the completion request with inference engine API
         """
-        response = await self.inference_engine.chat(Config.get_config().chat_model_path,
-                                                    Config.get_config().chat_model_n_ctx,
-                                                    Config.get_config().chat_model_n_gpu_layers,
-                                                    **request)
+        model_path = f"{Config.get_config().model_base_path}/{request['model']}.gguf"
+
+        if 'prompt' in request:
+            response = await self.inference_engine.complete(model_path,
+                                                            Config.get_config().chat_model_n_ctx,
+                                                            Config.get_config().chat_model_n_gpu_layers,
+                                                            **request)
+        else:
+            response = await self.inference_engine.chat(model_path,
+                                                        Config.get_config().chat_model_n_ctx,
+                                                        Config.get_config().chat_model_n_gpu_layers,
+                                                        **request)
         return response
 
     def create_streaming_response(
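
The behavioral change in execute_completion is the dispatch: requests carrying a 'prompt' key (FIM-style completions) now go to inference_engine.complete, everything else goes to inference_engine.chat, and the model file is resolved per request from model_base_path instead of a single chat_model_path. Below is a minimal, self-contained sketch of that routing; the stub engine class, the model names, and the fixed n_ctx / n_gpu_layers values (512, 0) are placeholders for illustration only, where the real code uses self.inference_engine and reads settings from Config.

import asyncio
from typing import Any, Dict


class StubEngine:
    # Hypothetical stand-in for self.inference_engine; it just reports which API was called.
    async def complete(self, model_path: str, n_ctx: int, n_gpu_layers: int, **request) -> str:
        return f"complete() -> {model_path}"

    async def chat(self, model_path: str, n_ctx: int, n_gpu_layers: int, **request) -> str:
        return f"chat() -> {model_path}"


async def execute_completion(engine: StubEngine, model_base_path: str, request: Dict[str, Any]) -> str:
    # Resolve the .gguf file from the requested model name, as in the diff.
    model_path = f"{model_base_path}/{request['model']}.gguf"
    if 'prompt' in request:
        # FIM / plain completion requests carry 'prompt'.
        return await engine.complete(model_path, 512, 0, **request)
    # Chat requests carry 'messages' instead.
    return await engine.chat(model_path, 512, 0, **request)


fim_request = {"model": "some-coder-model", "prompt": "def add(a, b):"}
chat_request = {"model": "some-chat-model", "messages": [{"role": "user", "content": "hi"}]}
print(asyncio.run(execute_completion(StubEngine(), "./models", fim_request)))   # complete() -> ./models/some-coder-model.gguf
print(asyncio.run(execute_completion(StubEngine(), "./models", chat_request)))  # chat() -> ./models/some-chat-model.gguf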