Commit 224ead1
fix(utils.py): fix openai-like api response format parsing (#7273)

* fix(utils.py): fix openai-like api response format parsing. Fixes issue passing structured output to the litellm_proxy/ route
* fix(cost_calculator.py): fix whisper transcription cost calc to use file duration, not response time
* test: skip test if credentials not found
1 parent 3addbf1 commit 224ead1

File tree

6 files changed: +134 -90 lines changed


litellm/cost_calculator.py

Lines changed: 25 additions & 7 deletions
@@ -111,6 +111,7 @@ def cost_per_token( # noqa: PLR0915
     usage_object: Optional[Usage] = None,  # just read the usage object if provided
     ### CALL TYPE ###
     call_type: CallTypesLiteral = "completion",
+    audio_transcription_file_duration: float = 0.0,  # for audio transcription calls - the file time in seconds
 ) -> Tuple[float, float]:  # type: ignore
     """
     Calculates the cost per token for a given model, prompt tokens, and completion tokens.
@@ -236,6 +237,12 @@ def cost_per_token( # noqa: PLR0915
             model=model,
             custom_llm_provider=custom_llm_provider,
         )
+    elif call_type == "atranscription" or call_type == "transcription":
+        return openai_cost_per_second(
+            model=model,
+            custom_llm_provider=custom_llm_provider,
+            duration=audio_transcription_file_duration,
+        )
     elif custom_llm_provider == "vertex_ai":
         cost_router = google_cost_router(
             model=model_without_prefix,
@@ -261,13 +268,7 @@ def cost_per_token( # noqa: PLR0915
     elif custom_llm_provider == "anthropic":
         return anthropic_cost_per_token(model=model, usage=usage_block)
     elif custom_llm_provider == "openai":
-        openai_cost_route = openai_cost_router(call_type=CallTypes(call_type))
-        if openai_cost_route == "cost_per_token":
-            return openai_cost_per_token(model=model, usage=usage_block)
-        elif openai_cost_route == "cost_per_second":
-            return openai_cost_per_second(
-                model=model, usage=usage_block, response_time_ms=response_time_ms
-            )
+        return openai_cost_per_token(model=model, usage=usage_block)
     elif custom_llm_provider == "databricks":
         return databricks_cost_per_token(model=model, usage=usage_block)
     elif custom_llm_provider == "fireworks_ai":
@@ -484,6 +485,7 @@ def completion_cost( # noqa: PLR0915
     completion_characters: Optional[int] = None
     cache_creation_input_tokens: Optional[int] = None
     cache_read_input_tokens: Optional[int] = None
+    audio_transcription_file_duration: float = 0.0
     cost_per_token_usage_object: Optional[Usage] = _get_usage_object(
         completion_response=completion_response
     )
@@ -632,6 +634,13 @@ def completion_cost( # noqa: PLR0915
         call_type == CallTypes.speech.value or call_type == CallTypes.aspeech.value
     ):
         prompt_characters = litellm.utils._count_characters(text=prompt)
+    elif (
+        call_type == CallTypes.atranscription.value
+        or call_type == CallTypes.transcription.value
+    ):
+        audio_transcription_file_duration = getattr(
+            completion_response, "duration", 0.0
+        )
     elif (
         call_type == CallTypes.rerank.value or call_type == CallTypes.arerank.value
     ):
@@ -708,6 +717,7 @@ def completion_cost( # noqa: PLR0915
         cache_read_input_tokens=cache_read_input_tokens,
         usage_object=cost_per_token_usage_object,
         call_type=call_type,
+        audio_transcription_file_duration=audio_transcription_file_duration,
     )
     _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar

@@ -814,3 +824,11 @@ def rerank_cost(
         )
     except Exception as e:
         raise e
+
+
+def transcription_cost(
+    model: str, custom_llm_provider: Optional[str], duration: float
+) -> Tuple[float, float]:
+    return openai_cost_per_second(
+        model=model, custom_llm_provider=custom_llm_provider, duration=duration
+    )
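Taken together, the cost_calculator.py changes bill a transcription from the audio file's duration attached to the response object, not from wall-clock latency. A minimal sketch of how that path is exercised; the import path and the 12.5-second duration are assumptions for illustration, not part of this commit:

import litellm
from litellm.types.utils import TranscriptionResponse  # import path assumed; may vary by litellm version

# Hypothetical transcription response: `duration` is the length of the audio
# file in seconds, which completion_cost() now reads via getattr(...).
transcription = TranscriptionResponse(text="hello world")
setattr(transcription, "duration", 12.5)
transcription._hidden_params = {
    "model": "whisper-1",
    "custom_llm_provider": "openai",
}

# Internally this forwards duration as audio_transcription_file_duration to
# cost_per_token(), which routes transcription calls to openai_cost_per_second().
cost = litellm.completion_cost(model="whisper-1", completion_response=transcription)
print(cost)  # roughly duration * the model's output_cost_per_second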

litellm/llms/openai/cost_calculation.py

Lines changed: 16 additions & 8 deletions
@@ -78,36 +78,44 @@ def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]:


 def cost_per_second(
-    model: str, usage: Usage, response_time_ms: Optional[float] = 0.0
+    model: str, custom_llm_provider: Optional[str], duration: float = 0.0
 ) -> Tuple[float, float]:
     """
     Calculates the cost per second for a given model, prompt tokens, and completion tokens.
+
+    Input:
+        - model: str, the model name without provider prefix
+        - custom_llm_provider: str, the custom llm provider
+        - duration: float, the duration of the response in seconds
+
+    Returns:
+        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
     """
     ## GET MODEL INFO
-    model_info = get_model_info(model=model, custom_llm_provider="openai")
+    model_info = get_model_info(
+        model=model, custom_llm_provider=custom_llm_provider or "openai"
+    )
     prompt_cost = 0.0
     completion_cost = 0.0
     ## Speech / Audio cost calculation
     if (
         "output_cost_per_second" in model_info
         and model_info["output_cost_per_second"] is not None
-        and response_time_ms is not None
     ):
         verbose_logger.debug(
-            f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; response time: {response_time_ms}"
+            f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; duration: {duration}"
         )
         ## COST PER SECOND ##
-        completion_cost = model_info["output_cost_per_second"] * response_time_ms / 1000
+        completion_cost = model_info["output_cost_per_second"] * duration
     elif (
         "input_cost_per_second" in model_info
         and model_info["input_cost_per_second"] is not None
-        and response_time_ms is not None
     ):
         verbose_logger.debug(
-            f"For model={model} - input_cost_per_second: {model_info.get('input_cost_per_second')}; response time: {response_time_ms}"
+            f"For model={model} - input_cost_per_second: {model_info.get('input_cost_per_second')}; duration: {duration}"
        )
         ## COST PER SECOND ##
-        prompt_cost = model_info["input_cost_per_second"] * response_time_ms / 1000
+        prompt_cost = model_info["input_cost_per_second"] * duration
         completion_cost = 0.0

     return prompt_cost, completion_cost
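The arithmetic change in cost_per_second is simply seconds-of-audio times price-per-second, instead of response_time_ms divided by 1000. A small sketch under an assumed per-second rate (the 0.0001 figure is illustrative, not whisper-1's actual price):

def per_second_cost_sketch(duration_seconds: float, output_cost_per_second: float) -> float:
    # New behaviour: bill the length of the audio file directly.
    return output_cost_per_second * duration_seconds


def old_per_second_cost_sketch(response_time_ms: float, output_cost_per_second: float) -> float:
    # Old behaviour (for contrast): billed wall-clock latency, not file length.
    return output_cost_per_second * response_time_ms / 1000


# A 3-second file transcribed in 900 ms of wall-clock time:
print(per_second_cost_sketch(3.0, 0.0001))        # ~0.0003 USD, based on the file
print(old_per_second_cost_sketch(900.0, 0.0001))  # ~0.00009 USD, based on latency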

litellm/utils.py

Lines changed: 11 additions & 43 deletions
@@ -3612,53 +3612,21 @@ def _map_and_modify_arg(supported_params: dict, provider: str, model: str):
                 else False
             ),
         )
-    else:  # assume passing in params for text-completion openai
+    else:  # assume passing in params for openai-like api
         supported_params = get_supported_openai_params(
             model=model, custom_llm_provider="custom_openai"
         )
         _check_valid_arg(supported_params=supported_params)
-        if functions is not None:
-            optional_params["functions"] = functions
-        if function_call is not None:
-            optional_params["function_call"] = function_call
-        if temperature is not None:
-            optional_params["temperature"] = temperature
-        if top_p is not None:
-            optional_params["top_p"] = top_p
-        if n is not None:
-            optional_params["n"] = n
-        if stream is not None:
-            optional_params["stream"] = stream
-        if stream_options is not None:
-            optional_params["stream_options"] = stream_options
-        if stop is not None:
-            optional_params["stop"] = stop
-        if max_tokens is not None:
-            optional_params["max_tokens"] = max_tokens
-        if presence_penalty is not None:
-            optional_params["presence_penalty"] = presence_penalty
-        if frequency_penalty is not None:
-            optional_params["frequency_penalty"] = frequency_penalty
-        if logit_bias is not None:
-            optional_params["logit_bias"] = logit_bias
-        if user is not None:
-            optional_params["user"] = user
-        if response_format is not None:
-            optional_params["response_format"] = response_format
-        if seed is not None:
-            optional_params["seed"] = seed
-        if tools is not None:
-            optional_params["tools"] = tools
-        if tool_choice is not None:
-            optional_params["tool_choice"] = tool_choice
-        if max_retries is not None:
-            optional_params["max_retries"] = max_retries
-        if logprobs is not None:
-            optional_params["logprobs"] = logprobs
-        if top_logprobs is not None:
-            optional_params["top_logprobs"] = top_logprobs
-        if extra_headers is not None:
-            optional_params["extra_headers"] = extra_headers
+        optional_params = litellm.OpenAILikeChatConfig().map_openai_params(
+            non_default_params=non_default_params,
+            optional_params=optional_params,
+            model=model,
+            drop_params=(
+                drop_params
+                if drop_params is not None and isinstance(drop_params, bool)
+                else False
+            ),
+        )
     if (
         custom_llm_provider
         in ["openai", "azure", "text-completion-openai"]
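With the openai-like branch now delegating to OpenAILikeChatConfig().map_openai_params, a Pydantic response_format sent through the litellm_proxy/ route is mapped into OpenAI's json_schema form rather than dropped or forwarded raw, which is what the new test further down asserts. A hedged usage sketch; the proxy URL and API key are placeholders:

import litellm
from pydantic import BaseModel


class Result(BaseModel):
    answer: str


# Hypothetical proxy endpoint; the litellm_proxy/ prefix sends the request
# through the openai-like parameter mapping patched above.
response = litellm.completion(
    model="litellm_proxy/openai/gpt-4o",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    response_format=Result,  # Pydantic model in ...
    api_base="https://my-litellm-proxy.example.com",
    api_key="sk-placeholder",
)
# ... and the upstream request carries
# response_format={"type": "json_schema", "json_schema": {...}} out.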

tests/local_testing/test_audio_speech.py

Lines changed: 40 additions & 28 deletions
@@ -138,10 +138,14 @@ def return_val():
         mock_async_post.return_value = mock_response
         model = "vertex_ai/test"

-        response = await litellm.aspeech(
-            model=model,
-            input="async hello what llm guardrail do you have",
-        )
+        try:
+            response = await litellm.aspeech(
+                model=model,
+                input="async hello what llm guardrail do you have",
+            )
+        except litellm.APIConnectionError as e:
+            if "Your default credentials were not found" in str(e):
+                pytest.skip("skipping test, credentials not found")

         # Assert asynchronous call
         mock_async_post.assert_called_once()
@@ -181,18 +185,22 @@ def return_val():
         mock_async_post.return_value = mock_response
         model = "vertex_ai/test"

-        response = await litellm.aspeech(
-            model=model,
-            input="async hello what llm guardrail do you have",
-            voice={
-                "languageCode": "en-UK",
-                "name": "en-UK-Studio-O",
-            },
-            audioConfig={
-                "audioEncoding": "LINEAR22",
-                "speakingRate": "10",
-            },
-        )
+        try:
+            response = await litellm.aspeech(
+                model=model,
+                input="async hello what llm guardrail do you have",
+                voice={
+                    "languageCode": "en-UK",
+                    "name": "en-UK-Studio-O",
+                },
+                audioConfig={
+                    "audioEncoding": "LINEAR22",
+                    "speakingRate": "10",
+                },
+            )
+        except litellm.APIConnectionError as e:
+            if "Your default credentials were not found" in str(e):
+                pytest.skip("skipping test, credentials not found")

         # Assert asynchronous call
         mock_async_post.assert_called_once()
@@ -239,18 +247,22 @@ def return_val():
         mock_async_post.return_value = mock_response
         model = "vertex_ai/test"

-        response = await litellm.aspeech(
-            input=ssml,
-            model=model,
-            voice={
-                "languageCode": "en-UK",
-                "name": "en-UK-Studio-O",
-            },
-            audioConfig={
-                "audioEncoding": "LINEAR22",
-                "speakingRate": "10",
-            },
-        )
+        try:
+            response = await litellm.aspeech(
+                input=ssml,
+                model=model,
+                voice={
+                    "languageCode": "en-UK",
+                    "name": "en-UK-Studio-O",
+                },
+                audioConfig={
+                    "audioEncoding": "LINEAR22",
+                    "speakingRate": "10",
+                },
+            )
+        except litellm.APIConnectionError as e:
+            if "Your default credentials were not found" in str(e):
+                pytest.skip("skipping test, credentials not found")

         # Assert asynchronous call
         mock_async_post.assert_called_once()
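All three test changes above apply the same guard: run the Vertex AI call, and skip instead of fail when Google default credentials are absent. Factored as a helper it would look roughly like this (a sketch, not code from the commit; unlike the inline version it re-raises unrelated connection errors):

import pytest
import litellm


async def aspeech_or_skip(**kwargs):
    try:
        return await litellm.aspeech(**kwargs)
    except litellm.APIConnectionError as e:
        if "Your default credentials were not found" in str(e):
            pytest.skip("skipping test, credentials not found")
        raise  # any other connection error should still fail the test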

tests/local_testing/test_completion.py

Lines changed: 37 additions & 0 deletions
@@ -1819,6 +1819,43 @@ async def test_litellm_gateway_from_sdk():
         assert "hello" in mock_call.call_args.kwargs["extra_body"]


+@pytest.mark.asyncio
+async def test_litellm_gateway_from_sdk_structured_output():
+    from pydantic import BaseModel
+
+    class Result(BaseModel):
+        answer: str
+
+    litellm.set_verbose = True
+    from openai import OpenAI
+
+    openai_client = OpenAI(api_key="fake-key")
+
+    with patch.object(
+        openai_client.chat.completions, "create", new=MagicMock()
+    ) as mock_call:
+        try:
+            litellm.completion(
+                model="litellm_proxy/openai/gpt-4o",
+                messages=[
+                    {"role": "user", "content": "What is the capital of France?"}
+                ],
+                api_key="my-test-api-key",
+                user="test",
+                response_format=Result,
+                base_url="https://litellm.ml-serving-internal.scale.com",
+                client=openai_client,
+            )
+        except Exception as e:
+            print(e)
+
+        mock_call.assert_called_once()
+
+        print("Call KWARGS - {}".format(mock_call.call_args.kwargs))
+        json_schema = mock_call.call_args.kwargs["response_format"]
+        assert "json_schema" in json_schema
+
+
 # ################### Hugging Face Conversational models ########################
 # def hf_test_completion_conv():
 #     try:
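The assertion at the end of the new test only checks that the forwarded response_format contains a json_schema key. For reference, the OpenAI-style shape it is mapped into looks roughly like the following; the exact schema fields are illustrative rather than copied from litellm's output:

expected_response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "Result",
        "schema": {
            "type": "object",
            "properties": {"answer": {"type": "string"}},
            "required": ["answer"],
        },
    },
}
assert "json_schema" in expected_response_format  # the property the test relies on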

tests/local_testing/test_completion_cost.py

Lines changed: 5 additions & 4 deletions
@@ -393,6 +393,8 @@ def test_whisper_openai():
     transcription = TranscriptionResponse(
         text="Four score and seven years ago, our fathers brought forth on this continent a new nation, conceived in liberty and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure."
     )
+
+    setattr(transcription, "duration", 3)
     transcription._hidden_params = {
         "model": "whisper-1",
         "custom_llm_provider": "openai",
@@ -401,7 +403,6 @@ def test_whisper_openai():
     }
     _total_time_in_seconds = 3

-    transcription._response_ms = _total_time_in_seconds * 1000
     cost = litellm.completion_cost(model="whisper-1", completion_response=transcription)

     print(f"cost: {cost}")
@@ -411,7 +412,7 @@ def test_whisper_openai():
         * _total_time_in_seconds,
         5,
     )
-    assert cost == expected_cost
+    assert round(cost, 5) == round(expected_cost, 5)


 def test_whisper_azure():
@@ -426,8 +427,8 @@ def test_whisper_azure():
         "model_id": None,
     }
     _total_time_in_seconds = 3
+    setattr(transcription, "duration", _total_time_in_seconds)

-    transcription._response_ms = _total_time_in_seconds * 1000
     cost = litellm.completion_cost(
         model="azure/azure-whisper", completion_response=transcription
     )
@@ -439,7 +440,7 @@ def test_whisper_azure():
         * _total_time_in_seconds,
         5,
     )
-    assert cost == expected_cost
+    assert round(cost, 5) == round(expected_cost, 5)


 def test_dalle_3_azure_cost_tracking():
