Commit 224ead1
fix(utils.py): fix openai-like api response format parsing (#7273)

* fix(utils.py): fix openai-like api response format parsing. Fixes issue passing structured output to the litellm_proxy/ route
* fix(cost_calculator.py): fix whisper transcription cost calc to use file duration, not response time
* test: skip test if credentials not found
1 parent 3addbf1 commit 224ead1

File tree

6 files changed: +134 -90 lines changed


litellm/cost_calculator.py

Lines changed: 25 additions & 7 deletions
@@ -111,6 +111,7 @@ def cost_per_token( # noqa: PLR0915
     usage_object: Optional[Usage] = None,  # just read the usage object if provided
     ### CALL TYPE ###
     call_type: CallTypesLiteral = "completion",
+    audio_transcription_file_duration: float = 0.0,  # for audio transcription calls - the file time in seconds
 ) -> Tuple[float, float]:  # type: ignore
     """
     Calculates the cost per token for a given model, prompt tokens, and completion tokens.
@@ -236,6 +237,12 @@ def cost_per_token( # noqa: PLR0915
             model=model,
             custom_llm_provider=custom_llm_provider,
         )
+    elif call_type == "atranscription" or call_type == "transcription":
+        return openai_cost_per_second(
+            model=model,
+            custom_llm_provider=custom_llm_provider,
+            duration=audio_transcription_file_duration,
+        )
     elif custom_llm_provider == "vertex_ai":
         cost_router = google_cost_router(
             model=model_without_prefix,
@@ -261,13 +268,7 @@ def cost_per_token( # noqa: PLR0915
     elif custom_llm_provider == "anthropic":
         return anthropic_cost_per_token(model=model, usage=usage_block)
     elif custom_llm_provider == "openai":
-        openai_cost_route = openai_cost_router(call_type=CallTypes(call_type))
-        if openai_cost_route == "cost_per_token":
-            return openai_cost_per_token(model=model, usage=usage_block)
-        elif openai_cost_route == "cost_per_second":
-            return openai_cost_per_second(
-                model=model, usage=usage_block, response_time_ms=response_time_ms
-            )
+        return openai_cost_per_token(model=model, usage=usage_block)
     elif custom_llm_provider == "databricks":
         return databricks_cost_per_token(model=model, usage=usage_block)
     elif custom_llm_provider == "fireworks_ai":
@@ -484,6 +485,7 @@ def completion_cost( # noqa: PLR0915
     completion_characters: Optional[int] = None
     cache_creation_input_tokens: Optional[int] = None
     cache_read_input_tokens: Optional[int] = None
+    audio_transcription_file_duration: float = 0.0
     cost_per_token_usage_object: Optional[Usage] = _get_usage_object(
         completion_response=completion_response
     )
@@ -632,6 +634,13 @@ def completion_cost( # noqa: PLR0915
         call_type == CallTypes.speech.value or call_type == CallTypes.aspeech.value
     ):
         prompt_characters = litellm.utils._count_characters(text=prompt)
+    elif (
+        call_type == CallTypes.atranscription.value
+        or call_type == CallTypes.transcription.value
+    ):
+        audio_transcription_file_duration = getattr(
+            completion_response, "duration", 0.0
+        )
     elif (
         call_type == CallTypes.rerank.value or call_type == CallTypes.arerank.value
     ):
@@ -708,6 +717,7 @@ def completion_cost( # noqa: PLR0915
         cache_read_input_tokens=cache_read_input_tokens,
         usage_object=cost_per_token_usage_object,
         call_type=call_type,
+        audio_transcription_file_duration=audio_transcription_file_duration,
     )
     _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar

@@ -814,3 +824,11 @@ def rerank_cost(
         )
     except Exception as e:
         raise e
+
+
+def transcription_cost(
+    model: str, custom_llm_provider: Optional[str], duration: float
+) -> Tuple[float, float]:
+    return openai_cost_per_second(
+        model=model, custom_llm_provider=custom_llm_provider, duration=duration
+    )
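Taken together, the cost_calculator.py changes bill a transcription from the audio file's duration attached to the response object, not from wall-clock latency. A minimal sketch of how that path is exercised; the import path and the 12.5-second duration are assumptions for illustration, not part of this commit:

import litellm
from litellm.types.utils import TranscriptionResponse  # import path assumed; may vary by litellm version

# Hypothetical transcription response: `duration` is the length of the audio
# file in seconds, which completion_cost() now reads via getattr(...).
transcription = TranscriptionResponse(text="hello world")
setattr(transcription, "duration", 12.5)
transcription._hidden_params = {
    "model": "whisper-1",
    "custom_llm_provider": "openai",
}

# Internally this forwards duration as audio_transcription_file_duration to
# cost_per_token(), which routes transcription calls to openai_cost_per_second().
cost = litellm.completion_cost(model="whisper-1", completion_response=transcription)
print(cost)  # roughly duration * the model's output_cost_per_second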

litellm/llms/openai/cost_calculation.py

Lines changed: 16 additions & 8 deletions
@@ -78,36 +78,44 @@ def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]:


 def cost_per_second(
-    model: str, usage: Usage, response_time_ms: Optional[float] = 0.0
+    model: str, custom_llm_provider: Optional[str], duration: float = 0.0
 ) -> Tuple[float, float]:
     """
     Calculates the cost per second for a given model, prompt tokens, and completion tokens.
+
+    Input:
+        - model: str, the model name without provider prefix
+        - custom_llm_provider: str, the custom llm provider
+        - duration: float, the duration of the response in seconds
+
+    Returns:
+        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
     """
     ## GET MODEL INFO
-    model_info = get_model_info(model=model, custom_llm_provider="openai")
+    model_info = get_model_info(
+        model=model, custom_llm_provider=custom_llm_provider or "openai"
+    )
     prompt_cost = 0.0
     completion_cost = 0.0
     ## Speech / Audio cost calculation
     if (
         "output_cost_per_second" in model_info
         and model_info["output_cost_per_second"] is not None
-        and response_time_ms is not None
     ):
         verbose_logger.debug(
-            f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; response time: {response_time_ms}"
+            f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; duration: {duration}"
         )
         ## COST PER SECOND ##
-        completion_cost = model_info["output_cost_per_second"] * response_time_ms / 1000
+        completion_cost = model_info["output_cost_per_second"] * duration
     elif (
         "input_cost_per_second" in model_info
         and model_info["input_cost_per_second"] is not None
-        and response_time_ms is not None
     ):
         verbose_logger.debug(
-            f"For model={model} - input_cost_per_second: {model_info.get('input_cost_per_second')}; response time: {response_time_ms}"
+            f"For model={model} - input_cost_per_second: {model_info.get('input_cost_per_second')}; duration: {duration}"
        )
         ## COST PER SECOND ##
-        prompt_cost = model_info["input_cost_per_second"] * response_time_ms / 1000
+        prompt_cost = model_info["input_cost_per_second"] * duration
         completion_cost = 0.0

     return prompt_cost, completion_cost
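The arithmetic change in cost_per_second is simply seconds-of-audio times price-per-second, instead of response_time_ms divided by 1000. A small sketch under an assumed per-second rate (the 0.0001 figure is illustrative, not whisper-1's actual price):

def per_second_cost_sketch(duration_seconds: float, output_cost_per_second: float) -> float:
    # New behaviour: bill the length of the audio file directly.
    return output_cost_per_second * duration_seconds


def old_per_second_cost_sketch(response_time_ms: float, output_cost_per_second: float) -> float:
    # Old behaviour (for contrast): billed wall-clock latency, not file length.
    return output_cost_per_second * response_time_ms / 1000


# A 3-second file transcribed in 900 ms of wall-clock time:
print(per_second_cost_sketch(3.0, 0.0001))        # ~0.0003 USD, based on the file
print(old_per_second_cost_sketch(900.0, 0.0001))  # ~0.00009 USD, based on latency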

litellm/utils.py

Lines changed: 11 additions & 43 deletions
@@ -3612,53 +3612,21 @@ def _map_and_modify_arg(supported_params: dict, provider: str, model: str):
                 else False
             ),
         )
-    else:  # assume passing in params for text-completion openai
+    else:  # assume passing in params for openai-like api
         supported_params = get_supported_openai_params(
             model=model, custom_llm_provider="custom_openai"
         )
         _check_valid_arg(supported_params=supported_params)
-        if functions is not None:
-            optional_params["functions"] = functions
-        if function_call is not None:
-            optional_params["function_call"] = function_call
-        if temperature is not None:
-            optional_params["temperature"] = temperature
-        if top_p is not None:
-            optional_params["top_p"] = top_p
-        if n is not None:
-            optional_params["n"] = n
-        if stream is not None:
-            optional_params["stream"] = stream
-        if stream_options is not None:
-            optional_params["stream_options"] = stream_options
-        if stop is not None:
-            optional_params["stop"] = stop
-        if max_tokens is not None:
-            optional_params["max_tokens"] = max_tokens
-        if presence_penalty is not None:
-            optional_params["presence_penalty"] = presence_penalty
-        if frequency_penalty is not None:
-            optional_params["frequency_penalty"] = frequency_penalty
-        if logit_bias is not None:
-            optional_params["logit_bias"] = logit_bias
-        if user is not None:
-            optional_params["user"] = user
-        if response_format is not None:
-            optional_params["response_format"] = response_format
-        if seed is not None:
-            optional_params["seed"] = seed
-        if tools is not None:
-            optional_params["tools"] = tools
-        if tool_choice is not None:
-            optional_params["tool_choice"] = tool_choice
-        if max_retries is not None:
-            optional_params["max_retries"] = max_retries
-        if logprobs is not None:
-            optional_params["logprobs"] = logprobs
-        if top_logprobs is not None:
-            optional_params["top_logprobs"] = top_logprobs
-        if extra_headers is not None:
-            optional_params["extra_headers"] = extra_headers
+        optional_params = litellm.OpenAILikeChatConfig().map_openai_params(
+            non_default_params=non_default_params,
+            optional_params=optional_params,
+            model=model,
+            drop_params=(
+                drop_params
+                if drop_params is not None and isinstance(drop_params, bool)
+                else False
+            ),
+        )
     if (
         custom_llm_provider
         in ["openai", "azure", "text-completion-openai"]
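With the openai-like branch now delegating to OpenAILikeChatConfig().map_openai_params, a Pydantic response_format sent through the litellm_proxy/ route is mapped into OpenAI's json_schema form rather than dropped or forwarded raw, which is what the new test further down asserts. A hedged usage sketch; the proxy URL and API key are placeholders:

import litellm
from pydantic import BaseModel


class Result(BaseModel):
    answer: str


# Hypothetical proxy endpoint; the litellm_proxy/ prefix sends the request
# through the openai-like parameter mapping patched above.
response = litellm.completion(
    model="litellm_proxy/openai/gpt-4o",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    response_format=Result,  # Pydantic model in ...
    api_base="https://my-litellm-proxy.example.com",
    api_key="sk-placeholder",
)
# ... and the upstream request carries
# response_format={"type": "json_schema", "json_schema": {...}} out.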

tests/local_testing/test_audio_speech.py

Lines changed: 40 additions & 28 deletions
@@ -138,10 +138,14 @@ def return_val():
         mock_async_post.return_value = mock_response
         model = "vertex_ai/test"

-        response = await litellm.aspeech(
-            model=model,
-            input="async hello what llm guardrail do you have",
-        )
+        try:
+            response = await litellm.aspeech(
+                model=model,
+                input="async hello what llm guardrail do you have",
+            )
+        except litellm.APIConnectionError as e:
+            if "Your default credentials were not found" in str(e):
+                pytest.skip("skipping test, credentials not found")

         # Assert asynchronous call
         mock_async_post.assert_called_once()
@@ -181,18 +185,22 @@ def return_val():
         mock_async_post.return_value = mock_response
         model = "vertex_ai/test"

-        response = await litellm.aspeech(
-            model=model,
-            input="async hello what llm guardrail do you have",
-            voice={
-                "languageCode": "en-UK",
-                "name": "en-UK-Studio-O",
-            },
-            audioConfig={
-                "audioEncoding": "LINEAR22",
-                "speakingRate": "10",
-            },
-        )
+        try:
+            response = await litellm.aspeech(
+                model=model,
+                input="async hello what llm guardrail do you have",
+                voice={
+                    "languageCode": "en-UK",
+                    "name": "en-UK-Studio-O",
+                },
+                audioConfig={
+                    "audioEncoding": "LINEAR22",
+                    "speakingRate": "10",
+                },
+            )
+        except litellm.APIConnectionError as e:
+            if "Your default credentials were not found" in str(e):
+                pytest.skip("skipping test, credentials not found")

         # Assert asynchronous call
         mock_async_post.assert_called_once()
@@ -239,18 +247,22 @@ def return_val():
         mock_async_post.return_value = mock_response
         model = "vertex_ai/test"

-        response = await litellm.aspeech(
-            input=ssml,
-            model=model,
-            voice={
-                "languageCode": "en-UK",
-                "name": "en-UK-Studio-O",
-            },
-            audioConfig={
-                "audioEncoding": "LINEAR22",
-                "speakingRate": "10",
-            },
-        )
+        try:
+            response = await litellm.aspeech(
+                input=ssml,
+                model=model,
+                voice={
+                    "languageCode": "en-UK",
+                    "name": "en-UK-Studio-O",
+                },
+                audioConfig={
+                    "audioEncoding": "LINEAR22",
+                    "speakingRate": "10",
+                },
+            )
+        except litellm.APIConnectionError as e:
+            if "Your default credentials were not found" in str(e):
+                pytest.skip("skipping test, credentials not found")

         # Assert asynchronous call
         mock_async_post.assert_called_once()
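All three test changes above apply the same guard: run the Vertex AI call, and skip instead of fail when Google default credentials are absent. Factored as a helper it would look roughly like this (a sketch, not code from the commit; unlike the inline version it re-raises unrelated connection errors):

import pytest
import litellm


async def aspeech_or_skip(**kwargs):
    try:
        return await litellm.aspeech(**kwargs)
    except litellm.APIConnectionError as e:
        if "Your default credentials were not found" in str(e):
            pytest.skip("skipping test, credentials not found")
        raise  # any other connection error should still fail the test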

tests/local_testing/test_completion.py

Lines changed: 37 additions & 0 deletions
@@ -1819,6 +1819,43 @@ async def test_litellm_gateway_from_sdk():
         assert "hello" in mock_call.call_args.kwargs["extra_body"]


+@pytest.mark.asyncio
+async def test_litellm_gateway_from_sdk_structured_output():
+    from pydantic import BaseModel
+
+    class Result(BaseModel):
+        answer: str
+
+    litellm.set_verbose = True
+    from openai import OpenAI
+
+    openai_client = OpenAI(api_key="fake-key")
+
+    with patch.object(
+        openai_client.chat.completions, "create", new=MagicMock()
+    ) as mock_call:
+        try:
+            litellm.completion(
+                model="litellm_proxy/openai/gpt-4o",
+                messages=[
+                    {"role": "user", "content": "What is the capital of France?"}
+                ],
+                api_key="my-test-api-key",
+                user="test",
+                response_format=Result,
+                base_url="https://litellm.ml-serving-internal.scale.com",
+                client=openai_client,
+            )
+        except Exception as e:
+            print(e)
+
+        mock_call.assert_called_once()
+
+        print("Call KWARGS - {}".format(mock_call.call_args.kwargs))
+        json_schema = mock_call.call_args.kwargs["response_format"]
+        assert "json_schema" in json_schema
+
+
 # ################### Hugging Face Conversational models ########################
 # def hf_test_completion_conv():
 #     try:
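The assertion at the end of the new test only checks that the forwarded response_format contains a json_schema key. For reference, the OpenAI-style shape it is mapped into looks roughly like the following; the exact schema fields are illustrative rather than copied from litellm's output:

expected_response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "Result",
        "schema": {
            "type": "object",
            "properties": {"answer": {"type": "string"}},
            "required": ["answer"],
        },
    },
}
assert "json_schema" in expected_response_format  # the property the test relies on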

tests/local_testing/test_completion_cost.py

Lines changed: 5 additions & 4 deletions
@@ -393,6 +393,8 @@ def test_whisper_openai():
     transcription = TranscriptionResponse(
         text="Four score and seven years ago, our fathers brought forth on this continent a new nation, conceived in liberty and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure."
     )
+
+    setattr(transcription, "duration", 3)
     transcription._hidden_params = {
         "model": "whisper-1",
         "custom_llm_provider": "openai",
@@ -401,7 +403,6 @@ def test_whisper_openai():
     }
     _total_time_in_seconds = 3

-    transcription._response_ms = _total_time_in_seconds * 1000
     cost = litellm.completion_cost(model="whisper-1", completion_response=transcription)

     print(f"cost: {cost}")
@@ -411,7 +412,7 @@ def test_whisper_openai():
         * _total_time_in_seconds,
         5,
     )
-    assert cost == expected_cost
+    assert round(cost, 5) == round(expected_cost, 5)


 def test_whisper_azure():
@@ -426,8 +427,8 @@ def test_whisper_azure():
         "model_id": None,
     }
     _total_time_in_seconds = 3
+    setattr(transcription, "duration", _total_time_in_seconds)

-    transcription._response_ms = _total_time_in_seconds * 1000
     cost = litellm.completion_cost(
         model="azure/azure-whisper", completion_response=transcription
     )
@@ -439,7 +440,7 @@ def test_whisper_azure():
         * _total_time_in_seconds,
         5,
     )
-    assert cost == expected_cost
+    assert round(cost, 5) == round(expected_cost, 5)


 def test_dalle_3_azure_cost_tracking():
