-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathllm-audio.py
More file actions
68 lines (55 loc) · 1.87 KB
/
llm-audio.py
File metadata and controls
68 lines (55 loc) · 1.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python3
"""
A simple command‑line tool that sends a WAV audio file to an LLM backend
and prints the model’s response.
Usage:
python llm-audio.py <audio_path> <prompt>
The script reads the audio file, base64‑encodes it, and sends it as an
`input_audio` message to the OpenAI-compatible API.
"""
import base64
import sys
from openai import OpenAI
# ---------------------------------------------------------------------------
def main() -> None:
if len(sys.argv) < 3:
print("Usage: python llm-audio.py <audio_path> <prompt>")
sys.exit(1)
path = sys.argv[1]
prompt = sys.argv[2]
# Choose the model you want to use.
model = "Qwen2.5-Omni-3B-Q8_0"
# Configure the client to point at the local Ollama server.
client = OpenAI(base_url="http://localhost:9090/v1", api_key="none")
# Read and base64‑encode the audio file.
try:
with open(path, "rb") as f:
audio_bytes = f.read()
base64_audio = base64.b64encode(audio_bytes).decode("utf-8")
except Exception as exc:
print(f"Error reading audio file: {exc}")
sys.exit(1)
# Build the chat completion request.
completion = client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "input_audio",
"input_audio": {"data": base64_audio, "format": "wav"},
},
],
},
],
max_tokens=-1,
stream=False,
temperature=0.8,
)
# Print the assistant’s response.
print(completion.choices[0].message.content.strip())
if __name__ == "__main__":
main()