-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathllm-python-vision-multi-images.py
More file actions
110 lines (92 loc) · 3.51 KB
/
llm-python-vision-multi-images.py
File metadata and controls
110 lines (92 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/bin/python3
# Adapted from OpenAI's Vision example
from openai import OpenAI
import base64
import sys
import os
import httpx
import re
# Point to the local server.
# OpenAI-compatible endpoint on localhost:9595 — presumably a llama.cpp /
# LM Studio-style server (confirm); api_key is a placeholder such servers
# ignore, and the 3600 s timeout accommodates slow multi-image vision inference.
client = OpenAI(base_url="http://localhost:9595/v1", api_key="none", timeout=httpx.Timeout(3600))
# Model selection: vision-capable model name as exposed by the local server.
model = "Qwen3-VL-30B-A3B-Thinking"
# Retrieve the prompt, temperature, and image paths from the arguments.
# Usage: PROMPT TEMPERATURE IMAGE [IMAGE ...]
# Fail fast with a usage message instead of a raw IndexError traceback.
if len(sys.argv) < 4:
    print(f"Usage: {sys.argv[0]} PROMPT TEMPERATURE IMAGE [IMAGE ...]", file=sys.stderr)
    sys.exit(1)
prompt = sys.argv[1]
temperature = float(sys.argv[2])  # Sampling temperature, passed through to the API
image_paths = sys.argv[3:]  # Image paths start from the 3rd argument
# Extract frame numbers from image paths (expects filenames like frame_00123.jpg).
# Compile the pattern once instead of re-matching the literal per path.
_frame_pattern = re.compile(r'frame_(\d+)\.jpg')
frame_numbers = []
for path in image_paths:
    match = _frame_pattern.search(path)
    if match:
        frame_numbers.append(int(match.group(1)))
    else:
        # A path that doesn't follow the frame_NNN.jpg convention is fatal:
        # the prompt below relies on an accurate frame range.
        print(f"Could not extract frame number from {path}. Exiting.")
        # sys.exit(1) instead of bare exit(): exit() is a site-module
        # convenience and the original returned status 0 on this error path.
        sys.exit(1)
# Determine the image range covered by the supplied frames
if frame_numbers:
    start_frame = min(frame_numbers)
    end_frame = max(frame_numbers)
    image_range = f"{start_frame}-{end_frame}"
else:
    image_range = "No images found"
# Append image range to the prompt so the model knows which frames it is shown
prompt += f" The images are frames {image_range}"
# Prepare the messages for the LLM: one system message plus one user message
# whose content list holds the text prompt followed by per-frame label/image pairs.
messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant.",
    },
    {
        "role": "user",
        "content": [],  # Filled in below: prompt text first, then the images
    },
]
# Add the text prompt to the messages (first)
messages[1]["content"].append({"type": "text", "text": prompt})
# Read each image, encode it to base64, and add it to the messages (after the
# prompt). Each image is preceded by a "Frame: N" text part so the model can
# reference individual frames by number.
_frame_re = re.compile(r'frame_(\d+)\.jpg')
for image_path in image_paths:
    # Extract the frame number from the filename (e.g., frame_00123.jpg)
    _frame_match = _frame_re.search(image_path)
    _frame_num = _frame_match.group(1) if _frame_match else "unknown"
    # Insert a text line indicating the frame number before the image
    messages[1]["content"].append(
        {"type": "text", "text": f"Frame: {_frame_num}"}
    )
    try:
        # NOTE(review): stripping single quotes presumably undoes shell quoting
        # left in the path by the caller — confirm against the invoking script.
        with open(image_path.replace("'", ""), "rb") as image_file:
            image_data = image_file.read()
        base64_image = base64.b64encode(image_data).decode("utf-8")
        messages[1]["content"].append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
            }
        )
    except FileNotFoundError:
        print(f"Couldn't read the image at {image_path}. Make sure the path is correct and the file exists.")
        # Exit with a non-zero status (the original bare exit() reported success)
        sys.exit(1)
# Send the messages to the LLM
completion = client.chat.completions.create(
    model=model,
    messages=messages,
    max_tokens=-1,  # presumably "no limit" per local-server convention — confirm against the server
    stream=False,
    temperature=temperature,  # Pass through the CLI-supplied temperature
)
# Get the raw response from the LLM. `content` is Optional in the OpenAI SDK,
# so fall back to an empty string to keep the regex handling below from
# raising TypeError on a None response.
raw_output = completion.choices[0].message.content or ""
# Extract any <think>...</think> block and print it to stderr so the model's
# reasoning trace does not pollute the answer emitted on stdout.
think_match = re.search(r'<think>(.*?)</think>', raw_output, flags=re.DOTALL)
if think_match:
    thinking_text = think_match.group(1).strip()
    print(f"Thinking Text: {thinking_text}", file=sys.stderr)
else:
    # No <think> block was found – still emit a message to stderr to confirm stderr is functional
    print("No thinking block detected.", file=sys.stderr)
# Remove any <think>...</think> block (including the whitespace after </think>)
# from the output; the DOTALL flag lets '.' match newlines so the entire block
# is removed in one pass.
clean_output = re.sub(r'<think>.*?</think>\s*', '', raw_output, flags=re.DOTALL).strip()
# Print the final, cleaned response on stdout
print(clean_output)