llm-scripts/transcribe_video.py at main · Jay4242/llm-scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python3

import os
import sys
import subprocess
import tempfile
import requests
import argparse

def _extract_audio(video_path, wav_path):
    """
    Extracts the audio track of a video into a 16kHz mono 16-bit PCM WAV file.

    Args:
        video_path (str): The path to the input video file.
        wav_path (str): The path the WAV file is written to (overwritten if present).

    Returns:
        bool: True if ffmpeg succeeded; False if it is missing, could not be
        started, or exited with a non-zero status (an error is printed).
    """
    command = [
        "ffmpeg",
        "-i", video_path,
        "-ar", "16000",
        "-ac", "1",
        "-c:a", "pcm_s16le",
        "-y",
        "-loglevel", "error",
        wav_path,
    ]

    try:
        # Capture ffmpeg output and ignore decoding errors. stdin is detached so
        # ffmpeg cannot consume the caller's piped input or wait on the terminal.
        subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True,
            errors="ignore",
            stdin=subprocess.DEVNULL,
        )
    except FileNotFoundError:
        print("Error: 'ffmpeg' not found. Please make sure it is installed and in your system's PATH.")
        return False
    except subprocess.CalledProcessError as e:
        print("Error during ffmpeg audio extraction:")
        print(e.stderr)
        return False
    except OSError as e:
        # e.g. an 'ffmpeg' on PATH that is not executable.
        print(f"Error: could not run 'ffmpeg': {e}")
        return False
    return True


def _request_srt(wav_path, translate, server_url):
    """
    Uploads a WAV file to the whisper server and returns the SRT it sends back.

    Args:
        wav_path (str): The path to the WAV file to upload.
        translate (bool): Whether to request translation to English.
        server_url (str): The URL of the whisper.cpp server's inference endpoint.

    Returns:
        str or None: The response body as text, or None if the request failed
        or the server answered with an HTTP error status (an error is printed).
    """
    try:
        with open(wav_path, "rb") as audio_file:
            files = {"file": (os.path.basename(wav_path), audio_file, "audio/wav")}
            params = {"response_format": "srt"}
            if translate:
                params["translate"] = "true"

            response = requests.post(server_url, files=files, data=params, timeout=3600) # 1 hour timeout
            response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error sending request to whisper server: {e}")
        return None

    # When the server declares no charset, requests falls back to guessing
    # (or to ISO-8859-1 for text/* types), which can garble non-ASCII subtitles.
    # NOTE(review): assumes the server emits UTF-8 in that case - confirm.
    if "charset" not in response.headers.get("Content-Type", "").lower():
        response.encoding = "utf-8"
    return response.text


def transcribe_video(video_path, translate=False, server_url="http://localhost:9191/inference"):
    """
    Transcribes a video file using a whisper.cpp server.

    The audio is extracted with ffmpeg into a temporary WAV file, uploaded to
    the server, and the returned subtitles are written next to the video with
    an ".srt" extension. Progress and error messages are printed to stdout.

    Args:
        video_path (str): The path to the video file.
        translate (bool): Whether to request translation to English.
        server_url (str): The URL of the whisper.cpp server's inference endpoint.

    Returns:
        str or None: The path of the written SRT file on success, or None if
        any step failed.
    """
    if not os.path.exists(video_path):
        print(f"Error: Video file not found at '{video_path}'")
        return None

    print(f"Processing video: {video_path}")
    if translate:
        print("Translation to English requested.")

    # Reserve a temporary path for the extracted audio; ffmpeg overwrites it (-y).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav_file:
        tmp_wav_path = tmp_wav_file.name

    try:
        # 1. Extract audio and convert to 16kHz mono WAV using ffmpeg
        print("Extracting audio with ffmpeg...")
        if not _extract_audio(video_path, tmp_wav_path):
            return None

        # 2. Send audio to whisper-server
        print(f"Sending audio to whisper server at {server_url}...")
        srt_text = _request_srt(tmp_wav_path, translate, server_url)
    finally:
        # 3. Clean up the temporary WAV file on every exit path, including
        # unexpected exceptions and interrupts.
        print("Cleaning up temporary audio file...")
        try:
            os.remove(tmp_wav_path)
        except OSError as e:
            print(f"Warning: could not remove temporary file '{tmp_wav_path}': {e}")

    if srt_text is None:
        return None

    # 4. Save response to SRT file
    srt_path = os.path.splitext(video_path)[0] + ".srt"
    try:
        with open(srt_path, "w", encoding="utf-8") as srt_file:
            srt_file.write(srt_text)
    except IOError as e:
        print(f"Error writing SRT file: {e}")
        return None

    print(f"Successfully created SRT file: {srt_path}")
    return srt_path


if __name__ == "__main__":
    # Command-line entry point: parse the arguments and transcribe one video.
    parser = argparse.ArgumentParser(description="Transcribe a video file using a whisper.cpp server.")
    parser.add_argument("video_path", help="The path to the video file.")
    parser.add_argument("--translate", action="store_true", help="Request translation to English.")
    parser.add_argument(
        "--server-url",
        default=None,
        help="The URL of the whisper.cpp server's inference endpoint "
             "(uses transcribe_video's default when omitted).",
    )

    args = parser.parse_args()

    # Only override the endpoint when one was given, so the default declared on
    # transcribe_video stays the single source of truth.
    overrides = {"server_url": args.server_url} if args.server_url else {}
    transcribe_video(args.video_path, args.translate, **overrides)