@@ -158,10 +158,18 @@ def encode_audio_stream(path, ffmpeg_path, audio_stream_language=None):
158
158
# Use the ISO 639-2 code if available
159
159
audio_stream_language = get_ISO_639_2_code (audio_stream_language )
160
160
logger .debug (f"Whisper will use the '{ audio_stream_language } ' audio stream for { path } " )
161
- inp = inp [f'a:m:language:{ audio_stream_language } ' ]
162
-
163
- out , _ = inp .output ("-" , format = "s16le" , acodec = "pcm_s16le" , ac = 1 , ar = 16000 , af = "aresample=async=1" ) \
164
- .run (cmd = [ffmpeg_path , "-nostdin" ], capture_stdout = True , capture_stderr = True )
161
+ # 0 = Pick first stream in case there are multiple language streams of the same language,
162
+ # otherwise ffmpeg will try to combine multiple streams, but our output format doesn't support that.
163
+ # The first stream is probably the correct one, as later streams are usually commentaries
164
+ lang_map = f"0:m:language:{ audio_stream_language } "
165
+ else :
166
+ # there is only one stream, so just use that one
167
+ lang_map = ""
168
+ out , _ = (
169
+ inp .output ("-" , format = "s16le" , acodec = "pcm_s16le" , ac = 1 , ar = 16000 , af = "aresample=async=1" )
170
+ .global_args ("-map" , lang_map )
171
+ .run (cmd = [ffmpeg_path , "-nostdin" ], capture_stdout = True , capture_stderr = True )
172
+ )
165
173
166
174
except ffmpeg .Error as e :
167
175
logger .warning (f"ffmpeg failed to load audio: { e .stderr .decode ()} " )
0 commit comments