157157
158158import argparse
159159import time
160- import wave
161- import numpy as np
162160
163161import sherpa_onnx
164162import soundfile as sf
@@ -321,110 +319,6 @@ def add_kitten_args(parser):
321319 )
322320
323321
def add_zipvoice_args(parser):
    """Register the Zipvoice-specific command-line options on ``parser``.

    Args:
      parser:
        An ``argparse.ArgumentParser`` (or any object exposing a compatible
        ``add_argument`` method). It is modified in place; nothing is
        returned.
    """
    # File/directory options: every one is a string that defaults to "".
    path_options = (
        ("--zipvoice-tokens", "Path to tokens.txt for Zipvoice models."),
        ("--zipvoice-text-model", "Path to zipvoice text model."),
        (
            "--zipvoice-flow-matching-model",
            "Path to zipvoice flow matching model.",
        ),
        ("--zipvoice-data-dir", "Path to the dict directory of espeak-ng."),
        ("--zipvoice-pinyin-dict", "Path to the pinyin dictionary."),
        ("--zipvoice-vocoder", "Path to the vocos vocoder."),
    )
    for flag, help_text in path_options:
        parser.add_argument(flag, type=str, default="", help=help_text)

    # Numeric tuning options: (flag, type, default, help).
    numeric_options = (
        ("--zipvoice-num-steps", int, 4, "Number of steps for Zipvoice."),
        (
            "--zipvoice-feat-scale",
            float,
            0.1,
            "Scale factor for Zipvoice features.",
        ),
        (
            "--zipvoice-t-shift",
            float,
            0.5,
            "Shift t to smaller ones if t-shift < 1.0.",
        ),
        (
            "--zipvoice-target-rms",
            float,
            0.1,
            "Target speech normalization RMS value for Zipvoice.",
        ),
        (
            "--zipvoice-guidance-scale",
            float,
            1.0,
            # Help text fixed: the original read "... inference for for Zipvoice."
            "The scale of classifier-free guidance during inference for Zipvoice.",
        ),
    )
    for flag, value_type, default, help_text in numeric_options:
        parser.add_argument(
            flag,
            type=value_type,
            default=default,
            help=help_text,
        )
401-
402-
def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
    """Read a mono, 16-bit wave file into normalized float32 samples.

    Args:
      wave_filename:
        Path to a wave file. It should be single channel and each sample should
        be 16-bit. Its sample rate does not need to be 16kHz.
    Returns:
      Return a tuple containing:
       - A 1-D array of dtype np.float32 containing the samples, which are
         normalized to the range [-1, 1].
       - sample rate of the wave file
    Raises:
      ValueError:
        If the file is not single channel or its samples are not 16-bit.
    """
    with wave.open(wave_filename) as f:
        # Validate with explicit raises rather than ``assert`` so the checks
        # still run under ``python -O``; otherwise unsupported files would be
        # decoded into garbage instead of being rejected.
        num_channels = f.getnchannels()
        if num_channels != 1:
            raise ValueError(
                f"Expected a single-channel wave file, got {num_channels} channels"
            )

        sample_width = f.getsampwidth()  # it is in bytes
        if sample_width != 2:
            raise ValueError(
                f"Expected 16-bit samples (2 bytes), got {sample_width} bytes"
            )

        sample_rate = f.getframerate()
        raw_bytes = f.readframes(f.getnframes())

    samples_int16 = np.frombuffer(raw_bytes, dtype=np.int16)
    # Dividing by 32768 maps the int16 range onto [-1, 1).
    samples_float32 = samples_int16.astype(np.float32) / 32768
    return samples_float32, sample_rate
426-
427-
428322def get_args ():
429323 parser = argparse .ArgumentParser (
430324 formatter_class = argparse .ArgumentDefaultsHelpFormatter
@@ -433,7 +327,6 @@ def get_args():
433327 add_vits_args (parser )
434328 add_matcha_args (parser )
435329 add_kokoro_args (parser )
436- add_zipvoice_args (parser )
437330 add_kitten_args (parser )
438331
439332 parser .add_argument (
@@ -499,18 +392,6 @@ def get_args():
499392 help = "Speech speed. Larger->faster; smaller->slower" ,
500393 )
501394
502- parser .add_argument (
503- "--prompt-text" ,
504- type = str ,
505- help = "The transcription of prompt audio. Used only for Zipvoice models." ,
506- )
507-
508- parser .add_argument (
509- "--prompt-audio" ,
510- type = str ,
511- help = "The path to prompt audio. Used only for Zipvoice models." ,
512- )
513-
514395 parser .add_argument (
515396 "text" ,
516397 type = str ,
@@ -549,19 +430,6 @@ def main():
549430 dict_dir = args .kokoro_dict_dir ,
550431 lexicon = args .kokoro_lexicon ,
551432 ),
552- zipvoice = sherpa_onnx .OfflineTtsZipvoiceModelConfig (
553- tokens = args .zipvoice_tokens ,
554- text_model = args .zipvoice_text_model ,
555- flow_matching_model = args .zipvoice_flow_matching_model ,
556- data_dir = args .zipvoice_data_dir ,
557- pinyin_dict = args .zipvoice_pinyin_dict ,
558- vocoder = args .zipvoice_vocoder ,
559- num_steps = args .zipvoice_num_steps ,
560- feat_scale = args .zipvoice_feat_scale ,
561- t_shift = args .zipvoice_t_shift ,
562- target_rms = args .zipvoice_target_rms ,
563- guidance_scale = args .zipvoice_guidance_scale ,
564- ),
565433 kitten = sherpa_onnx .OfflineTtsKittenModelConfig (
566434 model = args .kitten_model ,
567435 voices = args .kitten_voices ,
@@ -581,18 +449,7 @@ def main():
581449 tts = sherpa_onnx .OfflineTts (tts_config )
582450
583451 start = time .time ()
584- if args .zipvoice_flow_matching_model :
585- prompt_samples , sample_rate = read_wave (args .prompt_audio )
586- audio = tts .generate (
587- args .text ,
588- args .prompt_text ,
589- prompt_samples ,
590- sample_rate ,
591- speed = args .speed ,
592- num_steps = args .zipvoice_num_steps ,
593- )
594- else :
595- audio = tts .generate (args .text , sid = args .sid , speed = args .speed )
452+ audio = tts .generate (args .text , sid = args .sid , speed = args .speed )
596453 end = time .time ()
597454
598455 if len (audio .samples ) == 0 :
0 commit comments