k2-fsa
Lines changed: 1 addition & 144 deletions
diff --git a/python-api-examples/offline-tts.py b/python-api-examples/offline-tts.py
--- a/python-api-examples/offline-tts.py
+++ b/python-api-examples/offline-tts.py
@@ -157,8 +157,6 @@
 
 import argparse
 import time
-import wave
-import numpy as np
 
 import sherpa_onnx
 import soundfile as sf
@@ -321,110 +319,6 @@ def add_kitten_args(parser):
     )
 
 
-def add_zipvoice_args(parser):
-    parser.add_argument(
-        "--zipvoice-tokens",
-        type=str,
-        default="",
-        help="Path to tokens.txt for Zipvoice models.",
-    )
-
-    parser.add_argument(
-        "--zipvoice-text-model",
-        type=str,
-        default="",
-        help="Path to zipvoice text model.",
-    )
-
-    parser.add_argument(
-        "--zipvoice-flow-matching-model",
-        type=str,
-        default="",
-        help="Path to zipvoice flow matching model.",
-    )
-
-    parser.add_argument(
-        "--zipvoice-data-dir",
-        type=str,
-        default="",
-        help="Path to the dict directory of espeak-ng.",
-    )
-
-    parser.add_argument(
-        "--zipvoice-pinyin-dict",
-        type=str,
-        default="",
-        help="Path to the pinyin dictionary.",
-    )
-
-    parser.add_argument(
-        "--zipvoice-vocoder",
-        type=str,
-        default="",
-        help="Path to the vocos vocoder.",
-    )
-
-    parser.add_argument(
-        "--zipvoice-num-steps",
-        type=int,
-        default=4,
-        help="Number of steps for Zipvoice.",
-    )
-
-    parser.add_argument(
-        "--zipvoice-feat-scale",
-        type=float,
-        default=0.1,
-        help="Scale factor for Zipvoice features.",
-    )
-
-    parser.add_argument(
-        "--zipvoice-t-shift",
-        type=float,
-        default=0.5,
-        help="Shift t to smaller ones if t-shift < 1.0.",
-    )
-
-    parser.add_argument(
-        "--zipvoice-target-rms",
-        type=float,
-        default=0.1,
-        help="Target speech normalization RMS value for Zipvoice.",
-    )
-
-    parser.add_argument(
-        "--zipvoice-guidance-scale",
-        type=float,
-        default=1.0,
-        help="The scale classifier-free guidance during inference for for Zipvoice.",
-    )
-
-
-def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
-    """
-    Args:
-      wave_filename:
-        Path to a wave file. It should be single channel and each sample should
-        be 16-bit. Its sample rate does not need to be 16kHz.
-    Returns:
-      Return a tuple containing:
-       - A 1-D array of dtype np.float32 containing the samples, which are
-       normalized to the range [-1, 1].
-       - sample rate of the wave file
-    """
-
-    with wave.open(wave_filename) as f:
-        assert f.getnchannels() == 1, f.getnchannels()
-        assert f.getsampwidth() == 2, f.getsampwidth()  # it is in bytes
-        num_samples = f.getnframes()
-        samples = f.readframes(num_samples)
-        samples_int16 = np.frombuffer(samples, dtype=np.int16)
-        samples_float32 = samples_int16.astype(np.float32)
-
-        samples_float32 = samples_float32 / 32768
-        return samples_float32, f.getframerate()
-
-
 def get_args():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
@@ -433,7 +327,6 @@ def get_args():
     add_vits_args(parser)
     add_matcha_args(parser)
     add_kokoro_args(parser)
-    add_zipvoice_args(parser)
     add_kitten_args(parser)
 
     parser.add_argument(
@@ -499,18 +392,6 @@ def get_args():
         help="Speech speed. Larger->faster; smaller->slower",
     )
 
-    parser.add_argument(
-        "--prompt-text",
-        type=str,
-        help="The transcription of prompt audio. Used only for Zipvoice models.",
-    )
-
-    parser.add_argument(
-        "--prompt-audio",
-        type=str,
-        help="The path to prompt audio. Used only for Zipvoice models.",
-    )
-
     parser.add_argument(
         "text",
         type=str,
@@ -549,19 +430,6 @@ def main():
                 dict_dir=args.kokoro_dict_dir,
                 lexicon=args.kokoro_lexicon,
             ),
-            zipvoice=sherpa_onnx.OfflineTtsZipvoiceModelConfig(
-                tokens=args.zipvoice_tokens,
-                text_model=args.zipvoice_text_model,
-                flow_matching_model=args.zipvoice_flow_matching_model,
-                data_dir=args.zipvoice_data_dir,
-                pinyin_dict=args.zipvoice_pinyin_dict,
-                vocoder=args.zipvoice_vocoder,
-                num_steps=args.zipvoice_num_steps,
-                feat_scale=args.zipvoice_feat_scale,
-                t_shift=args.zipvoice_t_shift,
-                target_rms=args.zipvoice_target_rms,
-                guidance_scale=args.zipvoice_guidance_scale,
-            ),
             kitten=sherpa_onnx.OfflineTtsKittenModelConfig(
                 model=args.kitten_model,
                 voices=args.kitten_voices,
@@ -581,18 +449,7 @@ def main():
     tts = sherpa_onnx.OfflineTts(tts_config)
 
     start = time.time()
-    if args.zipvoice_flow_matching_model:
-        prompt_samples, sample_rate = read_wave(args.prompt_audio)
-        audio = tts.generate(
-            args.text,
-            args.prompt_text,
-            prompt_samples,
-            sample_rate,
-            speed=args.speed,
-            num_steps=args.zipvoice_num_steps,
-        )
-    else:
-        audio = tts.generate(args.text, sid=args.sid, speed=args.speed)
+    audio = tts.generate(args.text, sid=args.sid, speed=args.speed)
     end = time.time()
 
     if len(audio.samples) == 0: