Skip to content

Commit e0ddc39

Browse files
committed
Add documents
1 parent 98bf9cb commit e0ddc39

4 files changed

Lines changed: 310 additions & 158 deletions

File tree

python-api-examples/offline-tts.py

Lines changed: 1 addition & 144 deletions
Original file line numberDiff line numberDiff line change
@@ -157,8 +157,6 @@
157157

158158
import argparse
159159
import time
160-
import wave
161-
import numpy as np
162160

163161
import sherpa_onnx
164162
import soundfile as sf
@@ -321,110 +319,6 @@ def add_kitten_args(parser):
321319
)
322320

323321

324-
def add_zipvoice_args(parser):
325-
parser.add_argument(
326-
"--zipvoice-tokens",
327-
type=str,
328-
default="",
329-
help="Path to tokens.txt for Zipvoice models.",
330-
)
331-
332-
parser.add_argument(
333-
"--zipvoice-text-model",
334-
type=str,
335-
default="",
336-
help="Path to zipvoice text model.",
337-
)
338-
339-
parser.add_argument(
340-
"--zipvoice-flow-matching-model",
341-
type=str,
342-
default="",
343-
help="Path to zipvoice flow matching model.",
344-
)
345-
346-
parser.add_argument(
347-
"--zipvoice-data-dir",
348-
type=str,
349-
default="",
350-
help="Path to the dict directory of espeak-ng.",
351-
)
352-
353-
parser.add_argument(
354-
"--zipvoice-pinyin-dict",
355-
type=str,
356-
default="",
357-
help="Path to the pinyin dictionary.",
358-
)
359-
360-
parser.add_argument(
361-
"--zipvoice-vocoder",
362-
type=str,
363-
default="",
364-
help="Path to the vocos vocoder.",
365-
)
366-
367-
parser.add_argument(
368-
"--zipvoice-num-steps",
369-
type=int,
370-
default=4,
371-
help="Number of steps for Zipvoice.",
372-
)
373-
374-
parser.add_argument(
375-
"--zipvoice-feat-scale",
376-
type=float,
377-
default=0.1,
378-
help="Scale factor for Zipvoice features.",
379-
)
380-
381-
parser.add_argument(
382-
"--zipvoice-t-shift",
383-
type=float,
384-
default=0.5,
385-
help="Shift t to smaller ones if t-shift < 1.0.",
386-
)
387-
388-
parser.add_argument(
389-
"--zipvoice-target-rms",
390-
type=float,
391-
default=0.1,
392-
help="Target speech normalization RMS value for Zipvoice.",
393-
)
394-
395-
parser.add_argument(
396-
"--zipvoice-guidance-scale",
397-
type=float,
398-
default=1.0,
399-
help="The scale of classifier-free guidance during inference for Zipvoice.",
400-
)
401-
402-
403-
def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
404-
"""
405-
Args:
406-
wave_filename:
407-
Path to a wave file. It should be single channel and each sample should
408-
be 16-bit. Its sample rate does not need to be 16kHz.
409-
Returns:
410-
Return a tuple containing:
411-
- A 1-D array of dtype np.float32 containing the samples, which are
412-
normalized to the range [-1, 1].
413-
- sample rate of the wave file
414-
"""
415-
416-
with wave.open(wave_filename) as f:
417-
assert f.getnchannels() == 1, f.getnchannels()
418-
assert f.getsampwidth() == 2, f.getsampwidth() # it is in bytes
419-
num_samples = f.getnframes()
420-
samples = f.readframes(num_samples)
421-
samples_int16 = np.frombuffer(samples, dtype=np.int16)
422-
samples_float32 = samples_int16.astype(np.float32)
423-
424-
samples_float32 = samples_float32 / 32768
425-
return samples_float32, f.getframerate()
426-
427-
428322
def get_args():
429323
parser = argparse.ArgumentParser(
430324
formatter_class=argparse.ArgumentDefaultsHelpFormatter
@@ -433,7 +327,6 @@ def get_args():
433327
add_vits_args(parser)
434328
add_matcha_args(parser)
435329
add_kokoro_args(parser)
436-
add_zipvoice_args(parser)
437330
add_kitten_args(parser)
438331

439332
parser.add_argument(
@@ -499,18 +392,6 @@ def get_args():
499392
help="Speech speed. Larger->faster; smaller->slower",
500393
)
501394

502-
parser.add_argument(
503-
"--prompt-text",
504-
type=str,
505-
help="The transcription of prompt audio. Used only for Zipvoice models.",
506-
)
507-
508-
parser.add_argument(
509-
"--prompt-audio",
510-
type=str,
511-
help="The path to prompt audio. Used only for Zipvoice models.",
512-
)
513-
514395
parser.add_argument(
515396
"text",
516397
type=str,
@@ -549,19 +430,6 @@ def main():
549430
dict_dir=args.kokoro_dict_dir,
550431
lexicon=args.kokoro_lexicon,
551432
),
552-
zipvoice=sherpa_onnx.OfflineTtsZipvoiceModelConfig(
553-
tokens=args.zipvoice_tokens,
554-
text_model=args.zipvoice_text_model,
555-
flow_matching_model=args.zipvoice_flow_matching_model,
556-
data_dir=args.zipvoice_data_dir,
557-
pinyin_dict=args.zipvoice_pinyin_dict,
558-
vocoder=args.zipvoice_vocoder,
559-
num_steps=args.zipvoice_num_steps,
560-
feat_scale=args.zipvoice_feat_scale,
561-
t_shift=args.zipvoice_t_shift,
562-
target_rms=args.zipvoice_target_rms,
563-
guidance_scale=args.zipvoice_guidance_scale,
564-
),
565433
kitten=sherpa_onnx.OfflineTtsKittenModelConfig(
566434
model=args.kitten_model,
567435
voices=args.kitten_voices,
@@ -581,18 +449,7 @@ def main():
581449
tts = sherpa_onnx.OfflineTts(tts_config)
582450

583451
start = time.time()
584-
if args.zipvoice_flow_matching_model:
585-
prompt_samples, sample_rate = read_wave(args.prompt_audio)
586-
audio = tts.generate(
587-
args.text,
588-
args.prompt_text,
589-
prompt_samples,
590-
sample_rate,
591-
speed=args.speed,
592-
num_steps=args.zipvoice_num_steps,
593-
)
594-
else:
595-
audio = tts.generate(args.text, sid=args.sid, speed=args.speed)
452+
audio = tts.generate(args.text, sid=args.sid, speed=args.speed)
596453
end = time.time()
597454

598455
if len(audio.samples) == 0:

0 commit comments

Comments
 (0)