# buffered_rnnt.yaml
# Configuration for the buffered RNNT streaming ASR pipeline.
# ================================
# ASR Configuration
# ================================
asr:
  model_name: nvidia/parakeet-rnnt-1.1b # Pre-trained RNNT/hybrid model from NGC/HuggingFace or local .nemo file path
  device: cuda # Device for inference: 'cuda' or 'cpu'
  device_id: 0 # GPU device ID
  compute_dtype: bfloat16 # Compute precision: 'bfloat16' for Ampere+, 'float16' for older GPUs, or 'float32'
  use_amp: false # Enable Automatic Mixed Precision
  decoding:
    strategy: "greedy_batch" # RNNT decoding strategy
    preserve_alignments: false # Whether to keep frame-level alignment info (uses extra memory)
    fused_batch_size: -1 # Fused batch size for decoding; -1 disables fused batching
    greedy:
      use_cuda_graph_decoder: true # Use CUDA-graph-accelerated greedy decoder
      enable_per_stream_biasing: true # Per-stream biasing in decoder
      max_symbols: 10 # Max symbols emitted per encoder time step in greedy RNNT decoding
      # n-gram LM
      ngram_lm_model: null # Path to a built '.nemo' NGPU-LM model
      ngram_lm_alpha: 0.0 # Weight of the LM model
      # phrase boosting
      boosting_tree:
        model_path: null # The path to a built '.nemo' boosting tree model
        key_phrases_file: null # The path to the context-biasing list file (one phrase per line)
        key_phrases_list: null # The list of context-biasing phrases ['word1', 'word2', 'word3', ...]
        key_phrase_items_list: null # The list of context-biasing phrases with custom fields
        # in CLI: [{phrase:"word1",lang:en},{phrase:"frase dos",lang:es}]
        # in code: [PhraseItem(phrase="word1", lang="en"), PhraseItem(phrase="frase dos", lang="es")]
        source_lang: "en" # The source language of the context-biasing phrases (for aggregate tokenizer),
        # used with `key_phrases_file` and `key_phrases_list`
        boosting_tree_alpha: 0.0 # Weight of the boosting tree during decoding
# ==========================================
# Inverse Text Normalization Configuration
# ==========================================
itn:
  input_case: lower_cased # Input text case handling: 'lower_cased', 'cased'
  whitelist: null # Custom whitelist for ITN processing (null = use the default — TODO confirm)
  overwrite_cache: false # Whether to overwrite existing cache files (e.g., compiled grammars)
  max_number_of_permutations_per_split: 729 # Maximum permutations allowed per text split during ITN processing
  left_padding_size: 4 # Padding size (#spans) for ITN context
  batch_size: 32 # Batch size for ITN inference
  n_jobs: 16 # Number of parallel jobs for ITN processing
# ================================
# Neural Machine Translation Configuration
# ================================
nmt:
  model_name: "utter-project/EuroLLM-1.7B-Instruct" # vLLM-supported model name
  source_language: "English" # Source language (full name, e.g. 'English' — not an ISO code)
  target_language: "Russian" # Target language (full name, e.g. 'Russian' — not an ISO code)
  waitk: -1 # Max allowed lag (in words) between ASR transcript and translation; -1 disables it and uses only the longest common prefix between current and previous translations.
  device: cuda # Device for translation: 'cuda'. 'cpu' is not supported.
  device_id: 1 # GPU device ID for translation
  batch_size: 16 # Batch size for translation, if -1, the batch size is equal to the ASR batch size
  llm_params: # See https://docs.vllm.ai/en/v0.8.1/api/offline_inference/llm.html for more details
    dtype: "auto" # Compute precision
    seed: 42 # The seed to initialize the random number generator for sampling
  sampling_params: # See https://docs.vllm.ai/en/v0.6.4/dev/sampling_params.html for more details
    max_tokens: 100 # Maximum number of tokens to generate with LLM
    temperature: 0.0 # LLM sampling temperature, default for translation is 0 (greedy)
    top_p: 0.9 # The cumulative probability threshold for nucleus sampling
    seed: 42 # Sampling seed. NOTE(review): also set in llm_params above — keep the two in sync
# ========================
# Confidence estimation
# ========================
confidence:
  exclude_blank: true # Exclude blank tokens when calculating confidence
  aggregation: mean # Aggregation method for confidence across time steps
  method_cfg:
    name: entropy # Confidence estimation method: 'max_prob' or 'entropy'
    entropy_type: tsallis # Entropy family used when name is 'entropy' (presumably per NeMo confidence config — confirm supported values)
    alpha: 0.5 # Entropy order/temperature parameter
    entropy_norm: exp # Normalization applied to the entropy value (TODO confirm supported options)
# ========================
# Endpointing settings
# ========================
endpointing:
  stop_history_eou: 800 # Time window (ms) for evaluating EoU (end of utterance)
  residue_tokens_at_end: 2 # Number of residual tokens used for EoU detection
# ========================
# Streaming configuration
# ========================
streaming:
  sample_rate: 16000 # Audio sample rate in Hz
  batch_size: 256 # Number of audio frames per batch (NOTE(review): possibly the number of parallel streams — confirm against pipeline code)
  left_padding_size: 1.6 # Left padding duration in seconds
  right_padding_size: 1.6 # Right padding duration in seconds
  chunk_size: 4.8 # Audio chunk size in seconds
  word_boundary_tolerance: 4 # Tolerance for word boundaries (units unspecified here — TODO confirm frames vs. tokens)
  request_type: feature_buffer # Type of request: frame or feature_buffer
  stateful: true # Whether to use stateful processing
  padding_mode: right # Padding mode: left or right. How to pad frames to match the required buffer length
# ========================
# Pipeline settings
# ========================
matmul_precision: high # Matrix multiplication precision: highest, high, medium (values of torch.set_float32_matmul_precision)
log_level: 20 # Logging level: 0 (NOTSET), 10 (DEBUG), 20 (INFO), 30 (WARNING), 40 (ERROR), 50 (CRITICAL)
pipeline_type: buffered # Pipeline type: buffered, cache_aware
asr_decoding_type: rnnt # Decoding method: ctc or rnnt
# ========================
# Runtime arguments defined at runtime via command line
# ========================
audio_file: null # Path to audio file, directory, or manifest JSON
output_filename: null # Path to output transcription JSON file
output_dir: null # Directory to save time-aligned output
enable_pnc: false # Whether to apply punctuation & capitalization
enable_itn: false # Whether to apply inverse text normalization
enable_nmt: false # Whether to apply neural machine translation
asr_output_granularity: segment # Output granularity: word or segment
cache_dir: null # Directory to store cache (e.g., .far files)
lang: null # Language code for ASR model
return_tail_result: false # Whether to return the tail labels left in the right padded side of the buffer
calculate_wer: true # Whether to calculate WER (word error rate)
calculate_bleu: true # Whether to calculate BLEU score
# ========================
# Metrics
# ========================
metrics:
  asr:
    gt_text_attr_name: text # Manifest attribute name holding the ground-truth transcript
    clean_groundtruth_text: false # Whether to clean ground truth text
    langid: en # Language code for text normalization; only "en" is supported
    use_cer: false # Whether to use character error rate instead of word error rate
    ignore_capitalization: true # Whether to ignore capitalization
    ignore_punctuation: true # Whether to ignore punctuation
    strip_punc_space: false # Whether to strip punctuation and space
  nmt:
    gt_text_attr_name: answer # Manifest attribute name holding the ground-truth translation
    ignore_capitalization: false # Whether to ignore capitalization
    ignore_punctuation: false # Whether to ignore punctuation
    strip_punc_space: false # Whether to strip punctuation and space