# buffered_rnnt.yaml
# Configuration for the buffered RNNT streaming ASR pipeline.
# ================================
# ASR Configuration
# ================================
asr:
  model_name: nvidia/parakeet-rnnt-1.1b # Pre-trained RNNT/hybrid model from NGC/HuggingFace or local .nemo file path
  device: cuda # Device for inference: 'cuda' or 'cpu'
  device_id: 0 # GPU device ID
  compute_dtype: bfloat16 # Compute precision: 'bfloat16' for Ampere+, 'float16' for older GPUs, or 'float32'
  use_amp: false # Enable Automatic Mixed Precision
  decoding:
    strategy: "greedy_batch" # RNNT decoding strategy
    preserve_alignments: false # Whether to keep frame-level alignment info (uses extra memory)
    fused_batch_size: -1 # Fused batch size for decoding; -1 disables fused batching
    greedy:
      use_cuda_graph_decoder: true # Use CUDA-graph-accelerated greedy decoder
      enable_per_stream_biasing: true # Per-stream biasing in decoder
      max_symbols: 10 # Max symbols emitted per encoder time step in greedy RNNT decoding
      # n-gram LM
      ngram_lm_model: null # Path to a built '.nemo' NGPU-LM model
      ngram_lm_alpha: 0.0 # Weight of the LM model
      # phrase boosting
      boosting_tree:
        model_path: null # The path to a built '.nemo' boosting tree model
        key_phrases_file: null # The path to the context-biasing list file (one phrase per line)
        key_phrases_list: null # The list of context-biasing phrases ['word1', 'word2', 'word3', ...]
        key_phrase_items_list: null # The list of context-biasing phrases with custom fields
        # in CLI: [{phrase:"word1",lang:en},{phrase:"frase dos",lang:es}]
        # in code: [PhraseItem(phrase="word1", lang="en"), PhraseItem(phrase="frase dos", lang="es")]
        source_lang: "en" # The source language of the context-biasing phrases (for aggregate tokenizer),
        # used with `key_phrases_file` and `key_phrases_list`
        boosting_tree_alpha: 0.0 # Weight of the boosting tree during decoding
# ==========================================
# Inverse Text Normalization Configuration
# ==========================================
itn:
  input_case: lower_cased # Input text case handling: 'lower_cased', 'cased'
  whitelist: null # Custom whitelist for ITN processing (null = use the default — TODO confirm)
  overwrite_cache: false # Whether to overwrite existing cache files (e.g., compiled grammars)
  max_number_of_permutations_per_split: 729 # Maximum permutations allowed per text split during ITN processing
  left_padding_size: 4 # Padding size (#spans) for ITN context
  batch_size: 32 # Batch size for ITN inference
  n_jobs: 16 # Number of parallel jobs for ITN processing
# ================================
# Neural Machine Translation Configuration
# ================================
nmt:
  model_name: "utter-project/EuroLLM-1.7B-Instruct" # vLLM-supported model name
  source_language: "English" # Source language (full name, e.g. 'English' — not an ISO code)
  target_language: "Russian" # Target language (full name, e.g. 'Russian' — not an ISO code)
  waitk: -1 # Max allowed lag (in words) between ASR transcript and translation; -1 disables it and uses only the longest common prefix between current and previous translations.
  device: cuda # Device for translation: 'cuda'. 'cpu' is not supported.
  device_id: 1 # GPU device ID for translation
  batch_size: 16 # Batch size for translation, if -1, the batch size is equal to the ASR batch size
  llm_params: # See https://docs.vllm.ai/en/v0.8.1/api/offline_inference/llm.html for more details
    dtype: "auto" # Compute precision
    seed: 42 # The seed to initialize the random number generator for sampling
  sampling_params: # See https://docs.vllm.ai/en/v0.6.4/dev/sampling_params.html for more details
    max_tokens: 100 # Maximum number of tokens to generate with LLM
    temperature: 0.0 # LLM sampling temperature, default for translation is 0 (greedy)
    top_p: 0.9 # The cumulative probability threshold for nucleus sampling
    seed: 42 # Sampling seed. NOTE(review): also set in llm_params above — keep the two in sync
# ========================
# Confidence estimation
# ========================
confidence:
  exclude_blank: true # Exclude blank tokens when calculating confidence
  aggregation: mean # Aggregation method for confidence across time steps
  method_cfg:
    name: entropy # Confidence estimation method: 'max_prob' or 'entropy'
    entropy_type: tsallis # Entropy family used when name is 'entropy' (presumably per NeMo confidence config — confirm supported values)
    alpha: 0.5 # Entropy order/temperature parameter
    entropy_norm: exp # Normalization applied to the entropy value (TODO confirm supported options)
# ========================
# Endpointing settings
# ========================
endpointing:
  stop_history_eou: 800 # Time window (ms) for evaluating EoU (end of utterance)
  residue_tokens_at_end: 2 # Number of residual tokens used for EoU detection
# ========================
# Streaming configuration
# ========================
streaming:
  sample_rate: 16000 # Audio sample rate in Hz
  batch_size: 256 # Number of audio frames per batch (NOTE(review): possibly the number of parallel streams — confirm against pipeline code)
  left_padding_size: 1.6 # Left padding duration in seconds
  right_padding_size: 1.6 # Right padding duration in seconds
  chunk_size: 4.8 # Audio chunk size in seconds
  word_boundary_tolerance: 4 # Tolerance for word boundaries (units unspecified here — TODO confirm frames vs. tokens)
  request_type: feature_buffer # Type of request: frame or feature_buffer
  stateful: true # Whether to use stateful processing
  padding_mode: right # Padding mode: left or right. How to pad frames to match the required buffer length
# ========================
# Pipeline settings
# ========================
matmul_precision: high # Matrix multiplication precision: highest, high, medium (values of torch.set_float32_matmul_precision)
log_level: 20 # Logging level: 0 (NOTSET), 10 (DEBUG), 20 (INFO), 30 (WARNING), 40 (ERROR), 50 (CRITICAL)
pipeline_type: buffered # Pipeline type: buffered, cache_aware
asr_decoding_type: rnnt # Decoding method: ctc or rnnt
# ========================
# Runtime arguments defined at runtime via command line
# ========================
audio_file: null # Path to audio file, directory, or manifest JSON
output_filename: null # Path to output transcription JSON file
output_dir: null # Directory to save time-aligned output
enable_pnc: false # Whether to apply punctuation & capitalization
enable_itn: false # Whether to apply inverse text normalization
enable_nmt: false # Whether to apply neural machine translation
asr_output_granularity: segment # Output granularity: word or segment
cache_dir: null # Directory to store cache (e.g., .far files)
lang: null # Language code for ASR model
return_tail_result: false # Whether to return the tail labels left in the right padded side of the buffer
calculate_wer: true # Whether to calculate WER (word error rate)
calculate_bleu: true # Whether to calculate BLEU score
# ========================
# Metrics
# ========================
metrics:
  asr:
    gt_text_attr_name: text # Manifest attribute name holding the ground-truth transcript
    clean_groundtruth_text: false # Whether to clean ground truth text
    langid: en # Language code for text normalization; only "en" is supported
    use_cer: false # Whether to use character error rate instead of word error rate
    ignore_capitalization: true # Whether to ignore capitalization
    ignore_punctuation: true # Whether to ignore punctuation
    strip_punc_space: false # Whether to strip punctuation and space
  nmt:
    gt_text_attr_name: answer # Manifest attribute name holding the ground-truth translation
    ignore_capitalization: false # Whether to ignore capitalization
    ignore_punctuation: false # Whether to ignore punctuation
    strip_punc_space: false # Whether to strip punctuation and space