-
Notifications
You must be signed in to change notification settings - Fork 3.4k
Expand file tree
/
Copy pathcache_aware_rnnt.yaml
More file actions
97 lines (86 loc) · 5.51 KB
/
cache_aware_rnnt.yaml
File metadata and controls
97 lines (86 loc) · 5.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
---
# NOTE(review): leading indentation was lost in the source this was recovered from;
# the nesting below is reconstructed from the section comments and NeMo's
# RNNTDecodingConfig conventions (ngram_lm_* and boosting_tree under `greedy`,
# entropy params under `confidence.method_cfg`) — verify against the original file.

# ================================
# ASR Configuration
# ================================
asr:
  model_name: stt_en_fastconformer_hybrid_large_streaming_multi  # Pre-trained CTC/hybrid model from NGC/HuggingFace or local .nemo file path
  device: cuda  # Device for inference: 'cuda' or 'cpu'
  device_id: 0  # GPU device ID
  compute_dtype: bfloat16  # Compute precision: 'bfloat16' for Ampere+, 'float16' for older GPUs, or 'float32'
  use_amp: true  # Enable Automatic Mixed Precision
  decoding:
    strategy: "greedy_batch"
    preserve_alignments: false
    fused_batch_size: -1
    greedy:
      use_cuda_graph_decoder: false  # Disabled due to issues with decoding
      max_symbols: 10
      # n-gram LM
      ngram_lm_model: null  # The path to built '.nemo' NGPU-LM model
      ngram_lm_alpha: 0.0  # Weight of the LM model
      # phrase boosting
      boosting_tree:
        model_path: null  # The path to built '.nemo' boosting tree model
        key_phrases_file: null  # The path to the context-biasing list file (one phrase per line)
        key_phrases_list: null  # The list of context-biasing phrases ['word1', 'word2', 'word3', ...]
        source_lang: "en"  # The source language of the context-biasing phrases (for aggregate tokenizer)
      boosting_tree_alpha: 0.0  # Weight of the boosting tree

# ==========================================
# Inverse Text Normalization Configuration
# ==========================================
itn:
  input_case: lower_cased  # Input text case handling: 'lower_cased', 'cased'
  whitelist: null  # Custom whitelist for ITN processing
  overwrite_cache: false  # Whether to overwrite existing cache files
  max_number_of_permutations_per_split: 729  # Maximum permutations allowed per text split during ITN processing
  left_padding_size: 4  # Padding size (#spans) for ITN context
  batch_size: 32  # Batch size for ITN inference
  n_jobs: 16  # Number of parallel jobs for ITN processing

# ========================
# Confidence estimation
# ========================
confidence:
  exclude_blank: true  # Exclude blank tokens when calculating confidence
  aggregation: mean  # Aggregation method for confidence across time steps
  method_cfg:
    name: entropy  # Confidence estimation method: 'max_prob' or 'entropy'
    entropy_type: tsallis  # Entropy variant used when name is 'entropy'
    alpha: 0.5  # Entropy order parameter (used by the tsallis entropy)
    entropy_norm: exp  # Normalization applied to the entropy value

# ========================
# Endpointing settings
# ========================
endpointing:
  stop_history_eou: 800  # Time window (ms) for evaluating EoU
  residue_tokens_at_end: 2  # Number of residual tokens used for EoU

# ========================
# Streaming configuration
# ========================
streaming:
  sample_rate: 16000  # Audio sample rate in Hz
  batch_size: 256  # Number of audio frames per batch
  word_boundary_tolerance: 4  # Tolerance for word boundaries
  att_context_size: [70, 13]  # Attention context size: [70,13], [70,6], [70,1], [70,0]
  use_cache: true  # Whether to use cache for streaming
  use_feat_cache: true  # Whether to cache mel-spec features, set false to re-calculate all mel-spec features in audio buffer
  chunk_size_in_secs: null  # Amount of audio to load for each streaming step, e.g., 0.08s for FastConformer. Set to `null` for using default size equal to 1+lookahead frames.
  request_type: frame  # Type of request: frame, only frame is supported for cache-aware streaming
  num_slots: 1024  # Number of slots in the context manager: must be >= batch_size

# ========================
# Pipeline settings
# ========================
matmul_precision: high  # Matrix multiplication precision: highest, high, medium
log_level: 20  # Logging level: 0 (NOTSET), 10 (DEBUG), 20 (INFO), 30 (WARNING), 40 (ERROR), 50 (CRITICAL)
pipeline_type: cache_aware  # Pipeline type: buffered, cache_aware
asr_decoding_type: rnnt  # Decoding method: ctc or rnnt

# ========================
# Runtime arguments defined at runtime via command line
# ========================
audio_file: null  # Path to audio file, directory, or manifest JSON
output_filename: null  # Path to output transcription JSON file
output_dir: null  # Directory to save time-aligned output
enable_pnc: false  # Whether to apply punctuation & capitalization
enable_itn: false  # Whether to apply inverse text normalization
asr_output_granularity: segment  # Output granularity: word or segment
cache_dir: null  # Directory to store cache (e.g., .far files)
lang: null  # Language code for ASR model
return_tail_result: false  # Whether to return the tail labels left in the right padded side of the buffer