Skip to content
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
82 commits
Select commit Hold shift + click to select a range
b4c33dd
Fused biasing ids in interface
artbataev Nov 17, 2025
0e8f25d
Bugfix
artbataev Nov 17, 2025
066d5d3
Fused model: non-batched reference implementation
artbataev Nov 17, 2025
9d65743
Change api
artbataev Nov 19, 2025
28aca3d
Temporary bugfix
artbataev Nov 20, 2025
fe42d8a
Allow partial hypothesis without dec_state
artbataev Nov 20, 2025
ca2cf4b
Multi-model biasing: implement with RNN-T
artbataev Nov 21, 2025
a290f32
Multi-model biasing: implement for TDT, fix RNN-T
artbataev Nov 21, 2025
e288228
Add and remove boosting model from multi-model: move logic to Hypothesis
artbataev Nov 22, 2025
8d36596
Clean up per-hyp biasing config
artbataev Nov 26, 2025
b8df95e
Auto manage biasing requests in transcribe
artbataev Nov 26, 2025
e74f7b1
Disable CUDA graphs with per-stream biasing
artbataev Nov 26, 2025
558dba2
Support specifying lang per phrase
artbataev Nov 26, 2025
972fc29
Fix customization options
artbataev Nov 26, 2025
352027f
Fix streaming decoding
artbataev Nov 26, 2025
e4e7fc3
Fix streaming decoding
artbataev Nov 26, 2025
ac88aa3
Fix type
artbataev Nov 26, 2025
768e51a
Implement custom requests in nemo inference
artbataev Nov 26, 2025
e24fc99
Merge branch 'main' into vbataev/multi_biasing_models
artbataev Nov 26, 2025
7209d63
Use biasing options from state instead of request
artbataev Nov 26, 2025
736f942
Fix f-strings
artbataev Nov 26, 2025
e5f9c71
Merge branch 'main' into vbataev/multi_biasing_models
artbataev Nov 28, 2025
d8649b7
Expose parameter: enable_per_stream_biasing
artbataev Nov 28, 2025
285ac29
key_phrase_items_list: use parameter in config
artbataev Nov 28, 2025
59139ef
More documentation
artbataev Nov 28, 2025
3680f07
Clean up biasing_cfg management
artbataev Nov 28, 2025
8aea8ad
Specify todo
artbataev Nov 28, 2025
2b20d8e
Clean up
artbataev Nov 28, 2025
4c55a79
Stubs for optimized model
artbataev Nov 28, 2025
5875202
Multi-model: implement adding model
artbataev Dec 1, 2025
1c04627
Multi-model: stubs for advance
artbataev Dec 1, 2025
6840f0f
Multi-model: stubs for advance using Triton
artbataev Dec 1, 2025
494c4c1
Multi-model: implement Triton kernel
artbataev Dec 1, 2025
db84e03
Multi-model: implement advance in Pytorch
artbataev Dec 1, 2025
225a644
Multi-model: use optimized implementation
artbataev Dec 1, 2025
586221e
Multi-model: implement model removal
artbataev Dec 1, 2025
398e4ed
Clean up
artbataev Dec 1, 2025
b9a4ba2
Clean up
artbataev Dec 1, 2025
cce5b26
Fix triton implementation
artbataev Dec 1, 2025
13af31b
Fix flake8 suggestions
artbataev Dec 1, 2025
985ccb4
Merge branch 'main' into vbataev/multi_biasing_models
artbataev Dec 3, 2025
5cfc624
Support CUDA graphs for RNN-T
artbataev Dec 3, 2025
7a00812
Support CUDA graphs for TDT
artbataev Dec 3, 2025
0d44900
Fix TDT
artbataev Dec 3, 2025
dfdb876
Merge branch 'main' into vbataev/multi_biasing_models
artbataev Dec 8, 2025
bfcf441
Fix vocab size
artbataev Dec 9, 2025
fb20279
Fix edge case
artbataev Dec 9, 2025
dac3ef8
Add test
artbataev Dec 9, 2025
4aa7b9d
Remove OneLogger causing issues
artbataev Dec 9, 2025
71638dc
Clean up logging
artbataev Dec 9, 2025
90e2ed3
Fix model removal
artbataev Dec 9, 2025
3c9b2db
Efficient addition and removal of models
artbataev Dec 10, 2025
103c4af
Fix Triton implementation
artbataev Dec 10, 2025
bc80bf5
Fix CUDA graphs
artbataev Dec 10, 2025
01b361c
Clean up
artbataev Dec 10, 2025
0c62cb5
Bugfix
artbataev Dec 10, 2025
f47ebd5
Bugfix
artbataev Dec 10, 2025
bec685c
Clean up
artbataev Dec 10, 2025
b7e78ef
Merge branch 'main' into vbataev/multi_biasing_models
artbataev Dec 10, 2025
28bc0d3
Add docstrings
artbataev Dec 22, 2025
ca58d79
Fix moving fusion models to the device
artbataev Dec 22, 2025
39d73f6
Fix inference mode issues
artbataev Dec 22, 2025
4cae6dd
Merge branch 'main' into vbataev/multi_biasing_models
artbataev Dec 22, 2025
526e0c8
Revert back One Logger callback
artbataev Dec 22, 2025
bf06b3b
Implement memory cache
artbataev Dec 22, 2025
9acac89
Fix model removal
artbataev Dec 22, 2025
a36f3fa
Clean up
artbataev Dec 22, 2025
adf29aa
Add nemo inference test with boosting ground truth
artbataev Dec 22, 2025
5e0bd80
Add test with `asr_model.transcribe`
artbataev Dec 23, 2025
a4e1582
Clean up
artbataev Dec 23, 2025
cc0c3a1
Clean up
artbataev Dec 23, 2025
fad3cad
Clean up decoders
artbataev Dec 23, 2025
8e9a986
Merge branch 'main' into vbataev/multi_biasing_models
artbataev Dec 23, 2025
1908759
Add unit tests
artbataev Dec 23, 2025
d49b342
Fix copyright
artbataev Dec 23, 2025
5589360
Fix tests
artbataev Dec 25, 2025
60f53aa
Support biasing request in manifest for streaming inference scripts
artbataev Jan 12, 2026
6c3981c
Merge branch 'main' into vbataev/multi_biasing_models
artbataev Jan 12, 2026
e33942a
Use `asr_streaming_infer.py` in functional tests. Add test with per-s…
artbataev Jan 12, 2026
e8d0524
Remove redundant WER calculation
artbataev Jan 13, 2026
5470fc7
Clean up. Add docstring
artbataev Jan 13, 2026
9b479f8
Remove unused import
artbataev Jan 13, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions nemo/collections/asr/inference/factory/pipeline_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,12 @@
# limitations under the License.


from typing import Any

import torch
from omegaconf.dictconfig import DictConfig

from nemo.collections.asr.inference.factory.buffered_pipeline_builder import BufferedPipelineBuilder
from nemo.collections.asr.inference.factory.cache_aware_pipeline_builder import CacheAwarePipelineBuilder
from nemo.collections.asr.inference.pipelines.base_pipeline import BasePipeline
from nemo.collections.asr.inference.utils.enums import PipelineType
from nemo.utils import logging

Expand Down Expand Up @@ -54,7 +53,7 @@ def set_log_level(log_level: int) -> None:
logging.setLevel(log_level)

@staticmethod
def build_pipeline(cfg: DictConfig) -> Any:
def build_pipeline(cfg: DictConfig) -> BasePipeline:
"""
Build the pipeline based on the config.
Args:
Expand Down
52 changes: 45 additions & 7 deletions nemo/collections/asr/inference/pipelines/buffered_rnnt_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import math
from typing import TYPE_CHECKING
import numpy as np

import torch
from omegaconf import DictConfig
Expand All @@ -39,6 +40,7 @@
update_punctuation_and_language_tokens_timestamps,
)
from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis as NemoHypothesis
from nemo.utils import logging

if TYPE_CHECKING:
from nemo.collections.asr.inference.itn.inverse_normalizer import AlignmentPreservingInverseNormalizer
Expand Down Expand Up @@ -463,29 +465,58 @@ def stateful_transcribe_step(
states = [self.get_state(request.stream_id) for request in requests]
partial_hypotheses, rnnt_states = [], []
all_rnnt_states_are_none = True
for state in states:
all_multi_biasing_models_empty = True
multi_biasing_ids = np.full([len(states)], fill_value=-1)
for i, state in enumerate(states):
hyp_state = state.hyp_decoding_state
rnnt_states.append(hyp_state)
if hyp_state is not None:
all_rnnt_states_are_none = False
if state.options is not None and state.options.has_biasing_request():
if state.options.biasing_cfg.multi_model_id is not None:
all_multi_biasing_models_empty = False
multi_biasing_ids[i] = state.options.biasing_cfg.multi_model_id
elif state.options.biasing_cfg.auto_manage_multi_model:
state.options.biasing_cfg.add_to_multi_model(
tokenizer=self.asr_model.tokenizer,
biasing_multi_model=self.decoding_computer.biasing_multi_model,
)
assert state.options.biasing_cfg.multi_model_id is not None
all_multi_biasing_models_empty = False
multi_biasing_ids[i] = state.options.biasing_cfg.multi_model_id
else:
logging.warning("Biasing request is not empty, not auto managed and not compiled. Skipping")
if hyp_state is not None or (state.options is not None and state.options.has_biasing_request()):
partial_hypotheses.append(
NemoHypothesis(score=0.0, y_sequence=torch.zeros([0], dtype=torch.long), dec_state=hyp_state)
NemoHypothesis(
score=0.0,
y_sequence=torch.zeros([0], dtype=torch.long),
dec_state=hyp_state,
biasing_cfg=state.options.biasing_cfg,
)
)
rnnt_states.append(hyp_state)
all_rnnt_states_are_none = False
else:
partial_hypotheses.append(None)
rnnt_states.append(None)

batched_rnnt_states = None
if not all_rnnt_states_are_none:
batched_rnnt_states = self.decoding_computer.merge_to_batched_state(rnnt_states)

if all_multi_biasing_models_empty:
multi_biasing_ids = None
else:
multi_biasing_ids = torch.from_numpy(multi_biasing_ids).to(device=enc_lens_chunk.device)

batched_state = None
if self.tokens_per_right_padding > 0:
with torch.inference_mode(), torch.no_grad():
best_hyp_chunk, alignments, batched_state = self.decoding_computer(
encs.transpose(1, 2), enc_lens_chunk, batched_rnnt_states
encs.transpose(1, 2),
enc_lens_chunk,
batched_rnnt_states,
multi_biasing_ids=multi_biasing_ids,
)

# TODO: remove double-decoding
best_hyp = self.asr_model.decode(encs, enc_lens, partial_hypotheses=partial_hypotheses)
if self.tokens_per_right_padding > 0 and batched_state is not None:
for state, rnnt_state in zip(states, self.decoding_computer.split_batched_state(batched_state)):
Expand All @@ -499,6 +530,13 @@ def stateful_transcribe_step(
curr_state.timestamp_offset += self.tokens_per_frame_float
ready_state_ids.update(ready_states)

for request, state in zip(requests, states):
# only the first request contains biasing options; biasing options for the stream are stored in state
if request.is_last and (state.options is not None and state.options.has_biasing_request()):
state.options.biasing_cfg.remove_from_multi_model(
biasing_multi_model=self.decoding_computer.biasing_multi_model
)

def decode_step(self, best_hyp: list, requests: list[Request], states: list[RNNTStreamingState]) -> set:
"""
Perform greedy RNNT decoding to get the best hypothesis and update the state.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@
# limitations under the License.


from dataclasses import dataclass
from dataclasses import dataclass, field
from typing import TypeAlias
from nemo.collections.asr.inference.utils.enums import ASROutputGranularity
from nemo.collections.asr.parts.context_biasing.biasing_multi_model import BiasingRequestItemConfig


@dataclass(slots=True)
Expand All @@ -29,6 +30,7 @@ class ASRRequestOptions:
enable_pnc: bool = None
stop_history_eou: int = None
asr_output_granularity: ASROutputGranularity | str = None
biasing_cfg: BiasingRequestItemConfig = field(default_factory=BiasingRequestItemConfig)

def __post_init__(self) -> None:
"""
Expand Down Expand Up @@ -76,7 +78,11 @@ def augment_with_defaults(
asr_output_granularity=(
default_asr_output_granularity if self.asr_output_granularity is None else self.asr_output_granularity
),
biasing_cfg=self.biasing_cfg,
)

def has_biasing_request(self) -> bool:
    """Return True if this request carries a non-empty biasing (word-boosting) configuration."""
    return not self.biasing_cfg.is_empty()


RequestOptions: TypeAlias = ASRRequestOptions
2 changes: 2 additions & 0 deletions nemo/collections/asr/inference/streaming/state/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ class StreamingState:
Generic state for the streaming ASR pipeline
"""

options: RequestOptions | None

def __init__(self):
"""
Initialize the StreamingState
Expand Down
36 changes: 36 additions & 0 deletions nemo/collections/asr/models/rnnt_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1001,6 +1001,42 @@ def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLo
temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config))
return temporary_datalayer

def _transcribe_on_begin(self, audio, trcfg: TranscribeConfig):
    """Hook run before transcription: register auto-managed biasing requests.

    For every partial hypothesis that carries a biasing config which is
    auto-managed and not yet compiled (no ``multi_model_id``), compile it
    into the decoding computer's multi-model so decoding can use it.
    """
    super()._transcribe_on_begin(audio=audio, trcfg=trcfg)
    # The decoding computer (and its multi-model) only exists for some
    # decoding strategies; fall back to None when the attribute chain breaks.
    try:
        multi_model = self.decoding.decoding.decoding_computer.biasing_multi_model
    except AttributeError:
        multi_model = None
    if multi_model is None or not trcfg.partial_hypothesis:
        return
    for hyp in trcfg.partial_hypothesis:
        if not isinstance(hyp, Hypothesis) or not hyp.has_biasing_request():
            continue
        cfg = hyp.biasing_cfg
        # Only compile configs that opted into auto management and are not
        # already registered (multi_model_id is assigned on registration).
        if cfg.auto_manage_multi_model and cfg.multi_model_id is None:
            cfg.add_to_multi_model(tokenizer=self.tokenizer, biasing_multi_model=multi_model)

def _transcribe_on_end(self, trcfg: TranscribeConfig):
    """Hook run after transcription: deregister auto-managed biasing requests.

    Mirrors ``_transcribe_on_begin``: every auto-managed biasing config found
    on the partial hypotheses is removed from the decoding computer's
    multi-model so resources are freed once transcription finishes.
    """
    super()._transcribe_on_end(trcfg=trcfg)
    # Not every decoding strategy exposes a decoding computer / multi-model.
    try:
        multi_model = self.decoding.decoding.decoding_computer.biasing_multi_model
    except AttributeError:
        multi_model = None
    if multi_model is None or not trcfg.partial_hypothesis:
        return
    for hyp in trcfg.partial_hypothesis:
        if not isinstance(hyp, Hypothesis) or not hyp.has_biasing_request():
            continue
        if hyp.biasing_cfg.auto_manage_multi_model:
            hyp.biasing_cfg.remove_from_multi_model(biasing_multi_model=multi_model)

def on_after_backward(self):
super().on_after_backward()
if self._optim_variational_noise_std > 0 and self.global_step >= self._optim_variational_noise_start:
Expand Down
Loading
Loading