# SPDX-License-Identifier: Apache-2.0

import json
import os
from dataclasses import dataclass
from typing import TYPE_CHECKING

import torch

from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
from vllm.utils import LazyLoader
from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
                                                     StructuredOutputGrammar,
                                                     StructuredOutputOptions)

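# Import llguidance lazily: the real imports run only for type checking; at
# runtime, LazyLoader defers each module import until its first attribute
# access, so vLLM can start without llguidance installed.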
if TYPE_CHECKING:
    import llguidance
    import llguidance.hf as llguidance_hf
    import llguidance.torch as llguidance_torch
else:
    llguidance = LazyLoader("llguidance", globals(), "llguidance")
    llguidance_hf = LazyLoader("llguidance.hf", globals(), "llguidance.hf")
    llguidance_torch = LazyLoader("llguidance.torch", globals(),
                                  "llguidance.torch")

logger = init_logger(__name__)


class GuidanceBackend(StructuredOutputBackend):

    def __init__(self, vllm_config: VllmConfig):
        self.vllm_config = vllm_config
        self.vocab_size = vllm_config.model_config.get_vocab_size()

        tokenizer_group = init_tokenizer_from_configs(
            model_config=vllm_config.model_config,
            scheduler_config=vllm_config.scheduler_config,
            parallel_config=vllm_config.parallel_config,
            lora_config=vllm_config.lora_config)  # type: ignore[arg-type]
        tokenizer_group.ping()

        tokenizer = tokenizer_group.get_lora_tokenizer(None)
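        # Wrap the Hugging Face tokenizer in an llguidance tokenizer so the
        # interpreter can reason about this model's token ids.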
        self.ll_tokenizer = llguidance_hf.from_tokenizer(tokenizer, None)

    def compile_grammar(self, request_type: StructuredOutputOptions,
                        grammar_spec: str) -> StructuredOutputGrammar:
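        # grammar_spec examples (illustrative): a JSON schema string for
        # JSON; ignored for JSON_OBJECT; a regex such as r"\d{3}-\d{4}" for
        # REGEX (CHOICE arrives as a regex of alternatives, e.g.
        # "red|green|blue"); a serialized llguidance grammar for GRAMMAR.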

        if request_type == StructuredOutputOptions.JSON:
            if isinstance(grammar_spec, dict):
                schema = json.dumps(grammar_spec)
            else:
                schema = str(grammar_spec)

            # TODO: make whitespace_flexible configurable
            compiler = llguidance.JsonCompiler(whitespace_flexible=False)
            serialized_grammar = compiler.compile(schema)
        elif request_type == StructuredOutputOptions.JSON_OBJECT:
            compiler = llguidance.JsonCompiler(whitespace_flexible=False)
            serialized_grammar = compiler.compile('{"type": "object"}')
        elif (request_type == StructuredOutputOptions.REGEX
              or request_type == StructuredOutputOptions.CHOICE):
            compiler = llguidance.RegexCompiler()
            serialized_grammar = compiler.compile(regex=grammar_spec)
        elif request_type == StructuredOutputOptions.GRAMMAR:
            if isinstance(grammar_spec, dict):
                serialized_grammar = json.dumps(grammar_spec)
            else:
                serialized_grammar = str(grammar_spec)
        else:
            logger.error(
                "Validation should have already occurred. Please file an issue."
            )
            raise ValueError(
                f"grammar is not of a supported type ({request_type!s})")

        ll_interpreter = llguidance.LLInterpreter(
            self.ll_tokenizer,
            serialized_grammar,
            enable_backtrack=False,
            enable_ff_tokens=False,
            log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")),
        )

        return GuidanceGrammar(
            ll_interpreter=ll_interpreter,
            ll_tokenizer=self.ll_tokenizer,
            vocab_size=self.vocab_size,
        )

    def allocate_token_bitmask(self, max_num_seqs: int):
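        # One row per sequence; each row packs the allow/deny bits for the
        # vocabulary into int32 words, giving a tensor of shape
        # (max_num_seqs, ceil(vocab_size / 32)).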
        return llguidance_torch.allocate_token_bitmask(
            max_num_seqs, self.ll_tokenizer.vocab_size)


@dataclass
class GuidanceGrammar(StructuredOutputGrammar):

    ll_interpreter: llguidance.LLInterpreter
    ll_tokenizer: llguidance.LLTokenizer
    vocab_size: int
    stopped: bool = False

    def accept_tokens(self, request_id: str, tokens: list[int]) -> bool:
        """Accepts a list of tokens and advances the FSM.

        Returns True if the FSM was advanced successfully.
        Returns False if the FSM failed to advance.
        """

        if self.stopped:
            # The grammar has already finished; accept any trailing tokens
            # (e.g. EOS) without advancing the interpreter.
            return True

        for token in tokens:
            # TODO - Add jump decoding support in the future.
            # For now we turn this off when creating the LLInterpreter, so
            # commit_token() never asks us to backtrack or fast-forward:
            # backtrack, ff_tokens = self.ll_interpreter.commit_token(token)
            self.ll_interpreter.commit_token(token)

        return True

    def fill_bitmask(self, bitmask: torch.Tensor, idx: int) -> None:
        if self.ll_interpreter.has_pending_stop():
            # The grammar is about to finish: allow only the EOS token in
            # this row of the bitmask before is_terminated() returns True.
            eos_token = self.ll_tokenizer.eos_token
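            # Each int32 in the bitmask packs the allow bits for 32 token
            # ids: zero the row, then set only the bit for EOS
            # (word eos_token // 32, bit eos_token % 32).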
            bitmask[idx, :] = 0
            bitmask[idx, eos_token // 32] = 1 << (eos_token % 32)
            self.stopped = True
        else:
            llguidance_torch.fill_next_token_bitmask(self.ll_interpreter,
                                                     bitmask, idx)

    def is_terminated(self) -> bool:
        return self.stopped

    def reset(self):
        # TODO: this method may no longer be needed.
        pass
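

# Illustrative usage sketch (assumptions: a populated VllmConfig named
# `vllm_config`, llguidance installed, and `token_id` produced by the
# engine's sampler; this is not part of this module's API):
#
#   backend = GuidanceBackend(vllm_config)
#   grammar = backend.compile_grammar(StructuredOutputOptions.JSON_OBJECT, "")
#   bitmask = backend.allocate_token_bitmask(max_num_seqs=1)
#   grammar.fill_bitmask(bitmask, idx=0)  # constrain the next sampling step
#   grammar.accept_tokens("req-0", [token_id])
#   done = grammar.is_terminated()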