
Commit 9d9072a

zhuohan123 and wanmok authored
Implement prompt logprobs & Batched topk for computing logprobs (#1328)
Co-authored-by: Yunmo Chen <[email protected]>
1 parent 928de46 commit 9d9072a

File tree

14 files changed: +371 −132 lines changed

examples/llm_engine_example.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ def main(args: argparse.Namespace):
     # Test the following prompts.
     test_prompts = [
         ("A robot may not injure a human being",
-         SamplingParams(temperature=0.0)),
+         SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)),
         ("To be or not to be,",
          SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)),
         ("What is the meaning of life?",

tests/async_engine/test_request_tracker.py

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ def test_request_tracker():
     stream_5 = tracker.add_request("5")
     assert tracker.new_requests_event.flag
     tracker.process_request_output(
-        RequestOutput("2", "output", [], [], finished=True))
+        RequestOutput("2", "output", [], [], [], finished=True))
     new, finished = tracker.get_new_and_finished_requests()
     assert not tracker.new_requests_event.flag
     assert len(finished) == 1
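
The extra empty list accounts for a new positional field on RequestOutput. A hedged annotation of the updated call (the argument names are my guess at the new signature, not part of the diff):

from vllm.outputs import RequestOutput

RequestOutput(
    "2",            # request_id
    "output",       # prompt (dummy value in this test)
    [],             # prompt_token_ids
    [],             # prompt_logprobs -- presumably the field added by this commit
    [],             # outputs
    finished=True,
)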

tests/conftest.py

Lines changed: 33 additions & 0 deletions
@@ -107,6 +107,39 @@ def generate_beam_search(
             outputs[i] = (output_ids, output_str)
         return outputs

+    def generate_greedy_logprobs(
+        self,
+        prompts: List[str],
+        max_tokens: int,
+    ) -> List[List[torch.Tensor]]:
+        all_logprobs = []
+        for prompt in prompts:
+            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
+            output = self.model.generate(
+                input_ids.cuda(),
+                use_cache=True,
+                do_sample=False,
+                max_new_tokens=max_tokens,
+                output_hidden_states=True,
+                return_dict_in_generate=True,
+            )
+            seq_logprobs = []
+            for hidden_states in output.hidden_states:
+                last_hidden_states = hidden_states[-1][0]
+                logits = torch.matmul(
+                    last_hidden_states,
+                    self.model.get_output_embeddings().weight.t(),
+                )
+                if self.model.get_output_embeddings().bias is not None:
+                    logits += self.model.get_output_embeddings(
+                    ).bias.unsqueeze(0)
+                logprobs = torch.nn.functional.log_softmax(logits,
+                                                           dim=-1,
+                                                           dtype=torch.float32)
+                seq_logprobs.append(logprobs)
+            all_logprobs.append(seq_logprobs)
+        return all_logprobs
+

 @pytest.fixture
 def hf_runner():
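
How the returned reference tensors line up with tokens is what the new test below relies on. A short sketch, assuming an HfRunner instance hf_model created by the hf_runner fixture:

# ref[p][s] is a log-softmax over the vocabulary at decoding step s of
# prompt p. Step 0 runs the whole prompt, so ref[p][0] has shape
# (prompt_len, vocab_size); every later step has shape (1, vocab_size).
ref = hf_model.generate_greedy_logprobs(["Hello world"], max_tokens=3)

step0 = ref[0][0]
dist_for_prompt_token_1 = step0[0]       # position 0 predicts prompt token 1
dist_for_generated_token_1 = step0[-1]   # last prompt position predicts the first new token
dist_for_generated_token_2 = ref[0][1][-1]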

tests/samplers/test_logprobs.py

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+import pytest
+import torch
+
+from vllm import SamplingParams
+
+MODELS = ["facebook/opt-125m"]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_get_prompt_logprobs(
+    hf_runner,
+    vllm_runner,
+    model,
+    dtype,
+    example_prompts,
+):
+    max_tokens = 5
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_logprobs = hf_model.generate_greedy_logprobs(
+        example_prompts,
+        max_tokens=max_tokens,
+    )
+    del hf_model
+
+    vllm_model = vllm_runner(model, dtype=dtype)
+    vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
+                                          logprobs=5,
+                                          prompt_logprobs=5,
+                                          temperature=0.0)
+    vllm_results = vllm_model.model.generate(
+        example_prompts, sampling_params=vllm_sampling_params)
+
+    # Test whether logprobs are included in the results.
+    for result in vllm_results:
+        assert result.prompt_logprobs is not None
+        assert result.outputs[0].logprobs is not None
+
+    # Test whether prompt logprobs are consistent with HF
+    for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs):
+        # Check prompt logprobs
+        vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:]
+        for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs):
+            for token_id, logprob in vllm_prompt_logprob_dict.items():
+                torch.testing.assert_close(logprob,
+                                           hf_logprob[0][i][token_id].item(),
+                                           atol=1e-2,
+                                           rtol=1e-2)
+        vllm_sample_logprobs = vllm_result.outputs[0].logprobs
+        for i, vllm_sample_logprob_dict in enumerate(vllm_sample_logprobs):
+            for token_id, logprob in vllm_sample_logprob_dict.items():
+                torch.testing.assert_close(logprob,
+                                           hf_logprob[i][-1][token_id].item(),
+                                           atol=1e-2,
+                                           rtol=1e-2)

vllm/config.py

Lines changed: 1 addition & 1 deletion
@@ -143,7 +143,7 @@ def get_head_size(self) -> int:
     def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
         """Returns the number of KV heads per GPU worker."""
         # For GPTBigCode & Falcon:
-        # Note: for falcon, when new_decoder_architecture is True, the
+        # NOTE: for falcon, when new_decoder_architecture is True, the
         # multi_query flag is ignored and we use n_head_kv for the number of
         # KV heads.
         falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]

vllm/engine/llm_engine.py

Lines changed: 13 additions & 7 deletions
@@ -12,8 +12,8 @@
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup,
-                           SequenceGroupMetadata, SequenceOutputs,
-                           SequenceStatus)
+                           SequenceGroupMetadata, SequenceGroupOutputs,
+                           SequenceOutputs, SequenceStatus)
 from vllm.transformers_utils.tokenizer import (detokenize_incrementally,
                                                get_tokenizer)
 from vllm.utils import Counter
@@ -350,9 +350,15 @@ def _check_beam_search_early_stopping(
                 eos_token_id=self.tokenizer.eos_token_id))
         return current_worst_score >= highest_attainable_score

-    def _process_sequence_group_samples(
-            self, seq_group: SequenceGroup,
-            samples: List[SequenceOutputs]) -> None:
+    def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
+                                        outputs: SequenceGroupOutputs) -> None:
+        # Process prompt logprobs
+        prompt_logprobs = outputs.prompt_logprobs
+        if prompt_logprobs is not None:
+            seq_group.prompt_logprobs = prompt_logprobs
+
+        # Process samples
+        samples = outputs.samples
         parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
         existing_finished_seqs = seq_group.get_finished_seqs()
         parent_child_dict = {
@@ -520,8 +526,8 @@ def _process_model_outputs(
             scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]:
         # Update the scheduled sequence groups with the model outputs.
         scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
-        for seq_group, samples in zip(scheduled_seq_groups, output):
-            self._process_sequence_group_samples(seq_group, samples)
+        for seq_group, outputs in zip(scheduled_seq_groups, output):
+            self._process_sequence_group_outputs(seq_group, outputs)

         # Free the finished sequence groups.
         self.scheduler.free_finished_seq_groups()
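
The sampler-side half of this commit (in files not shown on this page) batches the top-k logprob computation instead of looping over sequences in Python. A minimal sketch of the idea with made-up names, not the actual vllm.model_executor.layers.sampler code:

import torch

def batched_topk_logprobs(logits: torch.Tensor, k: int):
    """Compute top-k log-probabilities for a whole batch in one shot.

    logits: (num_query_tokens, vocab_size), stacked across all sequences in
    the step, so a single log_softmax + topk call replaces a per-sequence loop.
    """
    logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float32)
    topk_logprobs, topk_token_ids = logprobs.topk(k, dim=-1)
    # One {token_id: logprob} dict per query token, matching the shape of the
    # prompt_logprobs / sample logprobs entries consumed above.
    return [
        dict(zip(ids.tolist(), vals.tolist()))
        for ids, vals in zip(topk_token_ids, topk_logprobs)
    ]

# Example: 4 query tokens over a toy vocabulary of 10.
print(batched_topk_logprobs(torch.randn(4, 10), k=2))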

vllm/model_executor/layers/attention.py

Lines changed: 1 addition & 1 deletion
@@ -420,7 +420,7 @@ def set_attn_bias(self, input_metadata: InputMetadata,
         # Generates ALiBi mask for each prompt.
         for prompt_len in input_metadata.prompt_lens:
             bias = torch.arange(prompt_len, dtype=dtype)
-            # Note(zhuohan): HF uses
+            # NOTE(zhuohan): HF uses
             # `bias = bias[None, :].repeat(prompt_len, 1)`
             # here. We find that both biases give the same results, but
             # the bias below more accurately follows the original ALiBi
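
The comment contrasts two ways of building the ALiBi position bias: the HF-style bias assigns each key its absolute position j, while the paper-style bias uses the relative offset j - i. They differ only by a constant per query row, which softmax cancels under a causal mask. A toy check of that claim (illustrative only, not the vLLM implementation):

import torch

prompt_len, slope = 5, 0.5          # toy length and ALiBi slope
pos = torch.arange(prompt_len, dtype=torch.float32)

hf_bias = pos[None, :].repeat(prompt_len, 1)   # bias[i, j] = j
rel_bias = pos[None, :] - pos[:, None]         # bias[i, j] = j - i

scores = torch.randn(prompt_len, prompt_len)
mask = torch.triu(torch.full_like(scores, float("-inf")), diagonal=1)

a = torch.softmax(scores + slope * hf_bias + mask, dim=-1)
b = torch.softmax(scores + slope * rel_bias + mask, dim=-1)
torch.testing.assert_close(a, b)   # row-wise constant offsets cancel in softmax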
