Skip to content

Commit 888c98c

Browse files
authored
feat: estimate input tokens before model calls (#2221)
1 parent e12ac9d commit 888c98c

9 files changed

Lines changed: 242 additions & 4 deletions

File tree

src/strands/agent/agent_result.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,15 @@ def context_size(self) -> int | None:
4444
"""
4545
return self.metrics.latest_context_size
4646

47+
@property
48+
def projected_context_size(self) -> int | None:
49+
"""Projected context size for the next model call.
50+
51+
Returns:
52+
The projected token count (inputTokens + outputTokens), or None if no data is available.
53+
"""
54+
return self.metrics.projected_context_size
55+
4756
def __str__(self) -> str:
4857
"""Return a string representation of the agent result.
4958

src/strands/event_loop/event_loop.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,48 @@ def _has_tool_use_in_latest_message(messages: "Messages") -> bool:
7575
return False
7676

7777

78+
async def _estimate_input_tokens(agent: "Agent") -> int:
    """Estimate the input token count for the next model call.

    Reads inputTokens + outputTokens from the last assistant message's usage metadata as a
    known baseline, then estimates only the messages added after it. Falls back to full
    estimation (all messages + tool specs + system prompt) when no usage metadata is
    available (cold start or first call). On cold start, tool specs are resolved lazily so
    that the caller does not need to resolve them before BeforeModelCallEvent.

    Args:
        agent: The agent instance with messages and model.

    Returns:
        Estimated input token count.
    """
    messages = agent.messages

    # Find the last assistant message that carries usage metadata.
    last_assistant_idx = -1
    for i, msg in reversed(list(enumerate(messages))):
        # "metadata" may be absent OR present with an explicit None value;
        # `or {}` guards both before reading "usage".
        if msg.get("role") == "assistant" and (msg.get("metadata") or {}).get("usage"):
            last_assistant_idx = i
            break

    if last_assistant_idx >= 0:
        usage = messages[last_assistant_idx]["metadata"]["usage"]
        known_baseline = usage["inputTokens"] + usage["outputTokens"]
        new_messages = messages[last_assistant_idx + 1 :]
        if not new_messages:
            return known_baseline
        # System prompt and tool spec tokens are already included in the baseline.
        return known_baseline + await agent.model.count_tokens(new_messages)

    # Cold start: resolve tool specs lazily for estimation only.
    tool_specs = agent.tool_registry.get_all_tool_specs()
    return await agent.model.count_tokens(
        messages,
        tool_specs=tool_specs,
        system_prompt=agent.system_prompt,
        system_prompt_content=agent._system_prompt_content,
    )
118+
119+
78120
async def event_loop_cycle(
79121
agent: "Agent",
80122
invocation_state: dict[str, Any],
@@ -325,10 +367,18 @@ async def _handle_model_execution(
325367
)
326368
with trace_api.use_span(model_invoke_span, end_on_exit=False):
327369
try:
370+
# Estimate input tokens for the upcoming model call (non-fatal)
371+
projected_input_tokens: int | None = None
372+
try:
373+
projected_input_tokens = await _estimate_input_tokens(agent)
374+
except Exception as e:
375+
logger.debug("error=<%s> | token estimation failed, proceeding without estimate", e)
376+
328377
await agent.hooks.invoke_callbacks_async(
329378
BeforeModelCallEvent(
330379
agent=agent,
331380
invocation_state=invocation_state,
381+
projected_input_tokens=projected_input_tokens,
332382
)
333383
)
334384

src/strands/hooks/events.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,9 +236,14 @@ class BeforeModelCallEvent(HookEvent):
236236
invocation_state: State and configuration passed through the agent invocation.
237237
This can include shared context for multi-agent coordination, request tracking,
238238
and dynamic configuration.
239+
projected_input_tokens: Projected input token count for the upcoming model call.
240+
Computed by the agent loop from message metadata and token estimation.
241+
Available for hooks and plugins (e.g. conversation managers) to make
242+
proactive decisions about context management. None if estimation failed.
239243
"""
240244

241245
invocation_state: dict[str, Any] = field(default_factory=dict)
246+
projected_input_tokens: int | None = None
242247

243248

244249
@dataclass

src/strands/telemetry/metrics.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,25 @@ def latest_context_size(self) -> int | None:
215215
return self.agent_invocations[-1].cycles[-1].usage.get("inputTokens")
216216
return None
217217

218+
@property
219+
def projected_context_size(self) -> int | None:
220+
"""Projected context size for the next model call.
221+
222+
Computed as inputTokens + outputTokens from the most recent cycle's usage,
223+
representing the approximate input token count for the next model call
224+
(prior input + generated output that is now part of the conversation).
225+
226+
Returns:
227+
The projected token count, or None if no data is available.
228+
"""
229+
if self.agent_invocations and self.agent_invocations[-1].cycles:
230+
usage = self.agent_invocations[-1].cycles[-1].usage
231+
input_tokens = usage.get("inputTokens")
232+
output_tokens = usage.get("outputTokens")
233+
if input_tokens is not None and output_tokens is not None:
234+
return input_tokens + output_tokens
235+
return None
236+
218237
@property
219238
def _metrics_client(self) -> "MetricsClient":
220239
"""Get the singleton MetricsClient instance."""

tests/strands/agent/hooks/test_events.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,3 +260,22 @@ def test_after_invocation_event_resume_accepts_various_input_types(agent):
260260
# None to stop
261261
event.resume = None
262262
assert event.resume is None
263+
264+
265+
def test_before_model_call_event_projected_input_tokens_default(agent):
    """projected_input_tokens should default to None when not supplied."""
    default_event = BeforeModelCallEvent(agent=agent)
    assert default_event.projected_input_tokens is None
269+
270+
271+
def test_before_model_call_event_projected_input_tokens_set(agent):
    """projected_input_tokens passed at construction should be stored as given."""
    constructed_event = BeforeModelCallEvent(agent=agent, projected_input_tokens=500)
    assert constructed_event.projected_input_tokens == 500
275+
276+
277+
def test_before_model_call_event_projected_input_tokens_not_writable(agent):
    """Assigning projected_input_tokens after construction must raise AttributeError."""
    readonly_event = BeforeModelCallEvent(agent=agent, projected_input_tokens=500)
    with pytest.raises(AttributeError, match="Property projected_input_tokens is not writable"):
        readonly_event.projected_input_tokens = 1000

tests/strands/agent/test_agent_hooks.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ def test_agent__call__hooks(agent, hook_provider, agent_tool, mock_model, tool_u
165165
agent=agent,
166166
message=agent.messages[0],
167167
)
168-
assert next(events) == BeforeModelCallEvent(agent=agent, invocation_state=ANY)
168+
assert next(events) == BeforeModelCallEvent(agent=agent, invocation_state=ANY, projected_input_tokens=ANY)
169169
assert next(events) == AfterModelCallEvent(
170170
agent=agent,
171171
invocation_state=ANY,
@@ -195,7 +195,7 @@ def test_agent__call__hooks(agent, hook_provider, agent_tool, mock_model, tool_u
195195
result={"content": [{"text": "!loot a dekovni I"}], "status": "success", "toolUseId": "123"},
196196
)
197197
assert next(events) == MessageAddedEvent(agent=agent, message=agent.messages[2])
198-
assert next(events) == BeforeModelCallEvent(agent=agent, invocation_state=ANY)
198+
assert next(events) == BeforeModelCallEvent(agent=agent, invocation_state=ANY, projected_input_tokens=ANY)
199199
assert next(events) == AfterModelCallEvent(
200200
agent=agent,
201201
invocation_state=ANY,
@@ -239,7 +239,7 @@ async def test_agent_stream_async_hooks(agent, hook_provider, agent_tool, mock_m
239239
agent=agent,
240240
message=agent.messages[0],
241241
)
242-
assert next(events) == BeforeModelCallEvent(agent=agent, invocation_state=ANY)
242+
assert next(events) == BeforeModelCallEvent(agent=agent, invocation_state=ANY, projected_input_tokens=ANY)
243243
assert next(events) == AfterModelCallEvent(
244244
agent=agent,
245245
invocation_state=ANY,
@@ -269,7 +269,7 @@ async def test_agent_stream_async_hooks(agent, hook_provider, agent_tool, mock_m
269269
result={"content": [{"text": "!loot a dekovni I"}], "status": "success", "toolUseId": "123"},
270270
)
271271
assert next(events) == MessageAddedEvent(agent=agent, message=agent.messages[2])
272-
assert next(events) == BeforeModelCallEvent(agent=agent, invocation_state=ANY)
272+
assert next(events) == BeforeModelCallEvent(agent=agent, invocation_state=ANY, projected_input_tokens=ANY)
273273
assert next(events) == AfterModelCallEvent(
274274
agent=agent,
275275
invocation_state=ANY,

tests/strands/agent/test_agent_result.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,3 +384,17 @@ def test_context_size_none_when_no_data(mock_metrics, simple_message: Message):
384384
mock_metrics.latest_context_size = None
385385
result = AgentResult(stop_reason="end_turn", message=simple_message, metrics=mock_metrics, state={})
386386
assert result.context_size is None
387+
388+
389+
def test_projected_context_size_delegates_to_metrics(mock_metrics, simple_message: Message):
    """projected_context_size should mirror metrics.projected_context_size."""
    mock_metrics.projected_context_size = 15000
    agent_result = AgentResult(stop_reason="end_turn", message=simple_message, metrics=mock_metrics, state={})
    assert agent_result.projected_context_size == 15000
394+
395+
396+
def test_projected_context_size_none_when_no_data(mock_metrics, simple_message: Message):
    """projected_context_size should be None when the metrics object has no data."""
    mock_metrics.projected_context_size = None
    agent_result = AgentResult(stop_reason="end_turn", message=simple_message, metrics=mock_metrics, state={})
    assert agent_result.projected_context_size is None

tests/strands/event_loop/test_event_loop.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1198,3 +1198,84 @@ async def test_event_loop_metrics_recorded_before_recursion(
11981198
# Verify the event loop completed successfully
11991199
tru_stop_reason, _, _, _, _, _ = events[-1]["stop"]
12001200
assert tru_stop_reason == "end_turn"
1201+
1202+
1203+
class TestEstimateInputTokens:
    """Tests for _estimate_input_tokens helper."""

    @pytest.mark.asyncio
    async def test_cold_start_estimates_all_messages(self):
        """On cold start (no prior usage metadata), estimates all messages with lazily resolved tool specs."""
        agent = unittest.mock.AsyncMock()
        agent.messages = [{"role": "user", "content": [{"text": "Hi"}]}]
        agent.system_prompt = "You are helpful"
        agent._system_prompt_content = None
        agent.tool_registry = unittest.mock.MagicMock()
        agent.tool_registry.get_all_tool_specs.return_value = [{"name": "tool1"}]
        agent.model.count_tokens = AsyncMock(return_value=42)

        result = await strands.event_loop.event_loop._estimate_input_tokens(agent)

        assert result == 42
        # Cold start must resolve tool specs itself; callers are not required to.
        agent.tool_registry.get_all_tool_specs.assert_called_once()
        agent.model.count_tokens.assert_called_once_with(
            agent.messages,
            tool_specs=[{"name": "tool1"}],
            system_prompt="You are helpful",
            system_prompt_content=None,
        )

    @pytest.mark.asyncio
    async def test_baseline_only_no_new_messages(self):
        """When last message is assistant with usage and no new messages after, returns baseline."""
        agent = unittest.mock.AsyncMock()
        agent.messages = [
            {"role": "user", "content": [{"text": "Hi"}]},
            {
                "role": "assistant",
                "content": [{"text": "Hello"}],
                "metadata": {"usage": {"inputTokens": 100, "outputTokens": 20, "totalTokens": 120}},
            },
        ]
        agent.system_prompt = "You are helpful"

        result = await strands.event_loop.event_loop._estimate_input_tokens(agent)

        # Baseline is inputTokens + outputTokens from the last assistant usage (100 + 20).
        assert result == 120
        # No new messages after the baseline, so no estimation call should happen.
        agent.model.count_tokens.assert_not_called()

    @pytest.mark.asyncio
    async def test_baseline_plus_delta(self):
        """When new messages exist after last assistant, adds estimated delta to baseline."""
        agent = unittest.mock.AsyncMock()
        agent.messages = [
            {"role": "user", "content": [{"text": "Hi"}]},
            {
                "role": "assistant",
                "content": [{"text": "Hello"}],
                "metadata": {"usage": {"inputTokens": 100, "outputTokens": 30, "totalTokens": 130}},
            },
            {"role": "user", "content": [{"text": "tool result"}]},
        ]
        agent.system_prompt = "You are helpful"
        agent.model.count_tokens = AsyncMock(return_value=50)

        result = await strands.event_loop.event_loop._estimate_input_tokens(agent)

        # baseline (100+30) + delta (50) = 180
        assert result == 180
        # Only the trailing new messages should be estimated, in a single call.
        agent.model.count_tokens.assert_called_once()

    @pytest.mark.asyncio
    async def test_error_fallback_returns_none_at_call_site(self):
        """When count_tokens raises, the caller catches and sets projected_input_tokens to None."""
        agent = unittest.mock.AsyncMock()
        agent.messages = [{"role": "user", "content": [{"text": "Hi"}]}]
        agent.system_prompt = "You are helpful"
        agent._system_prompt_content = None
        agent.tool_registry = unittest.mock.MagicMock()
        agent.tool_registry.get_all_tool_specs.return_value = []
        agent.model.count_tokens = AsyncMock(side_effect=Exception("API unavailable"))

        # The helper itself propagates the error; the event-loop call site is
        # responsible for catching it and proceeding without an estimate.
        with pytest.raises(Exception, match="API unavailable"):
            await strands.event_loop.event_loop._estimate_input_tokens(agent)

tests/strands/telemetry/test_metrics.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -613,3 +613,44 @@ def test_latest_context_size_missing_input_tokens_key(event_loop_metrics):
613613
)
614614
)
615615
assert event_loop_metrics.latest_context_size is None
616+
617+
618+
def test_projected_context_size_no_invocations(event_loop_metrics):
    """Without any agent invocations there is nothing to project."""
    assert event_loop_metrics.projected_context_size is None
620+
621+
622+
def test_projected_context_size_invocation_with_no_cycles(event_loop_metrics):
    """An invocation that has recorded no cycles yields None."""
    metrics = event_loop_metrics
    metrics.reset_usage_metrics()
    assert metrics.projected_context_size is None
625+
626+
627+
def test_projected_context_size_returns_input_plus_output(event_loop_metrics, mock_get_meter_provider):
    """A cycle with full usage projects inputTokens + outputTokens."""
    metrics = event_loop_metrics
    metrics.reset_usage_metrics()
    metrics.start_cycle(attributes={"event_loop_cycle_id": "c1"})
    metrics.update_usage(Usage(inputTokens=100, outputTokens=50, totalTokens=150))
    assert metrics.projected_context_size == 150
633+
634+
635+
def test_projected_context_size_updates_across_cycles(event_loop_metrics, mock_get_meter_provider):
    """The projection always tracks the most recent cycle's usage."""
    metrics = event_loop_metrics
    metrics.reset_usage_metrics()
    # Record two cycles; the second one should win.
    for cycle_id, usage in (
        ("c1", Usage(inputTokens=100, outputTokens=50, totalTokens=150)),
        ("c2", Usage(inputTokens=200, outputTokens=80, totalTokens=280)),
    ):
        metrics.start_cycle(attributes={"event_loop_cycle_id": cycle_id})
        metrics.update_usage(usage)
    assert metrics.projected_context_size == 280
644+
645+
646+
def test_projected_context_size_missing_tokens_key(event_loop_metrics):
    """Returns None when usage dict is missing inputTokens or outputTokens."""
    event_loop_metrics.reset_usage_metrics()
    latest_invocation = event_loop_metrics.agent_invocations[-1]
    # Craft a cycle whose usage lacks "inputTokens" entirely.
    incomplete_cycle = strands.telemetry.metrics.EventLoopCycleMetric(
        event_loop_cycle_id="c1",
        usage={"outputTokens": 50, "totalTokens": 50},
    )
    latest_invocation.cycles.append(incomplete_cycle)
    assert event_loop_metrics.projected_context_size is None

0 commit comments

Comments
 (0)