Commit c24bddb

feat: add fixes to anthropic and bump version (#2427)
2 parents: e278384 + bed8572

22 files changed: +824 −404 lines

letta/__init__.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -1,5 +1,4 @@
-__version__ = "0.6.23"
-
+__version__ = "0.6.24"
 
 # import clients
 from letta.client.client import LocalClient, RESTClient, create_client
```

letta/agent.py

Lines changed: 6 additions & 4 deletions
```diff
@@ -447,8 +447,6 @@ def _handle_ai_response(
         function_call = (
             response_message.function_call if response_message.function_call is not None else response_message.tool_calls[0].function
         )
-
-        # Get the name of the function
         function_name = function_call.name
         self.logger.info(f"Request to call function {function_name} with tool_call_id: {tool_call_id}")
 
@@ -461,7 +459,9 @@ def _handle_ai_response(
         if not target_letta_tool:
             error_msg = f"No function named {function_name}"
             function_response = "None"  # more like "never ran?"
-            messages = self._handle_function_error_response(error_msg, tool_call_id, function_name, function_args, function_response, messages)
+            messages = self._handle_function_error_response(
+                error_msg, tool_call_id, function_name, function_args, function_response, messages
+            )
             return messages, False, True  # force a heartbeat to allow agent to handle error
 
         # Failure case 2: function name is OK, but function args are bad JSON
@@ -471,7 +471,9 @@ def _handle_ai_response(
         except Exception:
             error_msg = f"Error parsing JSON for function '{function_name}' arguments: {function_call.arguments}"
             function_response = "None"  # more like "never ran?"
-            messages = self._handle_function_error_response(error_msg, tool_call_id, function_name, function_args, function_response, messages)
+            messages = self._handle_function_error_response(
+                error_msg, tool_call_id, function_name, function_args, function_response, messages
+            )
             return messages, False, True  # force a heartbeat to allow agent to handle error
 
         # Check if inner thoughts is in the function call arguments (possible apparently if you are using Azure)
```
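Both reformatted call sites keep the same error-handling contract: the failure is recorded as the tool's "response" and the step returns with a forced heartbeat so the agent gets another turn to recover. A minimal sketch of that pattern, using a hypothetical helper rather than letta's actual `_handle_function_error_response`:

```python
from typing import List, Tuple


def handle_bad_tool_call(messages: List[dict], tool_call_id: str, error_msg: str) -> Tuple[List[dict], bool, bool]:
    """Package a failed tool call as a tool response and request a heartbeat."""
    messages.append(
        {
            "role": "tool",
            "tool_call_id": tool_call_id,
            "content": f"Error: {error_msg}",
        }
    )
    # Return shape mirrors the diff: (messages, function_ran, request_heartbeat)
    return messages, False, True


# Example: the model asked for a tool that does not exist
history: List[dict] = []
history, ran, heartbeat = handle_bad_tool_call(history, "call_123", "No function named send_emailz")
assert not ran and heartbeat
```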

letta/client/streaming.py

Lines changed: 39 additions & 44 deletions
```diff
@@ -17,48 +17,45 @@
 
 
 def _sse_post(url: str, data: dict, headers: dict) -> Generator[Union[LettaStreamingResponse, ChatCompletionChunk], None, None]:
-
-    with httpx.Client() as client:
+    """
+    Sends an SSE POST request and yields parsed response chunks.
+    """
+    # TODO: Please note this is a very generous timeout for e2b reasons
+    with httpx.Client(timeout=httpx.Timeout(5 * 60.0, read=5 * 60.0)) as client:
         with connect_sse(client, method="POST", url=url, json=data, headers=headers) as event_source:
 
-            # Inspect for errors before iterating (see https://github.com/florimondmanca/httpx-sse/pull/12)
+            # Check for immediate HTTP errors before processing the SSE stream
             if not event_source.response.is_success:
-                # handle errors
-                pass
-
-                logger.warning("Caught error before iterating SSE request:", vars(event_source.response))
-                logger.warning(event_source.response.read().decode("utf-8"))
+                response_bytes = event_source.response.read()
+                logger.warning(f"SSE request error: {vars(event_source.response)}")
+                logger.warning(response_bytes.decode("utf-8"))
 
                 try:
-                    response_bytes = event_source.response.read()
                     response_dict = json.loads(response_bytes.decode("utf-8"))
-                    # e.g.: This model's maximum context length is 8192 tokens. However, your messages resulted in 8198 tokens (7450 in the messages, 748 in the functions). Please reduce the length of the messages or functions.
-                    if (
-                        "error" in response_dict
-                        and "message" in response_dict["error"]
-                        and OPENAI_CONTEXT_WINDOW_ERROR_SUBSTRING in response_dict["error"]["message"]
-                    ):
-                        logger.error(response_dict["error"]["message"])
-                        raise LLMError(response_dict["error"]["message"])
+                    error_message = response_dict.get("error", {}).get("message", "")
+
+                    if OPENAI_CONTEXT_WINDOW_ERROR_SUBSTRING in error_message:
+                        logger.error(error_message)
+                        raise LLMError(error_message)
                 except LLMError:
                     raise
-                except:
-                    logger.error(f"Failed to parse SSE message, throwing SSE HTTP error up the stack")
+                except Exception:
+                    logger.error("Failed to parse SSE message, raising HTTP error")
                     event_source.response.raise_for_status()
 
             try:
                 for sse in event_source.iter_sse():
-                    # if sse.data == OPENAI_SSE_DONE:
-                    # print("finished")
-                    # break
-                    if sse.data in [status.value for status in MessageStreamStatus]:
-                        # break
+                    if sse.data in {status.value for status in MessageStreamStatus}:
                         yield MessageStreamStatus(sse.data)
+                        if sse.data == MessageStreamStatus.done.value:
+                            # We received the [DONE], so stop reading the stream.
+                            break
                     else:
                         chunk_data = json.loads(sse.data)
+
                         if "reasoning" in chunk_data:
                             yield ReasoningMessage(**chunk_data)
-                        elif "message_type" in chunk_data and chunk_data["message_type"] == "assistant_message":
+                        elif chunk_data.get("message_type") == "assistant_message":
                             yield AssistantMessage(**chunk_data)
                         elif "tool_call" in chunk_data:
                             yield ToolCallMessage(**chunk_data)
@@ -67,33 +64,31 @@ def _sse_post(url: str, data: dict, headers: dict) -> Generator[Union[LettaStreamingResponse, ChatCompletionChunk], None, None]:
                         elif "step_count" in chunk_data:
                             yield LettaUsageStatistics(**chunk_data)
                         elif chunk_data.get("object") == get_args(ChatCompletionChunk.__annotations__["object"])[0]:
-                            yield ChatCompletionChunk(**chunk_data)  # Add your processing logic for chat chunks here
+                            yield ChatCompletionChunk(**chunk_data)
                         else:
                             raise ValueError(f"Unknown message type in chunk_data: {chunk_data}")
 
             except SSEError as e:
-                logger.error("Caught an error while iterating the SSE stream:", str(e))
-                if "application/json" in str(e):  # Check if the error is because of JSON response
-                    # TODO figure out a better way to catch the error other than re-trying with a POST
-                    response = client.post(url=url, json=data, headers=headers)  # Make the request again to get the JSON response
-                    if response.headers["Content-Type"].startswith("application/json"):
-                        error_details = response.json()  # Parse the JSON to get the error message
-                        logger.error("Request:", vars(response.request))
-                        logger.error("POST Error:", error_details)
-                        logger.error("Original SSE Error:", str(e))
+                logger.error(f"SSE stream error: {e}")
+
+                if "application/json" in str(e):
+                    response = client.post(url=url, json=data, headers=headers)
+
+                    if response.headers.get("Content-Type", "").startswith("application/json"):
+                        error_details = response.json()
+                        logger.error(f"POST Error: {error_details}")
                     else:
                         logger.error("Failed to retrieve JSON error message via retry.")
-                else:
-                    logger.error("SSEError not related to 'application/json' content type.")
 
-                # Optionally re-raise the exception if you need to propagate it
                 raise e
 
             except Exception as e:
-                if event_source.response.request is not None:
-                    logger.error("HTTP Request:", vars(event_source.response.request))
-                if event_source.response is not None:
-                    logger.error("HTTP Status:", event_source.response.status_code)
-                    logger.error("HTTP Headers:", event_source.response.headers)
-                logger.error("Exception message:", str(e))
+                logger.error(f"Unexpected exception: {e}")
+
+                if event_source.response.request:
+                    logger.error(f"HTTP Request: {vars(event_source.response.request)}")
+                if event_source.response:
+                    logger.error(f"HTTP Status: {event_source.response.status_code}")
+                    logger.error(f"HTTP Headers: {event_source.response.headers}")
+
                 raise e
```
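For context, this is the consumption pattern the rewritten `_sse_post` implements: POST with a long read timeout, check the response status, then iterate server-sent events until a terminal sentinel. A stripped-down sketch using `httpx` and `httpx-sse`, where the endpoint, payload, and `[DONE]` sentinel are illustrative rather than letta's exact protocol:

```python
import json

import httpx
from httpx_sse import connect_sse


def stream_chunks(url: str, payload: dict):
    # Mirror the generous 5-minute timeout added in this commit
    with httpx.Client(timeout=httpx.Timeout(5 * 60.0, read=5 * 60.0)) as client:
        with connect_sse(client, method="POST", url=url, json=payload) as event_source:
            event_source.response.raise_for_status()
            for sse in event_source.iter_sse():
                if sse.data == "[DONE]":  # illustrative end-of-stream sentinel
                    break
                yield json.loads(sse.data)


# Usage (hypothetical endpoint):
# for chunk in stream_chunks("http://localhost:8283/v1/agents/agent-1/messages/stream", {"text": "hi"}):
#     print(chunk)
```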

letta/constants.py

Lines changed: 0 additions & 3 deletions
```diff
@@ -51,9 +51,6 @@
 BASE_MEMORY_TOOLS = ["core_memory_append", "core_memory_replace"]
 # Multi agent tools
 MULTI_AGENT_TOOLS = ["send_message_to_agent_and_wait_for_reply", "send_message_to_agents_matching_all_tags", "send_message_to_agent_async"]
-MULTI_AGENT_SEND_MESSAGE_MAX_RETRIES = 3
-MULTI_AGENT_SEND_MESSAGE_TIMEOUT = 20 * 60
-MULTI_AGENT_CONCURRENT_SENDS = 15
 
 # The name of the tool used to send message to the user
 # May not be relevant in cases where the agent has multiple ways to message to user (send_imessage, send_discord_message, ...)
```

letta/llm_api/anthropic.py

Lines changed: 73 additions & 8 deletions
```diff
@@ -19,6 +19,8 @@
 
 from letta.errors import BedrockError, BedrockPermissionError
 from letta.llm_api.aws_bedrock import get_bedrock_client
+from letta.llm_api.helpers import add_inner_thoughts_to_functions
+from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION
 from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages
 from letta.schemas.message import Message as _Message
 from letta.schemas.message import MessageRole as _MessageRole
```
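The two new imports support the `put_inner_thoughts_in_kwargs` mode added below: instead of emitting chain-of-thought as `<thinking>` text in the content block, every tool schema gains an extra required parameter that carries the reasoning. Roughly what that transformation does, as a sketch under assumed names (letta's `add_inner_thoughts_to_functions` lives in `letta/llm_api/helpers.py` and may differ in detail):

```python
import copy

INNER_THOUGHTS_KWARG = "inner_thoughts"  # key name assumed for illustration


def add_inner_thoughts(functions: list, description: str) -> list:
    """Add a required inner-thoughts parameter to each tool's JSON schema."""
    out = []
    for fn in functions:
        fn = copy.deepcopy(fn)
        params = fn.setdefault("parameters", {"type": "object", "properties": {}, "required": []})
        params.setdefault("properties", {})[INNER_THOUGHTS_KWARG] = {
            "type": "string",
            "description": description,
        }
        # List the kwarg first so the model emits its reasoning before the real arguments
        required = [r for r in params.get("required", []) if r != INNER_THOUGHTS_KWARG]
        params["required"] = [INNER_THOUGHTS_KWARG] + required
        out.append(fn)
    return out


send_message = {
    "name": "send_message",
    "parameters": {
        "type": "object",
        "properties": {"message": {"type": "string"}},
        "required": ["message"],
    },
}
print(add_inner_thoughts([send_message], "Private reasoning; not shown to the user."))
```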
```diff
@@ -513,9 +515,23 @@ def convert_anthropic_stream_event_to_chatcompletion(
 def _prepare_anthropic_request(
     data: ChatCompletionRequest,
     inner_thoughts_xml_tag: Optional[str] = "thinking",
+    # if true, prefix fill the generation with the thinking tag
+    prefix_fill: bool = True,
+    # if true, put COT inside the tool calls instead of inside the content
+    put_inner_thoughts_in_kwargs: bool = False,
 ) -> dict:
     """Prepare the request data for Anthropic API format."""
-    # convert the tools
+
+    # if needed, put inner thoughts as a kwarg for all tools
+    if data.tools and put_inner_thoughts_in_kwargs:
+        functions = add_inner_thoughts_to_functions(
+            functions=[t.function.model_dump() for t in data.tools],
+            inner_thoughts_key=INNER_THOUGHTS_KWARG,
+            inner_thoughts_description=INNER_THOUGHTS_KWARG_DESCRIPTION,
+        )
+        data.tools = [Tool(function=f) for f in functions]
+
+    # convert the tools to Anthropic's payload format
     anthropic_tools = None if data.tools is None else convert_tools_to_anthropic_format(data.tools)
 
     # pydantic -> dict
@@ -529,11 +545,25 @@
         data.pop("tools")
         data.pop("tool_choice", None)
     elif anthropic_tools is not None:
+        # TODO eventually enable parallel tool use
         data["tools"] = anthropic_tools
-        if len(anthropic_tools) == 1:
+
+        # tool_choice_type other than "auto" only plays nice if thinking goes inside the tool calls
+        if put_inner_thoughts_in_kwargs:
+            if len(anthropic_tools) == 1:
+                data["tool_choice"] = {
+                    "type": "tool",
+                    "name": anthropic_tools[0]["name"],
+                    "disable_parallel_tool_use": True,
+                }
+            else:
+                data["tool_choice"] = {
+                    "type": "any",
+                    "disable_parallel_tool_use": True,
+                }
+        else:
             data["tool_choice"] = {
-                "type": "tool",
-                "name": anthropic_tools[0]["name"],
+                "type": "auto",
                 "disable_parallel_tool_use": True,
             }
 
```
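The branching above matches Anthropic's documented `tool_choice` modes: `"tool"` forces one named tool, `"any"` forces some tool call, and `"auto"` lets the model reply with plain text before (or instead of) a tool call. Forcing a tool only works when the reasoning rides inside the call's kwargs; with `<thinking>` text in the content, `"auto"` is needed so the text block can be emitted. The decision, restated as a standalone sketch:

```python
def pick_tool_choice(anthropic_tools: list, put_inner_thoughts_in_kwargs: bool) -> dict:
    """Restates the tool_choice branching from _prepare_anthropic_request."""
    if put_inner_thoughts_in_kwargs:
        if len(anthropic_tools) == 1:
            # Exactly one tool: force that specific tool by name
            return {"type": "tool", "name": anthropic_tools[0]["name"], "disable_parallel_tool_use": True}
        # Several tools: force *a* tool call, the model picks which one
        return {"type": "any", "disable_parallel_tool_use": True}
    # Thinking lives in the content block, so the model must be allowed plain text
    return {"type": "auto", "disable_parallel_tool_use": True}
```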

```diff
@@ -548,8 +578,21 @@
             message["content"] = None
 
     # Convert to Anthropic format
-    msg_objs = [_Message.dict_to_message(user_id=None, agent_id=None, openai_message_dict=m) for m in data["messages"]]
-    data["messages"] = [m.to_anthropic_dict(inner_thoughts_xml_tag=inner_thoughts_xml_tag) for m in msg_objs]
+    msg_objs = [
+        _Message.dict_to_message(
+            user_id=None,
+            agent_id=None,
+            openai_message_dict=m,
+        )
+        for m in data["messages"]
+    ]
+    data["messages"] = [
+        m.to_anthropic_dict(
+            inner_thoughts_xml_tag=inner_thoughts_xml_tag,
+            put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs,
+        )
+        for m in msg_objs
+    ]
 
     # Ensure first message is user
     if data["messages"][0]["role"] != "user":
@@ -558,6 +601,16 @@
     # Handle alternating messages
     data["messages"] = merge_tool_results_into_user_messages(data["messages"])
 
+    # Handle prefix fill (not compatible with inner-thoughts-in-kwargs)
+    # https://docs.anthropic.com/en/api/messages#body-messages
+    # NOTE: cannot prefill with tools for opus:
+    # Your API request included an `assistant` message in the final position, which would pre-fill the `assistant` response. When using tools with "claude-3-opus-20240229"
+    if prefix_fill and not put_inner_thoughts_in_kwargs and "opus" not in data["model"]:
+        data["messages"].append(
+            # Start the thinking process for the assistant
+            {"role": "assistant", "content": f"<{inner_thoughts_xml_tag}>"},
+        )
+
     # Validate max_tokens
     assert "max_tokens" in data, data
 
```
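Prefilling works because Anthropic's Messages API treats a trailing `assistant` message as the start of Claude's reply: the model continues from that text, so the completion is guaranteed to begin inside the thinking tag. What the resulting request body looks like (model name and user text are placeholders):

```python
request_body = {
    "model": "claude-3-5-sonnet-20241022",  # placeholder model
    "max_tokens": 1024,
    "messages": [
        {"role": "user", "content": "What should I do next?"},
        # Trailing assistant turn = prefill; Claude continues after this opening tag
        {"role": "assistant", "content": "<thinking>"},
    ],
}
```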

```diff
@@ -571,6 +624,7 @@
 def anthropic_chat_completions_request(
     data: ChatCompletionRequest,
     inner_thoughts_xml_tag: Optional[str] = "thinking",
+    put_inner_thoughts_in_kwargs: bool = False,
     betas: List[str] = ["tools-2024-04-04"],
 ) -> ChatCompletionResponse:
     """https://docs.anthropic.com/claude/docs/tool-use"""
@@ -580,7 +634,11 @@
         anthropic_client = anthropic.Anthropic(api_key=anthropic_override_key)
     elif model_settings.anthropic_api_key:
         anthropic_client = anthropic.Anthropic()
-    data = _prepare_anthropic_request(data, inner_thoughts_xml_tag)
+    data = _prepare_anthropic_request(
+        data=data,
+        inner_thoughts_xml_tag=inner_thoughts_xml_tag,
+        put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs,
+    )
     response = anthropic_client.beta.messages.create(
         **data,
         betas=betas,
@@ -611,14 +669,19 @@
 def anthropic_chat_completions_request_stream(
     data: ChatCompletionRequest,
     inner_thoughts_xml_tag: Optional[str] = "thinking",
+    put_inner_thoughts_in_kwargs: bool = False,
     betas: List[str] = ["tools-2024-04-04"],
 ) -> Generator[ChatCompletionChunkResponse, None, None]:
     """Stream chat completions from Anthropic API.
 
     Similar to OpenAI's streaming, but using Anthropic's native streaming support.
     See: https://docs.anthropic.com/claude/reference/messages-streaming
     """
-    data = _prepare_anthropic_request(data, inner_thoughts_xml_tag)
+    data = _prepare_anthropic_request(
+        data=data,
+        inner_thoughts_xml_tag=inner_thoughts_xml_tag,
+        put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs,
+    )
 
     anthropic_override_key = ProviderManager().get_anthropic_override_key()
     if anthropic_override_key:
@@ -666,6 +729,7 @@ def anthropic_chat_completions_process_stream(
     chat_completion_request: ChatCompletionRequest,
     stream_interface: Optional[Union[AgentChunkStreamingInterface, AgentRefreshStreamingInterface]] = None,
     inner_thoughts_xml_tag: Optional[str] = "thinking",
+    put_inner_thoughts_in_kwargs: bool = False,
     create_message_id: bool = True,
     create_message_datetime: bool = True,
     betas: List[str] = ["tools-2024-04-04"],
@@ -743,6 +807,7 @@ def anthropic_chat_completions_process_stream(
         anthropic_chat_completions_request_stream(
             data=chat_completion_request,
             inner_thoughts_xml_tag=inner_thoughts_xml_tag,
+            put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs,
             betas=betas,
         )
     ):
```
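Putting it together, a caller opts into the new mode via the added keyword. The sketch below is based only on the signatures visible in this diff; the `ChatCompletionRequest` import path and field values are assumptions, and actually sending the request requires an Anthropic key to be configured:

```python
from letta.llm_api.anthropic import anthropic_chat_completions_request
from letta.schemas.openai.chat_completion_request import ChatCompletionRequest

request = ChatCompletionRequest(
    model="claude-3-5-sonnet-20241022",  # placeholder model
    messages=[{"role": "user", "content": "hello"}],
    max_tokens=1024,
)

response = anthropic_chat_completions_request(
    data=request,
    inner_thoughts_xml_tag="thinking",
    put_inner_thoughts_in_kwargs=True,  # new flag from this commit
)
```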
