fix(streaming): accumulate JSON parsing when streaming is True

arthurbrenno · arthurbrenno · commit c48314e60475 · 2025-10-28T10:45:24.000-03:00
diff --git a/agentle/generations/providers/google/adapters/generate_generate_content_response_to_generation_adapter.py b/agentle/generations/providers/google/adapters/generate_generate_content_response_to_generation_adapter.py
@@ -3,7 +3,7 @@
 import datetime
 import logging
 import uuid
-from collections.abc import AsyncIterator
+from collections.abc import AsyncGenerator, AsyncIterator
 from logging import Logger
 from typing import TYPE_CHECKING, Any, Literal, cast, overload
 
@@ -88,11 +88,11 @@ def adapt(self, _f: "GenerateContentResponse") -> Generation[T]: ...
     @overload
     def adapt(
         self, _f: AsyncIterator["GenerateContentResponse"]
-    ) -> AsyncIterator[Generation[T]]: ...
+    ) -> AsyncGenerator[Generation[T], None]: ...
 
     def adapt(
         self, _f: "GenerateContentResponse | AsyncIterator[GenerateContentResponse]"
-    ) -> Generation[T] | AsyncIterator[Generation[T]]:
+    ) -> Generation[T] | AsyncGenerator[Generation[T], None]:
         """
         Convert Google response(s) to Agentle Generation object(s).
 
@@ -214,7 +214,7 @@ async def _adapt_single_async(
 
     async def _adapt_streaming(
         self, response_stream: AsyncIterator["GenerateContentResponse"]
-    ) -> AsyncIterator[Generation[T]]:
+    ) -> AsyncGenerator[Generation[T], None]:
         """Adapt a streaming response with proper text accumulation."""
         generation_id = self.preferred_id or uuid.uuid4()
         created_time = datetime.datetime.now()
diff --git a/agentle/utils/parse_streaming_json.py b/agentle/utils/parse_streaming_json.py
@@ -21,8 +21,6 @@ def parse_streaming_json[T: BaseModel](potential_json: str | None, model: type[T
     if potential_json is None:
         return model()
 
-    # print(f"parsing: {potential_json}")
-
     def find_json_boundaries(text: str) -> tuple[int, int]:
         """Find the start and potential end of JSON in the text."""
 
@@ -95,17 +93,32 @@ def fix_common_json_issues(json_str: str) -> str:
         # Remove any leading/trailing whitespace
         json_str = json_str.strip()
 
-        # Fix missing closing quotes on string values (at the end)
-        # Look for patterns like: "key": "value without closing quote
-        json_str = re.sub(r':\s*"([^"]*?)(?:\s*[,}]|$)', r': "\1"', json_str)
-
-        # Fix missing closing quotes for keys
-        # Look for patterns like: "key without quotes:
-        json_str = re.sub(r'"([^"]*?)(?=\s*:)', r'"\1"', json_str)
-
         # Remove trailing commas before closing braces/brackets
         json_str = re.sub(r",\s*([}\]])", r"\1", json_str)
 
+        # For streaming JSON, we need to handle incomplete strings carefully
+        # Check if we have an unclosed string at the end
+        in_string = False
+        escape_next = False
+        last_quote_pos = -1
+        
+        for i, char in enumerate(json_str):
+            if escape_next:
+                escape_next = False
+                continue
+            if char == '\\':
+                escape_next = True
+                continue
+            if char == '"':
+                in_string = not in_string
+                if in_string:
+                    last_quote_pos = i
+
+        # If we're in a string at the end (incomplete), close it properly
+        if in_string and last_quote_pos != -1:
+            # Add closing quote for the incomplete string
+            json_str += '"'
+
         # Ensure the JSON has proper closing braces if it appears incomplete
         open_braces = json_str.count("{") - json_str.count("}")
         open_brackets = json_str.count("[") - json_str.count("]")
@@ -124,12 +137,25 @@ def extract_data_manually(json_str: str) -> dict[str, Any]:
         data = {}
 
         # Extract string key-value pairs with quoted keys
-        # Pattern: "key": "value" or 'key': 'value'
-        string_pattern = r'["\']([^"\']+)["\']:\s*["\']([^"\']*)["\']?'
-        string_matches = re.findall(string_pattern, json_str)
+        # IMPROVED: Handle long strings that may contain newlines, special chars, etc.
+        # Pattern: "key": "value..." - capture everything until the next unescaped quote or EOF
+        string_pattern = r'["\']([\w]+)["\']:\s*["\']([^"\']*?)(?:["\']|$)'
+        string_matches = re.findall(string_pattern, json_str, re.DOTALL)
+        
+        # Also try to capture very long strings that span multiple lines
+        # This catches incomplete strings during streaming
+        long_string_pattern = r'["\']([\w_]+)["\']:\s*["\'](.+?)(?:["\'],?\s*["}]|$)'
+        long_matches = re.findall(long_string_pattern, json_str, re.DOTALL)
 
         for key, value in string_matches:
             data[key] = value
+        
+        # Prefer long_matches for fields that might be truncated in string_matches
+        for key, value in long_matches:
+            # Only override if the long match has more content
+            existing = data.get(key, "")
+            if key not in data or (isinstance(existing, str) and len(value) > len(existing)):
+                data[key] = value
 
         # Extract string key-value pairs with unquoted keys
         # Pattern: key: "value" (no quotes around key)
diff --git a/uv.lock b/uv.lock