@@ -21,8 +21,6 @@ def parse_streaming_json[T: BaseModel](potential_json: str | None, model: type[T
2121 if potential_json is None :
2222 return model ()
2323
24- # print(f"parsing: {potential_json}")
25-
2624 def find_json_boundaries (text : str ) -> tuple [int , int ]:
2725 """Find the start and potential end of JSON in the text."""
2826
@@ -95,17 +93,32 @@ def fix_common_json_issues(json_str: str) -> str:
9593 # Remove any leading/trailing whitespace
9694 json_str = json_str .strip ()
9795
98- # Fix missing closing quotes on string values (at the end)
99- # Look for patterns like: "key": "value without closing quote
100- json_str = re .sub (r':\s*"([^"]*?)(?:\s*[,}]|$)' , r': "\1"' , json_str )
101-
102- # Fix missing closing quotes for keys
103- # Look for patterns like: "key without quotes:
104- json_str = re .sub (r'"([^"]*?)(?=\s*:)' , r'"\1"' , json_str )
105-
10696 # Remove trailing commas before closing braces/brackets
10797 json_str = re .sub (r",\s*([}\]])" , r"\1" , json_str )
10898
99+ # For streaming JSON, we need to handle incomplete strings carefully
100+ # Check if we have an unclosed string at the end
101+ in_string = False
102+ escape_next = False
103+ last_quote_pos = - 1
104+
105+ for i , char in enumerate (json_str ):
106+ if escape_next :
107+ escape_next = False
108+ continue
109+ if char == '\\ ' :
110+ escape_next = True
111+ continue
112+ if char == '"' :
113+ in_string = not in_string
114+ if in_string :
115+ last_quote_pos = i
116+
117+ # If we're in a string at the end (incomplete), close it properly
118+ if in_string and last_quote_pos != - 1 :
119+ # Add closing quote for the incomplete string
120+ json_str += '"'
121+
109122 # Ensure the JSON has proper closing braces if it appears incomplete
110123 open_braces = json_str .count ("{" ) - json_str .count ("}" )
111124 open_brackets = json_str .count ("[" ) - json_str .count ("]" )
@@ -124,12 +137,25 @@ def extract_data_manually(json_str: str) -> dict[str, Any]:
124137 data = {}
125138
126139 # Extract string key-value pairs with quoted keys
127- # Pattern: "key": "value" or 'key': 'value'
128- string_pattern = r'["\']([^"\']+)["\']:\s*["\']([^"\']*)["\']?'
129- string_matches = re .findall (string_pattern , json_str )
140+ # IMPROVED: Handle long strings that may contain newlines, special chars, etc.
141+ # Pattern: "key": "value..." - capture everything up to the next quote character (escaped quotes are NOT skipped) or end of input
142+ string_pattern = r'["\']([\w]+)["\']:\s*["\']([^"\']*?)(?:["\']|$)'
143+ string_matches = re .findall (string_pattern , json_str , re .DOTALL )
144+
145+ # Also try to capture very long strings that span multiple lines
146+ # This catches incomplete strings during streaming
147+ long_string_pattern = r'["\']([\w_]+)["\']:\s*["\'](.+?)(?:["\'],?\s*["}]|$)'
148+ long_matches = re .findall (long_string_pattern , json_str , re .DOTALL )
130149
131150 for key , value in string_matches :
132151 data [key ] = value
152+
153+ # Prefer long_matches for fields that might be truncated in string_matches
154+ for key , value in long_matches :
155+ # Only override if the long match has more content
156+ existing = data .get (key , "" )
157+ if key not in data or (isinstance (existing , str ) and len (value ) > len (existing )):
158+ data [key ] = value
133159
134160 # Extract string key-value pairs with unquoted keys
135161 # Pattern: key: "value" (no quotes around key)
0 commit comments