@@ -16,7 +16,7 @@

import json
import logging
- from typing import Optional
+ from typing import Any, Optional

from pydantic import errors
import pandas as pd
@@ -89,6 +89,9 @@ def _get_evaluation_html(eval_result_json: str) -> str:
.reference-container {{ background-color: #e6f4ea; padding: 16px; margin: 12px 0; border-radius: 8px; white-space: pre-wrap; word-wrap: break-word; }}
.response-container {{ background-color: #f9f9f9; padding: 12px; margin-top: 8px; border-radius: 8px; border: 1px solid #eee; }}
.explanation {{ color: #5f6368; font-style: italic; font-size: 0.9em; padding-top: 6px; }}
+ .raw-json-details {{ margin-top: 12px; border: 1px solid #eee; border-radius: 4px; padding: 8px; background-color: #f9f9f9; }}
+ .raw-json-details summary {{ font-size: 0.9em; cursor: pointer; color: #5f6368; }}
+ .raw-json-container {{ white-space: pre-wrap; word-wrap: break-word; max-height: 300px; overflow-y: auto; background-color: #f1f1f1; padding: 10px; border-radius: 4px; margin-top: 8px; }}
</style>
</head>
<body>
@@ -114,23 +117,36 @@ def _get_evaluation_html(eval_result_json: str) -> str:
container.innerHTML = '<h2>Detailed Results</h2>';
if (!caseResults || caseResults.length === 0) {{ container.innerHTML += '<p>No detailed results.</p>'; return; }}
const datasetRows = metadata && metadata.dataset ? metadata.dataset : [];
+
caseResults.forEach((caseResult, i) => {{
const original_case = datasetRows[caseResult.eval_case_index] || {{}};
- const prompt = original_case.prompt || '(prompt not found)';
+ const promptText = original_case.prompt_display_text || '(prompt not found)';
+ const promptJson = original_case.prompt_raw_json;
const reference = original_case.reference || '';
+ const responseText = original_case.response_display_text || '(response not found)';
+ const responseJson = original_case.response_raw_json;
+
let card = `<details><summary>Case #${{caseResult.eval_case_index != null ? caseResult.eval_case_index : i}}</summary>`;
- card += `<div class="prompt-container"><strong>Prompt:</strong><br>${{DOMPurify.sanitize(marked.parse(String(prompt)))}}</div>`;
+
+ card += `<div class="prompt-container"><strong>Prompt:</strong><br>${{DOMPurify.sanitize(marked.parse(String(promptText)))}}</div>`;
+ if (promptJson) {{
+ card += `<details class="raw-json-details"><summary>View Raw Prompt JSON</summary><pre class="raw-json-container">${{DOMPurify.sanitize(promptJson)}}</pre></details>`;
+ }}
+
if (reference) {{ card += `<div class="reference-container"><strong>Reference:</strong><br>${{DOMPurify.sanitize(marked.parse(String(reference)))}}</div>`; }}
- (caseResult.response_candidate_results || []).forEach(candidate => {{
- const candidateResponse = candidate.response_text || '(response not found)';
- card += `<div class="response-container"><h4>Candidate Response</h4>${{DOMPurify.sanitize(marked.parse(String(candidateResponse)))}}</div>`;
- let metricTable = '<h4>Metrics</h4><table><tbody>';
- Object.entries(candidate.metric_results || {{}}).forEach(([name, val]) => {{
- metricTable += `<tr><td>${{name}}</td><td><b>${{val.score != null ? val.score.toFixed(2) : 'N/A'}}</b></td></tr>`;
- if (val.explanation) {{ metricTable += `<tr><td colspan="2"><div class="explanation">${{DOMPurify.sanitize(marked.parse(String(val.explanation)))}}</div></td></tr>`; }}
- }});
- card += metricTable + '</tbody></table>';
+
+ card += `<div class="response-container"><h4>Candidate Response</h4>${{DOMPurify.sanitize(marked.parse(String(responseText)))}}</div>`;
+ if (responseJson) {{
+ card += `<details class="raw-json-details"><summary>View Raw Response JSON</summary><pre class="raw-json-container">${{DOMPurify.sanitize(responseJson)}}</pre></details>`;
+ }}
+
+ let metricTable = '<h4>Metrics</h4><table><tbody>';
+ const candidateMetrics = (caseResult.response_candidate_results && caseResult.response_candidate_results[0] && caseResult.response_candidate_results[0].metric_results) || {{}};
+ Object.entries(candidateMetrics).forEach(([name, val]) => {{
+ metricTable += `<tr><td>${{name}}</td><td><b>${{val.score != null ? val.score.toFixed(2) : 'N/A'}}</b></td></tr>`;
+ if (val.explanation) {{ metricTable += `<tr><td colspan="2"><div class="explanation">${{DOMPurify.sanitize(marked.parse(String(val.explanation)))}}</div></td></tr>`; }}
}});
+ card += metricTable + '</tbody></table>';
container.innerHTML += card + '</details>';
}});
}}
@@ -168,6 +184,9 @@ def _get_comparison_html(eval_result_json: str) -> str:
.response-column {{ border: 1px solid #e0e0e0; padding: 16px; border-radius: 8px; background: #f9f9f9; }}
.response-text-container {{ background-color: #fff; padding: 12px; margin-top: 8px; border-radius: 4px; border: 1px solid #eee; white-space: pre-wrap; word-wrap: break-word; max-height: 400px; overflow-y: auto; }}
.explanation {{ color: #5f6368; font-style: italic; font-size: 0.9em; padding-top: 8px; }}
+ .raw-json-details {{ margin-top: 12px; border: 1px solid #eee; border-radius: 4px; padding: 8px; background-color: #f9f9f9; }}
+ .raw-json-details summary {{ font-size: 0.9em; cursor: pointer; color: #5f6368; }}
+ .raw-json-container {{ white-space: pre-wrap; word-wrap: break-word; max-height: 300px; overflow-y: auto; background-color: #f1f1f1; padding: 10px; border-radius: 4px; margin-top: 8px; }}
</style>
</head>
<body>
@@ -202,13 +221,31 @@ def _get_comparison_html(eval_result_json: str) -> str:

caseResults.forEach((caseResult, i) => {{
const original_case = datasetRows[caseResult.eval_case_index] || {{}};
- let card = `<details open><summary>Case #${{caseResult.eval_case_index}}</summary><div class="prompt-container">${{DOMPurify.sanitize(marked.parse(String(original_case.prompt || '')))}}</div><div class="responses-grid">`;
+ const promptText = original_case.prompt_display_text || '(prompt not found)';
+ const promptJson = original_case.prompt_raw_json;
+
+ let card = `<details open><summary>Case #${{caseResult.eval_case_index}}</summary>`;
+ card += `<div class="prompt-container">${{DOMPurify.sanitize(marked.parse(String(promptText)))}}</div>`;
+ if (promptJson) {{
+ card += `<details class="raw-json-details"><summary>View Raw Prompt JSON</summary><pre class="raw-json-container">${{DOMPurify.sanitize(promptJson)}}</pre></details>`;
+ }}
+
+ card += `<div class="responses-grid">`;
+
(caseResult.response_candidate_results || []).forEach((candidate, j) => {{
const candidateName = candidateNames ? candidateNames[j] : `Candidate #${{j + 1}}`;
- card += `<div class="response-column"><h4>${{candidateName}}</h4><div class="response-text-container">${{DOMPurify.sanitize(marked.parse(String(candidate.response_text || '')))}}</div><h5>Metrics</h5><table><tbody>`;
+ const displayText = candidate.display_text || '(response not found)';
+ const rawJsonResponse = candidate.raw_json;
+
+ card += `<div class="response-column"><h4>${{candidateName}}</h4><div class="response-text-container">${{DOMPurify.sanitize(marked.parse(String(displayText)))}}</div>`;
+ if (rawJsonResponse) {{
+ card += `<details class="raw-json-details"><summary>View Raw Response JSON</summary><pre class="raw-json-container">${{DOMPurify.sanitize(rawJsonResponse)}}</pre></details>`;
+ }}
+
+ card += `<h5>Metrics</h5><table><tbody>`;
Object.entries(candidate.metric_results || {{}}).forEach(([name, val]) => {{
- card += `<tr><td>${{name}}</td><td><b>${{val.score.toFixed(2)}}</b></td></tr>`;
- if(val.explanation) card += `<tr><td colspan="2" class="explanation">${{DOMPurify.sanitize(marked.parse(String(val.explanation)))}}</td></tr>`;
+ card += `<tr><td>${{name}}</td><td><b>${{val.score != null ? val.score.toFixed(2) : 'N/A'}}</b></td></tr>`;
+ if (val.explanation) card += `<tr class="explanation-row"><td colspan="2" class="explanation">${{DOMPurify.sanitize(marked.parse(String(val.explanation)))}}</td></tr>`;
}});
card += '</tbody></table></div>';
}});
@@ -241,6 +278,9 @@ def _get_inference_html(dataframe_json: str) -> str:
th, td {{ border: 1px solid #dadce0; padding: 12px; text-align: left; vertical-align: top; }}
th {{ background-color: #f2f2f2; font-weight: 500; }}
td > div {{ white-space: pre-wrap; word-wrap: break-word; max-height: 400px; overflow-y: auto; }}
+ .raw-json-details {{ margin-top: 8px; border-top: 1px solid #eee; padding-top: 8px; }}
+ .raw-json-details summary {{ font-size: 0.9em; cursor: pointer; color: #5f6368; }}
+ .raw-json-container {{ white-space: pre-wrap; word-wrap: break-word; max-height: 300px; overflow-y: auto; background-color: #f1f1f1; padding: 10px; border-radius: 4px; margin-top: 8px; }}
</style>
</head>
<body>
@@ -249,8 +289,23 @@ def _get_inference_html(dataframe_json: str) -> str:
<div id="results-table"></div>
</div>
<script>
- const data = JSON.parse({dataframe_json});
+ const data = {dataframe_json};
const container = document.getElementById('results-table');
+
+ function renderCell(cellValue) {{
+ let cellContent = '';
+ if (cellValue && typeof cellValue === 'object' && cellValue.display_text !== undefined) {{
+ cellContent += `<div>${{DOMPurify.sanitize(marked.parse(String(cellValue.display_text)))}}</div>`;
+ if (cellValue.raw_json) {{
+ cellContent += `<details class="raw-json-details"><summary>View Raw JSON</summary><pre class="raw-json-container">${{DOMPurify.sanitize(cellValue.raw_json)}}</pre></details>`;
+ }}
+ }} else {{
+ const cellDisplay = cellValue === null || cellValue === undefined ? '' : String(cellValue);
+ cellContent = `<div>${{DOMPurify.sanitize(marked.parse(cellDisplay))}}</div>`;
+ }}
+ return `<td>${{cellContent}}</td>`;
+ }}
+
if (!data || data.length === 0) {{ container.innerHTML = "<p>No data.</p>"; }}
else {{
let table = '<table><thead><tr>';
@@ -260,9 +315,7 @@ def _get_inference_html(dataframe_json: str) -> str:
data.forEach(row => {{
table += '<tr>';
headers.forEach(header => {{
- const cellValue = row[header];
- const cellDisplay = cellValue === null || cellValue === undefined ? '' : String(cellValue);
- table += `<td><div>${{DOMPurify.sanitize(marked.parse(cellDisplay))}}</div></td>`;
+ table += renderCell(row[header]);
}});
table += '</tr>';
}});
@@ -274,6 +327,71 @@ def _get_inference_html(dataframe_json: str) -> str:
"""


+ def _extract_text_and_raw_json(content: Any) -> dict[str, str]:
+     """Extracts display text and raw JSON from a content object.
+
+     This function handles raw strings, Gemini's `contents` format, and
+     OpenAI's `messages` (request) and `choices` (response) formats.
+
+     Args:
+         content: The content from a 'prompt', 'request', or 'response' column.
+
+     Returns:
+         A dictionary with 'display_text' for direct rendering and 'raw_json'
+         for an expandable view.
+     """
+     if not isinstance(content, (str, dict)):
+         return {"display_text": str(content or ""), "raw_json": ""}
+
+     try:
+         data = json.loads(content) if isinstance(content, str) else content
+
+         if not isinstance(data, dict):
+             return {"display_text": str(content), "raw_json": ""}
+
+         pretty_json = json.dumps(data, indent=2, ensure_ascii=False)
+
+         # Gemini format check.
+         if (
+             "contents" in data
+             and isinstance(data.get("contents"), list)
+             and data["contents"]
+         ):
+             first_part = data["contents"][0].get("parts", [{}])[0]
+             display_text = first_part.get("text", str(data))
+             return {"display_text": display_text, "raw_json": pretty_json}
+
+         # OpenAI response format check.
+         elif (
+             "choices" in data
+             and isinstance(data.get("choices"), list)
+             and data["choices"]
+         ):
+             message = data["choices"][0].get("message", {})
+             display_text = message.get("content", str(data))
+             return {"display_text": display_text, "raw_json": pretty_json}
+
+         # OpenAI request format check.
+         elif (
+             "messages" in data
+             and isinstance(data.get("messages"), list)
+             and data["messages"]
+         ):
+             user_messages = [
+                 message.get("content", "")
+                 for message in data["messages"]
+                 if message.get("role") == "user"
+             ]
+             display_text = user_messages[-1] if user_messages else str(data)
+             return {"display_text": display_text, "raw_json": pretty_json}
+         else:
+             # Not a recognized format.
+             return {"display_text": str(content), "raw_json": pretty_json}
+
+     except (json.JSONDecodeError, TypeError, IndexError):
+         return {"display_text": str(content), "raw_json": ""}
+
+
def display_evaluation_result(
    eval_result_obj: types.EvaluationResult,
    candidate_names: Optional[list[str]] = None,
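A minimal sketch of how the new `_extract_text_and_raw_json` helper behaves on the three payload shapes it recognizes; the sample payloads below are illustrative and not part of this commit:

    # Gemini `contents` request, OpenAI chat request, and OpenAI chat response.
    gemini = '{"contents": [{"parts": [{"text": "Hi"}]}]}'
    oai_req = '{"messages": [{"role": "user", "content": "Hi"}]}'
    oai_res = '{"choices": [{"message": {"content": "Hello!"}}]}'

    assert _extract_text_and_raw_json(gemini)["display_text"] == "Hi"
    assert _extract_text_and_raw_json(oai_req)["display_text"] == "Hi"
    assert _extract_text_and_raw_json(oai_res)["display_text"] == "Hello!"
    # Strings that fail json.loads() fall through with no expandable raw JSON.
    assert _extract_text_and_raw_json("plain text") == {"display_text": "plain text", "raw_json": ""}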
@@ -315,9 +433,18 @@ def display_evaluation_result(
            and input_dataset_list[0]
            and input_dataset_list[0].eval_dataset_df is not None
        ):
-             metadata_payload["dataset"] = _preprocess_df_for_json(
-                 input_dataset_list[0].eval_dataset_df
-             ).to_dict(orient="records")
+             base_df = _preprocess_df_for_json(input_dataset_list[0].eval_dataset_df)
+             processed_rows = []
+             for _, row in base_df.iterrows():
+                 prompt_key = "request" if "request" in row else "prompt"
+                 prompt_info = _extract_text_and_raw_json(row.get(prompt_key))
+                 processed_row = {
+                     "prompt_display_text": prompt_info["display_text"],
+                     "prompt_raw_json": prompt_info["raw_json"],
+                     "reference": row.get("reference", ""),
+                 }
+                 processed_rows.append(processed_row)
+             metadata_payload["dataset"] = processed_rows

        if "eval_case_results" in result_dump:
            for case_res in result_dump["eval_case_results"]:
@@ -337,9 +464,10 @@ def display_evaluation_result(
                    and case_idx is not None
                    and case_idx < len(df)
                ):
-                     cand_res["response_text"] = df.iloc[case_idx].get(
-                         "response"
-                     )
+                     response_content = df.iloc[case_idx].get("response")
+                     display_info = _extract_text_and_raw_json(response_content)
+                     cand_res["display_text"] = display_info["display_text"]
+                     cand_res["raw_json"] = display_info["raw_json"]

    win_rates = eval_result_obj.win_rates if eval_result_obj.win_rates else {}
    if "summary_metrics" in result_dump:
@@ -351,25 +479,41 @@ def display_evaluation_result(
        html_content = _get_comparison_html(json.dumps(result_dump))
    else:
        single_dataset = input_dataset_list[0] if input_dataset_list else None
-
+         processed_rows = []
        if (
            single_dataset is not None
            and isinstance(single_dataset, types.EvaluationDataset)
            and single_dataset.eval_dataset_df is not None
        ):
            processed_df = _preprocess_df_for_json(single_dataset.eval_dataset_df)
-             metadata_payload["dataset"] = processed_df.to_dict(orient="records")
-             if "eval_case_results" in result_dump and processed_df is not None:
+             for _, row in processed_df.iterrows():
+                 prompt_key = "request" if "request" in row else "prompt"
+                 prompt_info = _extract_text_and_raw_json(row.get(prompt_key))
+                 response_info = _extract_text_and_raw_json(row.get("response"))
+                 processed_row = {
+                     "prompt_display_text": prompt_info["display_text"],
+                     "prompt_raw_json": prompt_info["raw_json"],
+                     "reference": row.get("reference", ""),
+                     "response_display_text": response_info["display_text"],
+                     "response_raw_json": response_info["raw_json"],
+                 }
+                 processed_rows.append(processed_row)
+             metadata_payload["dataset"] = processed_rows
+
+             if "eval_case_results" in result_dump and processed_rows:
                for case_res in result_dump["eval_case_results"]:
                    case_idx = case_res.get("eval_case_index")
                    if (
                        case_idx is not None
-                         and case_idx < len(processed_df)
+                         and case_idx < len(processed_rows)
                        and case_res.get("response_candidate_results")
                    ):
-                         case_res["response_candidate_results"][0][
-                             "response_text"
-                         ] = processed_df.iloc[case_idx].get("response")
+                         original_case = processed_rows[case_idx]
+                         cand_res = case_res["response_candidate_results"][0]
+                         cand_res["display_text"] = original_case[
+                             "response_display_text"
+                         ]
+                         cand_res["raw_json"] = original_case["response_raw_json"]

    result_dump["metadata"] = metadata_payload
    html_content = _get_evaluation_html(json.dumps(result_dump))
@@ -392,7 +536,21 @@ def display_evaluation_dataset(eval_dataset_obj: types.EvaluationDataset) -> None:
        logger.warning("No inference data to display.")
        return

-     processed_df = _preprocess_df_for_json(eval_dataset_obj.eval_dataset_df)
-     dataframe_json_string = json.dumps(processed_df.to_json(orient="records"))
+     processed_rows = []
+     df = eval_dataset_obj.eval_dataset_df
+
+     for _, row in df.iterrows():
+         processed_row = {}
+         for col_name, cell_value in row.items():
+             if col_name in ["prompt", "request", "response"]:
+                 processed_row[col_name] = _extract_text_and_raw_json(cell_value)
+             else:
+                 if isinstance(cell_value, (dict, list)):
+                     processed_row[col_name] = json.dumps(cell_value, ensure_ascii=False)
+                 else:
+                     processed_row[col_name] = cell_value
+         processed_rows.append(processed_row)
+
+     dataframe_json_string = json.dumps(processed_rows, ensure_ascii=False, default=str)
    html_content = _get_inference_html(dataframe_json_string)
    display.display(display.HTML(html_content))
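For context, a sketch of the per-row payload this serialization produces, which `renderCell` in `_get_inference_html` consumes by branching on the presence of `display_text` (values here are illustrative):

    # "prompt"/"request"/"response" cells become {display_text, raw_json} dicts;
    # other dict/list cells are stringified with json.dumps, scalars pass through.
    row_payload = {
        "prompt": {"display_text": "Hi", "raw_json": '{\n  "contents": [...]\n}'},
        "response": {"display_text": "Hello!", "raw_json": ""},
        "reference": "Hello",
    }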