-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathserver.py
More file actions
191 lines (165 loc) · 8.25 KB
/
server.py
File metadata and controls
191 lines (165 loc) · 8.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import http.server
import json
import os
import socketserver
from urllib.parse import parse_qs, urlparse
from shared import (
DEFAULT_SERVER_PORT,
EVALUATED_REPORT_PATH,
TEMPLATE_PATH,
GOLD_RGB,
GREEN_RGB,
HSL_LIGHTNESS_MIN,
HSL_LIGHTNESS_RANGE,
LIGHT_GREEN_RGB,
RGB_MAX,
calculate_model_summary,
create_cell_data_dict,
find_fastest_correct_per_prompt,
format_accuracy,
format_response_time,
get_unique_prompts_and_models,
group_results_by_file,
interpolate_color,
normalize_time_value
)
# TCP port the report server binds to — an alias of the shared default so
# this module and its clients agree without extra configuration.
SERVER_PORT = DEFAULT_SERVER_PORT
class ReportHandler(http.server.SimpleHTTPRequestHandler):
    """HTTP handler that renders a JSON evaluation report as an HTML page.

    GET / redirects to the default evaluated report; GET /?render=<file>
    loads <file> from the answers-generated/ directory and substitutes the
    generated fragments into the HTML template.  Any other path falls
    through to SimpleHTTPRequestHandler's static-file serving.
    """

    def do_GET(self):
        # Bare root: redirect so the render query parameter is always present.
        if self.path == '/':
            self.send_response(301)
            self.send_header('Location', '/?render=report-evaluated.json')
            self.end_headers()
            return
        parsed_path = urlparse(self.path)
        if parsed_path.path == '/':
            query_components = parse_qs(parsed_path.query)
            json_file = query_components.get("render", [None])[0]
            if json_file:
                # basename() strips directory components so a crafted "render"
                # value (e.g. ../../etc/passwd) cannot escape answers-generated/.
                json_path = os.path.join("answers-generated", os.path.basename(json_file))
            else:
                json_path = EVALUATED_REPORT_PATH
            try:
                with open(json_path, 'r', encoding='utf-8') as f:
                    results = json.load(f)
                with open(TEMPLATE_PATH, 'r', encoding='utf-8') as f:
                    template = f.read()
                (summary_table, detailed_results_header, detailed_results_body,
                 questions_details, cell_data) = self.format_results(results)
                html = template.replace("__SUMMARY_TABLE__", summary_table)
                html = html.replace("__DETAILED_RESULTS_HEADER__", detailed_results_header)
                html = html.replace("__DETAILED_RESULTS_BODY__", detailed_results_body)
                html = html.replace("__QUESTIONS_DETAILS__", questions_details)
                html = html.replace("__CELL_DATA__", cell_data)
                self.send_response(200)
                self.send_header("Content-type", "text/html")
                self.end_headers()
                self.wfile.write(html.encode('utf-8'))
            except FileNotFoundError:
                self.send_error(404, "Report file not found. Please run the evaluation first.")
            except Exception as e:
                # Top-level boundary: report the failure to the client instead
                # of letting the handler die with an unlogged traceback.
                self.send_error(500, f"An error occurred: {e}")
        else:
            super().do_GET()

    def format_results(self, results):
        """Render every template fragment from the raw results list.

        Args:
            results: list of dicts with at least "model", "file", "correct"
                and "response_time" keys (schema per the shared helpers).

        Returns:
            5-tuple of strings: (summary_table, detailed_results_header,
            detailed_results_body, questions_details, cell_data).
        """
        summary_table = self._build_summary_table(results)
        detailed_results_header, detailed_results_body = self._build_detailed_results(results)
        questions_details = self._build_questions_details(results)
        # Double-dump so the template can embed the value as a JS string
        # literal that the page then parses as JSON.
        cell_data = json.dumps(json.dumps(create_cell_data_dict(results)))
        return (summary_table, detailed_results_header, detailed_results_body,
                questions_details, cell_data)

    def _build_summary_table(self, results):
        """Per-model rows: correct/total, accuracy bar, average response time."""
        model_summary = calculate_model_summary(results)
        rows = []
        for model, stats in model_summary.items():
            total = stats["total"]
            correct = stats["correct"]
            # Guard total == 0 so an empty model entry renders as 0% / 0s.
            accuracy = (correct / total) * 100 if total > 0 else 0
            avg_time = stats["total_time"] / total if total > 0 else 0
            rows.append(f"""
<tr>
    <td>{model}</td>
    <td>
        {correct}/{total} ({accuracy:.1f}%)
        <div class="summary-bar">
            <div class="summary-bar-fill" style="width: {accuracy:.1f}%;"></div>
        </div>
    </td>
    <td>{avg_time:.2f}s</td>
</tr>
""")
        return "".join(rows)

    def _build_detailed_results(self, results):
        """Return (header_cells, body_rows) for the model x prompt grid.

        Cell colors encode correctness and relative speed; the fastest
        correct answer per prompt gets a gold highlight and a star.
        """
        prompts = sorted({r["file"] for r in results})
        models = sorted({r["model"] for r in results})
        times = [r["response_time"] for r in results]
        # default=0 keeps min()/max() from raising on an empty report.
        min_time = min(times, default=0)
        max_time = max(times, default=0)
        time_range = max_time - min_time if max_time != min_time else 1
        header = "".join(f"<th>{prompt}</th>" for prompt in prompts)
        # Fastest correct test per prompt file, via shared helper.
        fastest_correct_per_prompt = find_fastest_correct_per_prompt(results, prompts)
        body_rows = []
        for model in models:
            row = [f"<tr><td>{model}</td>"]
            for prompt in prompts:
                cell_style = ""
                response_time_text = ""
                for r in results:
                    if r["model"] == model and r["file"] == prompt:
                        normalized_time = (r["response_time"] - min_time) / time_range
                        is_fastest_correct = (r["correct"] and
                                              fastest_correct_per_prompt.get(prompt) == model)
                        # Build the label in one step with the star already in
                        # place; the old splice-after-the-fact dropped the
                        # closing '>' of the div, emitting malformed HTML.
                        star = "⭐ " if is_fastest_correct else ""
                        response_time_text = (
                            '<div style="text-align: center; font-weight: bold; font-size: 0.9em;">'
                            f'{star}{r["response_time"]:.2f}s</div>'
                        )
                        if r["correct"]:
                            if is_fastest_correct:
                                # Fastest correct: gold fill plus a glowing border.
                                r_val, g_val, b_val = interpolate_color(GOLD_RGB, GREEN_RGB, normalized_time)
                                cell_style = (
                                    f' style="background-color: rgb({r_val}, {g_val}, {b_val});'
                                    ' border: 2px solid #FFD700; box-shadow: 0 0 5px rgba(255, 215, 0, 0.5);"'
                                )
                            else:
                                # Correct: green, lighter the slower it was.
                                r_val, g_val, b_val = interpolate_color(GREEN_RGB, LIGHT_GREEN_RGB, normalized_time)
                                cell_style = f' style="background-color: rgb({r_val}, {g_val}, {b_val});"'
                        else:
                            # Incorrect: red, lighter the slower it was.
                            lightness = int(HSL_LIGHTNESS_MIN + HSL_LIGHTNESS_RANGE * normalized_time)
                            cell_style = f' style="background-color: hsl(0, 100%, {lightness}%);"'
                        break
                cell_id = f"{model}-{prompt}"
                row.append(
                    f'<td{cell_style} data-cell-id="{cell_id}" '
                    f'onclick="showOverlay(\'{cell_id}\')">{response_time_text}</td>'
                )
            row.append("</tr>")
            body_rows.append("".join(row))
        return header, "".join(body_rows)

    def _build_questions_details(self, results):
        """Collapsible per-question sections listing every model's answer."""
        results_by_file = group_results_by_file(results)
        sections = []
        for file, data in results_by_file.items():
            sections.append(f"""
<div class="question">
    <div class="question-header" onclick="toggleDetails(this)">▶ {file}</div>
    <div class="models-container">
        <p><strong>Prompt:</strong></p>
        <pre>{data['prompt']}</pre>
        <p><strong>Expected Answer:</strong></p>
        <pre>{data['expected']}</pre>
        <hr>
""")
            for model_result in data['models']:
                correct_class = "correct" if model_result['correct'] else "incorrect"
                sections.append(f"""
        <div class="model-answer {correct_class}">
            <h4>{model_result['model']}</h4>
            <p><strong>Generated Answer:</strong></p>
            <pre>{model_result['generated']}</pre>
            <p><em>Response Time: {model_result['response_time']:.2f}s</em></p>
        </div>
""")
            sections.append("""
    </div>
</div>
""")
        return "".join(sections)
if __name__ == "__main__":
with socketserver.TCPServer(("", SERVER_PORT), ReportHandler) as httpd:
print("serving at port", SERVER_PORT)
httpd.serve_forever()