Skip to content

Commit fe9d7ad

Browse files
Hermes Sovereign AgentCoreclaude
andcommitted
fix(state): address PR #19508 P1+P2 review feedback
Mac 3-lens review (architect / code-review / security) flagged 3 P1 correctness bugs. Fixes: P1.1 — Calendar-aware EMA weighting (hermes_state.py) Was: raw_weights = [alpha * (1-alpha)**(n-1-i) for i in range(n)] Now: weights by (today_utc - day).days. Days with no data correctly get zero weight (absent from result); gaps don't compress older data. Defeats the index-based bias toward sparsely-running skills. P1.2 — Volume-weighted cost / duration EMA (hermes_state.py + view) Was: ema_X = sum(w * avg_X) — biased equal-weight across days regardless of invocation count. Now: skill_stats_daily exposes total_duration_s / total_cost_usd alongside avg_*. EMA computed as sum(w_d * total_d) / sum(w_d * count_d) so a day with 100 fast calls correctly outweighs a day with 1 slow call within the same EMA term. P1.3 — Surface telemetry write failures (cron/scheduler.py) Was: except: logger.debug(...) — silent swallow at DEBUG; broken writer invisible until dashboard goes empty. Now: logger.warning(..., exc_info=True). Operator sees regressions; cron still cannot fail (warning never raises). Plus P2.1 (multi-skill cron cost split — divides session cost evenly when len(job["skills"]) > 1, preventing per-skill EMA cost-doubling), P2.2 (DROP VIEW IF EXISTS before CREATE for migration idempotency), and P3 nits (dead conditional on error_msg slice; docstring half-life math corrected — α=0.3 ≈ 1.94d, not 5d; α≈0.129 for true 5-day). Smoke-tested off-tree: skill-a-gappy (days 0,1,7) success_rate=0.954 ← gap correctly down-weights day-7 skill-b-tight (days 0,1,2) success_rate=1.000 skill-c-vol (1 slow + 100 fast) duration=1.69s cost=$0.0024/call ← volume-weighted Constraint: pure additive view change — DROP+CREATE on every connect is safe because skill_stats_daily has no rowids/triggers depending on its identity. Existing rows in skill_invocations untouched. Confidence: high (3 scenarios validated) Scope-risk: narrow (same file boundaries as original PR) Not-tested: multi-skill cron live-fire (smoke uses synthetic data) Machine: orion-terminal Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent ffdc581 commit fe9d7ad

2 files changed

Lines changed: 98 additions & 27 deletions

File tree

cron/scheduler.py

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1363,7 +1363,8 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
13631363
except Exception as e:
13641364
error_msg = f"{type(e).__name__}: {str(e)}"
13651365
logger.exception("Job '%s' failed: %s", job_name, error_msg)
1366-
_skill_outcome = (False, error_msg[:200] if error_msg else None)
1366+
# error_msg is always a non-empty f-string here; slice unconditionally.
1367+
_skill_outcome = (False, error_msg[:200])
13671368

13681369
output = f"""# Cron Job: {job_name} (FAILED)
13691370
@@ -1402,17 +1403,39 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
14021403
# sourced from the agent session row. Slash-command and
14031404
# ad-hoc skill_view calls are not tracked here in v1.
14041405
if _skill_outcome is not None and _invoked_at is not None:
1405-
_job_skills = job.get("skills") or []
1406+
_job_skills = [
1407+
str(_s).strip() for _s in (job.get("skills") or [])
1408+
if str(_s).strip()
1409+
]
14061410
if _job_skills:
14071411
try:
14081412
_completed_at = _hermes_now().timestamp()
14091413
_sess = _session_db.get_session(_cron_session_id) or {}
14101414
_success_flag, _end_reason_val = _skill_outcome
14111415
_duration = _completed_at - _invoked_at
1412-
for _skill_name in _job_skills:
1413-
_sn = str(_skill_name).strip()
1414-
if not _sn:
1415-
continue
1416+
# P2.1 — multi-skill cost split. When a cron loads
1417+
# >1 skill, the underlying agent run is shared and
1418+
# the session-row cost is therefore SHARED. Splitting
1419+
# evenly avoids each skill's ema_cost_per_call double-
1420+
# counting the same dollars. Single-skill crons (the
1421+
# current analyzer pattern) get exclusive attribution
1422+
# unchanged. v2 candidate: introduce an `attribution`
1423+
# column to make this explicit per-row.
1424+
_n_skills = max(1, len(_job_skills))
1425+
_split = lambda v: (
1426+
(v / _n_skills) if _n_skills > 1 and v is not None
1427+
else v
1428+
)
1429+
_split_int = lambda v: (
1430+
(int(v) // _n_skills) if _n_skills > 1
1431+
else int(v)
1432+
)
1433+
_shared_cost = _split(_sess.get("estimated_cost_usd"))
1434+
_shared_in = _split_int(_sess.get("input_tokens") or 0)
1435+
_shared_out = _split_int(_sess.get("output_tokens") or 0)
1436+
_shared_cr = _split_int(_sess.get("cache_read_tokens") or 0)
1437+
_shared_cw = _split_int(_sess.get("cache_write_tokens") or 0)
1438+
for _sn in _job_skills:
14161439
_session_db.record_skill_invocation(
14171440
skill_name=_sn,
14181441
invoked_at=_invoked_at,
@@ -1422,18 +1445,23 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
14221445
duration_seconds=_duration,
14231446
model=_sess.get("model"),
14241447
provider=_sess.get("billing_provider"),
1425-
input_tokens=int(_sess.get("input_tokens") or 0),
1426-
output_tokens=int(_sess.get("output_tokens") or 0),
1427-
cache_read_tokens=int(_sess.get("cache_read_tokens") or 0),
1428-
cache_write_tokens=int(_sess.get("cache_write_tokens") or 0),
1429-
estimated_cost_usd=_sess.get("estimated_cost_usd"),
1448+
input_tokens=_shared_in,
1449+
output_tokens=_shared_out,
1450+
cache_read_tokens=_shared_cr,
1451+
cache_write_tokens=_shared_cw,
1452+
estimated_cost_usd=_shared_cost,
14301453
success=_success_flag,
14311454
end_reason=_end_reason_val,
14321455
)
14331456
except (Exception, KeyboardInterrupt) as e:
1434-
logger.debug(
1457+
# Build #87 telemetry foundation: a silent failure here
1458+
# means the dashboard goes blank with no signal of
1459+
# the regression. Surface at WARNING so the operator
1460+
# sees it; keep the swallow so a broken writer can't
1461+
# fail a cron run.
1462+
logger.warning(
14351463
"Job '%s': failed to record skill invocations: %s",
1436-
job_id, e,
1464+
job_id, e, exc_info=True,
14371465
)
14381466
try:
14391467
_session_db.end_session(_cron_session_id, "cron_complete")

hermes_state.py

Lines changed: 57 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import sqlite3
2222
import threading
2323
import time
24+
from datetime import datetime, timezone
2425
from pathlib import Path
2526

2627
from agent.memory_manager import sanitize_context
@@ -124,7 +125,8 @@
124125
CREATE INDEX IF NOT EXISTS idx_skill_invocations_cron ON skill_invocations(cron_id, invoked_at DESC);
125126
CREATE INDEX IF NOT EXISTS idx_skill_invocations_session ON skill_invocations(session_id);
126127
127-
CREATE VIEW IF NOT EXISTS skill_stats_daily AS
128+
DROP VIEW IF EXISTS skill_stats_daily;
129+
CREATE VIEW skill_stats_daily AS
128130
SELECT
129131
skill_name,
130132
model,
@@ -133,6 +135,7 @@
133135
COUNT(*) AS invocation_count,
134136
SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) AS success_count,
135137
SUM(CASE WHEN success = 0 THEN 1 ELSE 0 END) AS failure_count,
138+
SUM(duration_seconds) AS total_duration_s,
136139
AVG(duration_seconds) AS avg_duration_s,
137140
SUM(estimated_cost_usd) AS total_cost_usd,
138141
AVG(estimated_cost_usd) AS avg_cost_usd,
@@ -788,10 +791,20 @@ def query_skill_ema(
788791
"""Per-(skill_name, model) exponentially-weighted moving averages.
789792
790793
Reads ``skill_stats_daily`` for the last *window_days* and applies
791-
exponential weighting where the most recent day has weight ``alpha``,
792-
the next has ``alpha * (1-alpha)``, and so on. The default
793-
``alpha=0.3`` gives roughly a 5-day half-life — recent behaviour
794-
dominates without dropping older data on a hard window.
794+
**calendar-aware** exponential weighting: each day's weight is
795+
``alpha * (1-alpha) ** age_in_days`` where ``age_in_days`` is the
796+
actual UTC-day distance from today, NOT the row's index in the
797+
result set. Days with no data correctly get zero weight (they're
798+
absent from the result), and gaps don't compress older data.
799+
800+
Cost / duration EMAs are **volume-weighted**: a day with 100 fast
801+
calls weighs more than a day with 1 slow call within the same EMA
802+
term, computed as ``sum(w_d * total_d) / sum(w_d * count_d)``
803+
across days. Avoids the per-day-average × per-day-weight bias that
804+
favoured low-volume early-shadow days.
805+
806+
Half-life heuristic for α=0.3 is ~1.94 calendar days (``ln(2) /
807+
ln(1/(1-α))``). Use α≈0.129 for a true 5-day half-life.
795808
796809
Returns a list of dicts (one per (skill_name, model) bucket) with
797810
keys::
@@ -803,15 +816,18 @@ def query_skill_ema(
803816
804817
Sorted by ``last_invoked_at`` descending so the dashboard surfaces
805818
currently-active skills first. Buckets with zero rows in the window
806-
are silently omitted.
819+
are silently omitted. Buckets where every weighted-count term is
820+
zero (e.g. all rows have NULL ``invocation_count``) get NaN-safe
821+
zeros for cost/duration EMAs.
807822
"""
808823
if window_days <= 0 or alpha <= 0 or alpha >= 1:
809824
return []
810825
cutoff_ts = time.time() - window_days * 86400.0
811826
sql = (
812827
"SELECT skill_name, model, provider, day, "
813828
"invocation_count, success_count, failure_count, "
814-
"avg_duration_s, avg_cost_usd, avg_quality_score, last_invoked_at "
829+
"total_duration_s, total_cost_usd, "
830+
"avg_quality_score, last_invoked_at "
815831
"FROM skill_stats_daily "
816832
"WHERE last_invoked_at >= ? "
817833
"ORDER BY skill_name, model, day"
@@ -825,11 +841,25 @@ def query_skill_ema(
825841
key = (r["skill_name"], r["model"])
826842
groups.setdefault(key, []).append(r)
827843

844+
# Calendar-aware day-age helper. Cron writes ``DATE(invoked_at,
845+
# 'unixepoch')`` which is UTC by SQLite contract, so we compare
846+
# against UTC today.
847+
today_utc = datetime.now(timezone.utc).date()
848+
849+
def _day_age(day_str: str) -> int:
850+
try:
851+
d = datetime.strptime(day_str, "%Y-%m-%d").date()
852+
except (TypeError, ValueError):
853+
return 0
854+
return max(0, (today_utc - d).days)
855+
828856
out: List[Dict[str, Any]] = []
829857
for (skill_name, model), rs in groups.items():
830858
rs.sort(key=lambda r: r["day"])
831859
n = len(rs)
832-
raw_weights = [alpha * (1 - alpha) ** (n - 1 - i) for i in range(n)]
860+
861+
# Calendar-aware geometric weights.
862+
raw_weights = [alpha * (1 - alpha) ** _day_age(r["day"]) for r in rs]
833863
wsum = sum(raw_weights)
834864
weights = (
835865
[w / wsum for w in raw_weights]
@@ -847,14 +877,27 @@ def query_skill_ema(
847877
)
848878
for w, r in zip(weights, rs)
849879
)
850-
ema_duration = sum(
851-
w * float(r["avg_duration_s"] or 0.0)
852-
for w, r in zip(weights, rs)
853-
)
854-
ema_cost = sum(
855-
w * float(r["avg_cost_usd"] or 0.0)
880+
881+
# Volume-weighted EMA: numerator is sum(w_d * total_d), denominator
882+
# is sum(w_d * count_d). When count_d = 0 for every day, fall back
883+
# to 0.0 (skill ran but every row was excluded — rare).
884+
weighted_count_sum = sum(
885+
w * int(r["invocation_count"] or 0)
856886
for w, r in zip(weights, rs)
857887
)
888+
if weighted_count_sum > 0:
889+
ema_duration = sum(
890+
w * float(r["total_duration_s"] or 0.0)
891+
for w, r in zip(weights, rs)
892+
) / weighted_count_sum
893+
ema_cost = sum(
894+
w * float(r["total_cost_usd"] or 0.0)
895+
for w, r in zip(weights, rs)
896+
) / weighted_count_sum
897+
else:
898+
ema_duration = 0.0
899+
ema_cost = 0.0
900+
858901
last_invoked = max(float(r["last_invoked_at"] or 0.0) for r in rs)
859902
provider_latest = rs[-1].get("provider")
860903

0 commit comments

Comments
 (0)