@@ -57,6 +57,8 @@ interface OrchestratorState {
5757interface Finding {
5858 type : 'bug' | 'performance-regression' | 'test-gap' | 'daemon-behavior' ;
5959 severity : 'low' | 'medium' | 'high' | 'critical' ;
60+ /** Expert self-assessed confidence 1-5 (5 = proven with evidence) */
61+ confidence ?: number ;
6062 title : string ;
6163 description : string ;
6264 files : string [ ] ;
@@ -109,6 +111,34 @@ function saveState(state: OrchestratorState): void {
109111 fs . writeFileSync ( STATE_FILE , JSON . stringify ( state , null , 2 ) ) ;
110112}
111113
114+ // --- Log Rotation ---
115+
const LOG_FILE = '/tmp/orchestrator.log';
const MAX_LOG_SIZE_BYTES = 10 * 1024 * 1024; // 10 MB
/** Number of trailing bytes preserved when the log is rotated. */
const LOG_KEEP_BYTES = 2 * 1024 * 1024; // 2 MB
/** Marker written at the top of a rotated log so readers know history was dropped. */
const LOG_ROTATED_MARKER = '... (log rotated)\n';

/**
 * Truncate the log file in place when it grows beyond `maxBytes`, keeping
 * only its last `keepBytes` bytes prefixed with a rotation marker.
 *
 * Only the retained tail is read from disk, so memory use is bounded by
 * `keepBytes` no matter how large the log has grown. All sizes are measured
 * in bytes (not UTF-16 code units), and a multi-byte UTF-8 character that
 * straddles the cut point is dropped rather than written out corrupted.
 *
 * Rotation is best-effort: any failure is reported through the logger and
 * never thrown, so the orchestrator loop is not interrupted.
 *
 * @param logFile - Path of the log to rotate. Defaults to `LOG_FILE`.
 * @param maxBytes - Rotation threshold; the file is rotated only when its
 *   size is strictly greater than this. Defaults to `MAX_LOG_SIZE_BYTES`.
 * @param keepBytes - Upper bound on the number of trailing bytes preserved.
 *   Defaults to `LOG_KEEP_BYTES`.
 */
function rotateLogIfNeeded(
  logFile: string = LOG_FILE,
  maxBytes: number = MAX_LOG_SIZE_BYTES,
  keepBytes: number = LOG_KEEP_BYTES,
): void {
  try {
    if (!fs.existsSync(logFile)) return;
    const stats = fs.statSync(logFile);
    if (stats.size <= maxBytes) return;

    // Read just the tail instead of loading the whole (oversized) file.
    const tailLength = Math.max(0, Math.min(keepBytes, stats.size));
    const tail = Buffer.alloc(tailLength);
    let bytesRead = 0;
    const fd = fs.openSync(logFile, 'r');
    try {
      bytesRead = fs.readSync(fd, tail, 0, tailLength, stats.size - tailLength);
    } finally {
      fs.closeSync(fd);
    }

    // The cut point may land inside a multi-byte UTF-8 sequence; skip the
    // leading continuation bytes (0b10xxxxxx) so the tail starts on a
    // character boundary.
    let start = 0;
    while (start < bytesRead && (tail.readUInt8(start) & 0xc0) === 0x80) {
      start++;
    }

    // NOTE(review): the rewrite is not atomic — anything appended by another
    // writer between the read above and this write is lost.
    const newContent = Buffer.concat([
      Buffer.from(LOG_ROTATED_MARKER, 'utf-8'),
      tail.subarray(start, bytesRead),
    ]);
    fs.writeFileSync(logFile, newContent);
    logger.info(
      { oldSize: stats.size, newSize: newContent.length },
      'Log file rotated',
    );
  } catch (err: unknown) {
    // Non-fatal — don't crash the orchestrator over log rotation, but leave
    // a trace so a persistent failure (e.g. permissions) is diagnosable.
    try {
      logger.info({ err }, 'Log rotation failed (non-fatal)');
    } catch {
      // Logging itself failed; nothing more can safely be done here.
    }
  }
}
141+
112142/** Check and update hourly rate limit window */
113143function checkRateLimit ( state : OrchestratorState ) : boolean {
114144 const currentHour = new Date ( ) . toISOString ( ) . slice ( 0 , 13 ) ; // YYYY-MM-DDTHH
@@ -269,17 +299,26 @@ const EXPERT_RESPONSE_FORMAT = `
269299Respond with a JSON array of findings. Each finding has:
270300- type: one of [bug, performance-regression, test-gap, daemon-behavior]
271301- severity: low | medium | high | critical
302+ - confidence: integer 1-5 (1=uncertain guess, 5=confirmed with evidence from source code)
272303- title: concise, unique, specific (not generic)
273304- description: 2-3 sentences explaining the concern, referencing specific line numbers from the source code
274305- files: array of affected file paths
275- - validation: shell command to verify
306+ - validation: shell command that PROVES the issue exists (must be runnable, e.g. grep for the problematic pattern)
276307
277308BEFORE REPORTING: Read the FULL SOURCE CODE above, not just the diff. Check:
278309- Is this already handled by guards, fallbacks, or platform checks elsewhere in the file?
279310- Is silent failure intentional for best-effort utilities (allocator hints, cache warmup)?
280311- Is "missing persistence" actually "intentionally stateless" (adaptive loops, freshness re-evaluation)?
281312- Would fixing this add unnecessary complexity for negligible benefit?
282313
314+ CONFIDENCE SCORING:
315+ - 5: You can point to exact lines in the source that prove the issue
316+ - 4: Strong evidence but relies on assumptions about runtime behavior
317+ - 3: Plausible concern but code context is ambiguous
318+ - 2: Speculative — the pattern looks concerning but may be intentional
319+ - 1: Gut feeling only — no concrete evidence
320+
321+ Only report findings with confidence >= 4.
283322If nothing warrants attention or you're not confident, respond with: []
284323Prefer returning [] over returning a questionable finding.
285324
@@ -534,11 +573,19 @@ async function triageChanges(
534573 // Deduplicate across perspectives
535574 const deduped = deduplicateFindings ( allFindings ) ;
536575
537- // Filter: only medium+ severity, max 5, sort by severity desc
576+ // Filter: confidence >= 4, medium+ severity, max 5, sort by severity desc
538577 const filtered = deduped
539- . filter (
540- ( f ) => severityRank ( f . severity ) >= severityRank ( MIN_SEVERITY_FOR_ISSUE ) ,
541- )
578+ . filter ( ( f ) => {
579+ // Confidence gate: experts must self-score >= 4 (from FP prevention research)
580+ if ( f . confidence !== undefined && f . confidence < 4 ) {
581+ logger . info (
582+ { title : f . title , confidence : f . confidence } ,
583+ 'Finding below confidence threshold, skipping' ,
584+ ) ;
585+ return false ;
586+ }
587+ return severityRank ( f . severity ) >= severityRank ( MIN_SEVERITY_FOR_ISSUE ) ;
588+ } )
542589 . sort ( ( a , b ) => severityRank ( b . severity ) - severityRank ( a . severity ) )
543590 . slice ( 0 , MAX_FINDINGS_PER_CYCLE ) ;
544591
@@ -845,6 +892,72 @@ function severityRank(s: string): number {
845892 return ranks [ s ] || 0 ;
846893}
847894
895+ // --- Verification Script Execution ---
896+
/** Executables a validation command may invoke; matched by exact name. */
const VERIFY_ALLOWED_COMMANDS: ReadonlySet<string> = new Set([
  'grep',
  'rg',
  'find',
  'cat',
  'head',
  'tail',
  'wc',
  'cargo',
  'python',
  'python3',
  'pytest',
  'ls',
]);

/** True when `name` is an allowlisted executable (including versioned `python3.x`). */
function isAllowedVerifyCommand(name: string): boolean {
  return VERIFY_ALLOWED_COMMANDS.has(name) || /^python3\.\d+$/.test(name);
}

/**
 * Split a shell command line into its pipeline stages, or return `null` when
 * it contains syntax that could chain, substitute, background or redirect
 * commands (`;`, `&`, `||`, backticks, `$(`, `$'`, `<`, `>`, newlines) or an
 * unterminated quote. Operators inside quotes are treated as literal text,
 * except backticks and `$(`, which the shell still expands inside double quotes.
 */
function splitSafePipeline(cmd: string): string[] | null {
  const stages: string[] = [];
  let current = '';
  let quote: '"' | "'" | null = null;

  for (let i = 0; i < cmd.length; i++) {
    const ch = cmd.charAt(i);

    if (quote === "'") {
      // Nothing is special inside single quotes except the closing quote.
      if (ch === "'") quote = null;
      current += ch;
      continue;
    }
    if (ch === '\\') {
      // Backslash escapes the next character; keep the pair verbatim.
      current += ch + cmd.charAt(i + 1);
      i++;
      continue;
    }
    // Command substitution is active both unquoted and inside double quotes.
    if (ch === '`' || (ch === '$' && cmd.charAt(i + 1) === '(')) return null;

    if (quote === '"') {
      if (ch === '"') quote = null;
      current += ch;
      continue;
    }
    // ANSI-C quoting ($'...') has escape rules this scanner does not model.
    if (ch === '$' && cmd.charAt(i + 1) === "'") return null;
    if (ch === '"' || ch === "'") {
      quote = ch;
      current += ch;
      continue;
    }
    if (ch === '|') {
      if (cmd.charAt(i + 1) === '|') return null; // `||` chains commands
      stages.push(current);
      current = '';
      continue;
    }
    if (';&<>\n\r'.includes(ch)) return null;
    current += ch;
  }

  if (quote !== null) return null;
  stages.push(current);
  return stages;
}

/**
 * Run the finding's validation command to verify the issue actually exists.
 * If the command fails (exit code != 0, timeout) or prints nothing, the
 * finding is likely a hallucination — the problematic pattern doesn't exist
 * in the code.
 *
 * The command text is model-generated and therefore untrusted. It is only
 * executed when it is a single command, or a pipeline, whose every stage
 * starts with an allowlisted executable and which contains no unquoted
 * chaining, substitution or redirection syntax. Commands that do not meet
 * this bar are NOT executed and are reported as `verified: true` (i.e.
 * "not verifiable") so later validation stages can still judge the finding.
 *
 * NOTE(review): this is defense in depth, not a sandbox. Allowlisted tools
 * such as `python -c`, `cargo run`, `pytest` and `find -exec` can still run
 * arbitrary code; execute in an isolated environment if that matters.
 *
 * Inspired by CodeRabbit's "agentic verification" pattern (FP research).
 *
 * @param finding - Finding whose `validation` field holds the shell command.
 * @param repoPath - Working directory in which the command is executed.
 * @returns `verified` — whether the finding should proceed — and `output`,
 *   which is either the first 500 characters of the command's trimmed stdout
 *   or a short reason string.
 */
function verifyFindingScript(
  finding: Finding,
  repoPath: string,
): { verified: boolean; output: string } {
  const cmd = finding.validation?.trim();
  if (!cmd || cmd.length < 3) {
    return { verified: false, output: 'No validation command provided' };
  }

  const stages = splitSafePipeline(cmd);
  if (stages === null) {
    logger.info(
      { cmd: cmd.slice(0, 50), title: finding.title },
      'Validation command uses unsafe shell syntax, skipping verification',
    );
    return {
      verified: true,
      output: 'Command not verifiable (unsafe shell syntax)',
    };
  }

  // Safety: every pipeline stage must start with an allowlisted executable.
  // Exact match — a prefix test would also admit e.g. `lsof` or `findmnt`.
  const allStagesAllowed = stages.every((stage) =>
    isAllowedVerifyCommand(stage.trim().split(/\s+/)[0] ?? ''),
  );
  if (!allStagesAllowed) {
    logger.info(
      { cmd: cmd.slice(0, 50), title: finding.title },
      'Validation command not in allowlist, skipping verification',
    );
    return { verified: true, output: 'Command not verifiable (not in allowlist)' };
  }

  try {
    const result = execSync(cmd, {
      cwd: repoPath,
      encoding: 'utf-8',
      timeout: 10_000,
      maxBuffer: 256 * 1024,
      // Detach stdin so tools given no file operand don't read from a pipe,
      // and capture stderr instead of leaking it into our own stderr.
      stdio: ['ignore', 'pipe', 'pipe'],
    });
    const output = result.trim();
    if (output.length === 0) {
      logger.info(
        { title: finding.title, cmd: cmd.slice(0, 80) },
        'Verification script returned empty — finding may be hallucinated',
      );
      return { verified: false, output: 'Command returned empty output' };
    }
    return { verified: true, output: output.slice(0, 500) };
  } catch (err: unknown) {
    logger.info(
      { title: finding.title, cmd: cmd.slice(0, 80), err },
      'Verification script failed — finding likely hallucinated',
    );
    return { verified: false, output: `Command failed: ${String(err).slice(0, 200)}` };
  }
}
960+
848961// --- Claude Validation (Phase 3) ---
849962
850963/**
@@ -1581,11 +1694,10 @@ export async function startOrchestratorLoop(
15811694 state . lastHeartbeat = new Date ( ) . toISOString ( ) ;
15821695 saveState ( state ) ;
15831696
1584- // Sync cc-skills on heartbeat to stay updated
1697+ // Heartbeat maintenance
15851698 if ( ccSkillsPath ) syncCcSkills ( ccSkillsPath ) ;
1586-
1587- // Learn from rejected issues (closed as "not planned")
15881699 syncFalsePositivePatterns ( ) ;
1700+ rotateLogIfNeeded ( ) ;
15891701 }
15901702
15911703 // Step 1: git pull
@@ -1665,6 +1777,18 @@ export async function startOrchestratorLoop(
16651777 continue ;
16661778 }
16671779
1780+ // Pre-validation: run the finding's verification script
1781+ // If the script fails, the finding is likely hallucinated — skip without wasting Claude
1782+ const scriptCheck = verifyFindingScript ( finding , config . repoPath ) ;
1783+ if ( ! scriptCheck . verified ) {
1784+ state . findingsRejected ++ ;
1785+ logger . info (
1786+ { title : finding . title , reason : scriptCheck . output } ,
1787+ 'Finding failed verification script — likely hallucinated' ,
1788+ ) ;
1789+ continue ;
1790+ }
1791+
16681792 // Claude validation (Phase 3)
16691793 const validation = validateWithClaude (
16701794 finding ,
0 commit comments