fix(05.1-01): defect qwibitai#6 L2 — synthetic user-directive injection in onHuman (breaks AMD context contamination)

Carsten · Carsten · commit 838c45191361 · 2026-04-21T06:29:53.000Z
Layer-2 defense-in-depth for defect qwibitai#6: between updateInstructions(persona) and setTimeout→requestResponse in webhook.ts onHuman closure, inject a conversation.item.create with role=user containing a bracketed system-hint directive. This breaks the conversational context the model inherited from CASE2_AMD_CLASSIFIER_PROMPT so it cannot mis-read the callee's opening greeting ('Restaurant Bellavista') as evidence it should continue in AMD-helper mode.

Text verbatim per RESEARCH §2.5 with ASCII umlauts (Phase 2 CASE6B_PERSONA convention): 'Beginne bitte mit der Begruessung gemaess deiner neuen Anweisungen'. Hardcoded literal, not derived from counterpart input — safe under T-05.1-01-04 (counterpart cannot prompt-inject).

Pitfall 5: conversation.item.create does NOT itself trigger a response.create (VAD only scopes audio-derived items). The explicit setTimeout→requestResponse is preserved unchanged.

Tests added (RED before, GREEN after this commit):
- Test F: asserts send order session.update → conversation.item.create → (after GREET_TRIGGER_DELAY_OUTBOUND_MS) response.create
- Test G: asserts verbatim directive text with ASCII umlauts, no unicode
- Test H (regression, inside F): persona-swap trigger from Wave 3 still fires

Also adds dispatch.getAmdClassifier() test-only accessor so the tests can drive classifier.onAmdResult('human') end-to-end through the /accept flow.

Full voice-bridge suite: 367 passed / 4 skipped. Build clean. Both defect qwibitai#6 layers now shipped (L1 session.type discriminator, L2 synthetic directive).
diff --git a/voice-bridge/src/tools/dispatch.ts b/voice-bridge/src/tools/dispatch.ts
@@ -36,6 +36,12 @@ let _activeClassifier: AmdClassifier | null = null
 export function setAmdClassifier(classifier: AmdClassifier | null): void {
   _activeClassifier = classifier
 }
+// Plan 05.1-01 Task 3: test-only accessor. Returns the classifier most recently
+// registered via setAmdClassifier (null when none is set) so accept.test.ts can
+// fire onAmdResult('human') and assert the onHuman closure's send order end-to-end.
+export function getAmdClassifier(): AmdClassifier | null {
+  return _activeClassifier
+}
 
 // Tool-name mapping: bridge tool name → Core MCP tool name.
 // null  = not implemented (03-08 skipped or bridge-internal, stub path).
diff --git a/voice-bridge/src/webhook.ts b/voice-bridge/src/webhook.ts
@@ -294,6 +294,46 @@ export function registerAcceptRoute(
             if (ctxRef) {
               // Push Case-2 persona to model via session.update, then trigger greeting.
               updateInstructions(ctxRef.sideband.state, persona, log)
+              // Plan 05.1-01 Task 3 (defect #6 Layer 2, RESEARCH §2.5):
+              // synthetic user directive between updateInstructions and the
+              // setTimeout→requestResponse, breaking the context inherited
+              // from CASE2_AMD_CLASSIFIER_PROMPT. Without it the model may
+              // mis-read the callee's opening greeting ("Restaurant Bellavista")
+              // as a cue to stay in AMD-helper mode, not CASE2_OUTBOUND_PERSONA.
+              // Text uses ASCII umlauts per Phase 2 CASE6B_PERSONA convention.
+              // Pitfall 5: item.create does NOT trigger a response.create (VAD
+              // only scopes audio-derived items); requestResponse stays required.
+              // A missing socket throws into the warn path: no false "sent" log.
+              try {
+                const directiveWs = ctxRef.sideband.state.ws
+                if (!directiveWs) throw new Error('sideband websocket unavailable')
+                directiveWs.send(
+                  JSON.stringify({
+                    type: 'conversation.item.create',
+                    item: {
+                      type: 'message',
+                      role: 'user',
+                      content: [
+                        {
+                          type: 'input_text',
+                          text: '[System-Hinweis: AMD-Verdict war human. Der Anruf laeuft jetzt im Reservierungs-Modus. Beginne bitte mit der Begruessung gemaess deiner neuen Anweisungen.]',
+                        },
+                      ],
+                    },
+                  }),
+                )
+                log.info({
+                  event: 'case_2_amd_synthetic_user_directive_sent',
+                  call_id: callId,
+                })
+              } catch (e: unknown) {
+                log.warn({
+                  event: 'case_2_amd_synthetic_user_directive_send_failed',
+                  call_id: callId,
+                  err: e instanceof Error ? e.message : String(e),
+                })
+              }
+
               setTimeout(() => {
                 if (ctxRef) requestResponse(ctxRef.sideband.state, log)
               }, GREET_TRIGGER_DELAY_OUTBOUND_MS)
diff --git a/voice-bridge/tests/accept.test.ts b/voice-bridge/tests/accept.test.ts
@@ -580,6 +580,239 @@ describe('POST /accept — Case-2 outbound branch (05-03 Task 3)', () => {
       await app.close()
     }
   })
+
+  // Plan 05.1-01 Task 3: onHuman L2 defense-in-depth — synthetic user-directive
+  // injection between updateInstructions and setTimeout→requestResponse.
+  // Breaks AMD classifier conversational context contamination (RESEARCH §2.5).
+  // Asserts exact WS send order: session.update → conversation.item.create →
+  // response.create (after GREET_TRIGGER_DELAY_OUTBOUND_MS).
+  it('Test F+H: onHuman sends session.update THEN conversation.item.create THEN (after timer) response.create', async () => {
+    // Mock WS whose .send() we can inspect in order
+    const sentMessages: string[] = []
+    const mockWs = {
+      send: vi.fn((s: string) => {
+        sentMessages.push(s)
+      }),
+      readyState: 1,
+    }
+    // Mock sideband state: ready=true so updateInstructions and requestResponse proceed
+    const mockState = {
+      callId: 'rtc_c2_l2',
+      ready: true,
+      ws: mockWs as unknown as import('ws').WebSocket,
+      openedAt: 0,
+      lastUpdateAt: 0,
+    }
+
+    const outboundRouter = makeCase2OutboundRouter('case_2')
+    await new Promise((r) => setTimeout(r, 10))
+
+    const acceptSpy = vi.fn().mockResolvedValue({})
+    const openai = {
+      webhooks: {
+        unwrap: vi.fn().mockResolvedValue({
+          type: 'realtime.call.incoming',
+          data: {
+            call_id: 'rtc_c2_l2',
+            sip_headers: [{ name: 'From', value: '"Caller" <sip:+4900000@sipgate.de>' }],
+          },
+        }),
+      },
+      realtime: { calls: { accept: acceptSpy, reject: vi.fn() } },
+    }
+
+    const router = {
+      startCall: vi.fn().mockReturnValue({
+        sideband: { state: mockState },
+        close: vi.fn(),
+      }),
+      endCall: vi.fn(),
+      getCall: vi.fn(),
+      _size: vi.fn().mockReturnValue(0),
+    }
+
+    const { buildApp } = await import('../src/index.js')
+    const { getAmdClassifier, setAmdClassifier } = await import('../src/tools/dispatch.js')
+
+    const app = await buildApp({
+      openaiOverride: openai as never,
+      whitelistOverride: new Set(),
+      routerOverride: router as never,
+      outboundRouterOverride: outboundRouter,
+    })
+
+    try {
+      const res = await app.inject({
+        method: 'POST',
+        url: '/accept',
+        headers: {
+          'content-type': 'application/json',
+          'webhook-id': 'c2-l2',
+          'webhook-timestamp': String(Math.floor(Date.now() / 1000)),
+          'webhook-signature': 'v1,xxx',
+        },
+        payload: JSON.stringify({
+          type: 'realtime.call.incoming',
+          data: { call_id: 'rtc_c2_l2' },
+        }),
+      })
+
+      expect(res.statusCode).toBe(200)
+
+      // Switch to fake timers BEFORE firing onAmdResult so the setTimeout in
+      // onHuman (GREET_TRIGGER_DELAY_OUTBOUND_MS) is trapped under our control.
+      vi.useFakeTimers()
+      const classifier = getAmdClassifier()
+      expect(classifier).not.toBeNull()
+      // Trigger the human verdict → fires the onHuman closure in webhook.ts
+      classifier?.onAmdResult('human')
+
+      // IMMEDIATELY after onAmdResult: two sync sends must be present
+      // (updateInstructions then conversation.item.create).
+      // requestResponse is still pending in the setTimeout queue.
+      expect(sentMessages.length).toBeGreaterThanOrEqual(2)
+
+      // Test F ordering: first send = session.update with type:'realtime' + Case-2 persona
+      const firstParsed = JSON.parse(sentMessages[0])
+      expect(firstParsed.type).toBe('session.update')
+      expect(firstParsed.session?.type).toBe('realtime')
+      expect(firstParsed.session?.instructions).toContain('NanoClaw im Auftrag')
+
+      // Test F ordering: second send = conversation.item.create role=user synthetic directive
+      const secondParsed = JSON.parse(sentMessages[1])
+      expect(secondParsed.type).toBe('conversation.item.create')
+      expect(secondParsed.item?.type).toBe('message')
+      expect(secondParsed.item?.role).toBe('user')
+      expect(secondParsed.item?.content?.[0]?.type).toBe('input_text')
+      expect(secondParsed.item?.content?.[0]?.text).toContain(
+        '[System-Hinweis: AMD-Verdict war human.',
+      )
+
+      // Test H (regression): the persona-swap trigger from Wave 3 still fires —
+      // advance timers past GREET_TRIGGER_DELAY_OUTBOUND_MS, expect response.create
+      await vi.advanceTimersByTimeAsync(5000)
+      const responseCreateMsg = sentMessages.find((s) => {
+        try {
+          return JSON.parse(s).type === 'response.create'
+        } catch {
+          return false
+        }
+      })
+      expect(responseCreateMsg).toBeDefined()
+
+      // Overall ordering: session.update (idx 0) < item.create (idx 1) < response.create (later)
+      const idxSessionUpdate = sentMessages.findIndex(
+        (s) => JSON.parse(s).type === 'session.update',
+      )
+      const idxItemCreate = sentMessages.findIndex(
+        (s) => JSON.parse(s).type === 'conversation.item.create',
+      )
+      const idxResponseCreate = sentMessages.findIndex(
+        (s) => JSON.parse(s).type === 'response.create',
+      )
+      expect(idxSessionUpdate).toBe(0)
+      expect(idxItemCreate).toBe(1)
+      expect(idxResponseCreate).toBeGreaterThan(idxItemCreate)
+    } finally {
+      // Single finally so cleanup also runs when the /accept request or its
+      // status assertion fails: restore real timers (harmless if fakes were
+      // never installed), clear the classifier registration to avoid
+      // cross-test contamination, then shut the app down.
+      vi.useRealTimers()
+      setAmdClassifier(null)
+      await app.close()
+    }
+  })
+
+  it('Test G: synthetic-item text contains verbatim directive (RESEARCH §2.5, ASCII umlauts)', async () => {
+    const sentMessages: string[] = []
+    const mockWs = {
+      send: vi.fn((s: string) => void sentMessages.push(s)),
+      readyState: 1,
+    }
+    const mockState = {
+      callId: 'rtc_c2_l2g',
+      ready: true,
+      ws: mockWs as unknown as import('ws').WebSocket,
+      openedAt: 0,
+      lastUpdateAt: 0,
+    }
+
+    const outboundRouter = makeCase2OutboundRouter('case_2')
+    await new Promise((r) => setTimeout(r, 10))
+
+    const acceptSpy = vi.fn().mockResolvedValue({})
+    const openai = {
+      webhooks: {
+        unwrap: vi.fn().mockResolvedValue({
+          type: 'realtime.call.incoming',
+          data: {
+            call_id: 'rtc_c2_l2g',
+            sip_headers: [{ name: 'From', value: '"Caller" <sip:+4900000@sipgate.de>' }],
+          },
+        }),
+      },
+      realtime: { calls: { accept: acceptSpy, reject: vi.fn() } },
+    }
+
+    const router = {
+      startCall: vi.fn().mockReturnValue({
+        sideband: { state: mockState },
+        close: vi.fn(),
+      }),
+      endCall: vi.fn(),
+      getCall: vi.fn(),
+      _size: vi.fn().mockReturnValue(0),
+    }
+
+    const { buildApp } = await import('../src/index.js')
+    const { getAmdClassifier, setAmdClassifier } = await import('../src/tools/dispatch.js')
+
+    const app = await buildApp({
+      openaiOverride: openai as never,
+      whitelistOverride: new Set(),
+      routerOverride: router as never,
+      outboundRouterOverride: outboundRouter,
+    })
+
+    try {
+      await app.inject({
+        method: 'POST',
+        url: '/accept',
+        headers: {
+          'content-type': 'application/json',
+          'webhook-id': 'c2-l2g',
+          'webhook-timestamp': String(Math.floor(Date.now() / 1000)),
+          'webhook-signature': 'v1,xxx',
+        },
+        payload: JSON.stringify({
+          type: 'realtime.call.incoming',
+          data: { call_id: 'rtc_c2_l2g' },
+        }),
+      })
+
+      // Fake timers trap onHuman's delayed requestResponse so it cannot fire after this test.
+      vi.useFakeTimers()
+      const classifier = getAmdClassifier()
+      expect(classifier).not.toBeNull()
+      classifier?.onAmdResult('human')
+      const itemCreate = sentMessages
+        .map((s) => JSON.parse(s))
+        .find((p) => p.type === 'conversation.item.create')
+      expect(itemCreate).toBeDefined()
+      const text = itemCreate.item.content[0].text as string
+      // Verbatim phrases per RESEARCH §2.5 + ASCII umlaut convention
+      expect(text).toContain('[System-Hinweis: AMD-Verdict war human.')
+      expect(text).toContain('Reservierungs-Modus')
+      expect(text).toContain('Beginne bitte mit der Begruessung gemaess deiner neuen Anweisungen')
+      // ASCII umlauts (either case), not unicode — project convention (Phase 2 CASE6B_PERSONA)
+      expect(text).not.toMatch(/[äöüßÄÖÜẞ]/)
+    } finally {
+      vi.useRealTimers()
+      setAmdClassifier(null) // always unregister, even on failure — avoids cross-test contamination
+      await app.close()
+    }
+  })
 })
 
 // Plan 04-02 Task 3: /accept-time cost gate integration.