Skip to content

Commit 54580dd

Browse files
fix: worktree dedup and multi-project chunk storage (#142)
* fix: worktree dedup and multi-project chunk storage Fix two issues that break content-hash dedup across git worktrees sharing a PostgreSQL backend: 1. Files with documents but no chunks are permanently skipped. When a prior indexing run creates document records but fails to embed (e.g., missing API key), subsequent runs skip those files because the mod-time gate and hash check don't verify chunk existence. Fix: check len(doc.ChunkIDs) > 0 before skipping in IndexAllWithBatchProgress and NeedsReindex. 2. Chunk IDs collide across projects sharing the same database. Chunk IDs use relative paths (e.g., src/App.tsx_0). With the primary key on just (id), ON CONFLICT upserts from a second project overwrite the first project's chunks without updating project_id — the second project ends up with 0 chunks. Fix: migrate primary key to (project_id, id) and update the ON CONFLICT clause to match. Tested with 3 projects sharing one postgres instance: - 100% content-hash cache hit rate for identical code - ~9s per worktree (vs ~5min without cache) - Each project gets its own chunk rows * fix: update tests to match new ChunkIDs-aware skip logic The indexer now calls GetDocument before the lastIndexTime gate and requires doc.ChunkIDs to be non-empty for both the time-based and hash-based skip paths. This prevents silently skipping files whose prior indexing run created a document but failed to embed chunks. Update tests to seed documents with ChunkIDs where the test expects a skip, and remove assertions that GetDocument should not be called (it is now called by design before the time check).
1 parent 135f5bc commit 54580dd

File tree

5 files changed

+68
-26
lines changed

5 files changed

+68
-26
lines changed

cli/watch_initialscan_test.go

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,16 @@ func TestRunInitialScan_SkipsIndexedFileByLastIndexTime(t *testing.T) {
142142
vecStore := store.NewGOBStore(filepath.Join(projectRoot, "index.gob"))
143143
idx := indexer.NewIndexer(projectRoot, vecStore, emb, chunker, scanner, time.Now().Add(1*time.Hour))
144144

145+
// Seed a document with ChunkIDs so the lastIndexTime gate can skip it.
146+
// The new logic requires doc != nil && len(doc.ChunkIDs) > 0 to skip.
147+
if err := vecStore.SaveDocument(ctx, store.Document{
148+
Path: "main.go",
149+
Hash: "seeded",
150+
ChunkIDs: []string{"c1"},
151+
}); err != nil {
152+
t.Fatalf("failed to seed document: %v", err)
153+
}
154+
145155
symbolStore := trace.NewGOBSymbolStore(filepath.Join(projectRoot, "symbols.gob"))
146156
defer symbolStore.Close()
147157

@@ -210,9 +220,10 @@ func TestHandleFileEvent_SkipsUnchangedFile(t *testing.T) {
210220
t.Fatalf("failed to scan source file: %v", err)
211221
}
212222
if err := vecStore.SaveDocument(ctx, store.Document{
213-
Path: "main.go",
214-
Hash: fileInfo.Hash,
215-
ModTime: time.Unix(fileInfo.ModTime, 0),
223+
Path: "main.go",
224+
Hash: fileInfo.Hash,
225+
ModTime: time.Unix(fileInfo.ModTime, 0),
226+
ChunkIDs: []string{"c1"},
216227
}); err != nil {
217228
t.Fatalf("failed to seed document: %v", err)
218229
}
@@ -307,9 +318,10 @@ func TestHandleWorkspaceFileEvent_SkipsUnchangedFile(t *testing.T) {
307318
projectName := "proj"
308319
prefixedPath := workspaceName + "/" + projectName + "/proj/main.go"
309320
if err := st.SaveDocument(ctx, store.Document{
310-
Path: prefixedPath,
311-
Hash: fileInfo.Hash,
312-
ModTime: time.Unix(fileInfo.ModTime, 0),
321+
Path: prefixedPath,
322+
Hash: fileInfo.Hash,
323+
ModTime: time.Unix(fileInfo.ModTime, 0),
324+
ChunkIDs: []string{"c1"},
313325
}); err != nil {
314326
t.Fatalf("failed to seed workspace document: %v", err)
315327
}

indexer/indexer.go

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,16 @@ func (idx *Indexer) IndexAllWithBatchProgress(ctx context.Context, onProgress Pr
115115
})
116116
}
117117

118-
// Skip files modified before lastIndexTime
119-
if !idx.lastIndexTime.IsZero() {
118+
// Fetch the document once — used by both the mod-time gate and hash check.
119+
doc, err := idx.store.GetDocument(ctx, fileMeta.Path)
120+
if err != nil {
121+
return nil, fmt.Errorf("failed to get document %s: %w", fileMeta.Path, err)
122+
}
123+
124+
// Skip files modified before lastIndexTime — but only if they have chunks.
125+
// Files with no chunks need re-indexing even if their mod_time is old
126+
// (e.g., a prior indexing run created the document but failed to embed).
127+
if !idx.lastIndexTime.IsZero() && doc != nil && len(doc.ChunkIDs) > 0 {
120128
fileModTime := time.Unix(fileMeta.ModTime, 0)
121129
if fileModTime.Before(idx.lastIndexTime) || fileModTime.Equal(idx.lastIndexTime) {
122130
stats.FilesSkipped++
@@ -125,12 +133,6 @@ func (idx *Indexer) IndexAllWithBatchProgress(ctx context.Context, onProgress Pr
125133
}
126134
}
127135

128-
// Check if file needs reindexing
129-
doc, err := idx.store.GetDocument(ctx, fileMeta.Path)
130-
if err != nil {
131-
return nil, fmt.Errorf("failed to get document %s: %w", fileMeta.Path, err)
132-
}
133-
134136
// Load file content and hash only after metadata filtering.
135137
file, err := idx.scanner.ScanFile(fileMeta.Path)
136138
if err != nil {
@@ -145,9 +147,9 @@ func (idx *Indexer) IndexAllWithBatchProgress(ctx context.Context, onProgress Pr
145147
continue
146148
}
147149

148-
if doc != nil && doc.Hash == file.Hash {
150+
if doc != nil && doc.Hash == file.Hash && len(doc.ChunkIDs) > 0 {
149151
delete(existingMap, fileMeta.Path)
150-
continue // File unchanged
152+
continue // File unchanged and has chunks
151153
}
152154

153155
filesToIndex = append(filesToIndex, *file)
@@ -671,5 +673,10 @@ func (idx *Indexer) NeedsReindex(ctx context.Context, path string, hash string)
671673
return true, nil
672674
}
673675

674-
return doc.Hash != hash, nil
676+
// Reindex if hash changed OR if document has no chunks (prior indexing failed)
677+
if doc.Hash != hash || len(doc.ChunkIDs) == 0 {
678+
return true, nil
679+
}
680+
681+
return false, nil
675682
}

indexer/indexer_branchswitch_test.go

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ import (
99
"strings"
1010
"testing"
1111
"time"
12+
13+
"github.com/yoanbernabeu/grepai/store"
1214
)
1315

1416
func createGoFixtureFiles(tb testing.TB, root string, fileCount int) {
@@ -33,6 +35,16 @@ func TestIndexAllWithProgress_BranchSwitchSkipsBulkWithoutLookupOrEmbedding(t *t
3335
}
3436

3537
mockStore := newMockStore()
38+
// Seed documents with ChunkIDs so the lastIndexTime gate can skip them.
39+
// The new logic requires doc != nil && len(doc.ChunkIDs) > 0 to skip.
40+
for i := range 200 {
41+
path := fmt.Sprintf("file_%04d.go", i)
42+
mockStore.documents[path] = store.Document{
43+
Path: path,
44+
Hash: "seeded",
45+
ChunkIDs: []string{"c1"},
46+
}
47+
}
3648
mockEmbedder := newMockEmbedder()
3749
scanner := NewScanner(tmpDir, ignoreMatcher)
3850
chunker := NewChunker(512, 50)
@@ -55,9 +67,6 @@ func TestIndexAllWithProgress_BranchSwitchSkipsBulkWithoutLookupOrEmbedding(t *t
5567
if stats.FilesSkipped < 200 {
5668
t.Fatalf("expected at least 200 skipped files, got %d", stats.FilesSkipped)
5769
}
58-
if mockStore.getDocCalled {
59-
t.Fatal("GetDocument should not be called for files skipped by lastIndexTime")
60-
}
6170
if mockEmbedder.embedCalled {
6271
t.Fatal("embedder should not be called when all files are skipped")
6372
}

indexer/indexer_test.go

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -248,11 +248,6 @@ func TestIndexAllWithProgress_UnchangedFilesSkipped(t *testing.T) {
248248
t.Errorf("expected 0 chunks created, got %d", stats.ChunksCreated)
249249
}
250250

251-
// Assert: No documents should be retrieved (skipped before GetDocument call)
252-
if mockStore.getDocCalled {
253-
t.Error("GetDocument should not be called for files with matching ModTime")
254-
}
255-
256251
// Assert: No documents should be saved
257252
if mockStore.saveDocCalled {
258253
t.Error("SaveDocument should not be called for unchanged files")

store/postgres.go

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,21 @@ func (s *PostgresStore) ensureSchema(ctx context.Context) error {
6363
`ALTER TABLE chunks ADD COLUMN IF NOT EXISTS content_hash TEXT DEFAULT ''`,
6464
`CREATE INDEX IF NOT EXISTS idx_chunks_content_hash ON chunks(content_hash) WHERE content_hash != ''`,
6565
buildEnsureVectorSQL(s.dimensions),
66+
// Migrate chunks primary key from (id) to (project_id, id) so that
67+
// worktrees sharing the same database get their own chunk rows instead
68+
// of silently overwriting each other via ON CONFLICT (id).
69+
`DO $$
70+
BEGIN
71+
IF EXISTS (
72+
SELECT 1 FROM pg_constraint
73+
WHERE conrelid = 'chunks'::regclass
74+
AND contype = 'p'
75+
AND array_length(conkey, 1) = 1
76+
) THEN
77+
ALTER TABLE chunks DROP CONSTRAINT chunks_pkey;
78+
ALTER TABLE chunks ADD PRIMARY KEY (project_id, id);
79+
END IF;
80+
END $$`,
6681
}
6782

6883
for _, query := range queries {
@@ -75,14 +90,18 @@ func (s *PostgresStore) ensureSchema(ctx context.Context) error {
7590
}
7691

7792
func (s *PostgresStore) SaveChunks(ctx context.Context, chunks []Chunk) error {
93+
if len(chunks) == 0 {
94+
return nil
95+
}
96+
7897
batch := &pgx.Batch{}
7998

8099
for _, chunk := range chunks {
81100
vec := pgvector.NewVector(chunk.Vector)
82101
batch.Queue(
83102
`INSERT INTO chunks (id, project_id, file_path, start_line, end_line, content, vector, hash, content_hash, updated_at)
84103
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
85-
ON CONFLICT (id) DO UPDATE SET
104+
ON CONFLICT (project_id, id) DO UPDATE SET
86105
file_path = EXCLUDED.file_path,
87106
start_line = EXCLUDED.start_line,
88107
end_line = EXCLUDED.end_line,

0 commit comments

Comments (0)