Skip to content

Commit 54580dd

Browse files
fix: worktree dedup and multi-project chunk storage (#142)
* fix: worktree dedup and multi-project chunk storage Fix two issues that break content-hash dedup across git worktrees sharing a PostgreSQL backend: 1. Files with documents but no chunks are permanently skipped. When a prior indexing run creates document records but fails to embed (e.g., missing API key), subsequent runs skip those files because the mod-time gate and hash check don't verify chunk existence. Fix: check len(doc.ChunkIDs) > 0 before skipping in IndexAllWithBatchProgress and NeedsReindex. 2. Chunk IDs collide across projects sharing the same database. Chunk IDs use relative paths (e.g., src/App.tsx_0). With the primary key on just (id), ON CONFLICT upserts from a second project overwrite the first project's chunks without updating project_id — the second project ends up with 0 chunks. Fix: migrate primary key to (project_id, id) and update the ON CONFLICT clause to match. Tested with 3 projects sharing one postgres instance: - 100% content-hash cache hit rate for identical code - ~9s per worktree (vs ~5min without cache) - Each project gets its own chunk rows * fix: update tests to match new ChunkIDs-aware skip logic The indexer now calls GetDocument before the lastIndexTime gate and requires doc.ChunkIDs to be non-empty for both the time-based and hash-based skip paths. This prevents silently skipping files whose prior indexing run created a document but failed to embed chunks. Update tests to seed documents with ChunkIDs where the test expects a skip, and remove assertions that GetDocument should not be called (it is now called by design before the time check).
1 parent 135f5bc commit 54580dd

File tree

5 files changed

+68
-26
lines changed

5 files changed

+68
-26
lines changed

cli/watch_initialscan_test.go

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,16 @@ func TestRunInitialScan_SkipsIndexedFileByLastIndexTime(t *testing.T) {
142142
vecStore := store.NewGOBStore(filepath.Join(projectRoot, "index.gob"))
143143
idx := indexer.NewIndexer(projectRoot, vecStore, emb, chunker, scanner, time.Now().Add(1*time.Hour))
144144

145+
// Seed a document with ChunkIDs so the lastIndexTime gate can skip it.
146+
// The new logic requires doc != nil && len(doc.ChunkIDs) > 0 to skip.
147+
if err := vecStore.SaveDocument(ctx, store.Document{
148+
Path: "main.go",
149+
Hash: "seeded",
150+
ChunkIDs: []string{"c1"},
151+
}); err != nil {
152+
t.Fatalf("failed to seed document: %v", err)
153+
}
154+
145155
symbolStore := trace.NewGOBSymbolStore(filepath.Join(projectRoot, "symbols.gob"))
146156
defer symbolStore.Close()
147157

@@ -210,9 +220,10 @@ func TestHandleFileEvent_SkipsUnchangedFile(t *testing.T) {
210220
t.Fatalf("failed to scan source file: %v", err)
211221
}
212222
if err := vecStore.SaveDocument(ctx, store.Document{
213-
Path: "main.go",
214-
Hash: fileInfo.Hash,
215-
ModTime: time.Unix(fileInfo.ModTime, 0),
223+
Path: "main.go",
224+
Hash: fileInfo.Hash,
225+
ModTime: time.Unix(fileInfo.ModTime, 0),
226+
ChunkIDs: []string{"c1"},
216227
}); err != nil {
217228
t.Fatalf("failed to seed document: %v", err)
218229
}
@@ -307,9 +318,10 @@ func TestHandleWorkspaceFileEvent_SkipsUnchangedFile(t *testing.T) {
307318
projectName := "proj"
308319
prefixedPath := workspaceName + "/" + projectName + "/proj/main.go"
309320
if err := st.SaveDocument(ctx, store.Document{
310-
Path: prefixedPath,
311-
Hash: fileInfo.Hash,
312-
ModTime: time.Unix(fileInfo.ModTime, 0),
321+
Path: prefixedPath,
322+
Hash: fileInfo.Hash,
323+
ModTime: time.Unix(fileInfo.ModTime, 0),
324+
ChunkIDs: []string{"c1"},
313325
}); err != nil {
314326
t.Fatalf("failed to seed workspace document: %v", err)
315327
}

indexer/indexer.go

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,16 @@ func (idx *Indexer) IndexAllWithBatchProgress(ctx context.Context, onProgress Pr
115115
})
116116
}
117117

118-
// Skip files modified before lastIndexTime
119-
if !idx.lastIndexTime.IsZero() {
118+
// Fetch the document once — used by both the mod-time gate and hash check.
119+
doc, err := idx.store.GetDocument(ctx, fileMeta.Path)
120+
if err != nil {
121+
return nil, fmt.Errorf("failed to get document %s: %w", fileMeta.Path, err)
122+
}
123+
124+
// Skip files modified before lastIndexTime — but only if they have chunks.
125+
// Files with no chunks need re-indexing even if their mod_time is old
126+
// (e.g., a prior indexing run created the document but failed to embed).
127+
if !idx.lastIndexTime.IsZero() && doc != nil && len(doc.ChunkIDs) > 0 {
120128
fileModTime := time.Unix(fileMeta.ModTime, 0)
121129
if fileModTime.Before(idx.lastIndexTime) || fileModTime.Equal(idx.lastIndexTime) {
122130
stats.FilesSkipped++
@@ -125,12 +133,6 @@ func (idx *Indexer) IndexAllWithBatchProgress(ctx context.Context, onProgress Pr
125133
}
126134
}
127135

128-
// Check if file needs reindexing
129-
doc, err := idx.store.GetDocument(ctx, fileMeta.Path)
130-
if err != nil {
131-
return nil, fmt.Errorf("failed to get document %s: %w", fileMeta.Path, err)
132-
}
133-
134136
// Load file content and hash only after metadata filtering.
135137
file, err := idx.scanner.ScanFile(fileMeta.Path)
136138
if err != nil {
@@ -145,9 +147,9 @@ func (idx *Indexer) IndexAllWithBatchProgress(ctx context.Context, onProgress Pr
145147
continue
146148
}
147149

148-
if doc != nil && doc.Hash == file.Hash {
150+
if doc != nil && doc.Hash == file.Hash && len(doc.ChunkIDs) > 0 {
149151
delete(existingMap, fileMeta.Path)
150-
continue // File unchanged
152+
continue // File unchanged and has chunks
151153
}
152154

153155
filesToIndex = append(filesToIndex, *file)
@@ -671,5 +673,10 @@ func (idx *Indexer) NeedsReindex(ctx context.Context, path string, hash string)
671673
return true, nil
672674
}
673675

674-
return doc.Hash != hash, nil
676+
// Reindex if hash changed OR if document has no chunks (prior indexing failed)
677+
if doc.Hash != hash || len(doc.ChunkIDs) == 0 {
678+
return true, nil
679+
}
680+
681+
return false, nil
675682
}

indexer/indexer_branchswitch_test.go

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ import (
99
"strings"
1010
"testing"
1111
"time"
12+
13+
"github.com/yoanbernabeu/grepai/store"
1214
)
1315

1416
func createGoFixtureFiles(tb testing.TB, root string, fileCount int) {
@@ -33,6 +35,16 @@ func TestIndexAllWithProgress_BranchSwitchSkipsBulkWithoutLookupOrEmbedding(t *t
3335
}
3436

3537
mockStore := newMockStore()
38+
// Seed documents with ChunkIDs so the lastIndexTime gate can skip them.
39+
// The new logic requires doc != nil && len(doc.ChunkIDs) > 0 to skip.
40+
for i := range 200 {
41+
path := fmt.Sprintf("file_%04d.go", i)
42+
mockStore.documents[path] = store.Document{
43+
Path: path,
44+
Hash: "seeded",
45+
ChunkIDs: []string{"c1"},
46+
}
47+
}
3648
mockEmbedder := newMockEmbedder()
3749
scanner := NewScanner(tmpDir, ignoreMatcher)
3850
chunker := NewChunker(512, 50)
@@ -55,9 +67,6 @@ func TestIndexAllWithProgress_BranchSwitchSkipsBulkWithoutLookupOrEmbedding(t *t
5567
if stats.FilesSkipped < 200 {
5668
t.Fatalf("expected at least 200 skipped files, got %d", stats.FilesSkipped)
5769
}
58-
if mockStore.getDocCalled {
59-
t.Fatal("GetDocument should not be called for files skipped by lastIndexTime")
60-
}
6170
if mockEmbedder.embedCalled {
6271
t.Fatal("embedder should not be called when all files are skipped")
6372
}

indexer/indexer_test.go

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -248,11 +248,6 @@ func TestIndexAllWithProgress_UnchangedFilesSkipped(t *testing.T) {
248248
t.Errorf("expected 0 chunks created, got %d", stats.ChunksCreated)
249249
}
250250

251-
// Assert: No documents should be retrieved (skipped before GetDocument call)
252-
if mockStore.getDocCalled {
253-
t.Error("GetDocument should not be called for files with matching ModTime")
254-
}
255-
256251
// Assert: No documents should be saved
257252
if mockStore.saveDocCalled {
258253
t.Error("SaveDocument should not be called for unchanged files")

store/postgres.go

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,21 @@ func (s *PostgresStore) ensureSchema(ctx context.Context) error {
6363
`ALTER TABLE chunks ADD COLUMN IF NOT EXISTS content_hash TEXT DEFAULT ''`,
6464
`CREATE INDEX IF NOT EXISTS idx_chunks_content_hash ON chunks(content_hash) WHERE content_hash != ''`,
6565
buildEnsureVectorSQL(s.dimensions),
66+
// Migrate chunks primary key from (id) to (project_id, id) so that
67+
// worktrees sharing the same database get their own chunk rows instead
68+
// of silently overwriting each other via ON CONFLICT (id).
69+
`DO $$
70+
BEGIN
71+
IF EXISTS (
72+
SELECT 1 FROM pg_constraint
73+
WHERE conrelid = 'chunks'::regclass
74+
AND contype = 'p'
75+
AND array_length(conkey, 1) = 1
76+
) THEN
77+
ALTER TABLE chunks DROP CONSTRAINT chunks_pkey;
78+
ALTER TABLE chunks ADD PRIMARY KEY (project_id, id);
79+
END IF;
80+
END $$`,
6681
}
6782

6883
for _, query := range queries {
@@ -75,14 +90,18 @@ func (s *PostgresStore) ensureSchema(ctx context.Context) error {
7590
}
7691

7792
func (s *PostgresStore) SaveChunks(ctx context.Context, chunks []Chunk) error {
93+
if len(chunks) == 0 {
94+
return nil
95+
}
96+
7897
batch := &pgx.Batch{}
7998

8099
for _, chunk := range chunks {
81100
vec := pgvector.NewVector(chunk.Vector)
82101
batch.Queue(
83102
`INSERT INTO chunks (id, project_id, file_path, start_line, end_line, content, vector, hash, content_hash, updated_at)
84103
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
85-
ON CONFLICT (id) DO UPDATE SET
104+
ON CONFLICT (project_id, id) DO UPDATE SET
86105
file_path = EXCLUDED.file_path,
87106
start_line = EXCLUDED.start_line,
88107
end_line = EXCLUDED.end_line,

0 commit comments

Comments (0)