Skip to content

Commit 5a1ef50

Browse files
disruptDevWS and claude committed
fix: silo parser extracts from headings, semantic report checks architecture dir
Bug qwibitai#2 — parseArchitectureBlueprint() now derives silo names from "### Silo N:" headings, skips non-silo tables (Parts 2-4), prioritizes URL column over Page column, and deduplicates by slug. Fixes 61 duplicates in "Uncategorized" → 22 pages across 5 named silos.

Bug qwibitai#3 — syncDwight() now checks architecture/ directory as fallback for semantically_similar_report.csv when auditor/ report is empty. Fixes semantic conflicts showing 0 despite data existing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 2702eb6 commit 5a1ef50

1 file changed

Lines changed: 73 additions & 11 deletions

File tree

scripts/sync-to-dashboard.ts

Lines changed: 73 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1120,21 +1120,33 @@ async function syncDwight(
11201120
const agentRunId = run?.id ?? null;
11211121

11221122
// Load semantically_similar_report.csv if it exists (supplements internal_all.csv)
1123-
const semReportFile = path.join(dir, 'semantically_similar_report.csv');
1123+
// Also check the architecture directory as a fallback — semantic analysis may live there
1124+
const semCandidates = [
1125+
path.join(dir, 'semantically_similar_report.csv'),
1126+
];
1127+
const archDir = agentDir(domain, 'architecture', date);
1128+
if (archDir) {
1129+
semCandidates.push(path.join(archDir, 'semantically_similar_report.csv'));
1130+
}
1131+
11241132
const semMap = new Map<string, { closestUrl: string; score: number }>();
1125-
if (fs.existsSync(semReportFile)) {
1133+
for (const semReportFile of semCandidates) {
1134+
if (!fs.existsSync(semReportFile)) continue;
11261135
let semCsv = fs.readFileSync(semReportFile, 'utf-8');
11271136
if (semCsv.charCodeAt(0) === 0xfeff) semCsv = semCsv.slice(1);
11281137
const semRows: Record<string, string>[] = csvParse(semCsv, { columns: true, skip_empty_lines: true, relax_column_count: true });
11291138
for (const sr of semRows) {
11301139
const addr = sr['Address'] || '';
11311140
const closest = sr['Closest Semantically Similar Address'] || '';
11321141
const score = parseFloat(sr['Semantic Similarity Score'] || '0') || 0;
1133-
if (addr && score > 0) {
1142+
if (addr && score > 0 && !semMap.has(addr)) {
11341143
semMap.set(addr, { closestUrl: closest, score });
11351144
}
11361145
}
1137-
if (semMap.size > 0) console.log(` [dwight] Loaded ${semMap.size} semantic pairs from report`);
1146+
if (semMap.size > 0) {
1147+
console.log(` [dwight] Loaded ${semMap.size} semantic pairs from ${path.basename(path.dirname(semReportFile))} report`);
1148+
break; // first source with data wins
1149+
}
11381150
}
11391151

11401152
// Filter to HTML pages only
@@ -1262,22 +1274,67 @@ function parseArchitectureBlueprint(filePath: string): { pages: ArchPage[]; mark
12621274
summary = summaryMatch[1].trim();
12631275
}
12641276

1265-
// Parse markdown tables for page assignments
1266-
// Look for tables with columns like: Slug/URL, Status, Silo, Role, Keyword, Volume, Action
1277+
// Parse silo page assignments from markdown tables under "### Silo" headings.
1278+
// Tables outside silo sections (cannibalization, metadata, schema, etc.) are skipped.
12671279
const pages: ArchPage[] = [];
1280+
const seenSlugs = new Set<string>();
1281+
1282+
// Build a map of character offsets → silo names from "### Silo N: Name" headings
1283+
const siloHeadings: Array<{ offset: number; name: string }> = [];
1284+
const siloHeadingRegex = /^###\s+Silo\s+\d+:\s*(.+)$/gm;
1285+
let siloMatch: RegExpExecArray | null;
1286+
while ((siloMatch = siloHeadingRegex.exec(markdown)) !== null) {
1287+
siloHeadings.push({ offset: siloMatch.index, name: siloMatch[1].trim() });
1288+
}
1289+
1290+
// Find the next heading after each silo to bound its range
1291+
const allHeadingOffsets: number[] = [];
1292+
const headingBoundaryRegex = /^#{2,3}\s/gm;
1293+
let hm: RegExpExecArray | null;
1294+
while ((hm = headingBoundaryRegex.exec(markdown)) !== null) {
1295+
allHeadingOffsets.push(hm.index);
1296+
}
1297+
1298+
// Build offsets for ## headings (Part boundaries) to limit silo scope
1299+
const partHeadingOffsets: number[] = [];
1300+
const partHeadingRegex = /^##\s/gm;
1301+
let pm: RegExpExecArray | null;
1302+
while ((pm = partHeadingRegex.exec(markdown)) !== null) {
1303+
partHeadingOffsets.push(pm.index);
1304+
}
1305+
1306+
function getSiloForOffset(offset: number): string | null {
1307+
for (let i = siloHeadings.length - 1; i >= 0; i--) {
1308+
if (offset >= siloHeadings[i].offset) {
1309+
// Bound at: next silo heading, or next ## heading after this silo, whichever is first
1310+
const nextSiloOffset = i + 1 < siloHeadings.length ? siloHeadings[i + 1].offset : Infinity;
1311+
const nextPartOffset = partHeadingOffsets.find((o) => o > siloHeadings[i].offset) ?? Infinity;
1312+
const bound = Math.min(nextSiloOffset, nextPartOffset);
1313+
if (offset < bound) return siloHeadings[i].name;
1314+
return null;
1315+
}
1316+
}
1317+
return null;
1318+
}
1319+
12681320
const tableRegex = /\|(.+)\|\n\|[-\s|:]+\|\n((?:\|.+\|\n?)*)/g;
12691321
let match: RegExpExecArray | null;
12701322

12711323
while ((match = tableRegex.exec(markdown)) !== null) {
1324+
// Only process tables inside silo sections
1325+
const siloName = getSiloForOffset(match.index);
1326+
if (!siloName) continue;
1327+
12721328
const headerLine = match[1];
12731329
const headers = headerLine.split('|').map((h) => h.trim().toLowerCase());
12741330

1275-
// Check if this table has page-related columns
1276-
const slugIdx = headers.findIndex((h) => h.includes('slug') || h.includes('url') || h.includes('page') || h.includes('path'));
1277-
const siloIdx = headers.findIndex((h) => h.includes('silo') || h.includes('cluster') || h.includes('topic'));
1331+
// Check if this table has page-related columns — prefer url/slug/path over generic 'page'
1332+
let slugIdx = headers.findIndex((h) => h.includes('slug') || h.includes('url') || h.includes('path'));
1333+
if (slugIdx < 0) slugIdx = headers.findIndex((h) => h.includes('page'));
12781334
if (slugIdx < 0) continue;
12791335

12801336
const statusIdx = headers.findIndex((h) => h.includes('status') || h.includes('exists') || h.includes('new'));
1337+
const siloColIdx = headers.findIndex((h) => h.includes('silo') || h.includes('cluster'));
12811338
const roleIdx = headers.findIndex((h) => h.includes('role') || h.includes('type'));
12821339
const kwIdx = headers.findIndex((h) => h.includes('keyword') || h.includes('target'));
12831340
const volIdx = headers.findIndex((h) => h.includes('volume') || h.includes('vol'));
@@ -1291,10 +1348,15 @@ function parseArchitectureBlueprint(filePath: string): { pages: ArchPage[]; mark
12911348
const slug = cells[slugIdx] ?? '';
12921349
if (!slug || slug.startsWith('-')) continue;
12931350

1351+
const cleanSlug = slug.replace(/^\//, '').replace(/`/g, '');
1352+
// Deduplicate: first silo assignment wins
1353+
if (seenSlugs.has(cleanSlug.toLowerCase())) continue;
1354+
seenSlugs.add(cleanSlug.toLowerCase());
1355+
12941356
pages.push({
1295-
url_slug: slug.replace(/^\//, '').replace(/`/g, ''),
1357+
url_slug: cleanSlug,
12961358
page_status: statusIdx >= 0 ? (cells[statusIdx] ?? '').toLowerCase().replace(/[*`]/g, '') : 'unknown',
1297-
silo_name: siloIdx >= 0 ? (cells[siloIdx] ?? '').replace(/[*`]/g, '') : '',
1359+
silo_name: siloColIdx >= 0 ? (cells[siloColIdx] ?? '').replace(/[*`]/g, '') : siloName,
12981360
role: roleIdx >= 0 ? (cells[roleIdx] ?? '').replace(/[*`]/g, '') : '',
12991361
primary_keyword: kwIdx >= 0 ? (cells[kwIdx] ?? '').replace(/[*`]/g, '') : '',
13001362
primary_keyword_volume: volIdx >= 0 ? parseInt(cells[volIdx] ?? '0', 10) || 0 : 0,

0 commit comments

Comments
 (0)