@@ -1120,21 +1120,33 @@ async function syncDwight(
11201120 const agentRunId = run ?. id ?? null ;
11211121
11221122 // Load semantically_similar_report.csv if it exists (supplements internal_all.csv)
1123- const semReportFile = path . join ( dir , 'semantically_similar_report.csv' ) ;
1123+ // Also check the architecture directory as a fallback — semantic analysis may live there
1124+ const semCandidates = [
1125+ path . join ( dir , 'semantically_similar_report.csv' ) ,
1126+ ] ;
1127+ const archDir = agentDir ( domain , 'architecture' , date ) ;
1128+ if ( archDir ) {
1129+ semCandidates . push ( path . join ( archDir , 'semantically_similar_report.csv' ) ) ;
1130+ }
1131+
11241132 const semMap = new Map < string , { closestUrl : string ; score : number } > ( ) ;
1125- if ( fs . existsSync ( semReportFile ) ) {
1133+ for ( const semReportFile of semCandidates ) {
1134+ if ( ! fs . existsSync ( semReportFile ) ) continue ;
11261135 let semCsv = fs . readFileSync ( semReportFile , 'utf-8' ) ;
11271136 if ( semCsv . charCodeAt ( 0 ) === 0xfeff ) semCsv = semCsv . slice ( 1 ) ;
11281137 const semRows : Record < string , string > [ ] = csvParse ( semCsv , { columns : true , skip_empty_lines : true , relax_column_count : true } ) ;
11291138 for ( const sr of semRows ) {
11301139 const addr = sr [ 'Address' ] || '' ;
11311140 const closest = sr [ 'Closest Semantically Similar Address' ] || '' ;
11321141 const score = parseFloat ( sr [ 'Semantic Similarity Score' ] || '0' ) || 0 ;
1133- if ( addr && score > 0 ) {
1142+ if ( addr && score > 0 && ! semMap . has ( addr ) ) {
11341143 semMap . set ( addr , { closestUrl : closest , score } ) ;
11351144 }
11361145 }
1137- if ( semMap . size > 0 ) console . log ( ` [dwight] Loaded ${ semMap . size } semantic pairs from report` ) ;
1146+ if ( semMap . size > 0 ) {
1147+ console . log ( ` [dwight] Loaded ${ semMap . size } semantic pairs from ${ path . basename ( path . dirname ( semReportFile ) ) } report` ) ;
1148+ break ; // first source with data wins
1149+ }
11381150 }
11391151
11401152 // Filter to HTML pages only
@@ -1262,22 +1274,67 @@ function parseArchitectureBlueprint(filePath: string): { pages: ArchPage[]; mark
12621274 summary = summaryMatch [ 1 ] . trim ( ) ;
12631275 }
12641276
1265- // Parse markdown tables for page assignments
1266- // Look for tables with columns like: Slug/URL, Status, Silo, Role, Keyword, Volume, Action
1277+ // Parse silo page assignments from markdown tables under "### Silo" headings.
1278+ // Tables outside silo sections (cannibalization, metadata, schema, etc.) are skipped.
12671279 const pages : ArchPage [ ] = [ ] ;
1280+ const seenSlugs = new Set < string > ( ) ;
1281+
1282+ // Collect the character offset and name of every "### Silo N: Name" heading, in document order
1283+ const siloHeadings : Array < { offset : number ; name : string } > = [ ] ;
1284+ const siloHeadingRegex = / ^ # # # \s + S i l o \s + \d + : \s * ( .+ ) $ / gm;
1285+ let siloMatch : RegExpExecArray | null ;
1286+ while ( ( siloMatch = siloHeadingRegex . exec ( markdown ) ) !== null ) {
1287+ siloHeadings . push ( { offset : siloMatch . index , name : siloMatch [ 1 ] . trim ( ) } ) ;
1288+ }
1289+
1290+ // Collect the character offsets of all ## and ### headings (candidate boundaries for a silo's range)
1291+ const allHeadingOffsets : number [ ] = [ ] ;
1292+ const headingBoundaryRegex = / ^ # { 2 , 3 } \s / gm;
1293+ let hm : RegExpExecArray | null ;
1294+ while ( ( hm = headingBoundaryRegex . exec ( markdown ) ) !== null ) {
1295+ allHeadingOffsets . push ( hm . index ) ;
1296+ }
1297+
1298+ // Build offsets for ## headings (Part boundaries) to limit silo scope
1299+ const partHeadingOffsets : number [ ] = [ ] ;
1300+ const partHeadingRegex = / ^ # # \s / gm;
1301+ let pm : RegExpExecArray | null ;
1302+ while ( ( pm = partHeadingRegex . exec ( markdown ) ) !== null ) {
1303+ partHeadingOffsets . push ( pm . index ) ;
1304+ }
1305+
1306+ function getSiloForOffset ( offset : number ) : string | null { // Returns the name of the silo section containing `offset`, or null when the offset lies outside every silo section.
1307+ for ( let i = siloHeadings . length - 1 ; i >= 0 ; i -- ) { // Walk backwards so the first hit is the closest silo heading at or before `offset`.
1308+ if ( offset >= siloHeadings [ i ] . offset ) {
1309+ // The silo section ends at the next silo heading or at the first ## heading after this silo heading, whichever comes first
1310+ const nextSiloOffset = i + 1 < siloHeadings . length ? siloHeadings [ i + 1 ] . offset : Infinity ; // Infinity when this is the last silo heading.
1311+ const nextPartOffset = partHeadingOffsets . find ( ( o ) => o > siloHeadings [ i ] . offset ) ?? Infinity ; // Relies on partHeadingOffsets being in ascending (match) order; Infinity when no ## heading follows.
1312+ const bound = Math . min ( nextSiloOffset , nextPartOffset ) ;
1313+ if ( offset < bound ) return siloHeadings [ i ] . name ;
1314+ return null ; // Past the end of the nearest preceding silo section, so not inside any silo.
1315+ }
1316+ }
1317+ return null ; // No silo heading starts at or before `offset`.
1318+ }
1319+
12681320 const tableRegex = / \| ( .+ ) \| \n \| [ - \s | : ] + \| \n ( (?: \| .+ \| \n ? ) * ) / g;
12691321 let match : RegExpExecArray | null ;
12701322
12711323 while ( ( match = tableRegex . exec ( markdown ) ) !== null ) {
1324+ // Only process tables inside silo sections
1325+ const siloName = getSiloForOffset ( match . index ) ;
1326+ if ( ! siloName ) continue ;
1327+
12721328 const headerLine = match [ 1 ] ;
12731329 const headers = headerLine . split ( '|' ) . map ( ( h ) => h . trim ( ) . toLowerCase ( ) ) ;
12741330
1275- // Check if this table has page-related columns
1276- const slugIdx = headers . findIndex ( ( h ) => h . includes ( 'slug' ) || h . includes ( 'url' ) || h . includes ( 'page ') || h . includes ( 'path' ) ) ;
1277- const siloIdx = headers . findIndex ( ( h ) => h . includes ( 'silo' ) || h . includes ( 'cluster' ) || h . includes ( 'topic ') ) ;
1331+ // Check if this table has page-related columns — prefer url/slug/path over generic 'page'
1332+ let slugIdx = headers . findIndex ( ( h ) => h . includes ( 'slug' ) || h . includes ( 'url' ) || h . includes ( 'path' ) ) ;
1333+ if ( slugIdx < 0 ) slugIdx = headers . findIndex ( ( h ) => h . includes ( 'page ' ) ) ;
12781334 if ( slugIdx < 0 ) continue ;
12791335
12801336 const statusIdx = headers . findIndex ( ( h ) => h . includes ( 'status' ) || h . includes ( 'exists' ) || h . includes ( 'new' ) ) ;
1337+ const siloColIdx = headers . findIndex ( ( h ) => h . includes ( 'silo' ) || h . includes ( 'cluster' ) ) ;
12811338 const roleIdx = headers . findIndex ( ( h ) => h . includes ( 'role' ) || h . includes ( 'type' ) ) ;
12821339 const kwIdx = headers . findIndex ( ( h ) => h . includes ( 'keyword' ) || h . includes ( 'target' ) ) ;
12831340 const volIdx = headers . findIndex ( ( h ) => h . includes ( 'volume' ) || h . includes ( 'vol' ) ) ;
@@ -1291,10 +1348,15 @@ function parseArchitectureBlueprint(filePath: string): { pages: ArchPage[]; mark
12911348 const slug = cells [ slugIdx ] ?? '' ;
12921349 if ( ! slug || slug . startsWith ( '-' ) ) continue ;
12931350
1351+ const cleanSlug = slug . replace ( / ^ \/ / , '' ) . replace ( / ` / g, '' ) ;
1352+ // Deduplicate: first silo assignment wins
1353+ if ( seenSlugs . has ( cleanSlug . toLowerCase ( ) ) ) continue ;
1354+ seenSlugs . add ( cleanSlug . toLowerCase ( ) ) ;
1355+
12941356 pages . push ( {
1295- url_slug : slug . replace ( / ^ \/ / , '' ) . replace ( / ` / g , '' ) ,
1357+ url_slug : cleanSlug ,
12961358 page_status : statusIdx >= 0 ? ( cells [ statusIdx ] ?? '' ) . toLowerCase ( ) . replace ( / [ * ` ] / g, '' ) : 'unknown' ,
1297- silo_name : siloIdx >= 0 ? ( cells [ siloIdx ] ?? '' ) . replace ( / [ * ` ] / g, '' ) : '' ,
1359+ silo_name : siloColIdx >= 0 ? ( cells [ siloColIdx ] ?? '' ) . replace ( / [ * ` ] / g, '' ) : siloName ,
12981360 role : roleIdx >= 0 ? ( cells [ roleIdx ] ?? '' ) . replace ( / [ * ` ] / g, '' ) : '' ,
12991361 primary_keyword : kwIdx >= 0 ? ( cells [ kwIdx ] ?? '' ) . replace ( / [ * ` ] / g, '' ) : '' ,
13001362 primary_keyword_volume : volIdx >= 0 ? parseInt ( cells [ volIdx ] ?? '0' , 10 ) || 0 : 0 ,
0 commit comments