@@ -371,6 +371,73 @@ func TestHTML_FromChunk(t *testing.T) {
371371 want : "5 > 3 & 2 < 4" ,
372372 },
373373
374+ // --- ASPX / namespace-prefixed tag support ---
375+ {
376+ // Fragment ASPX pages (master-page-based) have no <html> wrapper;
377+ // they start with an <%@ Page %> directive and use <asp:Content>
378+ // as their root. The updated htmlTagPattern must match the
379+ // namespace-prefixed tag so the decoder doesn't return nil.
380+ // The <%@ ... %> directive is emitted as text by the HTML5 parser
381+ // (< followed by % is not a valid tag start, so < is treated literally).
382+ // Namespace element attributes (PlaceHolderMain, server) are also
383+ // emitted because <asp:Content> is a namespace-prefixed element.
384+ name : "aspx fragment with namespace tags and no html wrapper" ,
385+ chunk : & sources.Chunk {Data : []byte (
386+ "<%@ Page MasterPageFile=\" ~masterurl/default.master\" %>\n " +
387+ `<asp:Content ContentPlaceHolderId="PlaceHolderMain" runat="server">` +
388+ "<div>api_key=abc123</div>" +
389+ "</asp:Content>" ,
390+ )},
391+ want : "<%@ Page MasterPageFile=\" ~masterurl/default.master\" %>\n PlaceHolderMain\n server\n api_key=abc123" ,
392+ },
393+ {
394+ // ASP.NET server controls carry secrets in PascalCase attributes
395+ // (ConnectionString, Text, SelectCommand, etc.) that are not in
396+ // highSignalAttrs and do not use the data- prefix. Namespace-prefixed
397+ // elements must have all their attributes emitted.
398+ name : "aspx server control attribute extraction - ConnectionString" ,
399+ chunk : & sources.Chunk {Data : []byte (`<html><body><asp:SqlDataSource ConnectionString="Server=.;Password=hunter2" runat="server" /></body></html>` )},
400+ want : "Server=.;Password=hunter2\n server" ,
401+ },
402+ {
403+ // A plain Text attribute on a server control (common for labels,
404+ // textboxes, buttons that carry pre-filled values).
405+ name : "aspx server control attribute extraction - Text" ,
406+ chunk : & sources.Chunk {Data : []byte (`<html><body><asp:TextBox Text="api_key=secret123" runat="server" /></body></html>` )},
407+ want : "api_key=secret123\n server" ,
408+ },
409+ {
410+ // Entity-encoded HTML chunk with no literal tags. This simulates a
411+ // chunk that falls entirely within a large entity-encoded field (e.g.
412+ // SharePoint mso:CanvasContent1). The htmlEntityPattern must fire so
413+ // the decoder does not return nil. html.Parse decodes the entities to
414+ // literal HTML markup — the engine's iterativeDecode then re-applies
415+ // the HTML decoder at depth 2 to fully extract the secret. At depth 1
416+ // the output is the decoded markup string.
417+ name : "entity-encoded html chunk with no literal tags" ,
418+ chunk : & sources.Chunk {Data : []byte (`<div data-sp-rte=""><p>api_key=secret123</p></div>` )},
419+ want : `<div data-sp-rte=""><p>api_key=secret123</p></div>` ,
420+ },
421+ {
422+ // SharePoint stores page content entity-encoded inside mso:CanvasContent1.
423+ // The first html.Parse pass decodes the entities to a text string containing
424+ // literal HTML markup. The engine's iterativeDecode then re-applies the HTML
425+ // decoder at depth 2 to extract content from that inner markup.
426+ // At depth 1 (this test), the entity-encoded text is emitted as plain text
427+ // after residualEntityReplacer runs. The msdt:dt="string" attribute is also
428+ // emitted because mso:CanvasContent1 is a namespace-prefixed element.
429+ name : "sharepoint aspx mso:CanvasContent1 secret in nested html" ,
430+ chunk : & sources.Chunk {Data : []byte (
431+ `<html xmlns:mso="urn:schemas-microsoft-com:office:office">` +
432+ `<head><mso:CustomDocumentProperties>` +
433+ `<mso:CanvasContent1 msdt:dt="string">` +
434+ `<div><p>-----BEGIN OPENSSH PRIVATE KEY-----</p></div>` +
435+ `</mso:CanvasContent1>` +
436+ `</mso:CustomDocumentProperties></head></html>` ,
437+ )},
438+ want : "string\n <div><p>-----BEGIN OPENSSH PRIVATE KEY-----</p></div>" ,
439+ },
440+
374441 // --- Integration: all extraction types in one chunk ---
375442 {
376443 // Combines text nodes (split across spans), URL-decoded attribute
@@ -462,6 +529,25 @@ func TestLooksLikeHTML(t *testing.T) {
462529 {"XML-like" , "<root>content</root>" , true },
463530 {"just less-than" , "a < b" , false },
464531 {"html entity only" , "& <" , false },
532+
533+ // ASPX / namespace-prefixed tags (literal form)
534+ {"aspx namespace tag with space" , `<asp:Content runat="server">` , true },
535+ {"mso namespace tag with space" , `<mso:WikiField msdt:dt="string">` , true },
536+ {"multi-segment namespace tag" , `<WebPartPages:WebPartZone runat="server">` , true },
537+
538+ // Entity-encoded HTML tags (positive)
539+ {"entity-encoded div with space" , `<div class="foo">` , true },
540+ {"entity-encoded p with gt terminator" , `<p>hello</p>` , true },
541+ {"entity-encoded self-closing br" , `<br/>` , true },
542+ {"entity-encoded namespace tag" , `<asp:Content runat="server">` , true },
543+
544+ // Entity-encoded look-alikes that must NOT match (matching them would be a false positive)
545+ {"comparison operator entity-encoded" , `x < maxRetries exceeded` , false },
546+ {"comparison operator single word" , `result < threshold` , false },
547+ {"template placeholder entity-encoded" , `<YOUR_API_KEY>` , false },
548+ {"double-encoded entity not matched" , `&lt;div>` , false },
549+ {"json with digit after lt" , `{"lt": "<3"}` , false },
550+ {"sql comparison bare lt" , `SELECT * FROM t WHERE a < b` , false },
465551 }
466552
467553 for _ , tt := range tests {
0 commit comments