Skip to content

Commit 26eae1f

Browse files
authored
[SCAN-795] HTML decoder: ASPX and entity-encoded HTML support (#4981)
* support aspx decoding in html decoder * check isNamespaced before prefix comparison
1 parent 6c8f640 commit 26eae1f

3 files changed

Lines changed: 121 additions & 11 deletions

File tree

main.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,9 @@ func run(state overseer.State, logSync func() error) {
525525
// OSS Default using github graphql api for issues, pr's and comments
526526
feature.UseGithubGraphQLAPI.Store(false)
527527

528+
// OSS default: the HTML decoder is enabled
529+
feature.HTMLDecoderEnabled.Store(true)
530+
528531
// New detector flags
529532
feature.PineconeDetectorEnabled.Store(true)
530533
feature.CloudinaryDetectorEnabled.Store(true)

pkg/decoders/html.go

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,30 @@ func (d *HTML) Type() detectorspb.DecoderType {
2323
return detectorspb.DecoderType_HTML
2424
}
2525

26-
var htmlTagPattern = regexp.MustCompile(`<[a-zA-Z][a-zA-Z0-9]*[\s>/]`)
26+
// htmlTagPattern matches standard HTML/XML opening tags. The optional namespace
27+
// group (?::[a-zA-Z][a-zA-Z0-9]*)? also matches ASPX/XML namespace-prefixed
28+
// tags such as <asp:Content and <mso:CanvasContent1.
29+
var htmlTagPattern = regexp.MustCompile(`<[a-zA-Z][a-zA-Z0-9]*(?::[a-zA-Z][a-zA-Z0-9]*)?[\s>/]`)
30+
31+
// htmlEntityPattern matches entity-encoded HTML tags (e.g. &lt;div , &lt;p&gt;,
32+
// &lt;br/>). The terminator (?:[\s>/]|&gt;) covers both literal > and its
33+
// entity form, so bare-tag forms like &lt;p&gt; are detected. A letter must
34+
// follow &lt; directly, which rejects spaced comparison operators in
35+
// entity-encoded text (in x &lt; threshold a space follows &lt;). Requiring
36+
// a terminator right after the tag name rejects template placeholders
37+
// (&lt;YOUR_KEY&gt;, where _ ends the alphanumeric run before any &gt;).
38+
var htmlEntityPattern = regexp.MustCompile(`&lt;[a-zA-Z][a-zA-Z0-9]*(?::[a-zA-Z][a-zA-Z0-9]*)?(?:[\s>/]|&gt;)`)
2739

2840
// highSignalAttrs are attribute names whose values are extracted into the
2941
// decoded output because they commonly contain URLs, tokens, or other secrets.
3042
var highSignalAttrs = map[string]bool{
31-
"href": true,
32-
"src": true,
33-
"action": true,
34-
"value": true,
35-
"content": true,
36-
"alt": true,
37-
"title": true,
43+
"href": true,
44+
"src": true,
45+
"action": true,
46+
"value": true,
47+
"content": true,
48+
"alt": true,
49+
"title": true,
3850
}
3951

4052
// syntaxHighlightPrefixes lists CSS class prefixes used by syntax highlighting
@@ -116,7 +128,7 @@ func (d *HTML) FromChunk(chunk *sources.Chunk) *DecodableChunk {
116128
}
117129

118130
func looksLikeHTML(data []byte) bool {
119-
return htmlTagPattern.Match(data)
131+
return htmlTagPattern.Match(data) || htmlEntityPattern.Match(data)
120132
}
121133

122134
func extractHTML(data []byte) []byte {
@@ -196,9 +208,18 @@ func hasSyntaxHighlightClass(n *html.Node) bool {
196208
}
197209

198210
func emitAttributes(buf *bytes.Buffer, n *html.Node) {
211+
// Namespace-prefixed elements (e.g. asp:textbox, mso:canvascontent1) are
212+
// ASP.NET server controls or XML metadata nodes. All their attributes are
213+
// data payloads that may carry secrets (ConnectionString, Text, SelectCommand,
214+
// etc.), so we emit every attribute rather than filtering by highSignalAttrs.
215+
// After html.Parse the colon is preserved in n.Data even though the name is
216+
// lowercased, making strings.Contains a reliable namespace check.
217+
isNamespaced := strings.Contains(n.Data, ":")
218+
199219
for _, attr := range n.Attr {
200-
isDataAttr := strings.HasPrefix(attr.Key, "data-")
201-
if !highSignalAttrs[attr.Key] && !isDataAttr {
220+
if !isNamespaced &&
221+
!highSignalAttrs[attr.Key] &&
222+
!strings.HasPrefix(attr.Key, "data-") {
202223
continue
203224
}
204225
val := strings.TrimSpace(attr.Val)

pkg/decoders/html_test.go

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,73 @@ func TestHTML_FromChunk(t *testing.T) {
371371
want: "5 > 3 & 2 < 4",
372372
},
373373

374+
// --- ASPX / namespace-prefixed tag support ---
375+
{
376+
// Fragment ASPX pages (master-page-based) have no <html> wrapper;
377+
// they start with an <%@ Page %> directive and use <asp:Content>
378+
// as their root. The updated htmlTagPattern must match the
379+
// namespace-prefixed tag so the decoder doesn't return nil.
380+
// The <%@ ... %> directive is emitted as text by the HTML5 parser
381+
// (< followed by % is not a valid tag start, so < is treated literally).
382+
// The namespace element's attribute values (PlaceHolderMain, server) are
383+
// also emitted because <asp:Content> is a namespace-prefixed element.
384+
name: "aspx fragment with namespace tags and no html wrapper",
385+
chunk: &sources.Chunk{Data: []byte(
386+
"<%@ Page MasterPageFile=\"~masterurl/default.master\" %>\n" +
387+
`<asp:Content ContentPlaceHolderId="PlaceHolderMain" runat="server">` +
388+
"<div>api_key=abc123</div>" +
389+
"</asp:Content>",
390+
)},
391+
want: "<%@ Page MasterPageFile=\"~masterurl/default.master\" %>\nPlaceHolderMain\nserver\napi_key=abc123",
392+
},
393+
{
394+
// ASP.NET server controls carry secrets in PascalCase attributes
395+
// (ConnectionString, Text, SelectCommand, etc.) that are not in
396+
// highSignalAttrs and do not use the data- prefix. Namespace-prefixed
397+
// elements must have all their attributes emitted.
398+
name: "aspx server control attribute extraction - ConnectionString",
399+
chunk: &sources.Chunk{Data: []byte(`<html><body><asp:SqlDataSource ConnectionString="Server=.;Password=hunter2" runat="server" /></body></html>`)},
400+
want: "Server=.;Password=hunter2\nserver",
401+
},
402+
{
403+
// A plain Text attribute on a server control (common for labels,
404+
// textboxes, buttons that carry pre-filled values).
405+
name: "aspx server control attribute extraction - Text",
406+
chunk: &sources.Chunk{Data: []byte(`<html><body><asp:TextBox Text="api_key=secret123" runat="server" /></body></html>`)},
407+
want: "api_key=secret123\nserver",
408+
},
409+
{
410+
// Entity-encoded HTML chunk with no literal tags. This simulates a
411+
// chunk that falls entirely within a large entity-encoded field (e.g.
412+
// SharePoint mso:CanvasContent1). The htmlEntityPattern must fire so
413+
// the decoder does not return nil. html.Parse decodes the entities to
414+
// literal HTML markup — the engine's iterativeDecode then re-applies
415+
// the HTML decoder at depth 2 to fully extract the secret. At depth 1
416+
// the output is the decoded markup string.
417+
name: "entity-encoded html chunk with no literal tags",
418+
chunk: &sources.Chunk{Data: []byte(`&lt;div data-sp-rte=&quot;&quot;&gt;&lt;p&gt;api_key=secret123&lt;/p&gt;&lt;/div&gt;`)},
419+
want: `<div data-sp-rte=""><p>api_key=secret123</p></div>`,
420+
},
421+
{
422+
// SharePoint stores page content entity-encoded inside mso:CanvasContent1.
423+
// The first html.Parse pass decodes the entities to a text string containing
424+
// literal HTML markup. The engine's iterativeDecode then re-applies the HTML
425+
// decoder at depth 2 to extract content from that inner markup.
426+
// At depth 1 (this test), the entity-encoded text is emitted as plain text
427+
// after residualEntityReplacer runs. The msdt:dt="string" attribute is also
428+
// emitted because mso:CanvasContent1 is a namespace-prefixed element.
429+
name: "sharepoint aspx mso:CanvasContent1 secret in nested html",
430+
chunk: &sources.Chunk{Data: []byte(
431+
`<html xmlns:mso="urn:schemas-microsoft-com:office:office">` +
432+
`<head><mso:CustomDocumentProperties>` +
433+
`<mso:CanvasContent1 msdt:dt="string">` +
434+
`&lt;div&gt;&lt;p&gt;-----BEGIN OPENSSH PRIVATE KEY-----&lt;/p&gt;&lt;/div&gt;` +
435+
`</mso:CanvasContent1>` +
436+
`</mso:CustomDocumentProperties></head></html>`,
437+
)},
438+
want: "string\n<div><p>-----BEGIN OPENSSH PRIVATE KEY-----</p></div>",
439+
},
440+
374441
// --- Integration: all extraction types in one chunk ---
375442
{
376443
// Combines text nodes (split across spans), URL-decoded attribute
@@ -462,6 +529,25 @@ func TestLooksLikeHTML(t *testing.T) {
462529
{"XML-like", "<root>content</root>", true},
463530
{"just less-than", "a < b", false},
464531
{"html entity only", "&amp; &lt;", false},
532+
533+
// ASPX / namespace-prefixed tags (literal form)
534+
{"aspx namespace tag with space", `<asp:Content runat="server">`, true},
535+
{"mso namespace tag with space", `<mso:WikiField msdt:dt="string">`, true},
536+
{"multi-segment namespace tag", `<WebPartPages:WebPartZone runat="server">`, true},
537+
538+
// Entity-encoded HTML tags (positive)
539+
{"entity-encoded div with space", `&lt;div class="foo"&gt;`, true},
540+
{"entity-encoded p with gt terminator", `&lt;p&gt;hello&lt;/p&gt;`, true},
541+
{"entity-encoded self-closing br", `&lt;br/&gt;`, true},
542+
{"entity-encoded namespace tag", `&lt;asp:Content runat="server"&gt;`, true},
543+
544+
// Entity-encoded false positives that must NOT match
545+
{"comparison operator entity-encoded", `x &lt; maxRetries exceeded`, false},
546+
{"comparison operator single word", `result &lt; threshold`, false},
547+
{"template placeholder entity-encoded", `&lt;YOUR_API_KEY&gt;`, false},
548+
{"double-encoded entity not matched", `&amp;lt;div&gt;`, false},
549+
{"json with digit after lt", `{"lt": "<3"}`, false},
550+
{"sql comparison bare lt", `SELECT * FROM t WHERE a < b`, false},
465551
}
466552

467553
for _, tt := range tests {

0 commit comments

Comments
 (0)