Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 87 additions & 22 deletions common/pagetypeclassifier/pagetypeclassifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ package pagetypeclassifier

import (
_ "embed"
"sync"

"fmt"
"strings"
"sync"

htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
"github.com/microcosm-cc/bluemonday"
Expand Down Expand Up @@ -41,30 +41,29 @@ var (
sanitizerPolicyOnce sync.Once
)

// getSanitizerPolicy returns an aggressive HTML sanitizer policy that strips
// most elements to reduce nesting depth and prevent parser stack overflow.
// getSanitizerPolicy returns an ultra-aggressive HTML sanitizer policy that strips
// almost all elements to minimize nesting depth and prevent parser stack overflow.
func getSanitizerPolicy() *bluemonday.Policy {
sanitizerPolicyOnce.Do(func() {
p := bluemonday.NewPolicy()
// Allow only basic text elements with minimal nesting
// This aggressive policy helps reduce nesting depth significantly
p.AllowElements("p", "br", "div", "span", "h1", "h2", "h3", "h4", "h5", "h6")
p.AllowElements("strong", "em", "b", "i", "u")
p.AllowElements("ul", "ol", "li")
p.AllowElements("blockquote", "pre", "code")
// Allow basic attributes but no style (which can cause nesting issues)
p.AllowStandardAttributes()
// Ultra-aggressive policy: Allow only the most basic text elements
// to minimize nesting and reduce parser stack depth
p.AllowElements("p", "br", "h1", "h2", "h3", "h4", "h5", "h6")
p.AllowElements("strong", "em", "b", "i")
// Remove div, span, ul, ol, li as they can create deep nesting
// No attributes allowed to prevent style-based nesting issues
sanitizerPolicy = p
})
return sanitizerPolicy
}

// htmlToText safely converts HTML to text and protects against panics from Go's HTML parser.
// htmlToText safely converts HTML to text with multiple fallback strategies.
// The 512 node limit in golang.org/x/net/html is hardcoded and cannot be increased.
// Strategy:
// 1. Always sanitize HTML with bluemonday first to remove useless elements and reduce nesting
// 2. Convert sanitized HTML to markdown
// 3. If conversion panics, recover and return empty string with error
// 1. Length limit the input HTML to prevent massive documents
// 2. Sanitize HTML aggressively with bluemonday to reduce nesting
// 3. Convert sanitized HTML to markdown with panic recovery
// 4. If conversion fails, fallback to plain text extraction
func htmlToText(html string) (text string, err error) {
defer func() {
if r := recover(); r != nil {
Expand All @@ -73,19 +72,85 @@ func htmlToText(html string) (text string, err error) {
}
}()

// First, sanitize HTML with bluemonday to strip useless elements and reduce nesting
// Limit input size to prevent processing extremely large HTML documents
const maxHTMLSize = 1024 * 1024 // 1MB limit
if len(html) > maxHTMLSize {
html = html[:maxHTMLSize]
}

// First, sanitize HTML with ultra-aggressive bluemonday policy
sanitizedHTML := getSanitizerPolicy().Sanitize(html)

// If sanitization failed or produced empty result, return empty
// If sanitization failed or produced empty result, try plain text fallback
if sanitizedHTML == "" {
return "", nil
return extractPlainText(html), nil
}

// Convert sanitized HTML to markdown
text, err = htmltomarkdown.ConvertString(sanitizedHTML)
if err != nil || text == "" {
return "", err
if err != nil {
// If markdown conversion fails, fallback to plain text extraction
return extractPlainText(sanitizedHTML), nil
}

if text == "" {
// If result is empty, try plain text fallback
return extractPlainText(sanitizedHTML), nil
}

return text, nil
}

return
// extractPlainText is a parser-free fallback that reduces HTML to plain text.
// It is used when the HTML parser fails due to complexity or nesting depth, so
// it relies on plain string scanning only.
//
// Processing steps:
//  1. Drop <script>...</script> and <style>...</style> elements together with
//     their content. Tag names are matched ASCII case-insensitively.
//  2. Drop every remaining tag (the text between '<' and '>'), writing a space
//     for each '>' so that adjacent text nodes do not run together.
//  3. Collapse every run of whitespace into a single space and trim the ends.
//
// The result is best-effort: an unterminated <script> or <style> discards the
// rest of the input, and a stray '<' hides the text up to the next '>'.
func extractPlainText(html string) string {
	text := removeElementBlocks(html, "script")
	text = removeElementBlocks(text, "style")

	// A strings.Builder keeps tag stripping linear in the input size; appending
	// to a string with += would copy the whole result for every character.
	var b strings.Builder
	b.Grow(len(text))
	inTag := false
	for _, r := range text {
		switch {
		case r == '<':
			inTag = true
		case r == '>':
			inTag = false
			b.WriteByte(' ') // a tag acts as a word separator
		case !inTag:
			b.WriteRune(r)
		}
	}

	// Fields splits on any whitespace, so joining with single spaces both
	// collapses runs and trims leading/trailing whitespace.
	return strings.Join(strings.Fields(b.String()), " ")
}

// removeElementBlocks deletes every "<name ... </name>" span from text, where
// name is a lowercase ASCII tag name. The search restarts from the beginning
// after each removal, and an opening tag without a matching closing tag
// truncates the text at that opening tag.
func removeElementBlocks(text, name string) string {
	openTag := "<" + name
	closeTag := "</" + name + ">"

	// lower shadows text byte-for-byte, so offsets found in it are valid in
	// text; both strings receive identical cuts to stay aligned.
	lower := lowerASCII(text)
	for {
		start := strings.Index(lower, openTag)
		if start == -1 {
			return text
		}
		end := strings.Index(lower[start:], closeTag)
		if end == -1 {
			return text[:start]
		}
		cut := start + end + len(closeTag)
		text = text[:start] + text[cut:]
		lower = lower[:start] + lower[cut:]
	}
}

// lowerASCII lowercases only the bytes 'A'-'Z'. Unlike strings.ToLower it
// never changes the byte length, even for non-ASCII or invalid UTF-8 input.
func lowerASCII(s string) string {
	b := []byte(s)
	for i, c := range b {
		if 'A' <= c && c <= 'Z' {
			b[i] = c + ('a' - 'A')
		}
	}
	return string(b)
}
56 changes: 47 additions & 9 deletions common/pagetypeclassifier/pagetypeclassifier_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package pagetypeclassifier

import (
"strings"
"testing"

"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -56,13 +57,13 @@ func TestPageTypeClassifier(t *testing.T) {
`))
})

t.Run("test panic recovery with deeply nested HTML", func(t *testing.T) {
t.Run("test resilience with deeply nested HTML", func(t *testing.T) {
epc, err := New()
require.NoError(t, err)
require.NotNil(t, epc)

// Generate deeply nested HTML that exceeds the 512 node stack limit
// This should trigger a panic in the HTML parser, which we recover from
// Generate deeply nested HTML that would have exceeded the 512 node stack limit
// With our enhanced sanitization and fallback mechanisms, this should now work
deeplyNestedHTML := "<div>"
for i := 0; i < 600; i++ {
deeplyNestedHTML += "<div><span>"
Expand All @@ -73,13 +74,15 @@ func TestPageTypeClassifier(t *testing.T) {
}
deeplyNestedHTML += "</div>"

// Should not panic and should return "other" when htmlToText returns empty string
// Should not panic and should successfully classify the content
result := epc.Classify(deeplyNestedHTML)
require.Equal(t, "other", result)
require.NotEmpty(t, result)
// Should be able to extract and classify the text content
require.NotEqual(t, "", result)
})

t.Run("test htmlToText with deeply nested HTML", func(t *testing.T) {
// Generate deeply nested HTML that exceeds the 512 node stack limit
// Generate deeply nested HTML that would have exceeded the 512 node stack limit
deeplyNestedHTML := "<div>"
for i := 0; i < 600; i++ {
deeplyNestedHTML += "<div><span>"
Expand All @@ -90,10 +93,11 @@ func TestPageTypeClassifier(t *testing.T) {
}
deeplyNestedHTML += "</div>"

// Should not panic and should return empty string with error on panic
// Should not panic and should successfully extract text with enhanced sanitization
result, err := htmlToText(deeplyNestedHTML)
require.Error(t, err)
require.Equal(t, "", result)
require.NoError(t, err)
require.NotEmpty(t, result)
require.Contains(t, result, "Some text content")
})

t.Run("test htmlToText with normal HTML", func(t *testing.T) {
Expand All @@ -102,4 +106,38 @@ func TestPageTypeClassifier(t *testing.T) {
require.NoError(t, err)
require.NotEmpty(t, result)
})

t.Run("test htmlToText with extremely large HTML", func(t *testing.T) {
// Create a very large HTML document (over 1MB)
largeContent := strings.Repeat("<p>This is a test paragraph with some content. ", 50000)
largeHTML := "<html><body>" + largeContent + "</body></html>"

// Should handle large documents without panic
result, err := htmlToText(largeHTML)
require.NoError(t, err)
require.NotEmpty(t, result)
})

t.Run("test extractPlainText fallback", func(t *testing.T) {
htmlWithScriptAndStyle := `<html>
<head>
<style>body { color: red; }</style>
<script>alert('test');</script>
</head>
<body>
<h1>Title</h1>
<p>Some <strong>important</strong> content here</p>
<div><span>Nested content</span></div>
</body>
</html>`

result := extractPlainText(htmlWithScriptAndStyle)
require.NotEmpty(t, result)
require.Contains(t, result, "Title")
require.Contains(t, result, "important")
require.Contains(t, result, "content")
// Should not contain script or style content
require.NotContains(t, result, "alert")
require.NotContains(t, result, "color: red")
})
}
Loading