Skip to content

Commit 1d46ed8

Browse files
committed
html: have Render escape comments less often
Fixes golang/go#58246

Change-Id: I3effbd2afd7e363a42baa4db20691e57c9a08389
Reviewed-on: https://go-review.googlesource.com/c/net/+/469056
TryBot-Result: Gopher Robot <[email protected]>
Run-TryBot: Nigel Tao <[email protected]>
Reviewed-by: Bryan Mills <[email protected]>
Reviewed-by: Kunpei Sakai <[email protected]>
Reviewed-by: Damien Neil <[email protected]>
1 parent 569fe81 commit 1d46ed8

File tree

5 files changed

+137
-33
lines changed

5 files changed

+137
-33
lines changed

html/comment_test.go

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,13 @@ package html
66

77
import (
88
"bytes"
9+
"strings"
910
"testing"
1011
)
1112

1213
// TestComments exhaustively tests every 'interesting' N-byte string is
13-
// correctly parsed as a comment. N ranges from 4+1 to 4+suffixLen inclusive,
14-
// where 4 is the length of the "<!--" prefix that starts an HTML comment.
14+
// correctly parsed as a comment. N ranges from 4+1 to 4+maxSuffixLen
15+
// inclusive. 4 is the length of the "<!--" prefix that starts an HTML comment.
1516
//
1617
// 'Interesting' means that the N-4 byte suffix consists entirely of bytes
1718
// sampled from the interestingCommentBytes const string, below. These cover
@@ -27,8 +28,8 @@ import (
2728
// two algorithms match.
2829
func TestComments(t *testing.T) {
2930
const prefix = "<!--"
30-
const suffixLen = 6
31-
buffer := make([]byte, 0, len(prefix)+suffixLen)
31+
const maxSuffixLen = 6
32+
buffer := make([]byte, 0, len(prefix)+maxSuffixLen)
3233
testAllComments(t, append(buffer, prefix...))
3334
}
3435

@@ -205,6 +206,26 @@ loop:
205206
if (gotComment != wantComment) || (gotRemainder != wantRemainder) {
206207
t.Errorf("input=%q\ngot: %q + %q\nwant: %q + %q",
207208
b, gotComment, gotRemainder, wantComment, wantRemainder)
209+
return
210+
}
211+
212+
// suffix is the "N-4 byte suffix" per the TestComments comment.
213+
suffix := string(b[4:])
214+
215+
// Test that a round trip, rendering (escaped) and re-parsing, of a comment
216+
// token (with that suffix as the Token.Data) preserves that string.
217+
tok := Token{
218+
Type: CommentToken,
219+
Data: suffix,
220+
}
221+
z2 := NewTokenizer(strings.NewReader(tok.String()))
222+
if next := z2.Next(); next != CommentToken {
223+
t.Fatalf("round-trip Next(%q): got %v, want %v", suffix, next, CommentToken)
224+
}
225+
gotComment2 := string(z2.Text())
226+
if gotComment2 != suffix {
227+
t.Errorf("round-trip\ngot: %q\nwant: %q", gotComment2, suffix)
228+
return
208229
}
209230
}
210231

html/escape.go

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,87 @@ func lower(b []byte) []byte {
193193
return b
194194
}
195195

196+
// escapeComment is like func escape but escapes its input bytes less often.
197+
// Per https://github.com/golang/go/issues/58246 some HTML comments are (1)
198+
// meaningful and (2) contain angle brackets that we'd like to avoid escaping
199+
// unless we have to.
200+
//
201+
// "We have to" includes the '&' byte, since that introduces other escapes.
202+
//
203+
// It also includes those bytes (not including EOF) that would otherwise end
204+
// the comment. Per the summary table at the bottom of comment_test.go, this is
205+
// the '>' byte that, per above, we'd like to avoid escaping unless we have to.
206+
//
207+
// Studying the summary table (and T actions in its '>' column) closely, we
208+
// only need to escape in states 43, 44, 49, 51 and 52. State 43 is at the
209+
// start of the comment data. State 52 is after a '!'. The other three states
210+
// are after a '-'.
211+
//
212+
// Our algorithm is thus to escape every '&' and to escape '>' if and only if:
213+
// - The '>' is after a '!' or '-' (in the unescaped data) or
214+
// - The '>' is at the start of the comment data (after the opening "<!--").
215+
func escapeComment(w writer, s string) error {
216+
// When modifying this function, consider manually increasing the
217+
// maxSuffixLen constant in func TestComments, from 6 to e.g. 9 or more.
218+
// That increase should only be temporary, not committed, as it
219+
// exponentially affects the test running time.
220+
221+
if len(s) == 0 {
222+
return nil
223+
}
224+
225+
// Loop:
226+
// - Grow j such that s[i:j] does not need escaping.
227+
// - If s[j] does need escaping, output s[i:j] and an escaped s[j],
228+
// resetting i and j to point past that s[j] byte.
229+
i := 0
230+
for j := 0; j < len(s); j++ {
231+
escaped := ""
232+
switch s[j] {
233+
case '&':
234+
escaped = "&amp;"
235+
236+
case '>':
237+
if j > 0 {
238+
if prev := s[j-1]; (prev != '!') && (prev != '-') {
239+
continue
240+
}
241+
}
242+
escaped = "&gt;"
243+
244+
default:
245+
continue
246+
}
247+
248+
if i < j {
249+
if _, err := w.WriteString(s[i:j]); err != nil {
250+
return err
251+
}
252+
}
253+
if _, err := w.WriteString(escaped); err != nil {
254+
return err
255+
}
256+
i = j + 1
257+
}
258+
259+
if i < len(s) {
260+
if _, err := w.WriteString(s[i:]); err != nil {
261+
return err
262+
}
263+
}
264+
return nil
265+
}
266+
267+
// escapeCommentString is to EscapeString as escapeComment is to escape.
268+
func escapeCommentString(s string) string {
269+
if strings.IndexAny(s, "&>") == -1 {
270+
return s
271+
}
272+
var buf bytes.Buffer
273+
escapeComment(&buf, s)
274+
return buf.String()
275+
}
276+
196277
const escapedChars = "&'<>\"\r"
197278

198279
func escape(w writer, s string) error {

html/render.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ func render1(w writer, n *Node) error {
8585
if _, err := w.WriteString("<!--"); err != nil {
8686
return err
8787
}
88-
if err := escape(w, n.Data); err != nil {
88+
if err := escapeComment(w, n.Data); err != nil {
8989
return err
9090
}
9191
if _, err := w.WriteString("-->"); err != nil {

html/token.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ func (t Token) String() string {
110110
case SelfClosingTagToken:
111111
return "<" + t.tagString() + "/>"
112112
case CommentToken:
113-
return "<!--" + EscapeString(t.Data) + "-->"
113+
return "<!--" + escapeCommentString(t.Data) + "-->"
114114
case DoctypeToken:
115115
return "<!DOCTYPE " + EscapeString(t.Data) + ">"
116116
}
@@ -598,10 +598,10 @@ scriptDataDoubleEscapeEnd:
598598
// readComment reads the next comment token starting with "<!--". The opening
599599
// "<!--" has already been consumed.
600600
func (z *Tokenizer) readComment() {
601-
// When modifying this function, consider manually increasing the suffixLen
602-
// constant in func TestComments, from 6 to e.g. 9 or more. That increase
603-
// should only be temporary, not committed, as it exponentially affects the
604-
// test running time.
601+
// When modifying this function, consider manually increasing the
602+
// maxSuffixLen constant in func TestComments, from 6 to e.g. 9 or more.
603+
// That increase should only be temporary, not committed, as it
604+
// exponentially affects the test running time.
605605

606606
z.data.start = z.raw.end
607607
defer func() {

html/token_test.go

Lines changed: 25 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,6 @@ const issue58246 = `<!--[if gte mso 12]>
2323
</o:OfficeDocumentSettings>
2424
</xml>
2525
<![endif]-->`
26-
const issue58246Rendered = `<!--[if gte mso 12]&gt;
27-
&lt;xml&gt;
28-
&lt;o:OfficeDocumentSettings&gt;
29-
&lt;o:AllowPNG/&gt;
30-
&lt;o:PixelsPerInch&gt;96&lt;/o:PixelsPerInch&gt;
31-
&lt;/o:OfficeDocumentSettings&gt;
32-
&lt;/xml&gt;
33-
&lt;![endif]-->`
3426

3527
type tokenTest struct {
3628
// A short description of the test case.
@@ -332,7 +324,7 @@ var tokenTests = []tokenTest{
332324
{
333325
"comment3",
334326
"a<!--x>-->z",
335-
"a$<!--x&gt;-->$z",
327+
"a$<!--x>-->$z",
336328
},
337329
{
338330
"comment4",
@@ -352,7 +344,7 @@ var tokenTests = []tokenTest{
352344
{
353345
"comment7",
354346
"a<!---<>z",
355-
"a$<!---&lt;&gt;z-->",
347+
"a$<!---<>z-->",
356348
},
357349
{
358350
"comment8",
@@ -407,12 +399,12 @@ var tokenTests = []tokenTest{
407399
{
408400
"comment18",
409401
"a<!--<!-->z",
410-
"a$<!--&lt;!-->$z",
402+
"a$<!--<!-->$z",
411403
},
412404
{
413405
"comment19",
414406
"a<!--<!--",
415-
"a$<!--&lt;!-->",
407+
"a$<!--<!-->",
416408
},
417409
{
418410
"comment20",
@@ -427,7 +419,7 @@ var tokenTests = []tokenTest{
427419
{
428420
"comment22",
429421
"a<!--!--!<--!-->z",
430-
"a$<!--!--!&lt;--!-->$z",
422+
"a$<!--!--!<--!-->$z",
431423
},
432424
{
433425
"comment23",
@@ -437,27 +429,27 @@ var tokenTests = []tokenTest{
437429
{
438430
"comment24",
439431
"a<!--&gt;>x",
440-
"a$<!--&gt;&gt;x-->",
432+
"a$<!--&gt;>x-->",
441433
},
442434
{
443435
"comment25",
444436
"a<!--&gt;&gt;",
445-
"a$<!--&gt;&gt;-->",
437+
"a$<!--&gt;>-->",
446438
},
447439
{
448440
"comment26",
449441
"a<!--&gt;&gt;-",
450-
"a$<!--&gt;&gt;-->",
442+
"a$<!--&gt;>-->",
451443
},
452444
{
453445
"comment27",
454446
"a<!--&gt;&gt;-->z",
455-
"a$<!--&gt;&gt;-->$z",
447+
"a$<!--&gt;>-->$z",
456448
},
457449
{
458450
"comment28",
459451
"a<!--&amp;&gt;-->z",
460-
"a$<!--&amp;&gt;-->$z",
452+
"a$<!--&amp;>-->$z",
461453
},
462454
{
463455
"comment29",
@@ -469,10 +461,20 @@ var tokenTests = []tokenTest{
469461
"a<!--&nosuchentity;-->z",
470462
"a$<!--&amp;nosuchentity;-->$z",
471463
},
464+
{
465+
"comment31",
466+
"a<!--i>>j-->z",
467+
"a$<!--i>>j-->$z",
468+
},
469+
{
470+
"comment32",
471+
"a<!--i!>>j-->z",
472+
"a$<!--i!&gt;>j-->$z",
473+
},
472474
// https://stackoverflow.design/email/base/mso/#targeting-specific-outlook-versions
473475
// says "[For] Windows Outlook 2003 and above... conditional comments allow
474476
// us to add bits of HTML that are only read by the Word-based versions of
475-
// Outlook". TODO: these comments (with angle brackets) should pass through
477+
// Outlook". These comments (with angle brackets) should pass through
476478
// unchanged (by this Go package) when rendering.
477479
//
478480
// We should also still escape ">" as "&gt;" when necessary.
@@ -484,22 +486,22 @@ var tokenTests = []tokenTest{
484486
{
485487
"issue48237CommentWithAmpgtsemi1",
486488
"a<!--<p></p>&lt;!--[video]--&gt;-->z",
487-
"a$<!--&lt;p&gt;&lt;/p&gt;&lt;!--[video]--&gt;-->$z",
489+
"a$<!--<p></p><!--[video]--&gt;-->$z",
488490
},
489491
{
490492
"issue48237CommentWithAmpgtsemi2",
491493
"a<!--<p></p>&lt;!--[video]--!&gt;-->z",
492-
"a$<!--&lt;p&gt;&lt;/p&gt;&lt;!--[video]--!&gt;-->$z",
494+
"a$<!--<p></p><!--[video]--!&gt;-->$z",
493495
},
494496
{
495497
"issue58246MicrosoftOutlookComment1",
496498
"a<!--[if mso]> your code <![endif]-->z",
497-
"a$<!--[if mso]&gt; your code &lt;![endif]-->$z",
499+
"a$<!--[if mso]> your code <![endif]-->$z",
498500
},
499501
{
500502
"issue58246MicrosoftOutlookComment2",
501503
"a" + issue58246 + "z",
502-
"a$" + issue58246Rendered + "$z",
504+
"a$" + issue58246 + "$z",
503505
},
504506
// An attribute with a backslash.
505507
{

0 commit comments

Comments
 (0)