gitea/services/gitdiff/highlightdiff.go at ba3d8758cce42fd56ce3f9427d0804f61c9afb3e · wxiaoguang/gitea · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
// Copyright 2022 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package gitdiff

import (
	"bytes"
	"html/template"
	"strings"
	"unicode/utf8"

	"code.gitea.io/gitea/modules/util"

	"github.com/sergi/go-diff/diffmatchpatch"
)

// extractDiffTokenRemainingFullTag tries to split s into "simple content + closing tag" and the rest.
// For example, "content</span>the-rest..." yields ("content</span>", "the-rest...", true).
//
// The content in front of the closing tag may only consist of ASCII letters, digits, '_', '-' and '.'.
// Anything else (spaces, non-ASCII bytes, an opening tag, and especially "&" which starts an HTML entity)
// makes the extraction fail, in which case ("", s, false) is returned.
// The "&" must never be accepted here: entities can't be safely diffed char-by-char,
// e.g. a diff between "&lt;" and "&gt;" would generate a broken result.
func extractDiffTokenRemainingFullTag(s string) (token, after string, valid bool) {
	// locate the first character which is not allowed inside the content
	tagStart := strings.IndexFunc(s, func(r rune) bool {
		switch {
		case 'a' <= r && r <= 'z', 'A' <= r && r <= 'Z', '0' <= r && r <= '9':
			return false
		case r == '_', r == '-', r == '.':
			return false
		}
		return true
	})
	// the content must be directly followed by a closing tag
	if tagStart == -1 || !strings.HasPrefix(s[tagStart:], "</") {
		return "", s, false
	}
	tagLen := strings.IndexByte(s[tagStart:], '>')
	if tagLen == -1 {
		return "", s, false // the closing tag is not terminated
	}
	end := tagStart + tagLen + 1
	return s[:end], s[end:], true
}

// extractDiffToken finds the first token in s and returns the text before it, the token itself,
// and the text after it. The token is one of:
// * full tag with content: "<<span>content</span>>" (note the extra surrounding "<" and ">"),
//   it is used to optimize diff results to highlight the whole changed symbol
// * opening/closing tag: "<span ...>" or "</span>"
// * HTML entity: "&lt;"
//
// If s contains no token at all, it returns ("", "", s, true).
// If a token starts but is never terminated (no ">" or ";"), it returns ("", "", s, false).
func extractDiffToken(s string) (before, token, after string, valid bool) {
	start := strings.IndexAny(s, "<&")
	if start == -1 {
		return "", "", s, true
	}

	if s[start] == '&' {
		entityLen := strings.IndexByte(s[start:], ';')
		if entityLen == -1 {
			return "", "", s, false
		}
		end := start + entityLen + 1
		return s[:start], s[start:end], s[end:], true
	}

	// otherwise it is a tag
	tagLen := strings.IndexByte(s[start:], '>')
	if tagLen == -1 {
		return "", "", s, false
	}
	end := start + tagLen + 1
	before, token, after = s[:start], s[start:end], s[end:]
	if strings.HasPrefix(token, "</") {
		return before, token, after, true
	}
	// for an opening tag, try to extract the full tag with content, e.g. `<<span>content</span>>`, to optimize diff results
	if contentAndClose, rest, ok := extractDiffTokenRemainingFullTag(after); ok {
		return before, "<" + token + contentAndClose + ">", rest, true
	}
	return before, token, after, true
}

// highlightCodeDiff does the diff for highlighted HTML code.
// It totally depends on Chroma's valid HTML output and its structure, do not use it for other purposes.
//
// How it works:
//  1. HTML tags and entities are replaced by single-rune placeholders,
//     so "<span>{TEXT}</span>" becomes "{placeholder}{TEXT}{placeholder}", which is friendly to the diff.
//  2. The diff runs on the placeholder strings.
//  3. The placeholders in the diff result are recovered to the HTML tags and entities.
//
// It's guaranteed that the tags in the final diff result are paired correctly.
type highlightCodeDiff struct {
	// placeholders are allocated from the rune range [placeholderBegin, placeholderBegin+placeholderMaxCount),
	// placeholderIndex is the offset of the next candidate in that range.
	placeholderBegin    rune
	placeholderMaxCount int
	placeholderIndex    int

	// placeholderTokenMap maps a placeholder to the text it is recovered to.
	// A rune which is already used by the code itself is stored with an empty string, so it is never allocated.
	placeholderTokenMap map[rune]string
	// tokenPlaceholderMap maps a registered token to its placeholder.
	tokenPlaceholderMap map[string]rune

	// placeholderOverflowCount counts the tokens which couldn't get a placeholder.
	placeholderOverflowCount int

	// placeholders for the "added-code"/"removed-code" opening tags and their shared closing tag,
	// a zero diffCodeClose means they are not registered yet.
	diffCodeAddedOpen, diffCodeRemovedOpen, diffCodeClose rune
}

// newHighlightCodeDiff creates a highlightCodeDiff with empty placeholder maps.
// The placeholders are taken from Plane 16: Supplementary Private Use Area B (U+100000..U+10FFFD),
// at most 64000 of them starting from U+100000.
func newHighlightCodeDiff() *highlightCodeDiff {
	hcd := new(highlightCodeDiff)
	hcd.placeholderBegin = rune(0x100000)
	hcd.placeholderMaxCount = 64000
	hcd.placeholderTokenMap = make(map[rune]string)
	hcd.tokenPlaceholderMap = make(map[string]rune)
	return hcd
}

// nextPlaceholder allocates the next unused placeholder rune, it returns 0 when the range is exhausted.
// Candidates which already have an entry in placeholderTokenMap (including the runes used by the code itself) are skipped.
// The diff is done line by line and usually there are only a few (no more than 10) placeholders in one line,
// so the placeholderMaxCount is impossible to be exhausted in real cases.
func (hcd *highlightCodeDiff) nextPlaceholder() rune {
	for hcd.placeholderIndex < hcd.placeholderMaxCount {
		candidate := hcd.placeholderBegin + rune(hcd.placeholderIndex)
		hcd.placeholderIndex++ // the candidate is consumed no matter whether it is usable
		if _, taken := hcd.placeholderTokenMap[candidate]; taken {
			continue
		}
		return candidate
	}
	// no more available placeholder
	return 0
}

// isInPlaceholderRange reports whether r falls in the half-open rune range
// [placeholderBegin, placeholderBegin+placeholderMaxCount) which is used for allocating placeholders.
func (hcd *highlightCodeDiff) isInPlaceholderRange(r rune) bool {
	rangeEnd := hcd.placeholderBegin + rune(hcd.placeholderMaxCount)
	return r >= hcd.placeholderBegin && r < rangeEnd
}

// collectUsedRunes scans the code for runes which fall in the placeholder range,
// and marks them in placeholderTokenMap with an empty token, then nextPlaceholder won't allocate them as placeholders.
func (hcd *highlightCodeDiff) collectUsedRunes(code template.HTML) {
	for _, codeRune := range code {
		if !hcd.isInPlaceholderRange(codeRune) {
			continue
		}
		// the rune is used by the code itself, reserve it
		hcd.placeholderTokenMap[codeRune] = ""
	}
}

// diffEqualPartIsSpaceOnly reports whether s (placeholder-converted text) has no visible content:
// it may only contain spaces, tabs, line breaks and placeholders of single opening/closing tags.
// A full tag with content, an HTML entity, or any other rune makes it return false.
func (hcd *highlightCodeDiff) diffEqualPartIsSpaceOnly(s string) bool {
	for _, r := range s {
		if r < hcd.placeholderBegin {
			// a normal rune, only whitespace is allowed
			switch r {
			case ' ', '\t', '\n', '\r':
				continue
			}
			return false
		}
		token := hcd.placeholderTokenMap[r]
		// only a single opening/closing tag ("<...>" but not "<<...>>") can be skipped,
		// a full tag with content or an HTML entity can't be space-only
		isSingleTag := strings.HasPrefix(token, "<") && !strings.HasPrefix(token, "<<")
		if !isSingleTag {
			return false
		}
	}
	return true
}

// diffLineWithHighlight diffs two highlighted HTML lines (from codeA to codeB) and returns the HTML for one side:
// * lineType == DiffLineAdd: the equal parts and the inserted parts (content of codeB)
// * lineType == DiffLineDel: the equal parts and the deleted parts (content of codeA)
// The changed parts are wrapped by "added-code"/"removed-code" spans only when it is meaningful, see below.
func (hcd *highlightCodeDiff) diffLineWithHighlight(lineType DiffLineType, codeA, codeB template.HTML) template.HTML {
	// reserve the runes used by the code before any placeholder gets allocated
	for _, code := range []template.HTML{codeA, codeB} {
		hcd.collectUsedRunes(code)
	}
	textA := hcd.convertToPlaceholders(codeA)
	textB := hcd.convertToPlaceholders(codeB)

	dmp := defaultDiffMatchPatch()
	diffs := dmp.DiffCleanupSemantic(dmp.DiffMain(textA, textB, true))

	if hcd.diffCodeClose == 0 {
		// tests can pre-set the placeholders
		hcd.diffCodeAddedOpen = hcd.registerTokenAsPlaceholder(`<span class="added-code">`)
		hcd.diffCodeRemovedOpen = hcd.registerTokenAsPlaceholder(`<span class="removed-code">`)
		hcd.diffCodeClose = hcd.registerTokenAsPlaceholder(`</span><!-- diff-code-close -->`)
	}

	// is there any visible content shared by both sides?
	hasVisibleEqualPart := false
	for _, diff := range diffs {
		if diff.Type == diffmatchpatch.DiffEqual && !hcd.diffEqualPartIsSpaceOnly(diff.Text) {
			hasVisibleEqualPart = true
			break
		}
	}

	// only add "added"/"removed" tags when needed:
	// * non-space contents appear in the DiffEqual parts (not a full-line add/del)
	// * placeholder map still works (not exhausted, can get the closing tag placeholder)
	// otherwise the caller will still add added/removed backgrounds for the whole line
	addDiffTags := hasVisibleEqualPart && hcd.diffCodeClose != 0

	var buf bytes.Buffer
	for _, diff := range diffs {
		var openTag rune
		switch {
		case diff.Type == diffmatchpatch.DiffEqual:
			buf.WriteString(diff.Text)
			continue
		case diff.Type == diffmatchpatch.DiffInsert && lineType == DiffLineAdd:
			openTag = hcd.diffCodeAddedOpen
		case diff.Type == diffmatchpatch.DiffDelete && lineType == DiffLineDel:
			openTag = hcd.diffCodeRemovedOpen
		default:
			continue // the part belongs to the other side, drop it
		}
		if addDiffTags {
			buf.WriteRune(openTag)
		}
		buf.WriteString(diff.Text)
		if addDiffTags {
			buf.WriteRune(hcd.diffCodeClose)
		}
	}
	return hcd.recoverOneDiff(buf.String())
}

// registerTokenAsPlaceholder returns the placeholder for the token, a new one is allocated if the token is unknown.
// It returns 0 (and records nothing) if no more placeholder is available.
// The token is indexed by its first two bytes, so it must be at least 2 bytes long.
func (hcd *highlightCodeDiff) registerTokenAsPlaceholder(token string) rune {
	// a single tag looks like "<...>", while a full tag with content looks like "<<...>>"
	isSingleTag := token[0] == '<' && token[1] != '<'

	if known, found := hcd.tokenPlaceholderMap[token]; found {
		return known
	}
	placeholder := hcd.nextPlaceholder()
	if placeholder == 0 {
		return 0
	}

	recovered := token
	if isSingleTag {
		// when recovering a single tag, only use the tag itself, ignore the trailing comment (for how the comment is generated, see the code in `convert` function)
		recovered = token[:strings.IndexByte(token, '>')+1]
	}
	hcd.tokenPlaceholderMap[token] = placeholder
	hcd.placeholderTokenMap[placeholder] = recovered
	return placeholder
}

// convertToPlaceholders replaces the HTML tags and entities in the highlighted code by placeholder runes,
// the plain text between the tokens is kept as is.
// It totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
func (hcd *highlightCodeDiff) convertToPlaceholders(htmlContent template.HTML) string {
	var (
		out      strings.Builder
		openTags []string // the stack of the opening tags which are not closed yet
	)

	remaining := string(htmlContent)
	for {
		textBefore, token, rest, ok := extractDiffToken(remaining)
		remaining = rest
		if !ok || token == "" {
			break
		}
		// the text before the token goes to the result directly, the token itself is consumed
		out.WriteString(textBefore)

		// the standard chroma highlight HTML is `<span class="line [hl]"><span class="cl"> ... </span></span>`
		// the line wrapper tags should be removed before diff
		isLineWrapper := strings.HasPrefix(token, `<span class="line`) || strings.HasPrefix(token, `<span class="cl"`)
		if isLineWrapper {
			continue
		}

		var mapKey string
		switch {
		case strings.HasPrefix(token, "</"): // closing tag
			if len(openTags) == 0 {
				continue // no opening tag but see closing tag, skip it
			}
			// make sure the closing tag in map is related to the open tag, to make the diff algorithm can match the opening/closing tags
			// the closing tag will be recorded in the map by key "</span><!-- <span the-opening> -->" for "<span the-opening>"
			top := len(openTags) - 1
			mapKey = token + "<!-- " + openTags[top] + "-->"
			openTags = openTags[:top]
		case token[0] == '<':
			// either a full tag `<<span>content</span>>` or a single opening tag, only the latter needs to be closed later
			mapKey = token
			if token[1] != '<' {
				openTags = append(openTags, token)
			}
		case token[0] == '&': // HTML entity
			mapKey = token
		} // no other case is possible

		// remember the placeholder and token in the map, and use the placeholder to replace the token
		if placeholder := hcd.registerTokenAsPlaceholder(mapKey); placeholder != 0 {
			out.WriteRune(placeholder)
			continue
		}

		// unfortunately, all private use runes has been exhausted, no more placeholder could be used, no more converting
		// usually, the exhausting won't occur in real cases, the magnitude of used placeholders is not larger than that of the CSS classes outputted by chroma.
		hcd.placeholderOverflowCount++
		switch {
		case strings.HasPrefix(token, "<<"):
			// recover to `content` from "<<span>content</span>>"
			contentBegin := strings.IndexByte(token, '>') + 1
			contentEnd := strings.LastIndexByte(token, '<')
			out.WriteString(token[contentBegin:contentEnd])
		case strings.HasPrefix(token, "&"):
			// when the token is an HTML entity, something must be outputted even if there is no placeholder.
			out.WriteRune(0xFFFD)      // replacement character TODO: how to handle this case more gracefully?
			out.WriteString(token[1:]) // still output the entity code part, otherwise there will be no diff result.
		}
	}

	// write the remaining string
	out.WriteString(remaining)
	return out.String()
}

// recoverOneRune decodes the first rune of buf and tries to recover it, runeLen is the byte length of the decoded rune.
// * not a placeholder: returns the rune itself, with an empty recovered string
// * placeholder of a single opening/closing tag: returns the placeholder rune, isSingleTag=true, and the tag as recovered
// * placeholder of a full tag `<<span>content</span>>`: returns rune 0, and `<span>content</span>` as recovered
// * placeholder of an HTML entity: returns rune 0, and the entity as recovered
func (hcd *highlightCodeDiff) recoverOneRune(buf []byte) (r rune, runeLen int, isSingleTag bool, recovered string) {
	r, runeLen = utf8.DecodeRune(buf)
	token := hcd.placeholderTokenMap[r]
	switch {
	case token == "":
		// unknown rune, or a rune reserved because the code uses it
		return r, runeLen, false, ""
	case token[0] != '<':
		return 0, runeLen, false, token
	case token[1] == '<':
		// strip the extra surrounding "<" and ">"
		return 0, runeLen, false, token[1 : len(token)-1]
	default:
		return r, runeLen, true, token
	}
}

// recoverOneDiff converts a placeholder string (the output of the diff) back to HTML.
// * normal runes are written as is, placeholders of full tags and HTML entities are written as their recovered text
// * the "added/removed" diff tag placeholders are not written at their own positions, instead,
//   every run of content (everything between two single tags) inside the diff part is wrapped by its own pair of diff tags,
//   so the diff tags never cross the boundary of other tags
// * a closing tag without a matching opening tag is dropped
// * the opening tags which are still open at the end get closed
func (hcd *highlightCodeDiff) recoverOneDiff(str string) template.HTML {
	sb := strings.Builder{}
	var tagStack []string      // the opening tags which have been written but not closed yet
	var diffCodeOpenTag string // non-empty while inside an "added/removed" diff part, it holds the opening tag to use
	diffCodeCloseTag := hcd.placeholderTokenMap[hcd.diffCodeClose]
	// NOTE(review): presumably this is a no-copy conversion, strBytes is only read in this function - confirm against util
	strBytes := util.UnsafeStringToBytes(str)

	// this loop is slightly longer than expected, for performance consideration
	for idx := 0; idx < len(strBytes); {
		// take a look at the next rune
		r, runeLen, isSingleTag, recovered := hcd.recoverOneRune(strBytes[idx:])
		idx += runeLen

		// loop section 1: if it isn't a single tag, then try to find the following runes until the next single tag, and recover them together
		if !isSingleTag {
			if diffCodeOpenTag != "" {
				// start the "added/removed diff tag" if the current token is in the diff part
				sb.WriteString(diffCodeOpenTag)
			}
			if recovered != "" {
				sb.WriteString(recovered)
			} else {
				sb.WriteRune(r)
			}
			// inner loop to recover following runes until the next single tag
			for idx < len(strBytes) {
				r, runeLen, isSingleTag, recovered = hcd.recoverOneRune(strBytes[idx:])
				idx += runeLen
				if isSingleTag {
					// the single tag is already consumed (idx is advanced), it is handled by section 2 below
					break
				}
				if recovered != "" {
					sb.WriteString(recovered)
				} else {
					sb.WriteRune(r)
				}
			}
			if diffCodeOpenTag != "" {
				// end the "added/removed diff tag" if the current token is in the diff part
				sb.WriteString(diffCodeCloseTag)
			}
		}

		if !isSingleTag {
			break // the inner loop has already consumed all remaining runes, no more single tag found
		}

		// loop section 2: for opening/closing HTML tags
		// here "r" is the placeholder rune and "recovered" is the tag text like "<span ...>" or "</span>"
		placeholder := r
		if recovered[1] != '/' { // opening tag
			if placeholder == hcd.diffCodeAddedOpen || placeholder == hcd.diffCodeRemovedOpen {
				// only remember the diff tag, it will be written around the following content by section 1
				diffCodeOpenTag = recovered
				recovered = ""
			} else {
				tagStack = append(tagStack, recovered)
			}
		} else { // closing tag
			if placeholder == hcd.diffCodeClose {
				diffCodeOpenTag = "" // the highlighted diff is closed, no more diff
				recovered = ""
			} else if len(tagStack) != 0 {
				tagStack = tagStack[:len(tagStack)-1]
			} else {
				recovered = "" // no opening tag to match, drop this closing tag
			}
		}
		sb.WriteString(recovered)
	}

	// close all opening tags
	for i := len(tagStack) - 1; i >= 0; i-- {
		tagToClose := tagStack[i]
		// get the closing tag "</span>" from "<span class=...>" or "<span>"
		pos := strings.IndexAny(tagToClose, " >")
		// pos must be positive, because the tags were pushed by us
		sb.WriteString("</" + tagToClose[1:pos] + ">")
	}
	return template.HTML(sb.String())
}