-
Notifications
You must be signed in to change notification settings - Fork 218
Fix the regressions introduced in the fix for #89 #120
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,8 +34,6 @@ const ( | |
DiffInsert Operation = 1 | ||
// DiffEqual item represents an equal diff. | ||
DiffEqual Operation = 0 | ||
//IndexSeparator is used to separate the array indexes in an index string | ||
IndexSeparator = "," | ||
) | ||
|
||
// Diff represents one diff operation | ||
|
@@ -195,7 +193,7 @@ func (dmp *DiffMatchPatch) diffCompute(text1, text2 []rune, checklines bool, dea | |
// diffLineMode does a quick line-level diff on both []runes, then rediff the parts for greater accuracy. This speedup can produce non-minimal diffs. | ||
func (dmp *DiffMatchPatch) diffLineMode(text1, text2 []rune, deadline time.Time) []Diff { | ||
// Scan the text on a line-by-line basis first. | ||
text1, text2, linearray := dmp.DiffLinesToRunes(string(text1), string(text2)) | ||
text1, text2, linearray := dmp.diffLinesToRunes(text1, text2) | ||
|
||
diffs := dmp.diffMainRunes(text1, text2, false, deadline) | ||
|
||
|
@@ -392,28 +390,88 @@ func (dmp *DiffMatchPatch) diffBisectSplit(runes1, runes2 []rune, x, y int, | |
// DiffLinesToChars splits two texts into a list of strings, and reduces the texts to a string of hashes where each Unicode character represents one line. | ||
// It's slightly faster to call DiffLinesToRunes first, followed by DiffMainRunes. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this comment still accurate? |
||
func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string, []string) { | ||
chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2) | ||
return chars1, chars2, lineArray | ||
chars1, chars2, lineArray := dmp.DiffLinesToRunes(text1, text2) | ||
return string(chars1), string(chars2), lineArray | ||
} | ||
|
||
// DiffLinesToRunes splits two texts into a list of runes. | ||
// DiffLinesToRunes splits two texts into a list of runes. Each rune represents one line. | ||
func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, []string) { | ||
chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2) | ||
return []rune(chars1), []rune(chars2), lineArray | ||
// '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character. | ||
lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n' | ||
lineHash := map[string]int{} // e.g. lineHash['Hello\n'] == 4 | ||
|
||
chars1 := dmp.diffLinesToRunesMunge(text1, &lineArray, lineHash) | ||
chars2 := dmp.diffLinesToRunesMunge(text2, &lineArray, lineHash) | ||
|
||
return chars1, chars2, lineArray | ||
} | ||
|
||
func (dmp *DiffMatchPatch) diffLinesToRunes(text1, text2 []rune) ([]rune, []rune, []string) { | ||
return dmp.DiffLinesToRunes(string(text1), string(text2)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm a bit concerned about all this unnecessary allocation converting to string, when it really shouldn't be necessary. Did you compare benchmark performance? |
||
} | ||
|
||
// diffLinesToRunesMunge splits a text into an array of strings, and reduces the texts to a []rune where each Unicode character represents one line. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The words 'array', 'list', and 'slice' are used interchangeably throughout, but array is really incorrect in this context. |
||
// We use strings instead of []runes as input mainly because you can't use []rune as a map key. | ||
func (dmp *DiffMatchPatch) diffLinesToRunesMunge(text string, lineArray *[]string, lineHash map[string]int) []rune { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm staring at this, and I don't see how the previous algorithm worked without passing in lineHash. Is that the root cause of the bug? Can you explain somewhere the exact nature of the bug in the fix for #89? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, the root cause of the bug is described in the PR description:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ...so array indexes could be corrupted/split in the diff output, right? It's worth explaining in a comment. |
||
// Walk the text, pulling out a substring for each line. text.split('\n') would temporarily double our memory footprint. Modifying text would create many large strings to garbage collect. | ||
lineStart := 0 | ||
lineEnd := -1 | ||
runes := []rune{} | ||
|
||
for lineEnd < len(text)-1 { | ||
lineEnd = indexOf(text, "\n", lineStart) | ||
|
||
if lineEnd == -1 { | ||
lineEnd = len(text) - 1 | ||
} | ||
|
||
line := text[lineStart : lineEnd+1] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I'd be interested to know how performance would be affected by using an actual hash to look up the rune value in the index, rather than a string. Then we could convert to []byte at API boundaries and avoid allocating again. But that doesn't need to be done for this PR. |
||
lineStart = lineEnd + 1 | ||
lineValue, ok := lineHash[line] | ||
if !ok { | ||
checkLineArray(lineArray) | ||
|
||
*lineArray = append(*lineArray, line) | ||
lineValue = len(*lineArray) - 1 | ||
lineHash[line] = lineValue | ||
} | ||
runes = append(runes, rune(lineValue)) | ||
} | ||
|
||
return runes | ||
} | ||
|
||
// checkLineArray checks the size of the slice and ensures that the index of the next element | ||
// will be the valid rune. | ||
func checkLineArray(a *[]string) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not have functions // lineRune returns a valid rune value for the line at the given index
func lineRune(idx int) rune
// runeLine looks up the given rune hash in the lines slice.
func runeLine(idxRune rune, lines []string) string Thereby avoiding the padding here. It could also be used to avoid the prepended "". There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it can break the user code because There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I meant that DiffLinesToRunes could use lineRune to convert to rune values, and DiffCharsToLines can use runeLine to convert to line indexes. These functions would be inverses of each other. |
||
// Runes in this range are invalid, utf8.ValidRune() returns false. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is very subtle. Please explain further in this comment why this is a problem. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This comment describes why runes must be valid: #89 (comment) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Understood. I think it's worth explaining here (you can reference #89). |
||
const ( | ||
surrogateMin = 0xD800 | ||
surrogateMax = 0xDFFF | ||
) | ||
|
||
// Check the index of the next element | ||
switch len(*a) { | ||
case surrogateMin: | ||
// Skip invalid runes. | ||
padding := [surrogateMax - surrogateMin + 1]string{} | ||
*a = append(*a, padding[:]...) | ||
|
||
case utf8.MaxRune + 1: | ||
// We can't do anything about it. | ||
panic(fmt.Sprintf("rune can't be more than %d", utf8.MaxRune)) | ||
} | ||
} | ||
|
||
// DiffCharsToLines rehydrates the text in a diff from a string of line hashes to real lines of text. | ||
func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) []Diff { | ||
hydrated := make([]Diff, 0, len(diffs)) | ||
for _, aDiff := range diffs { | ||
chars := strings.Split(aDiff.Text, IndexSeparator) | ||
chars := aDiff.Text | ||
text := make([]string, len(chars)) | ||
|
||
for i, r := range chars { | ||
i1, err := strconv.Atoi(r) | ||
if err == nil { | ||
text[i] = lineArray[i1] | ||
} | ||
text[i] = lineArray[r] | ||
} | ||
|
||
aDiff.Text = strings.Join(text, "") | ||
|
@@ -1307,46 +1365,3 @@ func (dmp *DiffMatchPatch) DiffFromDelta(text1 string, delta string) (diffs []Di | |
|
||
return diffs, nil | ||
} | ||
|
||
// diffLinesToStrings splits two texts into a list of strings. Each string represents one line. | ||
func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) (string, string, []string) { | ||
// '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character. | ||
lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n' | ||
|
||
//Each string has the index of lineArray which it points to | ||
strIndexArray1 := dmp.diffLinesToStringsMunge(text1, &lineArray) | ||
strIndexArray2 := dmp.diffLinesToStringsMunge(text2, &lineArray) | ||
|
||
return intArrayToString(strIndexArray1), intArrayToString(strIndexArray2), lineArray | ||
} | ||
|
||
// diffLinesToStringsMunge splits a text into an array of strings, and reduces the text to a []uint32 of indexes into lineArray. | ||
func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string) []uint32 { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please reorganize to minimize the diff with this refactored method. |
||
// Walk the text, pulling out a substring for each line. text.split('\n') would temporarily double our memory footprint. Modifying text would create many large strings to garbage collect. | ||
lineHash := map[string]int{} // e.g. lineHash['Hello\n'] == 4 | ||
lineStart := 0 | ||
lineEnd := -1 | ||
strs := []uint32{} | ||
|
||
for lineEnd < len(text)-1 { | ||
lineEnd = indexOf(text, "\n", lineStart) | ||
|
||
if lineEnd == -1 { | ||
lineEnd = len(text) - 1 | ||
} | ||
|
||
line := text[lineStart : lineEnd+1] | ||
lineStart = lineEnd + 1 | ||
lineValue, ok := lineHash[line] | ||
|
||
if ok { | ||
strs = append(strs, uint32(lineValue)) | ||
} else { | ||
*lineArray = append(*lineArray, line) | ||
lineHash[line] = len(*lineArray) - 1 | ||
strs = append(strs, uint32(len(*lineArray)-1)) | ||
} | ||
} | ||
|
||
return strs | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
s/educes/reduces