From 527d64b3b66a70527fa9ab706841cdcf67f47909 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Mon, 19 Feb 2024 14:37:26 +0000 Subject: [PATCH 01/52] Add comment outlining the new logic I intend to implement --- src/diff/word.js | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/diff/word.js b/src/diff/word.js index 4ed3df26..04cf33d1 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -32,7 +32,28 @@ wordDiff.equals = function(left, right, options) { return left === right || (options.ignoreWhitespace && !reWhitespace.test(left) && !reWhitespace.test(right)); }; wordDiff.tokenize = function(value) { - // All whitespace symbols except newline group into one token, each newline - in separate token + // Each token is one of the following: + // - A newline (either Unix-style or Windows-style) + // - A punctuation mark plus the surrounding non-newline whitespace + // - A word plus the surrounding non-newline whitespace + // + // We have to include surrounding whitespace in the tokens because the two + // alternative approaches produce horribly broken results: + // * If we just discard the whitespace, we can't fully reproduce the original + // text from the sequence of tokens and any attempt to render the diff will + // get the whitespace wrong. + // * If we have separate tokens for whitespace, then in a typical text every + // second token will be a single space character. But this often results in + // the optimal diff between two texts being a perverse one that preserves + // the spaces between words but deletes and reinserts actual common words. + // See https://github.com/kpdecker/jsdiff/issues/160#issuecomment-1866099640 + // for an example. + // + // Keeping the surrounding whitespace of course has implications for .equals + // and .join, below. + + // TODO: actually implement above + let tokens = value.split(/([^\S\r\n]+|[()[\]{}'"\r\n]|\b)/); // Join the boundary splits that we do not consider to be boundaries. This is primarily the extended Latin character set. From 3eb79a20015e780e2940639ca355b2cdd6e312bc Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 21 Feb 2024 13:09:30 +0000 Subject: [PATCH 02/52] Implement the plan --- src/diff/word.js | 72 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 16 deletions(-) diff --git a/src/diff/word.js b/src/diff/word.js index ff219b04..f2980594 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -25,6 +25,7 @@ const extendedWordChars = 'a-zA-Z\\u{C0}-\\u{FF}\\u{D8}-\\u{F6}\\u{F8}-\\u{2C6}\ // - A newline (either Unix-style or Windows-style) // - A punctuation mark plus the surrounding non-newline whitespace // - A word plus the surrounding non-newline whitespace +// - A run of pure whitespace (but only when this is the only content on a line) // // We have to include surrounding whitespace in the tokens because the two // alternative approaches produce horribly broken results: @@ -39,9 +40,13 @@ const extendedWordChars = 'a-zA-Z\\u{C0}-\\u{FF}\\u{D8}-\\u{F6}\\u{F8}-\\u{2C6}\ // for an example. // // Keeping the surrounding whitespace of course has implications for .equals -// and .join, below. -// TODO: actually implement above -const tokenizeRegex = new RegExp(`\\r?\\n|[${extendedWordChars}]+|[^\\S\\r\\n]+|[^${extendedWordChars}]`, 'ug'); +// and .join, not just .tokenize. + +// This regex does NOT fully implement the tokenization rules described above. +// Instead, it gives runs of whitespace their own "token". The tokenize method +// then handles stitching whitespace tokens onto adjacent word or punctuation +// tokens. +const tokenizeIncludingWhitespace = new RegExp(`\\r?\\n|[${extendedWordChars}]+|[^\\S\\r\\n]+|[^${extendedWordChars}]`, 'ug'); export const wordDiff = new Diff(); wordDiff.equals = function(left, right, options) { @@ -49,24 +54,59 @@ wordDiff.equals = function(left, right, options) { left = left.toLowerCase(); right = right.toLowerCase(); } - // The comparisons to the empty string are needed PURELY to signal to - // buildValues that the whitespace token should be ignored. The empty string - // will never be a token (removeEmpty removes it) but buildValues uses empty - // string comparisons to test for ignored tokens and we need to handle that - // query here. - const leftIsWhitespace = (left === '' || (/^\s+$/).test(left)); - const rightIsWhitespace = (right === '' || (/^\s+$/).test(right)); - return left === right || (options.ignoreWhitespace && leftIsWhitespace && rightIsWhitespace); + + return left.trim() === right.trim(); }; + wordDiff.tokenize = function(value) { - return value.match(tokenizeRegex) || []; + let parts = value.match(tokenizeIncludingWhitespace) || []; + const tokens = []; + let prevPart = null; + for (const part of parts) { + if (part.includes('\n')) { + tokens.push(part); + } else if ((/\s/).test(part)) { + if (prevPart == null || prevPart.includes('\n')) { + tokens.push(part); + } else { + tokens.push(tokens.pop() + part); + } + } else if ((/\s/).test(prevPart) && !prevPart.includes('\n')) { + if (tokens[tokens.length - 1] == prevPart) { + tokens.push(tokens.pop() + part); + } else { + tokens.push(prevPart + part); + } + } else { + tokens.push(part); + } + + prevPart = part; + } + return tokens; +}; + +wordDiff.join = function(tokens) { + // Tokens being joined here will always have appeared consecutively in the + // same text, so we can simply strip off the leading whitespace from all the + // tokens except the first (and expect any whitespace-only tokens) and then + // join them and the whitespace around words and punctuation will end up + // correct. + return tokens.map((token, i) => { + if (i == 0) { + return token; + } else if ((/^\s+$/).test(token)) { + return token; + } else { + return token.replace((/^\s+/), ''); + } + }).join(''); }; export function diffWords(oldStr, newStr, options) { - options = generateOptions(options, {ignoreWhitespace: true}); + options = generateOptions(options); return wordDiff.diff(oldStr, newStr, options); } -export function diffWordsWithSpace(oldStr, newStr, options) { - return wordDiff.diff(oldStr, newStr, options); -} +// TODO: Restore diffWordsWithSpace. It should basically just use +// tokenizeIncludingWhitespace without the extra logic. From b01be2965ae686e171b47487ce6fbddef99e7fed Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 21 Feb 2024 15:02:56 +0000 Subject: [PATCH 03/52] More progress --- src/diff/word.js | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/src/diff/word.js b/src/diff/word.js index f2980594..285fc315 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -1,5 +1,4 @@ import Diff from './base'; -import {generateOptions} from '../util/params'; // Based on https://en.wikipedia.org/wiki/Latin_script_in_Unicode // @@ -22,10 +21,10 @@ import {generateOptions} from '../util/params'; const extendedWordChars = 'a-zA-Z\\u{C0}-\\u{FF}\\u{D8}-\\u{F6}\\u{F8}-\\u{2C6}\\u{2C8}-\\u{2D7}\\u{2DE}-\\u{2FF}\\u{1E00}-\\u{1EFF}'; // Each token is one of the following: -// - A newline (either Unix-style or Windows-style) -// - A punctuation mark plus the surrounding non-newline whitespace -// - A word plus the surrounding non-newline whitespace -// - A run of pure whitespace (but only when this is the only content on a line) +// - A punctuation mark plus the surrounding whitespace +// - A word plus the surrounding whitespace +// - Pure whitespace (but only in the special case where this the entire text +// is just whitespace) // // We have to include surrounding whitespace in the tokens because the two // alternative approaches produce horribly broken results: @@ -46,7 +45,7 @@ const extendedWordChars = 'a-zA-Z\\u{C0}-\\u{FF}\\u{D8}-\\u{F6}\\u{F8}-\\u{2C6}\ // Instead, it gives runs of whitespace their own "token". The tokenize method // then handles stitching whitespace tokens onto adjacent word or punctuation // tokens. -const tokenizeIncludingWhitespace = new RegExp(`\\r?\\n|[${extendedWordChars}]+|[^\\S\\r\\n]+|[^${extendedWordChars}]`, 'ug'); +const tokenizeIncludingWhitespace = new RegExp(`[${extendedWordChars}]+|\s+|[^${extendedWordChars}]`, 'ug'); export const wordDiff = new Diff(); wordDiff.equals = function(left, right, options) { @@ -63,15 +62,13 @@ wordDiff.tokenize = function(value) { const tokens = []; let prevPart = null; for (const part of parts) { - if (part.includes('\n')) { - tokens.push(part); - } else if ((/\s/).test(part)) { - if (prevPart == null || prevPart.includes('\n')) { + if ((/\s/).test(part)) { + if (prevPart == null) { tokens.push(part); } else { tokens.push(tokens.pop() + part); } - } else if ((/\s/).test(prevPart) && !prevPart.includes('\n')) { + } else if ((/\s/).test(prevPart)) { if (tokens[tokens.length - 1] == prevPart) { tokens.push(tokens.pop() + part); } else { @@ -104,9 +101,19 @@ wordDiff.join = function(tokens) { }; export function diffWords(oldStr, newStr, options) { - options = generateOptions(options); - return wordDiff.diff(oldStr, newStr, options); + const result = wordDiff.diff(oldStr, newStr, options); + // Before returning, we tidy up the leading and trailing whitespace of the + // change objects to eliminate cases where trailing whitespace in one object + // is repeated as leading whitespace in the next. + + // TODO: the above! + return result; } -// TODO: Restore diffWordsWithSpace. It should basically just use -// tokenizeIncludingWhitespace without the extra logic. +export const wordWithSpaceDiff = new Diff(); +wordWithSpaceDiff.tokenize = function(value) { + return value.match(tokenizeIncludingWhitespace) || []; +}; +export function diffWordsWithSpace(oldStr, newStr, options) { + return wordWithSpaceDiff.diff(oldStr, newStr, options); +} From e039ae90b9e2a4169fd00b69c1be1b5e2f21f898 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Thu, 22 Feb 2024 13:44:43 +0000 Subject: [PATCH 04/52] Further WIP --- src/diff/word.js | 103 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 100 insertions(+), 3 deletions(-) diff --git a/src/diff/word.js b/src/diff/word.js index 285fc315..2139339b 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -1,4 +1,5 @@ import Diff from './base'; +import { longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix, removePrefix, removeSuffix, maximumOverlap } from '../util/string'; // Based on https://en.wikipedia.org/wiki/Latin_script_in_Unicode // @@ -101,13 +102,109 @@ wordDiff.join = function(tokens) { }; export function diffWords(oldStr, newStr, options) { - const result = wordDiff.diff(oldStr, newStr, options); + const changes = wordDiff.diff(oldStr, newStr, options); + // TODO: skip all the stuff below in 1 token per co mode + // Before returning, we tidy up the leading and trailing whitespace of the // change objects to eliminate cases where trailing whitespace in one object // is repeated as leading whitespace in the next. + // Below are examples of the outcomes we want here to explain the code. + // I=insert, K=keep, D=delete + // 1. diffing 'foo bar baz' vs 'foo baz' + // Prior to cleanup, we have K:'foo ' D:' bar ' K:' baz' + // After cleanup, we want: K:'foo ' D:'bar ' K:'baz' + // + // 2. Diffing 'foo bar baz' vs 'foo qux baz' + // Prior to cleanup, we have K:'foo ' D:' bar ' I:' qux ' K:' baz' + // After cleanup, we want K:'foo ' D:'bar' I:'qux' K:' baz' + // + // 3. Diffing 'foo\nbar baz' vs 'foo baz' + // Prior to cleanup, we have K:'foo ' D:'\nbar ' K:' baz' + // After cleanup, we want K'foo' D:'\nbar' K:' baz' + // + // 4. Diffing 'foo baz' vs 'foo\nbar baz' + // Prior to cleanup, we have K:'foo\n' I:'\nbar ' K:' baz' + // After cleanup, we want K'foo' D:'\nbar' K:' baz' + // + // 5. Diffing 'foo bar baz' vs 'foo baz' + // Prior to cleanup, we have K:'foo ' D:' bar ' K:' baz' + // After cleanup, we want K:'foo ' D:' bar ' K:'baz' + // + // Our handling is unavoidably imperfect in the case where there's a single + // indel between keeps and the whitespace has changed. For instance, consider + // diffing 'foo\tbar\nbaz' vs 'foo baz'. Unless we create an extra change + // object to represent the insertion of the space character (which isn't even + // a token), we have no way to avoid losing information about the texts' + // original whitespace in the result we return. Still, we do our best to + // output something that will look sensible if we e.g. print it with + // insertions in green and deletions in red. + + let lastKeep = null; + // Change objects representing insertions or deletions since the last "keep" + // change object. + let indels = []; + for (const change of changes) { + if (change.added || change.removed) { + indels.push(change); + } else if (lastKeep != null) { + // Between two "keep" change objects, we can have either: + // * A "delete" followed by an "insert" + // * Just an "insert" + // * Just a "delete" + // We handle the three cases separately. + if (indels.length == 2) { + const [deletion, insertion] = indels; + const oldWsPrefix = deletion.value.match(/^\s*/)[0]; + const oldWsSuffix = deletion.value.match(/\s*$/)[0]; + const newWsPrefix = insertion.value.match(/^\s*/)[0]; + const newWsSuffix = insertion.value.match(/\s*$/)[0]; + + const commonWsPrefix = longestCommonPrefix(oldWsPrefix, newWsPrefix); + lastKeep.value = replaceSuffix(lastKeep.value, newWsPrefix, commonWsPrefix); + deletion.value = removePrefix(deletion.value, commonWsPrefix); + insertion.value = removePrefix(insertion.value, commonWsPrefix); + + const commonWsSuffix = longestCommonSuffix(oldWsSuffix, newWsSuffix); + change.value = replacePrefix(change.value, newWsSuffix, commonWsSuffix); + deletion.value = removeSuffix(deletion.value, commonWsSuffix); + insertion.value = removeSuffix(deletion.value, commonWsSuffix); + } else { + const [indel] = indels; + + if (indel.added) { + // The whitespaces all reflect what was in the new text rather than + // the old, so we essentially have no information about whitespace + // insertion or deletion. We just want to dedupe the whitespace. + // We do that by having each change object keep its trailing + // whitespace and deleting duplicate leading whitespace where + // present. + indel.value = indel.value.replace(/^\s*/, ''); + change.value = change.value.replace(/^\s*/, ''); + } else { + const keep1WsSuffix = lastKeep.value.match(/\s*$/); + const deletionWsPrefix = indel.value.match(/^\s*/); + + // The leading whitespace of the second "keep" change object can + // always be nuked, and the trailing whitespace of the deletion + // can always be kept, whether or not they match. + change.value = change.value.replace(/^\s*/, ''); + + // The only awkward bit is the leading whitespace on the deleted + // token. To the extent that this overlaps with the trailing + // whitespace on the preceding kept token, it can be deleted, but + // we need to compute the overlap. + const overlap = maximumOverlap(keep1WsSuffix, deletionWsPrefix); + indel.value = removePrefix(indel.value, overlap); + } + } + } else { + // The indels we've got here come before the first "keep" change object + // TODO: Handle this case + } + } + // TODO: Handle indels that come after the last "keep" change object - // TODO: the above! - return result; + return changes; } export const wordWithSpaceDiff = new Diff(); From 77d67415688712cc3df0cb15ca3e046429870c96 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Thu, 22 Feb 2024 14:00:28 +0000 Subject: [PATCH 05/52] Add utils file I forgot to commit --- src/util/string.js | 79 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 src/util/string.js diff --git a/src/util/string.js b/src/util/string.js new file mode 100644 index 00000000..9089ef13 --- /dev/null +++ b/src/util/string.js @@ -0,0 +1,79 @@ +// TODO: unit test these utils + +export function longestCommonPrefix(str1, str2) { + let i; + for (i = 0; i < str1.length && i < str2.length; i++) { + if (str1[i] != str2[i]) { + return str1.slice(0, i); + } + } + return str1.slice(0, i); +} + +export function longestCommonSuffix(str1, str2) { + let i; + for (i = 0; i < str1.length && i < str2.length; i++) { + if (str1[str1.length - (i + 1)] != str2[i - (i + 1)]) { + return str1.slice(-i); + } + } + return str1.slice(0, i); +} + +export function replacePrefix(string, oldPrefix, newPrefix) { + if (string.slice(oldPrefix.length) != oldPrefix) { + throw Error(`string ${string} doesn't start with prefix ${oldPrefix}; this is a bug`); + } + return newPrefix + string.slice(oldPrefix.length); +} + +export function replaceSuffix(string, oldSuffix, newSuffix) { + if (string.slice(-oldSuffix.length) != oldSuffix) { + throw Error(`string ${string} doesn't end with suffix ${oldSuffix}; this is a bug`); + } + return string.slice(0, -oldSuffix.length) + newSuffix; +} + +export function removePrefix(string, oldPrefix) { + return string.replacePrefix(oldPrefix, ''); +} + +export function removeSuffix(string, oldSuffix) { + return string.replaceSuffix(oldSuffix, ''); +} + +export function maximumOverlap(string1, string2) { + return string2.slice(0, overlapCount(string1, string2)); +} + +// Nicked from https://stackoverflow.com/a/60422853/1709587 +// TODO: Check this works before committing +function overlapCount(a, b) { + // Deal with cases where the strings differ in length + let startA = 0; + if (a.length > b.length) { startA = a.length - b.length; } + let endB = b.length; + if (a.length < b.length) { endB = a.length; } + // Create a back-reference for each index + // that should be followed in case of a mismatch. + // We only need B to make these references: + let map = Array(endB); + let k = 0; // Index that lags behind j + map[0] = 0; + for (let j = 1; j < endB; j++) { + if (b[j] == b[k]) { + map[j] = map[k]; // skip over the same character (optional optimisation) + } else { + map[j] = k; + } + while (k > 0 && b[j] != b[k]) { k = map[k]; } + if (b[j] == b[k]) { k++; } + } + // Phase 2: use these references while iterating over A + k = 0; + for (let i = startA; i < a.length; i++) { + while (k > 0 && a[i] != b[k]) { k = map[k]; } + if (a[i] == b[k]) { k++; } + } + return k; +} From 92081746253c8c1d68712b0442f853e9f9bf49d1 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Thu, 22 Feb 2024 14:00:56 +0000 Subject: [PATCH 06/52] Fix tokenize test --- src/diff/word.js | 2 +- test/diff/word.js | 43 ++++++++++++++----------------------------- 2 files changed, 15 insertions(+), 30 deletions(-) diff --git a/src/diff/word.js b/src/diff/word.js index 2139339b..76e2ebb8 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -46,7 +46,7 @@ const extendedWordChars = 'a-zA-Z\\u{C0}-\\u{FF}\\u{D8}-\\u{F6}\\u{F8}-\\u{2C6}\ // Instead, it gives runs of whitespace their own "token". The tokenize method // then handles stitching whitespace tokens onto adjacent word or punctuation // tokens. -const tokenizeIncludingWhitespace = new RegExp(`[${extendedWordChars}]+|\s+|[^${extendedWordChars}]`, 'ug'); +const tokenizeIncludingWhitespace = new RegExp(`[${extendedWordChars}]+|\\s+|[^${extendedWordChars}]`, 'ug'); export const wordDiff = new Diff(); wordDiff.equals = function(left, right, options) { diff --git a/test/diff/word.js b/test/diff/word.js index 59ab19ea..56f516d8 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -5,41 +5,26 @@ import {expect} from 'chai'; describe('WordDiff', function() { describe('#tokenize', function() { - it('should give words, punctuation marks, newlines, and runs of whitespace their own token', function() { + it('should give each word & punctuation mark its own token, including leading and trailing whitespace', function() { expect( wordDiff.tokenize( 'foo bar baz jurídica wir üben bla\t\t \txyzáxyz \n\n\n animá-los\r\n\r\n(wibbly wobbly)().' ) ).to.deep.equal([ - 'foo', - ' ', - 'bar', - ' ', - 'baz', - ' ', - 'jurídica', - ' ', - 'wir', - ' ', - 'üben', - ' ', - 'bla', - '\t\t \t', - 'xyzáxyz', - ' ', - '\n', - '\n', - '\n', - ' ', - 'animá', + 'foo ', + ' bar ', + ' baz ', + ' jurídica ', + ' wir ', + ' üben ', + ' bla\t\t \t', + '\t\t \txyzáxyz \n\n\n ', + ' \n\n\n animá', '-', - 'los', - '\r\n', - '\r\n', - '(', - 'wibbly', - ' ', - 'wobbly', + 'los\r\n\r\n', + '\r\n\r\n(', + 'wibbly ', + ' wobbly', ')', '(', ')', From a5e67fe08f59e8a4c6c22708237ea9e79d2e2325 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Thu, 22 Feb 2024 14:28:32 +0000 Subject: [PATCH 07/52] Add some extra tests --- test/diff/word.js | 63 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 3 deletions(-) diff --git a/test/diff/word.js b/test/diff/word.js index 56f516d8..89246778 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -34,11 +34,68 @@ describe('WordDiff', function() { }); describe('#diffWords', function() { - it('should diff whitespace', function() { - const diffResult = diffWords('New Value', 'New ValueMoreData'); - expect(convertChangesToXML(diffResult)).to.equal('New ValueValueMoreData'); + it("should ignore whitespace changes between tokens that aren't added or deleted", function() { + const diffResult = diffWords('New Value', 'New \n \t Value'); + expect(convertChangesToXML(diffResult)).to.equal('New \n \t Value'); }); + describe('whitespace changes that border inserted/deleted tokens should be included in the diff as far as is possible...', function() { + it('(add+del at end of text)', function() { + const diffResult = diffWords('New Value', 'New ValueMoreData'); + expect(convertChangesToXML(diffResult)).to.equal('New Value ValueMoreData'); + }); + + it('(add+del in middle of text)', function() { + const diffResult = diffWords('New Value End', 'New ValueMoreData End'); + expect(convertChangesToXML(diffResult)).to.equal('New Value ValueMoreData End'); + }); + + it('(add+del at start of text)', function() { + const diffResult = diffWords('\tValue End', ' ValueMoreData End'); + expect(convertChangesToXML(diffResult)).to.equal('\tValue ValueMoreData End'); + }); + + it('(add at start of text)', function() { + const diffResult = diffWords('\t Value', 'More Value'); + expect(convertChangesToXML(diffResult)).to.equal('More Value'); + }); + + it('(del at start of text)', function() { + const diffResult = diffWords('More Value', '\t Value'); + expect(convertChangesToXML(diffResult)).to.equal('More \t Value'); + }); + + it('(add in middle of text)', function() { + const diffResult = diffWords('Even Value', 'Even More Value'); + expect(convertChangesToXML(diffResult)).to.equal('Even More Value'); + // Preferable would be: + // 'Even More Value' + // But this is hard to achieve without adding something like the + // .oldValue property I contemplate in + // https://github.com/kpdecker/jsdiff/pull/219#issuecomment-1858246490 + // to change objects returned by the base diffing algorithm. The CO + // cleanup done by diffWords simply doesn't have enough information to + // return the ideal result otherwise. + }); + + it('(del in middle of text)', function() { + const diffResult = diffWords('Even More Value', 'Even Value'); + expect(convertChangesToXML(diffResult)).to.equal('Even More Value'); + }); + + it('(add at end of text)', function() { + const diffResult = diffWords('Foo\n', 'Foo Bar\n'); + expect(convertChangesToXML(diffResult)).to.equal('FooBar\n'); + }); + + it('(del at end of text)', function() { + const diffResult = diffWords('Foo Bar', 'Foo '); + expect(convertChangesToXML(diffResult)).to.equal('Foo Bar'); + }); + }); + + // TODO: Review all tests below here + it('should diff multiple whitespace values', function() { const diffResult = diffWords('New Value ', 'New ValueMoreData '); expect(convertChangesToXML(diffResult)).to.equal('New ValueValueMoreData '); From b72b4cc5adb0708148c1febe641940e0d1230fbc Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Tue, 5 Mar 2024 13:48:49 +0000 Subject: [PATCH 08/52] Fix some bugs --- src/diff/word.js | 11 ++++++++--- src/util/string.js | 10 +++++----- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/diff/word.js b/src/diff/word.js index 76e2ebb8..c5b711d9 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -167,7 +167,7 @@ export function diffWords(oldStr, newStr, options) { const commonWsSuffix = longestCommonSuffix(oldWsSuffix, newWsSuffix); change.value = replacePrefix(change.value, newWsSuffix, commonWsSuffix); deletion.value = removeSuffix(deletion.value, commonWsSuffix); - insertion.value = removeSuffix(deletion.value, commonWsSuffix); + insertion.value = removeSuffix(insertion.value, commonWsSuffix); } else { const [indel] = indels; @@ -181,8 +181,8 @@ export function diffWords(oldStr, newStr, options) { indel.value = indel.value.replace(/^\s*/, ''); change.value = change.value.replace(/^\s*/, ''); } else { - const keep1WsSuffix = lastKeep.value.match(/\s*$/); - const deletionWsPrefix = indel.value.match(/^\s*/); + const keep1WsSuffix = lastKeep.value.match(/\s*$/)[0]; + const deletionWsPrefix = indel.value.match(/^\s*/)[0]; // The leading whitespace of the second "keep" change object can // always be nuked, and the trailing whitespace of the deletion @@ -197,9 +197,14 @@ export function diffWords(oldStr, newStr, options) { indel.value = removePrefix(indel.value, overlap); } } + lastKeep = change; + indels = []; } else { // The indels we've got here come before the first "keep" change object // TODO: Handle this case + + lastKeep = change; + indels = []; } } // TODO: Handle indels that come after the last "keep" change object diff --git a/src/util/string.js b/src/util/string.js index 9089ef13..7df8314a 100644 --- a/src/util/string.js +++ b/src/util/string.js @@ -21,25 +21,25 @@ export function longestCommonSuffix(str1, str2) { } export function replacePrefix(string, oldPrefix, newPrefix) { - if (string.slice(oldPrefix.length) != oldPrefix) { - throw Error(`string ${string} doesn't start with prefix ${oldPrefix}; this is a bug`); + if (string.slice(0, oldPrefix.length) != oldPrefix) { + throw Error(`string ${JSON.stringify(string)} doesn't start with prefix ${JSON.stringify(oldPrefix)}; this is a bug`); } return newPrefix + string.slice(oldPrefix.length); } export function replaceSuffix(string, oldSuffix, newSuffix) { if (string.slice(-oldSuffix.length) != oldSuffix) { - throw Error(`string ${string} doesn't end with suffix ${oldSuffix}; this is a bug`); + throw Error(`string ${JSON.stringify(string)} doesn't end with suffix ${JSON.stringify(oldSuffix)}; this is a bug`); } return string.slice(0, -oldSuffix.length) + newSuffix; } export function removePrefix(string, oldPrefix) { - return string.replacePrefix(oldPrefix, ''); + return replacePrefix(string, oldPrefix, ''); } export function removeSuffix(string, oldSuffix) { - return string.replaceSuffix(oldSuffix, ''); + return replaceSuffix(string, oldSuffix, ''); } export function maximumOverlap(string1, string2) { From 2f58b02b16055de8a7f85948918e620130399fe2 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Tue, 5 Mar 2024 16:35:25 +0000 Subject: [PATCH 09/52] Fix more bugs --- src/diff/word.js | 165 ++++++++++++++++++++++++++------------------- src/util/string.js | 4 ++ test/diff/word.js | 22 +++++- 3 files changed, 119 insertions(+), 72 deletions(-) diff --git a/src/diff/word.js b/src/diff/word.js index c5b711d9..12682da6 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -105,6 +105,31 @@ export function diffWords(oldStr, newStr, options) { const changes = wordDiff.diff(oldStr, newStr, options); // TODO: skip all the stuff below in 1 token per co mode + let lastKeep = null; + // Change objects representing any insertion or deletion since the last + // "keep" change object. There can be at most one of each. + let insertion = null; + let deletion = null; + for (const change of changes) { + if (change.added) { + insertion = change; + } else if (change.removed) { + deletion = change; + } else { + if (insertion || deletion) { // May be false at start of text + dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, change); + } + lastKeep = change; + insertion = null; + deletion = null; + } + } + if (insertion || deletion) { + dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, null); + } + return changes; +} +function dedupeWhitespaceInChangeObjects(startKeep, deletion, insertion, endKeep) { // Before returning, we tidy up the leading and trailing whitespace of the // change objects to eliminate cases where trailing whitespace in one object // is repeated as leading whitespace in the next. @@ -124,7 +149,7 @@ export function diffWords(oldStr, newStr, options) { // // 4. Diffing 'foo baz' vs 'foo\nbar baz' // Prior to cleanup, we have K:'foo\n' I:'\nbar ' K:' baz' - // After cleanup, we want K'foo' D:'\nbar' K:' baz' + // After cleanup, we want K'foo' I:'\nbar' K:' baz' // // 5. Diffing 'foo bar baz' vs 'foo baz' // Prior to cleanup, we have K:'foo ' D:' bar ' K:' baz' @@ -139,77 +164,79 @@ export function diffWords(oldStr, newStr, options) { // output something that will look sensible if we e.g. print it with // insertions in green and deletions in red. - let lastKeep = null; - // Change objects representing insertions or deletions since the last "keep" - // change object. - let indels = []; - for (const change of changes) { - if (change.added || change.removed) { - indels.push(change); - } else if (lastKeep != null) { - // Between two "keep" change objects, we can have either: - // * A "delete" followed by an "insert" - // * Just an "insert" - // * Just a "delete" - // We handle the three cases separately. - if (indels.length == 2) { - const [deletion, insertion] = indels; - const oldWsPrefix = deletion.value.match(/^\s*/)[0]; - const oldWsSuffix = deletion.value.match(/\s*$/)[0]; - const newWsPrefix = insertion.value.match(/^\s*/)[0]; - const newWsSuffix = insertion.value.match(/\s*$/)[0]; - - const commonWsPrefix = longestCommonPrefix(oldWsPrefix, newWsPrefix); - lastKeep.value = replaceSuffix(lastKeep.value, newWsPrefix, commonWsPrefix); - deletion.value = removePrefix(deletion.value, commonWsPrefix); - insertion.value = removePrefix(insertion.value, commonWsPrefix); - - const commonWsSuffix = longestCommonSuffix(oldWsSuffix, newWsSuffix); - change.value = replacePrefix(change.value, newWsSuffix, commonWsSuffix); - deletion.value = removeSuffix(deletion.value, commonWsSuffix); - insertion.value = removeSuffix(insertion.value, commonWsSuffix); - } else { - const [indel] = indels; - - if (indel.added) { - // The whitespaces all reflect what was in the new text rather than - // the old, so we essentially have no information about whitespace - // insertion or deletion. We just want to dedupe the whitespace. - // We do that by having each change object keep its trailing - // whitespace and deleting duplicate leading whitespace where - // present. - indel.value = indel.value.replace(/^\s*/, ''); - change.value = change.value.replace(/^\s*/, ''); - } else { - const keep1WsSuffix = lastKeep.value.match(/\s*$/)[0]; - const deletionWsPrefix = indel.value.match(/^\s*/)[0]; - - // The leading whitespace of the second "keep" change object can - // always be nuked, and the trailing whitespace of the deletion - // can always be kept, whether or not they match. - change.value = change.value.replace(/^\s*/, ''); - - // The only awkward bit is the leading whitespace on the deleted - // token. To the extent that this overlaps with the trailing - // whitespace on the preceding kept token, it can be deleted, but - // we need to compute the overlap. - const overlap = maximumOverlap(keep1WsSuffix, deletionWsPrefix); - indel.value = removePrefix(indel.value, overlap); - } - } - lastKeep = change; - indels = []; - } else { - // The indels we've got here come before the first "keep" change object - // TODO: Handle this case + // Between two "keep" change objects (or before the first or after the last + // change object), we can have either: + // * A "delete" followed by an "insert" + // * Just an "insert" + // * Just a "delete" + // We handle the three cases separately. - lastKeep = change; - indels = []; - } + if (!deletion && !insertion) { + throw Error("deletion & insertion are both null; this shouldn't happen"); } - // TODO: Handle indels that come after the last "keep" change object - return changes; + if (deletion && insertion) { + const oldWsPrefix = deletion.value.match(/^\s*/)[0]; + const oldWsSuffix = deletion.value.match(/\s*$/)[0]; + const newWsPrefix = insertion.value.match(/^\s*/)[0]; + const newWsSuffix = insertion.value.match(/\s*$/)[0]; + + if (startKeep) { + const commonWsPrefix = longestCommonPrefix(oldWsPrefix, newWsPrefix); + startKeep.value = replaceSuffix(startKeep.value, newWsPrefix, commonWsPrefix); + deletion.value = removePrefix(deletion.value, commonWsPrefix); + insertion.value = removePrefix(insertion.value, commonWsPrefix); + } + if (endKeep) { + const commonWsSuffix = longestCommonSuffix(oldWsSuffix, newWsSuffix); + endKeep.value = replacePrefix(endKeep.value, newWsSuffix, commonWsSuffix); + deletion.value = removeSuffix(deletion.value, commonWsSuffix); + insertion.value = removeSuffix(insertion.value, commonWsSuffix); + } + } else if (insertion) { + // The whitespaces all reflect what was in the new text rather than + // the old, so we essentially have no information about whitespace + // insertion or deletion. We just want to dedupe the whitespace. + // We do that by having each change object keep its trailing + // whitespace and deleting duplicate leading whitespace where + // present. + if (startKeep) { + insertion.value = insertion.value.replace(/^\s*/, ''); + } + if (endKeep) { + endKeep.value = endKeep.value.replace(/^\s*/, ''); + } + } else { + console.log("Handling deletion. startKeep:", startKeep, 'deletion:', deletion, 'endKeep:', endKeep) + + // As long as we're NOT at the start of the text, the leading whitespace of + // the second "keep" change object can always be nuked, and the trailing + // whitespace of the deletion can always be kept, whether or not they + // match. (The whitespace separating the two keeps will be including at + // the end of startKeep so we don't need to duplicate it on endKeep.) + if (startKeep && endKeep) { + endKeep.value = endKeep.value.replace(/^\s*/, ''); + } else if (endKeep) { + // If we ARE at the start of the text, though, we need to preserve all + // the whitespace on endKeep. In that case, we just want to remove + // whitespace from the end of deletion to the extent that it overlaps + // with the start of endKeep. + const endKeepWsPrefix = endKeep.value.match(/^\s*/)[0]; + const deletionWsSuffix = deletion.value.match(/\s*$/)[0]; + const overlap = maximumOverlap(deletionWsSuffix, endKeepWsPrefix); + deletion.value = removeSuffix(deletion.value, overlap); + } + + // Leading whitespace on the deleted token can only be deleted to the + // extent that this overlaps with the trailing whitespace on the preceding + // kept token. + if (startKeep) { + const startKeepWsSuffix = startKeep.value.match(/\s*$/)[0]; + const deletionWsPrefix = deletion.value.match(/^\s*/)[0]; + const overlap = maximumOverlap(startKeepWsSuffix, deletionWsPrefix); + deletion.value = removePrefix(deletion.value, overlap); + } + } } export const wordWithSpaceDiff = new Diff(); diff --git a/src/util/string.js b/src/util/string.js index 7df8314a..08d7796b 100644 --- a/src/util/string.js +++ b/src/util/string.js @@ -28,6 +28,10 @@ export function replacePrefix(string, oldPrefix, newPrefix) { } export function replaceSuffix(string, oldSuffix, newSuffix) { + if (!oldSuffix) { + return string + newSuffix; + } + if (string.slice(-oldSuffix.length) != oldSuffix) { throw Error(`string ${JSON.stringify(string)} doesn't end with suffix ${JSON.stringify(oldSuffix)}; this is a bug`); } diff --git a/test/diff/word.js b/test/diff/word.js index 89246778..dce13e97 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -57,7 +57,15 @@ describe('WordDiff', function() { it('(add at start of text)', function() { const diffResult = diffWords('\t Value', 'More Value'); - expect(convertChangesToXML(diffResult)).to.equal('More Value'); + // Preferable would be: + // 'More Value' + // But this is hard to achieve without adding something like the + // .oldValue property I contemplate in + // https://github.com/kpdecker/jsdiff/pull/219#issuecomment-1858246490 + // to change objects returned by the base diffing algorithm. The CO + // cleanup done by diffWords simply doesn't have enough information to + // return the ideal result otherwise. + expect(convertChangesToXML(diffResult)).to.equal('More Value'); }); it('(del at start of text)', function() { @@ -67,7 +75,6 @@ describe('WordDiff', function() { it('(add in middle of text)', function() { const diffResult = diffWords('Even Value', 'Even More Value'); - expect(convertChangesToXML(diffResult)).to.equal('Even More Value'); // Preferable would be: // 'Even More Value' // But this is hard to achieve without adding something like the @@ -76,6 +83,7 @@ describe('WordDiff', function() { // to change objects returned by the base diffing algorithm. The CO // cleanup done by diffWords simply doesn't have enough information to // return the ideal result otherwise. + expect(convertChangesToXML(diffResult)).to.equal('Even More Value'); }); it('(del in middle of text)', function() { @@ -85,7 +93,15 @@ describe('WordDiff', function() { it('(add at end of text)', function() { const diffResult = diffWords('Foo\n', 'Foo Bar\n'); - expect(convertChangesToXML(diffResult)).to.equal('FooBar\n'); + // Preferable would be: + // 'Foo Bar\n' + // But this is hard to achieve without adding something like the + // .oldValue property I contemplate in + // https://github.com/kpdecker/jsdiff/pull/219#issuecomment-1858246490 + // to change objects returned by the base diffing algorithm. The CO + // cleanup done by diffWords simply doesn't have enough information to + // return the ideal result otherwise. + expect(convertChangesToXML(diffResult)).to.equal('Foo Bar\n'); }); it('(del at end of text)', function() { From 2a164658130f36ccf5d1de213b2eb6a7057d2f5c Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Tue, 5 Mar 2024 16:40:51 +0000 Subject: [PATCH 10/52] Remove a test by rolling it into an earlier one --- src/diff/word.js | 2 -- test/diff/word.js | 10 ++-------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/src/diff/word.js b/src/diff/word.js index 12682da6..91cf00fa 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -207,8 +207,6 @@ function dedupeWhitespaceInChangeObjects(startKeep, deletion, insertion, endKeep endKeep.value = endKeep.value.replace(/^\s*/, ''); } } else { - console.log("Handling deletion. startKeep:", startKeep, 'deletion:', deletion, 'endKeep:', endKeep) - // As long as we're NOT at the start of the text, the leading whitespace of // the second "keep" change object can always be nuked, and the trailing // whitespace of the deletion can always be kept, whether or not they diff --git a/test/diff/word.js b/test/diff/word.js index dce13e97..7c58bf8d 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -41,8 +41,8 @@ describe('WordDiff', function() { describe('whitespace changes that border inserted/deleted tokens should be included in the diff as far as is possible...', function() { it('(add+del at end of text)', function() { - const diffResult = diffWords('New Value', 'New ValueMoreData'); - expect(convertChangesToXML(diffResult)).to.equal('New Value ValueMoreData'); + const diffResult = diffWords('New Value ', 'New ValueMoreData '); + expect(convertChangesToXML(diffResult)).to.equal('New Value ValueMoreData '); }); it('(add+del in middle of text)', function() { @@ -111,12 +111,6 @@ describe('WordDiff', function() { }); // TODO: Review all tests below here - - it('should diff multiple whitespace values', function() { - const diffResult = diffWords('New Value ', 'New ValueMoreData '); - expect(convertChangesToXML(diffResult)).to.equal('New ValueValueMoreData '); - }); - // Diff on word boundary it('should diff on word boundaries', function() { let diffResult = diffWords('New :Value:Test', 'New ValueMoreData '); From 577b25d32a4485ba7a72e9350a84015d6c754394 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Tue, 5 Mar 2024 17:09:24 +0000 Subject: [PATCH 11/52] Fix another test --- test/convert/dmp.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/convert/dmp.js b/test/convert/dmp.js index f6ab6965..c86c0842 100644 --- a/test/convert/dmp.js +++ b/test/convert/dmp.js @@ -1,12 +1,12 @@ import {convertChangesToDMP} from '../../lib/convert/dmp'; -import {diffWords} from '../../lib/diff/word'; +import {diffChars} from '../../lib/diff/character'; import {expect} from 'chai'; describe('convertToDMP', function() { it('should output diff-match-patch format', function() { - const diffResult = diffWords('New Value ', 'New ValueMoreData '); + const diffResult = diffChars('New Value ', 'New ValueMoreData '); - expect(convertChangesToDMP(diffResult)).to.eql([[0, 'New '], [-1, 'Value'], [1, 'ValueMoreData'], [0, ' ']]); + expect(convertChangesToDMP(diffResult)).to.eql([[0, 'New '], [1, ' '], [0, 'Value'], [1, 'MoreData'], [0, ' '], [-1, ' ']]); }); }); From 6ef6821c36a476d611e621d283df2dc25bdc9a83 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Tue, 5 Mar 2024 18:45:07 +0000 Subject: [PATCH 12/52] Update another test --- test/diff/word.js | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/test/diff/word.js b/test/diff/word.js index 7c58bf8d..f7a53b79 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -110,21 +110,13 @@ describe('WordDiff', function() { }); }); + it('should treat punctuation characters as tokens', function() { + let diffResult = diffWords('New:Value:Test', 'New,Value,More,Data '); + expect(convertChangesToXML(diffResult)).to.equal('New:,Value:Test,More,Data '); + }); + // TODO: Review all tests below here // Diff on word boundary - it('should diff on word boundaries', function() { - let diffResult = diffWords('New :Value:Test', 'New ValueMoreData '); - expect(convertChangesToXML(diffResult)).to.equal('New :Value:TestValueMoreData '); - - diffResult = diffWords('New Value:Test', 'New Value:MoreData '); - expect(convertChangesToXML(diffResult)).to.equal('New Value:TestMoreData '); - - diffResult = diffWords('New Value-Test', 'New Value:MoreData '); - expect(convertChangesToXML(diffResult)).to.equal('New Value-Test:MoreData '); - - diffResult = diffWords('New Value', 'New Value:MoreData '); - expect(convertChangesToXML(diffResult)).to.equal('New Value:MoreData '); - }); // Diff without changes it('should handle identity', function() { @@ -207,9 +199,9 @@ describe('WordDiff', function() { }); // Diff on word boundary - it('should diff on word boundaries', function(done) { - diffWords('New :Value:Test', 'New ValueMoreData ', function(diffResult) { - expect(convertChangesToXML(diffResult)).to.equal('New :Value:TestValueMoreData '); + it('should treat punctuation characters as tokens', function(done) { + diffWords('New:Value:Test', 'New,Value,More,Data ', function(diffResult) { + expect(convertChangesToXML(diffResult)).to.equal('New:,Value:Test,More,Data '); done(); }); }); From 77b1efe8e00042ba30f2e81224ba0012fde8929a Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Tue, 5 Mar 2024 18:47:12 +0000 Subject: [PATCH 13/52] Eliminate test duplication --- test/diff/word.js | 66 +++-------------------------------------------- 1 file changed, 3 insertions(+), 63 deletions(-) diff --git a/test/diff/word.js b/test/diff/word.js index f7a53b79..a0f476d2 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -181,70 +181,10 @@ describe('WordDiff', function() { diffResult = diffWords(' ', ''); expect(convertChangesToXML(diffResult)).to.equal(' '); }); - }); - - describe('#diffWords - async', function() { - it('should diff whitespace', function(done) { - diffWords('New Value', 'New ValueMoreData', function(diffResult) { - expect(convertChangesToXML(diffResult)).to.equal('New ValueValueMoreData'); - done(); - }); - }); - - it('should diff multiple whitespace values', function(done) { - diffWords('New Value ', 'New ValueMoreData ', function(diffResult) { - expect(convertChangesToXML(diffResult)).to.equal('New ValueValueMoreData '); - done(); - }); - }); - // Diff on word boundary - it('should treat punctuation characters as tokens', function(done) { - diffWords('New:Value:Test', 'New,Value,More,Data ', function(diffResult) { - expect(convertChangesToXML(diffResult)).to.equal('New:,Value:Test,More,Data '); - done(); - }); - }); - - // Diff without changes - it('should handle identity', function(done) { - diffWords('New Value', 'New Value', function(diffResult) { - expect(convertChangesToXML(diffResult)).to.equal('New Value'); - done(); - }); - }); - it('should handle empty', function(done) { - diffWords('', '', function(diffResult) { - expect(convertChangesToXML(diffResult)).to.equal(''); - done(); - }); - }); - it('should diff has identical content', function(done) { - diffWords('New Value', 'New Value', function(diffResult) { - expect(convertChangesToXML(diffResult)).to.equal('New Value'); - done(); - }); - }); - - // Empty diffs - it('should diff empty new content', function(done) { - diffWords('New Value', '', function(diffResult) { - expect(diffResult.length).to.equal(1); - expect(convertChangesToXML(diffResult)).to.equal('New Value'); - done(); - }); - }); - it('should diff empty old content', function(done) { - diffWords('', 'New Value', function(diffResult) { - expect(convertChangesToXML(diffResult)).to.equal('New Value'); - done(); - }); - }); - - // With without anchor (the Heckel algorithm error case) - it('should diff when there is no anchor value', function(done) { - diffWords('New Value New Value', 'Value Value New New', function(diffResult) { - expect(convertChangesToXML(diffResult)).to.equal('NewValue Value New ValueNew'); + it('should support async mode', function(done) { + diffWords('New Value', 'New \n \t Value', function(diffResult) { + expect(convertChangesToXML(diffResult)).to.equal('New \n \t Value'); done(); }); }); From ea2d181f535fa1a70a16a92cc6ec44c49c05c65e Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Tue, 5 Mar 2024 19:09:00 +0000 Subject: [PATCH 14/52] Move anchor test to be for diffWordsWithSpace since it no longer makes sense where it is --- test/diff/word.js | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/diff/word.js b/test/diff/word.js index a0f476d2..faca7f4b 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -143,12 +143,6 @@ describe('WordDiff', function() { expect(convertChangesToXML(diffResult)).to.equal('New Value'); }); - // With without anchor (the Heckel algorithm error case) - it('should diff when there is no anchor value', function() { - const diffResult = diffWords('New Value New Value', 'Value Value New New'); - expect(convertChangesToXML(diffResult)).to.equal('NewValue Value New ValueNew'); - }); - it('should include count with identity cases', function() { expect(diffWords('foo', 'foo')).to.eql([{value: 'foo', count: 1, removed: false, added: false}]); expect(diffWords('foo bar', 'foo bar')).to.eql([{value: 'foo bar', count: 3, removed: false, added: false}]); @@ -242,6 +236,12 @@ describe('WordDiff', function() { }); }); + // With without anchor (the Heckel algorithm error case) + it('should diff when there is no anchor value', function() { + const diffResult = diffWordsWithSpace('New Value New Value', 'Value Value New New'); + expect(convertChangesToXML(diffResult)).to.equal('NewValue Value New ValueNew'); + }); + describe('case insensitivity', function() { it("is considered when there's a difference", function() { const diffResult = diffWordsWithSpace('new value', 'New ValueMoreData', {ignoreCase: true}); From 1caa0d527b2ab939dcda5304d2af2dda1d51e0b8 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Tue, 5 Mar 2024 19:16:12 +0000 Subject: [PATCH 15/52] Fix tests of counts --- test/diff/word.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/diff/word.js b/test/diff/word.js index 8e3dd4fc..3e08cf42 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -145,14 +145,14 @@ describe('WordDiff', function() { it('should include count with identity cases', function() { expect(diffWords('foo', 'foo')).to.eql([{value: 'foo', count: 1, removed: false, added: false}]); - expect(diffWords('foo bar', 'foo bar')).to.eql([{value: 'foo bar', count: 3, removed: false, added: false}]); + expect(diffWords('foo bar', 'foo bar')).to.eql([{value: 'foo bar', count: 2, removed: false, added: false}]); }); it('should include count with empty cases', function() { expect(diffWords('foo', '')).to.eql([{value: 'foo', count: 1, added: false, removed: true}]); - expect(diffWords('foo bar', '')).to.eql([{value: 'foo bar', count: 3, added: false, removed: true}]); + expect(diffWords('foo bar', '')).to.eql([{value: 'foo bar', count: 2, added: false, removed: true}]); expect(diffWords('', 'foo')).to.eql([{value: 'foo', count: 1, added: true, removed: false}]); - expect(diffWords('', 'foo bar')).to.eql([{value: 'foo bar', count: 3, added: true, removed: false}]); + expect(diffWords('', 'foo bar')).to.eql([{value: 'foo bar', count: 2, added: true, removed: false}]); }); it('should ignore whitespace', function() { From 68c65055c1d283c142ab4b2cad54438eb0533d35 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Tue, 5 Mar 2024 19:17:20 +0000 Subject: [PATCH 16/52] Update another test --- test/diff/word.js | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/diff/word.js b/test/diff/word.js index 3e08cf42..cb4ca297 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -156,11 +156,11 @@ describe('WordDiff', function() { }); it('should ignore whitespace', function() { - expect(diffWords('hase igel fuchs', 'hase igel fuchs')).to.eql([{ count: 5, value: 'hase igel fuchs', removed: false, added: false }]); - expect(diffWords('hase igel fuchs', 'hase igel fuchs\n')).to.eql([{ count: 5, value: 'hase igel fuchs\n', removed: false, added: false }]); - expect(diffWords('hase igel fuchs\n', 'hase igel fuchs')).to.eql([{ count: 5, value: 'hase igel fuchs\n', removed: false, added: false }]); - expect(diffWords('hase igel fuchs', 'hase igel\nfuchs')).to.eql([{ count: 5, value: 'hase igel\nfuchs', removed: false, added: false }]); - expect(diffWords('hase igel\nfuchs', 'hase igel fuchs')).to.eql([{ count: 5, value: 'hase igel fuchs', removed: false, added: false }]); + expect(diffWords('hase igel fuchs', 'hase igel fuchs')).to.eql([{ count: 3, value: 'hase igel fuchs', removed: false, added: false }]); + expect(diffWords('hase igel fuchs', 'hase igel fuchs\n')).to.eql([{ count: 3, value: 'hase igel fuchs\n', removed: false, added: false }]); + expect(diffWords('hase igel fuchs\n', 'hase igel fuchs')).to.eql([{ count: 3, value: 'hase igel fuchs', removed: false, added: false }]); + expect(diffWords('hase igel fuchs', 'hase igel\nfuchs')).to.eql([{ count: 3, value: 'hase igel\nfuchs', removed: false, added: false }]); + expect(diffWords('hase igel\nfuchs', 'hase igel fuchs')).to.eql([{ count: 3, value: 'hase igel fuchs', removed: false, added: false }]); }); it('should diff whitespace with flag', function() { From e170ade733047d33ac1aa34f5295d84211fe2613 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Tue, 5 Mar 2024 19:20:10 +0000 Subject: [PATCH 17/52] Nuke test of removed ignoreWhitespace flag --- test/diff/word.js | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/diff/word.js b/test/diff/word.js index cb4ca297..045dadff 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -163,11 +163,6 @@ describe('WordDiff', function() { expect(diffWords('hase igel\nfuchs', 'hase igel fuchs')).to.eql([{ count: 3, value: 'hase igel fuchs', removed: false, added: false }]); }); - it('should diff whitespace with flag', function() { - const diffResult = diffWords('New Value', 'New ValueMoreData', {ignoreWhitespace: false}); - expect(convertChangesToXML(diffResult)).to.equal('New Value ValueMoreData'); - }); - it('should diff with only whitespace', function() { let diffResult = diffWords('', ' '); expect(convertChangesToXML(diffResult)).to.equal(' '); From 221365e22e0b3e8b4afe1736c47daa40ba4aabde Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Tue, 5 Mar 2024 19:30:49 +0000 Subject: [PATCH 18/52] Add TODO --- src/diff/word.js | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/diff/word.js b/src/diff/word.js index 91cf00fa..cabd1441 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -1,5 +1,6 @@ import Diff from './base'; import { longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix, removePrefix, removeSuffix, maximumOverlap } from '../util/string'; +import {generateOptions} from '../util/params'; // Based on https://en.wikipedia.org/wiki/Latin_script_in_Unicode // @@ -102,6 +103,9 @@ wordDiff.join = function(tokens) { }; export function diffWords(oldStr, newStr, options) { + // TODO: Support async properly. Currently conflicts with the + // extra logic below. + options = generateOptions(options, {}); const changes = wordDiff.diff(oldStr, newStr, options); // TODO: skip all the stuff below in 1 token per co mode @@ -242,5 +246,6 @@ wordWithSpaceDiff.tokenize = function(value) { return value.match(tokenizeIncludingWhitespace) || []; }; export function diffWordsWithSpace(oldStr, newStr, options) { + options = generateOptions(options, {}); return wordWithSpaceDiff.diff(oldStr, newStr, options); } From 144a26d2c31db44d7a1924fd2eb429aa6d50aca0 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 6 Mar 2024 11:17:58 +0000 Subject: [PATCH 19/52] Support async --- src/diff/base.js | 4 ++++ src/diff/word.js | 16 +++++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/diff/base.js b/src/diff/base.js index b356d953..e240bf66 100644 --- a/src/diff/base.js +++ b/src/diff/base.js @@ -11,6 +11,7 @@ Diff.prototype = { let self = this; function done(value) { + value = self.postProcess(value); if (callback) { setTimeout(function() { callback(value); }, 0); return true; @@ -209,6 +210,9 @@ Diff.prototype = { }, join(chars) { return chars.join(''); + }, + postProcess(changeObjects) { + return changeObjects; } }; diff --git a/src/diff/word.js b/src/diff/word.js index cabd1441..c866da7d 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -102,13 +102,13 @@ wordDiff.join = function(tokens) { }).join(''); }; -export function diffWords(oldStr, newStr, options) { - // TODO: Support async properly. Currently conflicts with the - // extra logic below. - options = generateOptions(options, {}); - const changes = wordDiff.diff(oldStr, newStr, options); +wordDiff.postProcess = function(changes) { // TODO: skip all the stuff below in 1 token per co mode + if (!changes) { + return changes; + } + let lastKeep = null; // Change objects representing any insertion or deletion since the last // "keep" change object. There can be at most one of each. @@ -132,7 +132,13 @@ export function diffWords(oldStr, newStr, options) { dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, null); } return changes; +}; + +export function diffWords(oldStr, newStr, options) { + options = generateOptions(options, {}); + return wordDiff.diff(oldStr, newStr, options); } + function dedupeWhitespaceInChangeObjects(startKeep, deletion, insertion, endKeep) { // Before returning, we tidy up the leading and trailing whitespace of the // change objects to eliminate cases where trailing whitespace in one object From 130fc565215272c8af3f7f1a6b44c2a6da2778e3 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 6 Mar 2024 11:18:18 +0000 Subject: [PATCH 20/52] Add TODO --- src/diff/base.js | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diff/base.js b/src/diff/base.js index e240bf66..ba6cd2fe 100644 --- a/src/diff/base.js +++ b/src/diff/base.js @@ -211,6 +211,7 @@ Diff.prototype = { join(chars) { return chars.join(''); }, + // TODO: Document me postProcess(changeObjects) { return changeObjects; } From 45968a92b7f84728d9a0a3c8259059fcc061b8f4 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 6 Mar 2024 11:24:32 +0000 Subject: [PATCH 21/52] Fix newline handling in diffWordsWithSpace --- src/diff/word.js | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/diff/word.js b/src/diff/word.js index c866da7d..98bb0e8b 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -249,7 +249,13 @@ function dedupeWhitespaceInChangeObjects(startKeep, deletion, insertion, endKeep export const wordWithSpaceDiff = new Diff(); wordWithSpaceDiff.tokenize = function(value) { - return value.match(tokenizeIncludingWhitespace) || []; + // Slightly different to the tokenizeIncludingWhitespace regex used above in + // that this one treats each individual newline as a distinct tokens, rather + // than merging them into other surrounding whitespace. This was requested + // in https://github.com/kpdecker/jsdiff/issues/180 & + // https://github.com/kpdecker/jsdiff/issues/211 + const regex = new RegExp(`\r?\n|[${extendedWordChars}]+|\\s+|[^${extendedWordChars}]`, 'ug'); + return value.match(regex) || []; }; export function diffWordsWithSpace(oldStr, newStr, options) { options = generateOptions(options, {}); From ebcbb6dbefecfdc9993f061be0db3f29bdcba936 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 6 Mar 2024 11:42:56 +0000 Subject: [PATCH 22/52] Fix a minor oopsie (but it doesn't fix the test failure...) --- src/diff/word.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diff/word.js b/src/diff/word.js index 98bb0e8b..1bc2362e 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -254,7 +254,7 @@ wordWithSpaceDiff.tokenize = function(value) { // than merging them into other surrounding whitespace. This was requested // in https://github.com/kpdecker/jsdiff/issues/180 & // https://github.com/kpdecker/jsdiff/issues/211 - const regex = new RegExp(`\r?\n|[${extendedWordChars}]+|\\s+|[^${extendedWordChars}]`, 'ug'); + const regex = new RegExp(`\\r?\\n|[${extendedWordChars}]+|\\s+|[^${extendedWordChars}]`, 'ug'); return value.match(regex) || []; }; export function diffWordsWithSpace(oldStr, newStr, options) { From 1b5595c3ed4dcc942ed53ed0c20683c407d41d02 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 6 Mar 2024 12:13:32 +0000 Subject: [PATCH 23/52] Fix test failure about Windows newlines --- src/diff/word.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diff/word.js b/src/diff/word.js index 1bc2362e..679de9a6 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -254,7 +254,7 @@ wordWithSpaceDiff.tokenize = function(value) { // than merging them into other surrounding whitespace. This was requested // in https://github.com/kpdecker/jsdiff/issues/180 & // https://github.com/kpdecker/jsdiff/issues/211 - const regex = new RegExp(`\\r?\\n|[${extendedWordChars}]+|\\s+|[^${extendedWordChars}]`, 'ug'); + const regex = new RegExp(`(\\r?\\n)|[${extendedWordChars}]+|[^\\S\\n\\r]+|[^${extendedWordChars}]`, 'ug'); return value.match(regex) || []; }; export function diffWordsWithSpace(oldStr, newStr, options) { From 76ce2a35d96eabf19af93b5c1ced43a5d394ff00 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 6 Mar 2024 12:19:12 +0000 Subject: [PATCH 24/52] Document postProcess --- README.md | 1 + src/diff/base.js | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 142727ac..b09636d2 100644 --- a/README.md +++ b/README.md @@ -182,6 +182,7 @@ For even more customisation of the diffing behavior, you can create a `new Diff. * `removeEmpty(array)`: called on the arrays of tokens returned by `tokenize` and can be used to modify them. Defaults to stripping out falsey tokens, such as empty strings. `diffArrays` overrides this to simply return the `array`, which means that falsey values like empty strings can be handled like any other token by `diffArrays`. * `equals(left, right, options)`: called to determine if two tokens (one from the old string, one from the new string) should be considered equal. Defaults to comparing them with `===`. * `join(tokens)`: gets called with an array of consecutive tokens that have either all been added, all been removed, or are all common. Needs to join them into a single value that can be used as the `value` property of the [change object](#change-objects) for these tokens. Defaults to simply returning `tokens.join('')`. +* `postProcess(changeObjects)`: gets called at the end of the algorithm with the [change objects](#change-objects) produced, and can do final cleanups on them. Defaults to simply returning `changeObjects` unchanged. ### Change Objects Many of the methods above return change objects. These objects consist of the following fields: diff --git a/src/diff/base.js b/src/diff/base.js index ba6cd2fe..e240bf66 100644 --- a/src/diff/base.js +++ b/src/diff/base.js @@ -211,7 +211,6 @@ Diff.prototype = { join(chars) { return chars.join(''); }, - // TODO: Document me postProcess(changeObjects) { return changeObjects; } From 51f7cf5440f2c6b27fd1b8584538b580e3c8a5a8 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 6 Mar 2024 12:32:37 +0000 Subject: [PATCH 25/52] Remove done TODO --- test/diff/word.js | 3 --- 1 file changed, 3 deletions(-) diff --git a/test/diff/word.js b/test/diff/word.js index a04065d0..af8163bc 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -115,9 +115,6 @@ describe('WordDiff', function() { expect(convertChangesToXML(diffResult)).to.equal('New:,Value:Test,More,Data '); }); - // TODO: Review all tests below here - // Diff on word boundary - // Diff without changes it('should handle identity', function() { const diffResult = diffWords('New Value', 'New Value'); From 8f02ce9f4d4a8f0c6f1f6c8ad1f83c4af6694891 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 6 Mar 2024 14:07:16 +0000 Subject: [PATCH 26/52] Add (failing) test of one-change-per-token mode --- test/diff/word.js | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/diff/word.js b/test/diff/word.js index af8163bc..c2e2ad50 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -110,6 +110,13 @@ describe('WordDiff', function() { }); }); + it('should skip postprocessing of change objects in one-change-object-per-token mode', function() { + const diffResult = diffWords('Foo Bar', 'Foo Baz', {oneChangePerToken: true}); + expect(convertChangesToXML(diffResult)).to.equal( + 'Foo Bar Baz' + ); + }); + it('should treat punctuation characters as tokens', function() { let diffResult = diffWords('New:Value:Test', 'New,Value,More,Data '); expect(convertChangesToXML(diffResult)).to.equal('New:,Value:Test,More,Data '); From aaa865cb520d45bf048a590362bedd69060cdc6b Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 6 Mar 2024 14:09:54 +0000 Subject: [PATCH 27/52] Get test passing --- src/diff/base.js | 2 +- src/diff/word.js | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/diff/base.js b/src/diff/base.js index e240bf66..efeb4f8f 100644 --- a/src/diff/base.js +++ b/src/diff/base.js @@ -11,7 +11,7 @@ Diff.prototype = { let self = this; function done(value) { - value = self.postProcess(value); + value = self.postProcess(value, options); if (callback) { setTimeout(function() { callback(value); }, 0); return true; diff --git a/src/diff/word.js b/src/diff/word.js index 679de9a6..48a65c79 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -102,10 +102,8 @@ wordDiff.join = function(tokens) { }).join(''); }; -wordDiff.postProcess = function(changes) { - // TODO: skip all the stuff below in 1 token per co mode - - if (!changes) { +wordDiff.postProcess = function(changes, options) { + if (!changes || options.oneChangePerToken) { return changes; } From 0fd6c09a68fc8a907208a398ab738008d3044597 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 6 Mar 2024 14:11:30 +0000 Subject: [PATCH 28/52] Fix typo --- src/diff/word.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diff/word.js b/src/diff/word.js index 48a65c79..028b0f1b 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -88,7 +88,7 @@ wordDiff.tokenize = function(value) { wordDiff.join = function(tokens) { // Tokens being joined here will always have appeared consecutively in the // same text, so we can simply strip off the leading whitespace from all the - // tokens except the first (and expect any whitespace-only tokens) and then + // tokens except the first (and except any whitespace-only tokens) and then // join them and the whitespace around words and punctuation will end up // correct. return tokens.map((token, i) => { From 3108dfa8d787a6d7b81b0ce7bcd2a71ab11c3751 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 6 Mar 2024 14:28:01 +0000 Subject: [PATCH 29/52] Tweak code to improve coverage stat --- src/diff/word.js | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/diff/word.js b/src/diff/word.js index 028b0f1b..daa8451a 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -88,14 +88,12 @@ wordDiff.tokenize = function(value) { wordDiff.join = function(tokens) { // Tokens being joined here will always have appeared consecutively in the // same text, so we can simply strip off the leading whitespace from all the - // tokens except the first (and except any whitespace-only tokens) and then - // join them and the whitespace around words and punctuation will end up - // correct. + // tokens except the first (and except any whitespace-only tokens - but such + // a token will always be the first and only token anyway) and then join them + // and the whitespace around words and punctuation will end up correct. return tokens.map((token, i) => { if (i == 0) { return token; - } else if ((/^\s+$/).test(token)) { - return token; } else { return token.replace((/^\s+/), ''); } From 1a26257e4bce4d50d09d013bcfd9dfd580ccc737 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 6 Mar 2024 14:30:17 +0000 Subject: [PATCH 30/52] Remove unreachable code to placate istanbul --- src/diff/word.js | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/diff/word.js b/src/diff/word.js index daa8451a..b9884d39 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -176,11 +176,6 @@ function dedupeWhitespaceInChangeObjects(startKeep, deletion, insertion, endKeep // * Just an "insert" // * Just a "delete" // We handle the three cases separately. - - if (!deletion && !insertion) { - throw Error("deletion & insertion are both null; this shouldn't happen"); - } - if (deletion && insertion) { const oldWsPrefix = deletion.value.match(/^\s*/)[0]; const oldWsSuffix = deletion.value.match(/\s*$/)[0]; From 2c3ccd8460ed4936f6655364a249509a723e9f59 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 6 Mar 2024 14:32:59 +0000 Subject: [PATCH 31/52] Add an extra test to improve coverage --- test/diff/word.js | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/diff/word.js b/test/diff/word.js index c2e2ad50..2b5080d7 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -117,6 +117,13 @@ describe('WordDiff', function() { ); }); + it('should respect options.ignoreCase', function() { + const diffResult = diffWords('foo bar baz', 'FOO BAR QUX', {ignoreCase: true}); + expect(convertChangesToXML(diffResult)).to.equal( + 'FOO BAR bazQUX' + ); + }); + it('should treat punctuation characters as tokens', function() { let diffResult = diffWords('New:Value:Test', 'New,Value,More,Data '); expect(convertChangesToXML(diffResult)).to.equal('New:,Value:Test,More,Data '); From 29eff792c9e1c9f9987683606433508b1168003b Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 6 Mar 2024 14:41:40 +0000 Subject: [PATCH 32/52] Replace for...of loops with .forEach to placate istanbul --- src/diff/word.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/diff/word.js b/src/diff/word.js index b9884d39..0db3bb7b 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -63,7 +63,7 @@ wordDiff.tokenize = function(value) { let parts = value.match(tokenizeIncludingWhitespace) || []; const tokens = []; let prevPart = null; - for (const part of parts) { + parts.forEach(part => { if ((/\s/).test(part)) { if (prevPart == null) { tokens.push(part); @@ -81,7 +81,7 @@ wordDiff.tokenize = function(value) { } prevPart = part; - } + }); return tokens; }; @@ -110,7 +110,7 @@ wordDiff.postProcess = function(changes, options) { // "keep" change object. There can be at most one of each. let insertion = null; let deletion = null; - for (const change of changes) { + changes.forEach(change => { if (change.added) { insertion = change; } else if (change.removed) { @@ -123,7 +123,7 @@ wordDiff.postProcess = function(changes, options) { insertion = null; deletion = null; } - } + }); if (insertion || deletion) { dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, null); } From 5cbb17af174bc0cdbd06693e56aa8d54ccb97a6c Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 6 Mar 2024 16:24:53 +0000 Subject: [PATCH 33/52] Begin testing string utils --- test/util/string.js | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 test/util/string.js diff --git a/test/util/string.js b/test/util/string.js new file mode 100644 index 00000000..9e101e71 --- /dev/null +++ b/test/util/string.js @@ -0,0 +1,15 @@ +import {longestCommonPrefix} from '../../lib/util/string'; +import {expect} from 'chai'; + +describe('#longestCommonPrefix', function() { + it('finds the longest common prefix', function() { + expect(longestCommonPrefix('food', 'foolish')).to.equal('foo'); + expect(longestCommonPrefix('foolish', 'food')).to.equal('foo'); + expect(longestCommonPrefix('foolish', 'foo')).to.equal('foo'); + expect(longestCommonPrefix('foo', 'foolish')).to.equal('foo'); + expect(longestCommonPrefix('foo', '')).to.equal(''); + expect(longestCommonPrefix('', 'foo')).to.equal(''); + expect(longestCommonPrefix('', '')).to.equal(''); + expect(longestCommonPrefix('foo', 'bar')).to.equal(''); + }); +}); From d09ace3227a09bda8eec7572dc206599f460fef2 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 6 Mar 2024 16:30:01 +0000 Subject: [PATCH 34/52] Add another test; fix serious bugs --- src/util/string.js | 12 ++++++++++-- test/util/string.js | 16 +++++++++++++++- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/src/util/string.js b/src/util/string.js index 08d7796b..316e2a83 100644 --- a/src/util/string.js +++ b/src/util/string.js @@ -12,12 +12,20 @@ export function longestCommonPrefix(str1, str2) { export function longestCommonSuffix(str1, str2) { let i; + + // Unlike longestCommonPrefix, we need a special case to handle all scenarios + // where we return the empty string since str1.slice(-0) will return the + // entire string. + if (!str1 || !str2 || str1[str1.length - 1] != str2[str2.length - 1]) { + return ''; + } + for (i = 0; i < str1.length && i < str2.length; i++) { - if (str1[str1.length - (i + 1)] != str2[i - (i + 1)]) { + if (str1[str1.length - (i + 1)] != str2[str2.length - (i + 1)]) { return str1.slice(-i); } } - return str1.slice(0, i); + return str1.slice(-i); } export function replacePrefix(string, oldPrefix, newPrefix) { diff --git a/test/util/string.js b/test/util/string.js index 9e101e71..71e90db7 100644 --- a/test/util/string.js +++ b/test/util/string.js @@ -1,4 +1,4 @@ -import {longestCommonPrefix} from '../../lib/util/string'; +import {longestCommonPrefix, longestCommonSuffix} from '../../lib/util/string'; import {expect} from 'chai'; describe('#longestCommonPrefix', function() { @@ -13,3 +13,17 @@ describe('#longestCommonPrefix', function() { expect(longestCommonPrefix('foo', 'bar')).to.equal(''); }); }); + +describe('#longestCommonSuffix', function() { + it('finds the longest common suffix', function() { + expect(longestCommonSuffix('bumpy', 'grumpy')).to.equal('umpy'); + expect(longestCommonSuffix('grumpy', 'bumpy')).to.equal('umpy'); + expect(longestCommonSuffix('grumpy', 'umpy')).to.equal('umpy'); + expect(longestCommonSuffix('umpy', 'grumpy')).to.equal('umpy'); + expect(longestCommonSuffix('foo', '')).to.equal(''); + expect(longestCommonSuffix('', 'foo')).to.equal(''); + expect(longestCommonSuffix('', '')).to.equal(''); + expect(longestCommonSuffix('foo', 'bar')).to.equal(''); + }); +}); + From 493fbd26eaf877b090d101330bd3ab9627c6f5a5 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 6 Mar 2024 16:56:03 +0000 Subject: [PATCH 35/52] Test another util --- test/util/string.js | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/test/util/string.js b/test/util/string.js index 71e90db7..d29be447 100644 --- a/test/util/string.js +++ b/test/util/string.js @@ -1,4 +1,4 @@ -import {longestCommonPrefix, longestCommonSuffix} from '../../lib/util/string'; +import {longestCommonPrefix, longestCommonSuffix, replacePrefix} from '../../lib/util/string'; import {expect} from 'chai'; describe('#longestCommonPrefix', function() { @@ -27,3 +27,14 @@ describe('#longestCommonSuffix', function() { }); }); +describe('#replacePrefix', function() { + it('replaces a prefix on a string with a different prefix', function() { + expect((replacePrefix('food', 'foo', 'gla'))).to.equal('glad'); + expect((replacePrefix('food', '', 'good '))).to.equal('good food'); + }); + + it("throws if the prefix to remove isn't present", function() { + // eslint-disable-next-line dot-notation + expect(() => replacePrefix('food', 'drin', 'goo')).to.throw(); + }); +}); From c36e533316e0209f13a920b6eeb2d1e26098e1cf Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 6 Mar 2024 16:57:58 +0000 Subject: [PATCH 36/52] Test more --- test/util/string.js | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/test/util/string.js b/test/util/string.js index d29be447..6bfab828 100644 --- a/test/util/string.js +++ b/test/util/string.js @@ -1,4 +1,4 @@ -import {longestCommonPrefix, longestCommonSuffix, replacePrefix} from '../../lib/util/string'; +import {longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix} from '../../lib/util/string'; import {expect} from 'chai'; describe('#longestCommonPrefix', function() { @@ -38,3 +38,15 @@ describe('#replacePrefix', function() { expect(() => replacePrefix('food', 'drin', 'goo')).to.throw(); }); }); + +describe('#replaceSuffx', function() { + it('replaces a suffix on a string with a different suffix', function() { + expect((replaceSuffix('bangle', 'gle', 'jo'))).to.equal('banjo'); + expect((replaceSuffix('bun', '', 'gle'))).to.equal('bungle'); + }); + + it("throws if the suffix to remove isn't present", function() { + // eslint-disable-next-line dot-notation + expect(() => replaceSuffix('food', 'ool', 'ondle')).to.throw(); + }); +}); From 42da26e4005a4dd0e88d2c5ccb5e825b5f1f3c07 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 6 Mar 2024 17:03:01 +0000 Subject: [PATCH 37/52] Add more tests --- test/util/string.js | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/test/util/string.js b/test/util/string.js index 6bfab828..dc7c66c3 100644 --- a/test/util/string.js +++ b/test/util/string.js @@ -1,4 +1,4 @@ -import {longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix} from '../../lib/util/string'; +import {longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix, removePrefix, removeSuffix} from '../../lib/util/string'; import {expect} from 'chai'; describe('#longestCommonPrefix', function() { @@ -39,7 +39,7 @@ describe('#replacePrefix', function() { }); }); -describe('#replaceSuffx', function() { +describe('#replaceSuffix', function() { it('replaces a suffix on a string with a different suffix', function() { expect((replaceSuffix('bangle', 'gle', 'jo'))).to.equal('banjo'); expect((replaceSuffix('bun', '', 'gle'))).to.equal('bungle'); @@ -50,3 +50,29 @@ describe('#replaceSuffx', function() { expect(() => replaceSuffix('food', 'ool', 'ondle')).to.throw(); }); }); + +describe('#removePrefix', function() { + it('removes a prefix', function() { + expect(removePrefix('inconceivable', 'in')).to.equal('conceivable'); + expect(removePrefix('inconceivable', '')).to.equal('inconceivable'); + expect(removePrefix('inconceivable', 'inconceivable')).to.equal(''); + }); + + it("throws if the prefix to remove isn't present", function() { + // eslint-disable-next-line dot-notation + expect(() => removePrefix('food', 'dr')).to.throw(); + }); +}); + +describe('#removeSuffix', function() { + it('removes a suffix', function() { + expect(removeSuffix('counterfactual', 'factual')).to.equal('counter'); + expect(removeSuffix('counterfactual', '')).to.equal('counterfactual'); + expect(removeSuffix('counterfactual', 'counterfactual')).to.equal(''); + }); + + it("throws if tyhe suffix to remove isn't present", function() { + // eslint-disable-next-line dot-notation + expect(() => removeSuffix('food', 'dr')).to.throw(); + }); +}); From 5bda1116ebfed643c11dd6cb1d9d548bbbb046df Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Thu, 7 Mar 2024 09:33:34 +0000 Subject: [PATCH 38/52] Test more --- test/util/string.js | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/test/util/string.js b/test/util/string.js index dc7c66c3..df151b32 100644 --- a/test/util/string.js +++ b/test/util/string.js @@ -1,4 +1,4 @@ -import {longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix, removePrefix, removeSuffix} from '../../lib/util/string'; +import {longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix, removePrefix, removeSuffix, maximumOverlap} from '../../lib/util/string'; import {expect} from 'chai'; describe('#longestCommonPrefix', function() { @@ -71,8 +71,20 @@ describe('#removeSuffix', function() { expect(removeSuffix('counterfactual', 'counterfactual')).to.equal(''); }); - it("throws if tyhe suffix to remove isn't present", function() { + it("throws if the suffix to remove isn't present", function() { // eslint-disable-next-line dot-notation expect(() => removeSuffix('food', 'dr')).to.throw(); }); }); + +describe('#maximumOverlap', function() { + it('finds the maximum overlap between the end of one string and the start of the other', function() { + expect(maximumOverlap('qwertyuiop', 'uiopasdfgh')).to.equal('uiop'); + expect(maximumOverlap('qwertyuiop', 'qwertyuiop')).to.equal('qwertyuiop'); + expect(maximumOverlap('qwertyuiop', 'asdfghjkl')).to.equal(''); + expect(maximumOverlap('qwertyuiop', '')).to.equal(''); + expect(maximumOverlap('uiopasdfgh', 'qwertyuiop')).to.equal(''); + expect(maximumOverlap('x ', ' x')).to.equal(' '); + expect(maximumOverlap('', '')).to.equal(''); + }); +}); From 549ed5f8ce967039ad6a905247267e095b0bed72 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Thu, 7 Mar 2024 16:19:41 +0000 Subject: [PATCH 39/52] Remove TODOs --- src/util/string.js | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/util/string.js b/src/util/string.js index 316e2a83..bcb0ae92 100644 --- a/src/util/string.js +++ b/src/util/string.js @@ -1,5 +1,3 @@ -// TODO: unit test these utils - export function longestCommonPrefix(str1, str2) { let i; for (i = 0; i < str1.length && i < str2.length; i++) { @@ -59,7 +57,6 @@ export function maximumOverlap(string1, string2) { } // Nicked from https://stackoverflow.com/a/60422853/1709587 -// TODO: Check this works before committing function overlapCount(a, b) { // Deal with cases where the strings differ in length let startA = 0; From f1398faf1a8445e55ee805540214f22dbc29b89a Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Thu, 7 Mar 2024 16:22:32 +0000 Subject: [PATCH 40/52] Update docs --- README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b09636d2..25a2b3d8 100644 --- a/README.md +++ b/README.md @@ -34,16 +34,14 @@ Broadly, jsdiff's diff functions all take an old text and a new text and perform Options * `ignoreCase`: If `true`, the uppercase and lowercase forms of a character are considered equal. Defaults to `false`. -* `Diff.diffWords(oldStr, newStr[, options])` - diffs two blocks of text, treating each word and each word separator (punctuation, newline, or run of whitespace) as a token. - - (Whitespace-only tokens are automatically treated as equal to each other, so changes like changing a space to a newline or a run of multiple spaces will be ignored.) +* `Diff.diffWords(oldStr, newStr[, options])` - diffs two blocks of text, treating each word and each punctuation mark as a token. Whitespace is ignored when computing the diff (but preserved as far as possible in the final change objects). Returns a list of [change objects](#change-objects). Options * `ignoreCase`: Same as in `diffChars`. Defaults to false. -* `Diff.diffWordsWithSpace(oldStr, newStr[, options])` - same as `diffWords`, except whitespace-only tokens are not automatically considered equal, so e.g. changing a space to a tab is considered a change. +* `Diff.diffWordsWithSpace(oldStr, newStr[, options])` - diffs two blocks of text, treating each word, punctuation mark, newline, or run of (non-newline) whitespace as a token. * `Diff.diffLines(oldStr, newStr[, options])` - diffs two blocks of text, treating each line as a token. From f892d72daa9418c8482b1b6f26c135d2633e2d3d Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Thu, 7 Mar 2024 16:43:27 +0000 Subject: [PATCH 41/52] Remove unneeded generateOptions calls (and improve test coverage as a consequence) --- src/diff/word.js | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/diff/word.js b/src/diff/word.js index 0db3bb7b..0fe9247d 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -1,6 +1,5 @@ import Diff from './base'; import { longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix, removePrefix, removeSuffix, maximumOverlap } from '../util/string'; -import {generateOptions} from '../util/params'; // Based on https://en.wikipedia.org/wiki/Latin_script_in_Unicode // @@ -131,7 +130,6 @@ wordDiff.postProcess = function(changes, options) { }; export function diffWords(oldStr, newStr, options) { - options = generateOptions(options, {}); return wordDiff.diff(oldStr, newStr, options); } @@ -249,6 +247,5 @@ wordWithSpaceDiff.tokenize = function(value) { return value.match(regex) || []; }; export function diffWordsWithSpace(oldStr, newStr, options) { - options = generateOptions(options, {}); return wordWithSpaceDiff.diff(oldStr, newStr, options); } From 3da0adb05f69bd6497f1d9736dfd8c49769bf6e3 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Thu, 7 Mar 2024 16:46:22 +0000 Subject: [PATCH 42/52] Purge now-dead code --- src/diff/base.js | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/src/diff/base.js b/src/diff/base.js index efeb4f8f..0b07fc9d 100644 --- a/src/diff/base.js +++ b/src/diff/base.js @@ -42,7 +42,7 @@ Diff.prototype = { let newPos = this.extractCommon(bestPath[0], newString, oldString, 0, options); if (bestPath[0].oldPos + 1 >= oldLen && newPos + 1 >= newLen) { // Identity per the equality and tokenizer - return done(buildValues(self, bestPath[0].lastComponent, newString, oldString, self.useLongestToken, options)); + return done(buildValues(self, bestPath[0].lastComponent, newString, oldString, self.useLongestToken)); } // Once we hit the right edge of the edit graph on some diagonal k, we can @@ -106,7 +106,7 @@ Diff.prototype = { if (basePath.oldPos + 1 >= oldLen && newPos + 1 >= newLen) { // If we have hit the end of both strings, then we are done - return done(buildValues(self, basePath.lastComponent, newString, oldString, self.useLongestToken, options)); + return done(buildValues(self, basePath.lastComponent, newString, oldString, self.useLongestToken)); } else { bestPath[diagonalPath] = basePath; if (basePath.oldPos + 1 >= oldLen) { @@ -216,7 +216,7 @@ Diff.prototype = { } }; -function buildValues(diff, lastComponent, newString, oldString, useLongestToken, options) { +function buildValues(diff, lastComponent, newString, oldString, useLongestToken) { // First we convert our linked list of components in reverse order to an // array in the right order: const components = []; @@ -260,22 +260,5 @@ function buildValues(diff, lastComponent, newString, oldString, useLongestToken, } } - // Special case handle for when one terminal is ignored (i.e. whitespace). - // For this case we merge the terminal into the prior string and drop the change. - // This is only available for string mode. - let finalComponent = components[componentLen - 1]; - if ( - componentLen > 1 - && typeof finalComponent.value === 'string' - && ( - (finalComponent.added && diff.equals('', finalComponent.value, options)) - || - (finalComponent.removed && diff.equals(finalComponent.value, '', options)) - ) - ) { - components[componentLen - 2].value += finalComponent.value; - components.pop(); - } - return components; } From c2e94848fcc4adbae6fb60acad2725c19b1b25a2 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Thu, 7 Mar 2024 16:52:02 +0000 Subject: [PATCH 43/52] Add test to placate coverage metric --- test/diff/word.js | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/diff/word.js b/test/diff/word.js index 2b5080d7..adcfe267 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -253,6 +253,11 @@ describe('WordDiff', function() { expect(convertChangesToXML(diffResult)).to.equal('NewValue Value New ValueNew'); }); + it('should handle empty', function() { + const diffResult = diffWordsWithSpace('', ''); + expect(convertChangesToXML(diffResult)).to.equal(''); + }); + describe('case insensitivity', function() { it("is considered when there's a difference", function() { const diffResult = diffWordsWithSpace('new value', 'New ValueMoreData', {ignoreCase: true}); From dd43b1f789b57a305cd2bcf9b525dab83f958405 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Thu, 7 Mar 2024 16:56:36 +0000 Subject: [PATCH 44/52] Add yet another test to fix a util coverage gap I'd introduced --- test/diff/line.js | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/diff/line.js b/test/diff/line.js index 894514ba..610e0ea7 100644 --- a/test/diff/line.js +++ b/test/diff/line.js @@ -204,6 +204,19 @@ describe('diff/line', function() { ); expect(convertChangesToXML(diffResult)).to.equal('line1\nline2\n \n\nline4\n \n'); }); + + it('supports async mode by passing a function as the options argument', function(done) { + diffTrimmedLines( + 'line\r\nold value \r\nline', + 'line \r\nnew value\r\nline', + function(diffResult) { + expect(convertChangesToXML(diffResult)).to.equal( + 'line \r\nold value \r\nnew value\r\nline' + ); + done(); + } + ); + }); }); describe('#diffLinesNL', function() { From 1827219e4a6a26c554849cdc1d48b3b681a847fe Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Fri, 8 Mar 2024 10:54:26 +0000 Subject: [PATCH 45/52] Bold changes in release notes --- release-notes.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/release-notes.md b/release-notes.md index 3a27d6e4..3a00e3f8 100644 --- a/release-notes.md +++ b/release-notes.md @@ -4,16 +4,17 @@ [Commits](https://github.com/kpdecker/jsdiff/compare/master...v6.0.0-staging) -- [#435](https://github.com/kpdecker/jsdiff/pull/435) Fix `parsePatch` handling of control characters. `parsePatch` used to interpret various unusual control characters - namely vertical tabs, form feeds, lone carriage returns without a line feed, and EBCDIC NELs - as line breaks when parsing a patch file. This was inconsistent with the behavior of both JsDiff's own `diffLines` method and also the Unix `diff` and `patch` utils, which all simply treat those control characters as ordinary characters. The result of this discrepancy was that some well-formed patches - produced either by `diff` or by JsDiff itself and handled properly by the `patch` util - would be wrongly parsed by `parsePatch`, with the effect that it would disregard the remainder of a hunk after encountering one of these control characters. -- [#439](https://github.com/kpdecker/jsdiff/pull/439) Prefer diffs that order deletions before insertions. When faced with a choice between two diffs with an equal total edit distance, the Myers diff algorithm generally prefers one that does deletions before insertions rather than insertions before deletions. For instance, when diffing `abcd` against `acbd`, it will prefer a diff that says to delete the `b` and then insert a new `b` after the `c`, over a diff that says to insert a `c` before the `b` and then delete the existing `c`. JsDiff deviated from the published Myers algorithm in a way that led to it having the opposite preference in many cases, including that example. This is now fixed, meaning diffs output by JsDiff will more accurately reflect what the published Myers diff algorithm would output. -- [#455](https://github.com/kpdecker/jsdiff/pull/455) The `added` and `removed` properties of change objects are now guaranteed to be set to a boolean value. (Previously, they would be set to `undefined` or omitted entirely instead of setting them to false.) +- [] +- [#435](https://github.com/kpdecker/jsdiff/pull/435) **Fix `parsePatch` handling of control characters.** `parsePatch` used to interpret various unusual control characters - namely vertical tabs, form feeds, lone carriage returns without a line feed, and EBCDIC NELs - as line breaks when parsing a patch file. This was inconsistent with the behavior of both JsDiff's own `diffLines` method and also the Unix `diff` and `patch` utils, which all simply treat those control characters as ordinary characters. The result of this discrepancy was that some well-formed patches - produced either by `diff` or by JsDiff itself and handled properly by the `patch` util - would be wrongly parsed by `parsePatch`, with the effect that it would disregard the remainder of a hunk after encountering one of these control characters. +- [#439](https://github.com/kpdecker/jsdiff/pull/439) **Prefer diffs that order deletions before insertions.** When faced with a choice between two diffs with an equal total edit distance, the Myers diff algorithm generally prefers one that does deletions before insertions rather than insertions before deletions. For instance, when diffing `abcd` against `acbd`, it will prefer a diff that says to delete the `b` and then insert a new `b` after the `c`, over a diff that says to insert a `c` before the `b` and then delete the existing `c`. JsDiff deviated from the published Myers algorithm in a way that led to it having the opposite preference in many cases, including that example. This is now fixed, meaning diffs output by JsDiff will more accurately reflect what the published Myers diff algorithm would output. +- [#455](https://github.com/kpdecker/jsdiff/pull/455) **The `added` and `removed` properties of change objects are now guaranteed to be set to a boolean value.** (Previously, they would be set to `undefined` or omitted entirely instead of setting them to false.) - [#464](https://github.com/kpdecker/jsdiff/pull/464) Specifying `{maxEditLength: 0}` now sets a max edit length of 0 instead of no maximum. -- [#460](https://github.com/kpdecker/jsdiff/pull/460) Added `oneChangePerToken` option. -- [#467](https://github.com/kpdecker/jsdiff/pull/467) When passing a `comparator(left, right)` to `diffArrays`, values from the old array will now consistently be passed as the first argument (`left`) and values from the new array as the second argument (`right`). Previously this was almost (but not quite) always the other way round. -- [#480](https://github.com/kpdecker/jsdiff/pull/480) Passing `maxEditLength` to `createPatch` & `createTwoFilesPatch` now works properly (i.e. returns undefined if the max edit distance is exceeded; previous behavior was to crash with a `TypeError` if the edit distance was exceeded). -- [#486](https://github.com/kpdecker/jsdiff/pull/486) The `ignoreWhitespace` option of `diffLines` behaves more sensibly now. `value`s in returned change objects now include leading/trailing whitespace even when `ignoreWhitespace` is used, just like how with `ignoreCase` the `value`s still reflect the case of one of the original texts instead of being all-lowercase. `ignoreWhitespace` is also now compatible with `newlineIsToken`. Finally, `diffTrimmedLines` is deprecated (and removed from the docs) in favour of using `diffLines` with `ignoreWhitespace: true`; the two are, and always have been, equivalent. -- [#490](https://github.com/kpdecker/jsdiff/pull/490) When calling diffing functions in async mode by passing a `callback` option, the diff result will now be passed as the *first* argument to the callback instead of the second. (Previously, the first argument was never used at all and would always have value `undefined`.) -- [#489](github.com/kpdecker/jsdiff/pull/489) `this.options` no longer exists on `Diff` objects. Instead, `options` is now passed as an argument to methods that rely on options, like `equals(left, right, options)`. This fixes a race condition in async mode, where diffing behaviour could be changed mid-execution if a concurrent usage of the same `Diff` instances overwrote its `options`. +- [#460](https://github.com/kpdecker/jsdiff/pull/460) **Added `oneChangePerToken` option.** +- [#467](https://github.com/kpdecker/jsdiff/pull/467) **Consistent ordering of arguments to `comparator(left, right)`.** Values from the old array will now consistently be passed as the first argument (`left`) and values from the new array as the second argument (`right`). Previously this was almost (but not quite) always the other way round. +- [#480](https://github.com/kpdecker/jsdiff/pull/480) **Passing `maxEditLength` to `createPatch` & `createTwoFilesPatch` now works properly** (i.e. returns undefined if the max edit distance is exceeded; previous behavior was to crash with a `TypeError` if the edit distance was exceeded). +- [#486](https://github.com/kpdecker/jsdiff/pull/486) **The `ignoreWhitespace` option of `diffLines` behaves more sensibly now.** `value`s in returned change objects now include leading/trailing whitespace even when `ignoreWhitespace` is used, just like how with `ignoreCase` the `value`s still reflect the case of one of the original texts instead of being all-lowercase. `ignoreWhitespace` is also now compatible with `newlineIsToken`. Finally, **`diffTrimmedLines` is deprecated** (and removed from the docs) in favour of using `diffLines` with `ignoreWhitespace: true`; the two are, and always have been, equivalent. +- [#490](https://github.com/kpdecker/jsdiff/pull/490) **When calling diffing functions in async mode by passing a `callback` option, the diff result will now be passed as the *first* argument to the callback instead of the second.** (Previously, the first argument was never used at all and would always have value `undefined`.) +- [#489](github.com/kpdecker/jsdiff/pull/489) **`this.options` no longer exists on `Diff` objects.** Instead, `options` is now passed as an argument to methods that rely on options, like `equals(left, right, options)`. This fixes a race condition in async mode, where diffing behaviour could be changed mid-execution if a concurrent usage of the same `Diff` instances overwrote its `options`. ## v5.2.0 From eab93c64e3d5ef5c4374155a1059b09156026053 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Fri, 8 Mar 2024 11:31:30 +0000 Subject: [PATCH 46/52] Add test of ignoreWhitespace option, and restore it --- src/diff/word.js | 8 ++++++++ test/diff/word.js | 9 +++++++++ 2 files changed, 17 insertions(+) diff --git a/src/diff/word.js b/src/diff/word.js index 0fe9247d..78c96e1a 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -130,6 +130,14 @@ wordDiff.postProcess = function(changes, options) { }; export function diffWords(oldStr, newStr, options) { + // This option has never documented and never will be (it's clearer to just + // call `diffWordsWithSpace` directly if you need that behavior), but has + // existed in jsdiff for a long time, so we retain support for it here for + // the sake of backwards compatibility. + if (options?.ignoreWhitespace != null && !options.ignoreWhitespace) { + return diffWordsWithSpace(oldStr, newStr, options); + } + return wordDiff.diff(oldStr, newStr, options); } diff --git a/test/diff/word.js b/test/diff/word.js index adcfe267..502336b3 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -188,6 +188,15 @@ describe('WordDiff', function() { done(); }); }); + + it('calls #diffWordsWithSpace if you pass ignoreWhitespace: false', function() { + const diffResult = diffWords( + 'foo bar', + 'foo\tbar', + {ignoreWhitespace: false} + ); + expect(convertChangesToXML(diffResult)).to.equal('foo \tbar'); + }); }); describe('#diffWordsWithSpace', function() { From 3f8e9b7e48ea853aa473382f4d51b710a122fc09 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Fri, 8 Mar 2024 11:31:42 +0000 Subject: [PATCH 47/52] Add release notes --- release-notes.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/release-notes.md b/release-notes.md index 3a00e3f8..064814b1 100644 --- a/release-notes.md +++ b/release-notes.md @@ -4,7 +4,13 @@ [Commits](https://github.com/kpdecker/jsdiff/compare/master...v6.0.0-staging) -- [] +- [#497](https://github.com/kpdecker/jsdiff/pull/497) **`diffWords` behavior has been radically changed.** Previously, even with `ignoreWhitespace: true`, runs of whitespace were tokens, which led to unhelpful and unintuitive diffing behavior in typical texts. Specifically, even when two texts contained overlapping passages, `diffWords` would sometimes choose to delete all the words from the old text and insert them anew in their new positions in order to avoid having to delete or insert whitespace tokens. Whitespace sequences are no longer tokens as of this release, which affects both the generated diffs and the `count`s. + + Runs of whitespace are still tokens in `diffWordsWithSpace`. + + As part of the changes to `diffWords`, **a new `.postProcess` method has been added on the base `Diff` type**, which can be overridden in custom `Diff` implementations. + + As part of this change, **the ignore - [#435](https://github.com/kpdecker/jsdiff/pull/435) **Fix `parsePatch` handling of control characters.** `parsePatch` used to interpret various unusual control characters - namely vertical tabs, form feeds, lone carriage returns without a line feed, and EBCDIC NELs - as line breaks when parsing a patch file. This was inconsistent with the behavior of both JsDiff's own `diffLines` method and also the Unix `diff` and `patch` utils, which all simply treat those control characters as ordinary characters. The result of this discrepancy was that some well-formed patches - produced either by `diff` or by JsDiff itself and handled properly by the `patch` util - would be wrongly parsed by `parsePatch`, with the effect that it would disregard the remainder of a hunk after encountering one of these control characters. - [#439](https://github.com/kpdecker/jsdiff/pull/439) **Prefer diffs that order deletions before insertions.** When faced with a choice between two diffs with an equal total edit distance, the Myers diff algorithm generally prefers one that does deletions before insertions rather than insertions before deletions. For instance, when diffing `abcd` against `acbd`, it will prefer a diff that says to delete the `b` and then insert a new `b` after the `c`, over a diff that says to insert a `c` before the `b` and then delete the existing `c`. JsDiff deviated from the published Myers algorithm in a way that led to it having the opposite preference in many cases, including that example. This is now fixed, meaning diffs output by JsDiff will more accurately reflect what the published Myers diff algorithm would output. - [#455](https://github.com/kpdecker/jsdiff/pull/455) **The `added` and `removed` properties of change objects are now guaranteed to be set to a boolean value.** (Previously, they would be set to `undefined` or omitted entirely instead of setting them to false.) From 94f5958c93adbf6ace51ab4159b9939c393f8b97 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Fri, 8 Mar 2024 11:40:28 +0000 Subject: [PATCH 48/52] Remove unfinished sentence --- release-notes.md | 1 - 1 file changed, 1 deletion(-) diff --git a/release-notes.md b/release-notes.md index 064814b1..f5abc97a 100644 --- a/release-notes.md +++ b/release-notes.md @@ -10,7 +10,6 @@ As part of the changes to `diffWords`, **a new `.postProcess` method has been added on the base `Diff` type**, which can be overridden in custom `Diff` implementations. - As part of this change, **the ignore - [#435](https://github.com/kpdecker/jsdiff/pull/435) **Fix `parsePatch` handling of control characters.** `parsePatch` used to interpret various unusual control characters - namely vertical tabs, form feeds, lone carriage returns without a line feed, and EBCDIC NELs - as line breaks when parsing a patch file. This was inconsistent with the behavior of both JsDiff's own `diffLines` method and also the Unix `diff` and `patch` utils, which all simply treat those control characters as ordinary characters. The result of this discrepancy was that some well-formed patches - produced either by `diff` or by JsDiff itself and handled properly by the `patch` util - would be wrongly parsed by `parsePatch`, with the effect that it would disregard the remainder of a hunk after encountering one of these control characters. - [#439](https://github.com/kpdecker/jsdiff/pull/439) **Prefer diffs that order deletions before insertions.** When faced with a choice between two diffs with an equal total edit distance, the Myers diff algorithm generally prefers one that does deletions before insertions rather than insertions before deletions. For instance, when diffing `abcd` against `acbd`, it will prefer a diff that says to delete the `b` and then insert a new `b` after the `c`, over a diff that says to insert a `c` before the `b` and then delete the existing `c`. JsDiff deviated from the published Myers algorithm in a way that led to it having the opposite preference in many cases, including that example. This is now fixed, meaning diffs output by JsDiff will more accurately reflect what the published Myers diff algorithm would output. - [#455](https://github.com/kpdecker/jsdiff/pull/455) **The `added` and `removed` properties of change objects are now guaranteed to be set to a boolean value.** (Previously, they would be set to `undefined` or omitted entirely instead of setting them to false.) From 11237c48a7304deb7c0a3bafad08a6bcd806e8b5 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Fri, 8 Mar 2024 12:02:55 +0000 Subject: [PATCH 49/52] Test and document inadvertent (but welcome) fix I accidentally made to diffTrimmedLines --- release-notes.md | 2 ++ test/diff/line.js | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/release-notes.md b/release-notes.md index f5abc97a..6d093db9 100644 --- a/release-notes.md +++ b/release-notes.md @@ -10,6 +10,8 @@ As part of the changes to `diffWords`, **a new `.postProcess` method has been added on the base `Diff` type**, which can be overridden in custom `Diff` implementations. + **`diffLines` with `ignoreWhitespace: true` will no longer ignore the insertion or deletion of entire extra lines of whitespace at the end of the text**. Previously, these would not show up as insertions or deletions, as a side effect of a hack in the base diffing algorithm meant to help ignore whitespace in `diffWords`. More generally, **the undocumented special handling in the core algorithm for ignored terminals has been removed entirely.** (This special case behavior used to rewrite the final two change objects in a scenario where the final change object was an addition or deletion and its `value` was treated as equal to the empty string when compared using the diff object's `.equals` method.) + - [#435](https://github.com/kpdecker/jsdiff/pull/435) **Fix `parsePatch` handling of control characters.** `parsePatch` used to interpret various unusual control characters - namely vertical tabs, form feeds, lone carriage returns without a line feed, and EBCDIC NELs - as line breaks when parsing a patch file. This was inconsistent with the behavior of both JsDiff's own `diffLines` method and also the Unix `diff` and `patch` utils, which all simply treat those control characters as ordinary characters. The result of this discrepancy was that some well-formed patches - produced either by `diff` or by JsDiff itself and handled properly by the `patch` util - would be wrongly parsed by `parsePatch`, with the effect that it would disregard the remainder of a hunk after encountering one of these control characters. - [#439](https://github.com/kpdecker/jsdiff/pull/439) **Prefer diffs that order deletions before insertions.** When faced with a choice between two diffs with an equal total edit distance, the Myers diff algorithm generally prefers one that does deletions before insertions rather than insertions before deletions. For instance, when diffing `abcd` against `acbd`, it will prefer a diff that says to delete the `b` and then insert a new `b` after the `c`, over a diff that says to insert a `c` before the `b` and then delete the existing `c`. JsDiff deviated from the published Myers algorithm in a way that led to it having the opposite preference in many cases, including that example. This is now fixed, meaning diffs output by JsDiff will more accurately reflect what the published Myers diff algorithm would output. - [#455](https://github.com/kpdecker/jsdiff/pull/455) **The `added` and `removed` properties of change objects are now guaranteed to be set to a boolean value.** (Previously, they would be set to `undefined` or omitted entirely instead of setting them to false.) diff --git a/test/diff/line.js b/test/diff/line.js index 610e0ea7..1461b264 100644 --- a/test/diff/line.js +++ b/test/diff/line.js @@ -157,6 +157,15 @@ describe('diff/line', function() { expect(convertChangesToXML(diffResult4)).to.equal('line\n value\nline'); }); + it('should not ignore the insertion or deletion of lines of whitespace at the end', function() { + const finalChange = diffLines('foo\nbar\n', 'foo\nbar\n \n \n \n', {ignoreWhitespace: true}).pop(); + expect(finalChange.count).to.equal(3); + expect(finalChange.added).to.equal(true); + + const finalChange2 = diffLines('foo\nbar\n\n', 'foo\nbar\n', {ignoreWhitespace: true}).pop(); + expect(finalChange2.removed).to.equal(true); + }); + it('should keep leading and trailing whitespace in the output', function() { function stringify(value) { return JSON.stringify(value, null, 2); From 75d99ab6e1e6f511707b1b4a8c0e1afe66b5113e Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Fri, 8 Mar 2024 12:18:57 +0000 Subject: [PATCH 50/52] Fix typo --- src/diff/word.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/diff/word.js b/src/diff/word.js index 78c96e1a..52613ad6 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -130,10 +130,10 @@ wordDiff.postProcess = function(changes, options) { }; export function diffWords(oldStr, newStr, options) { - // This option has never documented and never will be (it's clearer to just - // call `diffWordsWithSpace` directly if you need that behavior), but has - // existed in jsdiff for a long time, so we retain support for it here for - // the sake of backwards compatibility. + // This option has never been documented and never will be (it's clearer to + // just call `diffWordsWithSpace` directly if you need that behavior), but + // has existed in jsdiff for a long time, so we retain support for it here + // for the sake of backwards compatibility. if (options?.ignoreWhitespace != null && !options.ignoreWhitespace) { return diffWordsWithSpace(oldStr, newStr, options); } From a8bc543811bee3b247e5859606b067eb44be817a Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Fri, 8 Mar 2024 15:02:21 +0000 Subject: [PATCH 51/52] Add more tests and fix more bugs about whitespace --- src/diff/word.js | 71 +++++++++++++++++++++++++++++------------------ test/diff/word.js | 12 ++++++++ 2 files changed, 56 insertions(+), 27 deletions(-) diff --git a/src/diff/word.js b/src/diff/word.js index 52613ad6..85ef7fd6 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -213,37 +213,54 @@ function dedupeWhitespaceInChangeObjects(startKeep, deletion, insertion, endKeep if (endKeep) { endKeep.value = endKeep.value.replace(/^\s*/, ''); } - } else { - // As long as we're NOT at the start of the text, the leading whitespace of - // the second "keep" change object can always be nuked, and the trailing - // whitespace of the deletion can always be kept, whether or not they - // match. (The whitespace separating the two keeps will be including at - // the end of startKeep so we don't need to duplicate it on endKeep.) - if (startKeep && endKeep) { - endKeep.value = endKeep.value.replace(/^\s*/, ''); - } else if (endKeep) { - // If we ARE at the start of the text, though, we need to preserve all - // the whitespace on endKeep. In that case, we just want to remove - // whitespace from the end of deletion to the extent that it overlaps - // with the start of endKeep. - const endKeepWsPrefix = endKeep.value.match(/^\s*/)[0]; - const deletionWsSuffix = deletion.value.match(/\s*$/)[0]; - const overlap = maximumOverlap(deletionWsSuffix, endKeepWsPrefix); - deletion.value = removeSuffix(deletion.value, overlap); - } + // otherwise we've got a deletion and no insertion + } else if (startKeep && endKeep) { + const newWsFull = endKeep.value.match(/^\s*/)[0], + delWsStart = deletion.value.match(/^\s*/)[0], + delWsEnd = deletion.value.match(/\s*$/)[0]; - // Leading whitespace on the deleted token can only be deleted to the - // extent that this overlaps with the trailing whitespace on the preceding - // kept token. - if (startKeep) { - const startKeepWsSuffix = startKeep.value.match(/\s*$/)[0]; - const deletionWsPrefix = deletion.value.match(/^\s*/)[0]; - const overlap = maximumOverlap(startKeepWsSuffix, deletionWsPrefix); - deletion.value = removePrefix(deletion.value, overlap); - } + // Any whitespace that comes straight after startKeep in both the old and + // new texts, assign to startKeep and remove from the deletion. + const newWsStart = longestCommonPrefix(newWsFull, delWsStart); + deletion.value = removePrefix(deletion.value, newWsStart); + + // Any whitespace that comes straight before endKeep in both the old and + // new texts, and hasn't already been assigned to startKeep, assign to + // endKeep and remove from the deletion. + const newWsEnd = longestCommonSuffix( + removePrefix(newWsFull, newWsStart), + delWsEnd + ); + deletion.value = removeSuffix(deletion.value, newWsEnd); + endKeep.value = replacePrefix(endKeep.value, newWsFull, newWsEnd); + + // If there's any whitespace from the new text that HASN'T already been + // assigned, assign it to the start: + startKeep.value = replaceSuffix( + startKeep.value, + newWsFull, + newWsFull.slice(0, newWsFull.length - newWsEnd.length) + ); + } else if (endKeep) { + // We are at the start of the text. Preserve all the whitespace on + // endKeep, and just remove whitespace from the end of deletion to the + // extent that it overlaps with the start of endKeep. + const endKeepWsPrefix = endKeep.value.match(/^\s*/)[0]; + const deletionWsSuffix = deletion.value.match(/\s*$/)[0]; + const overlap = maximumOverlap(deletionWsSuffix, endKeepWsPrefix); + deletion.value = removeSuffix(deletion.value, overlap); + } else if (startKeep) { + // We are at the END of the text. Preserve all the whitespace on + // startKeep, and just remove whitespace from the start of deletion to + // the extent that it overlaps with the end of startKeep. + const startKeepWsSuffix = startKeep.value.match(/\s*$/)[0]; + const deletionWsPrefix = deletion.value.match(/^\s*/)[0]; + const overlap = maximumOverlap(startKeepWsSuffix, deletionWsPrefix); + deletion.value = removePrefix(deletion.value, overlap); } } + export const wordWithSpaceDiff = new Diff(); wordWithSpaceDiff.tokenize = function(value) { // Slightly different to the tokenizeIncludingWhitespace regex used above in diff --git a/test/diff/word.js b/test/diff/word.js index 502336b3..708ea555 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -89,6 +89,18 @@ describe('WordDiff', function() { it('(del in middle of text)', function() { const diffResult = diffWords('Even More Value', 'Even Value'); expect(convertChangesToXML(diffResult)).to.equal('Even More Value'); + + // Rules around how to split up the whitespace between the start and + // end "keep" change objects are subtle, as shown by the three examples + // below: + const diffResult2 = diffWords('foo\nbar baz', 'foo baz'); + expect(convertChangesToXML(diffResult2)).to.equal('foo\nbar baz'); + + const diffResult3 = diffWords('foo bar baz', 'foo baz'); + expect(convertChangesToXML(diffResult3)).to.equal('foo bar baz'); + + const diffResult4 = diffWords('foo\nbar baz', 'foo\n baz'); + expect(convertChangesToXML(diffResult4)).to.equal('foo\nbar baz'); }); it('(add at end of text)', function() { From 4304c28498f575500c6a8a156980b22bd22a2dba Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Fri, 8 Mar 2024 15:04:57 +0000 Subject: [PATCH 52/52] Tweak a lying comment --- src/diff/word.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/diff/word.js b/src/diff/word.js index 85ef7fd6..ba8c682e 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -161,7 +161,9 @@ function dedupeWhitespaceInChangeObjects(startKeep, deletion, insertion, endKeep // // 4. Diffing 'foo baz' vs 'foo\nbar baz' // Prior to cleanup, we have K:'foo\n' I:'\nbar ' K:' baz' - // After cleanup, we want K'foo' I:'\nbar' K:' baz' + // After cleanup, we ideally want K'foo' I:'\nbar' K:' baz' + // but don't actually manage this currently (the pre-cleanup change + // objects don't contain enough information to make it possible). // // 5. Diffing 'foo bar baz' vs 'foo baz' // Prior to cleanup, we have K:'foo ' D:' bar ' K:' baz'