Skip to content

Commit a6ab856

Browse files
committed
patience diff: remove myers fallback
When patience_diff() is called on a range of lines that does not contain any unique context lines, it falls back to calling the myers algorithm on that region. The myers implementation calls xdl_optimize_ctxs(), but because it is called on a subset of the input it cannot optimize the context lines as well as it does when called on the whole file. In particular, insignificant blank lines that would be ignored as matches by the myers algorithm when it is run on the whole file will be considered matches within a smaller region. This has the unfortunate effect of matching blank lines as context when they are not meaningful, generating sub-optimal diffs. Instead, when there are no unique context lines within a region, just find the leading, trailing and inter-hunk context. This is more in keeping with the patience algorithm outlined by Bram Cohen[1]. Note that the interhunk context code rarely finds any interhunk context. In git it finds interhunk context 158 times out of the 8598 hunks where there are no unique lines (there are 368157 hunks in total). Trailing context is even rarer, with just 101 instances in the 8598 hunks with no unique lines. There are no instances of leading context, which seems surprising, but leading context is handled correctly in tests added in this patch. [1] https://bramcohen.livejournal.com/73318.html Signed-off-by: Phillip Wood <[email protected]>
1 parent 133d151 commit a6ab856

File tree

2 files changed

+112
-12
lines changed

2 files changed

+112
-12
lines changed

t/t4033-diff-patience.sh

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,46 @@ test_diff_frobnitz "patience"
1717

1818
test_diff_unique "patience"
1919

20+
test_expect_success 'non unique context between deletion and addition' '
21+
test_write_lines a b a b c d c d >file &&
22+
git add file &&
23+
test_write_lines a b c d e c d >file &&
24+
git diff --diff-algorithm=patience file >actual &&
25+
sed -ne "/^@@/,\$p" actual >hunk &&
26+
cat >expect <<-\EOF &&
27+
@@ -1,8 +1,7 @@
28+
a
29+
b
30+
-a
31+
-b
32+
c
33+
d
34+
+e
35+
c
36+
d
37+
EOF
38+
test_cmp expect hunk
39+
'
40+
41+
test_expect_success 'non unique context between additon and deletion' '
42+
test_write_lines a b a b c d c d >file &&
43+
git add file &&
44+
test_write_lines a b e a b c d >file &&
45+
git diff --diff-algorithm=patience file >actual &&
46+
sed -ne "/^@@/,\$p" actual >hunk &&
47+
cat >expect <<-\EOF &&
48+
@@ -1,8 +1,7 @@
49+
a
50+
b
51+
+e
52+
a
53+
b
54+
c
55+
d
56+
-c
57+
-d
58+
EOF
59+
test_cmp expect hunk
60+
'
61+
2062
test_done

xdiff/xpatience.c

Lines changed: 70 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
*/
2222

2323
#include "xinclude.h"
24+
#include "xtypes.h"
2425

2526
/*
2627
* The basic idea of patience diff is to find lines that are unique in
@@ -303,16 +304,14 @@ static int walk_common_sequence(struct hashmap *map, struct entry *first,
303304
}
304305
}
305306

306-
static int fall_back_to_classic_diff(struct hashmap *map,
307-
int line1, int count1, int line2, int count2)
307+
static bool regions_match(xrecord_t *a, xrecord_t *b, int count)
308308
{
309-
xpparam_t xpp;
310-
311-
memset(&xpp, 0, sizeof(xpp));
312-
xpp.flags = map->xpp->flags & ~XDF_DIFF_ALGORITHM_MASK;
309+
for (int i = 0; i < count; i++) {
310+
if (a[i].ha != b[i].ha)
311+
return false;
312+
}
313313

314-
return xdl_fall_back_diff(map->env, &xpp,
315-
line1, count1, line2, count2);
314+
return true;
316315
}
317316

318317
/*
@@ -357,12 +356,71 @@ static int patience_diff(xpparam_t const *xpp, xdfenv_t *env,
357356
result = find_longest_common_sequence(&map, &first);
358357
if (result)
359358
goto out;
360-
if (first)
359+
if (first) {
361360
result = walk_common_sequence(&map, first,
362361
line1, count1, line2, count2);
363-
else
364-
result = fall_back_to_classic_diff(&map,
365-
line1, count1, line2, count2);
362+
} else {
363+
xrecord_t *rec1 = env->xdf1.recs, *rec2 = env->xdf2.recs;
364+
long i1 = line1 - 2, i2 = line2 - 2;
365+
long overlap1, overlap2;
366+
367+
/* Find trailing context */
368+
while (count1 && count2 &&
369+
rec1[i1 + count1].ha == rec2[i2 + count2].ha) {
370+
count1--;
371+
count2--;
372+
}
373+
/* Find leading context */
374+
while (count1 && count2 &&
375+
rec1[line1 - 1].ha == rec2[line2 - 1].ha) {
376+
count1--;
377+
count2--;
378+
line1++;
379+
line2++;
380+
}
381+
/*
382+
* Find inter-hunk context. First see any of the
383+
* trailing deletions match leading additions, then
384+
* check to see if any trailing additions match leading
385+
* deletions and take the longest overlap.
386+
*/
387+
overlap1 = count1 > count2 ? count2 : count1;
388+
if (overlap1)
389+
overlap1--;
390+
i1 = line1 + (count1 - overlap1) - 1;
391+
i2 = line2 - 1;
392+
while (overlap1) {
393+
if (regions_match(&rec1[i1], &rec2[i2], overlap1))
394+
break;
395+
overlap1--;
396+
i1++;
397+
}
398+
overlap2 = count1 > count2 ? count2 : count1;
399+
if (overlap2)
400+
overlap2--;
401+
i1 = line1 - 1;
402+
i2 = line2 + (count2 - overlap2) - 1;
403+
while (overlap2) {
404+
if (regions_match(&rec1[i1], &rec2[i2], overlap2))
405+
break;
406+
overlap2--;
407+
i2++;
408+
}
409+
if (overlap1 > overlap2) {
410+
count1 -= overlap1;
411+
count2 -= overlap1;
412+
line2 += overlap1;
413+
} else {
414+
count1 -= overlap2;
415+
line1 += overlap2;
416+
count2 -= overlap2;
417+
}
418+
419+
while (count1--)
420+
env->xdf1.changed[line1++ - 1] = true;
421+
while (count2--)
422+
env->xdf2.changed[line2++ - 1] = true;
423+
}
366424
out:
367425
xdl_free(map.entries);
368426
return result;

0 commit comments

Comments
 (0)