From e3a1ca2816b8ab7c96388f06eccf3db68851ac10 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.de>
Date: Wed, 24 Aug 2011 22:49:04 +1000
Subject: [PATCH] Clean up bestmatch.c

Add lots of comments, remove some cruft etc.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 bestmatch.c | 215 +++++++++++++++++++++++++---------------------------
 1 file changed, 103 insertions(+), 112 deletions(-)

diff --git a/bestmatch.c b/bestmatch.c
index 1c899fd..c59bc78 100644
--- a/bestmatch.c
+++ b/bestmatch.c
@@ -2,6 +2,7 @@
  * wiggle - apply rejected patches
  *
  * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
+ * Copyright (C) 2011 Neil Brown <neilb@suse.de>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
@@ -23,39 +24,54 @@
  */
 
 /*
- * Find the best match for a patch against
- * a file.
- * The quality of a match is the length of the match minus the
- * differential between the endpoints.
- * We progress through the matrix recording the best
- * match as we find it.
+ * Find the best match for a patch against a file.  A patch is a
+ * sequence of chunks each of which is expected to match a particular
+ * locality of the file.  So we expect big gaps between where chunks
+ * match, but only small gaps within chunks.
  *
- * We perform a full diagonal bredth first traversal assessing
- * the quality of matches at each point.
- * At each point there are two or three previous points,
- * up, back or diagonal if there is a match.
- * We assess the value of the match at each point and choose the
- * best.  No match at all is given a score of -3.
+ * The matching algorithm is similar to that in diff.c, so you should
+ * understand that first.  However it takes fewer shortcuts and
+ * analyses cost in a more detailed way.
+ *
+ * We walk the whole matrix in a breadth first fashion following a
+ * 'front' on which x+y is constant.  Along this front we examine each
+ * diagonal.  For each point we calculate a 'value' for the match so
+ * far.  This will be in some particlar chunk.  For each chunk we
+ * separately record the best value found so far, and where it was.
+ * To choose a new value for each point we calculate based on the
+ * previous value on each neighbouring diagonal and on this diagonal.
+ *
+ * This can result is a set of 'best' matches for each chunk which are
+ * not in the same order that the chunks initially were.  This
+ * probably isn't desired, so we choose a 'best' best match and
+ * recurse on each side of it.
+ *
+ * The quality of a match is a somewhat complex function that is
+ * roughly 3 times the number of matching symbols minus the number
+ * of replaced, added, or deleted.  This seems to work.
  *
  * For any point, the best possible score using that point
  * is a complete diagonal to the nearest edge.  We ignore points
  * which cannot contibute to a better overall score.
  *
+ * As this is a fairly expensive search we remove uninteresting
+ * symbols before searching.  Specifically we only keep alphanumeric
+ * (plus '_') strings.  Spaces and punctuation is ignored.  This should
+ * contain enough information to achieve a reliable match while scanning
+ * many fewer symbols.
  */
 
+#include	<malloc.h>
+#include	<ctype.h>
+#include	<stdlib.h>
+#include	"wiggle.h"
+
 /* This structure keeps track of the current match at each point.
  * It holds the start of the match as x,k where k is the
  * diagonal, so y = x-k.
  * Also the length of the match so far.
  * If l == 0, there is no match.
  */
-
-#include	<malloc.h>
-#include	<ctype.h>
-#include	<stdlib.h>
-#include	"wiggle.h"
-
-
 struct v {
 	int x, y;  /* location of start of match */
 	int val;  /* value of match from x,y to here */
@@ -65,9 +81,9 @@ struct v {
 };
 
 /*
- * Here must must determine the 'value' of a partial match.
+ * Here we must determine the 'value' of a partial match.
  * The input parameters are:
- *   length - the total number of symbols matches
+ *   length - the total number of symbols matched
  *   errs  - the total number of insertions or deletions
  *   dif   - the absolute difference between number of insertions and deletions.
  *
@@ -76,27 +92,36 @@ struct v {
  *  - When does adding an extra symbol after a small gap improve the match
  *  - When does a match become so bad that we would rather start again.
  *
- * We would like symetry in our answers so that a good sequence with an out-rider on
- * one end is evaluated the same as a good sequence with an out-rider on the other end.
- * However to do this we cannot really use value of the good sequence to weigh in the
- * outriders favour as in the case of a leading outrider, we do not yet know the value of
- * of the good sequence.
- * First, we need an arbitrary number, X, to say "Given a single symbol, after X errors, we
- * forget that symbol".  5 seems a good number.
- * Next we need to understand how replacements compare to insertions or deletions.
- * Probably a replacement is the same cost as an insertion or deletion.
- * Finally, a few large stretches are better then lots of little ones, so the number
- * of disjoint stretches should be kept low.
+ * We would like symmetry in our answers so that a good sequence with
+ * an out-rider on one end is evaluated the same as a good sequence
+ * with an out-rider on the other end.
+ *
+ * However to do this we cannot really use the value of the good
+ * sequence to weigh in the out-riders favour as in the case of a
+ * leading outrider, we do not yet know the value of the good
+ * sequence.
+ *
+ * First, we need an arbitrary number, X, to say "Given a single
+ * symbol, after X errors, we forget that symbol".  5 seems a good
+ * number.
+ *
+ * Next we need to understand how replacements compare to insertions
+ * or deletions.  Probably a replacement is the same cost as an
+ * insertion or deletion.  Finally, a few large stretches are better
+ * then lots of little ones, so the number of disjoint stretches
+ * should be kept low.
+ *
  * So:
- *   Each match after the first adds 5 to value.
- *   The first match in a string adds 6.
+ *   The first match sets the value to 6.
+ *   Each consecutive match adds 3
+ *   A non-consecutive match which value is still +ve adds 2
  *   Each non-match subtracts one unless it is the other half of a replacement.
  *   A value of 0 causes us to forget where we are and start again.
  *
- * We need to not only assess the value at a particular location, but also
- * assess the maximum value we could get if all remaining symbols matched, to
- * help exclude parts of the matrix.
- * The value of that possibility is 6 times the number of remaining symbols, -1 if we
+ * We need to not only assess the value at a particular location, but
+ * also assess the maximum value we could get if all remaining symbols
+ * matched, to help exclude parts of the matrix.  The value of that
+ * possibility is 6 times the number of remaining symbols, -1 if we
  * just had a match.
  */
 /* dir == 0 for match, 1 for k increase, -1 for k decrease */
@@ -122,6 +147,10 @@ static inline void update_value(struct v *v, int dir, int k, int x)
 	}
 }
 
+/* Calculate the best possible value that this 'struct v'
+ * could reach if there are 'max' symbols remaining
+ * that could possibly be matches.
+ */
 static inline int best_val(struct v *v, int max)
 {
 	if (v->val <= 0)
@@ -131,7 +160,9 @@ static inline int best_val(struct v *v, int max)
 }
 
 struct best {
-	int xlo, ylo, xhi, yhi, val;
+	int xlo, ylo;
+	int xhi, yhi;
+	int val;
 };
 
 static inline int min(int a, int b)
@@ -158,38 +189,29 @@ static void find_best(struct file *a, struct file *b,
 		int x, y;
 
 		f++;
-#if 0
-		if (f == ahi+bhi)
-			printf("f %d klo %d khi %d\n", f, klo, khi);
-#endif
 		for (k = klo+1; k <= khi-1 ; k += 2) {
 			struct v vnew, vnew2;
 			x = (k+f)/2;
 			y = x-k;
-			/* first consider the diagonal */
+			/* first consider the diagonal - if possible
+			 * it is always preferred
+			 */
 			if (match(&a->list[x-1], &b->list[y-1])) {
 				vnew = v[k];
-				update_value(&vnew, 0, k, x);
-#if 0
-				printf("new %d,%d %d,%d (%d) ...",
-				       vnew.x, vy(vnew), x, y, value(vnew, k, x));
-#endif
-				if (vnew.c < 0)
+				update_value(&v[k], 0, k, x);
+				if (v[k].c < 0)
 					abort();
-				if (vnew.val > best[vnew.c].val) {
-#if 0
-					printf("New best for %d at %d,%d %d,%d, val %d\n",
-					       vnew.c, vnew.x, vnew.y,
-					       x, y, vnew.val);
-#endif
-					best[vnew.c].xlo = vnew.x;
-					best[vnew.c].ylo = vnew.y;
-					best[vnew.c].xhi = x;
-					best[vnew.c].yhi = y;
-					best[vnew.c].val = vnew.val;
+				if (v[k].val > best[v[k].c].val) {
+					int chunk = v[k].c;
+					best[chunk].xlo = v[k].x;
+					best[chunk].ylo = v[k].y;
+					best[chunk].xhi = x;
+					best[chunk].yhi = y;
+					best[chunk].val = v[k].val;
 				}
-				v[k] = vnew;
 			} else {
+				/* First consider a y-step: adding a
+				 * symbol from B */
 				vnew = v[k+1];
 				update_value(&vnew, -1, k, x);
 				/* might cross a chunk boundary */
@@ -197,9 +219,15 @@ static void find_best(struct file *a, struct file *b,
 					vnew.c = atoi(b->list[y-1].start+1);
 					vnew.val = 0;
 				}
+
+				/* Not consider an x-step: deleting
+				 * a symbol.  This cannot be a chunk
+				 * boundary as there aren't any in 'A'
+				 */
 				vnew2 = v[k-1];
 				update_value(&vnew2, 1, k, x);
 
+				/* Now choose the best. */
 				if (vnew2.val > vnew.val)
 					v[k] = vnew2;
 				else
@@ -213,9 +241,6 @@ static void find_best(struct file *a, struct file *b,
 		update_value(&v[klo], -1, klo, x);
 		if (y <= bhi && b->list[y-1].len && b->list[y-1].start[0] == 0) {
 			v[klo].c = atoi(b->list[y-1].start+1);
-#if 0
-			printf("entered %d at %d,%d\n", v[klo].c, x, y);
-#endif
 			v[klo].val = 0;
 		}
 		while (klo+2 < (ahi-bhi) &&
@@ -247,6 +272,9 @@ static void find_best(struct file *a, struct file *b,
 	free(valloc);
 }
 
+/* Join two csl lists together.
+ * Simply allocate new space and copy everything in.
+ */
 static struct csl *csl_join(struct csl *c1, struct csl *c2)
 {
 	struct csl *c, *cd,  *rv;
@@ -287,13 +315,13 @@ static void printword(struct elmnt e)
 #endif
 
 /*
- * reduce a file by discarding less interesting words
+ * Reduce a file by discarding less interesting words
  * Words that end with a newline are interesting (so all words
  * in line-mode are interesting) and words that start with
  * and alphanumeric are interesting.  This excludes spaces and
  * special characters in word mode
  * Doing a best-fit comparision on only interesting words is
- * much fast than on all words, and it nearly as good
+ * much faster than on all words, and is nearly as good
  */
 
 static inline int is_skipped(struct elmnt e)
@@ -336,7 +364,7 @@ static void remap(struct best *best, int cnt,
 		  struct file a2, struct file b2)
 {
 	int b;
-	int pa, pb;
+	int pa, pb; /* pointers into the a2 and b2 arrays */
 
 	pa = pb = 0;
 
@@ -345,11 +373,6 @@ static void remap(struct best *best, int cnt,
 
 	for (b = 1; b < cnt; b++)
 	    if (best[b].val > 0) {
-#if 0
-		printf("best %d,%d  %d,%d\n",
-		       best[b].xlo, best[b].ylo,
-		       best[b].xhi, best[b].yhi);
-#endif
 		while (pa < a2.elcnt &&
 		       a2.list[pa].start != a1.list[best[b].xlo].start)
 			pa++;
@@ -369,31 +392,29 @@ static void remap(struct best *best, int cnt,
 		while (pb > 0 && is_skipped(b2.list[pb-1]))
 			pb--;
 
-#if 0
-		printf("-> %d,%d\n", pa, pb);
-#endif
 		best[b].xlo = pa;
 		best[b].ylo = pb;
 
 		while (pa < a2.elcnt &&
-		       (pa == 0 || a2.list[pa-1].start != a1.list[best[b].xhi-1].start))
+		       (pa == 0 || (a2.list[pa-1].start
+				    != a1.list[best[b].xhi-1].start)))
 			pa++;
 		if (pa == a2.elcnt && best[b].xhi != a1.elcnt)
 			abort();
 		while (pb < b2.elcnt &&
-		       (pb == 0 || b2.list[pb-1].start != b1.list[best[b].yhi-1].start))
+		       (pb == 0 || (b2.list[pb-1].start
+				    != b1.list[best[b].yhi-1].start)))
 			pb++;
 		if (pb == b2.elcnt && best[b].yhi != b1.elcnt)
 			abort();
 
-		/* now step pa,pb forward over ignored words */
+		/* pa,pb is now the end of the best bit.
+		 * Step pa,pb forward over ignored words.
+		 */
 		while (pa < a2.elcnt && is_skipped(a2.list[pa]))
 			pa++;
 		while (pb < b2.elcnt && is_skipped(b2.list[pb]))
 			pb++;
-#if 0
-		printf("-> %d,%d\n", pa, pb);
-#endif
 		best[b].xhi = pa;
 		best[b].yhi = pb;
 	}
@@ -464,47 +485,17 @@ struct csl *pdiff(struct file a, struct file b, int chunks)
 	alo = blo = 0;
 	ahi = asmall.elcnt;
 	bhi = bsmall.elcnt;
-/*	printf("start: %d,%d  %d,%d\n", alo,blo,ahi,bhi); */
 
 	for (i = 0; i < chunks+1; i++)
 		best[i].val = 0;
 	find_best_inorder(&asmall, &bsmall,
 			  0, asmall.elcnt, 0, bsmall.elcnt,
 			  best, 1, chunks+1);
-#if 0
-/*	for(i=0; i < b.elcnt;i++) { printf("%d: ", i); printword(b.list[i]); }*/
-	for (i = 1; i <= chunks; i++) {
-		printf("end: %d,%d  %d,%d\n", best[i].xlo, best[i].ylo,
-		       best[i].xhi, best[i].yhi);
-		printf("<");
-		printword(bsmall.list[best[i].ylo]);
-		printf("><");
-		printword(bsmall.list[best[i].yhi-1]);
-		printf(">\n");
-	}
-#endif
 	remap(best, chunks+1, asmall, bsmall, a, b);
-#if 0
-/*	for(i=0; i < b.elcnt;i++) { printf("%d: ", i); printword(b.list[i]); }*/
-	for (i = 1; i <= chunks; i++)
-		printf("end: %d,%d  %d,%d\n", best[i].xlo, best[i].ylo,
-		       best[i].xhi, best[i].yhi);
-	printf("small: a %d b %d --- normal: a %d b %d\n", asmall.elcnt,
-	       bsmall.elcnt, a.elcnt, b.elcnt);
-#endif
 
 	csl1 = NULL;
 	for (i = 1; i <= chunks; i++)
 		if (best[i].val > 0) {
-#if 0
-			int j;
-			printf("Before:\n");
-			for (j = best[i].xlo; j < best[i].xhi; j++)
-				printword(a.list[j]);
-			printf("After:\n");
-			for (j = best[i].ylo; j < best[i].yhi; j++)
-				printword(b.list[j]);
-#endif
 			csl2 = diff_partial(a, b,
 					    best[i].xlo, best[i].xhi,
 					    best[i].ylo, best[i].yhi);
-- 
2.39.5