* wiggle - apply rejected patches
*
* Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
+ * Copyright (C) 2011 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
*/
/*
- * Find the best match for a patch against
- * a file.
- * The quality of a match is the length of the match minus the
- * differential between the endpoints.
- * We progress through the matrix recording the best
- * match as we find it.
+ * Find the best match for a patch against a file. A patch is a
+ * sequence of chunks each of which is expected to match a particular
+ * locality of the file. So we expect big gaps between where chunks
+ * match, but only small gaps within chunks.
*
- * We perform a full diagonal bredth first traversal assessing
- * the quality of matches at each point.
- * At each point there are two or three previous points,
- * up, back or diagonal if there is a match.
- * We assess the value of the match at each point and choose the
- * best. No match at all is given a score of -3.
+ * The matching algorithm is similar to that in diff.c, so you should
+ * understand that first. However it takes fewer shortcuts and
+ * analyses cost in a more detailed way.
+ *
+ * We walk the whole matrix in a breadth first fashion following a
+ * 'front' on which x+y is constant. Along this front we examine each
+ * diagonal. For each point we calculate a 'value' for the match so
+ * far. This will be in some particular chunk. For each chunk we
+ * separately record the best value found so far, and where it was.
+ * To choose a new value for each point we calculate based on the
+ * previous value on each neighbouring diagonal and on this diagonal.
+ *
+ * This can result in a set of 'best' matches for each chunk which are
+ * not in the same order that the chunks initially were. This
+ * probably isn't desired, so we choose a 'best' best match and
+ * recurse on each side of it.
+ *
+ * The quality of a match is a somewhat complex function that is
+ * roughly 3 times the number of matching symbols minus the number
+ * of replaced, added, or deleted. This seems to work.
*
* For any point, the best possible score using that point
* is a complete diagonal to the nearest edge. We ignore points
* which cannot contribute to a better overall score.
*
+ * As this is a fairly expensive search we remove uninteresting
+ * symbols before searching. Specifically we only keep alphanumeric
+ * (plus '_') strings. Spaces and punctuation are ignored. This should
+ * contain enough information to achieve a reliable match while scanning
+ * many fewer symbols.
*/
+#include <malloc.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include "wiggle.h"
+
/* This structure keeps track of the current match at each point.
* It holds the start of the match as x,k where k is the
* diagonal, so y = x-k.
* Also the length of the match so far.
* If l == 0, there is no match.
*/
-
-#include <malloc.h>
-#include <ctype.h>
-#include <stdlib.h>
-#include "wiggle.h"
-
-
struct v {
int x, y; /* location of start of match */
int val; /* value of match from x,y to here */
};
/*
- * Here must must determine the 'value' of a partial match.
+ * Here we must determine the 'value' of a partial match.
* The input parameters are:
- * length - the total number of symbols matches
+ * length - the total number of symbols matched
* errs - the total number of insertions or deletions
* dif - the absolute difference between number of insertions and deletions.
*
* - When does adding an extra symbol after a small gap improve the match
* - When does a match become so bad that we would rather start again.
*
- * We would like symetry in our answers so that a good sequence with an out-rider on
- * one end is evaluated the same as a good sequence with an out-rider on the other end.
- * However to do this we cannot really use value of the good sequence to weigh in the
- * outriders favour as in the case of a leading outrider, we do not yet know the value of
- * of the good sequence.
- * First, we need an arbitrary number, X, to say "Given a single symbol, after X errors, we
- * forget that symbol". 5 seems a good number.
- * Next we need to understand how replacements compare to insertions or deletions.
- * Probably a replacement is the same cost as an insertion or deletion.
- * Finally, a few large stretches are better then lots of little ones, so the number
- * of disjoint stretches should be kept low.
+ * We would like symmetry in our answers so that a good sequence with
+ * an out-rider on one end is evaluated the same as a good sequence
+ * with an out-rider on the other end.
+ *
+ * However to do this we cannot really use the value of the good
+ * sequence to weigh in the out-rider's favour as in the case of a
+ * leading outrider, we do not yet know the value of the good
+ * sequence.
+ *
+ * First, we need an arbitrary number, X, to say "Given a single
+ * symbol, after X errors, we forget that symbol". 5 seems a good
+ * number.
+ *
+ * Next we need to understand how replacements compare to insertions
+ * or deletions. Probably a replacement is the same cost as an
+ * insertion or deletion. Finally, a few large stretches are better
+ * than lots of little ones, so the number of disjoint stretches
+ * should be kept low.
+ *
* So:
- * Each match after the first adds 5 to value.
- * The first match in a string adds 6.
+ * The first match sets the value to 6.
+ * Each consecutive match adds 3
+ * A non-consecutive match whose value is still positive adds 2
* Each non-match subtracts one unless it is the other half of a replacement.
* A value of 0 causes us to forget where we are and start again.
*
- * We need to not only assess the value at a particular location, but also
- * assess the maximum value we could get if all remaining symbols matched, to
- * help exclude parts of the matrix.
- * The value of that possibility is 6 times the number of remaining symbols, -1 if we
+ * We need to not only assess the value at a particular location, but
+ * also assess the maximum value we could get if all remaining symbols
+ * matched, to help exclude parts of the matrix. The value of that
+ * possibility is 6 times the number of remaining symbols, -1 if we
* just had a match.
*/
/* dir == 0 for match, 1 for k increase, -1 for k decrease */
}
}
+/* Calculate the best possible value that this 'struct v'
+ * could reach if there are 'max' symbols remaining
+ * that could possibly be matches.
+ */
static inline int best_val(struct v *v, int max)
{
if (v->val <= 0)
}
struct best {
- int xlo, ylo, xhi, yhi, val;
+ int xlo, ylo;
+ int xhi, yhi;
+ int val;
};
static inline int min(int a, int b)
int x, y;
f++;
-#if 0
- if (f == ahi+bhi)
- printf("f %d klo %d khi %d\n", f, klo, khi);
-#endif
for (k = klo+1; k <= khi-1 ; k += 2) {
struct v vnew, vnew2;
x = (k+f)/2;
y = x-k;
- /* first consider the diagonal */
+ /* first consider the diagonal - if possible
+ * it is always preferred
+ */
if (match(&a->list[x-1], &b->list[y-1])) {
vnew = v[k];
- update_value(&vnew, 0, k, x);
-#if 0
- printf("new %d,%d %d,%d (%d) ...",
- vnew.x, vy(vnew), x, y, value(vnew, k, x));
-#endif
- if (vnew.c < 0)
+ update_value(&v[k], 0, k, x);
+ if (v[k].c < 0)
abort();
- if (vnew.val > best[vnew.c].val) {
-#if 0
- printf("New best for %d at %d,%d %d,%d, val %d\n",
- vnew.c, vnew.x, vnew.y,
- x, y, vnew.val);
-#endif
- best[vnew.c].xlo = vnew.x;
- best[vnew.c].ylo = vnew.y;
- best[vnew.c].xhi = x;
- best[vnew.c].yhi = y;
- best[vnew.c].val = vnew.val;
+ if (v[k].val > best[v[k].c].val) {
+ int chunk = v[k].c;
+ best[chunk].xlo = v[k].x;
+ best[chunk].ylo = v[k].y;
+ best[chunk].xhi = x;
+ best[chunk].yhi = y;
+ best[chunk].val = v[k].val;
}
- v[k] = vnew;
} else {
+ /* First consider a y-step: adding a
+ * symbol from B */
vnew = v[k+1];
update_value(&vnew, -1, k, x);
/* might cross a chunk boundary */
vnew.c = atoi(b->list[y-1].start+1);
vnew.val = 0;
}
+
+ /* Now consider an x-step: deleting
+ * a symbol. This cannot be a chunk
+ * boundary as there aren't any in 'A'
+ */
vnew2 = v[k-1];
update_value(&vnew2, 1, k, x);
+ /* Now choose the best. */
if (vnew2.val > vnew.val)
v[k] = vnew2;
else
update_value(&v[klo], -1, klo, x);
if (y <= bhi && b->list[y-1].len && b->list[y-1].start[0] == 0) {
v[klo].c = atoi(b->list[y-1].start+1);
-#if 0
- printf("entered %d at %d,%d\n", v[klo].c, x, y);
-#endif
v[klo].val = 0;
}
while (klo+2 < (ahi-bhi) &&
free(valloc);
}
+/* Join two csl lists together.
+ * Simply allocate new space and copy everything in.
+ */
static struct csl *csl_join(struct csl *c1, struct csl *c2)
{
struct csl *c, *cd, *rv;
#endif
/*
- * reduce a file by discarding less interesting words
+ * Reduce a file by discarding less interesting words
* Words that end with a newline are interesting (so all words
* in line-mode are interesting) and words that start with
* an alphanumeric are interesting. This excludes spaces and
* special characters in word mode
* Doing a best-fit comparison on only interesting words is
- * much fast than on all words, and it nearly as good
+ * much faster than on all words, and is nearly as good
*/
static inline int is_skipped(struct elmnt e)
struct file a2, struct file b2)
{
int b;
- int pa, pb;
+ int pa, pb; /* pointers into the a2 and b2 arrays */
pa = pb = 0;
for (b = 1; b < cnt; b++)
if (best[b].val > 0) {
-#if 0
- printf("best %d,%d %d,%d\n",
- best[b].xlo, best[b].ylo,
- best[b].xhi, best[b].yhi);
-#endif
while (pa < a2.elcnt &&
a2.list[pa].start != a1.list[best[b].xlo].start)
pa++;
while (pb > 0 && is_skipped(b2.list[pb-1]))
pb--;
-#if 0
- printf("-> %d,%d\n", pa, pb);
-#endif
best[b].xlo = pa;
best[b].ylo = pb;
while (pa < a2.elcnt &&
- (pa == 0 || a2.list[pa-1].start != a1.list[best[b].xhi-1].start))
+ (pa == 0 || (a2.list[pa-1].start
+ != a1.list[best[b].xhi-1].start)))
pa++;
if (pa == a2.elcnt && best[b].xhi != a1.elcnt)
abort();
while (pb < b2.elcnt &&
- (pb == 0 || b2.list[pb-1].start != b1.list[best[b].yhi-1].start))
+ (pb == 0 || (b2.list[pb-1].start
+ != b1.list[best[b].yhi-1].start)))
pb++;
if (pb == b2.elcnt && best[b].yhi != b1.elcnt)
abort();
- /* now step pa,pb forward over ignored words */
+ /* pa,pb is now the end of the best bit.
+ * Step pa,pb forward over ignored words.
+ */
while (pa < a2.elcnt && is_skipped(a2.list[pa]))
pa++;
while (pb < b2.elcnt && is_skipped(b2.list[pb]))
pb++;
-#if 0
- printf("-> %d,%d\n", pa, pb);
-#endif
best[b].xhi = pa;
best[b].yhi = pb;
}
alo = blo = 0;
ahi = asmall.elcnt;
bhi = bsmall.elcnt;
-/* printf("start: %d,%d %d,%d\n", alo,blo,ahi,bhi); */
for (i = 0; i < chunks+1; i++)
best[i].val = 0;
find_best_inorder(&asmall, &bsmall,
0, asmall.elcnt, 0, bsmall.elcnt,
best, 1, chunks+1);
-#if 0
-/* for(i=0; i < b.elcnt;i++) { printf("%d: ", i); printword(b.list[i]); }*/
- for (i = 1; i <= chunks; i++) {
- printf("end: %d,%d %d,%d\n", best[i].xlo, best[i].ylo,
- best[i].xhi, best[i].yhi);
- printf("<");
- printword(bsmall.list[best[i].ylo]);
- printf("><");
- printword(bsmall.list[best[i].yhi-1]);
- printf(">\n");
- }
-#endif
remap(best, chunks+1, asmall, bsmall, a, b);
-#if 0
-/* for(i=0; i < b.elcnt;i++) { printf("%d: ", i); printword(b.list[i]); }*/
- for (i = 1; i <= chunks; i++)
- printf("end: %d,%d %d,%d\n", best[i].xlo, best[i].ylo,
- best[i].xhi, best[i].yhi);
- printf("small: a %d b %d --- normal: a %d b %d\n", asmall.elcnt,
- bsmall.elcnt, a.elcnt, b.elcnt);
-#endif
csl1 = NULL;
for (i = 1; i <= chunks; i++)
if (best[i].val > 0) {
-#if 0
- int j;
- printf("Before:\n");
- for (j = best[i].xlo; j < best[i].xhi; j++)
- printword(a.list[j]);
- printf("After:\n");
- for (j = best[i].ylo; j < best[i].yhi; j++)
- printword(b.list[j]);
-#endif
csl2 = diff_partial(a, b,
best[i].xlo, best[i].xhi,
best[i].ylo, best[i].yhi);