From e3a1ca2816b8ab7c96388f06eccf3db68851ac10 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Wed, 24 Aug 2011 22:49:04 +1000 Subject: [PATCH] Clean up bestmatch.c Add lots of comments, remove some cruft etc. Signed-off-by: NeilBrown --- bestmatch.c | 215 +++++++++++++++++++++++++--------------------------- 1 file changed, 103 insertions(+), 112 deletions(-) diff --git a/bestmatch.c b/bestmatch.c index 1c899fd..c59bc78 100644 --- a/bestmatch.c +++ b/bestmatch.c @@ -2,6 +2,7 @@ * wiggle - apply rejected patches * * Copyright (C) 2003 Neil Brown + * Copyright (C) 2011 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -23,39 +24,54 @@ */ /* - * Find the best match for a patch against - * a file. - * The quality of a match is the length of the match minus the - * differential between the endpoints. - * We progress through the matrix recording the best - * match as we find it. + * Find the best match for a patch against a file. A patch is a + * sequence of chunks each of which is expected to match a particular + * locality of the file. So we expect big gaps between where chunks + * match, but only small gaps within chunks. * - * We perform a full diagonal bredth first traversal assessing - * the quality of matches at each point. - * At each point there are two or three previous points, - * up, back or diagonal if there is a match. - * We assess the value of the match at each point and choose the - * best. No match at all is given a score of -3. + * The matching algorithm is similar to that in diff.c, so you should + * understand that first. However it takes fewer shortcuts and + * analyses cost in a more detailed way. + * + * We walk the whole matrix in a breadth first fashion following a + * 'front' on which x+y is constant. Along this front we examine each + * diagonal. For each point we calculate a 'value' for the match so + * far. This will be in some particlar chunk. For each chunk we + * separately record the best value found so far, and where it was. + * To choose a new value for each point we calculate based on the + * previous value on each neighbouring diagonal and on this diagonal. + * + * This can result is a set of 'best' matches for each chunk which are + * not in the same order that the chunks initially were. This + * probably isn't desired, so we choose a 'best' best match and + * recurse on each side of it. + * + * The quality of a match is a somewhat complex function that is + * roughly 3 times the number of matching symbols minus the number + * of replaced, added, or deleted. This seems to work. * * For any point, the best possible score using that point * is a complete diagonal to the nearest edge. We ignore points * which cannot contibute to a better overall score. * + * As this is a fairly expensive search we remove uninteresting + * symbols before searching. Specifically we only keep alphanumeric + * (plus '_') strings. Spaces and punctuation is ignored. This should + * contain enough information to achieve a reliable match while scanning + * many fewer symbols. */ +#include +#include +#include +#include "wiggle.h" + /* This structure keeps track of the current match at each point. * It holds the start of the match as x,k where k is the * diagonal, so y = x-k. * Also the length of the match so far. * If l == 0, there is no match. */ - -#include -#include -#include -#include "wiggle.h" - - struct v { int x, y; /* location of start of match */ int val; /* value of match from x,y to here */ @@ -65,9 +81,9 @@ struct v { }; /* - * Here must must determine the 'value' of a partial match. + * Here we must determine the 'value' of a partial match. * The input parameters are: - * length - the total number of symbols matches + * length - the total number of symbols matched * errs - the total number of insertions or deletions * dif - the absolute difference between number of insertions and deletions. * @@ -76,27 +92,36 @@ struct v { * - When does adding an extra symbol after a small gap improve the match * - When does a match become so bad that we would rather start again. * - * We would like symetry in our answers so that a good sequence with an out-rider on - * one end is evaluated the same as a good sequence with an out-rider on the other end. - * However to do this we cannot really use value of the good sequence to weigh in the - * outriders favour as in the case of a leading outrider, we do not yet know the value of - * of the good sequence. - * First, we need an arbitrary number, X, to say "Given a single symbol, after X errors, we - * forget that symbol". 5 seems a good number. - * Next we need to understand how replacements compare to insertions or deletions. - * Probably a replacement is the same cost as an insertion or deletion. - * Finally, a few large stretches are better then lots of little ones, so the number - * of disjoint stretches should be kept low. + * We would like symmetry in our answers so that a good sequence with + * an out-rider on one end is evaluated the same as a good sequence + * with an out-rider on the other end. + * + * However to do this we cannot really use the value of the good + * sequence to weigh in the out-riders favour as in the case of a + * leading outrider, we do not yet know the value of the good + * sequence. + * + * First, we need an arbitrary number, X, to say "Given a single + * symbol, after X errors, we forget that symbol". 5 seems a good + * number. + * + * Next we need to understand how replacements compare to insertions + * or deletions. Probably a replacement is the same cost as an + * insertion or deletion. Finally, a few large stretches are better + * then lots of little ones, so the number of disjoint stretches + * should be kept low. + * * So: - * Each match after the first adds 5 to value. - * The first match in a string adds 6. + * The first match sets the value to 6. + * Each consecutive match adds 3 + * A non-consecutive match which value is still +ve adds 2 * Each non-match subtracts one unless it is the other half of a replacement. * A value of 0 causes us to forget where we are and start again. * - * We need to not only assess the value at a particular location, but also - * assess the maximum value we could get if all remaining symbols matched, to - * help exclude parts of the matrix. - * The value of that possibility is 6 times the number of remaining symbols, -1 if we + * We need to not only assess the value at a particular location, but + * also assess the maximum value we could get if all remaining symbols + * matched, to help exclude parts of the matrix. The value of that + * possibility is 6 times the number of remaining symbols, -1 if we * just had a match. */ /* dir == 0 for match, 1 for k increase, -1 for k decrease */ @@ -122,6 +147,10 @@ static inline void update_value(struct v *v, int dir, int k, int x) } } +/* Calculate the best possible value that this 'struct v' + * could reach if there are 'max' symbols remaining + * that could possibly be matches. + */ static inline int best_val(struct v *v, int max) { if (v->val <= 0) @@ -131,7 +160,9 @@ static inline int best_val(struct v *v, int max) } struct best { - int xlo, ylo, xhi, yhi, val; + int xlo, ylo; + int xhi, yhi; + int val; }; static inline int min(int a, int b) @@ -158,38 +189,29 @@ static void find_best(struct file *a, struct file *b, int x, y; f++; -#if 0 - if (f == ahi+bhi) - printf("f %d klo %d khi %d\n", f, klo, khi); -#endif for (k = klo+1; k <= khi-1 ; k += 2) { struct v vnew, vnew2; x = (k+f)/2; y = x-k; - /* first consider the diagonal */ + /* first consider the diagonal - if possible + * it is always preferred + */ if (match(&a->list[x-1], &b->list[y-1])) { vnew = v[k]; - update_value(&vnew, 0, k, x); -#if 0 - printf("new %d,%d %d,%d (%d) ...", - vnew.x, vy(vnew), x, y, value(vnew, k, x)); -#endif - if (vnew.c < 0) + update_value(&v[k], 0, k, x); + if (v[k].c < 0) abort(); - if (vnew.val > best[vnew.c].val) { -#if 0 - printf("New best for %d at %d,%d %d,%d, val %d\n", - vnew.c, vnew.x, vnew.y, - x, y, vnew.val); -#endif - best[vnew.c].xlo = vnew.x; - best[vnew.c].ylo = vnew.y; - best[vnew.c].xhi = x; - best[vnew.c].yhi = y; - best[vnew.c].val = vnew.val; + if (v[k].val > best[v[k].c].val) { + int chunk = v[k].c; + best[chunk].xlo = v[k].x; + best[chunk].ylo = v[k].y; + best[chunk].xhi = x; + best[chunk].yhi = y; + best[chunk].val = v[k].val; } - v[k] = vnew; } else { + /* First consider a y-step: adding a + * symbol from B */ vnew = v[k+1]; update_value(&vnew, -1, k, x); /* might cross a chunk boundary */ @@ -197,9 +219,15 @@ static void find_best(struct file *a, struct file *b, vnew.c = atoi(b->list[y-1].start+1); vnew.val = 0; } + + /* Not consider an x-step: deleting + * a symbol. This cannot be a chunk + * boundary as there aren't any in 'A' + */ vnew2 = v[k-1]; update_value(&vnew2, 1, k, x); + /* Now choose the best. */ if (vnew2.val > vnew.val) v[k] = vnew2; else @@ -213,9 +241,6 @@ static void find_best(struct file *a, struct file *b, update_value(&v[klo], -1, klo, x); if (y <= bhi && b->list[y-1].len && b->list[y-1].start[0] == 0) { v[klo].c = atoi(b->list[y-1].start+1); -#if 0 - printf("entered %d at %d,%d\n", v[klo].c, x, y); -#endif v[klo].val = 0; } while (klo+2 < (ahi-bhi) && @@ -247,6 +272,9 @@ static void find_best(struct file *a, struct file *b, free(valloc); } +/* Join two csl lists together. + * Simply allocate new space and copy everything in. + */ static struct csl *csl_join(struct csl *c1, struct csl *c2) { struct csl *c, *cd, *rv; @@ -287,13 +315,13 @@ static void printword(struct elmnt e) #endif /* - * reduce a file by discarding less interesting words + * Reduce a file by discarding less interesting words * Words that end with a newline are interesting (so all words * in line-mode are interesting) and words that start with * and alphanumeric are interesting. This excludes spaces and * special characters in word mode * Doing a best-fit comparision on only interesting words is - * much fast than on all words, and it nearly as good + * much faster than on all words, and is nearly as good */ static inline int is_skipped(struct elmnt e) @@ -336,7 +364,7 @@ static void remap(struct best *best, int cnt, struct file a2, struct file b2) { int b; - int pa, pb; + int pa, pb; /* pointers into the a2 and b2 arrays */ pa = pb = 0; @@ -345,11 +373,6 @@ static void remap(struct best *best, int cnt, for (b = 1; b < cnt; b++) if (best[b].val > 0) { -#if 0 - printf("best %d,%d %d,%d\n", - best[b].xlo, best[b].ylo, - best[b].xhi, best[b].yhi); -#endif while (pa < a2.elcnt && a2.list[pa].start != a1.list[best[b].xlo].start) pa++; @@ -369,31 +392,29 @@ static void remap(struct best *best, int cnt, while (pb > 0 && is_skipped(b2.list[pb-1])) pb--; -#if 0 - printf("-> %d,%d\n", pa, pb); -#endif best[b].xlo = pa; best[b].ylo = pb; while (pa < a2.elcnt && - (pa == 0 || a2.list[pa-1].start != a1.list[best[b].xhi-1].start)) + (pa == 0 || (a2.list[pa-1].start + != a1.list[best[b].xhi-1].start))) pa++; if (pa == a2.elcnt && best[b].xhi != a1.elcnt) abort(); while (pb < b2.elcnt && - (pb == 0 || b2.list[pb-1].start != b1.list[best[b].yhi-1].start)) + (pb == 0 || (b2.list[pb-1].start + != b1.list[best[b].yhi-1].start))) pb++; if (pb == b2.elcnt && best[b].yhi != b1.elcnt) abort(); - /* now step pa,pb forward over ignored words */ + /* pa,pb is now the end of the best bit. + * Step pa,pb forward over ignored words. + */ while (pa < a2.elcnt && is_skipped(a2.list[pa])) pa++; while (pb < b2.elcnt && is_skipped(b2.list[pb])) pb++; -#if 0 - printf("-> %d,%d\n", pa, pb); -#endif best[b].xhi = pa; best[b].yhi = pb; } @@ -464,47 +485,17 @@ struct csl *pdiff(struct file a, struct file b, int chunks) alo = blo = 0; ahi = asmall.elcnt; bhi = bsmall.elcnt; -/* printf("start: %d,%d %d,%d\n", alo,blo,ahi,bhi); */ for (i = 0; i < chunks+1; i++) best[i].val = 0; find_best_inorder(&asmall, &bsmall, 0, asmall.elcnt, 0, bsmall.elcnt, best, 1, chunks+1); -#if 0 -/* for(i=0; i < b.elcnt;i++) { printf("%d: ", i); printword(b.list[i]); }*/ - for (i = 1; i <= chunks; i++) { - printf("end: %d,%d %d,%d\n", best[i].xlo, best[i].ylo, - best[i].xhi, best[i].yhi); - printf("<"); - printword(bsmall.list[best[i].ylo]); - printf("><"); - printword(bsmall.list[best[i].yhi-1]); - printf(">\n"); - } -#endif remap(best, chunks+1, asmall, bsmall, a, b); -#if 0 -/* for(i=0; i < b.elcnt;i++) { printf("%d: ", i); printword(b.list[i]); }*/ - for (i = 1; i <= chunks; i++) - printf("end: %d,%d %d,%d\n", best[i].xlo, best[i].ylo, - best[i].xhi, best[i].yhi); - printf("small: a %d b %d --- normal: a %d b %d\n", asmall.elcnt, - bsmall.elcnt, a.elcnt, b.elcnt); -#endif csl1 = NULL; for (i = 1; i <= chunks; i++) if (best[i].val > 0) { -#if 0 - int j; - printf("Before:\n"); - for (j = best[i].xlo; j < best[i].xhi; j++) - printword(a.list[j]); - printf("After:\n"); - for (j = best[i].ylo; j < best[i].yhi; j++) - printword(b.list[j]); -#endif csl2 = diff_partial(a, b, best[i].xlo, best[i].xhi, best[i].ylo, best[i].yhi); -- 2.39.5