From dc22c433f2b1cca64a3e0e304bf023d2ab14e5a7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sat, 29 Aug 2020 17:06:27 +1000 Subject: [PATCH] diff: filter out unmatchable lines before comparing. If either file has a run of 2 or more consecutive lines, none of which appear in the other file, then reduce the run down to a single line. This does not materially change the set of common-sub-lists that are calculated, but it can make the calculate much faster when there is a lot of unmatchable lines. Signed-off-by: NeilBrown --- diff.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- wiggle.h | 2 +- 2 files changed, 101 insertions(+), 6 deletions(-) diff --git a/diff.c b/diff.c index 1deea20..6d2a8fa 100644 --- a/diff.c +++ b/diff.c @@ -532,6 +532,88 @@ static void fixup(struct file *a, struct file *b, struct csl *list) } } +static int elcmp(const void *v1, const void *v2) +{ + const struct elmnt *e1 = v1; + const struct elmnt *e2 = v2; + + if (e1->hash != e2->hash) { + if (e1->hash < e2->hash) + return -1; + return 1; + } + if (e1->start[0] == 0 && e2->start[0] == 0) + return 0; + if (e1->len != e2->len) + return e1->len - e2->len; + return strncmp(e1->start, e2->start, e1->len); +} + +#define BPL (sizeof(unsigned long) * 8) +static struct file filter_unique(struct file f, struct file ref) +{ + /* Use a bloom-filter to record all hashes in 'ref' and + * then if there are consequtive entries in 'f' that are + * not in 'ref', reduce each such run to 1 entry + */ + struct file n; + int fi, cnt; + struct file sorted; + + sorted.list = xmalloc(sizeof(sorted.list[0]) * ref.elcnt); + sorted.elcnt = ref.elcnt; + memcpy(sorted.list, ref.list, sizeof(sorted.list[0]) * sorted.elcnt); + qsort(sorted.list, sorted.elcnt, sizeof(sorted.list[0]), + elcmp); + + n.list = xmalloc(sizeof(n.list[0]) * f.elcnt); + n.elcnt = 0; + cnt = 0; + for (fi = 0; fi < f.elcnt; fi++) { + int lo = 0, hi = sorted.elcnt; + while (lo + 1 < hi) { + int mid = (lo + hi) / 2; + if (elcmp(&f.list[fi], &sorted.list[mid]) < 0) + hi = mid; + else + lo = mid; + } + if (match(&f.list[fi], &sorted.list[lo])) + cnt = 0; + else + cnt += 1; + if (cnt <= 1) + n.list[n.elcnt++] = f.list[fi]; + } + free(sorted.list); + return n; +} + +static void remap(struct csl *csl, int which, struct file from, struct file to) +{ + /* The a,b pointer in csl points to 'from' we need to remap to 'to'. + * 'to' has everything that 'from' has, plus more. + * Each list[].start is unique + */ + int ti = 0; + while (csl->len) { + int fi = which ? csl->b : csl->a; + while (to.list[ti].start != from.list[fi].start) { + ti += 1; + if (ti > to.elcnt) + abort(); + } + if (which) + csl->b = ti; + else + csl->a = ti; + csl += 1; + } + if (which) + csl->b = to.elcnt; + else + csl->a = to.elcnt; +} /* Main entry point - find the common-sub-list of files 'a' and 'b'. * The final element in the list will have 'len' == 0 and will point * beyond the end of the files. @@ -540,13 +622,26 @@ struct csl *diff(struct file a, struct file b) { struct v *v; struct csl *csl; - v = xmalloc(sizeof(struct v)*(a.elcnt+b.elcnt+2)); - v += b.elcnt+1; + struct file af, bf; + + /* Remove runs of 2 or more elements in one file that don't + * exist in the other file. This often makes the number of + * elements more manageable. + */ + af = filter_unique(a, b); + bf = filter_unique(b, a); + + v = xmalloc(sizeof(struct v)*(af.elcnt+bf.elcnt+2)); + v += bf.elcnt+1; - csl = lcsl(&a, 0, a.elcnt, - &b, 0, b.elcnt, + csl = lcsl(&af, 0, af.elcnt, + &bf, 0, bf.elcnt, NULL, v); - free(v-(b.elcnt+1)); + free(v-(bf.elcnt+1)); + remap(csl, 0, af, a); + remap(csl, 1, bf, b); + free(af.list); + free(bf.list); fixup(&a, &b, csl); if (!csl) { csl = xmalloc(sizeof(*csl)); diff --git a/wiggle.h b/wiggle.h index 0aa759b..1befdd5 100644 --- a/wiggle.h +++ b/wiggle.h @@ -54,7 +54,7 @@ struct elmnt { short len, plen, prefix; }; -static inline int match(struct elmnt *a, struct elmnt *b) +static inline int match(struct elmnt *a, struct elmnt *b) { return a->hash == b->hash && -- 2.39.5