joachim99@52: /* Analyze file differences for GNU DIFF. joachim99@52: joachim99@52: Modified for KDiff3 by Joachim Eibl 2003. joachim99@53: The original file was part of GNU DIFF. joachim99@52: joachim99@52: Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002 joachim99@52: Free Software Foundation, Inc. joachim99@52: joachim99@52: GNU DIFF is free software; you can redistribute it and/or modify joachim99@52: it under the terms of the GNU General Public License as published by joachim99@52: the Free Software Foundation; either version 2, or (at your option) joachim99@52: any later version. joachim99@52: joachim99@52: GNU DIFF is distributed in the hope that it will be useful, joachim99@52: but WITHOUT ANY WARRANTY; without even the implied warranty of joachim99@52: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the joachim99@52: GNU General Public License for more details. joachim99@52: joachim99@52: You should have received a copy of the GNU General Public License joachim99@52: along with this program; see the file COPYING. joachim99@52: If not, write to the Free Software Foundation, joachim99@69: 51 Franklin Steet, Fifth Floor, Boston, MA 02110-1301, USA. */ joachim99@52: joachim99@52: /* The basic algorithm is described in: joachim99@52: "An O(ND) Difference Algorithm and its Variations", Eugene Myers, joachim99@52: Algorithmica Vol. 1 No. 2, 1986, pp. 251-266; joachim99@52: see especially section 4.2, which describes the variation used below. joachim99@52: Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE joachim99@52: heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N) joachim99@52: at the price of producing suboptimal output for large inputs with joachim99@52: many differences. joachim99@52: joachim99@52: The basic algorithm was independently discovered as described in: joachim99@52: "Algorithms for Approximate String Matching", E. Ukkonen, joachim99@52: Information and Control Vol. 64, 1985, pp. 100-118. */ joachim99@52: joachim99@52: #define GDIFF_MAIN joachim99@52: joachim99@52: #include "gnudiff_diff.h" joachim99@52: //#include joachim99@52: #include joachim99@52: joachim99@52: static lin *xvec, *yvec; /* Vectors being compared. */ joachim99@52: static lin *fdiag; /* Vector, indexed by diagonal, containing joachim99@52: 1 + the X coordinate of the point furthest joachim99@52: along the given diagonal in the forward joachim99@52: search of the edit matrix. */ joachim99@52: static lin *bdiag; /* Vector, indexed by diagonal, containing joachim99@52: the X coordinate of the point furthest joachim99@52: along the given diagonal in the backward joachim99@52: search of the edit matrix. */ joachim99@52: static lin too_expensive; /* Edit scripts longer than this are too joachim99@52: expensive to compute. */ joachim99@52: joachim99@52: #define SNAKE_LIMIT 20 /* Snakes bigger than this are considered `big'. */ joachim99@53: joachim99@52: joachim99@52: struct partition joachim99@52: { joachim99@52: lin xmid, ymid; /* Midpoints of this partition. */ joachim99@52: bool lo_minimal; /* Nonzero if low half will be analyzed minimally. */ joachim99@52: bool hi_minimal; /* Likewise for high half. */ joachim99@52: }; joachim99@52: joachim99@52: /* Find the midpoint of the shortest edit script for a specified joachim99@52: portion of the two files. joachim99@52: joachim99@52: Scan from the beginnings of the files, and simultaneously from the ends, joachim99@52: doing a breadth-first search through the space of edit-sequence. joachim99@52: When the two searches meet, we have found the midpoint of the shortest joachim99@52: edit sequence. joachim99@52: joachim99@52: If FIND_MINIMAL is nonzero, find the minimal edit script regardless joachim99@52: of expense. Otherwise, if the search is too expensive, use joachim99@52: heuristics to stop the search and report a suboptimal answer. joachim99@52: joachim99@52: Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number joachim99@52: XMID - YMID equals the number of inserted lines minus the number joachim99@52: of deleted lines (counting only lines before the midpoint). joachim99@52: Return the approximate edit cost; this is the total number of joachim99@52: lines inserted or deleted (counting only lines before the midpoint), joachim99@52: unless a heuristic is used to terminate the search prematurely. joachim99@52: joachim99@52: Set PART->lo_minimal to true iff the minimal edit script for the joachim99@52: left half of the partition is known; similarly for PART->hi_minimal. joachim99@52: joachim99@52: This function assumes that the first lines of the specified portions joachim99@52: of the two files do not match, and likewise that the last lines do not joachim99@52: match. The caller must trim matching lines from the beginning and end joachim99@52: of the portions it is going to specify. joachim99@52: joachim99@52: If we return the "wrong" partitions, joachim99@52: the worst this can do is cause suboptimal diff output. joachim99@52: It cannot cause incorrect diff output. */ joachim99@52: joachim99@53: lin joachim99@53: GnuDiff::diag (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal, joachim99@52: struct partition *part) joachim99@52: { joachim99@52: lin *const fd = fdiag; /* Give the compiler a chance. */ joachim99@52: lin *const bd = bdiag; /* Additional help for the compiler. */ joachim99@52: lin const *const xv = xvec; /* Still more help for the compiler. */ joachim99@52: lin const *const yv = yvec; /* And more and more . . . */ joachim99@52: lin const dmin = xoff - ylim; /* Minimum valid diagonal. */ joachim99@52: lin const dmax = xlim - yoff; /* Maximum valid diagonal. */ joachim99@52: lin const fmid = xoff - yoff; /* Center diagonal of top-down search. */ joachim99@52: lin const bmid = xlim - ylim; /* Center diagonal of bottom-up search. */ joachim99@52: lin fmin = fmid, fmax = fmid; /* Limits of top-down search. */ joachim99@52: lin bmin = bmid, bmax = bmid; /* Limits of bottom-up search. */ joachim99@52: lin c; /* Cost. */ joachim99@52: bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd joachim99@52: diagonal with respect to the northwest. */ joachim99@52: joachim99@52: fd[fmid] = xoff; joachim99@52: bd[bmid] = xlim; joachim99@52: joachim99@52: for (c = 1;; ++c) joachim99@52: { joachim99@52: lin d; /* Active diagonal. */ joachim99@52: bool big_snake = 0; joachim99@52: joachim99@52: /* Extend the top-down search by an edit step in each diagonal. */ joachim99@52: fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin; joachim99@52: fmax < dmax ? fd[++fmax + 1] = -1 : --fmax; joachim99@52: for (d = fmax; d >= fmin; d -= 2) joachim99@52: { joachim99@52: lin x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1]; joachim99@52: joachim99@52: if (tlo >= thi) joachim99@52: x = tlo + 1; joachim99@52: else joachim99@52: x = thi; joachim99@52: oldx = x; joachim99@52: y = x - d; joachim99@52: while (x < xlim && y < ylim && xv[x] == yv[y]) joachim99@52: ++x, ++y; joachim99@52: if (x - oldx > SNAKE_LIMIT) joachim99@52: big_snake = 1; joachim99@52: fd[d] = x; joachim99@52: if (odd && bmin <= d && d <= bmax && bd[d] <= x) joachim99@52: { joachim99@52: part->xmid = x; joachim99@52: part->ymid = y; joachim99@52: part->lo_minimal = part->hi_minimal = 1; joachim99@52: return 2 * c - 1; joachim99@52: } joachim99@52: } joachim99@52: joachim99@52: /* Similarly extend the bottom-up search. */ joachim99@52: bmin > dmin ? bd[--bmin - 1] = LIN_MAX : ++bmin; joachim99@52: bmax < dmax ? bd[++bmax + 1] = LIN_MAX : --bmax; joachim99@52: for (d = bmax; d >= bmin; d -= 2) joachim99@52: { joachim99@52: lin x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1]; joachim99@52: joachim99@52: if (tlo < thi) joachim99@52: x = tlo; joachim99@52: else joachim99@52: x = thi - 1; joachim99@52: oldx = x; joachim99@52: y = x - d; joachim99@52: while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) joachim99@52: --x, --y; joachim99@52: if (oldx - x > SNAKE_LIMIT) joachim99@52: big_snake = 1; joachim99@52: bd[d] = x; joachim99@52: if (!odd && fmin <= d && d <= fmax && x <= fd[d]) joachim99@52: { joachim99@52: part->xmid = x; joachim99@52: part->ymid = y; joachim99@52: part->lo_minimal = part->hi_minimal = 1; joachim99@52: return 2 * c; joachim99@52: } joachim99@52: } joachim99@52: joachim99@52: if (find_minimal) joachim99@52: continue; joachim99@52: joachim99@52: /* Heuristic: check occasionally for a diagonal that has made joachim99@52: lots of progress compared with the edit distance. joachim99@52: If we have any such, find the one that has made the most joachim99@52: progress and return it as if it had succeeded. joachim99@52: joachim99@52: With this heuristic, for files with a constant small density joachim99@52: of changes, the algorithm is linear in the file size. */ joachim99@52: joachim99@52: if (200 < c && big_snake && speed_large_files) joachim99@52: { joachim99@52: lin best; joachim99@52: joachim99@52: best = 0; joachim99@52: for (d = fmax; d >= fmin; d -= 2) joachim99@52: { joachim99@52: lin dd = d - fmid; joachim99@52: lin x = fd[d]; joachim99@52: lin y = x - d; joachim99@52: lin v = (x - xoff) * 2 - dd; joachim99@52: if (v > 12 * (c + (dd < 0 ? -dd : dd))) joachim99@52: { joachim99@52: if (v > best joachim99@52: && xoff + SNAKE_LIMIT <= x && x < xlim joachim99@52: && yoff + SNAKE_LIMIT <= y && y < ylim) joachim99@52: { joachim99@52: /* We have a good enough best diagonal; joachim99@52: now insist that it end with a significant snake. */ joachim99@52: int k; joachim99@52: joachim99@52: for (k = 1; xv[x - k] == yv[y - k]; k++) joachim99@52: if (k == SNAKE_LIMIT) joachim99@52: { joachim99@52: best = v; joachim99@52: part->xmid = x; joachim99@52: part->ymid = y; joachim99@52: break; joachim99@52: } joachim99@52: } joachim99@52: } joachim99@52: } joachim99@52: if (best > 0) joachim99@52: { joachim99@52: part->lo_minimal = 1; joachim99@52: part->hi_minimal = 0; joachim99@52: return 2 * c - 1; joachim99@52: } joachim99@52: joachim99@52: best = 0; joachim99@52: for (d = bmax; d >= bmin; d -= 2) joachim99@52: { joachim99@52: lin dd = d - bmid; joachim99@52: lin x = bd[d]; joachim99@52: lin y = x - d; joachim99@52: lin v = (xlim - x) * 2 + dd; joachim99@52: if (v > 12 * (c + (dd < 0 ? -dd : dd))) joachim99@52: { joachim99@52: if (v > best joachim99@52: && xoff < x && x <= xlim - SNAKE_LIMIT joachim99@52: && yoff < y && y <= ylim - SNAKE_LIMIT) joachim99@52: { joachim99@52: /* We have a good enough best diagonal; joachim99@52: now insist that it end with a significant snake. */ joachim99@52: int k; joachim99@52: joachim99@52: for (k = 0; xv[x + k] == yv[y + k]; k++) joachim99@52: if (k == SNAKE_LIMIT - 1) joachim99@52: { joachim99@52: best = v; joachim99@52: part->xmid = x; joachim99@52: part->ymid = y; joachim99@52: break; joachim99@52: } joachim99@52: } joachim99@52: } joachim99@52: } joachim99@52: if (best > 0) joachim99@52: { joachim99@52: part->lo_minimal = 0; joachim99@52: part->hi_minimal = 1; joachim99@52: return 2 * c - 1; joachim99@52: } joachim99@52: } joachim99@52: joachim99@52: /* Heuristic: if we've gone well beyond the call of duty, joachim99@52: give up and report halfway between our best results so far. */ joachim99@52: if (c >= too_expensive) joachim99@52: { joachim99@52: lin fxybest, fxbest; joachim99@52: lin bxybest, bxbest; joachim99@52: joachim99@52: fxbest = bxbest = 0; /* Pacify `gcc -Wall'. */ joachim99@52: joachim99@52: /* Find forward diagonal that maximizes X + Y. */ joachim99@52: fxybest = -1; joachim99@52: for (d = fmax; d >= fmin; d -= 2) joachim99@52: { joachim99@52: lin x = MIN (fd[d], xlim); joachim99@52: lin y = x - d; joachim99@52: if (ylim < y) joachim99@52: x = ylim + d, y = ylim; joachim99@52: if (fxybest < x + y) joachim99@52: { joachim99@52: fxybest = x + y; joachim99@52: fxbest = x; joachim99@52: } joachim99@52: } joachim99@52: joachim99@52: /* Find backward diagonal that minimizes X + Y. */ joachim99@52: bxybest = LIN_MAX; joachim99@52: for (d = bmax; d >= bmin; d -= 2) joachim99@52: { joachim99@52: lin x = MAX (xoff, bd[d]); joachim99@52: lin y = x - d; joachim99@52: if (y < yoff) joachim99@52: x = yoff + d, y = yoff; joachim99@52: if (x + y < bxybest) joachim99@52: { joachim99@52: bxybest = x + y; joachim99@52: bxbest = x; joachim99@52: } joachim99@52: } joachim99@52: joachim99@52: /* Use the better of the two diagonals. */ joachim99@52: if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) joachim99@52: { joachim99@52: part->xmid = fxbest; joachim99@52: part->ymid = fxybest - fxbest; joachim99@52: part->lo_minimal = 1; joachim99@52: part->hi_minimal = 0; joachim99@52: } joachim99@52: else joachim99@52: { joachim99@52: part->xmid = bxbest; joachim99@52: part->ymid = bxybest - bxbest; joachim99@52: part->lo_minimal = 0; joachim99@52: part->hi_minimal = 1; joachim99@52: } joachim99@52: return 2 * c - 1; joachim99@52: } joachim99@52: } joachim99@52: } joachim99@52: joachim99@52: /* Compare in detail contiguous subsequences of the two files joachim99@52: which are known, as a whole, to match each other. joachim99@52: joachim99@52: The results are recorded in the vectors files[N].changed, by joachim99@52: storing 1 in the element for each line that is an insertion or deletion. joachim99@52: joachim99@52: The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1. joachim99@52: joachim99@52: Note that XLIM, YLIM are exclusive bounds. joachim99@52: All line numbers are origin-0 and discarded lines are not counted. joachim99@52: joachim99@52: If FIND_MINIMAL, find a minimal difference no matter how joachim99@52: expensive it is. */ joachim99@52: joachim99@53: void GnuDiff::compareseq (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal) joachim99@52: { joachim99@52: lin * const xv = xvec; /* Help the compiler. */ joachim99@52: lin * const yv = yvec; joachim99@52: joachim99@52: /* Slide down the bottom initial diagonal. */ joachim99@52: while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff]) joachim99@52: ++xoff, ++yoff; joachim99@52: /* Slide up the top initial diagonal. */ joachim99@52: while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1]) joachim99@52: --xlim, --ylim; joachim99@52: joachim99@52: /* Handle simple cases. */ joachim99@52: if (xoff == xlim) joachim99@52: while (yoff < ylim) joachim99@52: files[1].changed[files[1].realindexes[yoff++]] = 1; joachim99@52: else if (yoff == ylim) joachim99@52: while (xoff < xlim) joachim99@52: files[0].changed[files[0].realindexes[xoff++]] = 1; joachim99@52: else joachim99@52: { joachim99@52: lin c; joachim99@52: struct partition part; joachim99@52: joachim99@52: /* Find a point of correspondence in the middle of the files. */ joachim99@52: joachim99@52: c = diag (xoff, xlim, yoff, ylim, find_minimal, &part); joachim99@52: joachim99@52: if (c == 1) joachim99@52: { joachim99@52: /* This should be impossible, because it implies that joachim99@52: one of the two subsequences is empty, joachim99@52: and that case was handled above without calling `diag'. joachim99@52: Let's verify that this is true. */ joachim99@52: abort (); joachim99@52: #if 0 joachim99@52: /* The two subsequences differ by a single insert or delete; joachim99@52: record it and we are done. */ joachim99@52: if (part.xmid - part.ymid < xoff - yoff) joachim99@52: files[1].changed[files[1].realindexes[part.ymid - 1]] = 1; joachim99@52: else joachim99@52: files[0].changed[files[0].realindexes[part.xmid]] = 1; joachim99@52: #endif joachim99@52: } joachim99@52: else joachim99@52: { joachim99@52: /* Use the partitions to split this problem into subproblems. */ joachim99@52: compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal); joachim99@52: compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal); joachim99@52: } joachim99@52: } joachim99@52: } joachim99@52: joachim99@52: /* Discard lines from one file that have no matches in the other file. joachim99@52: joachim99@52: A line which is discarded will not be considered by the actual joachim99@52: comparison algorithm; it will be as if that line were not in the file. joachim99@52: The file's `realindexes' table maps virtual line numbers joachim99@52: (which don't count the discarded lines) into real line numbers; joachim99@52: this is how the actual comparison algorithm produces results joachim99@52: that are comprehensible when the discarded lines are counted. joachim99@52: joachim99@52: When we discard a line, we also mark it as a deletion or insertion joachim99@52: so that it will be printed in the output. */ joachim99@52: joachim99@53: void GnuDiff::discard_confusing_lines (struct file_data filevec[]) joachim99@52: { joachim99@52: int f; joachim99@52: lin i; joachim99@52: char *discarded[2]; joachim99@52: lin *equiv_count[2]; joachim99@52: lin *p; joachim99@52: joachim99@52: /* Allocate our results. */ joachim99@52: p = (lin*)xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines) joachim99@52: * (2 * sizeof *p)); joachim99@52: for (f = 0; f < 2; f++) joachim99@52: { joachim99@52: filevec[f].undiscarded = p; p += filevec[f].buffered_lines; joachim99@52: filevec[f].realindexes = p; p += filevec[f].buffered_lines; joachim99@52: } joachim99@52: joachim99@52: /* Set up equiv_count[F][I] as the number of lines in file F joachim99@52: that fall in equivalence class I. */ joachim99@52: joachim99@52: p = (lin*)zalloc (filevec[0].equiv_max * (2 * sizeof *p)); joachim99@52: equiv_count[0] = p; joachim99@52: equiv_count[1] = p + filevec[0].equiv_max; joachim99@52: joachim99@52: for (i = 0; i < filevec[0].buffered_lines; ++i) joachim99@52: ++equiv_count[0][filevec[0].equivs[i]]; joachim99@52: for (i = 0; i < filevec[1].buffered_lines; ++i) joachim99@52: ++equiv_count[1][filevec[1].equivs[i]]; joachim99@52: joachim99@52: /* Set up tables of which lines are going to be discarded. */ joachim99@52: joachim99@52: discarded[0] = (char*)zalloc (filevec[0].buffered_lines joachim99@52: + filevec[1].buffered_lines); joachim99@52: discarded[1] = discarded[0] + filevec[0].buffered_lines; joachim99@52: joachim99@52: /* Mark to be discarded each line that matches no line of the other file. joachim99@52: If a line matches many lines, mark it as provisionally discardable. */ joachim99@52: joachim99@52: for (f = 0; f < 2; f++) joachim99@52: { joachim99@52: size_t end = filevec[f].buffered_lines; joachim99@52: char *discards = discarded[f]; joachim99@52: lin *counts = equiv_count[1 - f]; joachim99@52: lin *equivs = filevec[f].equivs; joachim99@52: size_t many = 5; joachim99@52: size_t tem = end / 64; joachim99@52: joachim99@52: /* Multiply MANY by approximate square root of number of lines. joachim99@52: That is the threshold for provisionally discardable lines. */ joachim99@52: while ((tem = tem >> 2) > 0) joachim99@52: many *= 2; joachim99@52: joachim99@66: for (i = 0; i < (lin)end; i++) joachim99@52: { joachim99@52: lin nmatch; joachim99@52: if (equivs[i] == 0) joachim99@52: continue; joachim99@52: nmatch = counts[equivs[i]]; joachim99@52: if (nmatch == 0) joachim99@52: discards[i] = 1; joachim99@66: else if (nmatch > (lin)many) joachim99@52: discards[i] = 2; joachim99@52: } joachim99@52: } joachim99@52: joachim99@52: /* Don't really discard the provisional lines except when they occur joachim99@52: in a run of discardables, with nonprovisionals at the beginning joachim99@52: and end. */ joachim99@52: joachim99@52: for (f = 0; f < 2; f++) joachim99@52: { joachim99@52: lin end = filevec[f].buffered_lines; joachim99@52: register char *discards = discarded[f]; joachim99@52: joachim99@52: for (i = 0; i < end; i++) joachim99@52: { joachim99@52: /* Cancel provisional discards not in middle of run of discards. */ joachim99@52: if (discards[i] == 2) joachim99@52: discards[i] = 0; joachim99@52: else if (discards[i] != 0) joachim99@52: { joachim99@52: /* We have found a nonprovisional discard. */ joachim99@52: register lin j; joachim99@52: lin length; joachim99@52: lin provisional = 0; joachim99@52: joachim99@52: /* Find end of this run of discardable lines. joachim99@52: Count how many are provisionally discardable. */ joachim99@52: for (j = i; j < end; j++) joachim99@52: { joachim99@52: if (discards[j] == 0) joachim99@52: break; joachim99@52: if (discards[j] == 2) joachim99@52: ++provisional; joachim99@52: } joachim99@52: joachim99@52: /* Cancel provisional discards at end, and shrink the run. */ joachim99@52: while (j > i && discards[j - 1] == 2) joachim99@52: discards[--j] = 0, --provisional; joachim99@52: joachim99@52: /* Now we have the length of a run of discardable lines joachim99@52: whose first and last are not provisional. */ joachim99@52: length = j - i; joachim99@52: joachim99@52: /* If 1/4 of the lines in the run are provisional, joachim99@52: cancel discarding of all provisional lines in the run. */ joachim99@52: if (provisional * 4 > length) joachim99@52: { joachim99@52: while (j > i) joachim99@52: if (discards[--j] == 2) joachim99@52: discards[j] = 0; joachim99@52: } joachim99@52: else joachim99@52: { joachim99@52: register lin consec; joachim99@52: lin minimum = 1; joachim99@52: lin tem = length >> 2; joachim99@52: joachim99@52: /* MINIMUM is approximate square root of LENGTH/4. joachim99@52: A subrun of two or more provisionals can stand joachim99@52: when LENGTH is at least 16. joachim99@52: A subrun of 4 or more can stand when LENGTH >= 64. */ joachim99@52: while (0 < (tem >>= 2)) joachim99@52: minimum <<= 1; joachim99@52: minimum++; joachim99@52: joachim99@52: /* Cancel any subrun of MINIMUM or more provisionals joachim99@52: within the larger run. */ joachim99@52: for (j = 0, consec = 0; j < length; j++) joachim99@52: if (discards[i + j] != 2) joachim99@52: consec = 0; joachim99@52: else if (minimum == ++consec) joachim99@52: /* Back up to start of subrun, to cancel it all. */ joachim99@52: j -= consec; joachim99@52: else if (minimum < consec) joachim99@52: discards[i + j] = 0; joachim99@52: joachim99@52: /* Scan from beginning of run joachim99@52: until we find 3 or more nonprovisionals in a row joachim99@52: or until the first nonprovisional at least 8 lines in. joachim99@52: Until that point, cancel any provisionals. */ joachim99@52: for (j = 0, consec = 0; j < length; j++) joachim99@52: { joachim99@52: if (j >= 8 && discards[i + j] == 1) joachim99@52: break; joachim99@52: if (discards[i + j] == 2) joachim99@52: consec = 0, discards[i + j] = 0; joachim99@52: else if (discards[i + j] == 0) joachim99@52: consec = 0; joachim99@52: else joachim99@52: consec++; joachim99@52: if (consec == 3) joachim99@52: break; joachim99@52: } joachim99@52: joachim99@52: /* I advances to the last line of the run. */ joachim99@52: i += length - 1; joachim99@52: joachim99@52: /* Same thing, from end. */ joachim99@52: for (j = 0, consec = 0; j < length; j++) joachim99@52: { joachim99@52: if (j >= 8 && discards[i - j] == 1) joachim99@52: break; joachim99@52: if (discards[i - j] == 2) joachim99@52: consec = 0, discards[i - j] = 0; joachim99@52: else if (discards[i - j] == 0) joachim99@52: consec = 0; joachim99@52: else joachim99@52: consec++; joachim99@52: if (consec == 3) joachim99@52: break; joachim99@52: } joachim99@52: } joachim99@52: } joachim99@52: } joachim99@52: } joachim99@52: joachim99@52: /* Actually discard the lines. */ joachim99@52: for (f = 0; f < 2; f++) joachim99@52: { joachim99@52: char *discards = discarded[f]; joachim99@52: lin end = filevec[f].buffered_lines; joachim99@52: lin j = 0; joachim99@52: for (i = 0; i < end; ++i) joachim99@52: if (minimal || discards[i] == 0) joachim99@52: { joachim99@52: filevec[f].undiscarded[j] = filevec[f].equivs[i]; joachim99@52: filevec[f].realindexes[j++] = i; joachim99@52: } joachim99@52: else joachim99@52: filevec[f].changed[i] = 1; joachim99@52: filevec[f].nondiscarded_lines = j; joachim99@52: } joachim99@52: joachim99@52: free (discarded[0]); joachim99@52: free (equiv_count[0]); joachim99@52: } joachim99@52: joachim99@52: /* Adjust inserts/deletes of identical lines to join changes joachim99@52: as much as possible. joachim99@52: joachim99@52: We do something when a run of changed lines include a joachim99@52: line at one end and have an excluded, identical line at the other. joachim99@52: We are free to choose which identical line is included. joachim99@52: `compareseq' usually chooses the one at the beginning, joachim99@52: but usually it is cleaner to consider the following identical line joachim99@52: to be the "change". */ joachim99@52: joachim99@53: void GnuDiff::shift_boundaries (struct file_data filevec[]) joachim99@52: { joachim99@52: int f; joachim99@52: joachim99@52: for (f = 0; f < 2; f++) joachim99@52: { joachim99@52: bool *changed = filevec[f].changed; joachim99@52: bool const *other_changed = filevec[1 - f].changed; joachim99@52: lin const *equivs = filevec[f].equivs; joachim99@52: lin i = 0; joachim99@52: lin j = 0; joachim99@52: lin i_end = filevec[f].buffered_lines; joachim99@52: joachim99@52: while (1) joachim99@52: { joachim99@52: lin runlength, start, corresponding; joachim99@52: joachim99@52: /* Scan forwards to find beginning of another run of changes. joachim99@52: Also keep track of the corresponding point in the other file. */ joachim99@52: joachim99@52: while (i < i_end && !changed[i]) joachim99@52: { joachim99@52: while (other_changed[j++]) joachim99@52: continue; joachim99@52: i++; joachim99@52: } joachim99@52: joachim99@52: if (i == i_end) joachim99@52: break; joachim99@52: joachim99@52: start = i; joachim99@52: joachim99@52: /* Find the end of this run of changes. */ joachim99@52: joachim99@52: while (changed[++i]) joachim99@52: continue; joachim99@52: while (other_changed[j]) joachim99@52: j++; joachim99@52: joachim99@52: do joachim99@52: { joachim99@52: /* Record the length of this run of changes, so that joachim99@52: we can later determine whether the run has grown. */ joachim99@52: runlength = i - start; joachim99@52: joachim99@52: /* Move the changed region back, so long as the joachim99@52: previous unchanged line matches the last changed one. joachim99@52: This merges with previous changed regions. */ joachim99@52: joachim99@52: while (start && equivs[start - 1] == equivs[i - 1]) joachim99@52: { joachim99@52: changed[--start] = 1; joachim99@52: changed[--i] = 0; joachim99@52: while (changed[start - 1]) joachim99@52: start--; joachim99@52: while (other_changed[--j]) joachim99@52: continue; joachim99@52: } joachim99@52: joachim99@52: /* Set CORRESPONDING to the end of the changed run, at the last joachim99@52: point where it corresponds to a changed run in the other file. joachim99@52: CORRESPONDING == I_END means no such point has been found. */ joachim99@52: corresponding = other_changed[j - 1] ? i : i_end; joachim99@52: joachim99@52: /* Move the changed region forward, so long as the joachim99@52: first changed line matches the following unchanged one. joachim99@52: This merges with following changed regions. joachim99@52: Do this second, so that if there are no merges, joachim99@52: the changed region is moved forward as far as possible. */ joachim99@52: joachim99@52: while (i != i_end && equivs[start] == equivs[i]) joachim99@52: { joachim99@52: changed[start++] = 0; joachim99@52: changed[i++] = 1; joachim99@52: while (changed[i]) joachim99@52: i++; joachim99@52: while (other_changed[++j]) joachim99@52: corresponding = i; joachim99@52: } joachim99@52: } joachim99@52: while (runlength != i - start); joachim99@52: joachim99@52: /* If possible, move the fully-merged run of changes joachim99@52: back to a corresponding run in the other file. */ joachim99@52: joachim99@52: while (corresponding < i) joachim99@52: { joachim99@52: changed[--start] = 1; joachim99@52: changed[--i] = 0; joachim99@52: while (other_changed[--j]) joachim99@52: continue; joachim99@52: } joachim99@52: } joachim99@52: } joachim99@52: } joachim99@52: joachim99@52: /* Cons an additional entry onto the front of an edit script OLD. joachim99@52: LINE0 and LINE1 are the first affected lines in the two files (origin 0). joachim99@52: DELETED is the number of lines deleted here from file 0. joachim99@52: INSERTED is the number of lines inserted here in file 1. joachim99@52: joachim99@52: If DELETED is 0 then LINE0 is the number of the line before joachim99@52: which the insertion was done; vice versa for INSERTED and LINE1. */ joachim99@52: joachim99@53: GnuDiff::change* GnuDiff::add_change (lin line0, lin line1, lin deleted, lin inserted, struct change *old) joachim99@52: { joachim99@52: struct change *newChange = (change*) xmalloc (sizeof *newChange); joachim99@52: joachim99@52: newChange->line0 = line0; joachim99@52: newChange->line1 = line1; joachim99@52: newChange->inserted = inserted; joachim99@52: newChange->deleted = deleted; joachim99@52: newChange->link = old; joachim99@52: return newChange; joachim99@52: } joachim99@52: joachim99@52: /* Scan the tables of which lines are inserted and deleted, joachim99@52: producing an edit script in reverse order. */ joachim99@52: joachim99@53: GnuDiff::change* GnuDiff::build_reverse_script (struct file_data const filevec[]) joachim99@52: { joachim99@52: struct change *script = 0; joachim99@52: bool *changed0 = filevec[0].changed; joachim99@52: bool *changed1 = filevec[1].changed; joachim99@52: lin len0 = filevec[0].buffered_lines; joachim99@52: lin len1 = filevec[1].buffered_lines; joachim99@52: joachim99@52: /* Note that changedN[len0] does exist, and is 0. */ joachim99@52: joachim99@52: lin i0 = 0, i1 = 0; joachim99@52: joachim99@52: while (i0 < len0 || i1 < len1) joachim99@52: { joachim99@52: if (changed0[i0] | changed1[i1]) joachim99@52: { joachim99@52: lin line0 = i0, line1 = i1; joachim99@52: joachim99@52: /* Find # lines changed here in each file. */ joachim99@52: while (changed0[i0]) ++i0; joachim99@52: while (changed1[i1]) ++i1; joachim99@52: joachim99@52: /* Record this change. */ joachim99@52: script = add_change (line0, line1, i0 - line0, i1 - line1, script); joachim99@52: } joachim99@52: joachim99@52: /* We have reached lines in the two files that match each other. */ joachim99@52: i0++, i1++; joachim99@52: } joachim99@52: joachim99@52: return script; joachim99@52: } joachim99@52: joachim99@52: /* Scan the tables of which lines are inserted and deleted, joachim99@52: producing an edit script in forward order. */ joachim99@52: joachim99@53: GnuDiff::change* GnuDiff::build_script (struct file_data const filevec[]) joachim99@52: { joachim99@52: struct change *script = 0; joachim99@52: bool *changed0 = filevec[0].changed; joachim99@52: bool *changed1 = filevec[1].changed; joachim99@52: lin i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines; joachim99@52: joachim99@52: /* Note that changedN[-1] does exist, and is 0. */ joachim99@52: joachim99@52: while (i0 >= 0 || i1 >= 0) joachim99@52: { joachim99@52: if (changed0[i0 - 1] | changed1[i1 - 1]) joachim99@52: { joachim99@52: lin line0 = i0, line1 = i1; joachim99@52: joachim99@52: /* Find # lines changed here in each file. */ joachim99@52: while (changed0[i0 - 1]) --i0; joachim99@52: while (changed1[i1 - 1]) --i1; joachim99@52: joachim99@52: /* Record this change. */ joachim99@52: script = add_change (i0, i1, line0 - i0, line1 - i1, script); joachim99@52: } joachim99@52: joachim99@52: /* We have reached lines in the two files that match each other. */ joachim99@52: i0--, i1--; joachim99@52: } joachim99@52: joachim99@52: return script; joachim99@52: } joachim99@52: joachim99@52: joachim99@52: /* Report the differences of two files. */ joachim99@53: GnuDiff::change* GnuDiff::diff_2_files (struct comparison *cmp) joachim99@52: { joachim99@52: lin diags; joachim99@52: int f; joachim99@52: //struct change *e, *p; joachim99@52: struct change *script; joachim99@52: int changes; joachim99@52: joachim99@52: read_files (cmp->file, files_can_be_treated_as_binary); joachim99@52: joachim99@52: { joachim99@52: /* Allocate vectors for the results of comparison: joachim99@52: a flag for each line of each file, saying whether that line joachim99@52: is an insertion or deletion. joachim99@52: Allocate an extra element, always 0, at each end of each vector. */ joachim99@52: joachim99@52: size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4; joachim99@52: bool *flag_space = (bool*)zalloc (s * sizeof(*flag_space)); joachim99@52: cmp->file[0].changed = flag_space + 1; joachim99@52: cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3; joachim99@52: joachim99@52: /* Some lines are obviously insertions or deletions joachim99@52: because they don't match anything. Detect them now, and joachim99@52: avoid even thinking about them in the main comparison algorithm. */ joachim99@52: joachim99@52: discard_confusing_lines (cmp->file); joachim99@52: joachim99@52: /* Now do the main comparison algorithm, considering just the joachim99@52: undiscarded lines. */ joachim99@52: joachim99@52: xvec = cmp->file[0].undiscarded; joachim99@52: yvec = cmp->file[1].undiscarded; joachim99@52: diags = (cmp->file[0].nondiscarded_lines joachim99@52: + cmp->file[1].nondiscarded_lines + 3); joachim99@52: fdiag = (lin*)xmalloc (diags * (2 * sizeof *fdiag)); joachim99@52: bdiag = fdiag + diags; joachim99@52: fdiag += cmp->file[1].nondiscarded_lines + 1; joachim99@52: bdiag += cmp->file[1].nondiscarded_lines + 1; joachim99@52: joachim99@52: /* Set TOO_EXPENSIVE to be approximate square root of input size, joachim99@52: bounded below by 256. */ joachim99@52: too_expensive = 1; joachim99@52: for (; diags != 0; diags >>= 2) joachim99@52: too_expensive <<= 1; joachim99@52: too_expensive = MAX (256, too_expensive); joachim99@52: joachim99@52: files[0] = cmp->file[0]; joachim99@52: files[1] = cmp->file[1]; joachim99@52: joachim99@52: compareseq (0, cmp->file[0].nondiscarded_lines, joachim99@52: 0, cmp->file[1].nondiscarded_lines, minimal); joachim99@52: joachim99@52: free (fdiag - (cmp->file[1].nondiscarded_lines + 1)); joachim99@52: joachim99@52: /* Modify the results slightly to make them prettier joachim99@52: in cases where that can validly be done. */ joachim99@52: joachim99@52: shift_boundaries (cmp->file); joachim99@52: joachim99@52: /* Get the results of comparison in the form of a chain joachim99@69: of `struct change's -- an edit script. */ joachim99@52: joachim99@69: script = build_script (cmp->file); joachim99@52: joachim99@52: changes = (script != 0); joachim99@52: joachim99@52: free (cmp->file[0].undiscarded); joachim99@52: joachim99@52: free (flag_space); joachim99@52: joachim99@52: for (f = 0; f < 2; f++) joachim99@52: { joachim99@52: free (cmp->file[f].equivs); joachim99@52: free (cmp->file[f].linbuf + cmp->file[f].linbuf_base); joachim99@52: } joachim99@52: } joachim99@52: joachim99@52: return script; joachim99@52: }