Chris@0: 2, and some optimizations) Chris@0: * are my own. Chris@0: * Chris@0: * Line length limits for robustness added by Tim Starling, 2005-08-31 Chris@0: * Chris@0: * @author Geoffrey T. Dairiki, Tim Starling Chris@0: * @private Chris@0: * @subpackage DifferenceEngine Chris@0: */ Chris@0: class DiffEngine { Chris@0: Chris@0: const USE_ASSERTS = FALSE; Chris@0: Chris@0: const MAX_XREF_LENGTH = 10000; Chris@0: Chris@0: public function diff($from_lines, $to_lines) { Chris@0: Chris@0: $n_from = sizeof($from_lines); Chris@0: $n_to = sizeof($to_lines); Chris@0: Chris@0: $this->xchanged = $this->ychanged = []; Chris@0: $this->xv = $this->yv = []; Chris@0: $this->xind = $this->yind = []; Chris@0: unset($this->seq); Chris@0: unset($this->in_seq); Chris@0: unset($this->lcs); Chris@0: Chris@0: // Skip leading common lines. Chris@0: for ($skip = 0; $skip < $n_from && $skip < $n_to; $skip++) { Chris@0: if ($from_lines[$skip] !== $to_lines[$skip]) { Chris@0: break; Chris@0: } Chris@0: $this->xchanged[$skip] = $this->ychanged[$skip] = FALSE; Chris@0: } Chris@0: // Skip trailing common lines. Chris@0: $xi = $n_from; Chris@0: $yi = $n_to; Chris@0: for ($endskip = 0; --$xi > $skip && --$yi > $skip; $endskip++) { Chris@0: if ($from_lines[$xi] !== $to_lines[$yi]) { Chris@0: break; Chris@0: } Chris@0: $this->xchanged[$xi] = $this->ychanged[$yi] = FALSE; Chris@0: } Chris@0: Chris@0: // Ignore lines which do not exist in both files. Chris@0: for ($xi = $skip; $xi < $n_from - $endskip; $xi++) { Chris@0: $xhash[$this->_line_hash($from_lines[$xi])] = 1; Chris@0: } Chris@0: Chris@0: for ($yi = $skip; $yi < $n_to - $endskip; $yi++) { Chris@0: $line = $to_lines[$yi]; Chris@0: if ($this->ychanged[$yi] = empty($xhash[$this->_line_hash($line)])) { Chris@0: continue; Chris@0: } Chris@0: $yhash[$this->_line_hash($line)] = 1; Chris@0: $this->yv[] = $line; Chris@0: $this->yind[] = $yi; Chris@0: } Chris@0: for ($xi = $skip; $xi < $n_from - $endskip; $xi++) { Chris@0: $line = $from_lines[$xi]; Chris@0: if ($this->xchanged[$xi] = empty($yhash[$this->_line_hash($line)])) { Chris@0: continue; Chris@0: } Chris@0: $this->xv[] = $line; Chris@0: $this->xind[] = $xi; Chris@0: } Chris@0: Chris@0: // Find the LCS. Chris@0: $this->_compareseq(0, sizeof($this->xv), 0, sizeof($this->yv)); Chris@0: Chris@0: // Merge edits when possible Chris@0: $this->_shift_boundaries($from_lines, $this->xchanged, $this->ychanged); Chris@0: $this->_shift_boundaries($to_lines, $this->ychanged, $this->xchanged); Chris@0: Chris@0: // Compute the edit operations. Chris@0: $edits = []; Chris@0: $xi = $yi = 0; Chris@0: while ($xi < $n_from || $yi < $n_to) { Chris@0: $this::USE_ASSERTS && assert($yi < $n_to || $this->xchanged[$xi]); Chris@0: $this::USE_ASSERTS && assert($xi < $n_from || $this->ychanged[$yi]); Chris@0: Chris@0: // Skip matching "snake". Chris@0: $copy = []; Chris@0: while ( $xi < $n_from && $yi < $n_to && !$this->xchanged[$xi] && !$this->ychanged[$yi]) { Chris@0: $copy[] = $from_lines[$xi++]; Chris@0: ++$yi; Chris@0: } Chris@0: if ($copy) { Chris@0: $edits[] = new DiffOpCopy($copy); Chris@0: } Chris@0: // Find deletes & adds. Chris@0: $delete = []; Chris@0: while ($xi < $n_from && $this->xchanged[$xi]) { Chris@0: $delete[] = $from_lines[$xi++]; Chris@0: } Chris@0: $add = []; Chris@0: while ($yi < $n_to && $this->ychanged[$yi]) { Chris@0: $add[] = $to_lines[$yi++]; Chris@0: } Chris@0: if ($delete && $add) { Chris@0: $edits[] = new DiffOpChange($delete, $add); Chris@0: } Chris@0: elseif ($delete) { Chris@0: $edits[] = new DiffOpDelete($delete); Chris@0: } Chris@0: elseif ($add) { Chris@0: $edits[] = new DiffOpAdd($add); Chris@0: } Chris@0: } Chris@0: return $edits; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Returns the whole line if it's small enough, or the MD5 hash otherwise. Chris@0: */ Chris@0: protected function _line_hash($line) { Chris@17: if (mb_strlen($line) > $this::MAX_XREF_LENGTH) { Chris@0: return md5($line); Chris@0: } Chris@0: else { Chris@0: return $line; Chris@0: } Chris@0: } Chris@0: Chris@0: Chris@0: /** Chris@0: * Divide the Largest Common Subsequence (LCS) of the sequences Chris@0: * [XOFF, XLIM) and [YOFF, YLIM) into NCHUNKS approximately equally Chris@0: * sized segments. Chris@0: * Chris@0: * Returns (LCS, PTS). LCS is the length of the LCS. PTS is an Chris@0: * array of NCHUNKS+1 (X, Y) indexes giving the diving points between Chris@0: * sub sequences. The first sub-sequence is contained in [X0, X1), Chris@0: * [Y0, Y1), the second in [X1, X2), [Y1, Y2) and so on. Note Chris@0: * that (X0, Y0) == (XOFF, YOFF) and Chris@0: * (X[NCHUNKS], Y[NCHUNKS]) == (XLIM, YLIM). Chris@0: * Chris@0: * This function assumes that the first lines of the specified portions Chris@0: * of the two files do not match, and likewise that the last lines do not Chris@0: * match. The caller must trim matching lines from the beginning and end Chris@0: * of the portions it is going to specify. Chris@0: */ Chris@0: protected function _diag($xoff, $xlim, $yoff, $ylim, $nchunks) { Chris@0: $flip = FALSE; Chris@0: Chris@0: if ($xlim - $xoff > $ylim - $yoff) { Chris@0: // Things seems faster (I'm not sure I understand why) Chris@0: // when the shortest sequence in X. Chris@0: $flip = TRUE; Chris@0: list($xoff, $xlim, $yoff, $ylim) = [$yoff, $ylim, $xoff, $xlim]; Chris@0: } Chris@0: Chris@0: if ($flip) { Chris@0: for ($i = $ylim - 1; $i >= $yoff; $i--) { Chris@0: $ymatches[$this->xv[$i]][] = $i; Chris@0: } Chris@0: } Chris@0: else { Chris@0: for ($i = $ylim - 1; $i >= $yoff; $i--) { Chris@0: $ymatches[$this->yv[$i]][] = $i; Chris@0: } Chris@0: } Chris@0: $this->lcs = 0; Chris@0: $this->seq[0] = $yoff - 1; Chris@0: $this->in_seq = []; Chris@0: $ymids[0] = []; Chris@0: Chris@0: $numer = $xlim - $xoff + $nchunks - 1; Chris@0: $x = $xoff; Chris@0: for ($chunk = 0; $chunk < $nchunks; $chunk++) { Chris@0: if ($chunk > 0) { Chris@0: for ($i = 0; $i <= $this->lcs; $i++) { Chris@0: $ymids[$i][$chunk - 1] = $this->seq[$i]; Chris@0: } Chris@0: } Chris@0: Chris@0: $x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $chunk) / $nchunks); Chris@0: for (; $x < $x1; $x++) { Chris@0: $line = $flip ? $this->yv[$x] : $this->xv[$x]; Chris@0: if (empty($ymatches[$line])) { Chris@0: continue; Chris@0: } Chris@0: $matches = $ymatches[$line]; Chris@14: $found_empty = FALSE; Chris@14: foreach ($matches as $y) { Chris@14: if (!$found_empty) { Chris@14: if (empty($this->in_seq[$y])) { Chris@14: $k = $this->_lcs_pos($y); Chris@14: $this::USE_ASSERTS && assert($k > 0); Chris@14: $ymids[$k] = $ymids[$k - 1]; Chris@14: $found_empty = TRUE; Chris@14: } Chris@0: } Chris@14: else { Chris@14: if ($y > $this->seq[$k - 1]) { Chris@14: $this::USE_ASSERTS && assert($y < $this->seq[$k]); Chris@14: // Optimization: this is a common case: Chris@14: // next match is just replacing previous match. Chris@14: $this->in_seq[$this->seq[$k]] = FALSE; Chris@14: $this->seq[$k] = $y; Chris@14: $this->in_seq[$y] = 1; Chris@14: } Chris@14: elseif (empty($this->in_seq[$y])) { Chris@14: $k = $this->_lcs_pos($y); Chris@14: $this::USE_ASSERTS && assert($k > 0); Chris@14: $ymids[$k] = $ymids[$k - 1]; Chris@14: } Chris@0: } Chris@0: } Chris@0: } Chris@0: } Chris@0: Chris@0: $seps[] = $flip ? [$yoff, $xoff] : [$xoff, $yoff]; Chris@0: $ymid = $ymids[$this->lcs]; Chris@0: for ($n = 0; $n < $nchunks - 1; $n++) { Chris@0: $x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $n) / $nchunks); Chris@0: $y1 = $ymid[$n] + 1; Chris@0: $seps[] = $flip ? [$y1, $x1] : [$x1, $y1]; Chris@0: } Chris@0: $seps[] = $flip ? [$ylim, $xlim] : [$xlim, $ylim]; Chris@0: Chris@0: return [$this->lcs, $seps]; Chris@0: } Chris@0: Chris@0: protected function _lcs_pos($ypos) { Chris@0: Chris@0: $end = $this->lcs; Chris@0: if ($end == 0 || $ypos > $this->seq[$end]) { Chris@0: $this->seq[++$this->lcs] = $ypos; Chris@0: $this->in_seq[$ypos] = 1; Chris@0: return $this->lcs; Chris@0: } Chris@0: Chris@0: $beg = 1; Chris@0: while ($beg < $end) { Chris@0: $mid = (int)(($beg + $end) / 2); Chris@0: if ($ypos > $this->seq[$mid]) { Chris@0: $beg = $mid + 1; Chris@0: } Chris@0: else { Chris@0: $end = $mid; Chris@0: } Chris@0: } Chris@0: Chris@0: $this::USE_ASSERTS && assert($ypos != $this->seq[$end]); Chris@0: Chris@0: $this->in_seq[$this->seq[$end]] = FALSE; Chris@0: $this->seq[$end] = $ypos; Chris@0: $this->in_seq[$ypos] = 1; Chris@0: return $end; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Find LCS of two sequences. Chris@0: * Chris@0: * The results are recorded in the vectors $this->{x,y}changed[], by Chris@0: * storing a 1 in the element for each line that is an insertion Chris@0: * or deletion (ie. is not in the LCS). Chris@0: * Chris@0: * The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1. Chris@0: * Chris@0: * Note that XLIM, YLIM are exclusive bounds. Chris@0: * All line numbers are origin-0 and discarded lines are not counted. Chris@0: */ Chris@0: protected function _compareseq($xoff, $xlim, $yoff, $ylim) { Chris@0: Chris@0: // Slide down the bottom initial diagonal. Chris@0: while ($xoff < $xlim && $yoff < $ylim && $this->xv[$xoff] == $this->yv[$yoff]) { Chris@0: ++$xoff; Chris@0: ++$yoff; Chris@0: } Chris@0: Chris@0: // Slide up the top initial diagonal. Chris@0: while ($xlim > $xoff && $ylim > $yoff && $this->xv[$xlim - 1] == $this->yv[$ylim - 1]) { Chris@0: --$xlim; Chris@0: --$ylim; Chris@0: } Chris@0: Chris@0: if ($xoff == $xlim || $yoff == $ylim) { Chris@0: $lcs = 0; Chris@0: } Chris@0: else { Chris@0: // This is ad hoc but seems to work well. Chris@0: //$nchunks = sqrt(min($xlim - $xoff, $ylim - $yoff) / 2.5); Chris@0: //$nchunks = max(2, min(8, (int)$nchunks)); Chris@0: $nchunks = min(7, $xlim - $xoff, $ylim - $yoff) + 1; Chris@0: list($lcs, $seps) = $this->_diag($xoff, $xlim, $yoff, $ylim, $nchunks); Chris@0: } Chris@0: Chris@0: if ($lcs == 0) { Chris@0: // X and Y sequences have no common subsequence: Chris@0: // mark all changed. Chris@0: while ($yoff < $ylim) { Chris@0: $this->ychanged[$this->yind[$yoff++]] = 1; Chris@0: } Chris@0: while ($xoff < $xlim) { Chris@0: $this->xchanged[$this->xind[$xoff++]] = 1; Chris@0: } Chris@0: } Chris@0: else { Chris@0: // Use the partitions to split this problem into subproblems. Chris@0: reset($seps); Chris@0: $pt1 = $seps[0]; Chris@0: while ($pt2 = next($seps)) { Chris@0: $this->_compareseq ($pt1[0], $pt2[0], $pt1[1], $pt2[1]); Chris@0: $pt1 = $pt2; Chris@0: } Chris@0: } Chris@0: } Chris@0: Chris@0: /** Chris@0: * Adjust inserts/deletes of identical lines to join changes Chris@0: * as much as possible. Chris@0: * Chris@0: * We do something when a run of changed lines include a Chris@0: * line at one end and has an excluded, identical line at the other. Chris@0: * We are free to choose which identical line is included. Chris@0: * `compareseq' usually chooses the one at the beginning, Chris@0: * but usually it is cleaner to consider the following identical line Chris@0: * to be the "change". Chris@0: * Chris@0: * This is extracted verbatim from analyze.c (GNU diffutils-2.7). Chris@0: */ Chris@0: protected function _shift_boundaries($lines, &$changed, $other_changed) { Chris@0: $i = 0; Chris@0: $j = 0; Chris@0: Chris@14: $this::USE_ASSERTS && assert(sizeof($lines) == sizeof($changed)); Chris@0: $len = sizeof($lines); Chris@0: $other_len = sizeof($other_changed); Chris@0: Chris@0: while (1) { Chris@0: /* Chris@0: * Scan forwards to find beginning of another run of changes. Chris@0: * Also keep track of the corresponding point in the other file. Chris@0: * Chris@0: * Throughout this code, $i and $j are adjusted together so that Chris@0: * the first $i elements of $changed and the first $j elements Chris@0: * of $other_changed both contain the same number of zeros Chris@0: * (unchanged lines). Chris@0: * Furthermore, $j is always kept so that $j == $other_len or Chris@0: * $other_changed[$j] == FALSE. Chris@0: */ Chris@0: while ($j < $other_len && $other_changed[$j]) { Chris@0: $j++; Chris@0: } Chris@0: while ($i < $len && !$changed[$i]) { Chris@14: $this::USE_ASSERTS && assert($j < $other_len && ! $other_changed[$j]); Chris@0: $i++; Chris@0: $j++; Chris@0: while ($j < $other_len && $other_changed[$j]) { Chris@0: $j++; Chris@0: } Chris@0: } Chris@0: Chris@0: if ($i == $len) { Chris@0: break; Chris@0: } Chris@0: $start = $i; Chris@0: Chris@0: // Find the end of this run of changes. Chris@0: while (++$i < $len && $changed[$i]) { Chris@0: continue; Chris@0: } Chris@0: Chris@0: do { Chris@0: /* Chris@0: * Record the length of this run of changes, so that Chris@0: * we can later determine whether the run has grown. Chris@0: */ Chris@0: $runlength = $i - $start; Chris@0: Chris@0: /* Chris@0: * Move the changed region back, so long as the Chris@0: * previous unchanged line matches the last changed one. Chris@0: * This merges with previous changed regions. Chris@0: */ Chris@0: while ($start > 0 && $lines[$start - 1] == $lines[$i - 1]) { Chris@0: $changed[--$start] = 1; Chris@0: $changed[--$i] = FALSE; Chris@0: while ($start > 0 && $changed[$start - 1]) { Chris@0: $start--; Chris@0: } Chris@14: $this::USE_ASSERTS && assert($j > 0); Chris@0: while ($other_changed[--$j]) { Chris@0: continue; Chris@0: } Chris@14: $this::USE_ASSERTS && assert($j >= 0 && !$other_changed[$j]); Chris@0: } Chris@0: Chris@0: /* Chris@0: * Set CORRESPONDING to the end of the changed run, at the last Chris@0: * point where it corresponds to a changed run in the other file. Chris@0: * CORRESPONDING == LEN means no such point has been found. Chris@0: */ Chris@0: $corresponding = $j < $other_len ? $i : $len; Chris@0: Chris@0: /* Chris@0: * Move the changed region forward, so long as the Chris@0: * first changed line matches the following unchanged one. Chris@0: * This merges with following changed regions. Chris@0: * Do this second, so that if there are no merges, Chris@0: * the changed region is moved forward as far as possible. Chris@0: */ Chris@0: while ($i < $len && $lines[$start] == $lines[$i]) { Chris@0: $changed[$start++] = FALSE; Chris@0: $changed[$i++] = 1; Chris@0: while ($i < $len && $changed[$i]) { Chris@0: $i++; Chris@0: } Chris@14: $this::USE_ASSERTS && assert($j < $other_len && ! $other_changed[$j]); Chris@0: $j++; Chris@0: if ($j < $other_len && $other_changed[$j]) { Chris@0: $corresponding = $i; Chris@0: while ($j < $other_len && $other_changed[$j]) { Chris@0: $j++; Chris@0: } Chris@0: } Chris@0: } Chris@0: } while ($runlength != $i - $start); Chris@0: Chris@0: /* Chris@0: * If possible, move the fully-merged run of changes Chris@0: * back to a corresponding run in the other file. Chris@0: */ Chris@0: while ($corresponding < $i) { Chris@0: $changed[--$start] = 1; Chris@0: $changed[--$i] = 0; Chris@14: $this::USE_ASSERTS && assert($j > 0); Chris@0: while ($other_changed[--$j]) { Chris@0: continue; Chris@0: } Chris@14: $this::USE_ASSERTS && assert($j >= 0 && !$other_changed[$j]); Chris@0: } Chris@0: } Chris@0: } Chris@0: Chris@0: }