PHPXRef 0.7.1 : Unnamed Project : /includes/diff/engine.php source

[Summary view] [Print] [Text view]
   1  <?php
   2  /**
   3  *
   4  * @package diff
   5  * @version $Id$
   6  * @copyright (c) 2006 phpBB Group
   7  * @license http://opensource.org/licenses/gpl-license.php GNU Public License
   8  *
   9  */
  10  
  11  /**
  12  * @ignore
  13  */
  14  if (!defined('IN_PHPBB'))
  15  {
  16      exit;
  17  }
  18  
  19  /**
  20  * Code from pear.php.net, Text_Diff-1.1.0 package
  21  * http://pear.php.net/package/Text_Diff/ (native engine)
  22  *
  23  * Modified by phpBB Group to meet our coding standards
  24  * and being able to integrate into phpBB
  25  *
  26  * Class used internally by Text_Diff to actually compute the diffs. This
  27  * class is implemented using native PHP code.
  28  *
  29  * The algorithm used here is mostly lifted from the perl module
  30  * Algorithm::Diff (version 1.06) by Ned Konz, which is available at:
  31  * http://www.perl.com/CPAN/authors/id/N/NE/NEDKONZ/Algorithm-Diff-1.06.zip
  32  *
  33  * More ideas are taken from: http://www.ics.uci.edu/~eppstein/161/960229.html
  34  *
  35  * Some ideas (and a bit of code) are taken from analyze.c, of GNU
  36  * diffutils-2.7, which can be found at:
  37  * ftp://gnudist.gnu.org/pub/gnu/diffutils/diffutils-2.7.tar.gz
  38  *
  39  * Some ideas (subdivision by NCHUNKS > 2, and some optimizations) are from
  40  * Geoffrey T. Dairiki <dairiki@dairiki.org>. The original PHP version of this
  41  * code was written by him, and is used/adapted with his permission.
  42  *
  43  * Copyright 2004-2008 The Horde Project (http://www.horde.org/)
  44  *
  45  * @author  Geoffrey T. Dairiki <dairiki@dairiki.org>
  46  * @package diff
  47  *
  48  * @access private
  49  */
  50  class diff_engine
  51  {
  52      /**
  53      * If set to true we trim all lines before we compare them. This ensures that sole space/tab changes do not trigger diffs.
  54      */
  55      var $skip_whitespace_changes = true;
  56  
  57  	function diff(&$from_lines, &$to_lines, $preserve_cr = true)
  58      {
  59          // Remove empty lines...
  60          // If preserve_cr is true, we basically only change \r\n and bare \r to \n to get the same carriage returns for both files
  61          // If it is false, we try to only use \n once per line and ommit all empty lines to be able to get a proper data diff
  62  
  63          if (is_array($from_lines))
  64          {
  65              $from_lines = implode("\n", $from_lines);
  66          }
  67  
  68          if (is_array($to_lines))
  69          {
  70              $to_lines = implode("\n", $to_lines);
  71          }
  72  
  73          if ($preserve_cr)
  74          {
  75              $from_lines = explode("\n", str_replace("\r", "\n", str_replace("\r\n", "\n", $from_lines)));
  76              $to_lines = explode("\n", str_replace("\r", "\n", str_replace("\r\n", "\n", $to_lines)));
  77          }
  78          else
  79          {
  80              $from_lines = explode("\n", preg_replace('#[\n\r]+#', "\n", $from_lines));
  81              $to_lines = explode("\n", preg_replace('#[\n\r]+#', "\n", $to_lines));
  82          }
  83  
  84          $n_from = sizeof($from_lines);
  85          $n_to = sizeof($to_lines);
  86  
  87          $this->xchanged = $this->ychanged = $this->xv = $this->yv = $this->xind = $this->yind = array();
  88          unset($this->seq, $this->in_seq, $this->lcs);
  89  
  90          // Skip leading common lines.
  91          for ($skip = 0; $skip < $n_from && $skip < $n_to; $skip++)
  92          {
  93              if (trim($from_lines[$skip]) !== trim($to_lines[$skip]))
  94              {
  95                  break;
  96              }
  97              $this->xchanged[$skip] = $this->ychanged[$skip] = false;
  98          }
  99  
 100          // Skip trailing common lines.
 101          $xi = $n_from;
 102          $yi = $n_to;
 103  
 104          for ($endskip = 0; --$xi > $skip && --$yi > $skip; $endskip++)
 105          {
 106              if (trim($from_lines[$xi]) !== trim($to_lines[$yi]))
 107              {
 108                  break;
 109              }
 110              $this->xchanged[$xi] = $this->ychanged[$yi] = false;
 111          }
 112  
 113          // Ignore lines which do not exist in both files.
 114          for ($xi = $skip; $xi < $n_from - $endskip; $xi++)
 115          {
 116              if ($this->skip_whitespace_changes) $xhash[trim($from_lines[$xi])] = 1; else $xhash[$from_lines[$xi]] = 1;
 117          }
 118  
 119          for ($yi = $skip; $yi < $n_to - $endskip; $yi++)
 120          {
 121              $line = ($this->skip_whitespace_changes) ? trim($to_lines[$yi]) : $to_lines[$yi];
 122  
 123              if (($this->ychanged[$yi] = empty($xhash[$line])))
 124              {
 125                  continue;
 126              }
 127              $yhash[$line] = 1;
 128              $this->yv[] = $line;
 129              $this->yind[] = $yi;
 130          }
 131  
 132          for ($xi = $skip; $xi < $n_from - $endskip; $xi++)
 133          {
 134              $line = ($this->skip_whitespace_changes) ? trim($from_lines[$xi]) : $from_lines[$xi];
 135  
 136              if (($this->xchanged[$xi] = empty($yhash[$line])))
 137              {
 138                  continue;
 139              }
 140              $this->xv[] = $line;
 141              $this->xind[] = $xi;
 142          }
 143  
 144          // Find the LCS.
 145          $this->_compareseq(0, sizeof($this->xv), 0, sizeof($this->yv));
 146  
 147          // Merge edits when possible.
 148          if ($this->skip_whitespace_changes)
 149          {
 150              $from_lines_clean = array_map('trim', $from_lines);
 151              $to_lines_clean = array_map('trim', $to_lines);
 152  
 153              $this->_shift_boundaries($from_lines_clean, $this->xchanged, $this->ychanged);
 154              $this->_shift_boundaries($to_lines_clean, $this->ychanged, $this->xchanged);
 155  
 156              unset($from_lines_clean, $to_lines_clean);
 157          }
 158          else
 159          {
 160              $this->_shift_boundaries($from_lines, $this->xchanged, $this->ychanged);
 161              $this->_shift_boundaries($to_lines, $this->ychanged, $this->xchanged);
 162          }
 163  
 164          // Compute the edit operations.
 165          $edits = array();
 166          $xi = $yi = 0;
 167  
 168          while ($xi < $n_from || $yi < $n_to)
 169          {
 170              // Skip matching "snake".
 171              $copy = array();
 172  
 173              while ($xi < $n_from && $yi < $n_to && !$this->xchanged[$xi] && !$this->ychanged[$yi])
 174              {
 175                  $copy[] = $from_lines[$xi++];
 176                  $yi++;
 177              }
 178  
 179              if ($copy)
 180              {
 181                  $edits[] = new diff_op_copy($copy);
 182              }
 183  
 184              // Find deletes & adds.
 185              $delete = array();
 186              while ($xi < $n_from && $this->xchanged[$xi])
 187              {
 188                  $delete[] = $from_lines[$xi++];
 189              }
 190  
 191              $add = array();
 192              while ($yi < $n_to && $this->ychanged[$yi])
 193              {
 194                  $add[] = $to_lines[$yi++];
 195              }
 196  
 197              if ($delete && $add)
 198              {
 199                  $edits[] = new diff_op_change($delete, $add);
 200              }
 201              else if ($delete)
 202              {
 203                  $edits[] = new diff_op_delete($delete);
 204              }
 205              else if ($add)
 206              {
 207                  $edits[] = new diff_op_add($add);
 208              }
 209          }
 210  
 211          return $edits;
 212      }
 213  
 214      /**
 215      * Divides the Largest Common Subsequence (LCS) of the sequences (XOFF,
 216      * XLIM) and (YOFF, YLIM) into NCHUNKS approximately equally sized segments.
 217      *
 218      * Returns (LCS, PTS).  LCS is the length of the LCS. PTS is an array of
 219      * NCHUNKS+1 (X, Y) indexes giving the diving points between sub
 220      * sequences.  The first sub-sequence is contained in (X0, X1), (Y0, Y1),
 221      * the second in (X1, X2), (Y1, Y2) and so on.  Note that (X0, Y0) ==
 222      * (XOFF, YOFF) and (X[NCHUNKS], Y[NCHUNKS]) == (XLIM, YLIM).
 223      *
 224      * This function assumes that the first lines of the specified portions of
 225      * the two files do not match, and likewise that the last lines do not
 226      * match.  The caller must trim matching lines from the beginning and end
 227      * of the portions it is going to specify.
 228      */
 229  	function _diag($xoff, $xlim, $yoff, $ylim, $nchunks)
 230      {
 231          $flip = false;
 232  
 233          if ($xlim - $xoff > $ylim - $yoff)
 234          {
 235              // Things seems faster (I'm not sure I understand why) when the shortest sequence is in X.
 236              $flip = true;
 237              list($xoff, $xlim, $yoff, $ylim) = array($yoff, $ylim, $xoff, $xlim);
 238          }
 239  
 240          if ($flip)
 241          {
 242              for ($i = $ylim - 1; $i >= $yoff; $i--)
 243              {
 244                  $ymatches[$this->xv[$i]][] = $i;
 245              }
 246          }
 247          else
 248          {
 249              for ($i = $ylim - 1; $i >= $yoff; $i--)
 250              {
 251                  $ymatches[$this->yv[$i]][] = $i;
 252              }
 253          }
 254  
 255          $this->lcs = 0;
 256          $this->seq[0]= $yoff - 1;
 257          $this->in_seq = array();
 258          $ymids[0] = array();
 259  
 260          $numer = $xlim - $xoff + $nchunks - 1;
 261          $x = $xoff;
 262  
 263          for ($chunk = 0; $chunk < $nchunks; $chunk++)
 264          {
 265              if ($chunk > 0)
 266              {
 267                  for ($i = 0; $i <= $this->lcs; $i++)
 268                  {
 269                      $ymids[$i][$chunk - 1] = $this->seq[$i];
 270                  }
 271              }
 272  
 273              $x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $chunk) / $nchunks);
 274  
 275              for (; $x < $x1; $x++)
 276              {
 277                  $line = $flip ? $this->yv[$x] : $this->xv[$x];
 278                  if (empty($ymatches[$line]))
 279                  {
 280                      continue;
 281                  }
 282                  $matches = $ymatches[$line];
 283  
 284                  reset($matches);
 285                  while (list(, $y) = each($matches))
 286                  {
 287                      if (empty($this->in_seq[$y]))
 288                      {
 289                          $k = $this->_lcs_pos($y);
 290                          $ymids[$k] = $ymids[$k - 1];
 291                          break;
 292                      }
 293                  }
 294  
 295                  // no reset() here
 296                  while (list(, $y) = each($matches))
 297                  {
 298                      if ($y > $this->seq[$k - 1])
 299                      {
 300                          // Optimization: this is a common case: next match is just replacing previous match.
 301                          $this->in_seq[$this->seq[$k]] = false;
 302                          $this->seq[$k] = $y;
 303                          $this->in_seq[$y] = 1;
 304                      }
 305                      else if (empty($this->in_seq[$y]))
 306                      {
 307                          $k = $this->_lcs_pos($y);
 308                          $ymids[$k] = $ymids[$k - 1];
 309                      }
 310                  }
 311              }
 312          }
 313  
 314          $seps[] = $flip ? array($yoff, $xoff) : array($xoff, $yoff);
 315          $ymid = $ymids[$this->lcs];
 316  
 317          for ($n = 0; $n < $nchunks - 1; $n++)
 318          {
 319              $x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $n) / $nchunks);
 320              $y1 = $ymid[$n] + 1;
 321              $seps[] = $flip ? array($y1, $x1) : array($x1, $y1);
 322          }
 323          $seps[] = $flip ? array($ylim, $xlim) : array($xlim, $ylim);
 324  
 325          return array($this->lcs, $seps);
 326      }
 327  
 328  	function _lcs_pos($ypos)
 329      {
 330          $end = $this->lcs;
 331  
 332          if ($end == 0 || $ypos > $this->seq[$end])
 333          {
 334              $this->seq[++$this->lcs] = $ypos;
 335              $this->in_seq[$ypos] = 1;
 336              return $this->lcs;
 337          }
 338  
 339          $beg = 1;
 340          while ($beg < $end)
 341          {
 342              $mid = (int)(($beg + $end) / 2);
 343              if ($ypos > $this->seq[$mid])
 344              {
 345                  $beg = $mid + 1;
 346              }
 347              else
 348              {
 349                  $end = $mid;
 350              }
 351          }
 352  
 353          $this->in_seq[$this->seq[$end]] = false;
 354          $this->seq[$end] = $ypos;
 355          $this->in_seq[$ypos] = 1;
 356  
 357          return $end;
 358      }
 359  
 360      /**
 361      * Finds LCS of two sequences.
 362      *
 363      * The results are recorded in the vectors $this->{x,y}changed[], by
 364      * storing a 1 in the element for each line that is an insertion or
 365      * deletion (ie. is not in the LCS).
 366      *
 367      * The subsequence of file 0 is (XOFF, XLIM) and likewise for file 1.
 368      *
 369      * Note that XLIM, YLIM are exclusive bounds.  All line numbers are
 370      * origin-0 and discarded lines are not counted.
 371      */
 372  	function _compareseq($xoff, $xlim, $yoff, $ylim)
 373      {
 374          // Slide down the bottom initial diagonal.
 375          while ($xoff < $xlim && $yoff < $ylim && $this->xv[$xoff] == $this->yv[$yoff])
 376          {
 377              ++$xoff;
 378              ++$yoff;
 379          }
 380  
 381          // Slide up the top initial diagonal.
 382          while ($xlim > $xoff && $ylim > $yoff && $this->xv[$xlim - 1] == $this->yv[$ylim - 1])
 383          {
 384              --$xlim;
 385              --$ylim;
 386          }
 387  
 388          if ($xoff == $xlim || $yoff == $ylim)
 389          {
 390              $lcs = 0;
 391          }
 392          else
 393          {
 394              // This is ad hoc but seems to work well.
 395              // $nchunks = sqrt(min($xlim - $xoff, $ylim - $yoff) / 2.5);
 396              // $nchunks = max(2,min(8,(int)$nchunks));
 397              $nchunks = min(7, $xlim - $xoff, $ylim - $yoff) + 1;
 398              list($lcs, $seps) = $this->_diag($xoff, $xlim, $yoff, $ylim, $nchunks);
 399          }
 400  
 401          if ($lcs == 0)
 402          {
 403              // X and Y sequences have no common subsequence: mark all changed.
 404              while ($yoff < $ylim)
 405              {
 406                  $this->ychanged[$this->yind[$yoff++]] = 1;
 407              }
 408  
 409              while ($xoff < $xlim)
 410              {
 411                  $this->xchanged[$this->xind[$xoff++]] = 1;
 412              }
 413          }
 414          else
 415          {
 416              // Use the partitions to split this problem into subproblems.
 417              reset($seps);
 418              $pt1 = $seps[0];
 419  
 420              while ($pt2 = next($seps))
 421              {
 422                  $this->_compareseq($pt1[0], $pt2[0], $pt1[1], $pt2[1]);
 423                  $pt1 = $pt2;
 424              }
 425          }
 426      }
 427  
 428      /**
 429      * Adjusts inserts/deletes of identical lines to join changes as much as possible.
 430      *
 431      * We do something when a run of changed lines include a line at one end
 432      * and has an excluded, identical line at the other.  We are free to
 433      * choose which identical line is included. 'compareseq' usually chooses
 434      * the one at the beginning, but usually it is cleaner to consider the
 435      * following identical line to be the "change".
 436      *
 437      * This is extracted verbatim from analyze.c (GNU diffutils-2.7).
 438      */
 439  	function _shift_boundaries($lines, &$changed, $other_changed)
 440      {
 441          $i = 0;
 442          $j = 0;
 443  
 444          $len = sizeof($lines);
 445          $other_len = sizeof($other_changed);
 446  
 447          while (1)
 448          {
 449              // Scan forward to find the beginning of another run of
 450              // changes. Also keep track of the corresponding point in the other file.
 451              //
 452              // Throughout this code, $i and $j are adjusted together so that
 453              // the first $i elements of $changed and the first $j elements of
 454              // $other_changed both contain the same number of zeros (unchanged lines).
 455              //
 456              // Furthermore, $j is always kept so that $j == $other_len or $other_changed[$j] == false.
 457              while ($j < $other_len && $other_changed[$j])
 458              {
 459                  $j++;
 460              }
 461  
 462              while ($i < $len && ! $changed[$i])
 463              {
 464                  $i++;
 465                  $j++;
 466  
 467                  while ($j < $other_len && $other_changed[$j])
 468                  {
 469                      $j++;
 470                  }
 471              }
 472  
 473              if ($i == $len)
 474              {
 475                  break;
 476              }
 477  
 478              $start = $i;
 479  
 480              // Find the end of this run of changes.
 481              while (++$i < $len && $changed[$i])
 482              {
 483                  continue;
 484              }
 485  
 486              do
 487              {
 488                  // Record the length of this run of changes, so that we can later determine whether the run has grown.
 489                  $runlength = $i - $start;
 490  
 491                  // Move the changed region back, so long as the previous unchanged line matches the last changed one.
 492                  // This merges with previous changed regions.
 493                  while ($start > 0 && $lines[$start - 1] == $lines[$i - 1])
 494                  {
 495                      $changed[--$start] = 1;
 496                      $changed[--$i] = false;
 497  
 498                      while ($start > 0 && $changed[$start - 1])
 499                      {
 500                          $start--;
 501                      }
 502  
 503                      while ($other_changed[--$j])
 504                      {
 505                          continue;
 506                      }
 507                  }
 508  
 509                  // Set CORRESPONDING to the end of the changed run, at the last point where it corresponds to a changed run in the
 510                  // other file. CORRESPONDING == LEN means no such point has been found.
 511                  $corresponding = $j < $other_len ? $i : $len;
 512  
 513                  // Move the changed region forward, so long as the first changed line matches the following unchanged one.
 514                  // This merges with following changed regions.
 515                  // Do this second, so that if there are no merges, the changed region is moved forward as far as possible.
 516                  while ($i < $len && $lines[$start] == $lines[$i])
 517                  {
 518                      $changed[$start++] = false;
 519                      $changed[$i++] = 1;
 520  
 521                      while ($i < $len && $changed[$i])
 522                      {
 523                          $i++;
 524                      }
 525  
 526                      $j++;
 527                      if ($j < $other_len && $other_changed[$j])
 528                      {
 529                          $corresponding = $i;
 530                          while ($j < $other_len && $other_changed[$j])
 531                          {
 532                              $j++;
 533                          }
 534                      }
 535                  }
 536              }
 537              while ($runlength != $i - $start);
 538  
 539              // If possible, move the fully-merged run of changes back to a corresponding run in the other file.
 540              while ($corresponding < $i)
 541              {
 542                  $changed[--$start] = 1;
 543                  $changed[--$i] = 0;
 544  
 545                  while ($other_changed[--$j])
 546                  {
 547                      continue;
 548                  }
 549              }
 550          }
 551      }
 552  }
 553  
 554  ?>
PHP Cross Reference of Unnamed Project

/includes/diff/ -> engine.php (source)