001/*
002 * $Log: Diff.java,v $
003 * Revision 1.7  2009/01/19 03:05:26  stuart
004 * Fix StackOverflow bug with heuristic on reported by Jimmy Han.
005 *
006 * Revision 1.6  2003/03/06 22:51:32  stuart
007 * Convert to CVS
008 *
009 * Revision 1.5  2002/07/19  19:14:40  stuart
010 * fix reverseScript, make change ctor public, update docs
011 *
012 * Revision 1.4  2002/04/09  17:53:39  stuart
013 * More flexible interface for diff() function.
014 *
015 * Revision 1.3  2000/03/03 21:58:03  stuart
016 * move discard_confusing_lines and shift_boundaries to class file_data
017 *
018 * Revision 1.2  2000/03/02  16:37:38  stuart
019 * Add GPL and copyright
020 *
021*/
022// package bmsi.util;
023package ca.bc.webarts.tools;
024
025import java.util.Hashtable;
026
027/** A class to compare vectors of objects.  The result of comparison
028 * is a list of <code>change</code> objects which form an
029 * edit script.  The objects compared are traditionally lines
030 * of text from two files.  Comparison options such as "ignore
031 * whitespace" are implemented by modifying the <code>equals</code>
032 * and <code>hashcode</code> methods for the objects compared.
033 * <p>
034 * The basic algorithm is described in: </br>
035 * "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
036 * Algorithmica Vol. 1 No. 2, 1986, p 251.
037 * <p>
038 * This class outputs different results from GNU diff 1.15 on some
039 * inputs.  Our results are actually better (smaller change list, smaller
040 * total size of changes), but it would be nice to know why.  Perhaps
041 * there is a memory overwrite bug in GNU diff 1.15.
042 *
043 * @author Stuart D. Gathman, translated from GNU diff 1.15
044 * Copyright (C) 2000  Business Management Systems, Inc.
045 * <p>
046 * This program is free software; you can redistribute it and/or modify
047 * it under the terms of the GNU General Public License as published by
048 * the Free Software Foundation; either version 1, or (at your option)
049 * any later version.
050 * <p>
051 * This program is distributed in the hope that it will be useful,
052 * but WITHOUT ANY WARRANTY; without even the implied warranty of
053 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
054 * GNU General Public License for more details.
055 * <p>
056 * You should have received a copy of the <a href=COPYING.txt>
057 * GNU General Public License</a>
058 * along with this program; if not, write to the Free Software
059 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
060 *
061 */
062
063public class Diff
064{
065
066  /** Prepare to find differences between two arrays.  Each element of
067   * the arrays is translated to an "equivalence number" based on
068   * the result of <code>equals</code>.  The original Object arrays
069   * are no longer needed for computing the differences.  They will
070   * be needed again later to print the results of the comparison as
071   * an edit script, if desired.
072   */
073  public Diff(Object[] a, Object[] b)
074  {
075    Hashtable h = new Hashtable(a.length + b.length);
076    filevec[0] = new file_data(a, h);
077    filevec[1] = new file_data(b, h);
078  }
079
080  /** 1 more than the maximum equivalence value used for this or its
081   * sibling file. */
082  private int equiv_max = 1;
083
084  /** When set to true, the comparison uses a heuristic to speed it up.
085   * With this heuristic, for files with a constant small density
086   * of changes, the algorithm is linear in the file size.  */
087  public boolean heuristic = false;
088
089  /** When set to true, the algorithm returns a guarranteed minimal
090   * set of changes.  This makes things slower, sometimes much slower. */
091  public boolean no_discards = false;
092
093  private int[] xvec, yvec;  /* Vectors being compared. */
094  private int[] fdiag;  /* Vector, indexed by diagonal, containing
095           the X coordinate of the point furthest
096           along the given diagonal in the forward
097  search of the edit matrix. */
098  private int[] bdiag;  /* Vector, indexed by diagonal, containing
099           the X coordinate of the point furthest
100           along the given diagonal in the backward
101  search of the edit matrix. */
102  private int fdiagoff, bdiagoff;
103  private final file_data[] filevec = new file_data[2];
104  private int cost;
105  /** Snakes bigger than this are considered "big". */
106  private static final int SNAKE_LIMIT = 20;
107
108  /** Find the midpoint of the shortest edit script for a specified
109   * portion of the two files.
110   *
111   * We scan from the beginnings of the files, and simultaneously from the ends,
112   * doing a breadth-first search through the space of edit-sequence.
113   * When the two searches meet, we have found the midpoint of the shortest
114   * edit sequence.
115   *
116   * The value returned is the number of the diagonal on which the midpoint lies.
117   * The diagonal number equals the number of inserted lines minus the number
118   * of deleted lines (counting only lines before the midpoint).
119   * The edit cost is stored into COST; this is the total number of
120   * lines inserted or deleted (counting only lines before the midpoint).
121   *
122   * This function assumes that the first lines of the specified portions
123   * of the two files do not match, and likewise that the last lines do not
124   * match.  The caller must trim matching lines from the beginning and end
125   * of the portions it is going to specify.
126   *
127   * Note that if we return the "wrong" diagonal value, or if
128   * the value of bdiag at that diagonal is "wrong",
129   * the worst this can do is cause suboptimal diff output.
130   * It cannot cause incorrect diff output.  */
131
132  private int diag(int xoff, int xlim, int yoff, int ylim)
133  {
134    final int[] fd = fdiag;    // Give the compiler a chance.
135    final int[] bd = bdiag;    // Additional help for the compiler.
136    final int[] xv = xvec;    // Still more help for the compiler.
137    final int[] yv = yvec;    // And more and more . . .
138    final int dmin = xoff - ylim;    // Minimum valid diagonal.
139    final int dmax = xlim - yoff;    // Maximum valid diagonal.
140    final int fmid = xoff - yoff;    // Center diagonal of top-down search.
141    final int bmid = xlim - ylim;    // Center diagonal of bottom-up search.
142    int fmin = fmid, fmax = fmid;    // Limits of top-down search.
143    int bmin = bmid, bmax = bmid;    // Limits of bottom-up search.
144    /* True if southeast corner is on an odd
145    diagonal with respect to the northwest. */
146    final boolean odd = (fmid - bmid & 1) != 0;
147
148    fd[ fdiagoff + fmid] = xoff;
149    bd[ bdiagoff + bmid] = xlim;
150
151    for (int c = 1;; ++c)
152    {
153      int d;      /* Active diagonal. */
154      boolean big_snake = false;
155
156      /* Extend the top-down search by an edit step in each diagonal. */
157      if (fmin > dmin)
158      {
159        fd[ fdiagoff + -- fmin - 1] = -1;
160      }
161      else
162      {
163        ++fmin;
164      }
165      if (fmax < dmax)
166      {
167        fd[ fdiagoff + ++ fmax + 1] = -1;
168      }
169      else
170      {
171        --fmax;
172      }
173      for (d = fmax; d >= fmin; d -= 2)
174      {
175        int x, y, oldx, tlo = fd[ fdiagoff + d - 1], thi = fd[ fdiagoff + d + 1];
176
177        if (tlo >= thi)
178        {
179          x = tlo + 1;
180        }
181        else
182        {
183          x = thi;
184        }
185        oldx = x;
186        y = x - d;
187        while (x < xlim && y < ylim && xv[x] == yv[y])
188        {
189          ++x; ++y;
190        }
191        if (x - oldx > SNAKE_LIMIT)
192        {
193          big_snake = true;
194        }
195        fd[ fdiagoff + d] = x;
196        if (odd && bmin <= d && d <= bmax && bd[ bdiagoff + d] <= fd[ fdiagoff + d])
197        {
198          cost = 2 * c - 1;
199          return d;
200        }
201      }
202
203      /* Similar extend the bottom-up search. */
204      if (bmin > dmin)
205      {
206        bd[ bdiagoff + -- bmin - 1] = Integer.MAX_VALUE;
207      }
208      else
209      {
210        ++bmin;
211      }
212      if (bmax < dmax)
213      {
214        bd[ bdiagoff + ++ bmax + 1] = Integer.MAX_VALUE;
215      }
216      else
217      {
218        --bmax;
219      }
220      for (d = bmax; d >= bmin; d -= 2)
221      {
222        int x, y, oldx, tlo = bd[ bdiagoff + d - 1], thi = bd[ bdiagoff + d + 1];
223
224        if (tlo < thi)
225        {
226          x = tlo;
227        }
228        else
229        {
230          x = thi - 1;
231        }
232        oldx = x;
233        y = x - d;
234        while (x > xoff && y > yoff && xv[ x - 1] == yv[ y - 1])
235        {
236          --x; --y;
237        }
238        if (oldx - x > SNAKE_LIMIT)
239        {
240          big_snake = true;
241        }
242        bd[ bdiagoff + d] = x;
243        if (! odd && fmin <= d && d <= fmax && bd[ bdiagoff + d] <= fd[ fdiagoff + d])
244        {
245          cost = 2 * c;
246          return d;
247        }
248      }
249
250      /* Heuristic: check occasionally for a diagonal that has made
251     lots of progress compared with the edit distance.
252     If we have any such, find the one that has made the most
253     progress and return it as if it had succeeded.
254
255     With this heuristic, for files with a constant small density
256      of changes, the algorithm is linear in the file size.  */
257
258      if (c > 200 && big_snake && heuristic)
259      {
260        int best = 0;
261        int bestpos = -1;
262
263        for (d = fmax; d >= fmin; d -= 2)
264        {
265          int dd = d - fmid;
266          int x = fd[ fdiagoff + d];
267          int y = x - d;
268          int v = (x - xoff) * 2 - dd;
269          if (v > 12 * (c + (dd < 0 ? - dd : dd)))
270          {
271            if (v > best && xoff + SNAKE_LIMIT <= x && x < xlim && yoff + SNAKE_LIMIT <= y && y < ylim)
272            {
273              /* We have a good enough best diagonal;
274              now insist that it end with a significant snake.  */
275              int k;
276
277              for (k = 1; xvec[ x - k] == yvec[ y - k]; k++)
278              {
279                if (k == SNAKE_LIMIT)
280                {
281                  best = v;
282                  bestpos = d;
283                  break;
284                }
285              }
286            }
287          }
288        }
289        if (best > 0)
290        {
291          cost = 2 * c - 1;
292          return bestpos;
293        }
294
295        best = 0;
296        for (d = bmax; d >= bmin; d -= 2)
297        {
298          int dd = d - bmid;
299          int x = bd[ bdiagoff + d];
300          int y = x - d;
301          int v = (xlim - x) * 2 + dd;
302          if (v > 12 * (c + (dd < 0 ? - dd : dd)))
303          {
304            if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT && yoff < y && y <= ylim - SNAKE_LIMIT)
305            {
306              /* We have a good enough best diagonal;
307              now insist that it end with a significant snake.  */
308              int k;
309
310              for (k = 0; xvec[ x + k] == yvec[ y + k]; k++)
311              {
312                if (k == SNAKE_LIMIT)
313                {
314                  best = v;
315                  bestpos = d;
316                  break;
317                }
318              }
319            }
320          }
321        }
322        if (best > 0)
323        {
324          cost = 2 * c - 1;
325          return bestpos;
326        }
327      }
328    }
329  }
330
331  /** Compare in detail contiguous subsequences of the two files
332   * which are known, as a whole, to match each other.
333   *
334   * The results are recorded in the vectors filevec[N].changed_flag, by
335   * storing a 1 in the element for each line that is an insertion or deletion.
336   *
337   * The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
338   *
339   * Note that XLIM, YLIM are exclusive bounds.
340   * All line numbers are origin-0 and discarded lines are not counted.  */
341
342  private void compareseq(int xoff, int xlim, int yoff, int ylim)
343  {
344    /* Slide down the bottom initial diagonal. */
345    while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff])
346    {
347      ++xoff; ++yoff;
348    }
349    /* Slide up the top initial diagonal. */
350    while (xlim > xoff && ylim > yoff && xvec[ xlim - 1] == yvec[ ylim - 1])
351    {
352      --xlim; --ylim;
353    }
354
355    /* Handle simple cases. */
356    if (xoff == xlim)
357    {
358      while (yoff < ylim)
359      {
360        filevec[1].changed_flag[1 + filevec[1].realindexes[yoff++]] = true;
361      }
362    }
363    else if (yoff == ylim)
364    {
365      while (xoff < xlim)
366      {
367        filevec[0].changed_flag[1 + filevec[0].realindexes[xoff++]] = true;
368      }
369    }
370    else
371    {
372      /* Find a point of correspondence in the middle of the files.  */
373
374      int d = diag(xoff, xlim, yoff, ylim);
375      int c = cost;
376      int f = fdiag[ fdiagoff + d];
377      int b = bdiag[ bdiagoff + d];
378
379      if (c == 1)
380      {
381        /* This should be impossible, because it implies that
382         one of the two subsequences is empty,
383         and that case was handled above without calling `diag'.
384        Let's verify that this is true.  */
385        throw new IllegalArgumentException("Empty subsequence");
386      }
387      else
388      {
389        /* Use that point to split this problem into two subproblems.  */
390        compareseq(xoff, b, yoff, b - d);
391        /* This used to use f instead of b,
392         but that is incorrect!
393         It is not necessarily the case that diagonal d
394        has a snake from b to f.  */
395        compareseq(b, xlim, b - d, ylim);
396      }
397    }
398  }
399
400  /** Discard lines from one file that have no matches in the other file.
401   */
402
403  private void discard_confusing_lines()
404  {
405    filevec[0].discard_confusing_lines(filevec[1]);
406    filevec[1].discard_confusing_lines(filevec[0]);
407  }
408
409  private boolean inhibit = false;
410
411  /** Adjust inserts/deletes of blank lines to join changes
412   * as much as possible.
413   */
414
415  private void shift_boundaries()
416  {
417    if (inhibit)
418    {
419      return;
420    }
421    filevec[0].shift_boundaries(filevec[1]);
422    filevec[1].shift_boundaries(filevec[0]);
423  }
424
425  public interface ScriptBuilder
426  {
427    /** Scan the tables of which lines are inserted and deleted,
428     * producing an edit script.
429     * @param changed0 true for lines in first file which do not match 2nd
430     * @param len0 number of lines in first file
431     * @param changed1 true for lines in 2nd file which do not match 1st
432     * @param len1 number of lines in 2nd file
433     * @return a linked list of changes - or null
434     */
435    public change build_script( boolean[] changed0, int len0, boolean[] changed1, int len1 );
436  }
437
438  /** Scan the tables of which lines are inserted and deleted,
439   * producing an edit script in reverse order.  */
440
441  static class ReverseScript implements ScriptBuilder
442  {
443    public change build_script( final boolean[] changed0, int len0, final boolean[] changed1, int len1)
444    {
445      change script = null;
446      int i0 = 0, i1 = 0;
447      while (i0 < len0 || i1 < len1)
448      {
449        if (changed0[1 + i0] || changed1[1 + i1])
450        {
451          int line0 = i0, line1 = i1;
452
453          /* Find # lines changed here in each file.  */
454          while (changed0[1 + i0])
455          {
456            ++i0;
457          }
458
459          while (changed1[1 + i1])
460          {
461            ++i1;
462          }
463
464
465          /* Record this change.  */
466          script = new change(line0, line1, i0 - line0, i1 - line1, script);
467        }
468
469        /* We have reached lines in the two files that match each other.  */
470        i0++; i1++;
471      }
472
473      return script;
474    }
475  }
476
477  static class ForwardScript implements ScriptBuilder
478  {
479    /** Scan the tables of which lines are inserted and deleted,
480     * producing an edit script in forward order.  */
481    public change build_script( final boolean[] changed0, int len0, final boolean[] changed1, int len1)
482    {
483      change script = null;
484      int i0 = len0, i1 = len1;
485
486      while (i0 >= 0 || i1 >= 0)
487      {
488        if (changed0[i0] || changed1[i1])
489        {
490          int line0 = i0, line1 = i1;
491
492          /* Find # lines changed here in each file.  */
493          while (changed0[i0])
494          {
495            --i0;
496          }
497
498          while (changed1[i1])
499          {
500            --i1;
501          }
502
503
504          /* Record this change.  */
505          script = new change(i0, i1, line0 - i0, line1 - i1, script);
506        }
507
508        /* We have reached lines in the two files that match each other.  */
509        i0--; i1--;
510      }
511
512      return script;
513    }
514  }
515
516  /** Standard ScriptBuilders. */
517
518  public static final ScriptBuilder forwardScript = new ForwardScript(), reverseScript = new ReverseScript();
519
520  /* Report the differences of two files.  DEPTH is the current directory
521  depth. */
522  public final change diff_2(final boolean reverse)
523  {
524    return diff(reverse ? reverseScript : forwardScript);
525  }
526
527  /** Get the results of comparison as an edit script.  The script
528   * is described by a list of changes.  The standard ScriptBuilder
529   * implementations provide for forward and reverse edit scripts.
530   * Alternate implementations could, for instance, list common elements
531   * instead of differences.
532   * @param bld an object to build the script from change flags
533   * @return the head of a list of changes
534   */
535  public change diff(final ScriptBuilder bld)
536  {
537
538    /* Some lines are obviously insertions or deletions
539       because they don't match anything.  Detect them now,
540    and avoid even thinking about them in the main comparison algorithm.  */
541
542    discard_confusing_lines();
543
544    /* Now do the main comparison algorithm, considering just the
545    undiscarded lines.  */
546
547    xvec = filevec[0].undiscarded;
548    yvec = filevec[1].undiscarded;
549
550    int diags = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3;
551    fdiag = new int[diags];
552    fdiagoff = filevec[1].nondiscarded_lines + 1;
553    bdiag = new int[diags];
554    bdiagoff = filevec[1].nondiscarded_lines + 1;
555
556    compareseq(0, filevec[0].nondiscarded_lines, 0, filevec[1].nondiscarded_lines);
557    fdiag = null;
558    bdiag = null;
559
560    /* Modify the results slightly to make them prettier
561    in cases where that can validly be done.  */
562
563    shift_boundaries();
564
565    /* Get the results of comparison in the form of a chain
566    of `struct change's -- an edit script.  */
567    return bld.build_script (filevec[0].changed_flag, filevec[0].buffered_lines, filevec[1].changed_flag, filevec[1].buffered_lines );
568
569  }
570
571  /** The result of comparison is an "edit script": a chain of change objects.
572   * Each change represents one place where some lines are deleted
573   * and some are inserted.
574   *
575   * LINE0 and LINE1 are the first affected lines in the two files (origin 0).
576   * DELETED is the number of lines deleted here from file 0.
577   * INSERTED is the number of lines inserted here in file 1.
578   *
579   * If DELETED is 0 then LINE0 is the number of the line before
580   * which the insertion was done; vice versa for INSERTED and LINE1.  */
581
582  public static class change
583  {
584    /** Previous or next edit command. */
585    public change link;
586    /** # lines of file 1 changed here.  */
587    public final int inserted;
588    /** # lines of file 0 changed here.  */
589    public final int deleted;
590    /** Line number of 1st deleted line.  */
591    public final int line0;
592    /** Line number of 1st inserted line.  */
593    public final int line1;
594
595    /** Cons an additional entry onto the front of an edit script OLD.
596     * LINE0 and LINE1 are the first affected lines in the two files (origin 0).
597     * DELETED is the number of lines deleted here from file 0.
598     * INSERTED is the number of lines inserted here in file 1.
599     *
600     * If DELETED is 0 then LINE0 is the number of the line before
601     * which the insertion was done; vice versa for INSERTED and LINE1.  */
602    public change(int line0, int line1, int deleted, int inserted, change old)
603    {
604      this.line0 = line0;
605      this.line1 = line1;
606      this.inserted = inserted;
607      this.deleted = deleted;
608      this.link = old;
609      // System.err.println(line0+","+line1+","+inserted+","+deleted);
610    }
611  }
612
613  /** Data on one input file being compared.
614   */
615
616  class file_data
617  {
618
619    /** Allocate changed array for the results of comparison.  */
620    void clear()
621    {
622      /* Allocate a flag for each line of each file, saying whether that line
623   is an insertion or deletion.
624   Allocate an extra element, always zero, at each end of each vector.
625      */
626      changed_flag = new boolean[ buffered_lines + 2];
627    }
628
629    /** Return equiv_count[I] as the number of lines in this file
630     * that fall in equivalence class I.
631     * @return the array of equivalence class counts.
632     */
633    int[] equivCount()
634    {
635      int[] equiv_count = new int[equiv_max];
636      for (int i = 0; i < buffered_lines; ++i)
637      {
638        ++equiv_count[equivs[i]];
639      }
640
641      return equiv_count;
642    }
643
644    /** Discard lines that have no matches in another file.
645     *
646     * A line which is discarded will not be considered by the actual
647     * comparison algorithm; it will be as if that line were not in the file.
648     * The file's `realindexes' table maps virtual line numbers
649     * (which don't count the discarded lines) into real line numbers;
650     * this is how the actual comparison algorithm produces results
651     * that are comprehensible when the discarded lines are counted.
652     * <p>
653     * When we discard a line, we also mark it as a deletion or insertion
654     * so that it will be printed in the output.
655     * @param f the other file
656     */
657    void discard_confusing_lines(file_data f)
658    {
659      clear();
660      /* Set up table of which lines are going to be discarded. */
661      final byte[] discarded = discardable(f.equivCount());
662
663      /* Don't really discard the provisional lines except when they occur
664       in a run of discardables, with nonprovisionals at the beginning
665      and end.  */
666      filterDiscards(discarded);
667
668      /* Actually discard the lines. */
669      discard(discarded);
670    }
671
672    /** Mark to be discarded each line that matches no line of another file.
673     * If a line matches many lines, mark it as provisionally discardable.
674     * @see equivCount()
675     * @param counts The count of each equivalence number for the other file.
676     * @return 0=nondiscardable, 1=discardable or 2=provisionally discardable
677     * for each line
678     */
679
680    private byte[] discardable(final int[] counts)
681    {
682      final int end = buffered_lines;
683      final byte[] discards = new byte[end];
684      final int[] equivs = this.equivs;
685      int many = 5;
686      int tem = end / 64;
687
688      /* Multiply MANY by approximate square root of number of lines.
689      That is the threshold for provisionally discardable lines.  */
690      while ((tem = tem >> 2) > 0)
691      {
692        many *= 2;
693      }
694
695
696      for (int i = 0; i < end; i++)
697      {
698        int nmatch;
699        if (equivs[i] == 0)
700        {
701          continue;
702        }
703        nmatch = counts[equivs[i]];
704        if (nmatch == 0)
705        {
706          discards[i] = 1;
707        }
708        else if (nmatch > many)
709        {
710          discards[i] = 2;
711        }
712      }
713      return discards;
714    }
715
716    /** Don't really discard the provisional lines except when they occur
717     * in a run of discardables, with nonprovisionals at the beginning
718     * and end.  */
719
720    private void filterDiscards(final byte[] discards)
721    {
722      final int end = buffered_lines;
723
724      for (int i = 0; i < end; i++)
725      {
726        /* Cancel provisional discards not in middle of run of discards.  */
727        if (discards[i] == 2)
728        {
729          discards[i] = 0;
730        }
731        else if (discards[i] != 0)
732        {
733          /* We have found a nonprovisional discard.  */
734          int j;
735          int length;
736          int provisional = 0;
737
738          /* Find end of this run of discardable lines.
739          Count how many are provisionally discardable.  */
740          for (j = i; j < end; j++)
741          {
742            if (discards[j] == 0)
743            {
744              break;
745            }
746            if (discards[j] == 2)
747            {
748              ++provisional;
749            }
750          }
751
752          /* Cancel provisional discards at end, and shrink the run.  */
753          while (j > i && discards[ j - 1] == 2)
754          {
755            discards[ --j] = 0; --provisional;
756          }
757
758          /* Now we have the length of a run of discardable lines
759          whose first and last are not provisional.  */
760          length = j - i;
761
762          /* If 1/4 of the lines in the run are provisional,
763          cancel discarding of all provisional lines in the run.  */
764          if (provisional * 4 > length)
765          {
766            while (j > i)
767            {
768              if (discards[ --j] == 2)
769              {
770                discards[j] = 0;
771              }
772            }
773          }
774          else
775          {
776            int consec;
777            int minimum = 1;
778            int tem = length / 4;
779
780            /* MINIMUM is approximate square root of LENGTH/4.
781           A subrun of two or more provisionals can stand
782           when LENGTH is at least 16.
783            A subrun of 4 or more can stand when LENGTH >= 64.  */
784            while ((tem = tem >> 2) > 0)
785            {
786              minimum *= 2;
787            }
788
789            minimum++;
790
791            /* Cancel any subrun of MINIMUM or more provisionals
792            within the larger run.  */
793            for (j = 0, consec = 0; j < length; j++)
794            {
795              if (discards[ i + j] != 2)
796              {
797                consec = 0;
798              }
799              else{}
800            }
801            if (minimum == ++consec)            /* Back up to start of subrun, to cancel it all.  */
802            {
803              j -= consec;
804            }
805            else if (minimum < consec)
806            {
807              discards[ i + j] = 0;
808            }
809
810            /* Scan from beginning of run
811           until we find 3 or more nonprovisionals in a row
812           or until the first nonprovisional at least 8 lines in.
813            Until that point, cancel any provisionals.  */
814            for (j = 0, consec = 0; j < length; j++)
815            {
816              if (j >= 8 && discards[ i + j] == 1)
817              {
818                break;
819              }
820              if (discards[ i + j] == 2)
821              {
822                consec = 0; discards[ i + j] = 0;
823              }
824              else if (discards[ i + j] == 0)
825              {
826                consec = 0;
827              }
828              else
829              {
830                consec++;
831              }
832              if (consec == 3)
833              {
834                break;
835              }
836            }
837
838            /* I advances to the last line of the run.  */
839            i += length - 1;
840
841            /* Same thing, from end.  */
842            for (j = 0, consec = 0; j < length; j++)
843            {
844              if (j >= 8 && discards[ i - j] == 1)
845              {
846                break;
847              }
848              if (discards[ i - j] == 2)
849              {
850                consec = 0; discards[ i - j] = 0;
851              }
852              else if (discards[ i - j] == 0)
853              {
854                consec = 0;
855              }
856              else
857              {
858                consec++;
859              }
860              if (consec == 3)
861              {
862                break;
863              }
864            }
865          }
866        }
867      }
868    }
869
870    /** Actually discard the lines.
871     * @param discards flags lines to be discarded
872     */
873    private void discard(final byte[] discards)
874    {
875      final int end = buffered_lines;
876      int j = 0;
877      for (int i = 0; i < end; ++i)
878      {
879        if (no_discards || discards[i] == 0)
880        {
881          undiscarded[j] = equivs[i];
882          realindexes[j++] = i;
883        }
884        else
885        {
886          changed_flag[1 + i] = true;
887        }
888      }
889
890      nondiscarded_lines = j;
891    }
892
893    file_data(Object[] data, Hashtable h)
894    {
895      buffered_lines = data.length;
896
897      equivs = new int[buffered_lines];
898      undiscarded = new int[buffered_lines];
899      realindexes = new int[buffered_lines];
900
901      for (int i = 0; i < data.length; ++i)
902      {
903        Integer ir = (Integer) h.get(data[i]);
904        if (ir == null)
905        {
906          h.put(data[i], new Integer(equivs[i] = equiv_max++));
907        }
908        else
909        {
910          equivs[i] = ir.intValue();
911        }
912      }
913    }
914
915    /** Adjust inserts/deletes of blank lines to join changes
916     * as much as possible.
917     *
918     * We do something when a run of changed lines include a blank
919     * line at one end and have an excluded blank line at the other.
920     * We are free to choose which blank line is included.
921     * `compareseq' always chooses the one at the beginning,
922     * but usually it is cleaner to consider the following blank line
923     * to be the "change".  The only exception is if the preceding blank line
924     * would join this change to other changes.
925     * @param f the file being compared against
926     */
927
928    void shift_boundaries(file_data f)
929    {
930      final boolean[] changed = changed_flag;
931      final boolean[] other_changed = f.changed_flag;
932      int i = 0;
933      int j = 0;
934      int i_end = buffered_lines;
935      int preceding = -1;
936      int other_preceding = -1;
937
938      for (;;)
939      {
940        int start, end, other_start;
941
942        /* Scan forwards to find beginning of another run of changes.
943        Also keep track of the corresponding point in the other file.  */
944
945        while (i < i_end && !changed[1 + i])
946        {
947          while (other_changed[1 + j++])          /* Non-corresponding lines in the other file
948          will count as the preceding batch of changes.  */
949          {
950            other_preceding = j;
951          }
952
953          i++;
954        }
955
956        if (i == i_end)
957        {
958          break;
959        }
960
961        start = i;
962        other_start = j;
963
964        for (;;)
965        {
966          /* Now find the end of this run of changes.  */
967
968          while (i < i_end && changed[1 + i])
969          {
970            i++;
971          }
972
973          end = i;
974
975          /* If the first changed line matches the following unchanged one,
976     and this run does not follow right after a previous run,
977     and there are no lines deleted from the other file here,
978     then classify the first changed line as unchanged
979          and the following line as changed in its place.  */
980
981          /* You might ask, how could this run follow right after another?
982          Only because the previous run was shifted here.  */
983
984          if (end != i_end && equivs[start] == equivs[end] && !other_changed[1 + j] && end != i_end && !((preceding >= 0 && start == preceding) || (other_preceding >= 0 && other_start == other_preceding)))
985          {
986            changed[1 + end++] = true;
987            changed[1 + start++] = false;
988            ++i;
989            /* Since one line-that-matches is now before this run
990         instead of after, we must advance in the other file
991            to keep in synch.  */
992            ++j;
993          }
994          else
995          {
996            break;
997          }
998        }
999
1000        preceding = i;
1001        other_preceding = j;
1002      }
1003    }
1004
1005    /** Number of elements (lines) in this file. */
1006    final int buffered_lines;
1007
1008    /** Vector, indexed by line number, containing an equivalence code for
1009     * each line.  It is this vector that is actually compared with that
1010     * of another file to generate differences. */
1011    private final int[] equivs;
1012
1013    /** Vector, like the previous one except that
1014     * the elements for discarded lines have been squeezed out.  */
1015    final int[] undiscarded;
1016
1017    /** Vector mapping virtual line numbers (not counting discarded lines)
1018     * to real ones (counting those lines).  Both are origin-0.  */
1019    final int[] realindexes;
1020
1021    /** Total number of nondiscarded lines. */
1022    int nondiscarded_lines;
1023
1024    /** Array, indexed by real origin-1 line number,
1025     * containing true for a line that is an insertion or a deletion.
1026     * The results of comparison are stored here.  */
1027    boolean[] changed_flag;
1028
1029  }
1030}
1031