comparison kdiff3/src-QT4/gnudiff_io.cpp @ 75:08ea9b86c12c

KDiff3-0.9.91
author joachim99
date Sat, 04 Nov 2006 00:05:00 +0000
parents kdiff3/src/gnudiff_io.cpp@8febbfb1148c
children
comparison
equal deleted inserted replaced
74:069521efec1a 75:08ea9b86c12c
1 /* File I/O for GNU DIFF.
2
3 Modified for KDiff3 by Joachim Eibl 2003, 2004, 2005.
4 The original file was part of GNU DIFF.
5
6 Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002
7 Free Software Foundation, Inc.
8
9 GNU DIFF is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 2, or (at your option)
12 any later version.
13
14 GNU DIFF is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; see the file COPYING.
21 If not, write to the Free Software Foundation,
22 51 Franklin Steet, Fifth Floor, Boston, MA 02110-1301, USA. */
23
24 #include "gnudiff_diff.h"
25 #include <stdlib.h>
26
27 /* Rotate an unsigned value to the left. */
28 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
29
30 /* Given a hash value and a new character, return a new hash value. */
31 #define HASH(h, c) ((c) + ROL (h, 7))
32
33 /* The type of a hash value. */
34 typedef size_t hash_value;
35 verify (hash_value_is_unsigned, ! TYPE_SIGNED (hash_value));
36
37 /* Lines are put into equivalence classes of lines that match in lines_differ.
38 Each equivalence class is represented by one of these structures,
39 but only while the classes are being computed.
40 Afterward, each class is represented by a number. */
41 struct equivclass
42 {
43 lin next; /* Next item in this bucket. */
44 hash_value hash; /* Hash of lines in this class. */
45 const QChar *line; /* A line that fits this class. */
46 size_t length; /* That line's length, not counting its newline. */
47 };
48
49 /* Hash-table: array of buckets, each being a chain of equivalence classes.
50 buckets[-1] is reserved for incomplete lines. */
51 static lin *buckets;
52
53 /* Number of buckets in the hash table array, not counting buckets[-1]. */
54 static size_t nbuckets;
55
56 /* Array in which the equivalence classes are allocated.
57 The bucket-chains go through the elements in this array.
58 The number of an equivalence class is its index in this array. */
59 static struct equivclass *equivs;
60
61 /* Index of first free element in the array `equivs'. */
62 static lin equivs_index;
63
64 /* Number of elements allocated in the array `equivs'. */
65 static lin equivs_alloc;
66
67
68 /* Check for binary files and compare them for exact identity. */
69
70 /* Return 1 if BUF contains a non text character.
71 SIZE is the number of characters in BUF. */
72
73 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
74
75 /* Compare two lines (typically one from each input file)
76 according to the command line options.
77 For efficiency, this is invoked only when the lines do not match exactly
78 but an option like -i might cause us to ignore the difference.
79 Return nonzero if the lines differ. */
80
81 bool GnuDiff::lines_differ (const QChar *s1, size_t len1, const QChar *s2, size_t len2 )
82 {
83 const QChar *t1 = s1;
84 const QChar *t2 = s2;
85 const QChar *s1end = s1+len1;
86 const QChar *s2end = s2+len2;
87
88 for ( ; ; ++t1, ++t2 )
89 {
90 /* Test for exact char equality first, since it's a common case. */
91 if ( t1!=s1end && t2!=s2end && *t1==*t2 )
92 continue;
93 else
94 {
95 while ( t1!=s1end &&
96 ( bIgnoreWhiteSpace && isWhite( *t1 ) ||
97 bIgnoreNumbers && (t1->isDigit() || *t1=='-' || *t1=='.' )))
98 {
99 ++t1;
100 }
101
102 while ( t2 != s2end &&
103 ( bIgnoreWhiteSpace && isWhite( *t2 ) ||
104 bIgnoreNumbers && (t2->isDigit() || *t2=='-' || *t2=='.' )))
105 {
106 ++t2;
107 }
108
109 if ( t1!=s1end && t2!=s2end )
110 {
111 if (ignore_case)
112 { /* Lowercase comparison. */
113 if ( t1->toLower() == t2->toLower() )
114 continue;
115 }
116 else if ( *t1 == *t2 )
117 continue;
118 else
119 return true;
120 }
121 else if ( t1==s1end && t2==s2end )
122 return false;
123 else
124 return true;
125 }
126 }
127 return false;
128 }
129
130
131 /* Split the file into lines, simultaneously computing the equivalence
132 class for each line. */
133
134 void GnuDiff::find_and_hash_each_line (struct file_data *current)
135 {
136 hash_value h;
137 const QChar *p = current->prefix_end;
138 QChar c;
139 lin i, *bucket;
140 size_t length;
141
142 /* Cache often-used quantities in local variables to help the compiler. */
143 const QChar **linbuf = current->linbuf;
144 lin alloc_lines = current->alloc_lines;
145 lin line = 0;
146 lin linbuf_base = current->linbuf_base;
147 lin *cureqs = (lin*)xmalloc (alloc_lines * sizeof *cureqs);
148 struct equivclass *eqs = equivs;
149 lin eqs_index = equivs_index;
150 lin eqs_alloc = equivs_alloc;
151 const QChar *suffix_begin = current->suffix_begin;
152 const QChar *bufend = current->buffer + current->buffered;
153 bool diff_length_compare_anyway =
154 ignore_white_space != IGNORE_NO_WHITE_SPACE || bIgnoreNumbers;
155 bool same_length_diff_contents_compare_anyway =
156 diff_length_compare_anyway | ignore_case;
157
158 while ( p < suffix_begin)
159 {
160 const QChar *ip = p;
161
162 h = 0;
163
164 /* Hash this line until we find a newline or bufend is reached. */
165 if (ignore_case)
166 switch (ignore_white_space)
167 {
168 case IGNORE_ALL_SPACE:
169 while ( p<bufend && (c = *p) != '\n' )
170 {
171 if (! (isWhite(c) || bIgnoreNumbers && (c.isDigit() || c=='-' || c=='.' ) ))
172 h = HASH (h, c.toLower().unicode());
173 ++p;
174 }
175 break;
176
177 default:
178 while ( p<bufend && (c = *p) != '\n' )
179 {
180 h = HASH (h, c.toLower().unicode());
181 ++p;
182 }
183 break;
184 }
185 else
186 switch (ignore_white_space)
187 {
188 case IGNORE_ALL_SPACE:
189 while ( p<bufend && (c = *p) != '\n')
190 {
191 if (! (isWhite(c)|| bIgnoreNumbers && (c.isDigit() || c=='-' || c=='.' ) ))
192 h = HASH (h, c.unicode());
193 ++p;
194 }
195 break;
196
197 default:
198 while ( p<bufend && (c = *p) != '\n')
199 {
200 h = HASH (h, c.unicode());
201 ++p;
202 }
203 break;
204 }
205
206 bucket = &buckets[h % nbuckets];
207 length = p - ip;
208 ++p;
209
210 for (i = *bucket; ; i = eqs[i].next)
211 if (!i)
212 {
213 /* Create a new equivalence class in this bucket. */
214 i = eqs_index++;
215 if (i == eqs_alloc)
216 {
217 if ((lin)(PTRDIFF_MAX / (2 * sizeof *eqs)) <= eqs_alloc)
218 xalloc_die ();
219 eqs_alloc *= 2;
220 eqs = (equivclass*)xrealloc (eqs, eqs_alloc * sizeof *eqs);
221 }
222 eqs[i].next = *bucket;
223 eqs[i].hash = h;
224 eqs[i].line = ip;
225 eqs[i].length = length;
226 *bucket = i;
227 break;
228 }
229 else if (eqs[i].hash == h)
230 {
231 const QChar *eqline = eqs[i].line;
232
233 /* Reuse existing class if lines_differ reports the lines
234 equal. */
235 if (eqs[i].length == length)
236 {
237 /* Reuse existing equivalence class if the lines are identical.
238 This detects the common case of exact identity
239 faster than lines_differ would. */
240 if (memcmp (eqline, ip, length*sizeof(QChar)) == 0)
241 break;
242 if (!same_length_diff_contents_compare_anyway)
243 continue;
244 }
245 else if (!diff_length_compare_anyway)
246 continue;
247
248 if (! lines_differ (eqline, eqs[i].length, ip, length))
249 break;
250 }
251
252 /* Maybe increase the size of the line table. */
253 if (line == alloc_lines)
254 {
255 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
256 if ((lin)(PTRDIFF_MAX / 3) <= alloc_lines
257 || (lin)(PTRDIFF_MAX / sizeof *cureqs) <= 2 * alloc_lines - linbuf_base
258 || (lin)(PTRDIFF_MAX / sizeof *linbuf) <= alloc_lines - linbuf_base)
259 xalloc_die ();
260 alloc_lines = 2 * alloc_lines - linbuf_base;
261 cureqs =(lin*) xrealloc (cureqs, alloc_lines * sizeof *cureqs);
262 linbuf += linbuf_base;
263 linbuf = (const QChar**) xrealloc (linbuf,
264 (alloc_lines - linbuf_base) * sizeof *linbuf);
265 linbuf -= linbuf_base;
266 }
267 linbuf[line] = ip;
268 cureqs[line] = i;
269 ++line;
270 }
271
272 current->buffered_lines = line;
273
274 for (i = 0; ; i++)
275 {
276 /* Record the line start for lines in the suffix that we care about.
277 Record one more line start than lines,
278 so that we can compute the length of any buffered line. */
279 if (line == alloc_lines)
280 {
281 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
282 if ((lin)(PTRDIFF_MAX / 3) <= alloc_lines
283 || (lin)(PTRDIFF_MAX / sizeof *cureqs) <= 2 * alloc_lines - linbuf_base
284 || (lin)(PTRDIFF_MAX / sizeof *linbuf) <= alloc_lines - linbuf_base)
285 xalloc_die ();
286 alloc_lines = 2 * alloc_lines - linbuf_base;
287 linbuf += linbuf_base;
288 linbuf = (const QChar**)xrealloc (linbuf,
289 (alloc_lines - linbuf_base) * sizeof *linbuf);
290 linbuf -= linbuf_base;
291 }
292 linbuf[line] = p;
293
294 if ( p >= bufend)
295 break;
296
297 if (context <= i && no_diff_means_no_output)
298 break;
299
300 line++;
301
302 while (p<bufend && *p++ != '\n')
303 continue;
304 }
305
306 /* Done with cache in local variables. */
307 current->linbuf = linbuf;
308 current->valid_lines = line;
309 current->alloc_lines = alloc_lines;
310 current->equivs = cureqs;
311 equivs = eqs;
312 equivs_alloc = eqs_alloc;
313 equivs_index = eqs_index;
314 }
315
316 /* We have found N lines in a buffer of size S; guess the
317 proportionate number of lines that will be found in a buffer of
318 size T. However, do not guess a number of lines so large that the
319 resulting line table might cause overflow in size calculations. */
320 static lin
321 guess_lines (lin n, size_t s, size_t t)
322 {
323 size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
324 lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
325 return MIN (guessed_lines, (lin)(PTRDIFF_MAX / (2 * sizeof (QChar *) + 1) - 5)) + 5;
326 }
327
328 /* Given a vector of two file_data objects, find the identical
329 prefixes and suffixes of each object. */
330
331 void GnuDiff::find_identical_ends (struct file_data filevec[])
332 {
333 /* Find identical prefix. */
334 const QChar *p0, *p1, *buffer0, *buffer1;
335 p0 = buffer0 = filevec[0].buffer;
336 p1 = buffer1 = filevec[1].buffer;
337 size_t n0, n1;
338 n0 = filevec[0].buffered;
339 n1 = filevec[1].buffered;
340 const QChar* const pEnd0 = p0 + n0;
341 const QChar* const pEnd1 = p1 + n1;
342
343 if (p0 == p1)
344 /* The buffers are the same; sentinels won't work. */
345 p0 = p1 += n1;
346 else
347 {
348 /* Loop until first mismatch, or end. */
349 while ( p0!=pEnd0 && p1!=pEnd1 && *p0 == *p1 )
350 {
351 p0++;
352 p1++;
353 }
354 }
355
356 /* Now P0 and P1 point at the first nonmatching characters. */
357
358 /* Skip back to last line-beginning in the prefix. */
359 while (p0 != buffer0 && (p0[-1] != '\n' ))
360 p0--, p1--;
361
362 /* Record the prefix. */
363 filevec[0].prefix_end = p0;
364 filevec[1].prefix_end = p1;
365
366 /* Find identical suffix. */
367
368 /* P0 and P1 point beyond the last chars not yet compared. */
369 p0 = buffer0 + n0;
370 p1 = buffer1 + n1;
371
372 const QChar *end0, *beg0;
373 end0 = p0; /* Addr of last char in file 0. */
374
375 /* Get value of P0 at which we should stop scanning backward:
376 this is when either P0 or P1 points just past the last char
377 of the identical prefix. */
378 beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
379
380 /* Scan back until chars don't match or we reach that point. */
381 for (; p0 != beg0; p0--, p1--)
382 {
383 if (*p0 != *p1)
384 {
385 /* Point at the first char of the matching suffix. */
386 beg0 = p0;
387 break;
388 }
389 }
390
391 // Go to the next line (skip last line with a difference)
392 if ( p0 != end0 )
393 {
394 if (*p0 != *p1)
395 ++p0;
396 while ( p0<pEnd0 && *p0++ != '\n')
397 continue;
398 }
399
400 p1 += p0 - beg0;
401
402 /* Record the suffix. */
403 filevec[0].suffix_begin = p0;
404 filevec[1].suffix_begin = p1;
405
406 /* Calculate number of lines of prefix to save.
407
408 prefix_count == 0 means save the whole prefix;
409 we need this for options like -D that output the whole file,
410 or for enormous contexts (to avoid worrying about arithmetic overflow).
411 We also need it for options like -F that output some preceding line;
412 at least we will need to find the last few lines,
413 but since we don't know how many, it's easiest to find them all.
414
415 Otherwise, prefix_count != 0. Save just prefix_count lines at start
416 of the line buffer; they'll be moved to the proper location later.
417 Handle 1 more line than the context says (because we count 1 too many),
418 rounded up to the next power of 2 to speed index computation. */
419
420 const QChar **linbuf0, **linbuf1;
421 lin alloc_lines0, alloc_lines1;
422 lin buffered_prefix, prefix_count, prefix_mask;
423 lin middle_guess, suffix_guess;
424 if (no_diff_means_no_output
425 && context < (lin)(LIN_MAX / 4) && context < (lin)(n0))
426 {
427 middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
428 suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
429 for (prefix_count = 1; prefix_count <= context; prefix_count *= 2)
430 continue;
431 alloc_lines0 = (prefix_count + middle_guess
432 + MIN (context, suffix_guess));
433 }
434 else
435 {
436 prefix_count = 0;
437 alloc_lines0 = guess_lines (0, 0, n0);
438 }
439
440 prefix_mask = prefix_count - 1;
441 lin lines = 0;
442 linbuf0 = (const QChar**) xmalloc (alloc_lines0 * sizeof(*linbuf0));
443 p0 = buffer0;
444
445 /* If the prefix is needed, find the prefix lines. */
446 if (! (no_diff_means_no_output
447 && filevec[0].prefix_end == p0
448 && filevec[1].prefix_end == p1))
449 {
450 end0 = filevec[0].prefix_end;
451 while (p0 != end0)
452 {
453 lin l = lines++ & prefix_mask;
454 if (l == alloc_lines0)
455 {
456 if ((lin)(PTRDIFF_MAX / (2 * sizeof *linbuf0)) <= alloc_lines0)
457 xalloc_die ();
458 alloc_lines0 *= 2;
459 linbuf0 = (const QChar**) xrealloc (linbuf0, alloc_lines0 * sizeof(*linbuf0));
460 }
461 linbuf0[l] = p0;
462 while ( p0<pEnd0 && *p0++ != '\n' )
463 continue;
464 }
465 }
466 buffered_prefix = prefix_count && context < lines ? context : lines;
467
468 /* Allocate line buffer 1. */
469
470 middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
471 suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
472 alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
473 if (alloc_lines1 < buffered_prefix
474 || (lin)(PTRDIFF_MAX / sizeof *linbuf1) <= alloc_lines1)
475 xalloc_die ();
476 linbuf1 = (const QChar**)xmalloc (alloc_lines1 * sizeof(*linbuf1));
477
478 lin i;
479 if (buffered_prefix != lines)
480 {
481 /* Rotate prefix lines to proper location. */
482 for (i = 0; i < buffered_prefix; i++)
483 linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
484 for (i = 0; i < buffered_prefix; i++)
485 linbuf0[i] = linbuf1[i];
486 }
487
488 /* Initialize line buffer 1 from line buffer 0. */
489 for (i = 0; i < buffered_prefix; i++)
490 linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
491
492 /* Record the line buffer, adjusted so that
493 linbuf[0] points at the first differing line. */
494 filevec[0].linbuf = linbuf0 + buffered_prefix;
495 filevec[1].linbuf = linbuf1 + buffered_prefix;
496 filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
497 filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
498 filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
499 filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
500 }
501
502 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
503 than 2**k. This table is derived from Chris K. Caldwell's list
504 <http://www.utm.edu/research/primes/lists/2small/>. */
505
506 static unsigned char const prime_offset[] =
507 {
508 0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
509 15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
510 11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
511 55, 93, 1, 57, 25
512 };
513
514 /* Verify that this host's size_t is not too wide for the above table. */
515
516 verify (enough_prime_offsets,
517 sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
518
519 /* Given a vector of two file_data objects, read the file associated
520 with each one, and build the table of equivalence classes.
521 Return nonzero if either file appears to be a binary file.
522 If PRETEND_BINARY is nonzero, pretend they are binary regardless. */
523
524 bool
525 GnuDiff::read_files (struct file_data filevec[], bool /*pretend_binary*/)
526 {
527 int i;
528
529 find_identical_ends (filevec);
530
531 equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
532 if ((lin)(PTRDIFF_MAX / sizeof *equivs) <= equivs_alloc)
533 xalloc_die ();
534 equivs = (equivclass*)xmalloc (equivs_alloc * sizeof *equivs);
535 /* Equivalence class 0 is permanently safe for lines that were not
536 hashed. Real equivalence classes start at 1. */
537 equivs_index = 1;
538
539 /* Allocate (one plus) a prime number of hash buckets. Use a prime
540 number between 1/3 and 2/3 of the value of equiv_allocs,
541 approximately. */
542 for (i = 9; 1 << i < equivs_alloc / 3; i++)
543 continue;
544 nbuckets = ((size_t) 1 << i) - prime_offset[i];
545 if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
546 xalloc_die ();
547 buckets = (lin*)zalloc ((nbuckets + 1) * sizeof *buckets);
548 buckets++;
549
550 for (i = 0; i < 2; i++)
551 find_and_hash_each_line (&filevec[i]);
552
553 filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
554
555 free (equivs);
556 free (buckets - 1);
557
558 return 0;
559 }