comparison kdiff3/src/gnudiff_io.cpp @ 52:ba22ec30aa4e

GNU-Diff-Algorithms: Adapted for KDiff3-0.9.80
author joachim99
date Tue, 09 Dec 2003 20:34:32 +0000
parents
children 32d5cbf9db71
comparison
equal deleted inserted replaced
51:c59d5a3a8ff3 52:ba22ec30aa4e
1 /* File I/O for GNU DIFF.
2
3 Modified for KDiff3 by Joachim Eibl 2003.
4
5 Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002
6 Free Software Foundation, Inc.
7
8 This file is part of GNU DIFF.
9
10 GNU DIFF is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 GNU DIFF is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with this program; see the file COPYING.
22 If not, write to the Free Software Foundation,
23 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
24
25 #include "gnudiff_diff.h"
26 #include <stdlib.h>
27 #include "gnudiff_xalloc.h"
28
29 namespace GnuDiff
30 {
31
32 /* Rotate an unsigned value to the left. */
33 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
34
35 /* Given a hash value and a new character, return a new hash value. */
36 #define HASH(h, c) ((c) + ROL (h, 7))
37
38 /* The type of a hash value. */
39 typedef size_t hash_value;
40 verify (hash_value_is_unsigned, ! TYPE_SIGNED (hash_value));
41
42 /* Lines are put into equivalence classes of lines that match in lines_differ.
43 Each equivalence class is represented by one of these structures,
44 but only while the classes are being computed.
45 Afterward, each class is represented by a number. */
46 struct equivclass
47 {
48 lin next; /* Next item in this bucket. */
49 hash_value hash; /* Hash of lines in this class. */
50 char const *line; /* A line that fits this class. */
51 size_t length; /* That line's length, not counting its newline. */
52 };
53
54 /* Hash-table: array of buckets, each being a chain of equivalence classes.
55 buckets[-1] is reserved for incomplete lines. */
56 static lin *buckets;
57
58 /* Number of buckets in the hash table array, not counting buckets[-1]. */
59 static size_t nbuckets;
60
61 /* Array in which the equivalence classes are allocated.
62 The bucket-chains go through the elements in this array.
63 The number of an equivalence class is its index in this array. */
64 static struct equivclass *equivs;
65
66 /* Index of first free element in the array `equivs'. */
67 static lin equivs_index;
68
69 /* Number of elements allocated in the array `equivs'. */
70 static lin equivs_alloc;
71
72
73 /* Check for binary files and compare them for exact identity. */
74
75 /* Return 1 if BUF contains a non text character.
76 SIZE is the number of characters in BUF. */
77
78 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
79
80
81 /* Split the file into lines, simultaneously computing the equivalence
82 class for each line. */
83
84 static void
85 find_and_hash_each_line (struct file_data *current)
86 {
87 hash_value h;
88 unsigned char const *p = (unsigned char const *) current->prefix_end;
89 unsigned char c;
90 lin i, *bucket;
91 size_t length;
92
93 /* Cache often-used quantities in local variables to help the compiler. */
94 char const **linbuf = current->linbuf;
95 lin alloc_lines = current->alloc_lines;
96 lin line = 0;
97 lin linbuf_base = current->linbuf_base;
98 lin *cureqs = (lin*)xmalloc (alloc_lines * sizeof *cureqs);
99 struct equivclass *eqs = equivs;
100 lin eqs_index = equivs_index;
101 lin eqs_alloc = equivs_alloc;
102 char const *suffix_begin = current->suffix_begin;
103 char const *bufend = FILE_BUFFER (current) + current->buffered;
104 bool diff_length_compare_anyway =
105 ignore_white_space != IGNORE_NO_WHITE_SPACE || bIgnoreNumbers;
106 bool same_length_diff_contents_compare_anyway =
107 diff_length_compare_anyway | ignore_case;
108
109 while ((char const *) p < suffix_begin)
110 {
111 char const *ip = (char const *) p;
112
113 h = 0;
114
115 /* Hash this line until we find a newline. */
116 if (ignore_case)
117 switch (ignore_white_space)
118 {
119 case IGNORE_ALL_SPACE:
120 while ((c = *p++) != '\n')
121 if (! (ISSPACE (c) || bIgnoreNumbers && (isdigit( c ) || c=='-' || c=='.' ) ))
122 h = HASH (h, TOLOWER (c));
123 break;
124
125 case IGNORE_SPACE_CHANGE:
126 while ((c = *p++) != '\n')
127 {
128 if (ISSPACE (c))
129 {
130 do
131 if ((c = *p++) == '\n')
132 goto hashing_done;
133 while (ISSPACE (c));
134
135 h = HASH (h, ' ');
136 }
137
138 /* C is now the first non-space. */
139 h = HASH (h, TOLOWER (c));
140 }
141 break;
142
143 case IGNORE_TAB_EXPANSION:
144 {
145 size_t column = 0;
146 while ((c = *p++) != '\n')
147 {
148 int repetitions = 1;
149
150 switch (c)
151 {
152 case '\b':
153 column -= 0 < column;
154 break;
155
156 case '\t':
157 c = ' ';
158 repetitions = TAB_WIDTH - column % TAB_WIDTH;
159 column += repetitions;
160 break;
161
162 case '\r':
163 column = 0;
164 break;
165
166 default:
167 c = TOLOWER (c);
168 column++;
169 break;
170 }
171
172 do
173 h = HASH (h, c);
174 while (--repetitions != 0);
175 }
176 }
177 break;
178
179 default:
180 while ((c = *p++) != '\n')
181 h = HASH (h, TOLOWER (c));
182 break;
183 }
184 else
185 switch (ignore_white_space)
186 {
187 case IGNORE_ALL_SPACE:
188 while ((c = *p++) != '\n')
189 if (! (ISSPACE (c)|| bIgnoreNumbers && (isdigit( c ) || c=='-' || c=='.' ) ))
190 h = HASH (h, c);
191 break;
192
193 case IGNORE_SPACE_CHANGE:
194 while ((c = *p++) != '\n')
195 {
196 if (ISSPACE (c))
197 {
198 do
199 if ((c = *p++) == '\n')
200 goto hashing_done;
201 while (ISSPACE (c));
202
203 h = HASH (h, ' ');
204 }
205
206 /* C is now the first non-space. */
207 h = HASH (h, c);
208 }
209 break;
210
211 case IGNORE_TAB_EXPANSION:
212 {
213 size_t column = 0;
214 while ((c = *p++) != '\n')
215 {
216 int repetitions = 1;
217
218 switch (c)
219 {
220 case '\b':
221 column -= 0 < column;
222 break;
223
224 case '\t':
225 c = ' ';
226 repetitions = TAB_WIDTH - column % TAB_WIDTH;
227 column += repetitions;
228 break;
229
230 case '\r':
231 column = 0;
232 break;
233
234 default:
235 column++;
236 break;
237 }
238
239 do
240 h = HASH (h, c);
241 while (--repetitions != 0);
242 }
243 }
244 break;
245
246 default:
247 while ((c = *p++) != '\n')
248 h = HASH (h, c);
249 break;
250 }
251
252 hashing_done:;
253
254 bucket = &buckets[h % nbuckets];
255 length = (char const *) p - ip - 1;
256
257 if ((char const *) p == bufend
258 && current->missing_newline
259 && ROBUST_OUTPUT_STYLE (output_style))
260 {
261 /* This line is incomplete. If this is significant,
262 put the line into buckets[-1]. */
263 if (ignore_white_space < IGNORE_SPACE_CHANGE)
264 bucket = &buckets[-1];
265
266 /* Omit the inserted newline when computing linbuf later. */
267 p--;
268 bufend = suffix_begin = (char const *) p;
269 }
270
271 for (i = *bucket; ; i = eqs[i].next)
272 if (!i)
273 {
274 /* Create a new equivalence class in this bucket. */
275 i = eqs_index++;
276 if (i == eqs_alloc)
277 {
278 if ((int)(PTRDIFF_MAX / (2 * sizeof *eqs)) <= eqs_alloc)
279 xalloc_die ();
280 eqs_alloc *= 2;
281 eqs = (equivclass*)xrealloc (eqs, eqs_alloc * sizeof *eqs);
282 }
283 eqs[i].next = *bucket;
284 eqs[i].hash = h;
285 eqs[i].line = ip;
286 eqs[i].length = length;
287 *bucket = i;
288 break;
289 }
290 else if (eqs[i].hash == h)
291 {
292 char const *eqline = eqs[i].line;
293
294 /* Reuse existing class if lines_differ reports the lines
295 equal. */
296 if (eqs[i].length == length)
297 {
298 /* Reuse existing equivalence class if the lines are identical.
299 This detects the common case of exact identity
300 faster than lines_differ would. */
301 if (memcmp (eqline, ip, length) == 0)
302 break;
303 if (!same_length_diff_contents_compare_anyway)
304 continue;
305 }
306 else if (!diff_length_compare_anyway)
307 continue;
308
309 if (! lines_differ (eqline, ip))
310 break;
311 }
312
313 /* Maybe increase the size of the line table. */
314 if (line == alloc_lines)
315 {
316 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
317 if ((int)(PTRDIFF_MAX / 3) <= alloc_lines
318 || (int)(PTRDIFF_MAX / sizeof *cureqs) <= 2 * alloc_lines - linbuf_base
319 || (int)(PTRDIFF_MAX / sizeof *linbuf) <= alloc_lines - linbuf_base)
320 xalloc_die ();
321 alloc_lines = 2 * alloc_lines - linbuf_base;
322 cureqs =(lin*) xrealloc (cureqs, alloc_lines * sizeof *cureqs);
323 linbuf += linbuf_base;
324 linbuf = (const char**) xrealloc (linbuf,
325 (alloc_lines - linbuf_base) * sizeof *linbuf);
326 linbuf -= linbuf_base;
327 }
328 linbuf[line] = ip;
329 cureqs[line] = i;
330 ++line;
331 }
332
333 current->buffered_lines = line;
334
335 for (i = 0; ; i++)
336 {
337 /* Record the line start for lines in the suffix that we care about.
338 Record one more line start than lines,
339 so that we can compute the length of any buffered line. */
340 if (line == alloc_lines)
341 {
342 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
343 if ((int)(PTRDIFF_MAX / 3) <= alloc_lines
344 || (int)(PTRDIFF_MAX / sizeof *cureqs) <= 2 * alloc_lines - linbuf_base
345 || (int)(PTRDIFF_MAX / sizeof *linbuf) <= alloc_lines - linbuf_base)
346 xalloc_die ();
347 alloc_lines = 2 * alloc_lines - linbuf_base;
348 linbuf += linbuf_base;
349 linbuf = (const char**)xrealloc (linbuf,
350 (alloc_lines - linbuf_base) * sizeof *linbuf);
351 linbuf -= linbuf_base;
352 }
353 linbuf[line] = (char const *) p;
354
355 if ((char const *) p == bufend)
356 break;
357
358 if (context <= i && no_diff_means_no_output)
359 break;
360
361 line++;
362
363 while (*p++ != '\n')
364 continue;
365 }
366
367 /* Done with cache in local variables. */
368 current->linbuf = linbuf;
369 current->valid_lines = line;
370 current->alloc_lines = alloc_lines;
371 current->equivs = cureqs;
372 equivs = eqs;
373 equivs_alloc = eqs_alloc;
374 equivs_index = eqs_index;
375 }
376
377 /* Prepare the text. Make sure the text end is initialized.
378 Make sure text ends in a newline,
379 but remember that we had to add one.
380 Strip trailing CRs, if that was requested. */
381
382 static void
383 prepare_text (struct file_data *current)
384 {
385 size_t buffered = current->buffered;
386 char *p = FILE_BUFFER (current);
387 char *dst;
388
389 if (buffered == 0 || p[buffered - 1] == '\n')
390 current->missing_newline = 0;
391 else
392 {
393 p[buffered++] = '\n';
394 current->missing_newline = 1;
395 }
396
397 if (!p)
398 return;
399
400 /* Don't use uninitialized storage when planting or using sentinels. */
401 memset (p + buffered, 0, sizeof (word));
402
403 if (strip_trailing_cr && (dst = (char*)memchr (p, '\r', buffered)))
404 {
405 char const *src = dst;
406 char const *srclim = p + buffered;
407
408 do
409 dst += ! ((*dst = *src++) == '\r' && *src == '\n');
410 while (src < srclim);
411
412 buffered -= src - dst;
413 }
414
415 current->buffered = buffered;
416 }
417
418 /* We have found N lines in a buffer of size S; guess the
419 proportionate number of lines that will be found in a buffer of
420 size T. However, do not guess a number of lines so large that the
421 resulting line table might cause overflow in size calculations. */
422 static lin
423 guess_lines (lin n, size_t s, size_t t)
424 {
425 size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
426 lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
427 return MIN (guessed_lines, (int)(PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5)) + 5;
428 }
429
430 /* Given a vector of two file_data objects, find the identical
431 prefixes and suffixes of each object. */
432
433 static void
434 find_identical_ends (struct file_data filevec[])
435 {
436 word *w0, *w1;
437 char *p0, *p1, *buffer0, *buffer1;
438 char const *end0, *beg0;
439 char const **linbuf0, **linbuf1;
440 lin i, lines;
441 size_t n0, n1;
442 lin alloc_lines0, alloc_lines1;
443 lin buffered_prefix, prefix_count, prefix_mask;
444 lin middle_guess, suffix_guess;
445
446 prepare_text (&filevec[0]);
447 prepare_text (&filevec[1]);
448
449 /* Find identical prefix. */
450
451 w0 = filevec[0].buffer;
452 w1 = filevec[1].buffer;
453 p0 = buffer0 = (char *) w0;
454 p1 = buffer1 = (char *) w1;
455 n0 = filevec[0].buffered;
456 n1 = filevec[1].buffered;
457
458 if (p0 == p1)
459 /* The buffers are the same; sentinels won't work. */
460 p0 = p1 += n1;
461 else
462 {
463 /* Insert end sentinels, in this case characters that are guaranteed
464 to make the equality test false, and thus terminate the loop. */
465
466 if (n0 < n1)
467 p0[n0] = ~p1[n0];
468 else
469 p1[n1] = ~p0[n1];
470
471 /* Loop until first mismatch, or to the sentinel characters. */
472
473 /* Compare a word at a time for speed. */
474 while (*w0 == *w1)
475 w0++, w1++;
476
477 /* Do the last few bytes of comparison a byte at a time. */
478 p0 = (char *) w0;
479 p1 = (char *) w1;
480 while (*p0 == *p1)
481 p0++, p1++;
482
483 /* Don't mistakenly count missing newline as part of prefix. */
484 if (ROBUST_OUTPUT_STYLE (output_style)
485 && ((buffer0 + n0 - filevec[0].missing_newline < p0)
486 !=
487 (buffer1 + n1 - filevec[1].missing_newline < p1)))
488 p0--, p1--;
489 }
490
491 /* Now P0 and P1 point at the first nonmatching characters. */
492
493 /* Skip back to last line-beginning in the prefix,
494 and then discard up to HORIZON_LINES lines from the prefix. */
495 i = horizon_lines;
496 while (p0 != buffer0 && (p0[-1] != '\n' || i--))
497 p0--, p1--;
498
499 /* Record the prefix. */
500 filevec[0].prefix_end = p0;
501 filevec[1].prefix_end = p1;
502
503 /* Find identical suffix. */
504
505 /* P0 and P1 point beyond the last chars not yet compared. */
506 p0 = buffer0 + n0;
507 p1 = buffer1 + n1;
508
509 if (! ROBUST_OUTPUT_STYLE (output_style)
510 || filevec[0].missing_newline == filevec[1].missing_newline)
511 {
512 end0 = p0; /* Addr of last char in file 0. */
513
514 /* Get value of P0 at which we should stop scanning backward:
515 this is when either P0 or P1 points just past the last char
516 of the identical prefix. */
517 beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
518
519 /* Scan back until chars don't match or we reach that point. */
520 for (; p0 != beg0; p0--, p1--)
521 if (*p0 != *p1)
522 {
523 /* Point at the first char of the matching suffix. */
524 beg0 = p0;
525 break;
526 }
527
528 /* Are we at a line-beginning in both files? If not, add the rest of
529 this line to the main body. Discard up to HORIZON_LINES lines from
530 the identical suffix. Also, discard one extra line,
531 because shift_boundaries may need it. */
532 i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
533 &&
534 (buffer1 == p1 || p1[-1] == '\n'));
535 while (i-- && p0 != end0)
536 while (*p0++ != '\n')
537 continue;
538
539 p1 += p0 - beg0;
540 }
541
542 /* Record the suffix. */
543 filevec[0].suffix_begin = p0;
544 filevec[1].suffix_begin = p1;
545
546 /* Calculate number of lines of prefix to save.
547
548 prefix_count == 0 means save the whole prefix;
549 we need this for options like -D that output the whole file,
550 or for enormous contexts (to avoid worrying about arithmetic overflow).
551 We also need it for options like -F that output some preceding line;
552 at least we will need to find the last few lines,
553 but since we don't know how many, it's easiest to find them all.
554
555 Otherwise, prefix_count != 0. Save just prefix_count lines at start
556 of the line buffer; they'll be moved to the proper location later.
557 Handle 1 more line than the context says (because we count 1 too many),
558 rounded up to the next power of 2 to speed index computation. */
559
560 if (no_diff_means_no_output
561 && context < int(LIN_MAX / 4) && context < int(n0))
562 {
563 middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
564 suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
565 for (prefix_count = 1; prefix_count <= context; prefix_count *= 2)
566 continue;
567 alloc_lines0 = (prefix_count + middle_guess
568 + MIN (context, suffix_guess));
569 }
570 else
571 {
572 prefix_count = 0;
573 alloc_lines0 = guess_lines (0, 0, n0);
574 }
575
576 prefix_mask = prefix_count - 1;
577 lines = 0;
578 linbuf0 = (const char**) xmalloc (alloc_lines0 * sizeof *linbuf0);
579 p0 = buffer0;
580
581 /* If the prefix is needed, find the prefix lines. */
582 if (! (no_diff_means_no_output
583 && filevec[0].prefix_end == p0
584 && filevec[1].prefix_end == p1))
585 {
586 end0 = filevec[0].prefix_end;
587 while (p0 != end0)
588 {
589 lin l = lines++ & prefix_mask;
590 if (l == alloc_lines0)
591 {
592 if (int(PTRDIFF_MAX / (2 * sizeof *linbuf0)) <= alloc_lines0)
593 xalloc_die ();
594 alloc_lines0 *= 2;
595 linbuf0 = (const char**) xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
596 }
597 linbuf0[l] = p0;
598 while (*p0++ != '\n')
599 continue;
600 }
601 }
602 buffered_prefix = prefix_count && context < lines ? context : lines;
603
604 /* Allocate line buffer 1. */
605
606 middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
607 suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
608 alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
609 if (alloc_lines1 < buffered_prefix
610 || int(PTRDIFF_MAX / sizeof *linbuf1) <= alloc_lines1)
611 xalloc_die ();
612 linbuf1 = (const char**)xmalloc (alloc_lines1 * sizeof *linbuf1);
613
614 if (buffered_prefix != lines)
615 {
616 /* Rotate prefix lines to proper location. */
617 for (i = 0; i < buffered_prefix; i++)
618 linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
619 for (i = 0; i < buffered_prefix; i++)
620 linbuf0[i] = linbuf1[i];
621 }
622
623 /* Initialize line buffer 1 from line buffer 0. */
624 for (i = 0; i < buffered_prefix; i++)
625 linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
626
627 /* Record the line buffer, adjusted so that
628 linbuf[0] points at the first differing line. */
629 filevec[0].linbuf = linbuf0 + buffered_prefix;
630 filevec[1].linbuf = linbuf1 + buffered_prefix;
631 filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
632 filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
633 filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
634 filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
635 }
636
637 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
638 than 2**k. This table is derived from Chris K. Caldwell's list
639 <http://www.utm.edu/research/primes/lists/2small/>. */
640
641 static unsigned char const prime_offset[] =
642 {
643 0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
644 15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
645 11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
646 55, 93, 1, 57, 25
647 };
648
649 /* Verify that this host's size_t is not too wide for the above table. */
650
651 verify (enough_prime_offsets,
652 sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
653
654 /* Given a vector of two file_data objects, read the file associated
655 with each one, and build the table of equivalence classes.
656 Return nonzero if either file appears to be a binary file.
657 If PRETEND_BINARY is nonzero, pretend they are binary regardless. */
658
659 bool
660 read_files (struct file_data filevec[], bool /*pretend_binary*/)
661 {
662 int i;
663
664 find_identical_ends (filevec);
665
666 equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
667 if (int(PTRDIFF_MAX / sizeof *equivs) <= equivs_alloc)
668 xalloc_die ();
669 equivs = (equivclass*)xmalloc (equivs_alloc * sizeof *equivs);
670 /* Equivalence class 0 is permanently safe for lines that were not
671 hashed. Real equivalence classes start at 1. */
672 equivs_index = 1;
673
674 /* Allocate (one plus) a prime number of hash buckets. Use a prime
675 number between 1/3 and 2/3 of the value of equiv_allocs,
676 approximately. */
677 for (i = 9; 1 << i < equivs_alloc / 3; i++)
678 continue;
679 nbuckets = ((size_t) 1 << i) - prime_offset[i];
680 if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
681 xalloc_die ();
682 buckets = (lin*)zalloc ((nbuckets + 1) * sizeof *buckets);
683 buckets++;
684
685 for (i = 0; i < 2; i++)
686 find_and_hash_each_line (&filevec[i]);
687
688 filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
689
690 free (equivs);
691 free (buckets - 1);
692
693 return 0;
694 }
695
696 } // namespace GnuDiff