Chris@4: Chris@4: /*-------------------------------------------------------------*/ Chris@4: /*--- Block sorting machinery ---*/ Chris@4: /*--- blocksort.c ---*/ Chris@4: /*-------------------------------------------------------------*/ Chris@4: Chris@4: /* ------------------------------------------------------------------ Chris@4: This file is part of bzip2/libbzip2, a program and library for Chris@4: lossless, block-sorting data compression. Chris@4: Chris@4: bzip2/libbzip2 version 1.0.6 of 6 September 2010 Chris@4: Copyright (C) 1996-2010 Julian Seward Chris@4: Chris@4: Please read the WARNING, DISCLAIMER and PATENTS sections in the Chris@4: README file. Chris@4: Chris@4: This program is released under the terms of the license contained Chris@4: in the file LICENSE. Chris@4: ------------------------------------------------------------------ */ Chris@4: Chris@4: Chris@4: #include "bzlib_private.h" Chris@4: Chris@4: /*---------------------------------------------*/ Chris@4: /*--- Fallback O(N log(N)^2) sorting ---*/ Chris@4: /*--- algorithm, for repetitive blocks ---*/ Chris@4: /*---------------------------------------------*/ Chris@4: Chris@4: /*---------------------------------------------*/ Chris@4: static Chris@4: __inline__ Chris@4: void fallbackSimpleSort ( UInt32* fmap, Chris@4: UInt32* eclass, Chris@4: Int32 lo, Chris@4: Int32 hi ) Chris@4: { Chris@4: Int32 i, j, tmp; Chris@4: UInt32 ec_tmp; Chris@4: Chris@4: if (lo == hi) return; Chris@4: Chris@4: if (hi - lo > 3) { Chris@4: for ( i = hi-4; i >= lo; i-- ) { Chris@4: tmp = fmap[i]; Chris@4: ec_tmp = eclass[tmp]; Chris@4: for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 ) Chris@4: fmap[j-4] = fmap[j]; Chris@4: fmap[j-4] = tmp; Chris@4: } Chris@4: } Chris@4: Chris@4: for ( i = hi-1; i >= lo; i-- ) { Chris@4: tmp = fmap[i]; Chris@4: ec_tmp = eclass[tmp]; Chris@4: for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ ) Chris@4: fmap[j-1] = fmap[j]; Chris@4: fmap[j-1] = tmp; Chris@4: } Chris@4: } Chris@4: Chris@4: Chris@4: /*---------------------------------------------*/ Chris@4: #define fswap(zz1, zz2) \ Chris@4: { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; } Chris@4: Chris@4: #define fvswap(zzp1, zzp2, zzn) \ Chris@4: { \ Chris@4: Int32 yyp1 = (zzp1); \ Chris@4: Int32 yyp2 = (zzp2); \ Chris@4: Int32 yyn = (zzn); \ Chris@4: while (yyn > 0) { \ Chris@4: fswap(fmap[yyp1], fmap[yyp2]); \ Chris@4: yyp1++; yyp2++; yyn--; \ Chris@4: } \ Chris@4: } Chris@4: Chris@4: Chris@4: #define fmin(a,b) ((a) < (b)) ? (a) : (b) Chris@4: Chris@4: #define fpush(lz,hz) { stackLo[sp] = lz; \ Chris@4: stackHi[sp] = hz; \ Chris@4: sp++; } Chris@4: Chris@4: #define fpop(lz,hz) { sp--; \ Chris@4: lz = stackLo[sp]; \ Chris@4: hz = stackHi[sp]; } Chris@4: Chris@4: #define FALLBACK_QSORT_SMALL_THRESH 10 Chris@4: #define FALLBACK_QSORT_STACK_SIZE 100 Chris@4: Chris@4: Chris@4: static Chris@4: void fallbackQSort3 ( UInt32* fmap, Chris@4: UInt32* eclass, Chris@4: Int32 loSt, Chris@4: Int32 hiSt ) Chris@4: { Chris@4: Int32 unLo, unHi, ltLo, gtHi, n, m; Chris@4: Int32 sp, lo, hi; Chris@4: UInt32 med, r, r3; Chris@4: Int32 stackLo[FALLBACK_QSORT_STACK_SIZE]; Chris@4: Int32 stackHi[FALLBACK_QSORT_STACK_SIZE]; Chris@4: Chris@4: r = 0; Chris@4: Chris@4: sp = 0; Chris@4: fpush ( loSt, hiSt ); Chris@4: Chris@4: while (sp > 0) { Chris@4: Chris@4: AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 ); Chris@4: Chris@4: fpop ( lo, hi ); Chris@4: if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) { Chris@4: fallbackSimpleSort ( fmap, eclass, lo, hi ); Chris@4: continue; Chris@4: } Chris@4: Chris@4: /* Random partitioning. Median of 3 sometimes fails to Chris@4: avoid bad cases. Median of 9 seems to help but Chris@4: looks rather expensive. This too seems to work but Chris@4: is cheaper. Guidance for the magic constants Chris@4: 7621 and 32768 is taken from Sedgewick's algorithms Chris@4: book, chapter 35. Chris@4: */ Chris@4: r = ((r * 7621) + 1) % 32768; Chris@4: r3 = r % 3; Chris@4: if (r3 == 0) med = eclass[fmap[lo]]; else Chris@4: if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else Chris@4: med = eclass[fmap[hi]]; Chris@4: Chris@4: unLo = ltLo = lo; Chris@4: unHi = gtHi = hi; Chris@4: Chris@4: while (1) { Chris@4: while (1) { Chris@4: if (unLo > unHi) break; Chris@4: n = (Int32)eclass[fmap[unLo]] - (Int32)med; Chris@4: if (n == 0) { Chris@4: fswap(fmap[unLo], fmap[ltLo]); Chris@4: ltLo++; unLo++; Chris@4: continue; Chris@4: }; Chris@4: if (n > 0) break; Chris@4: unLo++; Chris@4: } Chris@4: while (1) { Chris@4: if (unLo > unHi) break; Chris@4: n = (Int32)eclass[fmap[unHi]] - (Int32)med; Chris@4: if (n == 0) { Chris@4: fswap(fmap[unHi], fmap[gtHi]); Chris@4: gtHi--; unHi--; Chris@4: continue; Chris@4: }; Chris@4: if (n < 0) break; Chris@4: unHi--; Chris@4: } Chris@4: if (unLo > unHi) break; Chris@4: fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--; Chris@4: } Chris@4: Chris@4: AssertD ( unHi == unLo-1, "fallbackQSort3(2)" ); Chris@4: Chris@4: if (gtHi < ltLo) continue; Chris@4: Chris@4: n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n); Chris@4: m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m); Chris@4: Chris@4: n = lo + unLo - ltLo - 1; Chris@4: m = hi - (gtHi - unHi) + 1; Chris@4: Chris@4: if (n - lo > hi - m) { Chris@4: fpush ( lo, n ); Chris@4: fpush ( m, hi ); Chris@4: } else { Chris@4: fpush ( m, hi ); Chris@4: fpush ( lo, n ); Chris@4: } Chris@4: } Chris@4: } Chris@4: Chris@4: #undef fmin Chris@4: #undef fpush Chris@4: #undef fpop Chris@4: #undef fswap Chris@4: #undef fvswap Chris@4: #undef FALLBACK_QSORT_SMALL_THRESH Chris@4: #undef FALLBACK_QSORT_STACK_SIZE Chris@4: Chris@4: Chris@4: /*---------------------------------------------*/ Chris@4: /* Pre: Chris@4: nblock > 0 Chris@4: eclass exists for [0 .. nblock-1] Chris@4: ((UChar*)eclass) [0 .. nblock-1] holds block Chris@4: ptr exists for [0 .. nblock-1] Chris@4: Chris@4: Post: Chris@4: ((UChar*)eclass) [0 .. nblock-1] holds block Chris@4: All other areas of eclass destroyed Chris@4: fmap [0 .. nblock-1] holds sorted order Chris@4: bhtab [ 0 .. 2+(nblock/32) ] destroyed Chris@4: */ Chris@4: Chris@4: #define SET_BH(zz) bhtab[(zz) >> 5] |= (1 << ((zz) & 31)) Chris@4: #define CLEAR_BH(zz) bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31)) Chris@4: #define ISSET_BH(zz) (bhtab[(zz) >> 5] & (1 << ((zz) & 31))) Chris@4: #define WORD_BH(zz) bhtab[(zz) >> 5] Chris@4: #define UNALIGNED_BH(zz) ((zz) & 0x01f) Chris@4: Chris@4: static Chris@4: void fallbackSort ( UInt32* fmap, Chris@4: UInt32* eclass, Chris@4: UInt32* bhtab, Chris@4: Int32 nblock, Chris@4: Int32 verb ) Chris@4: { Chris@4: Int32 ftab[257]; Chris@4: Int32 ftabCopy[256]; Chris@4: Int32 H, i, j, k, l, r, cc, cc1; Chris@4: Int32 nNotDone; Chris@4: Int32 nBhtab; Chris@4: UChar* eclass8 = (UChar*)eclass; Chris@4: Chris@4: /*-- Chris@4: Initial 1-char radix sort to generate Chris@4: initial fmap and initial BH bits. Chris@4: --*/ Chris@4: if (verb >= 4) Chris@4: VPrintf0 ( " bucket sorting ...\n" ); Chris@4: for (i = 0; i < 257; i++) ftab[i] = 0; Chris@4: for (i = 0; i < nblock; i++) ftab[eclass8[i]]++; Chris@4: for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i]; Chris@4: for (i = 1; i < 257; i++) ftab[i] += ftab[i-1]; Chris@4: Chris@4: for (i = 0; i < nblock; i++) { Chris@4: j = eclass8[i]; Chris@4: k = ftab[j] - 1; Chris@4: ftab[j] = k; Chris@4: fmap[k] = i; Chris@4: } Chris@4: Chris@4: nBhtab = 2 + (nblock / 32); Chris@4: for (i = 0; i < nBhtab; i++) bhtab[i] = 0; Chris@4: for (i = 0; i < 256; i++) SET_BH(ftab[i]); Chris@4: Chris@4: /*-- Chris@4: Inductively refine the buckets. Kind-of an Chris@4: "exponential radix sort" (!), inspired by the Chris@4: Manber-Myers suffix array construction algorithm. Chris@4: --*/ Chris@4: Chris@4: /*-- set sentinel bits for block-end detection --*/ Chris@4: for (i = 0; i < 32; i++) { Chris@4: SET_BH(nblock + 2*i); Chris@4: CLEAR_BH(nblock + 2*i + 1); Chris@4: } Chris@4: Chris@4: /*-- the log(N) loop --*/ Chris@4: H = 1; Chris@4: while (1) { Chris@4: Chris@4: if (verb >= 4) Chris@4: VPrintf1 ( " depth %6d has ", H ); Chris@4: Chris@4: j = 0; Chris@4: for (i = 0; i < nblock; i++) { Chris@4: if (ISSET_BH(i)) j = i; Chris@4: k = fmap[i] - H; if (k < 0) k += nblock; Chris@4: eclass[k] = j; Chris@4: } Chris@4: Chris@4: nNotDone = 0; Chris@4: r = -1; Chris@4: while (1) { Chris@4: Chris@4: /*-- find the next non-singleton bucket --*/ Chris@4: k = r + 1; Chris@4: while (ISSET_BH(k) && UNALIGNED_BH(k)) k++; Chris@4: if (ISSET_BH(k)) { Chris@4: while (WORD_BH(k) == 0xffffffff) k += 32; Chris@4: while (ISSET_BH(k)) k++; Chris@4: } Chris@4: l = k - 1; Chris@4: if (l >= nblock) break; Chris@4: while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++; Chris@4: if (!ISSET_BH(k)) { Chris@4: while (WORD_BH(k) == 0x00000000) k += 32; Chris@4: while (!ISSET_BH(k)) k++; Chris@4: } Chris@4: r = k - 1; Chris@4: if (r >= nblock) break; Chris@4: Chris@4: /*-- now [l, r] bracket current bucket --*/ Chris@4: if (r > l) { Chris@4: nNotDone += (r - l + 1); Chris@4: fallbackQSort3 ( fmap, eclass, l, r ); Chris@4: Chris@4: /*-- scan bucket and generate header bits-- */ Chris@4: cc = -1; Chris@4: for (i = l; i <= r; i++) { Chris@4: cc1 = eclass[fmap[i]]; Chris@4: if (cc != cc1) { SET_BH(i); cc = cc1; }; Chris@4: } Chris@4: } Chris@4: } Chris@4: Chris@4: if (verb >= 4) Chris@4: VPrintf1 ( "%6d unresolved strings\n", nNotDone ); Chris@4: Chris@4: H *= 2; Chris@4: if (H > nblock || nNotDone == 0) break; Chris@4: } Chris@4: Chris@4: /*-- Chris@4: Reconstruct the original block in Chris@4: eclass8 [0 .. nblock-1], since the Chris@4: previous phase destroyed it. Chris@4: --*/ Chris@4: if (verb >= 4) Chris@4: VPrintf0 ( " reconstructing block ...\n" ); Chris@4: j = 0; Chris@4: for (i = 0; i < nblock; i++) { Chris@4: while (ftabCopy[j] == 0) j++; Chris@4: ftabCopy[j]--; Chris@4: eclass8[fmap[i]] = (UChar)j; Chris@4: } Chris@4: AssertH ( j < 256, 1005 ); Chris@4: } Chris@4: Chris@4: #undef SET_BH Chris@4: #undef CLEAR_BH Chris@4: #undef ISSET_BH Chris@4: #undef WORD_BH Chris@4: #undef UNALIGNED_BH Chris@4: Chris@4: Chris@4: /*---------------------------------------------*/ Chris@4: /*--- The main, O(N^2 log(N)) sorting ---*/ Chris@4: /*--- algorithm. Faster for "normal" ---*/ Chris@4: /*--- non-repetitive blocks. ---*/ Chris@4: /*---------------------------------------------*/ Chris@4: Chris@4: /*---------------------------------------------*/ Chris@4: static Chris@4: __inline__ Chris@4: Bool mainGtU ( UInt32 i1, Chris@4: UInt32 i2, Chris@4: UChar* block, Chris@4: UInt16* quadrant, Chris@4: UInt32 nblock, Chris@4: Int32* budget ) Chris@4: { Chris@4: Int32 k; Chris@4: UChar c1, c2; Chris@4: UInt16 s1, s2; Chris@4: Chris@4: AssertD ( i1 != i2, "mainGtU" ); Chris@4: /* 1 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: i1++; i2++; Chris@4: /* 2 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: i1++; i2++; Chris@4: /* 3 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: i1++; i2++; Chris@4: /* 4 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: i1++; i2++; Chris@4: /* 5 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: i1++; i2++; Chris@4: /* 6 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: i1++; i2++; Chris@4: /* 7 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: i1++; i2++; Chris@4: /* 8 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: i1++; i2++; Chris@4: /* 9 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: i1++; i2++; Chris@4: /* 10 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: i1++; i2++; Chris@4: /* 11 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: i1++; i2++; Chris@4: /* 12 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: i1++; i2++; Chris@4: Chris@4: k = nblock + 8; Chris@4: Chris@4: do { Chris@4: /* 1 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: s1 = quadrant[i1]; s2 = quadrant[i2]; Chris@4: if (s1 != s2) return (s1 > s2); Chris@4: i1++; i2++; Chris@4: /* 2 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: s1 = quadrant[i1]; s2 = quadrant[i2]; Chris@4: if (s1 != s2) return (s1 > s2); Chris@4: i1++; i2++; Chris@4: /* 3 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: s1 = quadrant[i1]; s2 = quadrant[i2]; Chris@4: if (s1 != s2) return (s1 > s2); Chris@4: i1++; i2++; Chris@4: /* 4 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: s1 = quadrant[i1]; s2 = quadrant[i2]; Chris@4: if (s1 != s2) return (s1 > s2); Chris@4: i1++; i2++; Chris@4: /* 5 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: s1 = quadrant[i1]; s2 = quadrant[i2]; Chris@4: if (s1 != s2) return (s1 > s2); Chris@4: i1++; i2++; Chris@4: /* 6 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: s1 = quadrant[i1]; s2 = quadrant[i2]; Chris@4: if (s1 != s2) return (s1 > s2); Chris@4: i1++; i2++; Chris@4: /* 7 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: s1 = quadrant[i1]; s2 = quadrant[i2]; Chris@4: if (s1 != s2) return (s1 > s2); Chris@4: i1++; i2++; Chris@4: /* 8 */ Chris@4: c1 = block[i1]; c2 = block[i2]; Chris@4: if (c1 != c2) return (c1 > c2); Chris@4: s1 = quadrant[i1]; s2 = quadrant[i2]; Chris@4: if (s1 != s2) return (s1 > s2); Chris@4: i1++; i2++; Chris@4: Chris@4: if (i1 >= nblock) i1 -= nblock; Chris@4: if (i2 >= nblock) i2 -= nblock; Chris@4: Chris@4: k -= 8; Chris@4: (*budget)--; Chris@4: } Chris@4: while (k >= 0); Chris@4: Chris@4: return False; Chris@4: } Chris@4: Chris@4: Chris@4: /*---------------------------------------------*/ Chris@4: /*-- Chris@4: Knuth's increments seem to work better Chris@4: than Incerpi-Sedgewick here. Possibly Chris@4: because the number of elems to sort is Chris@4: usually small, typically <= 20. Chris@4: --*/ Chris@4: static Chris@4: Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280, Chris@4: 9841, 29524, 88573, 265720, Chris@4: 797161, 2391484 }; Chris@4: Chris@4: static Chris@4: void mainSimpleSort ( UInt32* ptr, Chris@4: UChar* block, Chris@4: UInt16* quadrant, Chris@4: Int32 nblock, Chris@4: Int32 lo, Chris@4: Int32 hi, Chris@4: Int32 d, Chris@4: Int32* budget ) Chris@4: { Chris@4: Int32 i, j, h, bigN, hp; Chris@4: UInt32 v; Chris@4: Chris@4: bigN = hi - lo + 1; Chris@4: if (bigN < 2) return; Chris@4: Chris@4: hp = 0; Chris@4: while (incs[hp] < bigN) hp++; Chris@4: hp--; Chris@4: Chris@4: for (; hp >= 0; hp--) { Chris@4: h = incs[hp]; Chris@4: Chris@4: i = lo + h; Chris@4: while (True) { Chris@4: Chris@4: /*-- copy 1 --*/ Chris@4: if (i > hi) break; Chris@4: v = ptr[i]; Chris@4: j = i; Chris@4: while ( mainGtU ( Chris@4: ptr[j-h]+d, v+d, block, quadrant, nblock, budget Chris@4: ) ) { Chris@4: ptr[j] = ptr[j-h]; Chris@4: j = j - h; Chris@4: if (j <= (lo + h - 1)) break; Chris@4: } Chris@4: ptr[j] = v; Chris@4: i++; Chris@4: Chris@4: /*-- copy 2 --*/ Chris@4: if (i > hi) break; Chris@4: v = ptr[i]; Chris@4: j = i; Chris@4: while ( mainGtU ( Chris@4: ptr[j-h]+d, v+d, block, quadrant, nblock, budget Chris@4: ) ) { Chris@4: ptr[j] = ptr[j-h]; Chris@4: j = j - h; Chris@4: if (j <= (lo + h - 1)) break; Chris@4: } Chris@4: ptr[j] = v; Chris@4: i++; Chris@4: Chris@4: /*-- copy 3 --*/ Chris@4: if (i > hi) break; Chris@4: v = ptr[i]; Chris@4: j = i; Chris@4: while ( mainGtU ( Chris@4: ptr[j-h]+d, v+d, block, quadrant, nblock, budget Chris@4: ) ) { Chris@4: ptr[j] = ptr[j-h]; Chris@4: j = j - h; Chris@4: if (j <= (lo + h - 1)) break; Chris@4: } Chris@4: ptr[j] = v; Chris@4: i++; Chris@4: Chris@4: if (*budget < 0) return; Chris@4: } Chris@4: } Chris@4: } Chris@4: Chris@4: Chris@4: /*---------------------------------------------*/ Chris@4: /*-- Chris@4: The following is an implementation of Chris@4: an elegant 3-way quicksort for strings, Chris@4: described in a paper "Fast Algorithms for Chris@4: Sorting and Searching Strings", by Robert Chris@4: Sedgewick and Jon L. Bentley. Chris@4: --*/ Chris@4: Chris@4: #define mswap(zz1, zz2) \ Chris@4: { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; } Chris@4: Chris@4: #define mvswap(zzp1, zzp2, zzn) \ Chris@4: { \ Chris@4: Int32 yyp1 = (zzp1); \ Chris@4: Int32 yyp2 = (zzp2); \ Chris@4: Int32 yyn = (zzn); \ Chris@4: while (yyn > 0) { \ Chris@4: mswap(ptr[yyp1], ptr[yyp2]); \ Chris@4: yyp1++; yyp2++; yyn--; \ Chris@4: } \ Chris@4: } Chris@4: Chris@4: static Chris@4: __inline__ Chris@4: UChar mmed3 ( UChar a, UChar b, UChar c ) Chris@4: { Chris@4: UChar t; Chris@4: if (a > b) { t = a; a = b; b = t; }; Chris@4: if (b > c) { Chris@4: b = c; Chris@4: if (a > b) b = a; Chris@4: } Chris@4: return b; Chris@4: } Chris@4: Chris@4: #define mmin(a,b) ((a) < (b)) ? (a) : (b) Chris@4: Chris@4: #define mpush(lz,hz,dz) { stackLo[sp] = lz; \ Chris@4: stackHi[sp] = hz; \ Chris@4: stackD [sp] = dz; \ Chris@4: sp++; } Chris@4: Chris@4: #define mpop(lz,hz,dz) { sp--; \ Chris@4: lz = stackLo[sp]; \ Chris@4: hz = stackHi[sp]; \ Chris@4: dz = stackD [sp]; } Chris@4: Chris@4: Chris@4: #define mnextsize(az) (nextHi[az]-nextLo[az]) Chris@4: Chris@4: #define mnextswap(az,bz) \ Chris@4: { Int32 tz; \ Chris@4: tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \ Chris@4: tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \ Chris@4: tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; } Chris@4: Chris@4: Chris@4: #define MAIN_QSORT_SMALL_THRESH 20 Chris@4: #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT) Chris@4: #define MAIN_QSORT_STACK_SIZE 100 Chris@4: Chris@4: static Chris@4: void mainQSort3 ( UInt32* ptr, Chris@4: UChar* block, Chris@4: UInt16* quadrant, Chris@4: Int32 nblock, Chris@4: Int32 loSt, Chris@4: Int32 hiSt, Chris@4: Int32 dSt, Chris@4: Int32* budget ) Chris@4: { Chris@4: Int32 unLo, unHi, ltLo, gtHi, n, m, med; Chris@4: Int32 sp, lo, hi, d; Chris@4: Chris@4: Int32 stackLo[MAIN_QSORT_STACK_SIZE]; Chris@4: Int32 stackHi[MAIN_QSORT_STACK_SIZE]; Chris@4: Int32 stackD [MAIN_QSORT_STACK_SIZE]; Chris@4: Chris@4: Int32 nextLo[3]; Chris@4: Int32 nextHi[3]; Chris@4: Int32 nextD [3]; Chris@4: Chris@4: sp = 0; Chris@4: mpush ( loSt, hiSt, dSt ); Chris@4: Chris@4: while (sp > 0) { Chris@4: Chris@4: AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 ); Chris@4: Chris@4: mpop ( lo, hi, d ); Chris@4: if (hi - lo < MAIN_QSORT_SMALL_THRESH || Chris@4: d > MAIN_QSORT_DEPTH_THRESH) { Chris@4: mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget ); Chris@4: if (*budget < 0) return; Chris@4: continue; Chris@4: } Chris@4: Chris@4: med = (Int32) Chris@4: mmed3 ( block[ptr[ lo ]+d], Chris@4: block[ptr[ hi ]+d], Chris@4: block[ptr[ (lo+hi)>>1 ]+d] ); Chris@4: Chris@4: unLo = ltLo = lo; Chris@4: unHi = gtHi = hi; Chris@4: Chris@4: while (True) { Chris@4: while (True) { Chris@4: if (unLo > unHi) break; Chris@4: n = ((Int32)block[ptr[unLo]+d]) - med; Chris@4: if (n == 0) { Chris@4: mswap(ptr[unLo], ptr[ltLo]); Chris@4: ltLo++; unLo++; continue; Chris@4: }; Chris@4: if (n > 0) break; Chris@4: unLo++; Chris@4: } Chris@4: while (True) { Chris@4: if (unLo > unHi) break; Chris@4: n = ((Int32)block[ptr[unHi]+d]) - med; Chris@4: if (n == 0) { Chris@4: mswap(ptr[unHi], ptr[gtHi]); Chris@4: gtHi--; unHi--; continue; Chris@4: }; Chris@4: if (n < 0) break; Chris@4: unHi--; Chris@4: } Chris@4: if (unLo > unHi) break; Chris@4: mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--; Chris@4: } Chris@4: Chris@4: AssertD ( unHi == unLo-1, "mainQSort3(2)" ); Chris@4: Chris@4: if (gtHi < ltLo) { Chris@4: mpush(lo, hi, d+1 ); Chris@4: continue; Chris@4: } Chris@4: Chris@4: n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n); Chris@4: m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m); Chris@4: Chris@4: n = lo + unLo - ltLo - 1; Chris@4: m = hi - (gtHi - unHi) + 1; Chris@4: Chris@4: nextLo[0] = lo; nextHi[0] = n; nextD[0] = d; Chris@4: nextLo[1] = m; nextHi[1] = hi; nextD[1] = d; Chris@4: nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1; Chris@4: Chris@4: if (mnextsize(0) < mnextsize(1)) mnextswap(0,1); Chris@4: if (mnextsize(1) < mnextsize(2)) mnextswap(1,2); Chris@4: if (mnextsize(0) < mnextsize(1)) mnextswap(0,1); Chris@4: Chris@4: AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" ); Chris@4: AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" ); Chris@4: Chris@4: mpush (nextLo[0], nextHi[0], nextD[0]); Chris@4: mpush (nextLo[1], nextHi[1], nextD[1]); Chris@4: mpush (nextLo[2], nextHi[2], nextD[2]); Chris@4: } Chris@4: } Chris@4: Chris@4: #undef mswap Chris@4: #undef mvswap Chris@4: #undef mpush Chris@4: #undef mpop Chris@4: #undef mmin Chris@4: #undef mnextsize Chris@4: #undef mnextswap Chris@4: #undef MAIN_QSORT_SMALL_THRESH Chris@4: #undef MAIN_QSORT_DEPTH_THRESH Chris@4: #undef MAIN_QSORT_STACK_SIZE Chris@4: Chris@4: Chris@4: /*---------------------------------------------*/ Chris@4: /* Pre: Chris@4: nblock > N_OVERSHOOT Chris@4: block32 exists for [0 .. nblock-1 +N_OVERSHOOT] Chris@4: ((UChar*)block32) [0 .. nblock-1] holds block Chris@4: ptr exists for [0 .. nblock-1] Chris@4: Chris@4: Post: Chris@4: ((UChar*)block32) [0 .. nblock-1] holds block Chris@4: All other areas of block32 destroyed Chris@4: ftab [0 .. 65536 ] destroyed Chris@4: ptr [0 .. nblock-1] holds sorted order Chris@4: if (*budget < 0), sorting was abandoned Chris@4: */ Chris@4: Chris@4: #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8]) Chris@4: #define SETMASK (1 << 21) Chris@4: #define CLEARMASK (~(SETMASK)) Chris@4: Chris@4: static Chris@4: void mainSort ( UInt32* ptr, Chris@4: UChar* block, Chris@4: UInt16* quadrant, Chris@4: UInt32* ftab, Chris@4: Int32 nblock, Chris@4: Int32 verb, Chris@4: Int32* budget ) Chris@4: { Chris@4: Int32 i, j, k, ss, sb; Chris@4: Int32 runningOrder[256]; Chris@4: Bool bigDone[256]; Chris@4: Int32 copyStart[256]; Chris@4: Int32 copyEnd [256]; Chris@4: UChar c1; Chris@4: Int32 numQSorted; Chris@4: UInt16 s; Chris@4: if (verb >= 4) VPrintf0 ( " main sort initialise ...\n" ); Chris@4: Chris@4: /*-- set up the 2-byte frequency table --*/ Chris@4: for (i = 65536; i >= 0; i--) ftab[i] = 0; Chris@4: Chris@4: j = block[0] << 8; Chris@4: i = nblock-1; Chris@4: for (; i >= 3; i -= 4) { Chris@4: quadrant[i] = 0; Chris@4: j = (j >> 8) | ( ((UInt16)block[i]) << 8); Chris@4: ftab[j]++; Chris@4: quadrant[i-1] = 0; Chris@4: j = (j >> 8) | ( ((UInt16)block[i-1]) << 8); Chris@4: ftab[j]++; Chris@4: quadrant[i-2] = 0; Chris@4: j = (j >> 8) | ( ((UInt16)block[i-2]) << 8); Chris@4: ftab[j]++; Chris@4: quadrant[i-3] = 0; Chris@4: j = (j >> 8) | ( ((UInt16)block[i-3]) << 8); Chris@4: ftab[j]++; Chris@4: } Chris@4: for (; i >= 0; i--) { Chris@4: quadrant[i] = 0; Chris@4: j = (j >> 8) | ( ((UInt16)block[i]) << 8); Chris@4: ftab[j]++; Chris@4: } Chris@4: Chris@4: /*-- (emphasises close relationship of block & quadrant) --*/ Chris@4: for (i = 0; i < BZ_N_OVERSHOOT; i++) { Chris@4: block [nblock+i] = block[i]; Chris@4: quadrant[nblock+i] = 0; Chris@4: } Chris@4: Chris@4: if (verb >= 4) VPrintf0 ( " bucket sorting ...\n" ); Chris@4: Chris@4: /*-- Complete the initial radix sort --*/ Chris@4: for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1]; Chris@4: Chris@4: s = block[0] << 8; Chris@4: i = nblock-1; Chris@4: for (; i >= 3; i -= 4) { Chris@4: s = (s >> 8) | (block[i] << 8); Chris@4: j = ftab[s] -1; Chris@4: ftab[s] = j; Chris@4: ptr[j] = i; Chris@4: s = (s >> 8) | (block[i-1] << 8); Chris@4: j = ftab[s] -1; Chris@4: ftab[s] = j; Chris@4: ptr[j] = i-1; Chris@4: s = (s >> 8) | (block[i-2] << 8); Chris@4: j = ftab[s] -1; Chris@4: ftab[s] = j; Chris@4: ptr[j] = i-2; Chris@4: s = (s >> 8) | (block[i-3] << 8); Chris@4: j = ftab[s] -1; Chris@4: ftab[s] = j; Chris@4: ptr[j] = i-3; Chris@4: } Chris@4: for (; i >= 0; i--) { Chris@4: s = (s >> 8) | (block[i] << 8); Chris@4: j = ftab[s] -1; Chris@4: ftab[s] = j; Chris@4: ptr[j] = i; Chris@4: } Chris@4: Chris@4: /*-- Chris@4: Now ftab contains the first loc of every small bucket. Chris@4: Calculate the running order, from smallest to largest Chris@4: big bucket. Chris@4: --*/ Chris@4: for (i = 0; i <= 255; i++) { Chris@4: bigDone [i] = False; Chris@4: runningOrder[i] = i; Chris@4: } Chris@4: Chris@4: { Chris@4: Int32 vv; Chris@4: Int32 h = 1; Chris@4: do h = 3 * h + 1; while (h <= 256); Chris@4: do { Chris@4: h = h / 3; Chris@4: for (i = h; i <= 255; i++) { Chris@4: vv = runningOrder[i]; Chris@4: j = i; Chris@4: while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) { Chris@4: runningOrder[j] = runningOrder[j-h]; Chris@4: j = j - h; Chris@4: if (j <= (h - 1)) goto zero; Chris@4: } Chris@4: zero: Chris@4: runningOrder[j] = vv; Chris@4: } Chris@4: } while (h != 1); Chris@4: } Chris@4: Chris@4: /*-- Chris@4: The main sorting loop. Chris@4: --*/ Chris@4: Chris@4: numQSorted = 0; Chris@4: Chris@4: for (i = 0; i <= 255; i++) { Chris@4: Chris@4: /*-- Chris@4: Process big buckets, starting with the least full. Chris@4: Basically this is a 3-step process in which we call Chris@4: mainQSort3 to sort the small buckets [ss, j], but Chris@4: also make a big effort to avoid the calls if we can. Chris@4: --*/ Chris@4: ss = runningOrder[i]; Chris@4: Chris@4: /*-- Chris@4: Step 1: Chris@4: Complete the big bucket [ss] by quicksorting Chris@4: any unsorted small buckets [ss, j], for j != ss. Chris@4: Hopefully previous pointer-scanning phases have already Chris@4: completed many of the small buckets [ss, j], so Chris@4: we don't have to sort them at all. Chris@4: --*/ Chris@4: for (j = 0; j <= 255; j++) { Chris@4: if (j != ss) { Chris@4: sb = (ss << 8) + j; Chris@4: if ( ! (ftab[sb] & SETMASK) ) { Chris@4: Int32 lo = ftab[sb] & CLEARMASK; Chris@4: Int32 hi = (ftab[sb+1] & CLEARMASK) - 1; Chris@4: if (hi > lo) { Chris@4: if (verb >= 4) Chris@4: VPrintf4 ( " qsort [0x%x, 0x%x] " Chris@4: "done %d this %d\n", Chris@4: ss, j, numQSorted, hi - lo + 1 ); Chris@4: mainQSort3 ( Chris@4: ptr, block, quadrant, nblock, Chris@4: lo, hi, BZ_N_RADIX, budget Chris@4: ); Chris@4: numQSorted += (hi - lo + 1); Chris@4: if (*budget < 0) return; Chris@4: } Chris@4: } Chris@4: ftab[sb] |= SETMASK; Chris@4: } Chris@4: } Chris@4: Chris@4: AssertH ( !bigDone[ss], 1006 ); Chris@4: Chris@4: /*-- Chris@4: Step 2: Chris@4: Now scan this big bucket [ss] so as to synthesise the Chris@4: sorted order for small buckets [t, ss] for all t, Chris@4: including, magically, the bucket [ss,ss] too. Chris@4: This will avoid doing Real Work in subsequent Step 1's. Chris@4: --*/ Chris@4: { Chris@4: for (j = 0; j <= 255; j++) { Chris@4: copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK; Chris@4: copyEnd [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1; Chris@4: } Chris@4: for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) { Chris@4: k = ptr[j]-1; if (k < 0) k += nblock; Chris@4: c1 = block[k]; Chris@4: if (!bigDone[c1]) Chris@4: ptr[ copyStart[c1]++ ] = k; Chris@4: } Chris@4: for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) { Chris@4: k = ptr[j]-1; if (k < 0) k += nblock; Chris@4: c1 = block[k]; Chris@4: if (!bigDone[c1]) Chris@4: ptr[ copyEnd[c1]-- ] = k; Chris@4: } Chris@4: } Chris@4: Chris@4: AssertH ( (copyStart[ss]-1 == copyEnd[ss]) Chris@4: || Chris@4: /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1. Chris@4: Necessity for this case is demonstrated by compressing Chris@4: a sequence of approximately 48.5 million of character Chris@4: 251; 1.0.0/1.0.1 will then die here. */ Chris@4: (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), Chris@4: 1007 ) Chris@4: Chris@4: for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK; Chris@4: Chris@4: /*-- Chris@4: Step 3: Chris@4: The [ss] big bucket is now done. Record this fact, Chris@4: and update the quadrant descriptors. Remember to Chris@4: update quadrants in the overshoot area too, if Chris@4: necessary. The "if (i < 255)" test merely skips Chris@4: this updating for the last bucket processed, since Chris@4: updating for the last bucket is pointless. Chris@4: Chris@4: The quadrant array provides a way to incrementally Chris@4: cache sort orderings, as they appear, so as to Chris@4: make subsequent comparisons in fullGtU() complete Chris@4: faster. For repetitive blocks this makes a big Chris@4: difference (but not big enough to be able to avoid Chris@4: the fallback sorting mechanism, exponential radix sort). Chris@4: Chris@4: The precise meaning is: at all times: Chris@4: Chris@4: for 0 <= i < nblock and 0 <= j <= nblock Chris@4: Chris@4: if block[i] != block[j], Chris@4: Chris@4: then the relative values of quadrant[i] and Chris@4: quadrant[j] are meaningless. Chris@4: Chris@4: else { Chris@4: if quadrant[i] < quadrant[j] Chris@4: then the string starting at i lexicographically Chris@4: precedes the string starting at j Chris@4: Chris@4: else if quadrant[i] > quadrant[j] Chris@4: then the string starting at j lexicographically Chris@4: precedes the string starting at i Chris@4: Chris@4: else Chris@4: the relative ordering of the strings starting Chris@4: at i and j has not yet been determined. Chris@4: } Chris@4: --*/ Chris@4: bigDone[ss] = True; Chris@4: Chris@4: if (i < 255) { Chris@4: Int32 bbStart = ftab[ss << 8] & CLEARMASK; Chris@4: Int32 bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart; Chris@4: Int32 shifts = 0; Chris@4: Chris@4: while ((bbSize >> shifts) > 65534) shifts++; Chris@4: Chris@4: for (j = bbSize-1; j >= 0; j--) { Chris@4: Int32 a2update = ptr[bbStart + j]; Chris@4: UInt16 qVal = (UInt16)(j >> shifts); Chris@4: quadrant[a2update] = qVal; Chris@4: if (a2update < BZ_N_OVERSHOOT) Chris@4: quadrant[a2update + nblock] = qVal; Chris@4: } Chris@4: AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 ); Chris@4: } Chris@4: Chris@4: } Chris@4: Chris@4: if (verb >= 4) Chris@4: VPrintf3 ( " %d pointers, %d sorted, %d scanned\n", Chris@4: nblock, numQSorted, nblock - numQSorted ); Chris@4: } Chris@4: Chris@4: #undef BIGFREQ Chris@4: #undef SETMASK Chris@4: #undef CLEARMASK Chris@4: Chris@4: Chris@4: /*---------------------------------------------*/ Chris@4: /* Pre: Chris@4: nblock > 0 Chris@4: arr2 exists for [0 .. nblock-1 +N_OVERSHOOT] Chris@4: ((UChar*)arr2) [0 .. nblock-1] holds block Chris@4: arr1 exists for [0 .. nblock-1] Chris@4: Chris@4: Post: Chris@4: ((UChar*)arr2) [0 .. nblock-1] holds block Chris@4: All other areas of block destroyed Chris@4: ftab [ 0 .. 65536 ] destroyed Chris@4: arr1 [0 .. nblock-1] holds sorted order Chris@4: */ Chris@4: void BZ2_blockSort ( EState* s ) Chris@4: { Chris@4: UInt32* ptr = s->ptr; Chris@4: UChar* block = s->block; Chris@4: UInt32* ftab = s->ftab; Chris@4: Int32 nblock = s->nblock; Chris@4: Int32 verb = s->verbosity; Chris@4: Int32 wfact = s->workFactor; Chris@4: UInt16* quadrant; Chris@4: Int32 budget; Chris@4: Int32 budgetInit; Chris@4: Int32 i; Chris@4: Chris@4: if (nblock < 10000) { Chris@4: fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb ); Chris@4: } else { Chris@4: /* Calculate the location for quadrant, remembering to get Chris@4: the alignment right. Assumes that &(block[0]) is at least Chris@4: 2-byte aligned -- this should be ok since block is really Chris@4: the first section of arr2. Chris@4: */ Chris@4: i = nblock+BZ_N_OVERSHOOT; Chris@4: if (i & 1) i++; Chris@4: quadrant = (UInt16*)(&(block[i])); Chris@4: Chris@4: /* (wfact-1) / 3 puts the default-factor-30 Chris@4: transition point at very roughly the same place as Chris@4: with v0.1 and v0.9.0. Chris@4: Not that it particularly matters any more, since the Chris@4: resulting compressed stream is now the same regardless Chris@4: of whether or not we use the main sort or fallback sort. Chris@4: */ Chris@4: if (wfact < 1 ) wfact = 1; Chris@4: if (wfact > 100) wfact = 100; Chris@4: budgetInit = nblock * ((wfact-1) / 3); Chris@4: budget = budgetInit; Chris@4: Chris@4: mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget ); Chris@4: if (verb >= 3) Chris@4: VPrintf3 ( " %d work, %d block, ratio %5.2f\n", Chris@4: budgetInit - budget, Chris@4: nblock, Chris@4: (float)(budgetInit - budget) / Chris@4: (float)(nblock==0 ? 1 : nblock) ); Chris@4: if (budget < 0) { Chris@4: if (verb >= 2) Chris@4: VPrintf0 ( " too repetitive; using fallback" Chris@4: " sorting algorithm\n" ); Chris@4: fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb ); Chris@4: } Chris@4: } Chris@4: Chris@4: s->origPtr = -1; Chris@4: for (i = 0; i < s->nblock; i++) Chris@4: if (ptr[i] == 0) Chris@4: { s->origPtr = i; break; }; Chris@4: Chris@4: AssertH( s->origPtr != -1, 1003 ); Chris@4: } Chris@4: Chris@4: Chris@4: /*-------------------------------------------------------------*/ Chris@4: /*--- end blocksort.c ---*/ Chris@4: /*-------------------------------------------------------------*/