annotate src/zlib-1.2.7/contrib/masmx64/inffas8664.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents e13257ea84a4
children
rev   line source
Chris@4 1 /* inffas8664.c is a hand tuned assembler version of inffast.c - fast decoding
Chris@4 2 * version for AMD64 on Windows using Microsoft C compiler
Chris@4 3 *
Chris@4 4 * Copyright (C) 1995-2003 Mark Adler
Chris@4 5 * For conditions of distribution and use, see copyright notice in zlib.h
Chris@4 6 *
Chris@4 7 * Copyright (C) 2003 Chris Anderson <christop@charm.net>
Chris@4 8 * Please use the copyright conditions above.
Chris@4 9 *
Chris@4 10 * 2005 - Adaptation to Microsoft C Compiler for AMD64 by Gilles Vollant
Chris@4 11 *
Chris@4 12 * inffas8664.c calls the function inffas8664fnc in inffasx64.asm
Chris@4 13 * inffasx64.asm is automatically converted from the AMD64 portion of inffas86.c
Chris@4 14 *
Chris@4 15 * Dec-29-2003 -- I added AMD64 inflate asm support. This version is also
Chris@4 16 * slightly quicker on x86 systems because, instead of using rep movsb to copy
Chris@4 17 * data, it uses rep movsw, which moves data in 2-byte chunks instead of single
Chris@4 18 * bytes. I've tested the AMD64 code on a Fedora Core 1 + the x86_64 updates
Chris@4 19 * from http://fedora.linux.duke.edu/fc1_x86_64
Chris@4 20 * which is running on an Athlon 64 3000+ / Gigabyte GA-K8VT800M system with
Chris@4 21 * 1GB ram. The 64-bit version is about 4% faster than the 32-bit version,
Chris@4 22 * when decompressing mozilla-source-1.3.tar.gz.
Chris@4 23 *
Chris@4 24 * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from
Chris@4 25 * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at
Chris@4 26 * the moment. I have successfully compiled and tested this code with gcc2.96,
Chris@4 27 * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S
Chris@4 28 * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX
Chris@4 29 * enabled. I will attempt to merge the MMX code into this version. Newer
Chris@4 30 * versions of this and inffast.S can be found at
Chris@4 31 * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/
Chris@4 32 *
Chris@4 33 */
Chris@4 34
Chris@4 35 #include <stdio.h>
Chris@4 36 #include "zutil.h"
Chris@4 37 #include "inftrees.h"
Chris@4 38 #include "inflate.h"
Chris@4 39 #include "inffast.h"
Chris@4 40
Chris@4 41 /* Mark Adler's comments from inffast.c: */
Chris@4 42
Chris@4 43 /*
Chris@4 44 Decode literal, length, and distance codes and write out the resulting
Chris@4 45 literal and match bytes until either not enough input or output is
Chris@4 46 available, an end-of-block is encountered, or a data error is encountered.
Chris@4 47 When large enough input and output buffers are supplied to inflate(), for
Chris@4 48 example, a 16K input buffer and a 64K output buffer, more than 95% of the
Chris@4 49 inflate execution time is spent in this routine.
Chris@4 50
Chris@4 51 Entry assumptions:
Chris@4 52
Chris@4 53 state->mode == LEN
Chris@4 54 strm->avail_in >= 6
Chris@4 55 strm->avail_out >= 258
Chris@4 56 start >= strm->avail_out
Chris@4 57 state->bits < 8
Chris@4 58
Chris@4 59 On return, state->mode is one of:
Chris@4 60
Chris@4 61 LEN -- ran out of enough output space or enough available input
Chris@4 62 TYPE -- reached end of block code, inflate() to interpret next block
Chris@4 63 BAD -- error in block data
Chris@4 64
Chris@4 65 Notes:
Chris@4 66
Chris@4 67 - The maximum input bits used by a length/distance pair is 15 bits for the
Chris@4 68 length code, 5 bits for the length extra, 15 bits for the distance code,
Chris@4 69 and 13 bits for the distance extra. This totals 48 bits, or six bytes.
Chris@4 70 Therefore if strm->avail_in >= 6, then there is enough input to avoid
Chris@4 71 checking for available input while decoding.
Chris@4 72
Chris@4 73 - The maximum bytes that a single length/distance pair can output is 258
Chris@4 74 bytes, which is the maximum length that can be coded. inflate_fast()
Chris@4 75 requires strm->avail_out >= 258 for each loop to avoid checking for
Chris@4 76 output space.
Chris@4 77 */
Chris@4 78
Chris@4 79
Chris@4 80
Chris@4 81 typedef struct inffast_ar { /* working record that inflate_fast() fills in and passes by address to inffas8664fnc(). NOTE(review): the offset columns suggest the assembly addresses fields by fixed offset - keep field order and types unchanged unless the .asm is updated too */
Chris@4 82 /* column key: byte offset in a 64-bit build, byte offset in a 32-bit build, then the x86 / x86_64 register noted for the value */
Chris@4 83 /* ar offset register */
Chris@4 84 /* 0 0 */ void *esp; /* esp save; not assigned by inflate_fast() */
Chris@4 85 /* 8 4 */ void *ebp; /* ebp save; not assigned by inflate_fast() */
Chris@4 86 /* 16 8 */ unsigned char FAR *in; /* esi rsi input cursor, starts at strm->next_in */
Chris@4 87 /* 24 12 */ unsigned char FAR *last; /* r9 while in < last; in + (avail_in - PAD_AVAIL_IN) */
Chris@4 88 /* 32 16 */ unsigned char FAR *out; /* edi rdi output cursor, starts at strm->next_out */
Chris@4 89 /* 40 20 */ unsigned char FAR *beg; /* inflate()'s init next_out: out - (start - avail_out) */
Chris@4 90 /* 48 24 */ unsigned char FAR *end; /* r10 while out < end; out + (avail_out - PAD_AVAIL_OUT) */
Chris@4 91 /* 56 28 */ unsigned char FAR *window;/* copy of the state->window pointer (its size is wsize below); presumably only meaningful when wsize != 0 - confirm */
Chris@4 92 /* 64 32 */ code const FAR *lcode; /* ebp rbp copy of state->lencode */
Chris@4 93 /* 72 36 */ code const FAR *dcode; /* r11 copy of state->distcode */
Chris@4 94 /* 80 40 */ size_t /*unsigned long */hold; /* edx rdx copy of state->hold (bit accumulator) */
Chris@4 95 /* 88 44 */ unsigned bits; /* ebx rbx copy of state->bits (bit count in hold) */
Chris@4 96 /* 92 48 */ unsigned wsize; /* window size, copy of state->wsize */
Chris@4 97 /* 96 52 */ unsigned write; /* window write index, copy of state->wnext */
Chris@4 98 /*100 56 */ unsigned lmask; /* r12 mask for lcode: (1U << state->lenbits) - 1 */
Chris@4 99 /*104 60 */ unsigned dmask; /* r13 mask for dcode: (1U << state->distbits) - 1 */
Chris@4 100 /*108 64 */ unsigned len; /* r14 match length; inflate_fast() also reuses it as a byte count on exit */
Chris@4 101 /*112 68 */ unsigned dist; /* r15 match distance */
Chris@4 102 /*116 72 */ unsigned status; /* set when state changes: 1 -> mode TYPE, >1 -> mode BAD (2, 3, other select the message) */
Chris@4 103 } type_ar;
Chris@4 104 #ifdef ASMINF
Chris@4 105 /* inflate_fast(): copies the stream and inflate state into a type_ar record, runs inffas8664fnc() on it, then writes the resulting pointers, counts, bit buffer and mode back to strm and state. Returns nothing. */
Chris@4 106 void inflate_fast(strm, start)
Chris@4 107 z_streamp strm; /* stream to decode; next_in, avail_in, next_out, avail_out, msg and state are read and/or updated */
Chris@4 108 unsigned start; /* inflate()'s starting value for strm->avail_out */
Chris@4 109 {
Chris@4 110 struct inflate_state FAR *state;
Chris@4 111 type_ar ar;
Chris@4 112 void inffas8664fnc(struct inffast_ar * par); /* defined outside this file (inffasx64.asm according to the file header) */
Chris@4 113
Chris@4 114 /* PAD_AVAIL_IN / PAD_AVAIL_OUT are the margins subtracted from avail_in / avail_out to form ar.last / ar.end and added back on exit. */
Chris@4 115 /* They are 6 / 258 for gcc on amd64 or MSVC on AMD64, and 5 / 257 for any other target. */
Chris@4 116 #if (defined( __GNUC__ ) && defined( __amd64__ ) && ! defined( __i386 )) || (defined(_MSC_VER) && defined(_M_AMD64))
Chris@4 117 #define PAD_AVAIL_IN 6
Chris@4 118 #define PAD_AVAIL_OUT 258
Chris@4 119 #else
Chris@4 120 #define PAD_AVAIL_IN 5
Chris@4 121 #define PAD_AVAIL_OUT 257
Chris@4 122 #endif
Chris@4 123
Chris@4 124 /* copy state to local variables */
Chris@4 125 state = (struct inflate_state FAR *)strm->state;
Chris@4 126 /* NOTE(review): ar.esp, ar.ebp, ar.len, ar.dist and ar.status are not initialised here; they are presumably written by inffas8664fnc() - confirm in the .asm */
Chris@4 127 ar.in = strm->next_in;
Chris@4 128 ar.last = ar.in + (strm->avail_in - PAD_AVAIL_IN);
Chris@4 129 ar.out = strm->next_out;
Chris@4 130 ar.beg = ar.out - (start - strm->avail_out);
Chris@4 131 ar.end = ar.out + (strm->avail_out - PAD_AVAIL_OUT);
Chris@4 132 ar.wsize = state->wsize;
Chris@4 133 ar.write = state->wnext;
Chris@4 134 ar.window = state->window;
Chris@4 135 ar.hold = state->hold;
Chris@4 136 ar.bits = state->bits;
Chris@4 137 ar.lcode = state->lencode;
Chris@4 138 ar.dcode = state->distcode;
Chris@4 139 ar.lmask = (1U << state->lenbits) - 1;
Chris@4 140 ar.dmask = (1U << state->distbits) - 1;
Chris@4 141
Chris@4 142 /* decode literals and length/distances until end-of-block or not enough
Chris@4 143 input data or output space */
Chris@4 144
Chris@4 145 /* align in on 1/2 hold size boundary: feed input bytes into hold one at a time until the address of ar.in is a multiple of sizeof(ar.hold) / 2 */
Chris@4 146 while (((size_t)(void *)ar.in & (sizeof(ar.hold) / 2 - 1)) != 0) {
Chris@4 147 ar.hold += (unsigned long)*ar.in++ << ar.bits; /* append the next byte above the bits already held */
Chris@4 148 ar.bits += 8;
Chris@4 149 }
Chris@4 150 /* run the external decoder on the record; ar.status is only read after this call, so it must be stored by the callee */
Chris@4 151 inffas8664fnc(&ar);
Chris@4 152 /* translate ar.status: >1 is a data error (message chosen by value), 1 selects mode TYPE, 0 leaves state->mode untouched */
Chris@4 153 if (ar.status > 1) {
Chris@4 154 if (ar.status == 2)
Chris@4 155 strm->msg = "invalid literal/length code";
Chris@4 156 else if (ar.status == 3)
Chris@4 157 strm->msg = "invalid distance code";
Chris@4 158 else
Chris@4 159 strm->msg = "invalid distance too far back";
Chris@4 160 state->mode = BAD;
Chris@4 161 }
Chris@4 162 else if ( ar.status == 1 ) {
Chris@4 163 state->mode = TYPE;
Chris@4 164 }
Chris@4 165
Chris@4 166 /* return unused bytes (on entry, bits < 8, so in won't go too far back) */
Chris@4 167 ar.len = ar.bits >> 3; /* whole bytes still sitting in the bit buffer */
Chris@4 168 ar.in -= ar.len; /* give those bytes back to the input */
Chris@4 169 ar.bits -= ar.len << 3; /* fewer than 8 bits remain */
Chris@4 170 ar.hold &= (1U << ar.bits) - 1; /* keep only the remaining low bits */
Chris@4 171
Chris@4 172 /* update state and return; the counts are rebuilt from the distance to ar.last / ar.end, whichever side of them the cursor stopped on */
Chris@4 173 strm->next_in = ar.in;
Chris@4 174 strm->next_out = ar.out;
Chris@4 175 strm->avail_in = (unsigned)(ar.in < ar.last ?
Chris@4 176 PAD_AVAIL_IN + (ar.last - ar.in) :
Chris@4 177 PAD_AVAIL_IN - (ar.in - ar.last));
Chris@4 178 strm->avail_out = (unsigned)(ar.out < ar.end ?
Chris@4 179 PAD_AVAIL_OUT + (ar.end - ar.out) :
Chris@4 180 PAD_AVAIL_OUT - (ar.out - ar.end));
Chris@4 181 state->hold = (unsigned long)ar.hold; /* narrowing cast; ar.hold was masked to fewer than 8 bits above */
Chris@4 182 state->bits = ar.bits;
Chris@4 183 return;
Chris@4 184 }
Chris@4 185
Chris@4 186 #endif /* ASMINF */