annotate src/zlib-1.2.7/contrib/asm686/match.S @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents e13257ea84a4
children
rev   line source
Chris@4 1 /* match.S -- x86 assembly version of the zlib longest_match() function.
Chris@4 2 * Optimized for the Intel 686 chips (PPro and later).
Chris@4 3 *
Chris@4 4 * Copyright (C) 1998, 2007 Brian Raiter <breadbox@muppetlabs.com>
Chris@4 5 *
Chris@4 6 * This software is provided 'as-is', without any express or implied
Chris@4 7 * warranty. In no event will the author be held liable for any damages
Chris@4 8 * arising from the use of this software.
Chris@4 9 *
Chris@4 10 * Permission is granted to anyone to use this software for any purpose,
Chris@4 11 * including commercial applications, and to alter it and redistribute it
Chris@4 12 * freely, subject to the following restrictions:
Chris@4 13 *
Chris@4 14 * 1. The origin of this software must not be misrepresented; you must not
Chris@4 15 * claim that you wrote the original software. If you use this software
Chris@4 16 * in a product, an acknowledgment in the product documentation would be
Chris@4 17 * appreciated but is not required.
Chris@4 18 * 2. Altered source versions must be plainly marked as such, and must not be
Chris@4 19 * misrepresented as being the original software.
Chris@4 20 * 3. This notice may not be removed or altered from any source distribution.
Chris@4 21 */
Chris@4 22
Chris@4 23 #ifndef NO_UNDERLINE
Chris@4 24 #define match_init _match_init
Chris@4 25 #define longest_match _longest_match
Chris@4 26 #endif
Chris@4 27
Chris@4 28 #define MAX_MATCH (258)
Chris@4 29 #define MIN_MATCH (3)
Chris@4 30 #define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1)
Chris@4 31 #define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7)
Chris@4 32
Chris@4 33 /* stack frame offsets */
Chris@4 34
Chris@4 35 #define chainlenwmask 0 /* high word: current chain len */
Chris@4 36 /* low word: s->wmask */
Chris@4 37 #define window 4 /* local copy of s->window */
Chris@4 38 #define windowbestlen 8 /* s->window + bestlen */
Chris@4 39 #define scanstart 16 /* first two bytes of string */
Chris@4 40 #define scanend 12 /* last two bytes of string */
Chris@4 41 #define scanalign 20 /* dword-misalignment of string */
Chris@4 42 #define nicematch 24 /* a good enough match size */
Chris@4 43 #define bestlen 28 /* size of best match so far */
Chris@4 44 #define scan 32 /* ptr to string wanting match */
Chris@4 45
Chris@4 46 #define LocalVarsSize (36)
Chris@4 47 /* saved ebx 36 */
Chris@4 48 /* saved edi 40 */
Chris@4 49 /* saved esi 44 */
Chris@4 50 /* saved ebp 48 */
Chris@4 51 /* return address 52 */
Chris@4 52 #define deflatestate 56 /* the function arguments */
Chris@4 53 #define curmatch 60
Chris@4 54
Chris@4 55 /* All the +zlib1222add offsets are due to the addition of fields
Chris@4 56 * in zlib in the deflate_state structure since the asm code was first written
Chris@4 57 * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
Chris@4 58 * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
Chris@4 59 * if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
Chris@4 60 */
Chris@4 61
Chris@4 62 #define zlib1222add (8)
Chris@4 63
Chris@4 64 #define dsWSize (36+zlib1222add)
Chris@4 65 #define dsWMask (44+zlib1222add)
Chris@4 66 #define dsWindow (48+zlib1222add)
Chris@4 67 #define dsPrev (56+zlib1222add)
Chris@4 68 #define dsMatchLen (88+zlib1222add)
Chris@4 69 #define dsPrevMatch (92+zlib1222add)
Chris@4 70 #define dsStrStart (100+zlib1222add)
Chris@4 71 #define dsMatchStart (104+zlib1222add)
Chris@4 72 #define dsLookahead (108+zlib1222add)
Chris@4 73 #define dsPrevLen (112+zlib1222add)
Chris@4 74 #define dsMaxChainLen (116+zlib1222add)
Chris@4 75 #define dsGoodMatch (132+zlib1222add)
Chris@4 76 #define dsNiceMatch (136+zlib1222add)
Chris@4 77
Chris@4 78
Chris@4 79 .file "match.S"
Chris@4 80
Chris@4 81 .globl match_init, longest_match
Chris@4 82
Chris@4 83 .text
Chris@4 84
Chris@4 85 /* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
Chris@4 86 .cfi_sections .debug_frame
Chris@4 87
Chris@4 88 longest_match:
Chris@4 89
Chris@4 90 .cfi_startproc
Chris@4 91 /* Save registers that the compiler may be using, and adjust %esp to */
Chris@4 92 /* make room for our stack frame. */
Chris@4 93
Chris@4 94 pushl %ebp
Chris@4 95 .cfi_def_cfa_offset 8
Chris@4 96 .cfi_offset ebp, -8
Chris@4 97 pushl %edi
Chris@4 98 .cfi_def_cfa_offset 12
Chris@4 99 pushl %esi
Chris@4 100 .cfi_def_cfa_offset 16
Chris@4 101 pushl %ebx
Chris@4 102 .cfi_def_cfa_offset 20
Chris@4 103 subl $LocalVarsSize, %esp
Chris@4 104 .cfi_def_cfa_offset LocalVarsSize+20
Chris@4 105
Chris@4 106 /* Retrieve the function arguments. %ecx will hold cur_match */
Chris@4 107 /* throughout the entire function. %edx will hold the pointer to the */
Chris@4 108 /* deflate_state structure during the function's setup (before */
Chris@4 109 /* entering the main loop). */
Chris@4 110
Chris@4 111 movl deflatestate(%esp), %edx
Chris@4 112 movl curmatch(%esp), %ecx
Chris@4 113
Chris@4 114 /* uInt wmask = s->w_mask; */
Chris@4 115 /* unsigned chain_length = s->max_chain_length; */
Chris@4 116 /* if (s->prev_length >= s->good_match) { */
Chris@4 117 /* chain_length >>= 2; */
Chris@4 118 /* } */
Chris@4 119
Chris@4 120 movl dsPrevLen(%edx), %eax
Chris@4 121 movl dsGoodMatch(%edx), %ebx
Chris@4 122 cmpl %ebx, %eax
Chris@4 123 movl dsWMask(%edx), %eax
Chris@4 124 movl dsMaxChainLen(%edx), %ebx
Chris@4 125 jl LastMatchGood
Chris@4 126 shrl $2, %ebx
Chris@4 127 LastMatchGood:
Chris@4 128
Chris@4 129 /* chainlen is decremented once beforehand so that the function can */
Chris@4 130 /* use the sign flag instead of the zero flag for the exit test. */
Chris@4 131 /* It is then shifted into the high word, to make room for the wmask */
Chris@4 132 /* value, which it will always accompany. */
Chris@4 133
Chris@4 134 decl %ebx
Chris@4 135 shll $16, %ebx
Chris@4 136 orl %eax, %ebx
Chris@4 137 movl %ebx, chainlenwmask(%esp)
Chris@4 138
Chris@4 139 /* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */
Chris@4 140
Chris@4 141 movl dsNiceMatch(%edx), %eax
Chris@4 142 movl dsLookahead(%edx), %ebx
Chris@4 143 cmpl %eax, %ebx
Chris@4 144 jl LookaheadLess
Chris@4 145 movl %eax, %ebx
Chris@4 146 LookaheadLess: movl %ebx, nicematch(%esp)
Chris@4 147
Chris@4 148 /* register Bytef *scan = s->window + s->strstart; */
Chris@4 149
Chris@4 150 movl dsWindow(%edx), %esi
Chris@4 151 movl %esi, window(%esp)
Chris@4 152 movl dsStrStart(%edx), %ebp
Chris@4 153 lea (%esi,%ebp), %edi
Chris@4 154 movl %edi, scan(%esp)
Chris@4 155
Chris@4 156 /* Determine how many bytes the scan ptr is off from being */
Chris@4 157 /* dword-aligned. */
Chris@4 158
Chris@4 159 movl %edi, %eax
Chris@4 160 negl %eax
Chris@4 161 andl $3, %eax
Chris@4 162 movl %eax, scanalign(%esp)
Chris@4 163
Chris@4 164 /* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
Chris@4 165 /* s->strstart - (IPos)MAX_DIST(s) : NIL; */
Chris@4 166
Chris@4 167 movl dsWSize(%edx), %eax
Chris@4 168 subl $MIN_LOOKAHEAD, %eax
Chris@4 169 subl %eax, %ebp
Chris@4 170 jg LimitPositive
Chris@4 171 xorl %ebp, %ebp
Chris@4 172 LimitPositive:
Chris@4 173
Chris@4 174 /* int best_len = s->prev_length; */
Chris@4 175
Chris@4 176 movl dsPrevLen(%edx), %eax
Chris@4 177 movl %eax, bestlen(%esp)
Chris@4 178
Chris@4 179 /* Store the sum of s->window + best_len in %esi locally, and in %esi. */
Chris@4 180
Chris@4 181 addl %eax, %esi
Chris@4 182 movl %esi, windowbestlen(%esp)
Chris@4 183
Chris@4 184 /* register ush scan_start = *(ushf*)scan; */
Chris@4 185 /* register ush scan_end = *(ushf*)(scan+best_len-1); */
Chris@4 186 /* Posf *prev = s->prev; */
Chris@4 187
Chris@4 188 movzwl (%edi), %ebx
Chris@4 189 movl %ebx, scanstart(%esp)
Chris@4 190 movzwl -1(%edi,%eax), %ebx
Chris@4 191 movl %ebx, scanend(%esp)
Chris@4 192 movl dsPrev(%edx), %edi
Chris@4 193
Chris@4 194 /* Jump into the main loop. */
Chris@4 195
Chris@4 196 movl chainlenwmask(%esp), %edx
Chris@4 197 jmp LoopEntry
Chris@4 198
Chris@4 199 .balign 16
Chris@4 200
Chris@4 201 /* do {
Chris@4 202 * match = s->window + cur_match;
Chris@4 203 * if (*(ushf*)(match+best_len-1) != scan_end ||
Chris@4 204 * *(ushf*)match != scan_start) continue;
Chris@4 205 * [...]
Chris@4 206 * } while ((cur_match = prev[cur_match & wmask]) > limit
Chris@4 207 * && --chain_length != 0);
Chris@4 208 *
Chris@4 209 * Here is the inner loop of the function. The function will spend the
Chris@4 210 * majority of its time in this loop, and majority of that time will
Chris@4 211 * be spent in the first ten instructions.
Chris@4 212 *
Chris@4 213 * Within this loop:
Chris@4 214 * %ebx = scanend
Chris@4 215 * %ecx = curmatch
Chris@4 216 * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
Chris@4 217 * %esi = windowbestlen - i.e., (window + bestlen)
Chris@4 218 * %edi = prev
Chris@4 219 * %ebp = limit
Chris@4 220 */
Chris@4 221 LookupLoop:
Chris@4 222 andl %edx, %ecx
Chris@4 223 movzwl (%edi,%ecx,2), %ecx
Chris@4 224 cmpl %ebp, %ecx
Chris@4 225 jbe LeaveNow
Chris@4 226 subl $0x00010000, %edx
Chris@4 227 js LeaveNow
Chris@4 228 LoopEntry: movzwl -1(%esi,%ecx), %eax
Chris@4 229 cmpl %ebx, %eax
Chris@4 230 jnz LookupLoop
Chris@4 231 movl window(%esp), %eax
Chris@4 232 movzwl (%eax,%ecx), %eax
Chris@4 233 cmpl scanstart(%esp), %eax
Chris@4 234 jnz LookupLoop
Chris@4 235
Chris@4 236 /* Store the current value of chainlen. */
Chris@4 237
Chris@4 238 movl %edx, chainlenwmask(%esp)
Chris@4 239
Chris@4 240 /* Point %edi to the string under scrutiny, and %esi to the string we */
Chris@4 241 /* are hoping to match it up with. In actuality, %esi and %edi are */
Chris@4 242 /* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */
Chris@4 243 /* initialized to -(MAX_MATCH_8 - scanalign). */
Chris@4 244
Chris@4 245 movl window(%esp), %esi
Chris@4 246 movl scan(%esp), %edi
Chris@4 247 addl %ecx, %esi
Chris@4 248 movl scanalign(%esp), %eax
Chris@4 249 movl $(-MAX_MATCH_8), %edx
Chris@4 250 lea MAX_MATCH_8(%edi,%eax), %edi
Chris@4 251 lea MAX_MATCH_8(%esi,%eax), %esi
Chris@4 252
Chris@4 253 /* Test the strings for equality, 8 bytes at a time. At the end,
Chris@4 254 * adjust %edx so that it is offset to the exact byte that mismatched.
Chris@4 255 *
Chris@4 256 * We already know at this point that the first three bytes of the
Chris@4 257 * strings match each other, and they can be safely passed over before
Chris@4 258 * starting the compare loop. So what this code does is skip over 0-3
Chris@4 259 * bytes, as much as necessary in order to dword-align the %edi
Chris@4 260 * pointer. (%esi will still be misaligned three times out of four.)
Chris@4 261 *
Chris@4 262 * It should be confessed that this loop usually does not represent
Chris@4 263 * much of the total running time. Replacing it with a more
Chris@4 264 * straightforward "rep cmpsb" would not drastically degrade
Chris@4 265 * performance.
Chris@4 266 */
Chris@4 267 LoopCmps:
Chris@4 268 movl (%esi,%edx), %eax
Chris@4 269 xorl (%edi,%edx), %eax
Chris@4 270 jnz LeaveLoopCmps
Chris@4 271 movl 4(%esi,%edx), %eax
Chris@4 272 xorl 4(%edi,%edx), %eax
Chris@4 273 jnz LeaveLoopCmps4
Chris@4 274 addl $8, %edx
Chris@4 275 jnz LoopCmps
Chris@4 276 jmp LenMaximum
Chris@4 277 LeaveLoopCmps4: addl $4, %edx
Chris@4 278 LeaveLoopCmps: testl $0x0000FFFF, %eax
Chris@4 279 jnz LenLower
Chris@4 280 addl $2, %edx
Chris@4 281 shrl $16, %eax
Chris@4 282 LenLower: subb $1, %al
Chris@4 283 adcl $0, %edx
Chris@4 284
Chris@4 285 /* Calculate the length of the match. If it is longer than MAX_MATCH, */
Chris@4 286 /* then automatically accept it as the best possible match and leave. */
Chris@4 287
Chris@4 288 lea (%edi,%edx), %eax
Chris@4 289 movl scan(%esp), %edi
Chris@4 290 subl %edi, %eax
Chris@4 291 cmpl $MAX_MATCH, %eax
Chris@4 292 jge LenMaximum
Chris@4 293
Chris@4 294 /* If the length of the match is not longer than the best match we */
Chris@4 295 /* have so far, then forget it and return to the lookup loop. */
Chris@4 296
Chris@4 297 movl deflatestate(%esp), %edx
Chris@4 298 movl bestlen(%esp), %ebx
Chris@4 299 cmpl %ebx, %eax
Chris@4 300 jg LongerMatch
Chris@4 301 movl windowbestlen(%esp), %esi
Chris@4 302 movl dsPrev(%edx), %edi
Chris@4 303 movl scanend(%esp), %ebx
Chris@4 304 movl chainlenwmask(%esp), %edx
Chris@4 305 jmp LookupLoop
Chris@4 306
Chris@4 307 /* s->match_start = cur_match; */
Chris@4 308 /* best_len = len; */
Chris@4 309 /* if (len >= nice_match) break; */
Chris@4 310 /* scan_end = *(ushf*)(scan+best_len-1); */
Chris@4 311
Chris@4 312 LongerMatch: movl nicematch(%esp), %ebx
Chris@4 313 movl %eax, bestlen(%esp)
Chris@4 314 movl %ecx, dsMatchStart(%edx)
Chris@4 315 cmpl %ebx, %eax
Chris@4 316 jge LeaveNow
Chris@4 317 movl window(%esp), %esi
Chris@4 318 addl %eax, %esi
Chris@4 319 movl %esi, windowbestlen(%esp)
Chris@4 320 movzwl -1(%edi,%eax), %ebx
Chris@4 321 movl dsPrev(%edx), %edi
Chris@4 322 movl %ebx, scanend(%esp)
Chris@4 323 movl chainlenwmask(%esp), %edx
Chris@4 324 jmp LookupLoop
Chris@4 325
Chris@4 326 /* Accept the current string, with the maximum possible length. */
Chris@4 327
Chris@4 328 LenMaximum: movl deflatestate(%esp), %edx
Chris@4 329 movl $MAX_MATCH, bestlen(%esp)
Chris@4 330 movl %ecx, dsMatchStart(%edx)
Chris@4 331
Chris@4 332 /* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
Chris@4 333 /* return s->lookahead; */
Chris@4 334
Chris@4 335 LeaveNow:
Chris@4 336 movl deflatestate(%esp), %edx
Chris@4 337 movl bestlen(%esp), %ebx
Chris@4 338 movl dsLookahead(%edx), %eax
Chris@4 339 cmpl %eax, %ebx
Chris@4 340 jg LookaheadRet
Chris@4 341 movl %ebx, %eax
Chris@4 342 LookaheadRet:
Chris@4 343
Chris@4 344 /* Restore the stack and return from whence we came. */
Chris@4 345
Chris@4 346 addl $LocalVarsSize, %esp
Chris@4 347 .cfi_def_cfa_offset 20
Chris@4 348 popl %ebx
Chris@4 349 .cfi_def_cfa_offset 16
Chris@4 350 popl %esi
Chris@4 351 .cfi_def_cfa_offset 12
Chris@4 352 popl %edi
Chris@4 353 .cfi_def_cfa_offset 8
Chris@4 354 popl %ebp
Chris@4 355 .cfi_def_cfa_offset 4
Chris@4 356 .cfi_endproc
Chris@4 357 match_init: ret