annotate src/zlib-1.2.8/contrib/amd64/amd64-match.S @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 5ea0608b923f
children
rev   line source
Chris@43 1 /*
Chris@43 2 * match.S -- optimized version of longest_match()
Chris@43 3 * based on the similar work by Gilles Vollant, and Brian Raiter, written 1998
Chris@43 4 *
Chris@43 5 * This is free software; you can redistribute it and/or modify it
Chris@43 6 * under the terms of the BSD License. Use by owners of Che Guevarra
Chris@43 7 * parafernalia is prohibited, where possible, and highly discouraged
Chris@43 8 * elsewhere.
Chris@43 9 */
Chris@43 10
Chris@43 11 #ifndef NO_UNDERLINE
Chris@43 12 # define match_init _match_init
Chris@43 13 # define longest_match _longest_match
Chris@43 14 #endif
Chris@43 15
Chris@43 16 #define scanend ebx
Chris@43 17 #define scanendw bx
Chris@43 18 #define chainlenwmask edx /* high word: current chain len low word: s->wmask */
Chris@43 19 #define curmatch rsi
Chris@43 20 #define curmatchd esi
Chris@43 21 #define windowbestlen r8
Chris@43 22 #define scanalign r9
Chris@43 23 #define scanalignd r9d
Chris@43 24 #define window r10
Chris@43 25 #define bestlen r11
Chris@43 26 #define bestlend r11d
Chris@43 27 #define scanstart r12d
Chris@43 28 #define scanstartw r12w
Chris@43 29 #define scan r13
Chris@43 30 #define nicematch r14d
Chris@43 31 #define limit r15
Chris@43 32 #define limitd r15d
Chris@43 33 #define prev rcx
Chris@43 34
Chris@43 35 /*
Chris@43 36 * The 258 is a "magic number, not a parameter -- changing it
Chris@43 37 * breaks the hell loose
Chris@43 38 */
Chris@43 39 #define MAX_MATCH (258)
Chris@43 40 #define MIN_MATCH (3)
Chris@43 41 #define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1)
Chris@43 42 #define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7)
Chris@43 43
Chris@43 44 /* stack frame offsets */
Chris@43 45 #define LocalVarsSize (112)
Chris@43 46 #define _chainlenwmask ( 8-LocalVarsSize)(%rsp)
Chris@43 47 #define _windowbestlen (16-LocalVarsSize)(%rsp)
Chris@43 48 #define save_r14 (24-LocalVarsSize)(%rsp)
Chris@43 49 #define save_rsi (32-LocalVarsSize)(%rsp)
Chris@43 50 #define save_rbx (40-LocalVarsSize)(%rsp)
Chris@43 51 #define save_r12 (56-LocalVarsSize)(%rsp)
Chris@43 52 #define save_r13 (64-LocalVarsSize)(%rsp)
Chris@43 53 #define save_r15 (80-LocalVarsSize)(%rsp)
Chris@43 54
Chris@43 55
Chris@43 56 .globl match_init, longest_match
Chris@43 57
Chris@43 58 /*
Chris@43 59 * On AMD64 the first argument of a function (in our case -- the pointer to
Chris@43 60 * deflate_state structure) is passed in %rdi, hence our offsets below are
Chris@43 61 * all off of that.
Chris@43 62 */
Chris@43 63
Chris@43 64 /* you can check the structure offset by running
Chris@43 65
Chris@43 66 #include <stdlib.h>
Chris@43 67 #include <stdio.h>
Chris@43 68 #include "deflate.h"
Chris@43 69
Chris@43 70 void print_depl()
Chris@43 71 {
Chris@43 72 deflate_state ds;
Chris@43 73 deflate_state *s=&ds;
Chris@43 74 printf("size pointer=%u\n",(int)sizeof(void*));
Chris@43 75
Chris@43 76 printf("#define dsWSize (%3u)(%%rdi)\n",(int)(((char*)&(s->w_size))-((char*)s)));
Chris@43 77 printf("#define dsWMask (%3u)(%%rdi)\n",(int)(((char*)&(s->w_mask))-((char*)s)));
Chris@43 78 printf("#define dsWindow (%3u)(%%rdi)\n",(int)(((char*)&(s->window))-((char*)s)));
Chris@43 79 printf("#define dsPrev (%3u)(%%rdi)\n",(int)(((char*)&(s->prev))-((char*)s)));
Chris@43 80 printf("#define dsMatchLen (%3u)(%%rdi)\n",(int)(((char*)&(s->match_length))-((char*)s)));
Chris@43 81 printf("#define dsPrevMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_match))-((char*)s)));
Chris@43 82 printf("#define dsStrStart (%3u)(%%rdi)\n",(int)(((char*)&(s->strstart))-((char*)s)));
Chris@43 83 printf("#define dsMatchStart (%3u)(%%rdi)\n",(int)(((char*)&(s->match_start))-((char*)s)));
Chris@43 84 printf("#define dsLookahead (%3u)(%%rdi)\n",(int)(((char*)&(s->lookahead))-((char*)s)));
Chris@43 85 printf("#define dsPrevLen (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_length))-((char*)s)));
Chris@43 86 printf("#define dsMaxChainLen (%3u)(%%rdi)\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
Chris@43 87 printf("#define dsGoodMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->good_match))-((char*)s)));
Chris@43 88 printf("#define dsNiceMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->nice_match))-((char*)s)));
Chris@43 89 }
Chris@43 90
Chris@43 91 */
Chris@43 92
Chris@43 93
Chris@43 94 /*
Chris@43 95 to compile for XCode 3.2 on MacOSX x86_64
Chris@43 96 - run "gcc -g -c -DXCODE_MAC_X64_STRUCTURE amd64-match.S"
Chris@43 97 */
Chris@43 98
Chris@43 99
Chris@43 100 #ifndef CURRENT_LINX_XCODE_MAC_X64_STRUCTURE
Chris@43 101 #define dsWSize ( 68)(%rdi)
Chris@43 102 #define dsWMask ( 76)(%rdi)
Chris@43 103 #define dsWindow ( 80)(%rdi)
Chris@43 104 #define dsPrev ( 96)(%rdi)
Chris@43 105 #define dsMatchLen (144)(%rdi)
Chris@43 106 #define dsPrevMatch (148)(%rdi)
Chris@43 107 #define dsStrStart (156)(%rdi)
Chris@43 108 #define dsMatchStart (160)(%rdi)
Chris@43 109 #define dsLookahead (164)(%rdi)
Chris@43 110 #define dsPrevLen (168)(%rdi)
Chris@43 111 #define dsMaxChainLen (172)(%rdi)
Chris@43 112 #define dsGoodMatch (188)(%rdi)
Chris@43 113 #define dsNiceMatch (192)(%rdi)
Chris@43 114
Chris@43 115 #else
Chris@43 116
Chris@43 117 #ifndef STRUCT_OFFSET
Chris@43 118 # define STRUCT_OFFSET (0)
Chris@43 119 #endif
Chris@43 120
Chris@43 121
Chris@43 122 #define dsWSize ( 56 + STRUCT_OFFSET)(%rdi)
Chris@43 123 #define dsWMask ( 64 + STRUCT_OFFSET)(%rdi)
Chris@43 124 #define dsWindow ( 72 + STRUCT_OFFSET)(%rdi)
Chris@43 125 #define dsPrev ( 88 + STRUCT_OFFSET)(%rdi)
Chris@43 126 #define dsMatchLen (136 + STRUCT_OFFSET)(%rdi)
Chris@43 127 #define dsPrevMatch (140 + STRUCT_OFFSET)(%rdi)
Chris@43 128 #define dsStrStart (148 + STRUCT_OFFSET)(%rdi)
Chris@43 129 #define dsMatchStart (152 + STRUCT_OFFSET)(%rdi)
Chris@43 130 #define dsLookahead (156 + STRUCT_OFFSET)(%rdi)
Chris@43 131 #define dsPrevLen (160 + STRUCT_OFFSET)(%rdi)
Chris@43 132 #define dsMaxChainLen (164 + STRUCT_OFFSET)(%rdi)
Chris@43 133 #define dsGoodMatch (180 + STRUCT_OFFSET)(%rdi)
Chris@43 134 #define dsNiceMatch (184 + STRUCT_OFFSET)(%rdi)
Chris@43 135
Chris@43 136 #endif
Chris@43 137
Chris@43 138
Chris@43 139
Chris@43 140
Chris@43 141 .text
Chris@43 142
Chris@43 143 /* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
Chris@43 144
Chris@43 145 longest_match:
Chris@43 146 /*
Chris@43 147 * Retrieve the function arguments. %curmatch will hold cur_match
Chris@43 148 * throughout the entire function (passed via rsi on amd64).
Chris@43 149 * rdi will hold the pointer to the deflate_state (first arg on amd64)
Chris@43 150 */
Chris@43 151 mov %rsi, save_rsi
Chris@43 152 mov %rbx, save_rbx
Chris@43 153 mov %r12, save_r12
Chris@43 154 mov %r13, save_r13
Chris@43 155 mov %r14, save_r14
Chris@43 156 mov %r15, save_r15
Chris@43 157
Chris@43 158 /* uInt wmask = s->w_mask; */
Chris@43 159 /* unsigned chain_length = s->max_chain_length; */
Chris@43 160 /* if (s->prev_length >= s->good_match) { */
Chris@43 161 /* chain_length >>= 2; */
Chris@43 162 /* } */
Chris@43 163
Chris@43 164 movl dsPrevLen, %eax
Chris@43 165 movl dsGoodMatch, %ebx
Chris@43 166 cmpl %ebx, %eax
Chris@43 167 movl dsWMask, %eax
Chris@43 168 movl dsMaxChainLen, %chainlenwmask
Chris@43 169 jl LastMatchGood
Chris@43 170 shrl $2, %chainlenwmask
Chris@43 171 LastMatchGood:
Chris@43 172
Chris@43 173 /* chainlen is decremented once beforehand so that the function can */
Chris@43 174 /* use the sign flag instead of the zero flag for the exit test. */
Chris@43 175 /* It is then shifted into the high word, to make room for the wmask */
Chris@43 176 /* value, which it will always accompany. */
Chris@43 177
Chris@43 178 decl %chainlenwmask
Chris@43 179 shll $16, %chainlenwmask
Chris@43 180 orl %eax, %chainlenwmask
Chris@43 181
Chris@43 182 /* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */
Chris@43 183
Chris@43 184 movl dsNiceMatch, %eax
Chris@43 185 movl dsLookahead, %ebx
Chris@43 186 cmpl %eax, %ebx
Chris@43 187 jl LookaheadLess
Chris@43 188 movl %eax, %ebx
Chris@43 189 LookaheadLess: movl %ebx, %nicematch
Chris@43 190
Chris@43 191 /* register Bytef *scan = s->window + s->strstart; */
Chris@43 192
Chris@43 193 mov dsWindow, %window
Chris@43 194 movl dsStrStart, %limitd
Chris@43 195 lea (%limit, %window), %scan
Chris@43 196
Chris@43 197 /* Determine how many bytes the scan ptr is off from being */
Chris@43 198 /* dword-aligned. */
Chris@43 199
Chris@43 200 mov %scan, %scanalign
Chris@43 201 negl %scanalignd
Chris@43 202 andl $3, %scanalignd
Chris@43 203
Chris@43 204 /* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
Chris@43 205 /* s->strstart - (IPos)MAX_DIST(s) : NIL; */
Chris@43 206
Chris@43 207 movl dsWSize, %eax
Chris@43 208 subl $MIN_LOOKAHEAD, %eax
Chris@43 209 xorl %ecx, %ecx
Chris@43 210 subl %eax, %limitd
Chris@43 211 cmovng %ecx, %limitd
Chris@43 212
Chris@43 213 /* int best_len = s->prev_length; */
Chris@43 214
Chris@43 215 movl dsPrevLen, %bestlend
Chris@43 216
Chris@43 217 /* Store the sum of s->window + best_len in %windowbestlen locally, and in memory. */
Chris@43 218
Chris@43 219 lea (%window, %bestlen), %windowbestlen
Chris@43 220 mov %windowbestlen, _windowbestlen
Chris@43 221
Chris@43 222 /* register ush scan_start = *(ushf*)scan; */
Chris@43 223 /* register ush scan_end = *(ushf*)(scan+best_len-1); */
Chris@43 224 /* Posf *prev = s->prev; */
Chris@43 225
Chris@43 226 movzwl (%scan), %scanstart
Chris@43 227 movzwl -1(%scan, %bestlen), %scanend
Chris@43 228 mov dsPrev, %prev
Chris@43 229
Chris@43 230 /* Jump into the main loop. */
Chris@43 231
Chris@43 232 movl %chainlenwmask, _chainlenwmask
Chris@43 233 jmp LoopEntry
Chris@43 234
Chris@43 235 .balign 16
Chris@43 236
Chris@43 237 /* do {
Chris@43 238 * match = s->window + cur_match;
Chris@43 239 * if (*(ushf*)(match+best_len-1) != scan_end ||
Chris@43 240 * *(ushf*)match != scan_start) continue;
Chris@43 241 * [...]
Chris@43 242 * } while ((cur_match = prev[cur_match & wmask]) > limit
Chris@43 243 * && --chain_length != 0);
Chris@43 244 *
Chris@43 245 * Here is the inner loop of the function. The function will spend the
Chris@43 246 * majority of its time in this loop, and majority of that time will
Chris@43 247 * be spent in the first ten instructions.
Chris@43 248 */
Chris@43 249 LookupLoop:
Chris@43 250 andl %chainlenwmask, %curmatchd
Chris@43 251 movzwl (%prev, %curmatch, 2), %curmatchd
Chris@43 252 cmpl %limitd, %curmatchd
Chris@43 253 jbe LeaveNow
Chris@43 254 subl $0x00010000, %chainlenwmask
Chris@43 255 js LeaveNow
Chris@43 256 LoopEntry: cmpw -1(%windowbestlen, %curmatch), %scanendw
Chris@43 257 jne LookupLoop
Chris@43 258 cmpw %scanstartw, (%window, %curmatch)
Chris@43 259 jne LookupLoop
Chris@43 260
Chris@43 261 /* Store the current value of chainlen. */
Chris@43 262 movl %chainlenwmask, _chainlenwmask
Chris@43 263
Chris@43 264 /* %scan is the string under scrutiny, and %prev to the string we */
Chris@43 265 /* are hoping to match it up with. In actuality, %esi and %edi are */
Chris@43 266 /* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */
Chris@43 267 /* initialized to -(MAX_MATCH_8 - scanalign). */
Chris@43 268
Chris@43 269 mov $(-MAX_MATCH_8), %rdx
Chris@43 270 lea (%curmatch, %window), %windowbestlen
Chris@43 271 lea MAX_MATCH_8(%windowbestlen, %scanalign), %windowbestlen
Chris@43 272 lea MAX_MATCH_8(%scan, %scanalign), %prev
Chris@43 273
Chris@43 274 /* the prefetching below makes very little difference... */
Chris@43 275 prefetcht1 (%windowbestlen, %rdx)
Chris@43 276 prefetcht1 (%prev, %rdx)
Chris@43 277
Chris@43 278 /*
Chris@43 279 * Test the strings for equality, 8 bytes at a time. At the end,
Chris@43 280 * adjust %rdx so that it is offset to the exact byte that mismatched.
Chris@43 281 *
Chris@43 282 * It should be confessed that this loop usually does not represent
Chris@43 283 * much of the total running time. Replacing it with a more
Chris@43 284 * straightforward "rep cmpsb" would not drastically degrade
Chris@43 285 * performance -- unrolling it, for example, makes no difference.
Chris@43 286 */
Chris@43 287
Chris@43 288 #undef USE_SSE /* works, but is 6-7% slower, than non-SSE... */
Chris@43 289
Chris@43 290 LoopCmps:
Chris@43 291 #ifdef USE_SSE
Chris@43 292 /* Preload the SSE registers */
Chris@43 293 movdqu (%windowbestlen, %rdx), %xmm1
Chris@43 294 movdqu (%prev, %rdx), %xmm2
Chris@43 295 pcmpeqb %xmm2, %xmm1
Chris@43 296 movdqu 16(%windowbestlen, %rdx), %xmm3
Chris@43 297 movdqu 16(%prev, %rdx), %xmm4
Chris@43 298 pcmpeqb %xmm4, %xmm3
Chris@43 299 movdqu 32(%windowbestlen, %rdx), %xmm5
Chris@43 300 movdqu 32(%prev, %rdx), %xmm6
Chris@43 301 pcmpeqb %xmm6, %xmm5
Chris@43 302 movdqu 48(%windowbestlen, %rdx), %xmm7
Chris@43 303 movdqu 48(%prev, %rdx), %xmm8
Chris@43 304 pcmpeqb %xmm8, %xmm7
Chris@43 305
Chris@43 306 /* Check the comparisions' results */
Chris@43 307 pmovmskb %xmm1, %rax
Chris@43 308 notw %ax
Chris@43 309 bsfw %ax, %ax
Chris@43 310 jnz LeaveLoopCmps
Chris@43 311
Chris@43 312 /* this is the only iteration of the loop with a possibility of having
Chris@43 313 incremented rdx by 0x108 (each loop iteration add 16*4 = 0x40
Chris@43 314 and (0x40*4)+8=0x108 */
Chris@43 315 add $8, %rdx
Chris@43 316 jz LenMaximum
Chris@43 317 add $8, %rdx
Chris@43 318
Chris@43 319
Chris@43 320 pmovmskb %xmm3, %rax
Chris@43 321 notw %ax
Chris@43 322 bsfw %ax, %ax
Chris@43 323 jnz LeaveLoopCmps
Chris@43 324
Chris@43 325
Chris@43 326 add $16, %rdx
Chris@43 327
Chris@43 328
Chris@43 329 pmovmskb %xmm5, %rax
Chris@43 330 notw %ax
Chris@43 331 bsfw %ax, %ax
Chris@43 332 jnz LeaveLoopCmps
Chris@43 333
Chris@43 334 add $16, %rdx
Chris@43 335
Chris@43 336
Chris@43 337 pmovmskb %xmm7, %rax
Chris@43 338 notw %ax
Chris@43 339 bsfw %ax, %ax
Chris@43 340 jnz LeaveLoopCmps
Chris@43 341
Chris@43 342 add $16, %rdx
Chris@43 343
Chris@43 344 jmp LoopCmps
Chris@43 345 LeaveLoopCmps: add %rax, %rdx
Chris@43 346 #else
Chris@43 347 mov (%windowbestlen, %rdx), %rax
Chris@43 348 xor (%prev, %rdx), %rax
Chris@43 349 jnz LeaveLoopCmps
Chris@43 350
Chris@43 351 mov 8(%windowbestlen, %rdx), %rax
Chris@43 352 xor 8(%prev, %rdx), %rax
Chris@43 353 jnz LeaveLoopCmps8
Chris@43 354
Chris@43 355 mov 16(%windowbestlen, %rdx), %rax
Chris@43 356 xor 16(%prev, %rdx), %rax
Chris@43 357 jnz LeaveLoopCmps16
Chris@43 358
Chris@43 359 add $24, %rdx
Chris@43 360 jnz LoopCmps
Chris@43 361 jmp LenMaximum
Chris@43 362 # if 0
Chris@43 363 /*
Chris@43 364 * This three-liner is tantalizingly simple, but bsf is a slow instruction,
Chris@43 365 * and the complicated alternative down below is quite a bit faster. Sad...
Chris@43 366 */
Chris@43 367
Chris@43 368 LeaveLoopCmps: bsf %rax, %rax /* find the first non-zero bit */
Chris@43 369 shrl $3, %eax /* divide by 8 to get the byte */
Chris@43 370 add %rax, %rdx
Chris@43 371 # else
Chris@43 372 LeaveLoopCmps16:
Chris@43 373 add $8, %rdx
Chris@43 374 LeaveLoopCmps8:
Chris@43 375 add $8, %rdx
Chris@43 376 LeaveLoopCmps: testl $0xFFFFFFFF, %eax /* Check the first 4 bytes */
Chris@43 377 jnz Check16
Chris@43 378 add $4, %rdx
Chris@43 379 shr $32, %rax
Chris@43 380 Check16: testw $0xFFFF, %ax
Chris@43 381 jnz LenLower
Chris@43 382 add $2, %rdx
Chris@43 383 shrl $16, %eax
Chris@43 384 LenLower: subb $1, %al
Chris@43 385 adc $0, %rdx
Chris@43 386 # endif
Chris@43 387 #endif
Chris@43 388
Chris@43 389 /* Calculate the length of the match. If it is longer than MAX_MATCH, */
Chris@43 390 /* then automatically accept it as the best possible match and leave. */
Chris@43 391
Chris@43 392 lea (%prev, %rdx), %rax
Chris@43 393 sub %scan, %rax
Chris@43 394 cmpl $MAX_MATCH, %eax
Chris@43 395 jge LenMaximum
Chris@43 396
Chris@43 397 /* If the length of the match is not longer than the best match we */
Chris@43 398 /* have so far, then forget it and return to the lookup loop. */
Chris@43 399
Chris@43 400 cmpl %bestlend, %eax
Chris@43 401 jg LongerMatch
Chris@43 402 mov _windowbestlen, %windowbestlen
Chris@43 403 mov dsPrev, %prev
Chris@43 404 movl _chainlenwmask, %edx
Chris@43 405 jmp LookupLoop
Chris@43 406
Chris@43 407 /* s->match_start = cur_match; */
Chris@43 408 /* best_len = len; */
Chris@43 409 /* if (len >= nice_match) break; */
Chris@43 410 /* scan_end = *(ushf*)(scan+best_len-1); */
Chris@43 411
Chris@43 412 LongerMatch:
Chris@43 413 movl %eax, %bestlend
Chris@43 414 movl %curmatchd, dsMatchStart
Chris@43 415 cmpl %nicematch, %eax
Chris@43 416 jge LeaveNow
Chris@43 417
Chris@43 418 lea (%window, %bestlen), %windowbestlen
Chris@43 419 mov %windowbestlen, _windowbestlen
Chris@43 420
Chris@43 421 movzwl -1(%scan, %rax), %scanend
Chris@43 422 mov dsPrev, %prev
Chris@43 423 movl _chainlenwmask, %chainlenwmask
Chris@43 424 jmp LookupLoop
Chris@43 425
Chris@43 426 /* Accept the current string, with the maximum possible length. */
Chris@43 427
Chris@43 428 LenMaximum:
Chris@43 429 movl $MAX_MATCH, %bestlend
Chris@43 430 movl %curmatchd, dsMatchStart
Chris@43 431
Chris@43 432 /* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
Chris@43 433 /* return s->lookahead; */
Chris@43 434
Chris@43 435 LeaveNow:
Chris@43 436 movl dsLookahead, %eax
Chris@43 437 cmpl %eax, %bestlend
Chris@43 438 cmovngl %bestlend, %eax
Chris@43 439 LookaheadRet:
Chris@43 440
Chris@43 441 /* Restore the registers and return from whence we came. */
Chris@43 442
Chris@43 443 mov save_rsi, %rsi
Chris@43 444 mov save_rbx, %rbx
Chris@43 445 mov save_r12, %r12
Chris@43 446 mov save_r13, %r13
Chris@43 447 mov save_r14, %r14
Chris@43 448 mov save_r15, %r15
Chris@43 449
Chris@43 450 ret
Chris@43 451
Chris@43 452 match_init: ret