annotate src/zlib-1.2.8/contrib/gcc_gvmat64/gvmat64.S @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 5ea0608b923f
children
rev   line source
Chris@43 1 /*
Chris@43 2 ;uInt longest_match_x64(
Chris@43 3 ; deflate_state *s,
Chris@43 4 ; IPos cur_match); // current match
Chris@43 5
Chris@43 6 ; gvmat64.S -- Asm portion of the optimized longest_match for 32 bits x86_64
Chris@43 7 ; (AMD64 on Athlon 64, Opteron, Phenom
Chris@43 8 ; and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
Chris@43 9 ; this file is translation from gvmat64.asm to GCC 4.x (for Linux, Mac XCode)
Chris@43 10 ; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
Chris@43 11 ;
Chris@43 12 ; File written by Gilles Vollant, by converting to assembly the longest_match
Chris@43 13 ; from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
Chris@43 14 ; and by taking inspiration on asm686 with masm, optimised assembly code
Chris@43 15 ; from Brian Raiter, written 1998
Chris@43 16 ;
Chris@43 17 ; This software is provided 'as-is', without any express or implied
Chris@43 18 ; warranty. In no event will the authors be held liable for any damages
Chris@43 19 ; arising from the use of this software.
Chris@43 20 ;
Chris@43 21 ; Permission is granted to anyone to use this software for any purpose,
Chris@43 22 ; including commercial applications, and to alter it and redistribute it
Chris@43 23 ; freely, subject to the following restrictions:
Chris@43 24 ;
Chris@43 25 ; 1. The origin of this software must not be misrepresented; you must not
Chris@43 26 ; claim that you wrote the original software. If you use this software
Chris@43 27 ; in a product, an acknowledgment in the product documentation would be
Chris@43 28 ; appreciated but is not required.
Chris@43 29 ; 2. Altered source versions must be plainly marked as such, and must not be
Chris@43 30 ; misrepresented as being the original software
Chris@43 31 ; 3. This notice may not be removed or altered from any source distribution.
Chris@43 32 ;
Chris@43 33 ; http://www.zlib.net
Chris@43 34 ; http://www.winimage.com/zLibDll
Chris@43 35 ; http://www.muppetlabs.com/~breadbox/software/assembly.html
Chris@43 36 ;
Chris@43 37 ; to compile this file for zLib, I use option:
Chris@43 38 ; gcc -c -arch x86_64 gvmat64.S
Chris@43 39
Chris@43 40
Chris@43 41 ;uInt longest_match(s, cur_match)
Chris@43 42 ; deflate_state *s;
Chris@43 43 ; IPos cur_match; // current match /
Chris@43 44 ;
Chris@43 45 ; with XCode for Mac, I had strange errors with some jumps in Intel syntax;
Chris@43 46 ; this is why BEFORE_JMP and AFTER_JMP are used
Chris@43 47 */
Chris@43 48
Chris@43 49
Chris@43 50 #define BEFORE_JMP .att_syntax //; placed before a jump: switch the assembler to att syntax (XCode workaround, see header comment)
Chris@43 51 #define AFTER_JMP .intel_syntax noprefix //; placed after a jump: switch back to Intel syntax
Chris@43 52 //; unless NO_UNDERLINE is defined, both exported symbols get a leading underscore
Chris@43 53 #ifndef NO_UNDERLINE
Chris@43 54 # define match_init _match_init
Chris@43 55 # define longest_match _longest_match
Chris@43 56 #endif
Chris@43 57
Chris@43 58 .intel_syntax noprefix
Chris@43 59
Chris@43 60 .globl match_init, longest_match
Chris@43 61 .text
Chris@43 62 longest_match: //; entry point: uInt longest_match(deflate_state *s, IPos cur_match)
Chris@43 63
Chris@43 64
Chris@43 65
Chris@43 66 #define LocalVarsSize 96 //; the slots below resolve to rsp-88 .. rsp-16; rsp is never adjusted in this function. NOTE(review): presumably relies on the SysV red zone - confirm
Chris@43 67 /*
Chris@43 68 ; register used : rax,rbx,rcx,rdx,rsi,rdi,rbp,r8,r9,r10,r11,r12,r13
Chris@43 69 ; free register : r14,r15 (saved and restored, but not otherwise used)
Chris@43 70 ; register can be saved : rsp
Chris@43 71 */
Chris@43 72
Chris@43 73 #define chainlenwmask (rsp + 8 - LocalVarsSize) //; 32-bit slot: (chain count << 16) | wmask
Chris@43 74 #define nicematch (rsp + 16 - LocalVarsSize) //; 32-bit slot: nice_match clamped to lookahead
Chris@43 75
Chris@43 76 #define save_rdi (rsp + 24 - LocalVarsSize) //; save_rdi and save_rsi are only referenced by commented-out restores
Chris@43 77 #define save_rsi (rsp + 32 - LocalVarsSize)
Chris@43 78 #define save_rbx (rsp + 40 - LocalVarsSize) //; save slots for the callee-saved registers
Chris@43 79 #define save_rbp (rsp + 48 - LocalVarsSize)
Chris@43 80 #define save_r12 (rsp + 56 - LocalVarsSize)
Chris@43 81 #define save_r13 (rsp + 64 - LocalVarsSize)
Chris@43 82 #define save_r14 (rsp + 72 - LocalVarsSize)
Chris@43 83 #define save_r15 (rsp + 80 - LocalVarsSize)
Chris@43 84
Chris@43 85
Chris@43 86 /*
Chris@43 87 ; all the +4 offsets are due to the addition of pending_buf_size (in zlib)
Chris@43 88 ; to the deflate_state structure since the asm code was first written
Chris@43 89 ; (if you compile with zlib 1.0.4 or older, remove the +4).
Chris@43 90 ; Note: these values are correct for a structure packed on 8-byte boundaries
Chris@43 91 */
Chris@43 92
Chris@43 93 #define MAX_MATCH 258 //; match length at which the search stops immediately (see LenMaximum)
Chris@43 94 #define MIN_MATCH 3 //; only used here to derive MIN_LOOKAHEAD
Chris@43 95 #define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1) //; = 262; subtracted from w_size when computing the chain limit
Chris@43 96
Chris@43 97 /*
Chris@43 98 ;;; Offsets for fields in the deflate_state structure. These numbers
Chris@43 99 ;;; are calculated from the definition of deflate_state, with the
Chris@43 100 ;;; assumption that the compiler will dword-align the fields. (Thus,
Chris@43 101 ;;; changing the definition of deflate_state could easily cause this
Chris@43 102 ;;; program to crash horribly, without so much as a warning at
Chris@43 103 ;;; compile time. Sigh.)
Chris@43 104
Chris@43 105 ; all the +zlib1222add offsets are due to the addition of fields
Chris@43 106 ; in zlib in the deflate_state structure since the asm code was first written
Chris@43 107 ; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
Chris@43 108 ; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
Chris@43 109 ; (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").
Chris@43 110 */
Chris@43 111
Chris@43 112
Chris@43 113
Chris@43 114 /* you can check the structure offset by running
Chris@43 115
Chris@43 116 #include <stdlib.h>
Chris@43 117 #include <stdio.h>
Chris@43 118 #include "deflate.h"
Chris@43 119
Chris@43 120 void print_depl()
Chris@43 121 {
Chris@43 122 deflate_state ds;
Chris@43 123 deflate_state *s=&ds;
Chris@43 124 printf("size pointer=%u\n",(int)sizeof(void*));
Chris@43 125
Chris@43 126 printf("#define dsWSize %u\n",(int)(((char*)&(s->w_size))-((char*)s)));
Chris@43 127 printf("#define dsWMask %u\n",(int)(((char*)&(s->w_mask))-((char*)s)));
Chris@43 128 printf("#define dsWindow %u\n",(int)(((char*)&(s->window))-((char*)s)));
Chris@43 129 printf("#define dsPrev %u\n",(int)(((char*)&(s->prev))-((char*)s)));
Chris@43 130 printf("#define dsMatchLen %u\n",(int)(((char*)&(s->match_length))-((char*)s)));
Chris@43 131 printf("#define dsPrevMatch %u\n",(int)(((char*)&(s->prev_match))-((char*)s)));
Chris@43 132 printf("#define dsStrStart %u\n",(int)(((char*)&(s->strstart))-((char*)s)));
Chris@43 133 printf("#define dsMatchStart %u\n",(int)(((char*)&(s->match_start))-((char*)s)));
Chris@43 134 printf("#define dsLookahead %u\n",(int)(((char*)&(s->lookahead))-((char*)s)));
Chris@43 135 printf("#define dsPrevLen %u\n",(int)(((char*)&(s->prev_length))-((char*)s)));
Chris@43 136 printf("#define dsMaxChainLen %u\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
Chris@43 137 printf("#define dsGoodMatch %u\n",(int)(((char*)&(s->good_match))-((char*)s)));
Chris@43 138 printf("#define dsNiceMatch %u\n",(int)(((char*)&(s->nice_match))-((char*)s)));
Chris@43 139 }
Chris@43 140 */
Chris@43 141
Chris@43 142 #define dsWSize 68 //; hard-coded byte offsets into deflate_state; recheck with print_depl() above if the struct layout changes
Chris@43 143 #define dsWMask 76
Chris@43 144 #define dsWindow 80
Chris@43 145 #define dsPrev 96
Chris@43 146 #define dsMatchLen 144 //; dsMatchLen and dsPrevMatch have no operand macro below
Chris@43 147 #define dsPrevMatch 148
Chris@43 148 #define dsStrStart 156
Chris@43 149 #define dsMatchStart 160
Chris@43 150 #define dsLookahead 164
Chris@43 151 #define dsPrevLen 168
Chris@43 152 #define dsMaxChainLen 172
Chris@43 153 #define dsGoodMatch 188
Chris@43 154 #define dsNiceMatch 192
Chris@43 155
Chris@43 156 #define window_size [ rcx + dsWSize] //; memory operands for the fields; all are relative to rcx, which is loaded with s at entry
Chris@43 157 #define WMask [ rcx + dsWMask]
Chris@43 158 #define window_ad [ rcx + dsWindow]
Chris@43 159 #define prev_ad [ rcx + dsPrev]
Chris@43 160 #define strstart [ rcx + dsStrStart]
Chris@43 161 #define match_start [ rcx + dsMatchStart]
Chris@43 162 #define Lookahead [ rcx + dsLookahead] //; 0ffffffffh on infozip
Chris@43 163 #define prev_length [ rcx + dsPrevLen]
Chris@43 164 #define max_chain_length [ rcx + dsMaxChainLen]
Chris@43 165 #define good_match [ rcx + dsGoodMatch]
Chris@43 166 #define nice_match [ rcx + dsNiceMatch]
Chris@43 167
Chris@43 168 /*
Chris@43 169 ; windows:
Chris@43 170 ; parameter 1 in rcx(deflate state s), param 2 in rdx (cur match)
Chris@43 171
Chris@43 172 ; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
Chris@43 173 ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
Chris@43 174 ;
Chris@43 175 ; All registers must be preserved across the call, except for
Chris@43 176 ; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
Chris@43 177
Chris@43 178 ;
Chris@43 179 ; gcc on macosx-linux:
Chris@43 180 ; see http://www.x86-64.org/documentation/abi-0.99.pdf
Chris@43 181 ; param 1 in rdi, param 2 in rsi
Chris@43 182 ; rbx, rsp, rbp, r12 to r15 must be preserved
Chris@43 183
Chris@43 184 ;;; Save registers that the compiler may be using, and adjust esp to
Chris@43 185 ;;; make room for our stack frame.
Chris@43 186
Chris@43 187
Chris@43 188 ;;; Retrieve the function arguments. r8d will hold cur_match
Chris@43 189 ;;; throughout the entire function. rcx will hold the pointer to the
Chris@43 190 ;;; deflate_state structure (all the field macros above are relative
Chris@43 191 ;;; to rcx).
Chris@43 192
Chris@43 193 ; ms: parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
Chris@43 194 ; mac: param 1 in rdi, param 2 rsi
Chris@43 195 ; this clear high 32 bits of r8, which can be garbage in both r8 and rdx
Chris@43 196 */
Chris@43 197 mov [save_rbx],rbx //; save the callee-saved registers into the local slots
Chris@43 198 mov [save_rbp],rbp
Chris@43 199
Chris@43 200
Chris@43 201 mov rcx,rdi //; rcx = s (first argument arrives in rdi, see the ABI notes above)
Chris@43 202
Chris@43 203 mov r8d,esi //; r8d = cur_match; the 32-bit move clears the high half of r8
Chris@43 204
Chris@43 205
Chris@43 206 mov [save_r12],r12
Chris@43 207 mov [save_r13],r13
Chris@43 208 mov [save_r14],r14 //; r14 and r15 are saved and restored although the body never modifies them
Chris@43 209 mov [save_r15],r15
Chris@43 210
Chris@43 211
Chris@43 212 //;;; uInt wmask = s->w_mask;
Chris@43 213 //;;; unsigned chain_length = s->max_chain_length;
Chris@43 214 //;;; if (s->prev_length >= s->good_match) {
Chris@43 215 //;;; chain_length >>= 2;
Chris@43 216 //;;; }
Chris@43 217
Chris@43 218
Chris@43 219 mov edi, prev_length
Chris@43 220 mov esi, good_match
Chris@43 221 mov eax, WMask //; eax = wmask
Chris@43 222 mov ebx, max_chain_length //; ebx = chain_length
Chris@43 223 cmp edi, esi
Chris@43 224 jl LastMatchGood //; signed compare: skip the shift when prev_length is below good_match
Chris@43 225 shr ebx, 2 //; chain_length >>= 2
Chris@43 226 LastMatchGood:
Chris@43 227
Chris@43 228 //;;; chainlen is decremented once beforehand so that the function can
Chris@43 229 //;;; use the sign flag instead of the zero flag for the exit test.
Chris@43 230 //;;; It is then shifted into the high word, to make room for the wmask
Chris@43 231 //;;; value, which it will always accompany.
Chris@43 232
Chris@43 233 dec ebx
Chris@43 234 shl ebx, 16
Chris@43 235 or ebx, eax //; ebx = ((chain_length - 1) << 16) | wmask. NOTE(review): assumes wmask fits in the low 16 bits - confirm
Chris@43 236
Chris@43 237 //;;; on zlib only
Chris@43 238 //;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
Chris@43 239
Chris@43 240
Chris@43 241
Chris@43 242 mov eax, nice_match
Chris@43 243 mov [chainlenwmask], ebx //; spill the packed chain count and mask
Chris@43 244 mov r10d, Lookahead
Chris@43 245 cmp r10d, eax
Chris@43 246 cmovnl r10d, eax //; when lookahead >= nice_match (signed compare) take nice_match, so r10d ends up as the smaller of the two
Chris@43 247 mov [nicematch],r10d
Chris@43 248
Chris@43 249
Chris@43 250
Chris@43 251 //;;; register Bytef *scan = s->window + s->strstart;
Chris@43 252 mov r10, window_ad //; r10 = window, kept for the rest of the function
Chris@43 253 mov ebp, strstart //; 32-bit load, so rbp holds strstart zero-extended
Chris@43 254 lea r13, [r10 + rbp]
Chris@43 255
Chris@43 256 //;;; Determine how many bytes the scan ptr is off from being
Chris@43 257 //;;; dword-aligned.
Chris@43 258
Chris@43 259 mov r9,r13 //; r9 = scan, kept for the rest of the function
Chris@43 260 neg r13
Chris@43 261 and r13,3 //; r13 = (-scan) & 3, the byte count up to the next 4-byte boundary
Chris@43 262
Chris@43 263 //;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
Chris@43 264 //;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
Chris@43 265
Chris@43 266
Chris@43 267 mov eax, window_size
Chris@43 268 sub eax, MIN_LOOKAHEAD //; eax = w_size - MIN_LOOKAHEAD
Chris@43 269
Chris@43 270
Chris@43 271 xor edi,edi
Chris@43 272 sub ebp, eax //; ebp = strstart - eax, sets the flags tested by cmovng below
Chris@43 273
Chris@43 274 mov r11d, prev_length //; r11d = best_len; mov leaves the flags untouched
Chris@43 275
Chris@43 276 cmovng ebp,edi //; limit = 0 when the signed difference is <= 0
Chris@43 277
Chris@43 278 //;;; int best_len = s->prev_length;
Chris@43 279
Chris@43 280
Chris@43 281 //;;; Keep the sum s->window + best_len in rsi.
Chris@43 282
Chris@43 283 lea rsi,[r10+r11]
Chris@43 284
Chris@43 285 //;;; register ush scan_start = *(ushf*)scan;
Chris@43 286 //;;; register ush scan_end = *(ushf*)(scan+best_len-1);
Chris@43 287 //;;; Posf *prev = s->prev;
Chris@43 288
Chris@43 289 movzx r12d,word ptr [r9] //; r12w = scan_start
Chris@43 290 movzx ebx, word ptr [r9 + r11 - 1] //; bx = scan_end
Chris@43 291
Chris@43 292 mov rdi, prev_ad //; rdi = prev
Chris@43 293
Chris@43 294 //;;; Jump into the main loop.
Chris@43 295
Chris@43 296 mov edx, [chainlenwmask]
Chris@43 297
Chris@43 298 cmp bx,word ptr [rsi + r8 - 1] //; compare scan_end with the 2 bytes ending at match + best_len
Chris@43 299 jz LookupLoopIsZero //; equal: go check scan_start; otherwise fall through into LookupLoop1
Chris@43 300
Chris@43 301
Chris@43 302
Chris@43 303 LookupLoop1: //; first of three unrolled copies of the chain-walk step; target of the jnz at LoopEntry4, LoopEntry and LookupLoopIsZero
Chris@43 304 and r8d, edx //; mask cur_match with edx (packed chain count and wmask)
Chris@43 305
Chris@43 306 movzx r8d, word ptr [rdi + r8*2] //; cur_match = prev[masked index], 16-bit entries
Chris@43 307 cmp r8d, ebp
Chris@43 308 jbe LeaveNow //; unsigned compare: stop when cur_match <= limit
Chris@43 309
Chris@43 310
Chris@43 311
Chris@43 312 sub edx, 0x00010000 //; decrement the chain count held in the high word
Chris@43 313 BEFORE_JMP
Chris@43 314 js LeaveNow //; chain count went negative: chain exhausted
Chris@43 315 AFTER_JMP
Chris@43 316
Chris@43 317 LoopEntry1:
Chris@43 318 cmp bx,word ptr [rsi + r8 - 1] //; scan_end test for the new candidate
Chris@43 319 BEFORE_JMP
Chris@43 320 jz LookupLoopIsZero
Chris@43 321 AFTER_JMP
Chris@43 322
Chris@43 323 LookupLoop2: //; second unrolled copy, same steps as LookupLoop1
Chris@43 324 and r8d, edx
Chris@43 325
Chris@43 326 movzx r8d, word ptr [rdi + r8*2]
Chris@43 327 cmp r8d, ebp
Chris@43 328 BEFORE_JMP
Chris@43 329 jbe LeaveNow
Chris@43 330 AFTER_JMP
Chris@43 331 sub edx, 0x00010000
Chris@43 332 BEFORE_JMP
Chris@43 333 js LeaveNow
Chris@43 334 AFTER_JMP
Chris@43 335
Chris@43 336 LoopEntry2:
Chris@43 337 cmp bx,word ptr [rsi + r8 - 1]
Chris@43 338 BEFORE_JMP
Chris@43 339 jz LookupLoopIsZero
Chris@43 340 AFTER_JMP
Chris@43 341
Chris@43 342 LookupLoop4: //; third unrolled copy, same steps as LookupLoop1
Chris@43 343 and r8d, edx
Chris@43 344
Chris@43 345 movzx r8d, word ptr [rdi + r8*2]
Chris@43 346 cmp r8d, ebp
Chris@43 347 BEFORE_JMP
Chris@43 348 jbe LeaveNow
Chris@43 349 AFTER_JMP
Chris@43 350 sub edx, 0x00010000
Chris@43 351 BEFORE_JMP
Chris@43 352 js LeaveNow
Chris@43 353 AFTER_JMP
Chris@43 354
Chris@43 355 LoopEntry4:
Chris@43 356
Chris@43 357 cmp bx,word ptr [rsi + r8 - 1]
Chris@43 358 BEFORE_JMP
Chris@43 359 jnz LookupLoop1 //; mismatch: back to the first copy
Chris@43 360 jmp LookupLoopIsZero //; scan_end matched: go check scan_start
Chris@43 361 AFTER_JMP
Chris@43 362 /*
Chris@43 363 ;;; do {
Chris@43 364 ;;; match = s->window + cur_match;
Chris@43 365 ;;; if (*(ushf*)(match+best_len-1) != scan_end ||
Chris@43 366 ;;; *(ushf*)match != scan_start) continue;
Chris@43 367 ;;; [...]
Chris@43 368 ;;; } while ((cur_match = prev[cur_match & wmask]) > limit
Chris@43 369 ;;; && --chain_length != 0);
Chris@43 370 ;;;
Chris@43 371 ;;; Here is the inner loop of the function. The function will spend the
Chris@43 372 ;;; majority of its time in this loop, and majority of that time will
Chris@43 373 ;;; be spent in the first ten instructions.
Chris@43 374 ;;;
Chris@43 375 ;;; Within this loop:
Chris@43 376 ;;; ebx = scanend
Chris@43 377 ;;; r8d = curmatch
Chris@43 378 ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
Chris@43 379 ;;; rsi = windowbestlen - i.e., (window + bestlen)
Chris@43 380 ;;; rdi = prev
Chris@43 381 ;;; ebp = limit
Chris@43 382 */
Chris@43 383 .balign 16
Chris@43 384 LookupLoop: //; re-entry point used after a candidate has been examined and the search continues
Chris@43 385 and r8d, edx
Chris@43 386
Chris@43 387 movzx r8d, word ptr [rdi + r8*2] //; cur_match = prev[masked index]
Chris@43 388 cmp r8d, ebp
Chris@43 389 BEFORE_JMP
Chris@43 390 jbe LeaveNow //; unsigned compare: stop when cur_match <= limit
Chris@43 391 AFTER_JMP
Chris@43 392 sub edx, 0x00010000 //; decrement the chain count held in the high word
Chris@43 393 BEFORE_JMP
Chris@43 394 js LeaveNow
Chris@43 395 AFTER_JMP
Chris@43 396
Chris@43 397 LoopEntry:
Chris@43 398
Chris@43 399 cmp bx,word ptr [rsi + r8 - 1] //; scan_end against the 2 bytes ending at match + best_len
Chris@43 400 BEFORE_JMP
Chris@43 401 jnz LookupLoop1 //; mismatch: continue in the unrolled copies
Chris@43 402 AFTER_JMP
Chris@43 403 LookupLoopIsZero: //; reached when the scan_end word matched
Chris@43 404 cmp r12w, word ptr [r10 + r8] //; scan_start against the first 2 bytes of the candidate
Chris@43 405 BEFORE_JMP
Chris@43 406 jnz LookupLoop1
Chris@43 407 AFTER_JMP
Chris@43 408
Chris@43 409
Chris@43 410 //;;; Store the current value of chainlen.
Chris@43 411 mov [chainlenwmask], edx
Chris@43 412 /*
Chris@43 413 ;;; Point rdi to the string under scrutiny, and rsi to the string we
Chris@43 414 ;;; are hoping to match it up with. In actuality, rsi and rdi are
Chris@43 415 ;;; both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and rdx is
Chris@43 416 ;;; initialized to -(MAX_MATCH_8); scanalign is the value in r13.
Chris@43 417 */
Chris@43 418 lea rsi,[r8+r10] //; rsi = window + cur_match (the candidate)
Chris@43 419 mov rdx, 0xfffffffffffffef8 //; -(MAX_MATCH_8)
Chris@43 420 lea rsi, [rsi + r13 + 0x0108] //;MAX_MATCH_8]
Chris@43 421 lea rdi, [r9 + r13 + 0x0108] //;MAX_MATCH_8]
Chris@43 422
Chris@43 423 prefetcht1 [rsi+rdx]
Chris@43 424 prefetcht1 [rdi+rdx]
Chris@43 425
Chris@43 426 /*
Chris@43 427 ;;; Test the strings for equality, 8 bytes at a time. At the end,
Chris@43 428 ;;; adjust rdx so that it is offset to the exact byte that mismatched.
Chris@43 429 ;;;
Chris@43 430 ;;; We already know at this point that the first three bytes of the
Chris@43 431 ;;; strings match each other, and they can be safely passed over before
Chris@43 432 ;;; starting the compare loop. So what this code does is skip over 0-3
Chris@43 433 ;;; bytes, as much as necessary in order to dword-align the edi
Chris@43 434 ;;; pointer. (rsi will still be misaligned three times out of four.)
Chris@43 435 ;;;
Chris@43 436 ;;; It should be confessed that this loop usually does not represent
Chris@43 437 ;;; much of the total running time. Replacing it with a more
Chris@43 438 ;;; straightforward "rep cmpsb" would not drastically degrade
Chris@43 439 ;;; performance.
Chris@43 440 */
Chris@43 441
Chris@43 442 LoopCmps: //; each pass compares three 8-byte words of the candidate (rsi) against the scan string (rdi)
Chris@43 443 mov rax, [rsi + rdx]
Chris@43 444 xor rax, [rdi + rdx] //; nonzero result means these 8 bytes differ
Chris@43 445 jnz LeaveLoopCmps
Chris@43 446
Chris@43 447 mov rax, [rsi + rdx + 8]
Chris@43 448 xor rax, [rdi + rdx + 8]
Chris@43 449 jnz LeaveLoopCmps8
Chris@43 450
Chris@43 451
Chris@43 452 mov rax, [rsi + rdx + 8+8]
Chris@43 453 xor rax, [rdi + rdx + 8+8]
Chris@43 454 jnz LeaveLoopCmps16
Chris@43 455
Chris@43 456 add rdx,8+8+8 //; advance 24 bytes; the zero flag from this add drives the jnz below
Chris@43 457
Chris@43 458 BEFORE_JMP
Chris@43 459 jnz LoopCmps //; keep going until rdx has counted up from -264 to 0
Chris@43 460 jmp LenMaximum //; every compared word was equal
Chris@43 461 AFTER_JMP
Chris@43 462
Chris@43 463 LeaveLoopCmps16: add rdx,8 //; the entry labels fall through so rdx gains 16, 8 or 0 to address the differing word
Chris@43 464 LeaveLoopCmps8: add rdx,8
Chris@43 465 LeaveLoopCmps: //; rax = xor of the differing words; locate the first differing byte
Chris@43 466
Chris@43 467 test eax, 0x0000FFFF
Chris@43 468 jnz LenLower //; difference is in bytes 0-1
Chris@43 469
Chris@43 470 test eax,0xffffffff
Chris@43 471
Chris@43 472 jnz LenLower32 //; difference is in bytes 2-3
Chris@43 473
Chris@43 474 add rdx,4 //; low 4 bytes equal: skip them and examine the high half
Chris@43 475 shr rax,32
Chris@43 476 or ax,ax
Chris@43 477 BEFORE_JMP
Chris@43 478 jnz LenLower //; difference is in bytes 4-5, otherwise fall through for bytes 6-7
Chris@43 479 AFTER_JMP
Chris@43 480
Chris@43 481 LenLower32:
Chris@43 482 shr eax,16
Chris@43 483 add rdx,2
Chris@43 484
Chris@43 485 LenLower:
Chris@43 486 sub al, 1 //; sets carry only when al was 0, i.e. the first byte of the pair matched
Chris@43 487 adc rdx, 0 //; in that case count one more matching byte
Chris@43 488 //;;; Calculate the length of the match. If it is MAX_MATCH or longer,
Chris@43 489 //;;; then automatically accept it as the best possible match and leave.
Chris@43 490
Chris@43 491 lea rax, [rdi + rdx]
Chris@43 492 sub rax, r9 //; rax = address of first mismatch minus scan = match length
Chris@43 493 cmp eax, MAX_MATCH
Chris@43 494 BEFORE_JMP
Chris@43 495 jge LenMaximum
Chris@43 496 AFTER_JMP
Chris@43 497 /*
Chris@43 498 ;;; If the length of the match is not longer than the best match we
Chris@43 499 ;;; have so far, then forget it and return to the lookup loop.
Chris@43 500 ;///////////////////////////////////
Chris@43 501 */
Chris@43 502 cmp eax, r11d //; len against best_len
Chris@43 503 jg LongerMatch
Chris@43 504
Chris@43 505 lea rsi,[r10+r11] //; not longer: restore rsi = window + best_len
Chris@43 506
Chris@43 507 mov rdi, prev_ad //; restore rdi = prev and edx = packed chain count, both reused by the compare loop
Chris@43 508 mov edx, [chainlenwmask]
Chris@43 509 BEFORE_JMP
Chris@43 510 jmp LookupLoop
Chris@43 511 AFTER_JMP
Chris@43 512 /*
Chris@43 513 ;;; s->match_start = cur_match;
Chris@43 514 ;;; best_len = len;
Chris@43 515 ;;; if (len >= nice_match) break;
Chris@43 516 ;;; scan_end = *(ushf*)(scan+best_len-1);
Chris@43 517 */
Chris@43 518 LongerMatch:
Chris@43 519 mov r11d, eax //; best_len = len
Chris@43 520 mov match_start, r8d //; s->match_start = cur_match
Chris@43 521 cmp eax, [nicematch]
Chris@43 522 BEFORE_JMP
Chris@43 523 jge LeaveNow //; long enough: stop searching
Chris@43 524 AFTER_JMP
Chris@43 525
Chris@43 526 lea rsi,[r10+rax] //; rsi = window + new best_len
Chris@43 527
Chris@43 528 movzx ebx, word ptr [r9 + rax - 1] //; reload scan_end for the new best_len
Chris@43 529 mov rdi, prev_ad
Chris@43 530 mov edx, [chainlenwmask]
Chris@43 531 BEFORE_JMP
Chris@43 532 jmp LookupLoop
Chris@43 533 AFTER_JMP
Chris@43 534
Chris@43 535 //;;; Accept the current string, with the maximum possible length.
Chris@43 536
Chris@43 537 LenMaximum:
Chris@43 538 mov r11d,MAX_MATCH //; best_len = MAX_MATCH
Chris@43 539 mov match_start, r8d //; s->match_start = cur_match
Chris@43 540
Chris@43 541 //;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
Chris@43 542 //;;; return s->lookahead;
Chris@43 543
Chris@43 544 LeaveNow: //; common exit; r11d holds best_len
Chris@43 545 mov eax, Lookahead
Chris@43 546 cmp r11d, eax
Chris@43 547 cmovng eax, r11d //; signed compare: eax = best_len when best_len <= lookahead, else lookahead
Chris@43 548
Chris@43 549
Chris@43 550
Chris@43 551 //;;; Restore the stack and return from whence we came.
Chris@43 552
Chris@43 553
Chris@43 554 // mov rsi,[save_rsi]
Chris@43 555 // mov rdi,[save_rdi]
Chris@43 556 mov rbx,[save_rbx] //; reload the registers saved at entry
Chris@43 557 mov rbp,[save_rbp]
Chris@43 558 mov r12,[save_r12]
Chris@43 559 mov r13,[save_r13]
Chris@43 560 mov r14,[save_r14]
Chris@43 561 mov r15,[save_r15]
Chris@43 562
Chris@43 563
Chris@43 564 ret 0 //; result is returned in eax
Chris@43 565 //; please don't remove this string !
Chris@43 566 //; You can freely use gvmat64 in any free or commercial app
Chris@43 567 //; but it is far better not to remove the string from the binary!
Chris@43 568 // db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
Chris@43 569
Chris@43 570
Chris@43 571 match_init: //; exported alongside longest_match; does nothing and returns immediately
Chris@43 572 ret 0
Chris@43 573
Chris@43 574