annotate src/zlib-1.2.8/contrib/gcc_gvmat64/gvmat64.S @ 167:bd3cc4d1df30

Add FFTW 3.3.8 source, and a Linux build
author Chris Cannam <cannam@all-day-breakfast.com>
date Tue, 19 Nov 2019 14:52:55 +0000
parents 5b4145a0d408
children
rev   line source
cannam@128 1 /*
cannam@128 2 ;uInt longest_match_x64(
cannam@128 3 ; deflate_state *s,
cannam@128 4 ; IPos cur_match); // current match
cannam@128 5
cannam@128 6 ; gvmat64.S -- Asm portion of the optimized longest_match for 32 bits x86_64
cannam@128 7 ; (AMD64 on Athlon 64, Opteron, Phenom
cannam@128 8 ; and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
cannam@128 9 ; this file is translation from gvmat64.asm to GCC 4.x (for Linux, Mac XCode)
cannam@128 10 ; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
cannam@128 11 ;
cannam@128 12 ; File written by Gilles Vollant, by converting to assembly the longest_match
cannam@128 13 ; from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
cannam@128 14 ; and by taking inspiration on asm686 with masm, optimised assembly code
cannam@128 15 ; from Brian Raiter, written 1998
cannam@128 16 ;
cannam@128 17 ; This software is provided 'as-is', without any express or implied
cannam@128 18 ; warranty. In no event will the authors be held liable for any damages
cannam@128 19 ; arising from the use of this software.
cannam@128 20 ;
cannam@128 21 ; Permission is granted to anyone to use this software for any purpose,
cannam@128 22 ; including commercial applications, and to alter it and redistribute it
cannam@128 23 ; freely, subject to the following restrictions:
cannam@128 24 ;
cannam@128 25 ; 1. The origin of this software must not be misrepresented; you must not
cannam@128 26 ; claim that you wrote the original software. If you use this software
cannam@128 27 ; in a product, an acknowledgment in the product documentation would be
cannam@128 28 ; appreciated but is not required.
cannam@128 29 ; 2. Altered source versions must be plainly marked as such, and must not be
cannam@128 30 ; misrepresented as being the original software
cannam@128 31 ; 3. This notice may not be removed or altered from any source distribution.
cannam@128 32 ;
cannam@128 33 ; http://www.zlib.net
cannam@128 34 ; http://www.winimage.com/zLibDll
cannam@128 35 ; http://www.muppetlabs.com/~breadbox/software/assembly.html
cannam@128 36 ;
cannam@128 37 ; to compile this file for zLib, I use option:
cannam@128 38 ; gcc -c -arch x86_64 gvmat64.S
cannam@128 39
cannam@128 40
cannam@128 41 ;uInt longest_match(s, cur_match)
cannam@128 42 ; deflate_state *s;
cannam@128 43 ; IPos cur_match; // current match /
cannam@128 44 ;
cannam@128 45 ; with XCode for Mac, I had strange error with some jump on intel syntax
cannam@128 46 ; this is why BEFORE_JMP and AFTER_JMP are used
cannam@128 47 */
cannam@128 48
cannam@128 49
cannam@128 50 #define BEFORE_JMP .att_syntax //; placed before a jump: switch the assembler to AT&T syntax (XCode workaround described in the header comment)
cannam@128 51 #define AFTER_JMP .intel_syntax noprefix //; placed after the jump: switch back to Intel syntax
cannam@128 52
cannam@128 53 #ifndef NO_UNDERLINE
cannam@128 54 # define match_init _match_init //; both exported symbols get a leading underscore unless NO_UNDERLINE is defined
cannam@128 55 # define longest_match _longest_match
cannam@128 56 #endif
cannam@128 57
cannam@128 58 .intel_syntax noprefix //; everything below is written in Intel syntax, registers without a % prefix
cannam@128 59
cannam@128 60 .globl match_init, longest_match
cannam@128 61 .text
cannam@128 62 longest_match: //; entry: rdi = deflate_state *s, esi = cur_match (copied to rcx / r8d below); result is returned in eax
cannam@128 63
cannam@128 64
cannam@128 65
cannam@128 66 #define LocalVarsSize 96 //; bias subtracted in every stack-slot macro below, so all slots lie below rsp
cannam@128 67 /*
cannam@128 68 ; register used : rax,rbx,rcx,rdx,rsi,rdi,rbp,r8,r9,r10,r11,r12,r13
cannam@128 69 ; free register : r14,r15
cannam@128 70 ; register can be saved : rsp
cannam@128 71 */
cannam@128 72
cannam@128 73 #define chainlenwmask (rsp + 8 - LocalVarsSize) //; rsp-88: 32-bit slot holding ((chainlen << 16) | wmask) while edx is reused
cannam@128 74 #define nicematch (rsp + 16 - LocalVarsSize) //; rsp-80: 32-bit slot holding min(nice_match, lookahead)
cannam@128 75
cannam@128 76 #define save_rdi (rsp + 24 - LocalVarsSize) //; only referenced by the commented-out restores near the end of longest_match
cannam@128 77 #define save_rsi (rsp + 32 - LocalVarsSize) //; likewise unused by live code
cannam@128 78 #define save_rbx (rsp + 40 - LocalVarsSize) //; save slots for the registers stored at entry and reloaded before ret
cannam@128 79 #define save_rbp (rsp + 48 - LocalVarsSize)
cannam@128 80 #define save_r12 (rsp + 56 - LocalVarsSize)
cannam@128 81 #define save_r13 (rsp + 64 - LocalVarsSize)
cannam@128 82 #define save_r14 (rsp + 72 - LocalVarsSize)
cannam@128 83 #define save_r15 (rsp + 80 - LocalVarsSize) //; rsp-16: highest slot; no instruction in this file adjusts rsp, so all slots sit in the area below it
cannam@128 84
cannam@128 85
cannam@128 86 /*
cannam@128 87 ; all the +4 offsets are due to the addition of pending_buf_size to the
cannam@128 88 ; deflate_state structure in zlib since the asm code was first written
cannam@128 89 ; (if you compile with zlib 1.0.4 or older, remove the +4).
cannam@128 90 ; Note : these values are valid when the structure is packed on an 8-byte boundary
cannam@128 91 */
cannam@128 92
cannam@128 93 #define MAX_MATCH 258 //; longest match length reported; used by the length check and by LenMaximum
cannam@128 94 #define MIN_MATCH 3 //; only used here to build MIN_LOOKAHEAD
cannam@128 95 #define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1) //; = 262; subtracted from w_size when computing the limit
cannam@128 96
cannam@128 97 /*
cannam@128 98 ;;; Offsets for fields in the deflate_state structure. These numbers
cannam@128 99 ;;; are calculated from the definition of deflate_state, with the
cannam@128 100 ;;; assumption that the compiler will dword-align the fields. (Thus,
cannam@128 101 ;;; changing the definition of deflate_state could easily cause this
cannam@128 102 ;;; program to crash horribly, without so much as a warning at
cannam@128 103 ;;; compile time. Sigh.)
cannam@128 104
cannam@128 105 ; all the +zlib1222add offsets are due to the addition of fields
cannam@128 106 ; in zlib in the deflate_state structure since the asm code was first written
cannam@128 107 ; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
cannam@128 108 ; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
cannam@128 109 ; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
cannam@128 110 */
cannam@128 111
cannam@128 112
cannam@128 113
cannam@128 114 /* you can check the structure offset by running
cannam@128 115
cannam@128 116 #include <stdlib.h>
cannam@128 117 #include <stdio.h>
cannam@128 118 #include "deflate.h"
cannam@128 119
cannam@128 120 void print_depl()
cannam@128 121 {
cannam@128 122 deflate_state ds;
cannam@128 123 deflate_state *s=&ds;
cannam@128 124 printf("size pointer=%u\n",(int)sizeof(void*));
cannam@128 125
cannam@128 126 printf("#define dsWSize %u\n",(int)(((char*)&(s->w_size))-((char*)s)));
cannam@128 127 printf("#define dsWMask %u\n",(int)(((char*)&(s->w_mask))-((char*)s)));
cannam@128 128 printf("#define dsWindow %u\n",(int)(((char*)&(s->window))-((char*)s)));
cannam@128 129 printf("#define dsPrev %u\n",(int)(((char*)&(s->prev))-((char*)s)));
cannam@128 130 printf("#define dsMatchLen %u\n",(int)(((char*)&(s->match_length))-((char*)s)));
cannam@128 131 printf("#define dsPrevMatch %u\n",(int)(((char*)&(s->prev_match))-((char*)s)));
cannam@128 132 printf("#define dsStrStart %u\n",(int)(((char*)&(s->strstart))-((char*)s)));
cannam@128 133 printf("#define dsMatchStart %u\n",(int)(((char*)&(s->match_start))-((char*)s)));
cannam@128 134 printf("#define dsLookahead %u\n",(int)(((char*)&(s->lookahead))-((char*)s)));
cannam@128 135 printf("#define dsPrevLen %u\n",(int)(((char*)&(s->prev_length))-((char*)s)));
cannam@128 136 printf("#define dsMaxChainLen %u\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
cannam@128 137 printf("#define dsGoodMatch %u\n",(int)(((char*)&(s->good_match))-((char*)s)));
cannam@128 138 printf("#define dsNiceMatch %u\n",(int)(((char*)&(s->nice_match))-((char*)s)));
cannam@128 139 }
cannam@128 140 */
cannam@128 141
cannam@128 142 #define dsWSize 68 //; hard-coded byte offsets of deflate_state fields; the print_depl snippet in the comment above prints them for a given build
cannam@128 143 #define dsWMask 76
cannam@128 144 #define dsWindow 80
cannam@128 145 #define dsPrev 96
cannam@128 146 #define dsMatchLen 144 //; defined but not referenced by any macro or instruction in this file
cannam@128 147 #define dsPrevMatch 148 //; defined but not referenced by any macro or instruction in this file
cannam@128 148 #define dsStrStart 156
cannam@128 149 #define dsMatchStart 160
cannam@128 150 #define dsLookahead 164
cannam@128 151 #define dsPrevLen 168
cannam@128 152 #define dsMaxChainLen 172
cannam@128 153 #define dsGoodMatch 188
cannam@128 154 #define dsNiceMatch 192
cannam@128 155
cannam@128 156 #define window_size [ rcx + dsWSize] //; field accessors: every one assumes rcx = deflate_state pointer
cannam@128 157 #define WMask [ rcx + dsWMask]
cannam@128 158 #define window_ad [ rcx + dsWindow] //; loaded as a 64-bit pointer
cannam@128 159 #define prev_ad [ rcx + dsPrev] //; loaded as a 64-bit pointer
cannam@128 160 #define strstart [ rcx + dsStrStart]
cannam@128 161 #define match_start [ rcx + dsMatchStart] //; the only field this file writes
cannam@128 162 #define Lookahead [ rcx + dsLookahead] //; 0ffffffffh on infozip
cannam@128 163 #define prev_length [ rcx + dsPrevLen]
cannam@128 164 #define max_chain_length [ rcx + dsMaxChainLen]
cannam@128 165 #define good_match [ rcx + dsGoodMatch]
cannam@128 166 #define nice_match [ rcx + dsNiceMatch]
cannam@128 167
cannam@128 168 /*
cannam@128 169 ; windows:
cannam@128 170 ; parameter 1 in rcx(deflate state s), param 2 in rdx (cur match)
cannam@128 171
cannam@128 172 ; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
cannam@128 173 ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
cannam@128 174 ;
cannam@128 175 ; All registers must be preserved across the call, except for
cannam@128 176 ; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
cannam@128 177
cannam@128 178 ;
cannam@128 179 ; gcc on macosx-linux:
cannam@128 180 ; see http://www.x86-64.org/documentation/abi-0.99.pdf
cannam@128 181 ; param 1 in rdi, param 2 in rsi
cannam@128 182 ; rbx, rsp, rbp, r12 to r15 must be preserved
cannam@128 183
cannam@128 184 ;;; Save registers that the compiler may be using, and adjust esp to
cannam@128 185 ;;; make room for our stack frame.
cannam@128 186
cannam@128 187
cannam@128 188 ;;; Retrieve the function arguments. r8d will hold cur_match
cannam@128 189 ;;; throughout the entire function. rcx will hold the pointer to the
cannam@128 190 ;;; deflate_state structure, both during the setup and inside the
cannam@128 191 ;;; main loop.
cannam@128 192
cannam@128 193 ; ms: parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
cannam@128 194 ; mac/linux (this file): param 1 in rdi -> rcx, param 2 in esi -> r8d
cannam@128 195 ; the 32-bit move clears the high 32 bits of r8, which could otherwise hold garbage
cannam@128 196 */
cannam@128 197 mov [save_rbx],rbx //; rbx and rbp are overwritten below (scan_end and limit), so keep the caller values
cannam@128 198 mov [save_rbp],rbp
cannam@128 199
cannam@128 200
cannam@128 201 mov rcx,rdi //; rcx = deflate_state pointer, the base used by all field macros; rdi is then free for reuse
cannam@128 202
cannam@128 203 mov r8d,esi //; r8d = cur_match; the 32-bit write zeroes the upper half of r8
cannam@128 204
cannam@128 205
cannam@128 206 mov [save_r12],r12 //; r12 will hold scan_start, r13 the alignment remainder
cannam@128 207 mov [save_r13],r13
cannam@128 208 mov [save_r14],r14 //; r14 and r15 are saved and restored but not otherwise touched in this file
cannam@128 209 mov [save_r15],r15
cannam@128 210
cannam@128 211
cannam@128 212 //;;; uInt wmask = s->w_mask;
cannam@128 213 //;;; unsigned chain_length = s->max_chain_length;
cannam@128 214 //;;; if (s->prev_length >= s->good_match) {
cannam@128 215 //;;; chain_length >>= 2;
cannam@128 216 //;;; }
cannam@128 217
cannam@128 218
cannam@128 219 mov edi, prev_length //; edi = s->prev_length
cannam@128 220 mov esi, good_match //; esi = s->good_match
cannam@128 221 mov eax, WMask //; eax = s->w_mask
cannam@128 222 mov ebx, max_chain_length //; ebx = chain_length
cannam@128 223 cmp edi, esi
cannam@128 224 jl LastMatchGood //; signed compare: skip the shift when prev_length < good_match
cannam@128 225 shr ebx, 2 //; chain_length >>= 2
cannam@128 226 LastMatchGood:
cannam@128 227
cannam@128 228 //;;; chainlen is decremented once beforehand so that the function can
cannam@128 229 //;;; use the sign flag instead of the zero flag for the exit test.
cannam@128 230 //;;; It is then shifted into the high word, to make room for the wmask
cannam@128 231 //;;; value, which it will always accompany.
cannam@128 232
cannam@128 233 dec ebx
cannam@128 234 shl ebx, 16
cannam@128 235 or ebx, eax //; ebx = ((chain_length - 1) << 16) | wmask
cannam@128 236
cannam@128 237 //;;; on zlib only
cannam@128 238 //;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
cannam@128 239
cannam@128 240
cannam@128 241
cannam@128 242 mov eax, nice_match //; eax = s->nice_match
cannam@128 243 mov [chainlenwmask], ebx //; park the packed counter/mask in its stack slot
cannam@128 244 mov r10d, Lookahead //; r10d = s->lookahead
cannam@128 245 cmp r10d, eax
cannam@128 246 cmovnl r10d, eax //; signed: when lookahead >= nice_match take nice_match, so r10d = the smaller of the two
cannam@128 247 mov [nicematch],r10d //; stored for the early-exit test in LongerMatch
cannam@128 248
cannam@128 249
cannam@128 250
cannam@128 251 //;;; register Bytef *scan = s->window + s->strstart;
cannam@128 252 mov r10, window_ad //; r10 = s->window, kept for the rest of the function
cannam@128 253 mov ebp, strstart //; ebp = s->strstart; the 32-bit write zeroes the upper half of rbp
cannam@128 254 lea r13, [r10 + rbp] //; r13 = scan
cannam@128 255
cannam@128 256 //;;; Determine how many bytes the scan ptr is off from being
cannam@128 257 //;;; dword-aligned.
cannam@128 258
cannam@128 259 mov r9,r13 //; r9 keeps the scan pointer for the rest of the function
cannam@128 260 neg r13
cannam@128 261 and r13,3 //; r13 = (-scan) & 3, the byte count up to the next 4-byte boundary
cannam@128 262
cannam@128 263 //;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
cannam@128 264 //;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
cannam@128 265
cannam@128 266
cannam@128 267 mov eax, window_size
cannam@128 268 sub eax, MIN_LOOKAHEAD //; eax = s->w_size - MIN_LOOKAHEAD (presumably the MAX_DIST(s) of the C text above)
cannam@128 269
cannam@128 270
cannam@128 271 xor edi,edi //; edi = 0, the fallback value for the limit
cannam@128 272 sub ebp, eax //; ebp = strstart - eax; sets the flags consumed by cmovng below
cannam@128 273
cannam@128 274 mov r11d, prev_length //; r11d = best_len = s->prev_length (mov does not disturb the flags)
cannam@128 275
cannam@128 276 cmovng ebp,edi //; signed: limit = 0 when the difference is <= 0
cannam@128 277
cannam@128 278 //;;; int best_len = s->prev_length;
cannam@128 279
cannam@128 280
cannam@128 281 //;;; Keep the sum s->window + best_len in rsi.
cannam@128 282
cannam@128 283 lea rsi,[r10+r11] //; rsi = window + best_len
cannam@128 284
cannam@128 285 //;;; register ush scan_start = *(ushf*)scan;
cannam@128 286 //;;; register ush scan_end = *(ushf*)(scan+best_len-1);
cannam@128 287 //;;; Posf *prev = s->prev;
cannam@128 288
cannam@128 289 movzx r12d,word ptr [r9] //; r12w = scan_start
cannam@128 290 movzx ebx, word ptr [r9 + r11 - 1] //; bx = scan_end
cannam@128 291
cannam@128 292 mov rdi, prev_ad //; rdi = s->prev
cannam@128 293
cannam@128 294 //;;; Jump into the main loop.
cannam@128 295
cannam@128 296 mov edx, [chainlenwmask] //; edx = packed chain counter (high word) and wmask (low word)
cannam@128 297
cannam@128 298 cmp bx,word ptr [rsi + r8 - 1] //; scan_end vs the word at window + best_len + cur_match - 1
cannam@128 299 jz LookupLoopIsZero //; tail matches for the initial cur_match: go compare the first word; otherwise fall into LookupLoop1
cannam@128 300
cannam@128 301
cannam@128 302
cannam@128 303 LookupLoop1: //; three unrolled copies of the chain-walk step; LookupLoop1 is the target of every tail/head mismatch
cannam@128 304 and r8d, edx //; r8d &= edx; low word of edx is wmask. NOTE(review): the counter bits above it are ANDed in too, presumably harmless because cur_match fits in 16 bits - confirm
cannam@128 305
cannam@128 306 movzx r8d, word ptr [rdi + r8*2] //; cur_match = prev[masked index], a zero-extended 16-bit entry
cannam@128 307 cmp r8d, ebp
cannam@128 308 jbe LeaveNow //; unsigned: stop when cur_match <= limit
cannam@128 309
cannam@128 310
cannam@128 311
cannam@128 312 sub edx, 0x00010000 //; decrement the chain counter held in the high word of edx
cannam@128 313 BEFORE_JMP
cannam@128 314 js LeaveNow //; counter went negative: chain budget exhausted
cannam@128 315 AFTER_JMP
cannam@128 316
cannam@128 317 LoopEntry1: //; no jump in this file targets this label
cannam@128 318 cmp bx,word ptr [rsi + r8 - 1] //; scan_end vs the word at window + best_len + cur_match - 1
cannam@128 319 BEFORE_JMP
cannam@128 320 jz LookupLoopIsZero //; tail matches: go compare the first word
cannam@128 321 AFTER_JMP
cannam@128 322
cannam@128 323 LookupLoop2: //; second copy of the same step
cannam@128 324 and r8d, edx
cannam@128 325
cannam@128 326 movzx r8d, word ptr [rdi + r8*2]
cannam@128 327 cmp r8d, ebp
cannam@128 328 BEFORE_JMP
cannam@128 329 jbe LeaveNow
cannam@128 330 AFTER_JMP
cannam@128 331 sub edx, 0x00010000
cannam@128 332 BEFORE_JMP
cannam@128 333 js LeaveNow
cannam@128 334 AFTER_JMP
cannam@128 335
cannam@128 336 LoopEntry2: //; no jump in this file targets this label
cannam@128 337 cmp bx,word ptr [rsi + r8 - 1]
cannam@128 338 BEFORE_JMP
cannam@128 339 jz LookupLoopIsZero
cannam@128 340 AFTER_JMP
cannam@128 341
cannam@128 342 LookupLoop4: //; third copy of the same step
cannam@128 343 and r8d, edx
cannam@128 344
cannam@128 345 movzx r8d, word ptr [rdi + r8*2]
cannam@128 346 cmp r8d, ebp
cannam@128 347 BEFORE_JMP
cannam@128 348 jbe LeaveNow
cannam@128 349 AFTER_JMP
cannam@128 350 sub edx, 0x00010000
cannam@128 351 BEFORE_JMP
cannam@128 352 js LeaveNow
cannam@128 353 AFTER_JMP
cannam@128 354
cannam@128 355 LoopEntry4: //; no jump in this file targets this label
cannam@128 356
cannam@128 357 cmp bx,word ptr [rsi + r8 - 1]
cannam@128 358 BEFORE_JMP
cannam@128 359 jnz LookupLoop1 //; tail mismatch: restart the unrolled sequence
cannam@128 360 jmp LookupLoopIsZero //; tail matches: go compare the first word
cannam@128 361 AFTER_JMP
cannam@128 362 /*
cannam@128 363 ;;; do {
cannam@128 364 ;;; match = s->window + cur_match;
cannam@128 365 ;;; if (*(ushf*)(match+best_len-1) != scan_end ||
cannam@128 366 ;;; *(ushf*)match != scan_start) continue;
cannam@128 367 ;;; [...]
cannam@128 368 ;;; } while ((cur_match = prev[cur_match & wmask]) > limit
cannam@128 369 ;;; && --chain_length != 0);
cannam@128 370 ;;;
cannam@128 371 ;;; Here is the inner loop of the function. The function will spend the
cannam@128 372 ;;; majority of its time in this loop, and majority of that time will
cannam@128 373 ;;; be spent in the first ten instructions.
cannam@128 374 ;;;
cannam@128 375 ;;; Within this loop:
cannam@128 376 ;;; ebx = scanend
cannam@128 377 ;;; r8d = curmatch
cannam@128 378 ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
cannam@128 379 ;;; rsi = windowbestlen - i.e., (window + bestlen)
cannam@128 380 ;;; rdi = prev
cannam@128 381 ;;; ebp = limit
cannam@128 382 */
cannam@128 383 .balign 16
cannam@128 384 LookupLoop: //; 16-byte aligned re-entry point used after a candidate has been measured
cannam@128 385 and r8d, edx //; mask cur_match with edx (wmask in the low word)
cannam@128 386
cannam@128 387 movzx r8d, word ptr [rdi + r8*2] //; cur_match = prev[masked index]
cannam@128 388 cmp r8d, ebp
cannam@128 389 BEFORE_JMP
cannam@128 390 jbe LeaveNow //; unsigned: stop when cur_match <= limit
cannam@128 391 AFTER_JMP
cannam@128 392 sub edx, 0x00010000 //; decrement the chain counter in the high word
cannam@128 393 BEFORE_JMP
cannam@128 394 js LeaveNow //; counter went negative: chain budget exhausted
cannam@128 395 AFTER_JMP
cannam@128 396
cannam@128 397 LoopEntry: //; no jump in this file targets this label
cannam@128 398
cannam@128 399 cmp bx,word ptr [rsi + r8 - 1] //; scan_end vs the word at window + best_len + cur_match - 1
cannam@128 400 BEFORE_JMP
cannam@128 401 jnz LookupLoop1 //; tail mismatch: continue in the unrolled copies
cannam@128 402 AFTER_JMP
cannam@128 403 LookupLoopIsZero:
cannam@128 404 cmp r12w, word ptr [r10 + r8] //; scan_start vs the first word of the candidate at window + cur_match
cannam@128 405 BEFORE_JMP
cannam@128 406 jnz LookupLoop1 //; head mismatch: try the next chain entry
cannam@128 407 AFTER_JMP
cannam@128 408
cannam@128 409
cannam@128 410 //;;; Store the current value of chainlen.
cannam@128 411 mov [chainlenwmask], edx //; rdx is reused as the compare offset below, so the counter goes back to its slot
cannam@128 412 /*
cannam@128 413 ;;; Point rdi to the string under scrutiny, and rsi to the string we
cannam@128 414 ;;; are hoping to match it up with. In actuality, rsi and rdi are
cannam@128 415 ;;; both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and rdx is
cannam@128 416 ;;; initialized to -(MAX_MATCH_8).
cannam@128 417 */
cannam@128 418 lea rsi,[r8+r10] //; rsi = window + cur_match, the candidate match
cannam@128 419 mov rdx, 0xfffffffffffffef8 //; -(MAX_MATCH_8)
cannam@128 420 lea rsi, [rsi + r13 + 0x0108] //;MAX_MATCH_8]
cannam@128 421 lea rdi, [r9 + r13 + 0x0108] //;MAX_MATCH_8]
cannam@128 422
cannam@128 423 prefetcht1 [rsi+rdx] //; rsi+rdx = match + r13, the first address LoopCmps reads on the match side
cannam@128 424 prefetcht1 [rdi+rdx] //; rdi+rdx = scan + r13, the first address LoopCmps reads on the scan side
cannam@128 425
cannam@128 426 /*
cannam@128 427 ;;; Test the strings for equality, 8 bytes at a time. At the end,
cannam@128 428 ;;; adjust rdx so that it is offset to the exact byte that mismatched.
cannam@128 429 ;;;
cannam@128 430 ;;; We already know at this point that the first three bytes of the
cannam@128 431 ;;; strings match each other, and they can be safely passed over before
cannam@128 432 ;;; starting the compare loop. So what this code does is skip over 0-3
cannam@128 433 ;;; bytes, as much as necessary in order to dword-align the rdi
cannam@128 434 ;;; pointer. (rsi will still be misaligned three times out of four.)
cannam@128 435 ;;;
cannam@128 436 ;;; It should be confessed that this loop usually does not represent
cannam@128 437 ;;; much of the total running time. Replacing it with a more
cannam@128 438 ;;; straightforward "rep cmpsb" would not drastically degrade
cannam@128 439 ;;; performance.
cannam@128 440 */
cannam@128 441
cannam@128 442 LoopCmps: //; compares 24 bytes per iteration as three 8-byte XORs; rdx runs from -0x108 up to 0
cannam@128 443 mov rax, [rsi + rdx]
cannam@128 444 xor rax, [rdi + rdx] //; nonzero result = some byte differs in this qword
cannam@128 445 jnz LeaveLoopCmps
cannam@128 446
cannam@128 447 mov rax, [rsi + rdx + 8]
cannam@128 448 xor rax, [rdi + rdx + 8]
cannam@128 449 jnz LeaveLoopCmps8
cannam@128 450
cannam@128 451
cannam@128 452 mov rax, [rsi + rdx + 8+8]
cannam@128 453 xor rax, [rdi + rdx + 8+8]
cannam@128 454 jnz LeaveLoopCmps16
cannam@128 455
cannam@128 456 add rdx,8+8+8 //; advance 24 bytes; ZF is set when rdx reaches 0 (0x108 = 11 * 24)
cannam@128 457
cannam@128 458 BEFORE_JMP
cannam@128 459 jnz LoopCmps //; rdx != 0: more bytes to compare
cannam@128 460 jmp LenMaximum //; all 0x108 bytes were equal
cannam@128 461 AFTER_JMP
cannam@128 462
cannam@128 463 LeaveLoopCmps16: add rdx,8 //; mismatch in the third qword: falls through, so 16 is added in total
cannam@128 464 LeaveLoopCmps8: add rdx,8 //; mismatch in the second qword
cannam@128 465 LeaveLoopCmps: //; here rax = XOR of the differing qwords and rdx = offset of that qword
cannam@128 466
cannam@128 467 test eax, 0x0000FFFF
cannam@128 468 jnz LenLower //; difference is in bytes 0-1
cannam@128 469
cannam@128 470 test eax,0xffffffff
cannam@128 471
cannam@128 472 jnz LenLower32 //; bytes 0-1 equal, difference is in bytes 2-3
cannam@128 473
cannam@128 474 add rdx,4 //; low dword equal: skip 4 bytes
cannam@128 475 shr rax,32 //; bring bytes 4-7 of the XOR down into eax
cannam@128 476 or ax,ax
cannam@128 477 BEFORE_JMP
cannam@128 478 jnz LenLower //; difference is in bytes 4-5
cannam@128 479 AFTER_JMP
cannam@128 480
cannam@128 481 LenLower32:
cannam@128 482 shr eax,16 //; drop the equal low word so ax holds the differing pair
cannam@128 483 add rdx,2
cannam@128 484
cannam@128 485 LenLower:
cannam@128 486 sub al, 1 //; borrow (carry) is produced only when al was 0, i.e. the first byte of the pair matched
cannam@128 487 adc rdx, 0 //; in that case step one more byte; rdx now indexes the first mismatching byte
cannam@128 488 //;;; Calculate the length of the match. If it is longer than MAX_MATCH,
cannam@128 489 //;;; then automatically accept it as the best possible match and leave.
cannam@128 490
cannam@128 491 lea rax, [rdi + rdx] //; address of the first mismatching byte on the scan side
cannam@128 492 sub rax, r9 //; rax = len = that address minus scan
cannam@128 493 cmp eax, MAX_MATCH
cannam@128 494 BEFORE_JMP
cannam@128 495 jge LenMaximum //; len >= MAX_MATCH: accept at the maximum length
cannam@128 496 AFTER_JMP
cannam@128 497 /*
cannam@128 498 ;;; If the length of the match is not longer than the best match we
cannam@128 499 ;;; have so far, then forget it and return to the lookup loop.
cannam@128 500 ;///////////////////////////////////
cannam@128 501 */
cannam@128 502 cmp eax, r11d
cannam@128 503 jg LongerMatch //; signed: len > best_len
cannam@128 504
cannam@128 505 lea rsi,[r10+r11] //; not better: rebuild rsi = window + best_len (rsi was the match pointer)
cannam@128 506
cannam@128 507 mov rdi, prev_ad //; rdi = s->prev again (rdi was the scan pointer)
cannam@128 508 mov edx, [chainlenwmask] //; reload the packed chain counter / wmask
cannam@128 509 BEFORE_JMP
cannam@128 510 jmp LookupLoop
cannam@128 511 AFTER_JMP
cannam@128 511 AFTER_JMP
cannam@128 512 /*
cannam@128 513 ;;; s->match_start = cur_match;
cannam@128 514 ;;; best_len = len;
cannam@128 515 ;;; if (len >= nice_match) break;
cannam@128 516 ;;; scan_end = *(ushf*)(scan+best_len-1);
cannam@128 517 */
cannam@128 518 LongerMatch:
cannam@128 519 mov r11d, eax //; best_len = len
cannam@128 520 mov match_start, r8d //; s->match_start = cur_match
cannam@128 521 cmp eax, [nicematch]
cannam@128 522 BEFORE_JMP
cannam@128 523 jge LeaveNow //; len >= the stored nice-match threshold: stop searching
cannam@128 524 AFTER_JMP
cannam@128 525
cannam@128 526 lea rsi,[r10+rax] //; rsi = window + new best_len
cannam@128 527
cannam@128 528 movzx ebx, word ptr [r9 + rax - 1] //; scan_end = word at scan + best_len - 1
cannam@128 529 mov rdi, prev_ad //; rdi = s->prev again
cannam@128 530 mov edx, [chainlenwmask] //; reload the packed chain counter / wmask
cannam@128 531 BEFORE_JMP
cannam@128 532 jmp LookupLoop
cannam@128 533 AFTER_JMP
cannam@128 533 AFTER_JMP
cannam@128 534
cannam@128 535 //;;; Accept the current string, with the maximum possible length.
cannam@128 536
cannam@128 537 LenMaximum:
cannam@128 538 mov r11d,MAX_MATCH //; best_len = MAX_MATCH
cannam@128 539 mov match_start, r8d //; s->match_start = cur_match
cannam@128 540
cannam@128 541 //;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
cannam@128 542 //;;; return s->lookahead;
cannam@128 543
cannam@128 544 LeaveNow: //; common exit: r11d = best_len
cannam@128 545 mov eax, Lookahead
cannam@128 546 cmp r11d, eax
cannam@128 547 cmovng eax, r11d //; signed: eax = best_len when best_len <= lookahead, otherwise lookahead stays
cannam@128 548
cannam@128 549
cannam@128 550
cannam@128 551 //;;; Restore the stack and return from whence we came.
cannam@128 552
cannam@128 553
cannam@128 554 // mov rsi,[save_rsi]
cannam@128 555 // mov rdi,[save_rdi]
cannam@128 556 mov rbx,[save_rbx] //; reload the registers stored at entry
cannam@128 557 mov rbp,[save_rbp]
cannam@128 558 mov r12,[save_r12]
cannam@128 559 mov r13,[save_r13]
cannam@128 560 mov r14,[save_r14]
cannam@128 561 mov r15,[save_r15]
cannam@128 562
cannam@128 563
cannam@128 564 ret 0 //; return value is in eax; rsp was never moved, so nothing to pop
cannam@128 565 //; please don't remove this string !
cannam@128 566 //; Your can freely use gvmat64 in any free or commercial app
cannam@128 567 //; but it is far better don't remove the string in the binary!
cannam@128 568 // db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
cannam@128 569
cannam@128 570
cannam@128 571 match_init: //; exported alongside longest_match; the body is a single return
cannam@128 572 ret 0 //; NOTE(review): no .note.GNU-stack section is declared in the visible source; on ELF targets confirm the object does not request an executable stack
cannam@128 573
cannam@128 574