annotate src/zlib-1.2.7/contrib/gcc_gvmat64/gvmat64.S @ 90:07fe46ff1966

Add more mingw builds
author Chris Cannam <cannam@all-day-breakfast.com>
date Wed, 20 Mar 2013 14:01:32 +0000
parents 8a15ff55d9af
children
rev   line source
cannam@89 1 /*
cannam@89 2 ;uInt longest_match_x64(
cannam@89 3 ; deflate_state *s,
cannam@89 4 ; IPos cur_match); // current match
cannam@89 5
cannam@89 6 ; gvmat64.S -- Asm portion of the optimized longest_match for 64 bits x86_64
cannam@89 7 ; (AMD64 on Athlon 64, Opteron, Phenom
cannam@89 8 ; and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
cannam@89 9 ; this file is a translation of gvmat64.asm to GCC 4.x (for Linux, Mac XCode)
cannam@89 10 ; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
cannam@89 11 ;
cannam@89 12 ; File written by Gilles Vollant, by converting to assembly the longest_match
cannam@89 13 ; from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
cannam@89 14 ; and by taking inspiration on asm686 with masm, optimised assembly code
cannam@89 15 ; from Brian Raiter, written 1998
cannam@89 16 ;
cannam@89 17 ; This software is provided 'as-is', without any express or implied
cannam@89 18 ; warranty. In no event will the authors be held liable for any damages
cannam@89 19 ; arising from the use of this software.
cannam@89 20 ;
cannam@89 21 ; Permission is granted to anyone to use this software for any purpose,
cannam@89 22 ; including commercial applications, and to alter it and redistribute it
cannam@89 23 ; freely, subject to the following restrictions:
cannam@89 24 ;
cannam@89 25 ; 1. The origin of this software must not be misrepresented; you must not
cannam@89 26 ; claim that you wrote the original software. If you use this software
cannam@89 27 ; in a product, an acknowledgment in the product documentation would be
cannam@89 28 ; appreciated but is not required.
cannam@89 29 ; 2. Altered source versions must be plainly marked as such, and must not be
cannam@89 30 ; misrepresented as being the original software
cannam@89 31 ; 3. This notice may not be removed or altered from any source distribution.
cannam@89 32 ;
cannam@89 33 ; http://www.zlib.net
cannam@89 34 ; http://www.winimage.com/zLibDll
cannam@89 35 ; http://www.muppetlabs.com/~breadbox/software/assembly.html
cannam@89 36 ;
cannam@89 37 ; to compile this file for zLib, I use option:
cannam@89 38 ; gcc -c -arch x86_64 gvmat64.S
cannam@89 39
cannam@89 40
cannam@89 41 ;uInt longest_match(s, cur_match)
cannam@89 42 ; deflate_state *s;
cannam@89 43 ; IPos cur_match; // current match /
cannam@89 44 ;
cannam@89 45 ; with XCode for Mac, I had strange error with some jump on intel syntax
cannam@89 46 ; this is why BEFORE_JMP and AFTER_JMP are used
cannam@89 47 */
cannam@89 48
cannam@89 49
cannam@89 50 #define BEFORE_JMP .att_syntax /* placed before a jump: switch the assembler to AT&T syntax (XCode workaround described in the header comment) */
cannam@89 51 #define AFTER_JMP .intel_syntax noprefix /* placed after the jump: switch back to the Intel syntax used by the rest of the file */
cannam@89 52
cannam@89 53 #ifndef NO_UNDERLINE /* unless NO_UNDERLINE is defined, both exported names get a leading underscore */
cannam@89 54 # define match_init _match_init
cannam@89 55 # define longest_match _longest_match
cannam@89 56 #endif
cannam@89 57
cannam@89 58 .intel_syntax noprefix //; default syntax for this file: Intel operand order, registers without a % prefix
cannam@89 59
cannam@89 60 .globl match_init, longest_match //; export both entry points (after the renaming above, when it applies)
cannam@89 61 .text
cannam@89 62 longest_match: //; uInt longest_match(deflate_state *s, IPos cur_match); the two arguments are read from rdi and esi below
cannam@89 63
cannam@89 64
cannam@89 65
cannam@89 66 #define LocalVarsSize 96 /* every slot below sits at a negative offset from rsp; no instruction in this function adjusts rsp */
cannam@89 67 /*
cannam@89 68 ; register used : rax,rbx,rcx,rdx,rsi,rdi,rbp,r8,r9,r10,r11,r12,r13
cannam@89 69 ; free register : r14,r15 (saved and restored below, but otherwise unused)
cannam@89 70 ; register can be saved : rsp
cannam@89 71 */
cannam@89 72
cannam@89 73 #define chainlenwmask (rsp + 8 - LocalVarsSize) /* 32-bit slot: (chain_length << 16) | wmask */
cannam@89 74 #define nicematch (rsp + 16 - LocalVarsSize) /* 32-bit slot: the smaller of nice_match and lookahead */
cannam@89 75
cannam@89 76 #define save_rdi (rsp + 24 - LocalVarsSize) /* not written or read by the code below (its restore near the end is commented out) */
cannam@89 77 #define save_rsi (rsp + 32 - LocalVarsSize) /* not written or read by the code below (its restore near the end is commented out) */
cannam@89 78 #define save_rbx (rsp + 40 - LocalVarsSize)
cannam@89 79 #define save_rbp (rsp + 48 - LocalVarsSize)
cannam@89 80 #define save_r12 (rsp + 56 - LocalVarsSize)
cannam@89 81 #define save_r13 (rsp + 64 - LocalVarsSize)
cannam@89 82 #define save_r14 (rsp + 72 - LocalVarsSize)
cannam@89 83 #define save_r15 (rsp + 80 - LocalVarsSize)
cannam@89 84
cannam@89 85
cannam@89 86 /*
cannam@89 87 ; all the +4 offsets are due to the addition of pending_buf_size (in zlib
cannam@89 88 ; in the deflate_state structure since the asm code was first written
cannam@89 89 ; (if you compile with zlib 1.0.4 or older, remove the +4). NOTE(review): no +4 term is visible in the offsets defined below.
cannam@89 90 ; Note : these values are good for a structure packed on an 8-byte boundary
cannam@89 91 */
cannam@89 92
cannam@89 93 #define MAX_MATCH 258
cannam@89 94 #define MIN_MATCH 3
cannam@89 95 #define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)
cannam@89 96
cannam@89 97 /*
cannam@89 98 ;;; Offsets for fields in the deflate_state structure. These numbers
cannam@89 99 ;;; are calculated from the definition of deflate_state, with the
cannam@89 100 ;;; assumption that the compiler will dword-align the fields. (Thus,
cannam@89 101 ;;; changing the definition of deflate_state could easily cause this
cannam@89 102 ;;; program to crash horribly, without so much as a warning at
cannam@89 103 ;;; compile time. Sigh.)
cannam@89 104
cannam@89 105 ; all the +zlib1222add offsets are due to the addition of fields
cannam@89 106 ; in zlib in the deflate_state structure since the asm code was first written
cannam@89 107 ; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
cannam@89 108 ; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
cannam@89 109 ; (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").
cannam@89 110 */
cannam@89 111
cannam@89 112
cannam@89 113
cannam@89 114 /* you can check the structure offset by running
cannam@89 115
cannam@89 116 #include <stdlib.h>
cannam@89 117 #include <stdio.h>
cannam@89 118 #include "deflate.h"
cannam@89 119
cannam@89 120 void print_depl()
cannam@89 121 {
cannam@89 122 deflate_state ds;
cannam@89 123 deflate_state *s=&ds;
cannam@89 124 printf("size pointer=%u\n",(int)sizeof(void*));
cannam@89 125
cannam@89 126 printf("#define dsWSize %u\n",(int)(((char*)&(s->w_size))-((char*)s)));
cannam@89 127 printf("#define dsWMask %u\n",(int)(((char*)&(s->w_mask))-((char*)s)));
cannam@89 128 printf("#define dsWindow %u\n",(int)(((char*)&(s->window))-((char*)s)));
cannam@89 129 printf("#define dsPrev %u\n",(int)(((char*)&(s->prev))-((char*)s)));
cannam@89 130 printf("#define dsMatchLen %u\n",(int)(((char*)&(s->match_length))-((char*)s)));
cannam@89 131 printf("#define dsPrevMatch %u\n",(int)(((char*)&(s->prev_match))-((char*)s)));
cannam@89 132 printf("#define dsStrStart %u\n",(int)(((char*)&(s->strstart))-((char*)s)));
cannam@89 133 printf("#define dsMatchStart %u\n",(int)(((char*)&(s->match_start))-((char*)s)));
cannam@89 134 printf("#define dsLookahead %u\n",(int)(((char*)&(s->lookahead))-((char*)s)));
cannam@89 135 printf("#define dsPrevLen %u\n",(int)(((char*)&(s->prev_length))-((char*)s)));
cannam@89 136 printf("#define dsMaxChainLen %u\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
cannam@89 137 printf("#define dsGoodMatch %u\n",(int)(((char*)&(s->good_match))-((char*)s)));
cannam@89 138 printf("#define dsNiceMatch %u\n",(int)(((char*)&(s->nice_match))-((char*)s)));
cannam@89 139 }
cannam@89 140 */
cannam@89 141
cannam@89 142 #define dsWSize 68
cannam@89 143 #define dsWMask 76
cannam@89 144 #define dsWindow 80
cannam@89 145 #define dsPrev 96
cannam@89 146 #define dsMatchLen 144
cannam@89 147 #define dsPrevMatch 148
cannam@89 148 #define dsStrStart 156
cannam@89 149 #define dsMatchStart 160
cannam@89 150 #define dsLookahead 164
cannam@89 151 #define dsPrevLen 168
cannam@89 152 #define dsMaxChainLen 172
cannam@89 153 #define dsGoodMatch 188
cannam@89 154 #define dsNiceMatch 192
cannam@89 155
cannam@89 156 #define window_size [ rcx + dsWSize]
cannam@89 157 #define WMask [ rcx + dsWMask]
cannam@89 158 #define window_ad [ rcx + dsWindow]
cannam@89 159 #define prev_ad [ rcx + dsPrev]
cannam@89 160 #define strstart [ rcx + dsStrStart]
cannam@89 161 #define match_start [ rcx + dsMatchStart]
cannam@89 162 #define Lookahead [ rcx + dsLookahead] //; 0ffffffffh on infozip
cannam@89 163 #define prev_length [ rcx + dsPrevLen]
cannam@89 164 #define max_chain_length [ rcx + dsMaxChainLen]
cannam@89 165 #define good_match [ rcx + dsGoodMatch]
cannam@89 166 #define nice_match [ rcx + dsNiceMatch]
cannam@89 167
cannam@89 168 /*
cannam@89 169 ; windows:
cannam@89 170 ; parameter 1 in rcx(deflate state s), param 2 in rdx (cur match)
cannam@89 171
cannam@89 172 ; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
cannam@89 173 ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
cannam@89 174 ;
cannam@89 175 ; All registers must be preserved across the call, except for
cannam@89 176 ; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
cannam@89 177
cannam@89 178 ;
cannam@89 179 ; gcc on macosx-linux:
cannam@89 180 ; see http://www.x86-64.org/documentation/abi-0.99.pdf
cannam@89 181 ; param 1 in rdi, param 2 in rsi
cannam@89 182 ; rbx, rsp, rbp, r12 to r15 must be preserved
cannam@89 183
cannam@89 184 ;;; Save the registers that must be preserved. rsp is not adjusted: the
cannam@89 185 ;;; save slots are addressed at negative offsets from rsp.
cannam@89 186
cannam@89 187
cannam@89 188 ;;; Retrieve the function arguments. r8d will hold cur_match
cannam@89 189 ;;; throughout the entire function. rcx will hold the pointer to the
cannam@89 190 ;;; deflate_state structure (every field macro defined above is
cannam@89 191 ;;; addressed relative to rcx).
cannam@89 192
cannam@89 193 ; ms: parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
cannam@89 194 ; mac/linux: param 1 in rdi, param 2 in rsi (this is what the code below reads)
cannam@89 195 ; the 32-bit move into r8d clears the high 32 bits of r8, which can be garbage in the argument register
cannam@89 196 */
cannam@89 197 mov [save_rbx],rbx
cannam@89 198 mov [save_rbp],rbp
cannam@89 199
cannam@89 200
cannam@89 201 mov rcx,rdi //; rcx = s (first argument)
cannam@89 202
cannam@89 203 mov r8d,esi //; r8d = cur_match (second argument), zero-extended into r8
cannam@89 204
cannam@89 205
cannam@89 206 mov [save_r12],r12
cannam@89 207 mov [save_r13],r13
cannam@89 208 mov [save_r14],r14 //; r14 and r15 are saved and restored but not used in between
cannam@89 209 mov [save_r15],r15
cannam@89 210
cannam@89 211
cannam@89 212 //;;; uInt wmask = s->w_mask;
cannam@89 213 //;;; unsigned chain_length = s->max_chain_length;
cannam@89 214 //;;; if (s->prev_length >= s->good_match) {
cannam@89 215 //;;; chain_length >>= 2;
cannam@89 216 //;;; }
cannam@89 217
cannam@89 218
cannam@89 219 mov edi, prev_length
cannam@89 220 mov esi, good_match
cannam@89 221 mov eax, WMask
cannam@89 222 mov ebx, max_chain_length
cannam@89 223 cmp edi, esi
cannam@89 224 jl LastMatchGood //; signed compare: skip the shift when prev_length < good_match
cannam@89 225 shr ebx, 2
cannam@89 226 LastMatchGood:
cannam@89 227
cannam@89 228 //;;; chainlen is decremented once beforehand so that the function can
cannam@89 229 //;;; use the sign flag instead of the zero flag for the exit test.
cannam@89 230 //;;; It is then shifted into the high word, to make room for the wmask
cannam@89 231 //;;; value, which it will always accompany.
cannam@89 232
cannam@89 233 dec ebx
cannam@89 234 shl ebx, 16
cannam@89 235 or ebx, eax //; ebx = ((chain_length - 1) << 16) | wmask
cannam@89 236
cannam@89 237 //;;; on zlib only
cannam@89 238 //;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
cannam@89 239
cannam@89 240
cannam@89 241
cannam@89 242 mov eax, nice_match
cannam@89 243 mov [chainlenwmask], ebx
cannam@89 244 mov r10d, Lookahead
cannam@89 245 cmp r10d, eax
cannam@89 246 cmovnl r10d, eax //; signed compare: r10d = nice_match when lookahead >= nice_match, otherwise it stays lookahead
cannam@89 247 mov [nicematch],r10d
cannam@89 248
cannam@89 249
cannam@89 250
cannam@89 251 //;;; register Bytef *scan = s->window + s->strstart;
cannam@89 252 mov r10, window_ad //; r10 = s->window, kept for the rest of the function
cannam@89 253 mov ebp, strstart
cannam@89 254 lea r13, [r10 + rbp]
cannam@89 255
cannam@89 256 //;;; Determine how many bytes the scan ptr is off from being
cannam@89 257 //;;; dword-aligned.
cannam@89 258
cannam@89 259 mov r9,r13 //; r9 = scan, kept for the rest of the function
cannam@89 260 neg r13
cannam@89 261 and r13,3 //; r13 = (-scan) & 3, the number of bytes up to the next 4-byte boundary
cannam@89 262
cannam@89 263 //;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
cannam@89 264 //;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
cannam@89 265
cannam@89 266
cannam@89 267 mov eax, window_size
cannam@89 268 sub eax, MIN_LOOKAHEAD //; eax = w_size - MIN_LOOKAHEAD (presumably MAX_DIST(s); confirm against deflate.h)
cannam@89 269
cannam@89 270
cannam@89 271 xor edi,edi //; edi = 0, the value the cmovng below substitutes for a non-positive limit
cannam@89 272 sub ebp, eax //; ebp = strstart - eax; this sets the flags consumed by the cmovng below
cannam@89 273
cannam@89 274 mov r11d, prev_length //; r11d = best_len (mov leaves the flags from the sub untouched)
cannam@89 275
cannam@89 276 cmovng ebp,edi //; limit = 0 when the subtraction result was <= 0 (signed)
cannam@89 277
cannam@89 278 //;;; int best_len = s->prev_length;
cannam@89 279
cannam@89 280
cannam@89 281 //;;; Keep the sum of s->window + best_len in rsi.
cannam@89 282
cannam@89 283 lea rsi,[r10+r11]
cannam@89 284
cannam@89 285 //;;; register ush scan_start = *(ushf*)scan;
cannam@89 286 //;;; register ush scan_end = *(ushf*)(scan+best_len-1);
cannam@89 287 //;;; Posf *prev = s->prev;
cannam@89 288
cannam@89 289 movzx r12d,word ptr [r9] //; r12w = scan_start
cannam@89 290 movzx ebx, word ptr [r9 + r11 - 1] //; bx = scan_end
cannam@89 291
cannam@89 292 mov rdi, prev_ad //; rdi = s->prev
cannam@89 293
cannam@89 294 //;;; Jump into the main loop.
cannam@89 295
cannam@89 296 mov edx, [chainlenwmask]
cannam@89 297
cannam@89 298 cmp bx,word ptr [rsi + r8 - 1] //; compare scan_end with the word at window + cur_match + best_len - 1
cannam@89 299 jz LookupLoopIsZero
cannam@89 300
cannam@89 301
cannam@89 302
cannam@89 303 LookupLoop1:
cannam@89 304 and r8d, edx //; cur_match & wmask; the chain count in the high word of edx is harmless as long as cur_match fits in 16 bits (TODO confirm for the initial argument)
cannam@89 305
cannam@89 306 movzx r8d, word ptr [rdi + r8*2] //; cur_match = prev[cur_match & wmask]
cannam@89 307 cmp r8d, ebp
cannam@89 308 jbe LeaveNow //; unsigned compare: stop when cur_match <= limit
cannam@89 309
cannam@89 310
cannam@89 311
cannam@89 312 sub edx, 0x00010000 //; decrement the chain count held in the high word of edx
cannam@89 313 BEFORE_JMP
cannam@89 314 js LeaveNow //; chain count went negative: give up
cannam@89 315 AFTER_JMP
cannam@89 316
cannam@89 317 LoopEntry1: //; not the target of any jump visible in this file
cannam@89 318 cmp bx,word ptr [rsi + r8 - 1]
cannam@89 319 BEFORE_JMP
cannam@89 320 jz LookupLoopIsZero
cannam@89 321 AFTER_JMP
cannam@89 322
cannam@89 323 LookupLoop2: //; second unrolled copy of the LookupLoop1 steps
cannam@89 324 and r8d, edx
cannam@89 325
cannam@89 326 movzx r8d, word ptr [rdi + r8*2]
cannam@89 327 cmp r8d, ebp
cannam@89 328 BEFORE_JMP
cannam@89 329 jbe LeaveNow
cannam@89 330 AFTER_JMP
cannam@89 331 sub edx, 0x00010000
cannam@89 332 BEFORE_JMP
cannam@89 333 js LeaveNow
cannam@89 334 AFTER_JMP
cannam@89 335
cannam@89 336 LoopEntry2: //; not the target of any jump visible in this file
cannam@89 337 cmp bx,word ptr [rsi + r8 - 1]
cannam@89 338 BEFORE_JMP
cannam@89 339 jz LookupLoopIsZero
cannam@89 340 AFTER_JMP
cannam@89 341
cannam@89 342 LookupLoop4: //; third unrolled copy of the LookupLoop1 steps
cannam@89 343 and r8d, edx
cannam@89 344
cannam@89 345 movzx r8d, word ptr [rdi + r8*2]
cannam@89 346 cmp r8d, ebp
cannam@89 347 BEFORE_JMP
cannam@89 348 jbe LeaveNow
cannam@89 349 AFTER_JMP
cannam@89 350 sub edx, 0x00010000
cannam@89 351 BEFORE_JMP
cannam@89 352 js LeaveNow
cannam@89 353 AFTER_JMP
cannam@89 354
cannam@89 355 LoopEntry4: //; not the target of any jump visible in this file
cannam@89 356
cannam@89 357 cmp bx,word ptr [rsi + r8 - 1]
cannam@89 358 BEFORE_JMP
cannam@89 359 jnz LookupLoop1 //; scan_end differs: go back to the first copy
cannam@89 360 jmp LookupLoopIsZero
cannam@89 361 AFTER_JMP
cannam@89 362 /*
cannam@89 363 ;;; do {
cannam@89 364 ;;; match = s->window + cur_match;
cannam@89 365 ;;; if (*(ushf*)(match+best_len-1) != scan_end ||
cannam@89 366 ;;; *(ushf*)match != scan_start) continue;
cannam@89 367 ;;; [...]
cannam@89 368 ;;; } while ((cur_match = prev[cur_match & wmask]) > limit
cannam@89 369 ;;; && --chain_length != 0);
cannam@89 370 ;;;
cannam@89 371 ;;; Here is the inner loop of the function. The function will spend the
cannam@89 372 ;;; majority of its time in this loop, and majority of that time will
cannam@89 373 ;;; be spent in the first ten instructions.
cannam@89 374 ;;;
cannam@89 375 ;;; Within this loop:
cannam@89 376 ;;; ebx = scanend
cannam@89 377 ;;; r8d = curmatch
cannam@89 378 ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
cannam@89 379 ;;; rsi = windowbestlen - i.e., (window + bestlen)
cannam@89 380 ;;; rdi = prev
cannam@89 381 ;;; ebp = limit
cannam@89 382 */
cannam@89 383 .balign 16
cannam@89 384 LookupLoop: //; reached by the two jmp LookupLoop instructions after a candidate has been measured
cannam@89 385 and r8d, edx
cannam@89 386
cannam@89 387 movzx r8d, word ptr [rdi + r8*2]
cannam@89 388 cmp r8d, ebp
cannam@89 389 BEFORE_JMP
cannam@89 390 jbe LeaveNow
cannam@89 391 AFTER_JMP
cannam@89 392 sub edx, 0x00010000
cannam@89 393 BEFORE_JMP
cannam@89 394 js LeaveNow
cannam@89 395 AFTER_JMP
cannam@89 396
cannam@89 397 LoopEntry: //; not the target of any jump visible in this file
cannam@89 398
cannam@89 399 cmp bx,word ptr [rsi + r8 - 1]
cannam@89 400 BEFORE_JMP
cannam@89 401 jnz LookupLoop1
cannam@89 402 AFTER_JMP
cannam@89 403 LookupLoopIsZero:
cannam@89 404 cmp r12w, word ptr [r10 + r8] //; compare scan_start with the first two bytes at window + cur_match
cannam@89 405 BEFORE_JMP
cannam@89 406 jnz LookupLoop1
cannam@89 407 AFTER_JMP
cannam@89 408
cannam@89 409
cannam@89 410 //;;; Store the current value of chainlen.
cannam@89 411 mov [chainlenwmask], edx
cannam@89 412 /*
cannam@89 413 ;;; Point rdi to the string under scrutiny, and rsi to the string we
cannam@89 414 ;;; are hoping to match it up with. In actuality, rsi and rdi are
cannam@89 415 ;;; both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and rdx is
cannam@89 416 ;;; initialized to -(MAX_MATCH_8), where scanalign is the value in r13.
cannam@89 417 */
cannam@89 418 lea rsi,[r8+r10] //; rsi = window + cur_match
cannam@89 419 mov rdx, 0xfffffffffffffef8 //; -(MAX_MATCH_8) = -0x108 = -264
cannam@89 420 lea rsi, [rsi + r13 + 0x0108] //;MAX_MATCH_8]
cannam@89 421 lea rdi, [r9 + r13 + 0x0108] //;MAX_MATCH_8]
cannam@89 422
cannam@89 423 prefetcht1 [rsi+rdx]
cannam@89 424 prefetcht1 [rdi+rdx]
cannam@89 425
cannam@89 426 /*
cannam@89 427 ;;; Test the strings for equality, 8 bytes at a time. At the end,
cannam@89 428 ;;; adjust rdx so that it is offset to the exact byte that mismatched.
cannam@89 429 ;;;
cannam@89 430 ;;; We already know at this point that the first three bytes of the
cannam@89 431 ;;; strings match each other, and they can be safely passed over before
cannam@89 432 ;;; starting the compare loop. So what this code does is skip over 0-3
cannam@89 433 ;;; bytes, as much as necessary in order to dword-align the rdi
cannam@89 434 ;;; pointer. (rsi will still be misaligned three times out of four.)
cannam@89 435 ;;;
cannam@89 436 ;;; It should be confessed that this loop usually does not represent
cannam@89 437 ;;; much of the total running time. Replacing it with a more
cannam@89 438 ;;; straightforward "rep cmpsb" would not drastically degrade
cannam@89 439 ;;; performance.
cannam@89 440 */
cannam@89 441
cannam@89 442 LoopCmps: //; each pass compares three consecutive 8-byte words
cannam@89 443 mov rax, [rsi + rdx]
cannam@89 444 xor rax, [rdi + rdx] //; rax != 0 exactly when the two 8-byte words differ
cannam@89 445 jnz LeaveLoopCmps
cannam@89 446
cannam@89 447 mov rax, [rsi + rdx + 8]
cannam@89 448 xor rax, [rdi + rdx + 8]
cannam@89 449 jnz LeaveLoopCmps8
cannam@89 450
cannam@89 451
cannam@89 452 mov rax, [rsi + rdx + 8+8]
cannam@89 453 xor rax, [rdi + rdx + 8+8]
cannam@89 454 jnz LeaveLoopCmps16
cannam@89 455
cannam@89 456 add rdx,8+8+8 //; 24 bytes per pass; 0x108 = 11 * 24, so rdx reaches exactly 0 after 11 full passes
cannam@89 457
cannam@89 458 BEFORE_JMP
cannam@89 459 jnz LoopCmps //; uses the flags of the add: continue while rdx != 0
cannam@89 460 jmp LenMaximum
cannam@89 461 AFTER_JMP
cannam@89 462
cannam@89 463 LeaveLoopCmps16: add rdx,8 //; mismatch in the third word: falls through, adding 16 in total
cannam@89 464 LeaveLoopCmps8: add rdx,8 //; mismatch in the second word: adds 8
cannam@89 465 LeaveLoopCmps:
cannam@89 466
cannam@89 467 test eax, 0x0000FFFF //; low 16 bits of the XOR nonzero: first difference is in byte 0 or 1
cannam@89 468 jnz LenLower
cannam@89 469
cannam@89 470 test eax,0xffffffff //; low 32 bits nonzero: first difference is in byte 2 or 3
cannam@89 471
cannam@89 472 jnz LenLower32
cannam@89 473
cannam@89 474 add rdx,4 //; low 4 bytes are equal: skip them
cannam@89 475 shr rax,32 //; bring the upper 4 bytes of the XOR down into eax
cannam@89 476 or ax,ax
cannam@89 477 BEFORE_JMP
cannam@89 478 jnz LenLower
cannam@89 479 AFTER_JMP
cannam@89 480
cannam@89 481 LenLower32:
cannam@89 482 shr eax,16
cannam@89 483 add rdx,2
cannam@89 484
cannam@89 485 LenLower:
cannam@89 486 sub al, 1 //; carry is set exactly when al was 0, i.e. the first byte of the pair matched
cannam@89 487 adc rdx, 0 //; in that case the mismatch is one byte further on
cannam@89 488 //;;; Calculate the length of the match. If it is MAX_MATCH or longer,
cannam@89 489 //;;; then automatically accept it as the best possible match and leave.
cannam@89 490
cannam@89 491 lea rax, [rdi + rdx]
cannam@89 492 sub rax, r9 //; rax = (rdi + rdx) - scan = length of the match
cannam@89 493 cmp eax, MAX_MATCH
cannam@89 494 BEFORE_JMP
cannam@89 495 jge LenMaximum
cannam@89 496 AFTER_JMP
cannam@89 497 /*
cannam@89 498 ;;; If the length of the match is not longer than the best match we
cannam@89 499 ;;; have so far, then forget it and return to the lookup loop.
cannam@89 500 ;///////////////////////////////////
cannam@89 501 */
cannam@89 502 cmp eax, r11d
cannam@89 503 jg LongerMatch
cannam@89 504
cannam@89 505 lea rsi,[r10+r11] //; restore rsi = window + best_len (rsi, rdi and edx were reused by the compare loop)
cannam@89 506
cannam@89 507 mov rdi, prev_ad
cannam@89 508 mov edx, [chainlenwmask]
cannam@89 509 BEFORE_JMP
cannam@89 510 jmp LookupLoop
cannam@89 511 AFTER_JMP
cannam@89 512 /*
cannam@89 513 ;;; s->match_start = cur_match;
cannam@89 514 ;;; best_len = len;
cannam@89 515 ;;; if (len >= nice_match) break;
cannam@89 516 ;;; scan_end = *(ushf*)(scan+best_len-1);
cannam@89 517 */
cannam@89 518 LongerMatch:
cannam@89 519 mov r11d, eax
cannam@89 520 mov match_start, r8d
cannam@89 521 cmp eax, [nicematch]
cannam@89 522 BEFORE_JMP
cannam@89 523 jge LeaveNow
cannam@89 524 AFTER_JMP
cannam@89 525
cannam@89 526 lea rsi,[r10+rax] //; rsi = window + new best_len
cannam@89 527
cannam@89 528 movzx ebx, word ptr [r9 + rax - 1] //; bx = new scan_end
cannam@89 529 mov rdi, prev_ad
cannam@89 530 mov edx, [chainlenwmask]
cannam@89 531 BEFORE_JMP
cannam@89 532 jmp LookupLoop
cannam@89 533 AFTER_JMP
cannam@89 534
cannam@89 535 //;;; Accept the current string, with the maximum possible length.
cannam@89 536
cannam@89 537 LenMaximum:
cannam@89 538 mov r11d,MAX_MATCH
cannam@89 539 mov match_start, r8d
cannam@89 540
cannam@89 541 //;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
cannam@89 542 //;;; return s->lookahead;
cannam@89 543
cannam@89 544 LeaveNow:
cannam@89 545 mov eax, Lookahead
cannam@89 546 cmp r11d, eax
cannam@89 547 cmovng eax, r11d //; signed compare: return value eax = best_len when best_len <= lookahead, otherwise lookahead
cannam@89 548
cannam@89 549
cannam@89 550
cannam@89 551 //;;; Restore the saved registers and return from whence we came.
cannam@89 552
cannam@89 553
cannam@89 554 // mov rsi,[save_rsi]
cannam@89 555 // mov rdi,[save_rdi]
cannam@89 556 mov rbx,[save_rbx]
cannam@89 557 mov rbp,[save_rbp]
cannam@89 558 mov r12,[save_r12]
cannam@89 559 mov r13,[save_r13]
cannam@89 560 mov r14,[save_r14]
cannam@89 561 mov r15,[save_r15]
cannam@89 562
cannam@89 563
cannam@89 564 ret 0
cannam@89 565 //; please don't remove this string !
cannam@89 566 //; You can freely use gvmat64 in any free or commercial app
cannam@89 567 //; but it is far better not to remove the string from the binary!
cannam@89 568 // db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
cannam@89 569
cannam@89 570
cannam@89 571 match_init: //; exported initialization hook; this implementation has nothing to set up
cannam@89 572 ret 0 //; returns immediately without touching any register or memory
cannam@89 573
cannam@89 574