annotate src/zlib-1.2.7/contrib/masmx64/gvmat64.asm @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents e13257ea84a4
children
rev   line source
Chris@4 1 ;uInt longest_match_x64(
Chris@4 2 ; deflate_state *s,
Chris@4 3 ; IPos cur_match); /* current match */
Chris@4 4
Chris@4 5 ; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86_64
Chris@4 6 ; (AMD64 on Athlon 64, Opteron, Phenom
Chris@4 7 ; and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
Chris@4 8 ; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
Chris@4 9 ;
Chris@4 10 ; File written by Gilles Vollant, by converting to assembly the longest_match
Chris@4 11 ; from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
Chris@4 12 ;
Chris@4 13 ; and by taking inspiration from asm686 with masm, optimised assembly code
Chris@4 14 ; from Brian Raiter, written 1998
Chris@4 15 ;
Chris@4 16 ; This software is provided 'as-is', without any express or implied
Chris@4 17 ; warranty. In no event will the authors be held liable for any damages
Chris@4 18 ; arising from the use of this software.
Chris@4 19 ;
Chris@4 20 ; Permission is granted to anyone to use this software for any purpose,
Chris@4 21 ; including commercial applications, and to alter it and redistribute it
Chris@4 22 ; freely, subject to the following restrictions:
Chris@4 23 ;
Chris@4 24 ; 1. The origin of this software must not be misrepresented; you must not
Chris@4 25 ; claim that you wrote the original software. If you use this software
Chris@4 26 ; in a product, an acknowledgment in the product documentation would be
Chris@4 27 ; appreciated but is not required.
Chris@4 28 ; 2. Altered source versions must be plainly marked as such, and must not be
Chris@4 29 ; misrepresented as being the original software
Chris@4 30 ; 3. This notice may not be removed or altered from any source distribution.
Chris@4 31 ;
Chris@4 32 ;
Chris@4 33 ;
Chris@4 34 ; http://www.zlib.net
Chris@4 35 ; http://www.winimage.com/zLibDll
Chris@4 36 ; http://www.muppetlabs.com/~breadbox/software/assembly.html
Chris@4 37 ;
Chris@4 38 ; to compile this file for infozip Zip, I use option:
Chris@4 39 ; ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm
Chris@4 40 ;
Chris@4 41 ; to compile this file for zLib, I use option:
Chris@4 42 ; ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
Chris@4 43 ; Be careful to adapt zlib1222add below to your version of zLib
Chris@4 44 ; (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change
Chris@4 45 ; value of zlib1222add later)
Chris@4 46 ;
Chris@4 47 ; This file compiles with Microsoft Macro Assembler (x64) for AMD64
Chris@4 48 ;
Chris@4 49 ; ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
Chris@4 50 ;
Chris@4 51 ; (you can get Windows WDK with ml64 for AMD64 from
Chris@4 52 ; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
Chris@4 53 ;
Chris@4 54
Chris@4 55
Chris@4 56 ;uInt longest_match(s, cur_match)
Chris@4 57 ; deflate_state *s;
Chris@4 58 ; IPos cur_match; /* current match */
Chris@4 59 .code
longest_match PROC FRAME

;-----------------------------------------------------------------------
; uInt longest_match(deflate_state *s, IPos cur_match)
;
; ABI:   Microsoft x64 (assembled with ml64).
; In:    zlib build    : rcx = s, edx = cur_match
;        INFOZIP build : ecx = cur_match; the deflate state is read from
;                        the COMM globals declared below.
; Out:   eax = length of the best match found (zlib build: capped to
;        s->lookahead).  match_start is stored every time a longer match
;        is accepted.
; Stack: LocalVarsSize bytes are allocated in the prologue and released
;        before ret.  The Microsoft x64 ABI has no red zone, so nothing
;        may be stored below rsp.
; Saved: rbx, rbp, rsi, rdi, r12, r13 (callee-saved, restored on exit).
; Clobb: rax, rdx, r8, r9, r10, r11, flags.  rcx is never written.
;-----------------------------------------------------------------------

; LocalVarsSize = 72 keeps rsp 16-byte aligned after the allocation
; (rsp is 8 mod 16 on entry, 0 mod 16 after "sub rsp, 72").
LocalVarsSize           equ 72

; registers used : rax,rbx,rcx,rdx,rsi,rdi,rbp,r8,r9,r10,r11,r12,r13
; free registers : r14,r15

; Offsets of the stack slots, relative to rsp AFTER the prologue's
; "sub rsp, LocalVarsSize".  The same constants feed the .savereg unwind
; directives so the two can never drift apart.
ofsChainLenWMask        equ 8
ofsNiceMatch            equ 16
ofsSaveRdi              equ 24
ofsSaveRsi              equ 32
ofsSaveRbx              equ 40
ofsSaveRbp              equ 48
ofsSaveR12              equ 56
ofsSaveR13              equ 64

chainlenwmask           equ rsp + ofsChainLenWMask  ; high word: current chain len
                                                    ; low word: s->wmask
IFDEF INFOZIP
ELSE
nicematch               equ (rsp + ofsNiceMatch)    ; a good enough match size
ENDIF

save_rdi                equ rsp + ofsSaveRdi
save_rsi                equ rsp + ofsSaveRsi
save_rbx                equ rsp + ofsSaveRbx
save_rbp                equ rsp + ofsSaveRbp
save_r12                equ rsp + ofsSaveR12
save_r13                equ rsp + ofsSaveR13


; summary of register usage
;   rcx         s (deflate_state*), zlib build only; read-only
;   r8d         cur_match
;   r9          scan
;   r10         window
;   r11d        best_len
;   r12w        scan_start  (first two bytes of scan)
;   r13         scanalign   ((-scan) & 3)
;   ebx / bx    scan_end    (two bytes at scan + best_len - 1)
;   ebp         limit
;   edx         chainlenwmask in the lookup loop,
;               negative running offset (rdx) in the compare loop
;   rsi         window + best_len in the lookup loop,
;               match pointer in the compare loop
;   rdi         prev in the lookup loop,
;               scan pointer in the compare loop
;   rax         scratch / computed match length


MAX_MATCH       equ 258
MIN_MATCH       equ 3
MIN_LOOKAHEAD   equ (MAX_MATCH+MIN_MATCH+1)


;;; Offsets for fields in the deflate_state structure. These numbers
;;; are calculated from the definition of deflate_state, with the
;;; assumption that the compiler will dword-align the fields. (Thus,
;;; changing the definition of deflate_state could easily cause this
;;; program to crash horribly, without so much as a warning at
;;; compile time. Sigh.)
;;; Note: these values are good with an 8-byte boundary packed structure.

; all the +zlib1222add offsets are due to the addition of fields
; in zlib in the deflate_state structure since the asm code was first written
; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
; (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").


IFDEF INFOZIP

_DATA SEGMENT
COMM window_size:DWORD
; WMask ; 7fff
COMM window:BYTE:010040H
COMM prev:WORD:08000H
; MatchLen : unused
; PrevMatch : unused
COMM strstart:DWORD
COMM match_start:DWORD
; Lookahead : ignore
COMM prev_length:DWORD ; PrevLen
COMM max_chain_length:DWORD
COMM good_match:DWORD
COMM nice_match:DWORD
prev_ad equ OFFSET prev
window_ad equ OFFSET window
nicematch equ nice_match
_DATA ENDS
WMask equ 07fffh

ELSE

IFNDEF zlib1222add
zlib1222add     equ 8
ENDIF
dsWSize         equ 56+zlib1222add+(zlib1222add/2)
dsWMask         equ 64+zlib1222add+(zlib1222add/2)
dsWindow        equ 72+zlib1222add
dsPrev          equ 88+zlib1222add
dsMatchLen      equ 128+zlib1222add
dsPrevMatch     equ 132+zlib1222add
dsStrStart      equ 140+zlib1222add
dsMatchStart    equ 144+zlib1222add
dsLookahead     equ 148+zlib1222add
dsPrevLen       equ 152+zlib1222add
dsMaxChainLen   equ 156+zlib1222add
dsGoodMatch     equ 172+zlib1222add
dsNiceMatch     equ 176+zlib1222add

window_size      equ [ rcx + dsWSize]
WMask            equ [ rcx + dsWMask]
window_ad        equ [ rcx + dsWindow]
prev_ad          equ [ rcx + dsPrev]
strstart         equ [ rcx + dsStrStart]
match_start      equ [ rcx + dsMatchStart]
Lookahead        equ [ rcx + dsLookahead] ; 0ffffffffh on infozip
prev_length      equ [ rcx + dsPrevLen]
max_chain_length equ [ rcx + dsMaxChainLen]
good_match       equ [ rcx + dsGoodMatch]
nice_match       equ [ rcx + dsNiceMatch]
ENDIF

; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
;
; All registers must be preserved across the call, except for
; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.


;;; Prologue: allocate the frame FIRST, then save the callee-saved
;;; registers into it.  Memory below rsp is volatile on Windows x64, so
;;; the slots must lie inside an allocated frame.  The unwind directives
;;; describe the frame so stack walks and exception dispatch through this
;;; function keep working.

        sub     rsp, LocalVarsSize
        .allocstack LocalVarsSize
        mov     [save_rdi], rdi
        .savereg rdi, ofsSaveRdi
        mov     [save_rsi], rsi
        .savereg rsi, ofsSaveRsi
        mov     [save_rbx], rbx
        .savereg rbx, ofsSaveRbx
        mov     [save_rbp], rbp
        .savereg rbp, ofsSaveRbp
        mov     [save_r12], r12
        .savereg r12, ofsSaveR12
        mov     [save_r13], r13
        .savereg r13, ofsSaveR13
        .endprolog

;;; Retrieve cur_match into r8d, where it stays for the whole function.
;;; The 32-bit move also clears the high 32 bits of r8, which may hold
;;; garbage on entry.  On the zlib build rcx keeps pointing at the
;;; deflate_state structure until the function returns.

IFDEF INFOZIP
        mov     r8d, ecx
ELSE
        mov     r8d, edx
ENDIF


;;; uInt wmask = s->w_mask;
;;; unsigned chain_length = s->max_chain_length;
;;; if (s->prev_length >= s->good_match) {
;;;     chain_length >>= 2;
;;; }

        mov     edi, prev_length
        mov     esi, good_match
        mov     eax, WMask
        mov     ebx, max_chain_length
        cmp     edi, esi
        jl      LastMatchGood
        shr     ebx, 2
LastMatchGood:

;;; chainlen is decremented once beforehand so that the function can
;;; use the sign flag instead of the zero flag for the exit test.
;;; It is then shifted into the high word, to make room for the wmask
;;; value, which it will always accompany.

        dec     ebx
        shl     ebx, 16
        or      ebx, eax

;;; on zlib only
;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;

IFDEF INFOZIP
        mov     [chainlenwmask], ebx
; on infozip nice_match = [nice_match]
ELSE
        mov     eax, nice_match
        mov     [chainlenwmask], ebx
        mov     r10d, Lookahead
        cmp     r10d, eax
        cmovnl  r10d, eax               ; r10d = nice_match unless lookahead is smaller
        mov     [nicematch], r10d
ENDIF

;;; register Bytef *scan = s->window + s->strstart;
        mov     r10, window_ad
        mov     ebp, strstart
        lea     r13, [r10 + rbp]

;;; Determine how many bytes the scan ptr is off from being
;;; dword-aligned: r9 = scan, r13 = (-scan) & 3.

        mov     r9, r13
        neg     r13
        and     r13, 3

;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;
IFDEF INFOZIP
        mov     eax, 07efah             ; MAX_DIST = WSIZE - MIN_LOOKAHEAD = 08000h - 262
ELSE
        mov     eax, window_size
        sub     eax, MIN_LOOKAHEAD
ENDIF
        xor     edi, edi
        sub     ebp, eax

;;; int best_len = s->prev_length;
        mov     r11d, prev_length       ; mov leaves the flags of the sub intact

        cmovng  ebp, edi                ; limit = 0 when strstart <= MAX_DIST

;;; Keep the sum s->window + best_len in rsi.

        lea     rsi, [r10 + r11]

;;; register ush scan_start = *(ushf*)scan;
;;; register ush scan_end = *(ushf*)(scan+best_len-1);
;;; Posf *prev = s->prev;

        movzx   r12d, word ptr [r9]
        movzx   ebx, word ptr [r9 + r11 - 1]

        mov     rdi, prev_ad

;;; Jump into the main loop.

        mov     edx, [chainlenwmask]

        cmp     bx, word ptr [rsi + r8 - 1]
        jz      LookupLoopIsZero

LookupLoop1:
        and     r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]
        cmp     r8d, ebp
        jbe     LeaveNow
        sub     edx, 00010000h
        js      LeaveNow

LoopEntry1:
        cmp     bx, word ptr [rsi + r8 - 1]
        jz      LookupLoopIsZero

LookupLoop2:
        and     r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]
        cmp     r8d, ebp
        jbe     LeaveNow
        sub     edx, 00010000h
        js      LeaveNow

LoopEntry2:
        cmp     bx, word ptr [rsi + r8 - 1]
        jz      LookupLoopIsZero

LookupLoop4:
        and     r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]
        cmp     r8d, ebp
        jbe     LeaveNow
        sub     edx, 00010000h
        js      LeaveNow

LoopEntry4:

        cmp     bx, word ptr [rsi + r8 - 1]
        jnz     LookupLoop1
        jmp     LookupLoopIsZero


;;; do {
;;;     match = s->window + cur_match;
;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
;;;         *(ushf*)match != scan_start) continue;
;;;     [...]
;;; } while ((cur_match = prev[cur_match & wmask]) > limit
;;;          && --chain_length != 0);
;;;
;;; Here is the inner loop of the function. The function will spend the
;;; majority of its time in this loop, and majority of that time will
;;; be spent in the first ten instructions.
;;;
;;; Within this loop:
;;; ebx = scanend
;;; r8d = curmatch
;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
;;; rsi = windowbestlen - i.e., (window + bestlen)
;;; rdi = prev
;;; ebp = limit

LookupLoop:
        and     r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]
        cmp     r8d, ebp
        jbe     LeaveNow
        sub     edx, 00010000h
        js      LeaveNow

LoopEntry:

        cmp     bx, word ptr [rsi + r8 - 1]
        jnz     LookupLoop1
LookupLoopIsZero:
        cmp     r12w, word ptr [r10 + r8]
        jnz     LookupLoop1


;;; Store the current value of chainlen.
        mov     [chainlenwmask], edx

;;; Point rdi to the string under scrutiny, and rsi to the string we
;;; are hoping to match it up with. In actuality, rsi and rdi are
;;; both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and rdx is
;;; initialized to -(MAX_MATCH_8).

        lea     rsi, [r8 + r10]
        mov     rdx, 0fffffffffffffef8h ; -(MAX_MATCH_8)
        lea     rsi, [rsi + r13 + 0108h] ; MAX_MATCH_8
        lea     rdi, [r9 + r13 + 0108h]  ; MAX_MATCH_8

        prefetcht1 [rsi + rdx]
        prefetcht1 [rdi + rdx]


;;; Test the strings for equality, 8 bytes at a time. At the end,
;;; adjust rdx so that it is offset to the exact byte that mismatched.
;;;
;;; We already know at this point that the first three bytes of the
;;; strings match each other, and they can be safely passed over before
;;; starting the compare loop. So what this code does is skip over 0-3
;;; bytes, as much as necessary in order to dword-align the rdi
;;; pointer. (rsi will still be misaligned three times out of four.)
;;;
;;; It should be confessed that this loop usually does not represent
;;; much of the total running time. Replacing it with a more
;;; straightforward "rep cmpsb" would not drastically degrade
;;; performance.


LoopCmps:
        mov     rax, [rsi + rdx]
        xor     rax, [rdi + rdx]
        jnz     LeaveLoopCmps

        mov     rax, [rsi + rdx + 8]
        xor     rax, [rdi + rdx + 8]
        jnz     LeaveLoopCmps8


        mov     rax, [rsi + rdx + 8+8]
        xor     rax, [rdi + rdx + 8+8]
        jnz     LeaveLoopCmps16

        add     rdx, 8+8+8

        jnz     short LoopCmps
        jmp     short LenMaximum
LeaveLoopCmps16:
        add     rdx, 8
LeaveLoopCmps8:
        add     rdx, 8
LeaveLoopCmps:

;;; rax holds the XOR of the two mismatching qwords; locate the lowest
;;; non-zero byte and advance rdx to it.

        test    eax, 0000FFFFh
        jnz     LenLower

        test    eax, 0ffffffffh

        jnz     LenLower32

        add     rdx, 4
        shr     rax, 32
        or      ax, ax
        jnz     LenLower

LenLower32:
        shr     eax, 16
        add     rdx, 2
LenLower:
        sub     al, 1                   ; CF = 1 when the low byte matched
        adc     rdx, 0                  ; ... so step one more byte
;;; Calculate the length of the match. If it is longer than MAX_MATCH,
;;; then automatically accept it as the best possible match and leave.

        lea     rax, [rdi + rdx]
        sub     rax, r9
        cmp     eax, MAX_MATCH
        jge     LenMaximum

;;; If the length of the match is not longer than the best match we
;;; have so far, then forget it and return to the lookup loop.

        cmp     eax, r11d
        jg      LongerMatch

        lea     rsi, [r10 + r11]

        mov     rdi, prev_ad
        mov     edx, [chainlenwmask]
        jmp     LookupLoop

;;; s->match_start = cur_match;
;;; best_len = len;
;;; if (len >= nice_match) break;
;;; scan_end = *(ushf*)(scan+best_len-1);

LongerMatch:
        mov     r11d, eax
        mov     match_start, r8d
        cmp     eax, [nicematch]
        jge     LeaveNow

        lea     rsi, [r10 + rax]

        movzx   ebx, word ptr [r9 + rax - 1]
        mov     rdi, prev_ad
        mov     edx, [chainlenwmask]
        jmp     LookupLoop

;;; Accept the current string, with the maximum possible length.

LenMaximum:
        mov     r11d, MAX_MATCH
        mov     match_start, r8d

;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
;;; return s->lookahead;

LeaveNow:
IFDEF INFOZIP
        mov     eax, r11d
ELSE
        mov     eax, Lookahead
        cmp     r11d, eax
        cmovng  eax, r11d               ; eax = best_len unless it exceeds lookahead
ENDIF

;;; Restore the callee-saved registers, release the frame and return
;;; from whence we came.

        mov     rsi, [save_rsi]
        mov     rdi, [save_rdi]
        mov     rbx, [save_rbx]
        mov     rbp, [save_rbp]
        mov     r12, [save_r12]
        mov     r13, [save_r13]

        add     rsp, LocalVarsSize
        ret     0
; please don't remove this string !
; Your can freely use gvmat64 in any free or commercial app
; but it is far better don't remove the string in the binary!
        db      0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
longest_match ENDP
Chris@4 547
Chris@4 548 match_init PROC ; no-op entry point: reads no arguments, touches no registers or memory
Chris@4 549 ret 0 ; return immediately. NOTE(review): presumably the init hook the C side calls when using the asm matcher - confirm against caller
Chris@4 550 match_init ENDP
Chris@4 551
Chris@4 552
Chris@4 553 END