annotate src/zlib-1.2.8/contrib/masmx64/inffasx64.asm @ 81:7029a4916348

Merge build update
author Chris Cannam
date Thu, 31 Oct 2019 13:36:58 +0000
parents 5ea0608b923f
children
rev   line source
Chris@43 1 ; inffasx64.asm is a hand tuned assembler version of inffast.c - fast decoding
Chris@43 2 ; version for AMD64 on Windows using Microsoft C compiler
Chris@43 3 ;
Chris@43 4 ; inffasx64.asm was automatically converted from the AMD64 portion of inffas86.c
Chris@43 5 ; inffasx64.asm is called by inffas8664.c, which contains more info.
Chris@43 6
Chris@43 7
Chris@43 8 ; To compile this file, I use the options
Chris@43 9 ; ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
Chris@43 10 ; with Microsoft Macro Assembler (x64) for AMD64
Chris@43 11 ;
Chris@43 12
Chris@43 13 ; This file compiles with Microsoft Macro Assembler (x64) for AMD64
Chris@43 14 ;
Chris@43 15 ; ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
Chris@43 16 ;
Chris@43 17 ; (you can get Windows WDK with ml64 for AMD64 from
Chris@43 18 ; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
Chris@43 19 ;
Chris@43 20
Chris@43 21
Chris@43 22 .code
Chris@43 23 inffas8664fnc PROC
Chris@43 24 ; Fast decode loop. On entry rcx holds the address of the parameter block ("ar"); rsp is pointed at it below so its fields are read as [rsp+offset].
Chris@43 25 ; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
Chris@43 26 ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
Chris@43 27 ;
Chris@43 28 ; All registers must be preserved across the call, except for
Chris@43 29 ; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
Chris@43 30 ; The non-scratch registers used below are stored just under the caller's rsp; rsp itself is not moved.
Chris@43 31 ; NOTE(review): the Microsoft x64 convention defines no red zone, so nothing reserves these slots - confirm this is acceptable.
Chris@43 32 mov [rsp-8],rsi
Chris@43 33 mov [rsp-16],rdi
Chris@43 34 mov [rsp-24],r12
Chris@43 35 mov [rsp-32],r13
Chris@43 36 mov [rsp-40],r14
Chris@43 37 mov [rsp-48],r15
Chris@43 38 mov [rsp-56],rbx
Chris@43 39
Chris@43 40 mov rax,rcx ; /* rax = &ar (first argument) */
Chris@43 41
Chris@43 42 mov [rax+8], rbp ; /* save regs rbp and rsp */
Chris@43 43 mov [rax], rsp
Chris@43 44
Chris@43 45 mov rsp, rax ; /* make rsp point to &ar */
Chris@43 46
Chris@43 47 mov rsi, [rsp+16] ; /* rsi = in */
Chris@43 48 mov rdi, [rsp+32] ; /* rdi = out */
Chris@43 49 mov r9, [rsp+24] ; /* r9 = last */
Chris@43 50 mov r10, [rsp+48] ; /* r10 = end */
Chris@43 51 mov rbp, [rsp+64] ; /* rbp = lcode */
Chris@43 52 mov r11, [rsp+72] ; /* r11 = dcode */
Chris@43 53 mov rdx, [rsp+80] ; /* rdx = hold */
Chris@43 54 mov ebx, [rsp+88] ; /* ebx = bits */
Chris@43 55 mov r12d, [rsp+100] ; /* r12d = lmask */
Chris@43 56 mov r13d, [rsp+104] ; /* r13d = dmask */
Chris@43 57 ; /* r14d = len */
Chris@43 58 ; /* r15d = dist */
Chris@43 59
Chris@43 60
Chris@43 61 cld ; /* string instructions below must step forward */
Chris@43 62 cmp r10, rdi
Chris@43 63 je L_one_time ; /* if only one decode left */
Chris@43 64 cmp r9, rsi
Chris@43 65
Chris@43 66 jne L_do_loop
Chris@43 67 ; /* in == last: fall through into the one-time path as well */
Chris@43 68
Chris@43 69 L_one_time:
Chris@43 70 mov r8, r12 ; /* r8 = lmask */
Chris@43 71 cmp bl, 32
Chris@43 72 ja L_get_length_code_one_time
Chris@43 73
Chris@43 74 lodsd ; /* eax = *(uint *)in++ */
Chris@43 75 mov cl, bl ; /* cl = bits, needs it for shifting */
Chris@43 76 add bl, 32 ; /* bits += 32 */
Chris@43 77 shl rax, cl
Chris@43 78 or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
Chris@43 79 jmp L_get_length_code_one_time
Chris@43 80
Chris@43 81 ALIGN 4
Chris@43 82 L_while_test:
Chris@43 83 cmp r10, rdi
Chris@43 84 jbe L_break_loop ; /* if (end <= out) break */
Chris@43 85 cmp r9, rsi
Chris@43 86 jbe L_break_loop ; /* if (last <= in) break */
Chris@43 87
Chris@43 88 L_do_loop:
Chris@43 89 mov r8, r12 ; /* r8 = lmask */
Chris@43 90 cmp bl, 32
Chris@43 91 ja L_get_length_code ; /* if (32 < bits) */
Chris@43 92
Chris@43 93 lodsd ; /* eax = *(uint *)in++ */
Chris@43 94 mov cl, bl ; /* cl = bits, needs it for shifting */
Chris@43 95 add bl, 32 ; /* bits += 32 */
Chris@43 96 shl rax, cl
Chris@43 97 or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
Chris@43 98
Chris@43 99 L_get_length_code:
Chris@43 100 and r8, rdx ; /* r8 &= hold */
Chris@43 101 mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
Chris@43 102
Chris@43 103 mov cl, ah ; /* cl = this.bits */
Chris@43 104 sub bl, ah ; /* bits -= this.bits */
Chris@43 105 shr rdx, cl ; /* hold >>= this.bits */
Chris@43 106
Chris@43 107 test al, al
Chris@43 108 jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
Chris@43 109
Chris@43 110 mov r8, r12 ; /* r8 = lmask */
Chris@43 111 shr eax, 16 ; /* output this.val char */
Chris@43 112 stosb ; /* *out++ = al */
Chris@43 113 ; /* literal stored: decode the next code right away, without refilling hold or re-testing the loop bounds */
Chris@43 114 L_get_length_code_one_time:
Chris@43 115 and r8, rdx ; /* r8 &= hold */
Chris@43 116 mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
Chris@43 117
Chris@43 118 L_dolen:
Chris@43 119 mov cl, ah ; /* cl = this.bits */
Chris@43 120 sub bl, ah ; /* bits -= this.bits */
Chris@43 121 shr rdx, cl ; /* hold >>= this.bits */
Chris@43 122
Chris@43 123 test al, al
Chris@43 124 jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
Chris@43 125
Chris@43 126 shr eax, 16 ; /* output this.val char */
Chris@43 127 stosb ; /* *out++ = al */
Chris@43 128 jmp L_while_test
Chris@43 129
Chris@43 130 ALIGN 4
Chris@43 131 L_test_for_length_base:
Chris@43 132 mov r14d, eax ; /* len = this */
Chris@43 133 shr r14d, 16 ; /* len = this.val */
Chris@43 134 mov cl, al ; /* cl = this.op */
Chris@43 135
Chris@43 136 test al, 16
Chris@43 137 jz L_test_for_second_level_length ; /* if ((op & 16) == 0) 8% */
Chris@43 138 and cl, 15 ; /* op &= 15 */
Chris@43 139 jz L_decode_distance ; /* if (!op) no extra bits to add to len */
Chris@43 140
Chris@43 141 L_add_bits_to_len:
Chris@43 142 sub bl, cl ; /* bits -= op */
Chris@43 143 xor eax, eax
Chris@43 144 inc eax
Chris@43 145 shl eax, cl
Chris@43 146 dec eax ; /* (1 << op) - 1 */
Chris@43 147 and eax, edx ; /* eax &= hold */
Chris@43 148 shr rdx, cl ; /* hold >>= op */
Chris@43 149 add r14d, eax ; /* len += hold & mask[op] */
Chris@43 150
Chris@43 151 L_decode_distance:
Chris@43 152 mov r8, r13 ; /* r8 = dmask */
Chris@43 153 cmp bl, 32
Chris@43 154 ja L_get_distance_code ; /* if (32 < bits) */
Chris@43 155
Chris@43 156 lodsd ; /* eax = *(uint *)in++ */
Chris@43 157 mov cl, bl ; /* cl = bits, needs it for shifting */
Chris@43 158 add bl, 32 ; /* bits += 32 */
Chris@43 159 shl rax, cl
Chris@43 160 or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
Chris@43 161
Chris@43 162 L_get_distance_code:
Chris@43 163 and r8, rdx ; /* r8 &= hold */
Chris@43 164 mov eax, [r11+r8*4] ; /* eax = dcode[hold & dmask] */
Chris@43 165
Chris@43 166 L_dodist:
Chris@43 167 mov r15d, eax ; /* dist = this */
Chris@43 168 shr r15d, 16 ; /* dist = this.val */
Chris@43 169 mov cl, ah
Chris@43 170 sub bl, ah ; /* bits -= this.bits */
Chris@43 171 shr rdx, cl ; /* hold >>= this.bits */
Chris@43 172 mov cl, al ; /* cl = this.op */
Chris@43 173
Chris@43 174 test al, 16 ; /* if ((op & 16) == 0) */
Chris@43 175 jz L_test_for_second_level_dist
Chris@43 176 and cl, 15 ; /* op &= 15 */
Chris@43 177 jz L_check_dist_one ; /* if (!op) no extra bits to add to dist */
Chris@43 178
Chris@43 179 L_add_bits_to_dist:
Chris@43 180 sub bl, cl ; /* bits -= op */
Chris@43 181 xor eax, eax
Chris@43 182 inc eax
Chris@43 183 shl eax, cl
Chris@43 184 dec eax ; /* (1 << op) - 1 */
Chris@43 185 and eax, edx ; /* eax &= hold */
Chris@43 186 shr rdx, cl ; /* hold >>= op */
Chris@43 187 add r15d, eax ; /* dist += hold & ((1 << op) - 1) */
Chris@43 188
Chris@43 189 L_check_window:
Chris@43 190 mov r8, rsi ; /* save in so from can use its reg */
Chris@43 191 mov rax, rdi
Chris@43 192 sub rax, [rsp+40] ; /* nbytes = out - beg */
Chris@43 193
Chris@43 194 cmp eax, r15d
Chris@43 195 jb L_clip_window ; /* if (dist > nbytes) 4.2% */
Chris@43 196
Chris@43 197 mov ecx, r14d ; /* ecx = len */
Chris@43 198 mov rsi, rdi
Chris@43 199 sub rsi, r15 ; /* from = out - dist */
Chris@43 200
Chris@43 201 sar ecx, 1 ; /* ecx = len / 2, CF = len & 1 */
Chris@43 202 jnc L_copy_two ; /* if len % 2 == 0 */
Chris@43 203
Chris@43 204 rep movsw ; /* copy len / 2 words */
Chris@43 205 mov al, [rsi] ; /* odd len: copy the one remaining byte */
Chris@43 206 mov [rdi], al
Chris@43 207 inc rdi
Chris@43 208
Chris@43 209 mov rsi, r8 ; /* move in back to %rsi, toss from */
Chris@43 210 jmp L_while_test
Chris@43 211
Chris@43 212 L_copy_two:
Chris@43 213 rep movsw ; /* copy len / 2 words */
Chris@43 214 mov rsi, r8 ; /* move in back to %rsi, toss from */
Chris@43 215 jmp L_while_test
Chris@43 216
Chris@43 217 ALIGN 4
Chris@43 218 L_check_dist_one:
Chris@43 219 cmp r15d, 1 ; /* if dist 1, is a memset */
Chris@43 220 jne L_check_window
Chris@43 221 cmp [rsp+40], rdi ; /* if out == beg, outside window */
Chris@43 222 je L_check_window
Chris@43 223
Chris@43 224 mov ecx, r14d ; /* ecx = len */
Chris@43 225 mov al, [rdi-1] ; /* al = previous output byte */
Chris@43 226 mov ah, al ; /* ax = that byte twice, for stosw */
Chris@43 227
Chris@43 228 sar ecx, 1 ; /* ecx = len / 2, CF = len & 1 */
Chris@43 229 jnc L_set_two
Chris@43 230 mov [rdi], al ; /* odd len: store one byte first */
Chris@43 231 inc rdi
Chris@43 232
Chris@43 233 L_set_two:
Chris@43 234 rep stosw ; /* store len / 2 words */
Chris@43 235 jmp L_while_test
Chris@43 236
Chris@43 237 ALIGN 4
Chris@43 238 L_test_for_second_level_length:
Chris@43 239 test al, 64
Chris@43 240 jnz L_test_for_end_of_block ; /* if ((op & 64) != 0) */
Chris@43 241
Chris@43 242 xor eax, eax
Chris@43 243 inc eax
Chris@43 244 shl eax, cl
Chris@43 245 dec eax ; /* (1 << op) - 1 */
Chris@43 246 and eax, edx ; /* eax &= hold */
Chris@43 247 add eax, r14d ; /* eax += len */
Chris@43 248 mov eax, [rbp+rax*4] ; /* eax = lcode[val+(hold&mask[op])]*/
Chris@43 249 jmp L_dolen
Chris@43 250
Chris@43 251 ALIGN 4
Chris@43 252 L_test_for_second_level_dist:
Chris@43 253 test al, 64
Chris@43 254 jnz L_invalid_distance_code ; /* if ((op & 64) != 0) */
Chris@43 255
Chris@43 256 xor eax, eax
Chris@43 257 inc eax
Chris@43 258 shl eax, cl
Chris@43 259 dec eax ; /* (1 << op) - 1 */
Chris@43 260 and eax, edx ; /* eax &= hold */
Chris@43 261 add eax, r15d ; /* eax += dist */
Chris@43 262 mov eax, [r11+rax*4] ; /* eax = dcode[val+(hold&mask[op])]*/
Chris@43 263 jmp L_dodist
Chris@43 264
Chris@43 265 ALIGN 4
Chris@43 266 L_clip_window:
Chris@43 267 mov ecx, eax ; /* ecx = nbytes */
Chris@43 268 mov eax, [rsp+92] ; /* eax = wsize, prepare for dist cmp */
Chris@43 269 neg ecx ; /* nbytes = -nbytes */
Chris@43 270
Chris@43 271 cmp eax, r15d
Chris@43 272 jb L_invalid_distance_too_far ; /* if (dist > wsize) */
Chris@43 273
Chris@43 274 add ecx, r15d ; /* nbytes = dist - nbytes */
Chris@43 275 cmp dword ptr [rsp+96], 0
Chris@43 276 jne L_wrap_around_window ; /* if (write != 0) */
Chris@43 277
Chris@43 278 mov rsi, [rsp+56] ; /* from = window */
Chris@43 279 sub eax, ecx ; /* eax -= nbytes */
Chris@43 280 add rsi, rax ; /* from += wsize - nbytes */
Chris@43 281
Chris@43 282 mov eax, r14d ; /* eax = len */
Chris@43 283 cmp r14d, ecx
Chris@43 284 jbe L_do_copy ; /* if (nbytes >= len) */
Chris@43 285
Chris@43 286 sub eax, ecx ; /* eax -= nbytes */
Chris@43 287 rep movsb ; /* copy nbytes bytes from the window */
Chris@43 288 mov rsi, rdi
Chris@43 289 sub rsi, r15 ; /* from = &out[ -dist ] */
Chris@43 290 jmp L_do_copy
Chris@43 291
Chris@43 292 ALIGN 4
Chris@43 293 L_wrap_around_window:
Chris@43 294 mov eax, [rsp+96] ; /* eax = write */
Chris@43 295 cmp ecx, eax
Chris@43 296 jbe L_contiguous_in_window ; /* if (write >= nbytes) */
Chris@43 297
Chris@43 298 mov esi, [rsp+92] ; /* from = wsize */
Chris@43 299 add rsi, [rsp+56] ; /* from += window */
Chris@43 300 add rsi, rax ; /* from += write */
Chris@43 301 sub rsi, rcx ; /* from -= nbytes */
Chris@43 302 sub ecx, eax ; /* nbytes -= write */
Chris@43 303
Chris@43 304 mov eax, r14d ; /* eax = len */
Chris@43 305 cmp eax, ecx
Chris@43 306 jbe L_do_copy ; /* if (nbytes >= len) */
Chris@43 307
Chris@43 308 sub eax, ecx ; /* len -= nbytes */
Chris@43 309 rep movsb ; /* copy nbytes bytes from the window */
Chris@43 310 mov rsi, [rsp+56] ; /* from = window */
Chris@43 311 mov ecx, [rsp+96] ; /* nbytes = write */
Chris@43 312 cmp eax, ecx
Chris@43 313 jbe L_do_copy ; /* if (nbytes >= len) */
Chris@43 314
Chris@43 315 sub eax, ecx ; /* len -= nbytes */
Chris@43 316 rep movsb ; /* copy nbytes bytes from the window */
Chris@43 317 mov rsi, rdi
Chris@43 318 sub rsi, r15 ; /* from = out - dist */
Chris@43 319 jmp L_do_copy
Chris@43 320
Chris@43 321 ALIGN 4
Chris@43 322 L_contiguous_in_window:
Chris@43 323 mov rsi, [rsp+56] ; /* rsi = window */
Chris@43 324 add rsi, rax
Chris@43 325 sub rsi, rcx ; /* from += write - nbytes */
Chris@43 326
Chris@43 327 mov eax, r14d ; /* eax = len */
Chris@43 328 cmp eax, ecx
Chris@43 329 jbe L_do_copy ; /* if (nbytes >= len) */
Chris@43 330
Chris@43 331 sub eax, ecx ; /* len -= nbytes */
Chris@43 332 rep movsb ; /* copy nbytes bytes from the window */
Chris@43 333 mov rsi, rdi
Chris@43 334 sub rsi, r15 ; /* from = out - dist */
Chris@43 335 jmp L_do_copy ; /* copy the remaining len bytes from the output */
Chris@43 336
Chris@43 337 ALIGN 4
Chris@43 338 L_do_copy:
Chris@43 339 mov ecx, eax ; /* ecx = len */
Chris@43 340 rep movsb ; /* copy len bytes from 'from' to out */
Chris@43 341
Chris@43 342 mov rsi, r8 ; /* move in back to %rsi, toss from */
Chris@43 343 jmp L_while_test
Chris@43 344
Chris@43 345 L_test_for_end_of_block:
Chris@43 346 test al, 32
Chris@43 347 jz L_invalid_literal_length_code
Chris@43 348 mov dword ptr [rsp+116], 1 ; /* status 1: end of block */
Chris@43 349 jmp L_break_loop_with_status
Chris@43 350
Chris@43 351 L_invalid_literal_length_code:
Chris@43 352 mov dword ptr [rsp+116], 2 ; /* status 2: invalid literal/length code */
Chris@43 353 jmp L_break_loop_with_status
Chris@43 354
Chris@43 355 L_invalid_distance_code:
Chris@43 356 mov dword ptr [rsp+116], 3 ; /* status 3: invalid distance code */
Chris@43 357 jmp L_break_loop_with_status
Chris@43 358
Chris@43 359 L_invalid_distance_too_far:
Chris@43 360 mov dword ptr [rsp+116], 4 ; /* status 4: invalid distance, too far back */
Chris@43 361 jmp L_break_loop_with_status
Chris@43 362
Chris@43 363 L_break_loop:
Chris@43 364 mov dword ptr [rsp+116], 0 ; /* status 0: loop ended on the in/out bounds test */
Chris@43 365
Chris@43 366 L_break_loop_with_status:
Chris@43 367 ; /* put in, out, bits, and hold back into ar and restore rsp */
Chris@43 368 mov [rsp+16], rsi ; /* in */
Chris@43 369 mov [rsp+32], rdi ; /* out */
Chris@43 370 mov [rsp+88], ebx ; /* bits */
Chris@43 371 mov [rsp+80], rdx ; /* hold */
Chris@43 372
Chris@43 373 mov rax, [rsp] ; /* restore rbp and rsp */
Chris@43 374 mov rbp, [rsp+8]
Chris@43 375 mov rsp, rax
Chris@43 376
Chris@43 377
Chris@43 378 ; /* reload the registers stored below the caller's rsp on entry */
Chris@43 379 mov rsi,[rsp-8]
Chris@43 380 mov rdi,[rsp-16]
Chris@43 381 mov r12,[rsp-24]
Chris@43 382 mov r13,[rsp-32]
Chris@43 383 mov r14,[rsp-40]
Chris@43 384 mov r15,[rsp-48]
Chris@43 385 mov rbx,[rsp-56]
Chris@43 386
Chris@43 387 ret 0
Chris@43 388 ; :
Chris@43 389 ; : "m" (ar)
Chris@43 390 ; : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi",
Chris@43 391 ; "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
Chris@43 392 ; );
Chris@43 393 ; (the commented lines above look like a leftover inline-asm operand/clobber list from the C version; they have no effect here)
Chris@43 394 inffas8664fnc ENDP
Chris@43 395 ;_TEXT ENDS
Chris@43 396 END