annotate src/zlib-1.2.7/contrib/masmx64/inffasx64.asm @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents e13257ea84a4
children
rev   line source
Chris@4 1 ; inffasx64.asm is a hand tuned assembler version of inffast.c - fast decoding
Chris@4 2 ; version for AMD64 on Windows using Microsoft C compiler
Chris@4 3 ;
Chris@4 4 ; inffasx64.asm is automatically converted from the AMD64 portion of inffas86.c
Chris@4 5 ; inffasx64.asm is called by inffas8664.c, which contains more info.
Chris@4 6
Chris@4 7
Chris@4 8 ; to compile this file, I use option
Chris@4 9 ; ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
Chris@4 10 ; with Microsoft Macro Assembler (x64) for AMD64
Chris@4 11 ;
Chris@4 12
Chris@4 13 ; This file compiles with Microsoft Macro Assembler (x64) for AMD64
Chris@4 14 ;
Chris@4 15 ; ml64.exe is provided with Visual Studio 2005/2008/2010 and Windows WDK
Chris@4 16 ;
Chris@4 17 ; (you can get Windows WDK with ml64 for AMD64 from
Chris@4 18 ; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
Chris@4 19 ;
Chris@4 20
Chris@4 21
Chris@4 22 .code
Chris@4 23 inffas8664fnc PROC ; /* in: rcx = pointer to the argument/state block, called "ar" below */
Chris@4 24 ; /* Decode loop: reads codes via ar.in, writes bytes via ar.out, leaves a status code in [ar+116]. */
Chris@4 25 ; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
Chris@4 26 ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
Chris@4 27 ;
Chris@4 28 ; All registers must be preserved across the call, except for
Chris@4 29 ; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
Chris@4 30 ; NOTE(review): the seven saves below are written to [rsp-8]..[rsp-56] without moving rsp.
Chris@4 31 ; NOTE(review): the Microsoft x64 ABI has no red zone - confirm nothing can overwrite this area while we run.
Chris@4 32 mov [rsp-8],rsi ; /* save the callee-saved registers this routine uses */
Chris@4 33 mov [rsp-16],rdi
Chris@4 34 mov [rsp-24],r12
Chris@4 35 mov [rsp-32],r13
Chris@4 36 mov [rsp-40],r14
Chris@4 37 mov [rsp-48],r15
Chris@4 38 mov [rsp-56],rbx
Chris@4 39
Chris@4 40 mov rax,rcx ; /* rax = &ar */
Chris@4 41
Chris@4 42 mov [rax+8], rbp ; /* save regs rbp and rsp: ar+8 = caller's rbp */
Chris@4 43 mov [rax], rsp ; /* ar+0 = caller's rsp */
Chris@4 44
Chris@4 45 mov rsp, rax ; /* make rsp point to &ar; it is only a base pointer until the epilogue (no push/pop/call in between) */
Chris@4 46
Chris@4 47 mov rsi, [rsp+16] ; /* rsi = in */
Chris@4 48 mov rdi, [rsp+32] ; /* rdi = out */
Chris@4 49 mov r9, [rsp+24] ; /* r9 = last */
Chris@4 50 mov r10, [rsp+48] ; /* r10 = end */
Chris@4 51 mov rbp, [rsp+64] ; /* rbp = lcode */
Chris@4 52 mov r11, [rsp+72] ; /* r11 = dcode */
Chris@4 53 mov rdx, [rsp+80] ; /* rdx = hold */
Chris@4 54 mov ebx, [rsp+88] ; /* ebx = bits */
Chris@4 55 mov r12d, [rsp+100] ; /* r12d = lmask */
Chris@4 56 mov r13d, [rsp+104] ; /* r13d = dmask */
Chris@4 57 ; /* r14d = len */
Chris@4 58 ; /* r15d = dist */
Chris@4 59 ; /* other ar fields used below: +40 beg, +56 window, +92 wsize, +96 write, +116 status */
Chris@4 60
Chris@4 61 cld ; /* clear the direction flag so lods/stos/movs step upward */
Chris@4 62 cmp r10, rdi
Chris@4 63 je L_one_time ; /* if only one decode left (end == out) */
Chris@4 64 cmp r9, rsi
Chris@4 65
Chris@4 66 jne L_do_loop ; /* if (last != in) take the normal loop; else fall into the one-time path */
Chris@4 67
Chris@4 68
Chris@4 69 L_one_time: ; /* decode a single length/literal code, entering at the second decode slot of the loop */
Chris@4 70 mov r8, r12 ; /* r8 = lmask */
Chris@4 71 cmp bl, 32
Chris@4 72 ja L_get_length_code_one_time ; /* if (32 < bits) no refill needed */
Chris@4 73
Chris@4 74 lodsd ; /* eax = *(uint *)in++ */
Chris@4 75 mov cl, bl ; /* cl = bits, needs it for shifting */
Chris@4 76 add bl, 32 ; /* bits += 32 */
Chris@4 77 shl rax, cl
Chris@4 78 or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
Chris@4 79 jmp L_get_length_code_one_time
Chris@4 80
Chris@4 81 ALIGN 4
Chris@4 82 L_while_test: ; /* loop condition: continue while (out < end && in < last) */
Chris@4 83 cmp r10, rdi
Chris@4 84 jbe L_break_loop ; /* if (end <= out) */
Chris@4 85 cmp r9, rsi
Chris@4 86 jbe L_break_loop ; /* if (last <= in) */
Chris@4 87
Chris@4 88 L_do_loop:
Chris@4 89 mov r8, r12 ; /* r8 = lmask */
Chris@4 90 cmp bl, 32
Chris@4 91 ja L_get_length_code ; /* if (32 < bits) */
Chris@4 92
Chris@4 93 lodsd ; /* eax = *(uint *)in++ */
Chris@4 94 mov cl, bl ; /* cl = bits, needs it for shifting */
Chris@4 95 add bl, 32 ; /* bits += 32 */
Chris@4 96 shl rax, cl
Chris@4 97 or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
Chris@4 98
Chris@4 99 L_get_length_code: ; /* first decode slot; table entry in eax: al = op, ah = bits, high 16 bits = val */
Chris@4 100 and r8, rdx ; /* r8 &= hold */
Chris@4 101 mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
Chris@4 102
Chris@4 103 mov cl, ah ; /* cl = this.bits */
Chris@4 104 sub bl, ah ; /* bits -= this.bits */
Chris@4 105 shr rdx, cl ; /* hold >>= this.bits */
Chris@4 106
Chris@4 107 test al, al
Chris@4 108 jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
Chris@4 109
Chris@4 110 mov r8, r12 ; /* r8 = lmask */
Chris@4 111 shr eax, 16 ; /* output this.val char */
Chris@4 112 stosb ; /* *out++ = al, then fall through and decode a second code without re-testing the loop */
Chris@4 113
Chris@4 114 L_get_length_code_one_time: ; /* second decode slot; also the target of the one-time path */
Chris@4 115 and r8, rdx ; /* r8 &= hold */
Chris@4 116 mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
Chris@4 117
Chris@4 118 L_dolen: ; /* also entered from L_test_for_second_level_length with a new table entry in eax */
Chris@4 119 mov cl, ah ; /* cl = this.bits */
Chris@4 120 sub bl, ah ; /* bits -= this.bits */
Chris@4 121 shr rdx, cl ; /* hold >>= this.bits */
Chris@4 122
Chris@4 123 test al, al
Chris@4 124 jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
Chris@4 125
Chris@4 126 shr eax, 16 ; /* output this.val char */
Chris@4 127 stosb ; /* *out++ = al */
Chris@4 128 jmp L_while_test
Chris@4 129
Chris@4 130 ALIGN 4
Chris@4 131 L_test_for_length_base: ; /* op != 0: length base, second-level link, end of block, or invalid code */
Chris@4 132 mov r14d, eax ; /* len = this */
Chris@4 133 shr r14d, 16 ; /* len = this.val */
Chris@4 134 mov cl, al ; /* cl = this.op */
Chris@4 135
Chris@4 136 test al, 16
Chris@4 137 jz L_test_for_second_level_length ; /* if ((op & 16) == 0) 8% */
Chris@4 138 and cl, 15 ; /* op &= 15 */
Chris@4 139 jz L_decode_distance ; /* if (!op) no extra length bits */
Chris@4 140
Chris@4 141 L_add_bits_to_len:
Chris@4 142 sub bl, cl ; /* bits -= op */
Chris@4 143 xor eax, eax
Chris@4 144 inc eax
Chris@4 145 shl eax, cl
Chris@4 146 dec eax ; /* (1 << op) - 1 */
Chris@4 147 and eax, edx ; /* eax &= hold */
Chris@4 148 shr rdx, cl ; /* hold >>= op */
Chris@4 149 add r14d, eax ; /* len += hold & mask[op] */
Chris@4 150
Chris@4 151 L_decode_distance:
Chris@4 152 mov r8, r13 ; /* r8 = dmask */
Chris@4 153 cmp bl, 32
Chris@4 154 ja L_get_distance_code ; /* if (32 < bits) */
Chris@4 155
Chris@4 156 lodsd ; /* eax = *(uint *)in++ */
Chris@4 157 mov cl, bl ; /* cl = bits, needs it for shifting */
Chris@4 158 add bl, 32 ; /* bits += 32 */
Chris@4 159 shl rax, cl
Chris@4 160 or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
Chris@4 161
Chris@4 162 L_get_distance_code:
Chris@4 163 and r8, rdx ; /* r8 &= hold */
Chris@4 164 mov eax, [r11+r8*4] ; /* eax = dcode[hold & dmask] */
Chris@4 165
Chris@4 166 L_dodist: ; /* also entered from L_test_for_second_level_dist with a new table entry in eax */
Chris@4 167 mov r15d, eax ; /* dist = this */
Chris@4 168 shr r15d, 16 ; /* dist = this.val */
Chris@4 169 mov cl, ah ; /* cl = this.bits */
Chris@4 170 sub bl, ah ; /* bits -= this.bits */
Chris@4 171 shr rdx, cl ; /* hold >>= this.bits */
Chris@4 172 mov cl, al ; /* cl = this.op */
Chris@4 173
Chris@4 174 test al, 16 ; /* if ((op & 16) == 0) */
Chris@4 175 jz L_test_for_second_level_dist
Chris@4 176 and cl, 15 ; /* op &= 15 */
Chris@4 177 jz L_check_dist_one ; /* if (!op) no extra distance bits; try the dist == 1 shortcut */
Chris@4 178
Chris@4 179 L_add_bits_to_dist:
Chris@4 180 sub bl, cl ; /* bits -= op */
Chris@4 181 xor eax, eax
Chris@4 182 inc eax
Chris@4 183 shl eax, cl
Chris@4 184 dec eax ; /* (1 << op) - 1 */
Chris@4 185 and eax, edx ; /* eax &= hold */
Chris@4 186 shr rdx, cl ; /* hold >>= op */
Chris@4 187 add r15d, eax ; /* dist += hold & ((1 << op) - 1) */
Chris@4 188
Chris@4 189 L_check_window:
Chris@4 190 mov r8, rsi ; /* save in so from can use its reg */
Chris@4 191 mov rax, rdi
Chris@4 192 sub rax, [rsp+40] ; /* nbytes = out - beg */
Chris@4 193
Chris@4 194 cmp eax, r15d
Chris@4 195 jb L_clip_window ; /* if (dist > nbytes) 4.2% */
Chris@4 196
Chris@4 197 mov ecx, r14d ; /* ecx = len */
Chris@4 198 mov rsi, rdi
Chris@4 199 sub rsi, r15 ; /* from = out - dist */
Chris@4 200
Chris@4 201 sar ecx, 1 ; /* ecx = len / 2, CF = len & 1 */
Chris@4 202 jnc L_copy_two ; /* if len % 2 == 0 */
Chris@4 203
Chris@4 204 rep movsw ; /* odd len: copy len/2 words ... */
Chris@4 205 mov al, [rsi] ; /* ... then the one remaining byte */
Chris@4 206 mov [rdi], al
Chris@4 207 inc rdi
Chris@4 208
Chris@4 209 mov rsi, r8 ; /* move in back to %rsi, toss from */
Chris@4 210 jmp L_while_test
Chris@4 211
Chris@4 212 L_copy_two:
Chris@4 213 rep movsw ; /* even len: copy len/2 words */
Chris@4 214 mov rsi, r8 ; /* move in back to %rsi, toss from */
Chris@4 215 jmp L_while_test
Chris@4 216
Chris@4 217 ALIGN 4
Chris@4 218 L_check_dist_one:
Chris@4 219 cmp r15d, 1 ; /* if dist 1, is a memset */
Chris@4 220 jne L_check_window
Chris@4 221 cmp [rsp+40], rdi ; /* if out == beg, outside window */
Chris@4 222 je L_check_window
Chris@4 223
Chris@4 224 mov ecx, r14d ; /* ecx = len */
Chris@4 225 mov al, [rdi-1] ; /* al = previous output byte */
Chris@4 226 mov ah, al ; /* ax = that byte twice, for stosw */
Chris@4 227
Chris@4 228 sar ecx, 1 ; /* ecx = len / 2, CF = len & 1 */
Chris@4 229 jnc L_set_two ; /* if len % 2 == 0 */
Chris@4 230 mov [rdi], al ; /* odd len: store the single byte first */
Chris@4 231 inc rdi
Chris@4 232
Chris@4 233 L_set_two:
Chris@4 234 rep stosw ; /* store len/2 words of the repeated byte */
Chris@4 235 jmp L_while_test
Chris@4 236
Chris@4 237 ALIGN 4
Chris@4 238 L_test_for_second_level_length: ; /* here cl still holds the unmasked op and r14d holds this.val */
Chris@4 239 test al, 64
Chris@4 240 jnz L_test_for_end_of_block ; /* if ((op & 64) != 0) */
Chris@4 241
Chris@4 242 xor eax, eax
Chris@4 243 inc eax
Chris@4 244 shl eax, cl
Chris@4 245 dec eax ; /* (1 << op) - 1 */
Chris@4 246 and eax, edx ; /* eax &= hold */
Chris@4 247 add eax, r14d ; /* eax += len */
Chris@4 248 mov eax, [rbp+rax*4] ; /* eax = lcode[val+(hold&mask[op])]*/
Chris@4 249 jmp L_dolen
Chris@4 250
Chris@4 251 ALIGN 4
Chris@4 252 L_test_for_second_level_dist: ; /* here cl still holds the unmasked op and r15d holds this.val */
Chris@4 253 test al, 64
Chris@4 254 jnz L_invalid_distance_code ; /* if ((op & 64) != 0) */
Chris@4 255
Chris@4 256 xor eax, eax
Chris@4 257 inc eax
Chris@4 258 shl eax, cl
Chris@4 259 dec eax ; /* (1 << op) - 1 */
Chris@4 260 and eax, edx ; /* eax &= hold */
Chris@4 261 add eax, r15d ; /* eax += dist */
Chris@4 262 mov eax, [r11+rax*4] ; /* eax = dcode[val+(hold&mask[op])]*/
Chris@4 263 jmp L_dodist
Chris@4 264
Chris@4 265 ALIGN 4
Chris@4 266 L_clip_window: ; /* dist reaches back before beg: part of the copy comes from the window buffer */
Chris@4 267 mov ecx, eax ; /* ecx = nbytes */
Chris@4 268 mov eax, [rsp+92] ; /* eax = wsize, prepare for dist cmp */
Chris@4 269 neg ecx ; /* nbytes = -nbytes */
Chris@4 270
Chris@4 271 cmp eax, r15d
Chris@4 272 jb L_invalid_distance_too_far ; /* if (dist > wsize) */
Chris@4 273
Chris@4 274 add ecx, r15d ; /* nbytes = dist - nbytes */
Chris@4 275 cmp dword ptr [rsp+96], 0
Chris@4 276 jne L_wrap_around_window ; /* if (write != 0) */
Chris@4 277
Chris@4 278 mov rsi, [rsp+56] ; /* from = window */
Chris@4 279 sub eax, ecx ; /* eax -= nbytes */
Chris@4 280 add rsi, rax ; /* from += wsize - nbytes */
Chris@4 281
Chris@4 282 mov eax, r14d ; /* eax = len */
Chris@4 283 cmp r14d, ecx
Chris@4 284 jbe L_do_copy ; /* if (nbytes >= len) */
Chris@4 285
Chris@4 286 sub eax, ecx ; /* eax -= nbytes */
Chris@4 287 rep movsb ; /* copy nbytes bytes from the window */
Chris@4 288 mov rsi, rdi
Chris@4 289 sub rsi, r15 ; /* from = &out[ -dist ] */
Chris@4 290 jmp L_do_copy ; /* copy the remaining eax bytes from the output */
Chris@4 291
Chris@4 292 ALIGN 4
Chris@4 293 L_wrap_around_window:
Chris@4 294 mov eax, [rsp+96] ; /* eax = write */
Chris@4 295 cmp ecx, eax
Chris@4 296 jbe L_contiguous_in_window ; /* if (write >= nbytes) */
Chris@4 297
Chris@4 298 mov esi, [rsp+92] ; /* from = wsize */
Chris@4 299 add rsi, [rsp+56] ; /* from += window */
Chris@4 300 add rsi, rax ; /* from += write */
Chris@4 301 sub rsi, rcx ; /* from -= nbytes */
Chris@4 302 sub ecx, eax ; /* nbytes -= write */
Chris@4 303
Chris@4 304 mov eax, r14d ; /* eax = len */
Chris@4 305 cmp eax, ecx
Chris@4 306 jbe L_do_copy ; /* if (nbytes >= len) */
Chris@4 307
Chris@4 308 sub eax, ecx ; /* len -= nbytes */
Chris@4 309 rep movsb ; /* copy nbytes bytes from the computed window position */
Chris@4 310 mov rsi, [rsp+56] ; /* from = window */
Chris@4 311 mov ecx, [rsp+96] ; /* nbytes = write */
Chris@4 312 cmp eax, ecx
Chris@4 313 jbe L_do_copy ; /* if (nbytes >= len) */
Chris@4 314
Chris@4 315 sub eax, ecx ; /* len -= nbytes */
Chris@4 316 rep movsb ; /* copy write bytes from the start of the window */
Chris@4 317 mov rsi, rdi
Chris@4 318 sub rsi, r15 ; /* from = out - dist */
Chris@4 319 jmp L_do_copy
Chris@4 320
Chris@4 321 ALIGN 4
Chris@4 322 L_contiguous_in_window:
Chris@4 323 mov rsi, [rsp+56] ; /* rsi = window */
Chris@4 324 add rsi, rax
Chris@4 325 sub rsi, rcx ; /* from += write - nbytes */
Chris@4 326
Chris@4 327 mov eax, r14d ; /* eax = len */
Chris@4 328 cmp eax, ecx
Chris@4 329 jbe L_do_copy ; /* if (nbytes >= len) */
Chris@4 330
Chris@4 331 sub eax, ecx ; /* len -= nbytes */
Chris@4 332 rep movsb ; /* copy nbytes bytes from the window */
Chris@4 333 mov rsi, rdi
Chris@4 334 sub rsi, r15 ; /* from = out - dist */
Chris@4 335 jmp L_do_copy ; /* unconditional: copy the remaining len bytes */
Chris@4 336
Chris@4 337 ALIGN 4
Chris@4 338 L_do_copy: ; /* common tail: copy eax bytes from rsi (from) to rdi (out) */
Chris@4 339 mov ecx, eax ; /* ecx = len */
Chris@4 340 rep movsb
Chris@4 341
Chris@4 342 mov rsi, r8 ; /* move in back to %rsi, toss from */
Chris@4 343 jmp L_while_test
Chris@4 344
Chris@4 345 L_test_for_end_of_block:
Chris@4 346 test al, 32
Chris@4 347 jz L_invalid_literal_length_code ; /* if ((op & 32) == 0) */
Chris@4 348 mov dword ptr [rsp+116], 1 ; /* status = 1: end of block */
Chris@4 349 jmp L_break_loop_with_status
Chris@4 350
Chris@4 351 L_invalid_literal_length_code:
Chris@4 352 mov dword ptr [rsp+116], 2 ; /* status = 2: invalid literal/length code */
Chris@4 353 jmp L_break_loop_with_status
Chris@4 354
Chris@4 355 L_invalid_distance_code:
Chris@4 356 mov dword ptr [rsp+116], 3 ; /* status = 3: invalid distance code */
Chris@4 357 jmp L_break_loop_with_status
Chris@4 358
Chris@4 359 L_invalid_distance_too_far:
Chris@4 360 mov dword ptr [rsp+116], 4 ; /* status = 4: invalid distance too far */
Chris@4 361 jmp L_break_loop_with_status
Chris@4 362
Chris@4 363 L_break_loop:
Chris@4 364 mov dword ptr [rsp+116], 0 ; /* status = 0: loop ended on the in/out limits */
Chris@4 365
Chris@4 366 L_break_loop_with_status:
Chris@4 367 ; /* put in, out, bits, and hold back into ar and restore rsp */
Chris@4 368 mov [rsp+16], rsi ; /* in */
Chris@4 369 mov [rsp+32], rdi ; /* out */
Chris@4 370 mov [rsp+88], ebx ; /* bits */
Chris@4 371 mov [rsp+80], rdx ; /* hold */
Chris@4 372
Chris@4 373 mov rax, [rsp] ; /* restore rbp and rsp */
Chris@4 374 mov rbp, [rsp+8]
Chris@4 375 mov rsp, rax
Chris@4 376
Chris@4 377
Chris@4 378 ; /* rsp is the caller's again: reload the registers saved below it on entry */
Chris@4 379 mov rsi,[rsp-8]
Chris@4 380 mov rdi,[rsp-16]
Chris@4 381 mov r12,[rsp-24]
Chris@4 382 mov r13,[rsp-32]
Chris@4 383 mov r14,[rsp-40]
Chris@4 384 mov r15,[rsp-48]
Chris@4 385 mov rbx,[rsp-56]
Chris@4 386
Chris@4 387 ret 0
Chris@4 388 ; :
Chris@4 389 ; : "m" (ar)
Chris@4 390 ; : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi",
Chris@4 391 ; "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
Chris@4 392 ; );
Chris@4 393
Chris@4 394 inffas8664fnc ENDP
Chris@4 395 ;_TEXT ENDS
Chris@4 396 END