Mercurial > hg > sv-dependency-builds
comparison src/zlib-1.2.7/contrib/gcc_gvmat64/gvmat64.S @ 4:e13257ea84a4
Add bzip2, zlib, liblo, portaudio sources
author | Chris Cannam |
---|---|
date | Wed, 20 Mar 2013 13:59:52 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:6c505a35919a | 4:e13257ea84a4 |
---|---|
1 /* | |
2 ;uInt longest_match_x64( | |
3 ; deflate_state *s, | |
4 ; IPos cur_match); // current match | |
5 | |
6 ; gvmat64.S -- Asm portion of the optimized longest_match for 64 bits x86_64 | |
7 ; (AMD64 on Athlon 64, Opteron, Phenom | |
8 ; and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7) | |
9 ; this file is translation from gvmat64.asm to GCC 4.x (for Linux, Mac XCode) | |
10 ; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant. | |
11 ; | |
12 ; File written by Gilles Vollant, by converting to assembly the longest_match | |
13 ; from Jean-loup Gailly in deflate.c of zLib and infoZip zip. | |
14 ; and by taking inspiration on asm686 with masm, optimised assembly code | |
15 ; from Brian Raiter, written 1998 | |
16 ; | |
17 ; This software is provided 'as-is', without any express or implied | |
18 ; warranty. In no event will the authors be held liable for any damages | |
19 ; arising from the use of this software. | |
20 ; | |
21 ; Permission is granted to anyone to use this software for any purpose, | |
22 ; including commercial applications, and to alter it and redistribute it | |
23 ; freely, subject to the following restrictions: | |
24 ; | |
25 ; 1. The origin of this software must not be misrepresented; you must not | |
26 ; claim that you wrote the original software. If you use this software | |
27 ; in a product, an acknowledgment in the product documentation would be | |
28 ; appreciated but is not required. | |
29 ; 2. Altered source versions must be plainly marked as such, and must not be | |
30 ; misrepresented as being the original software | |
31 ; 3. This notice may not be removed or altered from any source distribution. | |
32 ; | |
33 ; http://www.zlib.net | |
34 ; http://www.winimage.com/zLibDll | |
35 ; http://www.muppetlabs.com/~breadbox/software/assembly.html | |
36 ; | |
37 ; to compile this file for zLib, I use option: | |
38 ; gcc -c -arch x86_64 gvmat64.S | |
39 | |
40 | |
41 ;uInt longest_match(s, cur_match) | |
42 ; deflate_state *s; | |
43 ; IPos cur_match; // current match / | |
44 ; | |
45 ; with XCode for Mac, I had strange error with some jump on intel syntax | |
46 ; this is why BEFORE_JMP and AFTER_JMP are used | |
47 */ | |
48 | |
49 | |
//; BEFORE_JMP / AFTER_JMP bracket jump instructions: BEFORE_JMP switches the | |
//; assembler to AT&T syntax and AFTER_JMP switches back to Intel syntax | |
//; without register prefixes (workaround for the XCode jump errors | |
//; mentioned in the file header). | |
50 #define BEFORE_JMP .att_syntax | |
51 #define AFTER_JMP .intel_syntax noprefix | |
52 | |
//; Unless NO_UNDERLINE is defined, both exported symbols are renamed with a | |
//; leading underscore. | |
53 #ifndef NO_UNDERLINE | |
54 # define match_init _match_init | |
55 # define longest_match _longest_match | |
56 #endif | |
57 | |
//; Everything that follows is written in Intel syntax, registers without '%'. | |
58 .intel_syntax noprefix | |
59 | |
//; Export both entry points and start the longest_match routine. | |
60 .globl match_init, longest_match | |
61 .text | |
62 longest_match: | |
63 | |
64 | |
65 | |
//; Size of the local area. Every slot below is addressed as | |
//; rsp + k - LocalVarsSize, i.e. at a negative offset from rsp (-88 .. -16). | |
//; No instruction in this routine adjusts rsp. | |
//; NOTE(review): this presumably relies on the System V AMD64 red zone | |
//; (memory below rsp that the callee may use) - confirm for the target ABI. | |
66 #define LocalVarsSize 96 | |
67 /* | |
68 ; register used : rax,rbx,rcx,rdx,rsi,rdi,rbp,r8,r9,r10,r11,r12,r13 | |
69 ; free register : r14,r15 | |
70 ; register can be saved : rsp | |
71 */ | |
72 | |
//; Working variables kept in memory: packed (chain length << 16 | wmask) | |
//; and the effective nice_match bound. | |
73 #define chainlenwmask (rsp + 8 - LocalVarsSize) | |
74 #define nicematch (rsp + 16 - LocalVarsSize) | |
75 | |
//; Save slots for registers. save_rdi and save_rsi are only referenced by | |
//; commented-out restore instructions at the end of the routine. | |
76 #define save_rdi (rsp + 24 - LocalVarsSize) | |
77 #define save_rsi (rsp + 32 - LocalVarsSize) | |
78 #define save_rbx (rsp + 40 - LocalVarsSize) | |
79 #define save_rbp (rsp + 48 - LocalVarsSize) | |
80 #define save_r12 (rsp + 56 - LocalVarsSize) | |
81 #define save_r13 (rsp + 64 - LocalVarsSize) | |
82 #define save_r14 (rsp + 72 - LocalVarsSize) | |
83 #define save_r15 (rsp + 80 - LocalVarsSize) | |
84 | |
85 | |
86 /* | |
87 ; all the +4 offsets are due to the addition of pending_buf_size (in zlib, | |
88 ; in the deflate_state structure) since the asm code was first written | |
89 ; (if you compile with zlib 1.0.4 or older, remove the +4). | |
90 ; Note : these values are good with an 8-byte boundary packed structure | |
91 */ | |
92 | |
//; Match length limits. MIN_LOOKAHEAD evaluates to 258 + 3 + 1 = 262 and is | |
//; subtracted from the window size when the search limit is computed. | |
93 #define MAX_MATCH 258 | |
94 #define MIN_MATCH 3 | |
95 #define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1) | |
96 | |
97 /* | |
98 ;;; Offsets for fields in the deflate_state structure. These numbers | |
99 ;;; are calculated from the definition of deflate_state, with the | |
100 ;;; assumption that the compiler will dword-align the fields. (Thus, | |
101 ;;; changing the definition of deflate_state could easily cause this | |
102 ;;; program to crash horribly, without so much as a warning at | |
103 ;;; compile time. Sigh.) | |
104 | |
105 ; all the +zlib1222add offsets are due to the addition of fields | |
106 ; in zlib in the deflate_state structure since the asm code was first written | |
107 ; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). | |
108 ; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). | |
109 ; (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8"). | |
110 */ | |
111 | |
112 | |
113 | |
114 /* you can check the structure offset by running | |
115 | |
116 #include <stdlib.h> | |
117 #include <stdio.h> | |
118 #include "deflate.h" | |
119 | |
120 void print_depl() | |
121 { | |
122 deflate_state ds; | |
123 deflate_state *s=&ds; | |
124 printf("size pointer=%u\n",(int)sizeof(void*)); | |
125 | |
126 printf("#define dsWSize %u\n",(int)(((char*)&(s->w_size))-((char*)s))); | |
127 printf("#define dsWMask %u\n",(int)(((char*)&(s->w_mask))-((char*)s))); | |
128 printf("#define dsWindow %u\n",(int)(((char*)&(s->window))-((char*)s))); | |
129 printf("#define dsPrev %u\n",(int)(((char*)&(s->prev))-((char*)s))); | |
130 printf("#define dsMatchLen %u\n",(int)(((char*)&(s->match_length))-((char*)s))); | |
131 printf("#define dsPrevMatch %u\n",(int)(((char*)&(s->prev_match))-((char*)s))); | |
132 printf("#define dsStrStart %u\n",(int)(((char*)&(s->strstart))-((char*)s))); | |
133 printf("#define dsMatchStart %u\n",(int)(((char*)&(s->match_start))-((char*)s))); | |
134 printf("#define dsLookahead %u\n",(int)(((char*)&(s->lookahead))-((char*)s))); | |
135 printf("#define dsPrevLen %u\n",(int)(((char*)&(s->prev_length))-((char*)s))); | |
136 printf("#define dsMaxChainLen %u\n",(int)(((char*)&(s->max_chain_length))-((char*)s))); | |
137 printf("#define dsGoodMatch %u\n",(int)(((char*)&(s->good_match))-((char*)s))); | |
138 printf("#define dsNiceMatch %u\n",(int)(((char*)&(s->nice_match))-((char*)s))); | |
139 } | |
140 */ | |
141 | |
//; Hard-coded byte offsets of deflate_state fields. They must match the | |
//; layout of the deflate_state actually compiled into the library; the | |
//; print_depl() snippet in the comment above prints the values to use. | |
//; dsMatchLen and dsPrevMatch are defined but not used by any accessor below. | |
142 #define dsWSize 68 | |
143 #define dsWMask 76 | |
144 #define dsWindow 80 | |
145 #define dsPrev 96 | |
146 #define dsMatchLen 144 | |
147 #define dsPrevMatch 148 | |
148 #define dsStrStart 156 | |
149 #define dsMatchStart 160 | |
150 #define dsLookahead 164 | |
151 #define dsPrevLen 168 | |
152 #define dsMaxChainLen 172 | |
153 #define dsGoodMatch 188 | |
154 #define dsNiceMatch 192 | |
155 | |
//; Memory operands for the fields above. All of them are relative to rcx, | |
//; so rcx must hold the deflate_state pointer wherever they are used. | |
156 #define window_size [ rcx + dsWSize] | |
157 #define WMask [ rcx + dsWMask] | |
158 #define window_ad [ rcx + dsWindow] | |
159 #define prev_ad [ rcx + dsPrev] | |
160 #define strstart [ rcx + dsStrStart] | |
161 #define match_start [ rcx + dsMatchStart] | |
162 #define Lookahead [ rcx + dsLookahead] //; 0ffffffffh on infozip | |
163 #define prev_length [ rcx + dsPrevLen] | |
164 #define max_chain_length [ rcx + dsMaxChainLen] | |
165 #define good_match [ rcx + dsGoodMatch] | |
166 #define nice_match [ rcx + dsNiceMatch] | |
167 | |
168 /* | |
169 ; windows: | |
170 ; parameter 1 in rcx(deflate state s), param 2 in rdx (cur match) | |
171 | |
172 ; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and | |
173 ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp | |
174 ; | |
175 ; All registers must be preserved across the call, except for | |
176 ; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch. | |
177 | |
178 ; | |
179 ; gcc on macosx-linux: | |
180 ; see http://www.x86-64.org/documentation/abi-0.99.pdf | |
181 ; param 1 in rdi, param 2 in rsi | |
182 ; rbx, rsp, rbp, r12 to r15 must be preserved | |
183 | |
184 ;;; Save registers that the compiler may be using, and adjust esp to | |
185 ;;; make room for our stack frame. | |
186 | |
187 | |
188 ;;; Retrieve the function arguments. r8d will hold cur_match | |
189 ;;; throughout the entire function. rcx will hold the pointer to the | |
190 ;;; deflate_state structure (it is the base register used by the | |
191 ;;; field accessor macros). | |
192 | |
193 ; ms: parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match) | |
194 ; mac: param 1 in rdi, param 2 rsi | |
195 ; this clears the high 32 bits of r8, which can be garbage in both r8 and rdx | |
196 */ | |
//; Entry sequence: save the registers that are restored before returning, | |
//; fetch the two arguments, then build the packed chain-length/wmask word | |
//; and the effective nice_match bound. | |
197 mov [save_rbx],rbx | |
198 mov [save_rbp],rbp | |
199 | |
200 | |
//; rcx = first argument (deflate_state pointer); the field macros use rcx. | |
201 mov rcx,rdi | |
202 | |
//; r8d = second argument (cur_match); the 32-bit move zero-extends into r8. | |
203 mov r8d,esi | |
204 | |
205 | |
206 mov [save_r12],r12 | |
207 mov [save_r13],r13 | |
//; r14 and r15 are saved and restored although the routine never modifies them. | |
208 mov [save_r14],r14 | |
209 mov [save_r15],r15 | |
210 | |
211 | |
212 //;;; uInt wmask = s->w_mask; | |
213 //;;; unsigned chain_length = s->max_chain_length; | |
214 //;;; if (s->prev_length >= s->good_match) { | |
215 //;;; chain_length >>= 2; | |
216 //;;; } | |
217 | |
218 | |
219 mov edi, prev_length | |
220 mov esi, good_match | |
221 mov eax, WMask | |
222 mov ebx, max_chain_length | |
//; Signed compare: the shift is skipped when prev_length < good_match. | |
223 cmp edi, esi | |
224 jl LastMatchGood | |
225 shr ebx, 2 | |
226 LastMatchGood: | |
227 | |
228 //;;; chainlen is decremented once beforehand so that the function can | |
229 //;;; use the sign flag instead of the zero flag for the exit test. | |
230 //;;; It is then shifted into the high word, to make room for the wmask | |
231 //;;; value, which it will always accompany. | |
//;;; NOTE(review): this packing assumes wmask fits in the low 16 bits - confirm. | |
232 | |
233 dec ebx | |
234 shl ebx, 16 | |
235 or ebx, eax | |
236 | |
237 //;;; on zlib only | |
238 //;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; | |
239 | |
240 | |
241 | |
242 mov eax, nice_match | |
243 mov [chainlenwmask], ebx | |
244 mov r10d, Lookahead | |
//; Signed compare: when lookahead >= nice_match, r10d becomes nice_match, so | |
//; the stored bound is the smaller of the two values. | |
245 cmp r10d, eax | |
246 cmovnl r10d, eax | |
247 mov [nicematch],r10d | |
248 | |
249 | |
250 | |
//;;; Set up scan pointer, alignment skip, search limit, best_len and the | |
//;;; scan_start / scan_end words, then enter the unrolled chain walk. | |
251 //;;; register Bytef *scan = s->window + s->strstart; | |
252 mov r10, window_ad | |
253 mov ebp, strstart | |
254 lea r13, [r10 + rbp] | |
255 | |
256 //;;; Determine how many bytes the scan ptr is off from being | |
257 //;;; dword-aligned. | |
258 | |
//; r9 keeps scan; r13 becomes (-scan) & 3, the distance to the next | |
//; 4-byte boundary. | |
259 mov r9,r13 | |
260 neg r13 | |
261 and r13,3 | |
262 | |
263 //;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? | |
264 //;;; s->strstart - (IPos)MAX_DIST(s) : NIL; | |
265 | |
266 | |
267 mov eax, window_size | |
268 sub eax, MIN_LOOKAHEAD | |
269 | |
270 | |
271 xor edi,edi | |
272 sub ebp, eax | |
273 | |
//; mov leaves the flags alone, so the cmovng below still tests the result of | |
//; the subtraction: limit (ebp) is forced to 0 when that result is <= 0. | |
274 mov r11d, prev_length | |
275 | |
276 cmovng ebp,edi | |
277 | |
278 //;;; int best_len = s->prev_length; | |
279 | |
280 | |
281 //;;; Store the sum of s->window + best_len in rsi. | |
282 | |
283 lea rsi,[r10+r11] | |
284 | |
285 //;;; register ush scan_start = *(ushf*)scan; | |
286 //;;; register ush scan_end = *(ushf*)(scan+best_len-1); | |
287 //;;; Posf *prev = s->prev; | |
288 | |
289 movzx r12d,word ptr [r9] | |
290 movzx ebx, word ptr [r9 + r11 - 1] | |
291 | |
292 mov rdi, prev_ad | |
293 | |
294 //;;; Jump into the main loop. | |
295 | |
296 mov edx, [chainlenwmask] | |
297 | |
//; First candidate: compare scan_end with the word at match + best_len - 1. | |
298 cmp bx,word ptr [rsi + r8 - 1] | |
299 jz LookupLoopIsZero | |
300 | |
301 | |
302 | |
//; Chain-walk step: cur_match = prev[cur_match & edx]; leave when | |
//; cur_match <= limit (unsigned) or when subtracting one chain step from the | |
//; high word of edx makes it negative. | |
303 LookupLoop1: | |
304 and r8d, edx | |
305 | |
306 movzx r8d, word ptr [rdi + r8*2] | |
307 cmp r8d, ebp | |
308 jbe LeaveNow | |
309 | |
310 | |
311 | |
312 sub edx, 0x00010000 | |
313 BEFORE_JMP | |
314 js LeaveNow | |
315 AFTER_JMP | |
316 | |
317 LoopEntry1: | |
318 cmp bx,word ptr [rsi + r8 - 1] | |
319 BEFORE_JMP | |
320 jz LookupLoopIsZero | |
321 AFTER_JMP | |
322 | |
//; Second unrolled copy of the same step. | |
323 LookupLoop2: | |
324 and r8d, edx | |
325 | |
326 movzx r8d, word ptr [rdi + r8*2] | |
327 cmp r8d, ebp | |
328 BEFORE_JMP | |
329 jbe LeaveNow | |
330 AFTER_JMP | |
331 sub edx, 0x00010000 | |
332 BEFORE_JMP | |
333 js LeaveNow | |
334 AFTER_JMP | |
335 | |
336 LoopEntry2: | |
337 cmp bx,word ptr [rsi + r8 - 1] | |
338 BEFORE_JMP | |
339 jz LookupLoopIsZero | |
340 AFTER_JMP | |
341 | |
//; Third unrolled copy; on a scan_end mismatch it loops back to LookupLoop1. | |
342 LookupLoop4: | |
343 and r8d, edx | |
344 | |
345 movzx r8d, word ptr [rdi + r8*2] | |
346 cmp r8d, ebp | |
347 BEFORE_JMP | |
348 jbe LeaveNow | |
349 AFTER_JMP | |
350 sub edx, 0x00010000 | |
351 BEFORE_JMP | |
352 js LeaveNow | |
353 AFTER_JMP | |
354 | |
355 LoopEntry4: | |
356 | |
357 cmp bx,word ptr [rsi + r8 - 1] | |
358 BEFORE_JMP | |
359 jnz LookupLoop1 | |
360 jmp LookupLoopIsZero | |
361 AFTER_JMP | |
362 /* | |
363 ;;; do { | |
364 ;;; match = s->window + cur_match; | |
365 ;;; if (*(ushf*)(match+best_len-1) != scan_end || | |
366 ;;; *(ushf*)match != scan_start) continue; | |
367 ;;; [...] | |
368 ;;; } while ((cur_match = prev[cur_match & wmask]) > limit | |
369 ;;; && --chain_length != 0); | |
370 ;;; | |
371 ;;; Here is the inner loop of the function. The function will spend the | |
372 ;;; majority of its time in this loop, and majority of that time will | |
373 ;;; be spent in the first ten instructions. | |
374 ;;; | |
375 ;;; Within this loop: | |
376 ;;; ebx = scanend | |
377 ;;; r8d = curmatch | |
378 ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) | |
379 ;;; esi = windowbestlen - i.e., (window + bestlen) | |
380 ;;; edi = prev | |
381 ;;; ebp = limit | |
382 */ | |
//; 16-byte aligned re-entry point used after a candidate has been examined. | |
//; It performs one chain-walk step and then the two quick checks. | |
383 .balign 16 | |
384 LookupLoop: | |
385 and r8d, edx | |
386 | |
387 movzx r8d, word ptr [rdi + r8*2] | |
//; Leave when cur_match <= limit (unsigned compare). | |
388 cmp r8d, ebp | |
389 BEFORE_JMP | |
390 jbe LeaveNow | |
391 AFTER_JMP | |
//; One chain step is consumed from the high word; leave once it goes negative. | |
392 sub edx, 0x00010000 | |
393 BEFORE_JMP | |
394 js LeaveNow | |
395 AFTER_JMP | |
396 | |
397 LoopEntry: | |
398 | |
//; Quick check 1: word at match + best_len - 1 must equal scan_end (bx). | |
//; On mismatch the walk continues in the unrolled loop at LookupLoop1. | |
399 cmp bx,word ptr [rsi + r8 - 1] | |
400 BEFORE_JMP | |
401 jnz LookupLoop1 | |
402 AFTER_JMP | |
//; Quick check 2: first word of the match must equal scan_start (r12w). | |
403 LookupLoopIsZero: | |
404 cmp r12w, word ptr [r10 + r8] | |
405 BEFORE_JMP | |
406 jnz LookupLoop1 | |
407 AFTER_JMP | |
408 | |
409 | |
410 //;;; Store the current value of chainlen. | |
411 mov [chainlenwmask], edx | |
412 /* | |
413 ;;; Point edi to the string under scrutiny, and esi to the string we | |
414 ;;; are hoping to match it up with. In actuality, esi and edi are | |
415 ;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is | |
416 ;;; initialized to -(MAX_MATCH_8 - scanalign). | |
417 */ | |
//; Full string comparison of the candidate (rsi) against scan (rdi). | |
//; rsi = window + cur_match + r13 + 0x108 and rdi = scan + r13 + 0x108; | |
//; rdx starts at -0x108 (-264) and counts up towards zero. | |
418 lea rsi,[r8+r10] | |
419 mov rdx, 0xfffffffffffffef8 //; -(MAX_MATCH_8) | |
420 lea rsi, [rsi + r13 + 0x0108] //;MAX_MATCH_8] | |
421 lea rdi, [r9 + r13 + 0x0108] //;MAX_MATCH_8] | |
422 | |
423 prefetcht1 [rsi+rdx] | |
424 prefetcht1 [rdi+rdx] | |
425 | |
426 /* | |
427 ;;; Test the strings for equality, 8 bytes at a time. At the end, | |
428 ;;; adjust rdx so that it is offset to the exact byte that mismatched. | |
429 ;;; | |
430 ;;; We already know at this point that the first three bytes of the | |
431 ;;; strings match each other, and they can be safely passed over before | |
432 ;;; starting the compare loop. So what this code does is skip over 0-3 | |
433 ;;; bytes, as much as necessary in order to dword-align the edi | |
434 ;;; pointer. (rsi will still be misaligned three times out of four.) | |
435 ;;; | |
436 ;;; It should be confessed that this loop usually does not represent | |
437 ;;; much of the total running time. Replacing it with a more | |
438 ;;; straightforward "rep cmpsb" would not drastically degrade | |
439 ;;; performance. | |
440 */ | |
441 | |
//; Each pass compares three 8-byte words; a non-zero XOR marks a difference. | |
442 LoopCmps: | |
443 mov rax, [rsi + rdx] | |
444 xor rax, [rdi + rdx] | |
445 jnz LeaveLoopCmps | |
446 | |
447 mov rax, [rsi + rdx + 8] | |
448 xor rax, [rdi + rdx + 8] | |
449 jnz LeaveLoopCmps8 | |
450 | |
451 | |
452 mov rax, [rsi + rdx + 8+8] | |
453 xor rax, [rdi + rdx + 8+8] | |
454 jnz LeaveLoopCmps16 | |
455 | |
//; Advance by 24 bytes; the add sets ZF when rdx reaches zero (264 = 11 * 24), | |
//; in which case the whole range matched. | |
456 add rdx,8+8+8 | |
457 | |
458 BEFORE_JMP | |
459 jnz LoopCmps | |
460 jmp LenMaximum | |
461 AFTER_JMP | |
462 | |
//; Make rdx the offset of the 8-byte word that differed (fall-through adds). | |
463 LeaveLoopCmps16: add rdx,8 | |
464 LeaveLoopCmps8: add rdx,8 | |
465 LeaveLoopCmps: | |
466 | |
//; rax holds the XOR of the differing words. Narrow the first difference | |
//; down to a 16-bit pair: bytes 0-1, then bytes 2-3, then the upper half. | |
467 test eax, 0x0000FFFF | |
468 jnz LenLower | |
469 | |
470 test eax,0xffffffff | |
471 | |
472 jnz LenLower32 | |
473 | |
//; Low 32 bits are equal: move on to the upper 32 bits. | |
474 add rdx,4 | |
475 shr rax,32 | |
476 or ax,ax | |
477 BEFORE_JMP | |
478 jnz LenLower | |
479 AFTER_JMP | |
480 | |
//; The difference is in the upper 16 bits of eax: bring it down, skip 2 bytes. | |
481 LenLower32: | |
482 shr eax,16 | |
483 add rdx,2 | |
484 | |
//; al is the XOR of the first byte of the differing pair. sub al,1 sets the | |
//; carry flag only when al was 0 (that byte matched), and adc then advances | |
//; rdx by one more byte. | |
485 LenLower: | |
486 sub al, 1 | |
487 adc rdx, 0 | |
488 //;;; Calculate the length of the match. If it is at least MAX_MATCH, | |
489 //;;; then automatically accept it as the best possible match and leave. | |
490 | |
//; len = (rdi + rdx) - scan, where r9 still holds scan. | |
491 lea rax, [rdi + rdx] | |
492 sub rax, r9 | |
493 cmp eax, MAX_MATCH | |
494 BEFORE_JMP | |
495 jge LenMaximum | |
496 AFTER_JMP | |
497 /* | |
498 ;;; If the length of the match is not longer than the best match we | |
499 ;;; have so far, then forget it and return to the lookup loop. | |
500 ;/////////////////////////////////// | |
501 */ | |
502 cmp eax, r11d | |
503 jg LongerMatch | |
504 | |
//; Not better: reload the registers the compare clobbered (rsi = window + | |
//; best_len, rdi = prev, edx = chainlenwmask) and continue the chain walk. | |
505 lea rsi,[r10+r11] | |
506 | |
507 mov rdi, prev_ad | |
508 mov edx, [chainlenwmask] | |
509 BEFORE_JMP | |
510 jmp LookupLoop | |
511 AFTER_JMP | |
512 /* | |
513 ;;; s->match_start = cur_match; | |
514 ;;; best_len = len; | |
515 ;;; if (len >= nice_match) break; | |
516 ;;; scan_end = *(ushf*)(scan+best_len-1); | |
517 */ | |
//; A longer match was found: record it as the new best (r11d = len, | |
//; s->match_start = cur_match) and stop if it reaches the stored nicematch bound. | |
518 LongerMatch: | |
519 mov r11d, eax | |
520 mov match_start, r8d | |
521 cmp eax, [nicematch] | |
522 BEFORE_JMP | |
523 jge LeaveNow | |
524 AFTER_JMP | |
525 | |
//; Otherwise refresh the loop state for the new best_len: rsi = window + | |
//; best_len, ebx = scan_end word, rdi = prev, edx = chainlenwmask. | |
526 lea rsi,[r10+rax] | |
527 | |
528 movzx ebx, word ptr [r9 + rax - 1] | |
529 mov rdi, prev_ad | |
530 mov edx, [chainlenwmask] | |
531 BEFORE_JMP | |
532 jmp LookupLoop | |
533 AFTER_JMP | |
534 | |
535 //;;; Accept the current string, with the maximum possible length. | |
536 | |
//; Maximum-length match: best_len = MAX_MATCH, s->match_start = cur_match, | |
//; then fall through to the common exit. | |
537 LenMaximum: | |
538 mov r11d,MAX_MATCH | |
539 mov match_start, r8d | |
540 | |
541 //;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len; | |
542 //;;; return s->lookahead; | |
543 | |
//; Common exit. Return value in eax: best_len (r11d) when it is <= lookahead | |
//; under a signed compare, otherwise lookahead. | |
544 LeaveNow: | |
545 mov eax, Lookahead | |
546 cmp r11d, eax | |
547 cmovng eax, r11d | |
548 | |
549 | |
550 | |
551 //;;; Restore the saved registers and return from whence we came. | |
552 | |
553 | |
//; rsi and rdi are not restored (the two instructions below are disabled). | |
554 // mov rsi,[save_rsi] | |
555 // mov rdi,[save_rdi] | |
556 mov rbx,[save_rbx] | |
557 mov rbp,[save_rbp] | |
558 mov r12,[save_r12] | |
559 mov r13,[save_r13] | |
560 mov r14,[save_r14] | |
561 mov r15,[save_r15] | |
562 | |
563 | |
564 ret 0 | |
565 //; please don't remove this string ! | |
566 //; You can freely use gvmat64 in any free or commercial app | |
567 //; but it is far better not to remove the string in the binary! | |
568 // db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0 | |
569 | |
570 | |
//; match_init: exported entry point that returns immediately without | |
//; touching any register or memory. | |
571 match_init: | |
572 ret 0 | |
573 | |
574 |