postprocess_template.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 /**
22  * @file
23  * mmx/mmx2/3dnow postprocess code.
24  */
25 
26 #include "libavutil/x86/asm.h"
27 
28 /* A single TEMPLATE_PP_* should be defined (to 1) when this template is
29  * included. The following macros will define its dependencies to 1 as well
30  * (like MMX2 depending on MMX), and will define to 0 all the others. Every
31  * TEMPLATE_PP_* needs to be undefined at the end. */
32 
/*
 * Template configuration.
 *
 * For every TEMPLATE_PP_<X> below: if the including file defined it, RENAME()
 * is set up to append the matching suffix to function names; otherwise the
 * macro is defined to 0 so that it can be tested with a plain #if later on.
 */
33 #ifdef TEMPLATE_PP_C
34 # define RENAME(a) a ## _C
35 #else
36 # define TEMPLATE_PP_C 0
37 #endif
38 
39 #ifdef TEMPLATE_PP_ALTIVEC
40 # define RENAME(a) a ## _altivec
41 #else
42 # define TEMPLATE_PP_ALTIVEC 0
43 #endif
44 
45 #ifdef TEMPLATE_PP_MMX
46 # define RENAME(a) a ## _MMX
47 #else
48 # define TEMPLATE_PP_MMX 0
49 #endif
50 
/* MMXEXT implies MMX: TEMPLATE_PP_MMX (0 from the block above) is forced to 1. */
51 #ifdef TEMPLATE_PP_MMXEXT
52 # undef TEMPLATE_PP_MMX
53 # define TEMPLATE_PP_MMX 1
54 # define RENAME(a) a ## _MMX2
55 #else
56 # define TEMPLATE_PP_MMXEXT 0
57 #endif
58 
/* 3DNow! implies MMX as well. */
59 #ifdef TEMPLATE_PP_3DNOW
60 # undef TEMPLATE_PP_MMX
61 # define TEMPLATE_PP_MMX 1
62 # define RENAME(a) a ## _3DNow
63 #else
64 # define TEMPLATE_PP_3DNOW 0
65 #endif
66 
/* SSE2 implies both MMX and MMXEXT. */
67 #ifdef TEMPLATE_PP_SSE2
68 # undef TEMPLATE_PP_MMX
69 # define TEMPLATE_PP_MMX 1
70 # undef TEMPLATE_PP_MMXEXT
71 # define TEMPLATE_PP_MMXEXT 1
72 # define RENAME(a) a ## _SSE2
73 #else
74 # define TEMPLATE_PP_SSE2 0
75 #endif
76 
/* Presumably clears definitions left over from a previous inclusion of this
 * template with a different TEMPLATE_PP_* selected -- TODO confirm. */
77 #undef REAL_PAVGB
78 #undef PAVGB
79 #undef PMINUB
80 #undef PMAXUB
81 
/*
 * PAVGB(a,b): emits the byte-average instruction as an asm string
 * ("pavgb" for MMXEXT, "pavgusb" for 3DNow!).
 * The PAVGB -> REAL_PAVGB indirection makes the preprocessor macro-expand the
 * arguments before they are stringified with '#'; callers in this file pass
 * operands such as (%%REGa), where REGa is presumably a macro from the
 * included asm.h -- TODO confirm.
 * REAL_PAVGB is left undefined when neither MMXEXT nor 3DNow! is selected.
 */
82 #if TEMPLATE_PP_MMXEXT
83 #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
84 #elif TEMPLATE_PP_3DNOW
85 #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
86 #endif
87 #define PAVGB(a,b) REAL_PAVGB(a,b)
88 
/*
 * PMINUB(x, y, t): unsigned byte minimum, result left in y.
 * MMXEXT form: a single "pminub"; the scratch register t is not used.
 * Plain MMX form (note the parameter names are swapped to b,a):
 *   t = a; t = saturating(t - b); a = a - t   =>   a = min(a, b)
 * so t is clobbered.
 */
89 #if TEMPLATE_PP_MMXEXT
90 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
91 #elif TEMPLATE_PP_MMX
92 #define PMINUB(b,a,t) \
93  "movq " #a ", " #t " \n\t"\
94  "psubusb " #b ", " #t " \n\t"\
95  "psubb " #t ", " #a " \n\t"
96 #endif
97 
/*
 * PMAXUB(a, b): unsigned byte maximum, result left in b.
 * Plain MMX form: b = saturating(b - a); b = b + a   =>   b = max(a, b)
 */
98 #if TEMPLATE_PP_MMXEXT
99 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
100 #elif TEMPLATE_PP_MMX
101 #define PMAXUB(a,b) \
102  "psubusb " #a ", " #b " \n\t"\
103  "paddb " #a ", " #b " \n\t"
104 #endif
105 
106 //FIXME? |255-0| = 1 (should not be a problem ...)
107 #if TEMPLATE_PP_MMX
108 /**
109  * Check if the middle 8x8 Block in the given 8x16 block is flat
 *
 * Eight rows of eight bytes are read, starting at src + 4*stride.
 * For each of the 7 pairs of consecutive rows and each column the code tests
 * (signed byte compare) whether (row[i] - row[i+1] + mmxDcOffset) is greater
 * than mmxDcThreshold; numEq is the number of byte positions that pass.
 * In parallel the per-column maximum and minimum over the eight rows are
 * tracked; dcOk ends up non-zero if in any column (max - min) exceeds twice
 * the corresponding byte of c->pQPb (saturating).
 *
 * @param src    pointer to the top of the 8x16 block
 * @param stride distance in bytes between two rows
 * @param c      context; reads mmxDcOffset[nonBQP], mmxDcThreshold[nonBQP],
 *               pQPb and ppMode.flatnessThreshold
 * @return 2 if numEq <= flatnessThreshold;
 *         otherwise 0 if dcOk is non-zero, 1 if dcOk is zero
110  */
111 static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){
112  int numEq= 0, dcOk;
113  src+= stride*4; // src points to begin of the 8x8 Block
 // Preload the per-byte offset (mm7) and threshold (mm6).
 // The next asm statement uses mm7/mm6 without loading them, i.e. it relies
 // on these registers surviving between the two statements.
114  __asm__ volatile(
115  "movq %0, %%mm7 \n\t"
116  "movq %1, %%mm6 \n\t"
117  : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
118  );
119 
 // Register roles below:
 //   mm0 = accumulated compare masks (0 or -1 per byte, summed per column)
 //   mm3 = running per-column minimum, mm4 = running per-column maximum
 //   mm5 = scratch for the plain-MMX PMINUB
 //   mm1/mm2 alternate between "previous row" and "current row"
120  __asm__ volatile(
121  "lea (%2, %3), %%"REG_a" \n\t"
122 // 0 1 2 3 4 5 6 7 8 9
123 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
124 
125  "movq (%2), %%mm0 \n\t"
126  "movq (%%"REG_a"), %%mm1 \n\t"
127  "movq %%mm0, %%mm3 \n\t"
128  "movq %%mm0, %%mm4 \n\t"
129  PMAXUB(%%mm1, %%mm4)
130  PMINUB(%%mm1, %%mm3, %%mm5)
131  "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
132  "paddb %%mm7, %%mm0 \n\t"
133  "pcmpgtb %%mm6, %%mm0 \n\t"
134 
135  "movq (%%"REG_a",%3), %%mm2 \n\t"
136  PMAXUB(%%mm2, %%mm4)
137  PMINUB(%%mm2, %%mm3, %%mm5)
138  "psubb %%mm2, %%mm1 \n\t"
139  "paddb %%mm7, %%mm1 \n\t"
140  "pcmpgtb %%mm6, %%mm1 \n\t"
141  "paddb %%mm1, %%mm0 \n\t"
142 
143  "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
144  PMAXUB(%%mm1, %%mm4)
145  PMINUB(%%mm1, %%mm3, %%mm5)
146  "psubb %%mm1, %%mm2 \n\t"
147  "paddb %%mm7, %%mm2 \n\t"
148  "pcmpgtb %%mm6, %%mm2 \n\t"
149  "paddb %%mm2, %%mm0 \n\t"
150 
 // advance REG_a by 4 rows so that it addresses rows 5..7 below
151  "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t"
152 
153  "movq (%2, %3, 4), %%mm2 \n\t"
154  PMAXUB(%%mm2, %%mm4)
155  PMINUB(%%mm2, %%mm3, %%mm5)
156  "psubb %%mm2, %%mm1 \n\t"
157  "paddb %%mm7, %%mm1 \n\t"
158  "pcmpgtb %%mm6, %%mm1 \n\t"
159  "paddb %%mm1, %%mm0 \n\t"
160 
161  "movq (%%"REG_a"), %%mm1 \n\t"
162  PMAXUB(%%mm1, %%mm4)
163  PMINUB(%%mm1, %%mm3, %%mm5)
164  "psubb %%mm1, %%mm2 \n\t"
165  "paddb %%mm7, %%mm2 \n\t"
166  "pcmpgtb %%mm6, %%mm2 \n\t"
167  "paddb %%mm2, %%mm0 \n\t"
168 
169  "movq (%%"REG_a", %3), %%mm2 \n\t"
170  PMAXUB(%%mm2, %%mm4)
171  PMINUB(%%mm2, %%mm3, %%mm5)
172  "psubb %%mm2, %%mm1 \n\t"
173  "paddb %%mm7, %%mm1 \n\t"
174  "pcmpgtb %%mm6, %%mm1 \n\t"
175  "paddb %%mm1, %%mm0 \n\t"
176 
177  "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
178  PMAXUB(%%mm1, %%mm4)
179  PMINUB(%%mm1, %%mm3, %%mm5)
180  "psubb %%mm1, %%mm2 \n\t"
181  "paddb %%mm7, %%mm2 \n\t"
182  "pcmpgtb %%mm6, %%mm2 \n\t"
183  "paddb %%mm2, %%mm0 \n\t"
184  "psubusb %%mm3, %%mm4 \n\t" // mm4 = per-column (max - min)
185 
186  " \n\t"
 // Horizontal sum of the 8 bytes of mm0; only the low byte of the result is
 // used afterwards (see the & 0xFF below).
187 #if TEMPLATE_PP_MMXEXT
188  "pxor %%mm7, %%mm7 \n\t"
189  "psadbw %%mm7, %%mm0 \n\t"
190 #else
191  "movq %%mm0, %%mm1 \n\t"
192  "psrlw $8, %%mm0 \n\t"
193  "paddb %%mm1, %%mm0 \n\t"
194  "movq %%mm0, %%mm1 \n\t"
195  "psrlq $16, %%mm0 \n\t"
196  "paddb %%mm1, %%mm0 \n\t"
197  "movq %%mm0, %%mm1 \n\t"
198  "psrlq $32, %%mm0 \n\t"
199  "paddb %%mm1, %%mm0 \n\t"
200 #endif
201  "movq %4, %%mm7 \n\t" // QP,..., QP
202  "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
203  "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0
 // fold the 64-bit result into 32 bits so that one movd sees every byte
204  "packssdw %%mm4, %%mm4 \n\t"
205  "movd %%mm0, %0 \n\t"
206  "movd %%mm4, %1 \n\t"
207 
208  : "=r" (numEq), "=r" (dcOk)
209  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
210  : "%"REG_a
211  );
212 
 // every passing byte contributed -1, so negating the low byte gives the count
213  numEq= (-numEq) &0xFF;
214  if(numEq > c->ppMode.flatnessThreshold){
 // note: despite its name, a non-zero dcOk selects return value 0 here
215  if(dcOk) return 0;
216  else return 1;
217  }else{
218  return 2;
219  }
220 }
221 #endif //TEMPLATE_PP_MMX
222 
223 /**
224  * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
225  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
 *
 * After src is advanced by 3*stride, rows 0..9 are read and rows 1..8 are
 * overwritten in place. Row 0 and row 9 are only used as edge values: the
 * "first" edge is row 0 if it is close enough to row 1, else row 1; the "last"
 * edge is row 9 if it is close enough to row 8, else row 8.
 *
 * @param src    pointer to the top of the 8x16 block
 * @param stride distance in bytes between two rows
 * @param c      context; the asm path reads c->pQPb, the C path reads c->QP
 *
 * NOTE(review): the asm path accepts the edge row when |diff| <= pQPb byte,
 * the C path when |diff| < c->QP; whether these are meant to match cannot be
 * told from this file section -- confirm.
226  */
227 #if !TEMPLATE_PP_ALTIVEC
228 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
229 {
230 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
231  src+= stride*3;
 // NOTE(review): %0 is declared as an input operand but is incremented by
 // "add %1, %0" and restored by the final "sub %1, %0"; the clobber list names
 // only REG_a/REG_c although the statement stores through these pointers.
232  __asm__ volatile( //"movv %0 %1 %2\n\t"
233  "movq %2, %%mm0 \n\t" // QP,..., QP
234  "pxor %%mm4, %%mm4 \n\t"
235 
236  "movq (%0), %%mm6 \n\t"
237  "movq (%0, %1), %%mm5 \n\t"
238  "movq %%mm5, %%mm1 \n\t"
239  "movq %%mm6, %%mm2 \n\t"
240  "psubusb %%mm6, %%mm5 \n\t"
241  "psubusb %%mm1, %%mm2 \n\t"
242  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
243  "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
244  "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
245 
 // per byte: mm6 = (diff <= QP) ? row 0 : row 1
246  "pand %%mm2, %%mm6 \n\t"
247  "pandn %%mm1, %%mm2 \n\t"
248  "por %%mm2, %%mm6 \n\t"// First Line to Filter
249 
250  "movq (%0, %1, 8), %%mm5 \n\t"
251  "lea (%0, %1, 4), %%"REG_a" \n\t"
252  "lea (%0, %1, 8), %%"REG_c" \n\t"
253  "sub %1, %%"REG_c" \n\t"
254  "add %1, %0 \n\t" // %0 points to line 1 not 0
255  "movq (%0, %1, 8), %%mm7 \n\t"
256  "movq %%mm5, %%mm1 \n\t"
257  "movq %%mm7, %%mm2 \n\t"
258  "psubusb %%mm7, %%mm5 \n\t"
259  "psubusb %%mm1, %%mm2 \n\t"
260  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
261  "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
262  "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
263 
 // per byte: mm7 = (diff <= QP) ? row 9 : row 8
264  "pand %%mm2, %%mm7 \n\t"
265  "pandn %%mm1, %%mm2 \n\t"
266  "por %%mm2, %%mm7 \n\t" // Last Line to Filter
267 
268 
269  // 1 2 3 4 5 6 7 8
270  // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1
271  // 6 4 2 2 1 1
272  // 6 4 4 2
273  // 6 8 2
274 
 // The filter taps are built from chains of PAVGB (rounded byte averages);
 // the digit strings in the trailing comments give the resulting weights.
 // Each "// X" marks a store of a finished output row.
275  "movq (%0, %1), %%mm0 \n\t" // 1
276  "movq %%mm0, %%mm1 \n\t" // 1
277  PAVGB(%%mm6, %%mm0) //1 1 /2
278  PAVGB(%%mm6, %%mm0) //3 1 /4
279 
280  "movq (%0, %1, 4), %%mm2 \n\t" // 1
281  "movq %%mm2, %%mm5 \n\t" // 1
282  PAVGB((%%REGa), %%mm2) // 11 /2
283  PAVGB((%0, %1, 2), %%mm2) // 211 /4
284  "movq %%mm2, %%mm3 \n\t" // 211 /4
285  "movq (%0), %%mm4 \n\t" // 1
286  PAVGB(%%mm4, %%mm3) // 4 211 /8
287  PAVGB(%%mm0, %%mm3) //642211 /16
288  "movq %%mm3, (%0) \n\t" // X
289  // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
290  "movq %%mm1, %%mm0 \n\t" // 1
291  PAVGB(%%mm6, %%mm0) //1 1 /2
292  "movq %%mm4, %%mm3 \n\t" // 1
293  PAVGB((%0,%1,2), %%mm3) // 1 1 /2
294  PAVGB((%%REGa,%1,2), %%mm5) // 11 /2
295  PAVGB((%%REGa), %%mm5) // 211 /4
296  PAVGB(%%mm5, %%mm3) // 2 2211 /8
297  PAVGB(%%mm0, %%mm3) //4242211 /16
298  "movq %%mm3, (%0,%1) \n\t" // X
299  // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
300  PAVGB(%%mm4, %%mm6) //11 /2
301  "movq (%%"REG_c"), %%mm0 \n\t" // 1
302  PAVGB((%%REGa, %1, 2), %%mm0) // 11/2
303  "movq %%mm0, %%mm3 \n\t" // 11/2
304  PAVGB(%%mm1, %%mm0) // 2 11/4
305  PAVGB(%%mm6, %%mm0) //222 11/8
306  PAVGB(%%mm2, %%mm0) //22242211/16
307  "movq (%0, %1, 2), %%mm2 \n\t" // 1
308  "movq %%mm0, (%0, %1, 2) \n\t" // X
309  // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
310  "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1
311  PAVGB((%%REGc), %%mm0) // 11 /2
312  PAVGB(%%mm0, %%mm6) //11 11 /4
313  PAVGB(%%mm1, %%mm4) // 11 /2
314  PAVGB(%%mm2, %%mm1) // 11 /2
315  PAVGB(%%mm1, %%mm6) //1122 11 /8
316  PAVGB(%%mm5, %%mm6) //112242211 /16
317  "movq (%%"REG_a"), %%mm5 \n\t" // 1
318  "movq %%mm6, (%%"REG_a") \n\t" // X
319  // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
320  "movq (%%"REG_a", %1, 4), %%mm6 \n\t" // 1
321  PAVGB(%%mm7, %%mm6) // 11 /2
322  PAVGB(%%mm4, %%mm6) // 11 11 /4
323  PAVGB(%%mm3, %%mm6) // 11 2211 /8
324  PAVGB(%%mm5, %%mm2) // 11 /2
325  "movq (%0, %1, 4), %%mm4 \n\t" // 1
326  PAVGB(%%mm4, %%mm2) // 112 /4
327  PAVGB(%%mm2, %%mm6) // 112242211 /16
328  "movq %%mm6, (%0, %1, 4) \n\t" // X
329  // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
330  PAVGB(%%mm7, %%mm1) // 11 2 /4
331  PAVGB(%%mm4, %%mm5) // 11 /2
332  PAVGB(%%mm5, %%mm0) // 11 11 /4
333  "movq (%%"REG_a", %1, 2), %%mm6 \n\t" // 1
334  PAVGB(%%mm6, %%mm1) // 11 4 2 /8
335  PAVGB(%%mm0, %%mm1) // 11224222 /16
336  "movq %%mm1, (%%"REG_a", %1, 2) \n\t" // X
337  // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
338  PAVGB((%%REGc), %%mm2) // 112 4 /8
339  "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1
340  PAVGB(%%mm0, %%mm6) // 1 1 /2
341  PAVGB(%%mm7, %%mm6) // 1 12 /4
342  PAVGB(%%mm2, %%mm6) // 1122424 /4
343  "movq %%mm6, (%%"REG_c") \n\t" // X
344  // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
345  PAVGB(%%mm7, %%mm5) // 11 2 /4
346  PAVGB(%%mm7, %%mm5) // 11 6 /8
347 
348  PAVGB(%%mm3, %%mm0) // 112 /4
349  PAVGB(%%mm0, %%mm5) // 112246 /16
350  "movq %%mm5, (%%"REG_a", %1, 4) \n\t" // X
351  "sub %1, %0 \n\t" // undo the earlier "add %1, %0"
352 
353  :
354  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
355  : "%"REG_a, "%"REG_c
356  );
357 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
 // byte offsets of rows 1..9 relative to src (after src += stride*3)
358  const int l1= stride;
359  const int l2= stride + l1;
360  const int l3= stride + l2;
361  const int l4= stride + l3;
362  const int l5= stride + l4;
363  const int l6= stride + l5;
364  const int l7= stride + l6;
365  const int l8= stride + l7;
366  const int l9= stride + l8;
367  int x;
368  src+= stride*3;
 // one column per iteration
369  for(x=0; x<BLOCK_SIZE; x++){
370  const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
371  const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
372 
 // sums[k] is a running sum over a window that slides down by one row
 // per step, padded with "first" above and "last" below; the +4 in sums[0]
 // carries through every entry, so each output below (the sum of two
 // entries) includes +8 before the >>4.
373  int sums[10];
374  sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
375  sums[1] = sums[0] - first + src[l4];
376  sums[2] = sums[1] - first + src[l5];
377  sums[3] = sums[2] - first + src[l6];
378  sums[4] = sums[3] - first + src[l7];
379  sums[5] = sums[4] - src[l1] + src[l8];
380  sums[6] = sums[5] - src[l2] + last;
381  sums[7] = sums[6] - src[l3] + last;
382  sums[8] = sums[7] - src[l4] + last;
383  sums[9] = sums[8] - src[l5] + last;
384 
 // all sums[] were computed from the unmodified rows above, so the
 // in-place stores below do not feed back into the filter
385  src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
386  src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
387  src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
388  src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
389  src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
390  src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
391  src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
392  src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
393 
394  src++;
395  }
396 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
397 }
398 #endif //TEMPLATE_PP_ALTIVEC
399 
400 /**
401  * Experimental Filter 1
402  * will not damage linear gradients
403  * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
404  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
405  * MMX2 version does correct clipping C version does not
 *
 * After src is advanced by 3*stride, rows 3..6 are read and rows 2..7 are
 * modified in place. With a = l3-l4, b = l4-l5, c = l5-l6 the step strength is
 * d = max(0, |b| - (|a| + |c|)/2); if d is within 2*QP, rows 2..4 are moved
 * towards row 5 and rows 5..7 towards row 4 by d/8, d/4 and 3d/8 respectively.
 *
 * @param src    pointer to the top of the 8x16 block
 * @param stride distance in bytes between two rows
 * @param co     context; the asm path reads co->pQPb, the C path reads co->QP
406  */
407 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
408 {
409 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
410  src+= stride*3;
411 
 // NOTE(review): the statement stores to memory through %0/REG_a/REG_c but
 // the clobber list names only the two registers.
412  __asm__ volatile(
413  "pxor %%mm7, %%mm7 \n\t" // 0
414  "lea (%0, %1), %%"REG_a" \n\t"
415  "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
416 // 0 1 2 3 4 5 6 7 8 9
417 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
418  "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3
419  "movq (%0, %1, 4), %%mm1 \n\t" // line 4
420  "movq %%mm1, %%mm2 \n\t" // line 4
421  "psubusb %%mm0, %%mm1 \n\t"
422  "psubusb %%mm2, %%mm0 \n\t"
423  "por %%mm1, %%mm0 \n\t" // |l3 - l4|
424  "movq (%%"REG_c"), %%mm3 \n\t" // line 5
425  "movq (%%"REG_c", %1), %%mm4 \n\t" // line 6
426  "movq %%mm3, %%mm5 \n\t" // line 5
427  "psubusb %%mm4, %%mm3 \n\t"
428  "psubusb %%mm5, %%mm4 \n\t"
429  "por %%mm4, %%mm3 \n\t" // |l5 - l6|
430  PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2
431  "movq %%mm2, %%mm1 \n\t" // line 4
432  "psubusb %%mm5, %%mm2 \n\t"
433  "movq %%mm2, %%mm4 \n\t"
434  "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
435  "psubusb %%mm1, %%mm5 \n\t"
436  "por %%mm5, %%mm4 \n\t" // |l4 - l5|
437  "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
438  "movq %%mm4, %%mm3 \n\t" // d
439  "movq %2, %%mm0 \n\t"
440  "paddusb %%mm0, %%mm0 \n\t" // 2*QP (saturating)
441  "psubusb %%mm0, %%mm4 \n\t"
442  "pcmpeqb %%mm7, %%mm4 \n\t" // d <= 2*QP ? -1 : 0
443  "psubusb "MANGLE(b01)", %%mm3 \n\t"
444  "pand %%mm4, %%mm3 \n\t" // d <= 2*QP ? (d reduced by b01, saturating) : 0
445 
446  PAVGB(%%mm7, %%mm3) // d/2
447  "movq %%mm3, %%mm1 \n\t" // d/2
448  PAVGB(%%mm7, %%mm3) // d/4
449  PAVGB(%%mm1, %%mm3) // 3*d/8
450 
 // mm2 is the per-byte sign mask; XOR-ing a row with it before and after
 // a saturating add/sub flips the direction of the correction where
 // l4 <= l5 while keeping the result clipped to the byte range.
451  "movq (%0, %1, 4), %%mm0 \n\t" // line 4
452  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
453  "psubusb %%mm3, %%mm0 \n\t"
454  "pxor %%mm2, %%mm0 \n\t"
455  "movq %%mm0, (%0, %1, 4) \n\t" // line 4
456 
457  "movq (%%"REG_c"), %%mm0 \n\t" // line 5
458  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
459  "paddusb %%mm3, %%mm0 \n\t"
460  "pxor %%mm2, %%mm0 \n\t"
461  "movq %%mm0, (%%"REG_c") \n\t" // line 5
462 
463  PAVGB(%%mm7, %%mm1) // d/4
464 
465  "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3
466  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
467  "psubusb %%mm1, %%mm0 \n\t"
468  "pxor %%mm2, %%mm0 \n\t"
469  "movq %%mm0, (%%"REG_a", %1, 2) \n\t" // line 3
470 
471  "movq (%%"REG_c", %1), %%mm0 \n\t" // line 6
472  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
473  "paddusb %%mm1, %%mm0 \n\t"
474  "pxor %%mm2, %%mm0 \n\t"
475  "movq %%mm0, (%%"REG_c", %1) \n\t" // line 6
476 
477  PAVGB(%%mm7, %%mm1) // d/8
478 
479  "movq (%%"REG_a", %1), %%mm0 \n\t" // line 2
480  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
481  "psubusb %%mm1, %%mm0 \n\t"
482  "pxor %%mm2, %%mm0 \n\t"
483  "movq %%mm0, (%%"REG_a", %1) \n\t" // line 2
484 
485  "movq (%%"REG_c", %1, 2), %%mm0 \n\t" // line 7
486  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
487  "paddusb %%mm1, %%mm0 \n\t"
488  "pxor %%mm2, %%mm0 \n\t"
489  "movq %%mm0, (%%"REG_c", %1, 2) \n\t" // line 7
490 
491  :
492  : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb)
493  : "%"REG_a, "%"REG_c
494  );
495 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
496 
 // byte offsets of rows 1..7 relative to src (after src += stride*3)
497  const int l1= stride;
498  const int l2= stride + l1;
499  const int l3= stride + l2;
500  const int l4= stride + l3;
501  const int l5= stride + l4;
502  const int l6= stride + l5;
503  const int l7= stride + l6;
504 // const int l8= stride + l7;
505 // const int l9= stride + l8;
506  int x;
507 
508  src+= stride*3;
 // one column per iteration
509  for(x=0; x<BLOCK_SIZE; x++){
510  int a= src[l3] - src[l4];
511  int b= src[l4] - src[l5];
512  int c= src[l5] - src[l6];
513 
 // step across the l4/l5 boundary minus the average neighbouring slope
514  int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
515  d= FFMAX(d, 0);
516 
517  if(d < co->QP*2){
518  int v = d * FFSIGN(-b);
519 
 // results are stored back to uint8_t without clipping
520  src[l2] +=v>>3;
521  src[l3] +=v>>2;
522  src[l4] +=(3*v)>>3;
523  src[l5] -=(3*v)>>3;
524  src[l6] -=v>>2;
525  src[l7] -=v>>3;
526  }
527  src++;
528  }
529 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
530 }
531 
532 #if !TEMPLATE_PP_ALTIVEC
533 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
534 {
535 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
536 /*
537  uint8_t tmp[16];
538  const int l1= stride;
539  const int l2= stride + l1;
540  const int l3= stride + l2;
541  const int l4= (int)tmp - (int)src - stride*3;
542  const int l5= (int)tmp - (int)src - stride*3 + 8;
543  const int l6= stride*3 + l3;
544  const int l7= stride + l6;
545  const int l8= stride + l7;
546 
547  memcpy(tmp, src+stride*7, 8);
548  memcpy(tmp+8, src+stride*8, 8);
549 */
550  src+= stride*4;
551  __asm__ volatile(
552 
553 #if 0 //slightly more accurate and slightly slower
554  "pxor %%mm7, %%mm7 \n\t" // 0
555  "lea (%0, %1), %%"REG_a" \n\t"
556  "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
557 // 0 1 2 3 4 5 6 7
558 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
559 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
560 
561 
562  "movq (%0, %1, 2), %%mm0 \n\t" // l2
563  "movq (%0), %%mm1 \n\t" // l0
564  "movq %%mm0, %%mm2 \n\t" // l2
565  PAVGB(%%mm7, %%mm0) // ~l2/2
566  PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
567  PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
568 
569  "movq (%%"REG_a"), %%mm1 \n\t" // l1
570  "movq (%%"REG_a", %1, 2), %%mm3 \n\t" // l3
571  "movq %%mm1, %%mm4 \n\t" // l1
572  PAVGB(%%mm7, %%mm1) // ~l1/2
573  PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
574  PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
575 
576  "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
577  "psubusb %%mm1, %%mm0 \n\t"
578  "psubusb %%mm4, %%mm1 \n\t"
579  "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
580 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
581 
582  "movq (%0, %1, 4), %%mm0 \n\t" // l4
583  "movq %%mm0, %%mm4 \n\t" // l4
584  PAVGB(%%mm7, %%mm0) // ~l4/2
585  PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
586  PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
587 
588  "movq (%%"REG_c"), %%mm2 \n\t" // l5
589  "movq %%mm3, %%mm5 \n\t" // l3
590  PAVGB(%%mm7, %%mm3) // ~l3/2
591  PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
592  PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
593 
594  "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
595  "psubusb %%mm3, %%mm0 \n\t"
596  "psubusb %%mm6, %%mm3 \n\t"
597  "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
598  "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
599 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
600 
601  "movq (%%"REG_c", %1), %%mm6 \n\t" // l6
602  "movq %%mm6, %%mm5 \n\t" // l6
603  PAVGB(%%mm7, %%mm6) // ~l6/2
604  PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
605  PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
606 
607  "movq (%%"REG_c", %1, 2), %%mm5 \n\t" // l7
608  "movq %%mm2, %%mm4 \n\t" // l5
609  PAVGB(%%mm7, %%mm2) // ~l5/2
610  PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
611  PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
612 
613  "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
614  "psubusb %%mm2, %%mm6 \n\t"
615  "psubusb %%mm4, %%mm2 \n\t"
616  "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
617 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
618 
619 
620  PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
621  "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ?
622  "paddusb "MANGLE(b01)", %%mm4 \n\t"
623  "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
624  "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
625  "pand %%mm4, %%mm3 \n\t"
626 
627  "movq %%mm3, %%mm1 \n\t"
628 // "psubusb "MANGLE(b01)", %%mm3 \n\t"
629  PAVGB(%%mm7, %%mm3)
630  PAVGB(%%mm7, %%mm3)
631  "paddusb %%mm1, %%mm3 \n\t"
632 // "paddusb "MANGLE(b01)", %%mm3 \n\t"
633 
634  "movq (%%"REG_a", %1, 2), %%mm6 \n\t" //l3
635  "movq (%0, %1, 4), %%mm5 \n\t" //l4
636  "movq (%0, %1, 4), %%mm4 \n\t" //l4
637  "psubusb %%mm6, %%mm5 \n\t"
638  "psubusb %%mm4, %%mm6 \n\t"
639  "por %%mm6, %%mm5 \n\t" // |l3-l4|
640  "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
641  "pxor %%mm6, %%mm0 \n\t"
642  "pand %%mm0, %%mm3 \n\t"
643  PMINUB(%%mm5, %%mm3, %%mm0)
644 
645  "psubusb "MANGLE(b01)", %%mm3 \n\t"
646  PAVGB(%%mm7, %%mm3)
647 
648  "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
649  "movq (%0, %1, 4), %%mm2 \n\t"
650  "pxor %%mm6, %%mm0 \n\t"
651  "pxor %%mm6, %%mm2 \n\t"
652  "psubb %%mm3, %%mm0 \n\t"
653  "paddb %%mm3, %%mm2 \n\t"
654  "pxor %%mm6, %%mm0 \n\t"
655  "pxor %%mm6, %%mm2 \n\t"
656  "movq %%mm0, (%%"REG_a", %1, 2) \n\t"
657  "movq %%mm2, (%0, %1, 4) \n\t"
658 #endif //0
659 
660  "lea (%0, %1), %%"REG_a" \n\t"
661  "pcmpeqb %%mm6, %%mm6 \n\t" // -1
662 // 0 1 2 3 4 5 6 7
663 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
664 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
665 
666 
667  "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // l3
668  "movq (%0, %1, 4), %%mm0 \n\t" // l4
669  "pxor %%mm6, %%mm1 \n\t" // -l3-1
670  PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
671 // mm1=-l3-1, mm0=128-q
672 
673  "movq (%%"REG_a", %1, 4), %%mm2 \n\t" // l5
674  "movq (%%"REG_a", %1), %%mm3 \n\t" // l2
675  "pxor %%mm6, %%mm2 \n\t" // -l5-1
676  "movq %%mm2, %%mm5 \n\t" // -l5-1
677  "movq "MANGLE(b80)", %%mm4 \n\t" // 128
678  "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
679  PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
680  PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
681  PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
682  PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
683 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
684 
685  "movq (%%"REG_a"), %%mm2 \n\t" // l1
686  "pxor %%mm6, %%mm2 \n\t" // -l1-1
687  PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
688  PAVGB((%0), %%mm1) // (l0-l3+256)/2
689  "movq "MANGLE(b80)", %%mm3 \n\t" // 128
690  PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
691  PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
692  PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
693 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
694 
695  PAVGB((%%REGc, %1), %%mm5) // (l6-l5+256)/2
696  "movq (%%"REG_c", %1, 2), %%mm1 \n\t" // l7
697  "pxor %%mm6, %%mm1 \n\t" // -l7-1
698  PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
699  "movq "MANGLE(b80)", %%mm2 \n\t" // 128
700  PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
701  PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
702  PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
703 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
704 
705  "movq "MANGLE(b00)", %%mm1 \n\t" // 0
706  "movq "MANGLE(b00)", %%mm5 \n\t" // 0
707  "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
708  "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
709  PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
710  PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
711  PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
712 
713 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
714 
715  "movq "MANGLE(b00)", %%mm7 \n\t" // 0
716  "movq %2, %%mm2 \n\t" // QP
717  PAVGB(%%mm6, %%mm2) // 128 + QP/2
718  "psubb %%mm6, %%mm2 \n\t"
719 
720  "movq %%mm4, %%mm1 \n\t"
721  "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
722  "pxor %%mm1, %%mm4 \n\t"
723  "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
724  "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
725  "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
726 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
727 
728  "movq %%mm4, %%mm3 \n\t" // d
729  "psubusb "MANGLE(b01)", %%mm4 \n\t"
730  PAVGB(%%mm7, %%mm4) // d/32
731  PAVGB(%%mm7, %%mm4) // (d + 32)/64
732  "paddb %%mm3, %%mm4 \n\t" // 5d/64
733  "pand %%mm2, %%mm4 \n\t"
734 
735  "movq "MANGLE(b80)", %%mm5 \n\t" // 128
736  "psubb %%mm0, %%mm5 \n\t" // q
737  "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
738  "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
739  "pxor %%mm7, %%mm5 \n\t"
740 
741  PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
742  "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
743 
744  "pand %%mm7, %%mm4 \n\t"
745  "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
746  "movq (%0, %1, 4), %%mm2 \n\t"
747  "pxor %%mm1, %%mm0 \n\t"
748  "pxor %%mm1, %%mm2 \n\t"
749  "paddb %%mm4, %%mm0 \n\t"
750  "psubb %%mm4, %%mm2 \n\t"
751  "pxor %%mm1, %%mm0 \n\t"
752  "pxor %%mm1, %%mm2 \n\t"
753  "movq %%mm0, (%%"REG_a", %1, 2) \n\t"
754  "movq %%mm2, (%0, %1, 4) \n\t"
755 
756  :
757  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
758  : "%"REG_a, "%"REG_c
759  );
760 
761 /*
762  {
763  int x;
764  src-= stride;
765  for(x=0; x<BLOCK_SIZE; x++){
766  const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
767  if(FFABS(middleEnergy)< 8*QP){
768  const int q=(src[l4] - src[l5])/2;
769  const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
770  const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
771 
772  int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
773  d= FFMAX(d, 0);
774 
775  d= (5*d + 32) >> 6;
776  d*= FFSIGN(-middleEnergy);
777 
778  if(q>0){
779  d= d<0 ? 0 : d;
780  d= d>q ? q : d;
781  }else{
782  d= d>0 ? 0 : d;
783  d= d<q ? q : d;
784  }
785 
786  src[l4]-= d;
787  src[l5]+= d;
788  }
789  src++;
790  }
791  src-=8;
792  for(x=0; x<8; x++){
793  int y;
794  for(y=4; y<6; y++){
795  int d= src[x+y*stride] - tmp[x+(y-4)*8];
796  int ad= FFABS(d);
797  static int max=0;
798  static int sum=0;
799  static int num=0;
800  static int bias=0;
801 
802  if(max<ad) max=ad;
803  sum+= ad>3 ? 1 : 0;
804  if(ad>3){
805  src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
806  }
807  if(y==4) bias+=d;
808  num++;
809  if(num%1000000 == 0){
810  av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias);
811  }
812  }
813  }
814 }
815 */
816 #elif TEMPLATE_PP_MMX
817  DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars
818  src+= stride*4;
819  __asm__ volatile(
820  "pxor %%mm7, %%mm7 \n\t"
821 // 0 1 2 3 4 5 6 7
822 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1
823 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1
824 
825  "movq (%0), %%mm0 \n\t"
826  "movq %%mm0, %%mm1 \n\t"
827  "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
828  "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
829 
830  "movq (%0, %1), %%mm2 \n\t"
831  "lea (%0, %1, 2), %%"REG_a" \n\t"
832  "movq %%mm2, %%mm3 \n\t"
833  "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
834  "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
835 
836  "movq (%%"REG_a"), %%mm4 \n\t"
837  "movq %%mm4, %%mm5 \n\t"
838  "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
839  "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
840 
841  "paddw %%mm0, %%mm0 \n\t" // 2L0
842  "paddw %%mm1, %%mm1 \n\t" // 2H0
843  "psubw %%mm4, %%mm2 \n\t" // L1 - L2
844  "psubw %%mm5, %%mm3 \n\t" // H1 - H2
845  "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
846  "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
847 
848  "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
849  "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
850  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
851  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
852 
853  "movq (%%"REG_a", %1), %%mm2 \n\t"
854  "movq %%mm2, %%mm3 \n\t"
855  "punpcklbw %%mm7, %%mm2 \n\t" // L3
856  "punpckhbw %%mm7, %%mm3 \n\t" // H3
857 
858  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
859  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
860  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
861  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
862  "movq %%mm0, (%3) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
863  "movq %%mm1, 8(%3) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
864 
865  "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
866  "movq %%mm0, %%mm1 \n\t"
867  "punpcklbw %%mm7, %%mm0 \n\t" // L4
868  "punpckhbw %%mm7, %%mm1 \n\t" // H4
869 
870  "psubw %%mm0, %%mm2 \n\t" // L3 - L4
871  "psubw %%mm1, %%mm3 \n\t" // H3 - H4
872  "movq %%mm2, 16(%3) \n\t" // L3 - L4
873  "movq %%mm3, 24(%3) \n\t" // H3 - H4
874  "paddw %%mm4, %%mm4 \n\t" // 2L2
875  "paddw %%mm5, %%mm5 \n\t" // 2H2
876  "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
877  "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
878 
879  "lea (%%"REG_a", %1), %0 \n\t"
880  "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
881  "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
882  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
883  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
884 //50 opcodes so far
885  "movq (%0, %1, 2), %%mm2 \n\t"
886  "movq %%mm2, %%mm3 \n\t"
887  "punpcklbw %%mm7, %%mm2 \n\t" // L5
888  "punpckhbw %%mm7, %%mm3 \n\t" // H5
889  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
890  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
891  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
892  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
893 
894  "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
895  "punpcklbw %%mm7, %%mm6 \n\t" // L6
896  "psubw %%mm6, %%mm2 \n\t" // L5 - L6
897  "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
898  "punpckhbw %%mm7, %%mm6 \n\t" // H6
899  "psubw %%mm6, %%mm3 \n\t" // H5 - H6
900 
901  "paddw %%mm0, %%mm0 \n\t" // 2L4
902  "paddw %%mm1, %%mm1 \n\t" // 2H4
903  "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
904  "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
905 
906  "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
907  "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
908  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
909  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
910 
911  "movq (%0, %1, 4), %%mm2 \n\t"
912  "movq %%mm2, %%mm3 \n\t"
913  "punpcklbw %%mm7, %%mm2 \n\t" // L7
914  "punpckhbw %%mm7, %%mm3 \n\t" // H7
915 
916  "paddw %%mm2, %%mm2 \n\t" // 2L7
917  "paddw %%mm3, %%mm3 \n\t" // 2H7
918  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
919  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
920 
921  "movq (%3), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
922  "movq 8(%3), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
923 
924 #if TEMPLATE_PP_MMXEXT
925  "movq %%mm7, %%mm6 \n\t" // 0
926  "psubw %%mm0, %%mm6 \n\t"
927  "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
928  "movq %%mm7, %%mm6 \n\t" // 0
929  "psubw %%mm1, %%mm6 \n\t"
930  "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
931  "movq %%mm7, %%mm6 \n\t" // 0
932  "psubw %%mm2, %%mm6 \n\t"
933  "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
934  "movq %%mm7, %%mm6 \n\t" // 0
935  "psubw %%mm3, %%mm6 \n\t"
936  "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
937 #else
938  "movq %%mm7, %%mm6 \n\t" // 0
939  "pcmpgtw %%mm0, %%mm6 \n\t"
940  "pxor %%mm6, %%mm0 \n\t"
941  "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
942  "movq %%mm7, %%mm6 \n\t" // 0
943  "pcmpgtw %%mm1, %%mm6 \n\t"
944  "pxor %%mm6, %%mm1 \n\t"
945  "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
946  "movq %%mm7, %%mm6 \n\t" // 0
947  "pcmpgtw %%mm2, %%mm6 \n\t"
948  "pxor %%mm6, %%mm2 \n\t"
949  "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
950  "movq %%mm7, %%mm6 \n\t" // 0
951  "pcmpgtw %%mm3, %%mm6 \n\t"
952  "pxor %%mm6, %%mm3 \n\t"
953  "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
954 #endif
955 
956 #if TEMPLATE_PP_MMXEXT
957  "pminsw %%mm2, %%mm0 \n\t"
958  "pminsw %%mm3, %%mm1 \n\t"
959 #else
960  "movq %%mm0, %%mm6 \n\t"
961  "psubusw %%mm2, %%mm6 \n\t"
962  "psubw %%mm6, %%mm0 \n\t"
963  "movq %%mm1, %%mm6 \n\t"
964  "psubusw %%mm3, %%mm6 \n\t"
965  "psubw %%mm6, %%mm1 \n\t"
966 #endif
967 
968  "movd %2, %%mm2 \n\t" // QP
969  "punpcklbw %%mm7, %%mm2 \n\t"
970 
971  "movq %%mm7, %%mm6 \n\t" // 0
972  "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
973  "pxor %%mm6, %%mm4 \n\t"
974  "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
975  "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
976  "pxor %%mm7, %%mm5 \n\t"
977  "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
978 // 100 opcodes
979  "psllw $3, %%mm2 \n\t" // 8QP
980  "movq %%mm2, %%mm3 \n\t" // 8QP
981  "pcmpgtw %%mm4, %%mm2 \n\t"
982  "pcmpgtw %%mm5, %%mm3 \n\t"
983  "pand %%mm2, %%mm4 \n\t"
984  "pand %%mm3, %%mm5 \n\t"
985 
986 
987  "psubusw %%mm0, %%mm4 \n\t" // hd
988  "psubusw %%mm1, %%mm5 \n\t" // ld
989 
990 
991  "movq "MANGLE(w05)", %%mm2 \n\t" // 5
992  "pmullw %%mm2, %%mm4 \n\t"
993  "pmullw %%mm2, %%mm5 \n\t"
994  "movq "MANGLE(w20)", %%mm2 \n\t" // 32
995  "paddw %%mm2, %%mm4 \n\t"
996  "paddw %%mm2, %%mm5 \n\t"
997  "psrlw $6, %%mm4 \n\t"
998  "psrlw $6, %%mm5 \n\t"
999 
1000  "movq 16(%3), %%mm0 \n\t" // L3 - L4
1001  "movq 24(%3), %%mm1 \n\t" // H3 - H4
1002 
1003  "pxor %%mm2, %%mm2 \n\t"
1004  "pxor %%mm3, %%mm3 \n\t"
1005 
1006  "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
1007  "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
1008  "pxor %%mm2, %%mm0 \n\t"
1009  "pxor %%mm3, %%mm1 \n\t"
1010  "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
1011  "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
1012  "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
1013  "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
1014 
1015  "pxor %%mm6, %%mm2 \n\t"
1016  "pxor %%mm7, %%mm3 \n\t"
1017  "pand %%mm2, %%mm4 \n\t"
1018  "pand %%mm3, %%mm5 \n\t"
1019 
1020 #if TEMPLATE_PP_MMXEXT
1021  "pminsw %%mm0, %%mm4 \n\t"
1022  "pminsw %%mm1, %%mm5 \n\t"
1023 #else
1024  "movq %%mm4, %%mm2 \n\t"
1025  "psubusw %%mm0, %%mm2 \n\t"
1026  "psubw %%mm2, %%mm4 \n\t"
1027  "movq %%mm5, %%mm2 \n\t"
1028  "psubusw %%mm1, %%mm2 \n\t"
1029  "psubw %%mm2, %%mm5 \n\t"
1030 #endif
1031  "pxor %%mm6, %%mm4 \n\t"
1032  "pxor %%mm7, %%mm5 \n\t"
1033  "psubw %%mm6, %%mm4 \n\t"
1034  "psubw %%mm7, %%mm5 \n\t"
1035  "packsswb %%mm5, %%mm4 \n\t"
1036  "movq (%0), %%mm0 \n\t"
1037  "paddb %%mm4, %%mm0 \n\t"
1038  "movq %%mm0, (%0) \n\t"
1039  "movq (%0, %1), %%mm0 \n\t"
1040  "psubb %%mm4, %%mm0 \n\t"
1041  "movq %%mm0, (%0, %1) \n\t"
1042 
1043  : "+r" (src)
1044  : "r" ((x86_reg)stride), "m" (c->pQPb), "r"(tmp)
1045  : "%"REG_a
1046  );
1047 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1048  const int l1= stride;
1049  const int l2= stride + l1;
1050  const int l3= stride + l2;
1051  const int l4= stride + l3;
1052  const int l5= stride + l4;
1053  const int l6= stride + l5;
1054  const int l7= stride + l6;
1055  const int l8= stride + l7;
1056 // const int l9= stride + l8;
1057  int x;
1058  src+= stride*3;
1059  for(x=0; x<BLOCK_SIZE; x++){
1060  const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1061  if(FFABS(middleEnergy) < 8*c->QP){
1062  const int q=(src[l4] - src[l5])/2;
1063  const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1064  const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1065 
1066  int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
1067  d= FFMAX(d, 0);
1068 
1069  d= (5*d + 32) >> 6;
1070  d*= FFSIGN(-middleEnergy);
1071 
1072  if(q>0){
1073  d= d<0 ? 0 : d;
1074  d= d>q ? q : d;
1075  }else{
1076  d= d>0 ? 0 : d;
1077  d= d<q ? q : d;
1078  }
1079 
1080  src[l4]-= d;
1081  src[l5]+= d;
1082  }
1083  src++;
1084  }
1085 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1086 }
1087 #endif //TEMPLATE_PP_ALTIVEC
1088 
1089 #if !TEMPLATE_PP_ALTIVEC
/**
 * Dering filter for one block.
 *
 * As the C fallback below shows, the routine
 *  - scans rows 1..8, columns 0..7 (relative to the incoming src) for the
 *    smallest and largest sample and returns without touching the picture
 *    when max - min < deringThreshold;
 *  - builds one bit mask per row (rows 0..9, columns -1..8) of the samples
 *    that are above (min + max + 1)/2 and keeps only positions whose whole
 *    3x3 neighbourhood lies on the same side of that level;
 *  - replaces those samples, in place, by a (1 2 1 / 2 4 2 / 1 2 1)/16
 *    smoothed value, limiting the change to +-(c->QP/2 + 1).
 *
 * @param src    pointer into the picture; samples from column -1 to 8 of
 *               rows 0..9 are read, rows 1..8 may be written
 * @param stride distance between two rows, used as src + stride*y
 * @param c      context; QP is read by the C path, pQPb is read and pQPb2
 *               is written by the asm path
 */
1090 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
1091 {
1092 #if HAVE_7REGS && (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW)
1093  DECLARE_ALIGNED(8, uint64_t, tmp)[3];
1094  __asm__ volatile(
 // mm6 = 0 and mm7 = all ones: start values for the running max / min below.
 // %2 (c->pQPb) is widened to words, halved, incremented by one (psubw of -1),
 // packed back to bytes and stored to %3 (c->pQPb2).
 // NOTE(review): %3 is declared as an input operand but is written here.
1095  "pxor %%mm6, %%mm6 \n\t"
1096  "pcmpeqb %%mm7, %%mm7 \n\t"
1097  "movq %2, %%mm0 \n\t"
1098  "punpcklbw %%mm6, %%mm0 \n\t"
1099  "psrlw $1, %%mm0 \n\t"
1100  "psubw %%mm7, %%mm0 \n\t"
1101  "packuswb %%mm0, %%mm0 \n\t"
1102  "movq %%mm0, %3 \n\t"
1103 
1104  "lea (%0, %1), %%"REG_a" \n\t"
1105  "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1106 
1107 // 0 1 2 3 4 5 6 7 8 9
1108 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1109 
 // FIND_MIN_MAX(addr): fold the 8 bytes at addr into the per-column
 // minimum (mm7) and maximum (mm6).
1110 #undef REAL_FIND_MIN_MAX
1111 #undef FIND_MIN_MAX
1112 #if TEMPLATE_PP_MMXEXT
1113 #define REAL_FIND_MIN_MAX(addr)\
1114  "movq " #addr ", %%mm0 \n\t"\
1115  "pminub %%mm0, %%mm7 \n\t"\
1116  "pmaxub %%mm0, %%mm6 \n\t"
1117 #else
1118 #define REAL_FIND_MIN_MAX(addr)\
1119  "movq " #addr ", %%mm0 \n\t"\
1120  "movq %%mm7, %%mm1 \n\t"\
1121  "psubusb %%mm0, %%mm6 \n\t"\
1122  "paddb %%mm0, %%mm6 \n\t"\
1123  "psubusb %%mm0, %%mm1 \n\t"\
1124  "psubb %%mm1, %%mm7 \n\t"
1125 #endif
1126 #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr)
1127 
1128 FIND_MIN_MAX((%%REGa))
1129 FIND_MIN_MAX((%%REGa, %1))
1130 FIND_MIN_MAX((%%REGa, %1, 2))
1131 FIND_MIN_MAX((%0, %1, 4))
1132 FIND_MIN_MAX((%%REGd))
1133 FIND_MIN_MAX((%%REGd, %1))
1134 FIND_MIN_MAX((%%REGd, %1, 2))
1135 FIND_MIN_MAX((%0, %1, 8))
1136 
 // Horizontal reduction: combine the 8 column minima so that the low byte
 // of mm7 holds the overall minimum.
1137  "movq %%mm7, %%mm4 \n\t"
1138  "psrlq $8, %%mm7 \n\t"
1139 #if TEMPLATE_PP_MMXEXT
1140  "pminub %%mm4, %%mm7 \n\t" // min of pixels
1141  "pshufw $0xF9, %%mm7, %%mm4 \n\t"
1142  "pminub %%mm4, %%mm7 \n\t" // min of pixels
1143  "pshufw $0xFE, %%mm7, %%mm4 \n\t"
1144  "pminub %%mm4, %%mm7 \n\t"
1145 #else
1146  "movq %%mm7, %%mm1 \n\t"
1147  "psubusb %%mm4, %%mm1 \n\t"
1148  "psubb %%mm1, %%mm7 \n\t"
1149  "movq %%mm7, %%mm4 \n\t"
1150  "psrlq $16, %%mm7 \n\t"
1151  "movq %%mm7, %%mm1 \n\t"
1152  "psubusb %%mm4, %%mm1 \n\t"
1153  "psubb %%mm1, %%mm7 \n\t"
1154  "movq %%mm7, %%mm4 \n\t"
1155  "psrlq $32, %%mm7 \n\t"
1156  "movq %%mm7, %%mm1 \n\t"
1157  "psubusb %%mm4, %%mm1 \n\t"
1158  "psubb %%mm1, %%mm7 \n\t"
1159 #endif
1160 
1161 
 // Same reduction for the maximum, leaving it in the low byte of mm6.
1162  "movq %%mm6, %%mm4 \n\t"
1163  "psrlq $8, %%mm6 \n\t"
1164 #if TEMPLATE_PP_MMXEXT
1165  "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
1166  "pshufw $0xF9, %%mm6, %%mm4 \n\t"
1167  "pmaxub %%mm4, %%mm6 \n\t"
1168  "pshufw $0xFE, %%mm6, %%mm4 \n\t"
1169  "pmaxub %%mm4, %%mm6 \n\t"
1170 #else
1171  "psubusb %%mm4, %%mm6 \n\t"
1172  "paddb %%mm4, %%mm6 \n\t"
1173  "movq %%mm6, %%mm4 \n\t"
1174  "psrlq $16, %%mm6 \n\t"
1175  "psubusb %%mm4, %%mm6 \n\t"
1176  "paddb %%mm4, %%mm6 \n\t"
1177  "movq %%mm6, %%mm4 \n\t"
1178  "psrlq $32, %%mm6 \n\t"
1179  "psubusb %%mm4, %%mm6 \n\t"
1180  "paddb %%mm4, %%mm6 \n\t"
1181 #endif
1182  "movq %%mm6, %%mm0 \n\t" // max
1183  "psubb %%mm7, %%mm6 \n\t" // max - min
 // The register holding tmp (%4) is borrowed for the threshold compare and
 // restored afterwards; the block is skipped (label 1) when the low byte of
 // max - min is below deringThreshold.
1184  "push %4 \n\t"
1185  "movd %%mm6, %k4 \n\t"
1186  "cmpb "MANGLE(deringThreshold)", %b4 \n\t"
1187  "pop %4 \n\t"
1188  " jb 1f \n\t"
1189  PAVGB(%%mm0, %%mm7) // a=(max + min)/2
 // Broadcast the low byte (a) to all eight bytes and keep it at (%4), i.e. tmp[0].
1190  "punpcklbw %%mm7, %%mm7 \n\t"
1191  "punpcklbw %%mm7, %%mm7 \n\t"
1192  "punpcklbw %%mm7, %%mm7 \n\t"
1193  "movq %%mm7, (%4) \n\t"
1194 
1195  "movq (%0), %%mm0 \n\t" // L10
1196  "movq %%mm0, %%mm1 \n\t" // L10
1197  "movq %%mm0, %%mm2 \n\t" // L10
1198  "psllq $8, %%mm1 \n\t"
1199  "psrlq $8, %%mm2 \n\t"
1200  "movd -4(%0), %%mm3 \n\t"
1201  "movd 8(%0), %%mm4 \n\t"
1202  "psrlq $24, %%mm3 \n\t"
1203  "psllq $56, %%mm4 \n\t"
1204  "por %%mm3, %%mm1 \n\t" // L00
1205  "por %%mm4, %%mm2 \n\t" // L20
1206  "movq %%mm1, %%mm3 \n\t" // L00
1207  PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
1208  PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
1209  "psubusb %%mm7, %%mm0 \n\t"
1210  "psubusb %%mm7, %%mm2 \n\t"
1211  "psubusb %%mm7, %%mm3 \n\t"
1212  "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1
1213  "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1
1214  "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1
1215  "paddb %%mm2, %%mm0 \n\t"
1216  "paddb %%mm3, %%mm0 \n\t"
1217 
1218  "movq (%%"REG_a"), %%mm2 \n\t" // L11
1219  "movq %%mm2, %%mm3 \n\t" // L11
1220  "movq %%mm2, %%mm4 \n\t" // L11
1221  "psllq $8, %%mm3 \n\t"
1222  "psrlq $8, %%mm4 \n\t"
1223  "movd -4(%%"REG_a"), %%mm5 \n\t"
1224  "movd 8(%%"REG_a"), %%mm6 \n\t"
1225  "psrlq $24, %%mm5 \n\t"
1226  "psllq $56, %%mm6 \n\t"
1227  "por %%mm5, %%mm3 \n\t" // L01
1228  "por %%mm6, %%mm4 \n\t" // L21
1229  "movq %%mm3, %%mm5 \n\t" // L01
1230  PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
1231  PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
1232  "psubusb %%mm7, %%mm2 \n\t"
1233  "psubusb %%mm7, %%mm4 \n\t"
1234  "psubusb %%mm7, %%mm5 \n\t"
1235  "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1
1236  "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1
1237  "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1
1238  "paddb %%mm4, %%mm2 \n\t"
1239  "paddb %%mm5, %%mm2 \n\t"
 // DERING_CORE handles one row: dst is the row that gets written, src the
 // row below it. (%4) holds the broadcast average, 8(%4) is a spill slot
 // for the horizontally smoothed src row, %3 is the change limit.
1240 // 0, 2, 3, 1
1241 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1242  "movq " #src ", " #sx " \n\t" /* src[0] */\
1243  "movq " #sx ", " #lx " \n\t" /* src[0] */\
1244  "movq " #sx ", " #t0 " \n\t" /* src[0] */\
1245  "psllq $8, " #lx " \n\t"\
1246  "psrlq $8, " #t0 " \n\t"\
1247  "movd -4" #src ", " #t1 " \n\t"\
1248  "psrlq $24, " #t1 " \n\t"\
1249  "por " #t1 ", " #lx " \n\t" /* src[-1] */\
1250  "movd 8" #src ", " #t1 " \n\t"\
1251  "psllq $56, " #t1 " \n\t"\
1252  "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
1253  "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
1254  PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
1255  PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
1256  PAVGB(lx, pplx) \
1257  "movq " #lx ", 8(%4) \n\t"\
1258  "movq (%4), " #lx " \n\t"\
1259  "psubusb " #lx ", " #t1 " \n\t"\
1260  "psubusb " #lx ", " #t0 " \n\t"\
1261  "psubusb " #lx ", " #sx " \n\t"\
1262  "movq "MANGLE(b00)", " #lx " \n\t"\
1263  "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
1264  "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
1265  "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
1266  "paddb " #t1 ", " #t0 " \n\t"\
1267  "paddb " #t0 ", " #sx " \n\t"\
1268 \
1269  PAVGB(plx, pplx) /* filtered */\
1270  "movq " #dst ", " #t0 " \n\t" /* dst */\
1271  "movq " #t0 ", " #t1 " \n\t" /* dst */\
1272  "psubusb %3, " #t0 " \n\t"\
1273  "paddusb %3, " #t1 " \n\t"\
1274  PMAXUB(t0, pplx)\
1275  PMINUB(t1, pplx, t0)\
1276  "paddb " #sx ", " #ppsx " \n\t"\
1277  "paddb " #psx ", " #ppsx " \n\t"\
1278  "#paddb "MANGLE(b02)", " #ppsx " \n\t"\
1279  "pand "MANGLE(b08)", " #ppsx " \n\t"\
1280  "pcmpeqb " #lx ", " #ppsx " \n\t"\
1281  "pand " #ppsx ", " #pplx " \n\t"\
1282  "pandn " #dst ", " #ppsx " \n\t"\
1283  "por " #pplx ", " #ppsx " \n\t"\
1284  "movq " #ppsx ", " #dst " \n\t"\
1285  "movq 8(%4), " #lx " \n\t"
1286 
1287 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1288  REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
1289 /*
1290 0000000
1291 1111111
1292 
1293 1111110
1294 1111101
1295 1111100
1296 1111011
1297 1111010
1298 1111001
1299 
1300 1111000
1301 1110111
1302 
1303 */
1304 //DERING_CORE(dst ,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
1305 DERING_CORE((%%REGa) ,(%%REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1306 DERING_CORE((%%REGa, %1) ,(%%REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1307 DERING_CORE((%%REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1308 DERING_CORE((%0, %1, 4) ,(%%REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1309 DERING_CORE((%%REGd) ,(%%REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1310 DERING_CORE((%%REGd, %1) ,(%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1311 DERING_CORE((%%REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1312 DERING_CORE((%0, %1, 8) ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1313 
1314  "1: \n\t"
1315  : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2), "q"(tmp)
1316  : "%"REG_a, "%"REG_d
1317  );
1318 #else // HAVE_7REGS && (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW)
1319  int y;
1320  int min=255;
1321  int max=0;
1322  int avg;
1323  uint8_t *p;
1324  int s[10];
 /* largest change the filter may apply to a sample */
1325  const int QP2= c->QP/2 + 1;
1326 
 /* Step one column to the left so that index 0 is the left neighbour column;
  * the p++ at the top of each inner loop compensates for it. */
1327  src --;
 /* Pass 1: min / max over the 8x8 area that may be modified. */
1328  for(y=1; y<9; y++){
1329  int x;
1330  p= src + stride*y;
1331  for(x=1; x<9; x++){
1332  p++;
1333  if(*p > max) max= *p;
1334  if(*p < min) min= *p;
1335  }
1336  }
1337  avg= (min + max + 1)>>1;
1338 
 /* Too little contrast in the block: leave it untouched. */
1339  if(max - min <deringThreshold) return;
1340 
 /* Pass 2: per row, bit k of t is set when sample k is above avg. */
1341  for(y=0; y<10; y++){
1342  int t = 0;
1343 
1344  if(src[stride*y + 0] > avg) t+= 1;
1345  if(src[stride*y + 1] > avg) t+= 2;
1346  if(src[stride*y + 2] > avg) t+= 4;
1347  if(src[stride*y + 3] > avg) t+= 8;
1348  if(src[stride*y + 4] > avg) t+= 16;
1349  if(src[stride*y + 5] > avg) t+= 32;
1350  if(src[stride*y + 6] > avg) t+= 64;
1351  if(src[stride*y + 7] > avg) t+= 128;
1352  if(src[stride*y + 8] > avg) t+= 256;
1353  if(src[stride*y + 9] > avg) t+= 512;
1354 
 /* Upper half gets the inverted mask ("not above avg"); the second line
  * keeps a bit only when both horizontal neighbours carry the same bit.
  * NOTE(review): ~t is negative here and left-shifting a negative int is
  * undefined in ISO C; this relies on the usual two's complement result. */
1355  t |= (~t)<<16;
1356  t &= (t<<1) & (t>>1);
1357  s[y] = t;
1358  }
1359 
 /* Combine three consecutive rows: a bit survives only if the whole 3x3
  * neighbourhood is on the same side of avg; both halves are then merged. */
1360  for(y=1; y<9; y++){
1361  int t = s[y-1] & s[y] & s[y+1];
1362  t|= t>>16;
1363  s[y-1]= t;
1364  }
1365 
 /* Pass 3: smooth the selected samples in place. */
1366  for(y=1; y<9; y++){
1367  int x;
1368  int t = s[y-1];
1369 
1370  p= src + stride*y;
1371  for(x=1; x<9; x++){
1372  p++;
1373  if(t & (1<<x)){
1374  int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1375  +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
1376  +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1377  f= (f + 8)>>4;
1378 
1379 #ifdef DEBUG_DERING_THRESHOLD
1380  __asm__ volatile("emms\n\t":);
1381  {
1382  static long long numPixels=0;
1383  if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1384 // if((max-min)<20 || (max-min)*QP<200)
1385 // if((max-min)*QP < 500)
1386 // if(max-min<QP/2)
1387  if(max-min < 20){
1388  static int numSkipped=0;
1389  static int errorSum=0;
1390  static int worstQP=0;
1391  static int worstRange=0;
1392  static int worstDiff=0;
1393  int diff= (f - *p);
1394  int absDiff= FFABS(diff);
1395  int error= diff*diff;
1396 
1397  if(x==1 || x==8 || y==1 || y==8) continue;
1398 
1399  numSkipped++;
1400  if(absDiff > worstDiff){
1401  worstDiff= absDiff;
 /* the quantiser is only reachable through the context in this function */
1402  worstQP= c->QP;
1403  worstRange= max-min;
1404  }
1405  errorSum+= error;
1406 
1407  if(1024LL*1024LL*1024LL % numSkipped == 0){
1408  av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
1409  "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1410  (float)errorSum/numSkipped, numSkipped, worstQP, worstRange,
1411  worstDiff, (float)numSkipped/numPixels);
1412  }
1413  }
1414  }
1415 #endif
 /* Move towards the smoothed value f, but by at most QP2. */
1416  if (*p + QP2 < f) *p= *p + QP2;
1417  else if(*p - QP2 > f) *p= *p - QP2;
1418  else *p=f;
1419  }
1420  }
1421  }
1422 #ifdef DEBUG_DERING_THRESHOLD
 /* Debug aid: brighten low-contrast blocks by 20 (saturating at 255). */
1423  if(max-min < 20){
1424  for(y=1; y<9; y++){
1425  int x;
1426  int t = 0;
1427  p= src + stride*y;
1428  for(x=1; x<9; x++){
1429  p++;
1430  *p = FFMIN(*p + 20, 255);
1431  }
1432  }
1433 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1434  }
1435 #endif
1436 #endif // HAVE_7REGS && (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW)
1437 }
1438 #endif //TEMPLATE_PP_ALTIVEC
1439 
1440 /**
1441  * Deinterlace the given block by linearly interpolating every second line.
1442  * will be called for every 8x8 block and can read & write from line 4-15
1443  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1444  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 *
 * After the internal src += 4*stride both paths below overwrite the lines at
 * offsets 1, 3, 5 and 7 with the average of the lines directly above and
 * below them (offsets 0..8 are read).
1445  */
/* NOTE(review): the function signature line appears to be missing from this
 * listing between the comment above and the opening brace; the body uses the
 * names `src` and `stride`. Restore it from the upstream source. */
1447 {
1448 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1449  src+= 4*stride;
1450  __asm__ volatile(
1451  "lea (%0, %1), %%"REG_a" \n\t"
1452  "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
1453 // 0 1 2 3 4 5 6 7 8 9
1454 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
1455 
 // line 1 = avg(line 0, line 2)
1456  "movq (%0), %%mm0 \n\t"
1457  "movq (%%"REG_a", %1), %%mm1 \n\t"
1458  PAVGB(%%mm1, %%mm0)
1459  "movq %%mm0, (%%"REG_a") \n\t"
 // line 3 = avg(line 2, line 4)
1460  "movq (%0, %1, 4), %%mm0 \n\t"
1461  PAVGB(%%mm0, %%mm1)
1462  "movq %%mm1, (%%"REG_a", %1, 2) \n\t"
 // line 5 = avg(line 4, line 6)
1463  "movq (%%"REG_c", %1), %%mm1 \n\t"
1464  PAVGB(%%mm1, %%mm0)
1465  "movq %%mm0, (%%"REG_c") \n\t"
 // line 7 = avg(line 6, line 8)
1466  "movq (%0, %1, 8), %%mm0 \n\t"
1467  PAVGB(%%mm0, %%mm1)
1468  "movq %%mm1, (%%"REG_c", %1, 2) \n\t"
1469 
1470  : : "r" (src), "r" ((x86_reg)stride)
1471  : "%"REG_a, "%"REG_c
1472  );
1473 #else
1474  int a, b, x;
1475  src+= 4*stride;
1476 
 /* Two iterations of four packed bytes cover the 8 columns.
  * (a|b) - (((a^b)&0xFEFEFEFE)>>1) is the per-byte average of a and b,
  * rounded up; the mask keeps the shift from leaking bits between bytes.
  * NOTE(review): the uint32_t accesses through uint8_t pointers assume the
  * target tolerates unaligned, type-punned loads and stores. */
1477  for(x=0; x<2; x++){
1478  a= *(uint32_t*)&src[stride*0];
1479  b= *(uint32_t*)&src[stride*2];
1480  *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1481  a= *(uint32_t*)&src[stride*4];
1482  *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1483  b= *(uint32_t*)&src[stride*6];
1484  *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1485  a= *(uint32_t*)&src[stride*8];
1486  *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1487  src += 4;
1488  }
1489 #endif
1490 }
1491 
1492 /**
1493  * Deinterlace the given block by cubic interpolating every second line.
1494  * will be called for every 8x8 block and can read & write from line 4-15
1495  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1496  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1497  * this filter will read lines 3-15 and write 7-13
 *
 * NOTE(review): after the internal src += 3*stride the code below writes the
 * lines at offsets 3, 5, 7 and 9, i.e. lines 6, 8, 10 and 12 of the block;
 * the "write 7-13" above looks off by one - confirm.
1498  */
/* NOTE(review): the function signature line appears to be missing from this
 * listing between the comment above and the opening brace; the body uses the
 * names `src` and `stride`. Restore it from the upstream source. */
1500 {
1501 #if TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1502  src+= stride*3;
1503  __asm__ volatile(
 // REG_a = line 1, REG_d = line 5, REG_c = line 10 (relative to the advanced src)
1504  "lea (%0, %1), %%"REG_a" \n\t"
1505  "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1506  "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t"
1507  "add %1, %%"REG_c" \n\t"
 // DEINT_CUBIC(a,b,c,d,e) stores approximately (9b + 9d - a - e)/16 to c:
 // both variants form avg(b,d) - (avg(a,e) - avg(b,d))/8 and saturate to bytes.
1508 #if TEMPLATE_PP_SSE2
1509  "pxor %%xmm7, %%xmm7 \n\t"
1510 #define REAL_DEINT_CUBIC(a,b,c,d,e)\
1511  "movq " #a ", %%xmm0 \n\t"\
1512  "movq " #b ", %%xmm1 \n\t"\
1513  "movq " #d ", %%xmm2 \n\t"\
1514  "movq " #e ", %%xmm3 \n\t"\
1515  "pavgb %%xmm2, %%xmm1 \n\t"\
1516  "pavgb %%xmm3, %%xmm0 \n\t"\
1517  "punpcklbw %%xmm7, %%xmm0 \n\t"\
1518  "punpcklbw %%xmm7, %%xmm1 \n\t"\
1519  "psubw %%xmm1, %%xmm0 \n\t"\
1520  "psraw $3, %%xmm0 \n\t"\
1521  "psubw %%xmm0, %%xmm1 \n\t"\
1522  "packuswb %%xmm1, %%xmm1 \n\t"\
1523  "movlps %%xmm1, " #c " \n\t"
1524 #else //TEMPLATE_PP_SSE2
1525  "pxor %%mm7, %%mm7 \n\t"
1526 // 0 1 2 3 4 5 6 7 8 9 10
1527 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1528 
1529 #define REAL_DEINT_CUBIC(a,b,c,d,e)\
1530  "movq " #a ", %%mm0 \n\t"\
1531  "movq " #b ", %%mm1 \n\t"\
1532  "movq " #d ", %%mm2 \n\t"\
1533  "movq " #e ", %%mm3 \n\t"\
1534  PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
1535  PAVGB(%%mm3, %%mm0) /* (a+e) /2 */\
1536  "movq %%mm0, %%mm2 \n\t"\
1537  "punpcklbw %%mm7, %%mm0 \n\t"\
1538  "punpckhbw %%mm7, %%mm2 \n\t"\
1539  "movq %%mm1, %%mm3 \n\t"\
1540  "punpcklbw %%mm7, %%mm1 \n\t"\
1541  "punpckhbw %%mm7, %%mm3 \n\t"\
1542  "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
1543  "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
1544  "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
1545  "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
1546  "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
1547  "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
1548  "packuswb %%mm3, %%mm1 \n\t"\
1549  "movq %%mm1, " #c " \n\t"
1550 #endif //TEMPLATE_PP_SSE2
1551 #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e)
1552 
 // Inputs are lines (0,2,4,6), (2,4,6,8), (4,6,8,10), (6,8,10,12);
 // outputs are lines 3, 5, 7 and 9.
1553 DEINT_CUBIC((%0) , (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd, %1))
1554 DEINT_CUBIC((%%REGa, %1), (%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%0, %1, 8))
1555 DEINT_CUBIC((%0, %1, 4) , (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGc))
1556 DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2))
1557 
1558  : : "r" (src), "r" ((x86_reg)stride)
1559  :
1560 #if TEMPLATE_PP_SSE2
1561  XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm7",)
1562 #endif
1563  "%"REG_a, "%"REG_d, "%"REG_c
1564  );
1565 #undef REAL_DEINT_CUBIC
1566 #else //TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1567  int x;
1568  src+= stride*3;
 /* One column per iteration: each odd line becomes the (-1 9 9 -1)/16
  * combination of the four even lines around it, clipped by CLIP(). */
1569  for(x=0; x<8; x++){
1570  src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1571  src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1572  src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1573  src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
1574  src++;
1575  }
1576 #endif //TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1577 }
1578 
1579 /**
1580  * Deinterlace the given block by filtering every second line with a (-1 4 2 4 -1) filter.
1581  * will be called for every 8x8 block and can read & write from line 4-15
1582  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1583  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1584  * this filter will read lines 4-13 and write 5-11
 *
 * @param src    top-left sample of the block; filtering starts at src + 4*stride
 * @param stride distance between two lines
 * @param tmp    8 bytes of state carried from one call to the next: on entry
 *               the "-1" tap two lines above the first filtered line, on exit
 *               the unfiltered content of the last filtered line (offset 7
 *               after the internal src += 4*stride)
1585  */
1586 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1587 {
1588 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1589  src+= stride*4;
1590  __asm__ volatile(
1591  "lea (%0, %1), %%"REG_a" \n\t"
1592  "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1593  "pxor %%mm7, %%mm7 \n\t"
1594  "movq (%2), %%mm0 \n\t"
1595 // 0 1 2 3 4 5 6 7 8 9 10
1596 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1597 
 // DEINT_FF(a,b,c,d): b = (4*avg(a,c) - avg(mm0,d) + b) >> 2, where mm0 is
 // the unfiltered b of the previous invocation (tmp for the first one);
 // mm0 is then reloaded with the unfiltered b of this invocation.
1598 #define REAL_DEINT_FF(a,b,c,d)\
1599  "movq " #a ", %%mm1 \n\t"\
1600  "movq " #b ", %%mm2 \n\t"\
1601  "movq " #c ", %%mm3 \n\t"\
1602  "movq " #d ", %%mm4 \n\t"\
1603  PAVGB(%%mm3, %%mm1) \
1604  PAVGB(%%mm4, %%mm0) \
1605  "movq %%mm0, %%mm3 \n\t"\
1606  "punpcklbw %%mm7, %%mm0 \n\t"\
1607  "punpckhbw %%mm7, %%mm3 \n\t"\
1608  "movq %%mm1, %%mm4 \n\t"\
1609  "punpcklbw %%mm7, %%mm1 \n\t"\
1610  "punpckhbw %%mm7, %%mm4 \n\t"\
1611  "psllw $2, %%mm1 \n\t"\
1612  "psllw $2, %%mm4 \n\t"\
1613  "psubw %%mm0, %%mm1 \n\t"\
1614  "psubw %%mm3, %%mm4 \n\t"\
1615  "movq %%mm2, %%mm5 \n\t"\
1616  "movq %%mm2, %%mm0 \n\t"\
1617  "punpcklbw %%mm7, %%mm2 \n\t"\
1618  "punpckhbw %%mm7, %%mm5 \n\t"\
1619  "paddw %%mm2, %%mm1 \n\t"\
1620  "paddw %%mm5, %%mm4 \n\t"\
1621  "psraw $2, %%mm1 \n\t"\
1622  "psraw $2, %%mm4 \n\t"\
1623  "packuswb %%mm4, %%mm1 \n\t"\
1624  "movq %%mm1, " #b " \n\t"\
1625 
1626 #define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d)
1627 
1628 DEINT_FF((%0) , (%%REGa) , (%%REGa, %1), (%%REGa, %1, 2))
1629 DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) )
1630 DEINT_FF((%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%%REGd, %1, 2))
1631 DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
1632 
1633  "movq %%mm0, (%2) \n\t"
1634  : : "r" (src), "r" ((x86_reg)stride), "r"(tmp)
1635  : "%"REG_a, "%"REG_d
1636  );
1637 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1638  int x;
1639  src+= stride*4;
 /* One column per iteration. t1 / t2 alternately hold the unfiltered value of
  * the odd line about to be overwritten: it is the centre tap (weight 2) of
  * its own output and the outer "-1" tap of the next output two lines below.
  * They must therefore be loaded from lines 3, 5 and 7 (the lines being
  * replaced), which is also what the MMX path above keeps in mm0. */
1640  for(x=0; x<8; x++){
1641  int t1= tmp[x];
1642  int t2= src[stride*1];
1643 
1644  src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
1645  t1= src[stride*3];
1646  src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
1647  t2= src[stride*5];
1648  src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
1649  t1= src[stride*7];
1650  src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
 /* keep the unfiltered line 7 as the "-1" tap for the next call */
1651  tmp[x]= t1;
1652 
1653  src++;
1654  }
1655 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1656 }
1657 
1658 /**
1659  * Deinterlace the given block by filtering every line with a (-1 2 6 2 -1) filter.
1660  * will be called for every 8x8 block and can read & write from line 4-15
1661  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1662  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1663  * this filter will read lines 4-13 and write 4-11
 *
 * @param src    top-left sample of the block; filtering starts at src + 4*stride
 * @param stride distance between two lines
 * @param tmp    8 bytes of state: on entry the outer (-1) tap two lines above
 *               the first filtered line, on exit the unfiltered line at
 *               offset 6 (after the internal src += 4*stride)
 * @param tmp2   8 bytes of state: on entry the inner (2) tap one line above
 *               the first filtered line, on exit the unfiltered line at offset 7
1664  */
1665 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
1666 {
1667 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1668  src+= stride*4;
1669  __asm__ volatile(
1670  "lea (%0, %1), %%"REG_a" \n\t"
1671  "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1672  "pxor %%mm7, %%mm7 \n\t"
1673  "movq (%2), %%mm0 \n\t"
1674  "movq (%3), %%mm1 \n\t"
1675 // 0 1 2 3 4 5 6 7 8 9 10
1676 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1677 
 // DEINT_L5(t1,t2,a,b,c): a = (3*a + 2*avg(t2,b) - avg(t1,c)) >> 2, with
 // t1 / t2 the unfiltered lines two / one above a; t1 is then replaced by
 // the unfiltered a, so the two state registers swap roles on every call.
1678 #define REAL_DEINT_L5(t1,t2,a,b,c)\
1679  "movq " #a ", %%mm2 \n\t"\
1680  "movq " #b ", %%mm3 \n\t"\
1681  "movq " #c ", %%mm4 \n\t"\
1682  PAVGB(t2, %%mm3) \
1683  PAVGB(t1, %%mm4) \
1684  "movq %%mm2, %%mm5 \n\t"\
1685  "movq %%mm2, " #t1 " \n\t"\
1686  "punpcklbw %%mm7, %%mm2 \n\t"\
1687  "punpckhbw %%mm7, %%mm5 \n\t"\
1688  "movq %%mm2, %%mm6 \n\t"\
1689  "paddw %%mm2, %%mm2 \n\t"\
1690  "paddw %%mm6, %%mm2 \n\t"\
1691  "movq %%mm5, %%mm6 \n\t"\
1692  "paddw %%mm5, %%mm5 \n\t"\
1693  "paddw %%mm6, %%mm5 \n\t"\
1694  "movq %%mm3, %%mm6 \n\t"\
1695  "punpcklbw %%mm7, %%mm3 \n\t"\
1696  "punpckhbw %%mm7, %%mm6 \n\t"\
1697  "paddw %%mm3, %%mm3 \n\t"\
1698  "paddw %%mm6, %%mm6 \n\t"\
1699  "paddw %%mm3, %%mm2 \n\t"\
1700  "paddw %%mm6, %%mm5 \n\t"\
1701  "movq %%mm4, %%mm6 \n\t"\
1702  "punpcklbw %%mm7, %%mm4 \n\t"\
1703  "punpckhbw %%mm7, %%mm6 \n\t"\
1704  "psubw %%mm4, %%mm2 \n\t"\
1705  "psubw %%mm6, %%mm5 \n\t"\
1706  "psraw $2, %%mm2 \n\t"\
1707  "psraw $2, %%mm5 \n\t"\
1708  "packuswb %%mm5, %%mm2 \n\t"\
1709  "movq %%mm2, " #a " \n\t"\
1710 
1711 #define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c)
1712 
1713 DEINT_L5(%%mm0, %%mm1, (%0) , (%%REGa) , (%%REGa, %1) )
1714 DEINT_L5(%%mm1, %%mm0, (%%REGa) , (%%REGa, %1) , (%%REGa, %1, 2))
1715 DEINT_L5(%%mm0, %%mm1, (%%REGa, %1) , (%%REGa, %1, 2), (%0, %1, 4) )
1716 DEINT_L5(%%mm1, %%mm0, (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) )
1717 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%REGd) , (%%REGd, %1) )
1718 DEINT_L5(%%mm1, %%mm0, (%%REGd) , (%%REGd, %1) , (%%REGd, %1, 2))
1719 DEINT_L5(%%mm0, %%mm1, (%%REGd, %1) , (%%REGd, %1, 2), (%0, %1, 8) )
1720 DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
1721 
1722  "movq %%mm0, (%2) \n\t"
1723  "movq %%mm1, (%3) \n\t"
1724  : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2)
1725  : "%"REG_a, "%"REG_d
1726  );
1727 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1728  int x;
1729  src+= stride*4;
 /* One column per iteration. t1, t2, t3 rotate through the unfiltered values
  * of the three most recent lines, because each line is overwritten in place
  * before the lines below it have consumed it as an upper tap. */
1730  for(x=0; x<8; x++){
1731  int t1= tmp[x];
1732  int t2= tmp2[x];
1733  int t3= src[0];
1734 
1735  src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
1736  t1= src[stride*1];
1737  src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
1738  t2= src[stride*2];
1739  src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
1740  t3= src[stride*3];
1741  src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
1742  t1= src[stride*4];
1743  src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
1744  t2= src[stride*5];
1745  src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
1746  t3= src[stride*6];
1747  src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
1748  t1= src[stride*7];
1749  src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
1750 
 /* unfiltered lines 6 and 7 become the upper taps of the next call */
1751  tmp[x]= t3;
1752  tmp2[x]= t1;
1753 
1754  src++;
1755  }
1756 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1757 }
1758 
1759 /**
1760  * Deinterlace the given block by filtering all lines with a (1 2 1) filter.
1761  * will be called for every 8x8 block and can read & write from line 4-15
1762  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1763  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1764  * this filter will read lines 4-13 and write 4-11
 *
 * @param src    top-left sample of the block; filtering starts at src + 4*stride
 * @param stride distance between two lines
 * @param tmp    8 bytes of state: on entry the line above the first filtered
 *               line, on exit the unfiltered line at offset 7 (after the
 *               internal src += 4*stride)
1765  */
1766 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
1767 {
1768 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1769  src+= 4*stride;
1770  __asm__ volatile(
1771  "lea (%0, %1), %%"REG_a" \n\t"
1772  "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1773 // 0 1 2 3 4 5 6 7 8 9
1774 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1775 
 // Each output line is avg(avg(line above, line below), line itself); the
 // three registers mm0..mm2 rotate so every line is loaded only once.
 // The Ln labels below are shifted by one relative to the offset table
 // above: L0 is the saved line from tmp, L1 is (%0), L2 is (REG_a), ...
1776  "movq (%2), %%mm0 \n\t" // L0
1777  "movq (%%"REG_a"), %%mm1 \n\t" // L2
1778  PAVGB(%%mm1, %%mm0) // L0+L2
1779  "movq (%0), %%mm2 \n\t" // L1
1780  PAVGB(%%mm2, %%mm0)
1781  "movq %%mm0, (%0) \n\t"
1782  "movq (%%"REG_a", %1), %%mm0 \n\t" // L3
1783  PAVGB(%%mm0, %%mm2) // L1+L3
1784  PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
1785  "movq %%mm2, (%%"REG_a") \n\t"
1786  "movq (%%"REG_a", %1, 2), %%mm2 \n\t" // L4
1787  PAVGB(%%mm2, %%mm1) // L2+L4
1788  PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
1789  "movq %%mm1, (%%"REG_a", %1) \n\t"
1790  "movq (%0, %1, 4), %%mm1 \n\t" // L5
1791  PAVGB(%%mm1, %%mm0) // L3+L5
1792  PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
1793  "movq %%mm0, (%%"REG_a", %1, 2) \n\t"
1794  "movq (%%"REG_d"), %%mm0 \n\t" // L6
1795  PAVGB(%%mm0, %%mm2) // L4+L6
1796  PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
1797  "movq %%mm2, (%0, %1, 4) \n\t"
1798  "movq (%%"REG_d", %1), %%mm2 \n\t" // L7
1799  PAVGB(%%mm2, %%mm1) // L5+L7
1800  PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
1801  "movq %%mm1, (%%"REG_d") \n\t"
1802  "movq (%%"REG_d", %1, 2), %%mm1 \n\t" // L8
1803  PAVGB(%%mm1, %%mm0) // L6+L8
1804  PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
1805  "movq %%mm0, (%%"REG_d", %1) \n\t"
1806  "movq (%0, %1, 8), %%mm0 \n\t" // L9
1807  PAVGB(%%mm0, %%mm2) // L7+L9
1808  PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
1809  "movq %%mm2, (%%"REG_d", %1, 2) \n\t"
 // save the unfiltered last line for the next call
1810  "movq %%mm1, (%2) \n\t"
1811 
1812  : : "r" (src), "r" ((x86_reg)stride), "r" (tmp)
1813  : "%"REG_a, "%"REG_d
1814  );
1815 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1816  int a, b, c, x;
1817  src+= 4*stride;
1818 
 /* Two iterations of four packed bytes cover the 8 columns.
  * (p&q) + (((p^q)&0xFEFEFEFE)>>1) is the per-byte average rounded down,
  * (p|q) - (((p^q)&0xFEFEFEFE)>>1) the per-byte average rounded up; each
  * output is avg_up(avg_down(line above, line below), line itself).
  * a, b, c rotate through the unfiltered lines so that a line is still
  * available after it has been overwritten.
  * NOTE(review): the uint32_t accesses through uint8_t pointers assume the
  * target tolerates unaligned, type-punned loads and stores. */
1819  for(x=0; x<2; x++){
1820  a= *(uint32_t*)&tmp[stride*0];
1821  b= *(uint32_t*)&src[stride*0];
1822  c= *(uint32_t*)&src[stride*1];
1823  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1824  *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1825 
1826  a= *(uint32_t*)&src[stride*2];
1827  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1828  *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1829 
1830  b= *(uint32_t*)&src[stride*3];
1831  c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1832  *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1833 
1834  c= *(uint32_t*)&src[stride*4];
1835  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1836  *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1837 
1838  a= *(uint32_t*)&src[stride*5];
1839  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1840  *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1841 
1842  b= *(uint32_t*)&src[stride*6];
1843  c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1844  *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1845 
1846  c= *(uint32_t*)&src[stride*7];
1847  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1848  *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1849 
1850  a= *(uint32_t*)&src[stride*8];
1851  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1852  *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1853 
 /* c still holds the unfiltered line 7: keep it for the next call */
1854  *(uint32_t*)&tmp[stride*0]= c;
1855  src += 4;
1856  tmp += 4;
1857  }
1858 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1859 }
1860 
1861 /**
1862  * Deinterlace the given block by applying a median filter to every second line.
1863  * will be called for every 8x8 block and can read & write from line 4-15,
1864  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1865  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1866  */
1867 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1868 {
1869 #if TEMPLATE_PP_MMX
1870  src+= 4*stride;
1871 #if TEMPLATE_PP_MMXEXT
1872  __asm__ volatile(
1873  "lea (%0, %1), %%"REG_a" \n\t"
1874  "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1875 // 0 1 2 3 4 5 6 7 8 9
1876 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1877 
1878  "movq (%0), %%mm0 \n\t" //
1879  "movq (%%"REG_a", %1), %%mm2 \n\t" //
1880  "movq (%%"REG_a"), %%mm1 \n\t" //
1881  "movq %%mm0, %%mm3 \n\t"
1882  "pmaxub %%mm1, %%mm0 \n\t" //
1883  "pminub %%mm3, %%mm1 \n\t" //
1884  "pmaxub %%mm2, %%mm1 \n\t" //
1885  "pminub %%mm1, %%mm0 \n\t"
1886  "movq %%mm0, (%%"REG_a") \n\t"
1887 
1888  "movq (%0, %1, 4), %%mm0 \n\t" //
1889  "movq (%%"REG_a", %1, 2), %%mm1 \n\t" //
1890  "movq %%mm2, %%mm3 \n\t"
1891  "pmaxub %%mm1, %%mm2 \n\t" //
1892  "pminub %%mm3, %%mm1 \n\t" //
1893  "pmaxub %%mm0, %%mm1 \n\t" //
1894  "pminub %%mm1, %%mm2 \n\t"
1895  "movq %%mm2, (%%"REG_a", %1, 2) \n\t"
1896 
1897  "movq (%%"REG_d"), %%mm2 \n\t" //
1898  "movq (%%"REG_d", %1), %%mm1 \n\t" //
1899  "movq %%mm2, %%mm3 \n\t"
1900  "pmaxub %%mm0, %%mm2 \n\t" //
1901  "pminub %%mm3, %%mm0 \n\t" //
1902  "pmaxub %%mm1, %%mm0 \n\t" //
1903  "pminub %%mm0, %%mm2 \n\t"
1904  "movq %%mm2, (%%"REG_d") \n\t"
1905 
1906  "movq (%%"REG_d", %1, 2), %%mm2 \n\t" //
1907  "movq (%0, %1, 8), %%mm0 \n\t" //
1908  "movq %%mm2, %%mm3 \n\t"
1909  "pmaxub %%mm0, %%mm2 \n\t" //
1910  "pminub %%mm3, %%mm0 \n\t" //
1911  "pmaxub %%mm1, %%mm0 \n\t" //
1912  "pminub %%mm0, %%mm2 \n\t"
1913  "movq %%mm2, (%%"REG_d", %1, 2) \n\t"
1914 
1915 
1916  : : "r" (src), "r" ((x86_reg)stride)
1917  : "%"REG_a, "%"REG_d
1918  );
1919 
1920 #else // MMX without MMX2
1921  __asm__ volatile(
1922  "lea (%0, %1), %%"REG_a" \n\t"
1923  "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1924 // 0 1 2 3 4 5 6 7 8 9
1925 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1926  "pxor %%mm7, %%mm7 \n\t"
1927 
1928 #define REAL_MEDIAN(a,b,c)\
1929  "movq " #a ", %%mm0 \n\t"\
1930  "movq " #b ", %%mm2 \n\t"\
1931  "movq " #c ", %%mm1 \n\t"\
1932  "movq %%mm0, %%mm3 \n\t"\
1933  "movq %%mm1, %%mm4 \n\t"\
1934  "movq %%mm2, %%mm5 \n\t"\
1935  "psubusb %%mm1, %%mm3 \n\t"\
1936  "psubusb %%mm2, %%mm4 \n\t"\
1937  "psubusb %%mm0, %%mm5 \n\t"\
1938  "pcmpeqb %%mm7, %%mm3 \n\t"\
1939  "pcmpeqb %%mm7, %%mm4 \n\t"\
1940  "pcmpeqb %%mm7, %%mm5 \n\t"\
1941  "movq %%mm3, %%mm6 \n\t"\
1942  "pxor %%mm4, %%mm3 \n\t"\
1943  "pxor %%mm5, %%mm4 \n\t"\
1944  "pxor %%mm6, %%mm5 \n\t"\
1945  "por %%mm3, %%mm1 \n\t"\
1946  "por %%mm4, %%mm2 \n\t"\
1947  "por %%mm5, %%mm0 \n\t"\
1948  "pand %%mm2, %%mm0 \n\t"\
1949  "pand %%mm1, %%mm0 \n\t"\
1950  "movq %%mm0, " #b " \n\t"
1951 #define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c)
1952 
1953 MEDIAN((%0) , (%%REGa) , (%%REGa, %1))
1954 MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4))
1955 MEDIAN((%0, %1, 4) , (%%REGd) , (%%REGd, %1))
1956 MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8))
1957 
1958  : : "r" (src), "r" ((x86_reg)stride)
1959  : "%"REG_a, "%"REG_d
1960  );
1961 #endif //TEMPLATE_PP_MMXEXT
1962 #else //TEMPLATE_PP_MMX
1963  int x, y;
1964  src+= 4*stride;
1965  // FIXME - there should be a way to do a few columns in parallel like w/mmx
1966  for(x=0; x<8; x++){
1967  uint8_t *colsrc = src;
1968  for (y=0; y<4; y++){
1969  int a, b, c, d, e, f;
1970  a = colsrc[0 ];
1971  b = colsrc[stride ];
1972  c = colsrc[stride*2];
1973  d = (a-b)>>31;
1974  e = (b-c)>>31;
1975  f = (c-a)>>31;
1976  colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
1977  colsrc += stride*2;
1978  }
1979  src++;
1980  }
1981 #endif //TEMPLATE_PP_MMX
1982 }
1983 
1984 #if TEMPLATE_PP_MMX
1985 /**
1986  * Transpose and shift the given 8x8 Block into dst1 and dst2.
1987  */
1988 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
1989 {
1990  __asm__(
1991  "lea (%0, %1), %%"REG_a" \n\t"
1992 // 0 1 2 3 4 5 6 7 8 9
1993 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1994  "movq (%0), %%mm0 \n\t" // 12345678
1995  "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh
1996  "movq %%mm0, %%mm2 \n\t" // 12345678
1997  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1998  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1999 
2000  "movq (%%"REG_a", %1), %%mm1 \n\t"
2001  "movq (%%"REG_a", %1, 2), %%mm3 \n\t"
2002  "movq %%mm1, %%mm4 \n\t"
2003  "punpcklbw %%mm3, %%mm1 \n\t"
2004  "punpckhbw %%mm3, %%mm4 \n\t"
2005 
2006  "movq %%mm0, %%mm3 \n\t"
2007  "punpcklwd %%mm1, %%mm0 \n\t"
2008  "punpckhwd %%mm1, %%mm3 \n\t"
2009  "movq %%mm2, %%mm1 \n\t"
2010  "punpcklwd %%mm4, %%mm2 \n\t"
2011  "punpckhwd %%mm4, %%mm1 \n\t"
2012 
2013  "movd %%mm0, 128(%2) \n\t"
2014  "psrlq $32, %%mm0 \n\t"
2015  "movd %%mm0, 144(%2) \n\t"
2016  "movd %%mm3, 160(%2) \n\t"
2017  "psrlq $32, %%mm3 \n\t"
2018  "movd %%mm3, 176(%2) \n\t"
2019  "movd %%mm3, 48(%3) \n\t"
2020  "movd %%mm2, 192(%2) \n\t"
2021  "movd %%mm2, 64(%3) \n\t"
2022  "psrlq $32, %%mm2 \n\t"
2023  "movd %%mm2, 80(%3) \n\t"
2024  "movd %%mm1, 96(%3) \n\t"
2025  "psrlq $32, %%mm1 \n\t"
2026  "movd %%mm1, 112(%3) \n\t"
2027 
2028  "lea (%%"REG_a", %1, 4), %%"REG_a" \n\t"
2029 
2030  "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
2031  "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh
2032  "movq %%mm0, %%mm2 \n\t" // 12345678
2033  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2034  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2035 
2036  "movq (%%"REG_a", %1), %%mm1 \n\t"
2037  "movq (%%"REG_a", %1, 2), %%mm3 \n\t"
2038  "movq %%mm1, %%mm4 \n\t"
2039  "punpcklbw %%mm3, %%mm1 \n\t"
2040  "punpckhbw %%mm3, %%mm4 \n\t"
2041 
2042  "movq %%mm0, %%mm3 \n\t"
2043  "punpcklwd %%mm1, %%mm0 \n\t"
2044  "punpckhwd %%mm1, %%mm3 \n\t"
2045  "movq %%mm2, %%mm1 \n\t"
2046  "punpcklwd %%mm4, %%mm2 \n\t"
2047  "punpckhwd %%mm4, %%mm1 \n\t"
2048 
2049  "movd %%mm0, 132(%2) \n\t"
2050  "psrlq $32, %%mm0 \n\t"
2051  "movd %%mm0, 148(%2) \n\t"
2052  "movd %%mm3, 164(%2) \n\t"
2053  "psrlq $32, %%mm3 \n\t"
2054  "movd %%mm3, 180(%2) \n\t"
2055  "movd %%mm3, 52(%3) \n\t"
2056  "movd %%mm2, 196(%2) \n\t"
2057  "movd %%mm2, 68(%3) \n\t"
2058  "psrlq $32, %%mm2 \n\t"
2059  "movd %%mm2, 84(%3) \n\t"
2060  "movd %%mm1, 100(%3) \n\t"
2061  "psrlq $32, %%mm1 \n\t"
2062  "movd %%mm1, 116(%3) \n\t"
2063 
2064 
2065  :: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2)
2066  : "%"REG_a
2067  );
2068 }
2069 
2070 /**
2071  * Transpose the given 8x8 block.
2072  */
2073 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
2074 {
2075  __asm__(
2076  "lea (%0, %1), %%"REG_a" \n\t"
2077  "lea (%%"REG_a",%1,4), %%"REG_d" \n\t"
2078 // 0 1 2 3 4 5 6 7 8 9
2079 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2080  "movq (%2), %%mm0 \n\t" // 12345678
2081  "movq 16(%2), %%mm1 \n\t" // abcdefgh
2082  "movq %%mm0, %%mm2 \n\t" // 12345678
2083  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2084  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2085 
2086  "movq 32(%2), %%mm1 \n\t"
2087  "movq 48(%2), %%mm3 \n\t"
2088  "movq %%mm1, %%mm4 \n\t"
2089  "punpcklbw %%mm3, %%mm1 \n\t"
2090  "punpckhbw %%mm3, %%mm4 \n\t"
2091 
2092  "movq %%mm0, %%mm3 \n\t"
2093  "punpcklwd %%mm1, %%mm0 \n\t"
2094  "punpckhwd %%mm1, %%mm3 \n\t"
2095  "movq %%mm2, %%mm1 \n\t"
2096  "punpcklwd %%mm4, %%mm2 \n\t"
2097  "punpckhwd %%mm4, %%mm1 \n\t"
2098 
2099  "movd %%mm0, (%0) \n\t"
2100  "psrlq $32, %%mm0 \n\t"
2101  "movd %%mm0, (%%"REG_a") \n\t"
2102  "movd %%mm3, (%%"REG_a", %1) \n\t"
2103  "psrlq $32, %%mm3 \n\t"
2104  "movd %%mm3, (%%"REG_a", %1, 2) \n\t"
2105  "movd %%mm2, (%0, %1, 4) \n\t"
2106  "psrlq $32, %%mm2 \n\t"
2107  "movd %%mm2, (%%"REG_d") \n\t"
2108  "movd %%mm1, (%%"REG_d", %1) \n\t"
2109  "psrlq $32, %%mm1 \n\t"
2110  "movd %%mm1, (%%"REG_d", %1, 2) \n\t"
2111 
2112 
2113  "movq 64(%2), %%mm0 \n\t" // 12345678
2114  "movq 80(%2), %%mm1 \n\t" // abcdefgh
2115  "movq %%mm0, %%mm2 \n\t" // 12345678
2116  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2117  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2118 
2119  "movq 96(%2), %%mm1 \n\t"
2120  "movq 112(%2), %%mm3 \n\t"
2121  "movq %%mm1, %%mm4 \n\t"
2122  "punpcklbw %%mm3, %%mm1 \n\t"
2123  "punpckhbw %%mm3, %%mm4 \n\t"
2124 
2125  "movq %%mm0, %%mm3 \n\t"
2126  "punpcklwd %%mm1, %%mm0 \n\t"
2127  "punpckhwd %%mm1, %%mm3 \n\t"
2128  "movq %%mm2, %%mm1 \n\t"
2129  "punpcklwd %%mm4, %%mm2 \n\t"
2130  "punpckhwd %%mm4, %%mm1 \n\t"
2131 
2132  "movd %%mm0, 4(%0) \n\t"
2133  "psrlq $32, %%mm0 \n\t"
2134  "movd %%mm0, 4(%%"REG_a") \n\t"
2135  "movd %%mm3, 4(%%"REG_a", %1) \n\t"
2136  "psrlq $32, %%mm3 \n\t"
2137  "movd %%mm3, 4(%%"REG_a", %1, 2) \n\t"
2138  "movd %%mm2, 4(%0, %1, 4) \n\t"
2139  "psrlq $32, %%mm2 \n\t"
2140  "movd %%mm2, 4(%%"REG_d") \n\t"
2141  "movd %%mm1, 4(%%"REG_d", %1) \n\t"
2142  "psrlq $32, %%mm1 \n\t"
2143  "movd %%mm1, 4(%%"REG_d", %1, 2) \n\t"
2144 
2145  :: "r" (dst), "r" ((x86_reg)dstStride), "r" (src)
2146  : "%"REG_a, "%"REG_d
2147  );
2148 }
2149 #endif //TEMPLATE_PP_MMX
2150 //static long test=0;
2151 
2152 #if !TEMPLATE_PP_ALTIVEC
2153 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2154  uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise)
2155 {
2156  // to save a register (FIXME do this outside of the loops)
2157  tempBlurredPast[127]= maxNoise[0];
2158  tempBlurredPast[128]= maxNoise[1];
2159  tempBlurredPast[129]= maxNoise[2];
2160 
2161 #define FAST_L2_DIFF
2162 //#define L1_DIFF //u should change the thresholds too if u try that one
2163 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
2164  __asm__ volatile(
2165  "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride
2166  "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride
2167  "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2168 // 0 1 2 3 4 5 6 7 8 9
2169 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2
2170 //FIXME reorder?
2171 #ifdef L1_DIFF //needs mmx2
2172  "movq (%0), %%mm0 \n\t" // L0
2173  "psadbw (%1), %%mm0 \n\t" // |L0-R0|
2174  "movq (%0, %2), %%mm1 \n\t" // L1
2175  "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
2176  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2177  "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
2178  "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2179  "psadbw (%1, %%"REG_a"), %%mm3 \n\t" // |L3-R3|
2180 
2181  "movq (%0, %2, 4), %%mm4 \n\t" // L4
2182  "paddw %%mm1, %%mm0 \n\t"
2183  "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
2184  "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
2185  "paddw %%mm2, %%mm0 \n\t"
2186  "psadbw (%1, %%"REG_d"), %%mm5 \n\t" // |L5-R5|
2187  "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
2188  "paddw %%mm3, %%mm0 \n\t"
2189  "psadbw (%1, %%"REG_a", 2), %%mm6 \n\t" // |L6-R6|
2190  "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
2191  "paddw %%mm4, %%mm0 \n\t"
2192  "psadbw (%1, %%"REG_c"), %%mm7 \n\t" // |L7-R7|
2193  "paddw %%mm5, %%mm6 \n\t"
2194  "paddw %%mm7, %%mm6 \n\t"
2195  "paddw %%mm6, %%mm0 \n\t"
2196 #else //L1_DIFF
2197 #if defined (FAST_L2_DIFF)
2198  "pcmpeqb %%mm7, %%mm7 \n\t"
2199  "movq "MANGLE(b80)", %%mm6 \n\t"
2200  "pxor %%mm0, %%mm0 \n\t"
2201 #define REAL_L2_DIFF_CORE(a, b)\
2202  "movq " #a ", %%mm5 \n\t"\
2203  "movq " #b ", %%mm2 \n\t"\
2204  "pxor %%mm7, %%mm2 \n\t"\
2205  PAVGB(%%mm2, %%mm5)\
2206  "paddb %%mm6, %%mm5 \n\t"\
2207  "movq %%mm5, %%mm2 \n\t"\
2208  "psllw $8, %%mm5 \n\t"\
2209  "pmaddwd %%mm5, %%mm5 \n\t"\
2210  "pmaddwd %%mm2, %%mm2 \n\t"\
2211  "paddd %%mm2, %%mm5 \n\t"\
2212  "psrld $14, %%mm5 \n\t"\
2213  "paddd %%mm5, %%mm0 \n\t"
2214 
2215 #else //defined (FAST_L2_DIFF)
2216  "pxor %%mm7, %%mm7 \n\t"
2217  "pxor %%mm0, %%mm0 \n\t"
2218 #define REAL_L2_DIFF_CORE(a, b)\
2219  "movq " #a ", %%mm5 \n\t"\
2220  "movq " #b ", %%mm2 \n\t"\
2221  "movq %%mm5, %%mm1 \n\t"\
2222  "movq %%mm2, %%mm3 \n\t"\
2223  "punpcklbw %%mm7, %%mm5 \n\t"\
2224  "punpckhbw %%mm7, %%mm1 \n\t"\
2225  "punpcklbw %%mm7, %%mm2 \n\t"\
2226  "punpckhbw %%mm7, %%mm3 \n\t"\
2227  "psubw %%mm2, %%mm5 \n\t"\
2228  "psubw %%mm3, %%mm1 \n\t"\
2229  "pmaddwd %%mm5, %%mm5 \n\t"\
2230  "pmaddwd %%mm1, %%mm1 \n\t"\
2231  "paddd %%mm1, %%mm5 \n\t"\
2232  "paddd %%mm5, %%mm0 \n\t"
2233 
2234 #endif //defined (FAST_L2_DIFF)
2235 
2236 #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b)
2237 
2238 L2_DIFF_CORE((%0) , (%1))
2239 L2_DIFF_CORE((%0, %2) , (%1, %2))
2240 L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2))
2241 L2_DIFF_CORE((%0, %%REGa) , (%1, %%REGa))
2242 L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4))
2243 L2_DIFF_CORE((%0, %%REGd) , (%1, %%REGd))
2244 L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2))
2245 L2_DIFF_CORE((%0, %%REGc) , (%1, %%REGc))
2246 
2247 #endif //L1_DIFF
2248 
2249  "movq %%mm0, %%mm4 \n\t"
2250  "psrlq $32, %%mm0 \n\t"
2251  "paddd %%mm0, %%mm4 \n\t"
2252  "movd %%mm4, %%ecx \n\t"
2253  "shll $2, %%ecx \n\t"
2254  "mov %3, %%"REG_d" \n\t"
2255  "addl -4(%%"REG_d"), %%ecx \n\t"
2256  "addl 4(%%"REG_d"), %%ecx \n\t"
2257  "addl -1024(%%"REG_d"), %%ecx \n\t"
2258  "addl $4, %%ecx \n\t"
2259  "addl 1024(%%"REG_d"), %%ecx \n\t"
2260  "shrl $3, %%ecx \n\t"
2261  "movl %%ecx, (%%"REG_d") \n\t"
2262 
2263 // "mov %3, %%"REG_c" \n\t"
2264 // "mov %%"REG_c", test \n\t"
2265 // "jmp 4f \n\t"
2266  "cmpl 512(%%"REG_d"), %%ecx \n\t"
2267  " jb 2f \n\t"
2268  "cmpl 516(%%"REG_d"), %%ecx \n\t"
2269  " jb 1f \n\t"
2270 
2271  "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
2272  "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2273  "movq (%0), %%mm0 \n\t" // L0
2274  "movq (%0, %2), %%mm1 \n\t" // L1
2275  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2276  "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2277  "movq (%0, %2, 4), %%mm4 \n\t" // L4
2278  "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
2279  "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
2280  "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
2281  "movq %%mm0, (%1) \n\t" // L0
2282  "movq %%mm1, (%1, %2) \n\t" // L1
2283  "movq %%mm2, (%1, %2, 2) \n\t" // L2
2284  "movq %%mm3, (%1, %%"REG_a") \n\t" // L3
2285  "movq %%mm4, (%1, %2, 4) \n\t" // L4
2286  "movq %%mm5, (%1, %%"REG_d") \n\t" // L5
2287  "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // L6
2288  "movq %%mm7, (%1, %%"REG_c") \n\t" // L7
2289  "jmp 4f \n\t"
2290 
2291  "1: \n\t"
2292  "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
2293  "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2294  "movq (%0), %%mm0 \n\t" // L0
2295  PAVGB((%1), %%mm0) // L0
2296  "movq (%0, %2), %%mm1 \n\t" // L1
2297  PAVGB((%1, %2), %%mm1) // L1
2298  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2299  PAVGB((%1, %2, 2), %%mm2) // L2
2300  "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2301  PAVGB((%1, %%REGa), %%mm3) // L3
2302  "movq (%0, %2, 4), %%mm4 \n\t" // L4
2303  PAVGB((%1, %2, 4), %%mm4) // L4
2304  "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
2305  PAVGB((%1, %%REGd), %%mm5) // L5
2306  "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
2307  PAVGB((%1, %%REGa, 2), %%mm6) // L6
2308  "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
2309  PAVGB((%1, %%REGc), %%mm7) // L7
2310  "movq %%mm0, (%1) \n\t" // R0
2311  "movq %%mm1, (%1, %2) \n\t" // R1
2312  "movq %%mm2, (%1, %2, 2) \n\t" // R2
2313  "movq %%mm3, (%1, %%"REG_a") \n\t" // R3
2314  "movq %%mm4, (%1, %2, 4) \n\t" // R4
2315  "movq %%mm5, (%1, %%"REG_d") \n\t" // R5
2316  "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // R6
2317  "movq %%mm7, (%1, %%"REG_c") \n\t" // R7
2318  "movq %%mm0, (%0) \n\t" // L0
2319  "movq %%mm1, (%0, %2) \n\t" // L1
2320  "movq %%mm2, (%0, %2, 2) \n\t" // L2
2321  "movq %%mm3, (%0, %%"REG_a") \n\t" // L3
2322  "movq %%mm4, (%0, %2, 4) \n\t" // L4
2323  "movq %%mm5, (%0, %%"REG_d") \n\t" // L5
2324  "movq %%mm6, (%0, %%"REG_a", 2) \n\t" // L6
2325  "movq %%mm7, (%0, %%"REG_c") \n\t" // L7
2326  "jmp 4f \n\t"
2327 
2328  "2: \n\t"
2329  "cmpl 508(%%"REG_d"), %%ecx \n\t"
2330  " jb 3f \n\t"
2331 
2332  "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
2333  "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2334  "movq (%0), %%mm0 \n\t" // L0
2335  "movq (%0, %2), %%mm1 \n\t" // L1
2336  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2337  "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2338  "movq (%1), %%mm4 \n\t" // R0
2339  "movq (%1, %2), %%mm5 \n\t" // R1
2340  "movq (%1, %2, 2), %%mm6 \n\t" // R2
2341  "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3
2342  PAVGB(%%mm4, %%mm0)
2343  PAVGB(%%mm5, %%mm1)
2344  PAVGB(%%mm6, %%mm2)
2345  PAVGB(%%mm7, %%mm3)
2346  PAVGB(%%mm4, %%mm0)
2347  PAVGB(%%mm5, %%mm1)
2348  PAVGB(%%mm6, %%mm2)
2349  PAVGB(%%mm7, %%mm3)
2350  "movq %%mm0, (%1) \n\t" // R0
2351  "movq %%mm1, (%1, %2) \n\t" // R1
2352  "movq %%mm2, (%1, %2, 2) \n\t" // R2
2353  "movq %%mm3, (%1, %%"REG_a") \n\t" // R3
2354  "movq %%mm0, (%0) \n\t" // L0
2355  "movq %%mm1, (%0, %2) \n\t" // L1
2356  "movq %%mm2, (%0, %2, 2) \n\t" // L2
2357  "movq %%mm3, (%0, %%"REG_a") \n\t" // L3
2358 
2359  "movq (%0, %2, 4), %%mm0 \n\t" // L4
2360  "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5
2361  "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6
2362  "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7
2363  "movq (%1, %2, 4), %%mm4 \n\t" // R4
2364  "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5
2365  "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6
2366  "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7
2367  PAVGB(%%mm4, %%mm0)
2368  PAVGB(%%mm5, %%mm1)
2369  PAVGB(%%mm6, %%mm2)
2370  PAVGB(%%mm7, %%mm3)
2371  PAVGB(%%mm4, %%mm0)
2372  PAVGB(%%mm5, %%mm1)
2373  PAVGB(%%mm6, %%mm2)
2374  PAVGB(%%mm7, %%mm3)
2375  "movq %%mm0, (%1, %2, 4) \n\t" // R4
2376  "movq %%mm1, (%1, %%"REG_d") \n\t" // R5
2377  "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6
2378  "movq %%mm3, (%1, %%"REG_c") \n\t" // R7
2379  "movq %%mm0, (%0, %2, 4) \n\t" // L4
2380  "movq %%mm1, (%0, %%"REG_d") \n\t" // L5
2381  "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6
2382  "movq %%mm3, (%0, %%"REG_c") \n\t" // L7
2383  "jmp 4f \n\t"
2384 
2385  "3: \n\t"
2386  "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
2387  "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2388  "movq (%0), %%mm0 \n\t" // L0
2389  "movq (%0, %2), %%mm1 \n\t" // L1
2390  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2391  "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2392  "movq (%1), %%mm4 \n\t" // R0
2393  "movq (%1, %2), %%mm5 \n\t" // R1
2394  "movq (%1, %2, 2), %%mm6 \n\t" // R2
2395  "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3
2396  PAVGB(%%mm4, %%mm0)
2397  PAVGB(%%mm5, %%mm1)
2398  PAVGB(%%mm6, %%mm2)
2399  PAVGB(%%mm7, %%mm3)
2400  PAVGB(%%mm4, %%mm0)
2401  PAVGB(%%mm5, %%mm1)
2402  PAVGB(%%mm6, %%mm2)
2403  PAVGB(%%mm7, %%mm3)
2404  PAVGB(%%mm4, %%mm0)
2405  PAVGB(%%mm5, %%mm1)
2406  PAVGB(%%mm6, %%mm2)
2407  PAVGB(%%mm7, %%mm3)
2408  "movq %%mm0, (%1) \n\t" // R0
2409  "movq %%mm1, (%1, %2) \n\t" // R1
2410  "movq %%mm2, (%1, %2, 2) \n\t" // R2
2411  "movq %%mm3, (%1, %%"REG_a") \n\t" // R3
2412  "movq %%mm0, (%0) \n\t" // L0
2413  "movq %%mm1, (%0, %2) \n\t" // L1
2414  "movq %%mm2, (%0, %2, 2) \n\t" // L2
2415  "movq %%mm3, (%0, %%"REG_a") \n\t" // L3
2416 
2417  "movq (%0, %2, 4), %%mm0 \n\t" // L4
2418  "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5
2419  "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6
2420  "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7
2421  "movq (%1, %2, 4), %%mm4 \n\t" // R4
2422  "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5
2423  "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6
2424  "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7
2425  PAVGB(%%mm4, %%mm0)
2426  PAVGB(%%mm5, %%mm1)
2427  PAVGB(%%mm6, %%mm2)
2428  PAVGB(%%mm7, %%mm3)
2429  PAVGB(%%mm4, %%mm0)
2430  PAVGB(%%mm5, %%mm1)
2431  PAVGB(%%mm6, %%mm2)
2432  PAVGB(%%mm7, %%mm3)
2433  PAVGB(%%mm4, %%mm0)
2434  PAVGB(%%mm5, %%mm1)
2435  PAVGB(%%mm6, %%mm2)
2436  PAVGB(%%mm7, %%mm3)
2437  "movq %%mm0, (%1, %2, 4) \n\t" // R4
2438  "movq %%mm1, (%1, %%"REG_d") \n\t" // R5
2439  "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6
2440  "movq %%mm3, (%1, %%"REG_c") \n\t" // R7
2441  "movq %%mm0, (%0, %2, 4) \n\t" // L4
2442  "movq %%mm1, (%0, %%"REG_d") \n\t" // L5
2443  "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6
2444  "movq %%mm3, (%0, %%"REG_c") \n\t" // L7
2445 
2446  "4: \n\t"
2447 
2448  :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast)
2449  : "%"REG_a, "%"REG_d, "%"REG_c, "memory"
2450  );
2451 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
2452 {
2453  int y;
2454  int d=0;
2455 // int sysd=0;
2456  int i;
2457 
2458  for(y=0; y<8; y++){
2459  int x;
2460  for(x=0; x<8; x++){
2461  int ref= tempBlurred[ x + y*stride ];
2462  int cur= src[ x + y*stride ];
2463  int d1=ref - cur;
2464 // if(x==0 || x==7) d1+= d1>>1;
2465 // if(y==0 || y==7) d1+= d1>>1;
2466 // d+= FFABS(d1);
2467  d+= d1*d1;
2468 // sysd+= d1;
2469  }
2470  }
2471  i=d;
2472  d= (
2473  4*d
2474  +(*(tempBlurredPast-256))
2475  +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
2476  +(*(tempBlurredPast+256))
2477  +4)>>3;
2478  *tempBlurredPast=i;
2479 // ((*tempBlurredPast)*3 + d + 2)>>2;
2480 
2481 /*
2482 Switch between
2483  1 0 0 0 0 0 0 (0)
2484 64 32 16 8 4 2 1 (1)
2485 64 48 36 27 20 15 11 (33) (approx)
2486 64 56 49 43 37 33 29 (200) (approx)
2487 */
2488  if(d > maxNoise[1]){
2489  if(d < maxNoise[2]){
2490  for(y=0; y<8; y++){
2491  int x;
2492  for(x=0; x<8; x++){
2493  int ref= tempBlurred[ x + y*stride ];
2494  int cur= src[ x + y*stride ];
2495  tempBlurred[ x + y*stride ]=
2496  src[ x + y*stride ]=
2497  (ref + cur + 1)>>1;
2498  }
2499  }
2500  }else{
2501  for(y=0; y<8; y++){
2502  int x;
2503  for(x=0; x<8; x++){
2504  tempBlurred[ x + y*stride ]= src[ x + y*stride ];
2505  }
2506  }
2507  }
2508  }else{
2509  if(d < maxNoise[0]){
2510  for(y=0; y<8; y++){
2511  int x;
2512  for(x=0; x<8; x++){
2513  int ref= tempBlurred[ x + y*stride ];
2514  int cur= src[ x + y*stride ];
2515  tempBlurred[ x + y*stride ]=
2516  src[ x + y*stride ]=
2517  (ref*7 + cur + 4)>>3;
2518  }
2519  }
2520  }else{
2521  for(y=0; y<8; y++){
2522  int x;
2523  for(x=0; x<8; x++){
2524  int ref= tempBlurred[ x + y*stride ];
2525  int cur= src[ x + y*stride ];
2526  tempBlurred[ x + y*stride ]=
2527  src[ x + y*stride ]=
2528  (ref*3 + cur + 2)>>2;
2529  }
2530  }
2531  }
2532  }
2533 }
2534 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
2535 }
2536 #endif //TEMPLATE_PP_ALTIVEC
2537 
2538 #if TEMPLATE_PP_MMX
2539 /**
2540  * accurate deblock filter
2541  */
2542 static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
2543  int64_t dc_mask, eq_mask, both_masks;
2544  int64_t sums[10*8*2];
2545  src+= step*3; // src points to begin of the 8x8 Block
2546  //{ START_TIMER
2547  __asm__ volatile(
2548  "movq %0, %%mm7 \n\t"
2549  "movq %1, %%mm6 \n\t"
2550  : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
2551  );
2552 
2553  __asm__ volatile(
2554  "lea (%2, %3), %%"REG_a" \n\t"
2555 // 0 1 2 3 4 5 6 7 8 9
2556 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
2557 
2558  "movq (%2), %%mm0 \n\t"
2559  "movq (%%"REG_a"), %%mm1 \n\t"
2560  "movq %%mm1, %%mm3 \n\t"
2561  "movq %%mm1, %%mm4 \n\t"
2562  "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
2563  "paddb %%mm7, %%mm0 \n\t"
2564  "pcmpgtb %%mm6, %%mm0 \n\t"
2565 
2566  "movq (%%"REG_a",%3), %%mm2 \n\t"
2567  PMAXUB(%%mm2, %%mm4)
2568  PMINUB(%%mm2, %%mm3, %%mm5)
2569  "psubb %%mm2, %%mm1 \n\t"
2570  "paddb %%mm7, %%mm1 \n\t"
2571  "pcmpgtb %%mm6, %%mm1 \n\t"
2572  "paddb %%mm1, %%mm0 \n\t"
2573 
2574  "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
2575  PMAXUB(%%mm1, %%mm4)
2576  PMINUB(%%mm1, %%mm3, %%mm5)
2577  "psubb %%mm1, %%mm2 \n\t"
2578  "paddb %%mm7, %%mm2 \n\t"
2579  "pcmpgtb %%mm6, %%mm2 \n\t"
2580  "paddb %%mm2, %%mm0 \n\t"
2581 
2582  "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t"
2583 
2584  "movq (%2, %3, 4), %%mm2 \n\t"
2585  PMAXUB(%%mm2, %%mm4)
2586  PMINUB(%%mm2, %%mm3, %%mm5)
2587  "psubb %%mm2, %%mm1 \n\t"
2588  "paddb %%mm7, %%mm1 \n\t"
2589  "pcmpgtb %%mm6, %%mm1 \n\t"
2590  "paddb %%mm1, %%mm0 \n\t"
2591 
2592  "movq (%%"REG_a"), %%mm1 \n\t"
2593  PMAXUB(%%mm1, %%mm4)
2594  PMINUB(%%mm1, %%mm3, %%mm5)
2595  "psubb %%mm1, %%mm2 \n\t"
2596  "paddb %%mm7, %%mm2 \n\t"
2597  "pcmpgtb %%mm6, %%mm2 \n\t"
2598  "paddb %%mm2, %%mm0 \n\t"
2599 
2600  "movq (%%"REG_a", %3), %%mm2 \n\t"
2601  PMAXUB(%%mm2, %%mm4)
2602  PMINUB(%%mm2, %%mm3, %%mm5)
2603  "psubb %%mm2, %%mm1 \n\t"
2604  "paddb %%mm7, %%mm1 \n\t"
2605  "pcmpgtb %%mm6, %%mm1 \n\t"
2606  "paddb %%mm1, %%mm0 \n\t"
2607 
2608  "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
2609  PMAXUB(%%mm1, %%mm4)
2610  PMINUB(%%mm1, %%mm3, %%mm5)
2611  "psubb %%mm1, %%mm2 \n\t"
2612  "paddb %%mm7, %%mm2 \n\t"
2613  "pcmpgtb %%mm6, %%mm2 \n\t"
2614  "paddb %%mm2, %%mm0 \n\t"
2615 
2616  "movq (%2, %3, 8), %%mm2 \n\t"
2617  PMAXUB(%%mm2, %%mm4)
2618  PMINUB(%%mm2, %%mm3, %%mm5)
2619  "psubb %%mm2, %%mm1 \n\t"
2620  "paddb %%mm7, %%mm1 \n\t"
2621  "pcmpgtb %%mm6, %%mm1 \n\t"
2622  "paddb %%mm1, %%mm0 \n\t"
2623 
2624  "movq (%%"REG_a", %3, 4), %%mm1 \n\t"
2625  "psubb %%mm1, %%mm2 \n\t"
2626  "paddb %%mm7, %%mm2 \n\t"
2627  "pcmpgtb %%mm6, %%mm2 \n\t"
2628  "paddb %%mm2, %%mm0 \n\t"
2629  "psubusb %%mm3, %%mm4 \n\t"
2630 
2631  "pxor %%mm6, %%mm6 \n\t"
2632  "movq %4, %%mm7 \n\t" // QP,..., QP
2633  "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
2634  "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0
2635  "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2636  "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2637  "movq %%mm7, %1 \n\t"
2638 
2639  "movq %5, %%mm7 \n\t"
2640  "punpcklbw %%mm7, %%mm7 \n\t"
2641  "punpcklbw %%mm7, %%mm7 \n\t"
2642  "punpcklbw %%mm7, %%mm7 \n\t"
2643  "psubb %%mm0, %%mm6 \n\t"
2644  "pcmpgtb %%mm7, %%mm6 \n\t"
2645  "movq %%mm6, %0 \n\t"
2646 
2647  : "=m" (eq_mask), "=m" (dc_mask)
2648  : "r" (src), "r" ((x86_reg)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
2649  : "%"REG_a
2650  );
2651 
2652  both_masks = dc_mask & eq_mask;
2653 
2654  if(both_masks){
2655  x86_reg offset= -8*step;
2656  int64_t *temp_sums= sums;
2657 
2658  __asm__ volatile(
2659  "movq %2, %%mm0 \n\t" // QP,..., QP
2660  "pxor %%mm4, %%mm4 \n\t"
2661 
2662  "movq (%0), %%mm6 \n\t"
2663  "movq (%0, %1), %%mm5 \n\t"
2664  "movq %%mm5, %%mm1 \n\t"
2665  "movq %%mm6, %%mm2 \n\t"
2666  "psubusb %%mm6, %%mm5 \n\t"
2667  "psubusb %%mm1, %%mm2 \n\t"
2668  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2669  "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2670  "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2671 
2672  "pxor %%mm6, %%mm1 \n\t"
2673  "pand %%mm0, %%mm1 \n\t"
2674  "pxor %%mm1, %%mm6 \n\t"
2675  // 0:QP 6:First
2676 
2677  "movq (%0, %1, 8), %%mm5 \n\t"
2678  "add %1, %0 \n\t" // %0 points to line 1 not 0
2679  "movq (%0, %1, 8), %%mm7 \n\t"
2680  "movq %%mm5, %%mm1 \n\t"
2681  "movq %%mm7, %%mm2 \n\t"
2682  "psubusb %%mm7, %%mm5 \n\t"
2683  "psubusb %%mm1, %%mm2 \n\t"
2684  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2685  "movq %2, %%mm0 \n\t" // QP,..., QP
2686  "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2687  "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2688 
2689  "pxor %%mm7, %%mm1 \n\t"
2690  "pand %%mm0, %%mm1 \n\t"
2691  "pxor %%mm1, %%mm7 \n\t"
2692 
2693  "movq %%mm6, %%mm5 \n\t"
2694  "punpckhbw %%mm4, %%mm6 \n\t"
2695  "punpcklbw %%mm4, %%mm5 \n\t"
2696  // 4:0 5/6:First 7:Last
2697 
2698  "movq %%mm5, %%mm0 \n\t"
2699  "movq %%mm6, %%mm1 \n\t"
2700  "psllw $2, %%mm0 \n\t"
2701  "psllw $2, %%mm1 \n\t"
2702  "paddw "MANGLE(w04)", %%mm0 \n\t"
2703  "paddw "MANGLE(w04)", %%mm1 \n\t"
2704 
2705 #define NEXT\
2706  "movq (%0), %%mm2 \n\t"\
2707  "movq (%0), %%mm3 \n\t"\
2708  "add %1, %0 \n\t"\
2709  "punpcklbw %%mm4, %%mm2 \n\t"\
2710  "punpckhbw %%mm4, %%mm3 \n\t"\
2711  "paddw %%mm2, %%mm0 \n\t"\
2712  "paddw %%mm3, %%mm1 \n\t"
2713 
2714 #define PREV\
2715  "movq (%0), %%mm2 \n\t"\
2716  "movq (%0), %%mm3 \n\t"\
2717  "add %1, %0 \n\t"\
2718  "punpcklbw %%mm4, %%mm2 \n\t"\
2719  "punpckhbw %%mm4, %%mm3 \n\t"\
2720  "psubw %%mm2, %%mm0 \n\t"\
2721  "psubw %%mm3, %%mm1 \n\t"
2722 
2723 
2724  NEXT //0
2725  NEXT //1
2726  NEXT //2
2727  "movq %%mm0, (%3) \n\t"
2728  "movq %%mm1, 8(%3) \n\t"
2729 
2730  NEXT //3
2731  "psubw %%mm5, %%mm0 \n\t"
2732  "psubw %%mm6, %%mm1 \n\t"
2733  "movq %%mm0, 16(%3) \n\t"
2734  "movq %%mm1, 24(%3) \n\t"
2735 
2736  NEXT //4
2737  "psubw %%mm5, %%mm0 \n\t"
2738  "psubw %%mm6, %%mm1 \n\t"
2739  "movq %%mm0, 32(%3) \n\t"
2740  "movq %%mm1, 40(%3) \n\t"
2741 
2742  NEXT //5
2743  "psubw %%mm5, %%mm0 \n\t"
2744  "psubw %%mm6, %%mm1 \n\t"
2745  "movq %%mm0, 48(%3) \n\t"
2746  "movq %%mm1, 56(%3) \n\t"
2747 
2748  NEXT //6
2749  "psubw %%mm5, %%mm0 \n\t"
2750  "psubw %%mm6, %%mm1 \n\t"
2751  "movq %%mm0, 64(%3) \n\t"
2752  "movq %%mm1, 72(%3) \n\t"
2753 
2754  "movq %%mm7, %%mm6 \n\t"
2755  "punpckhbw %%mm4, %%mm7 \n\t"
2756  "punpcklbw %%mm4, %%mm6 \n\t"
2757 
2758  NEXT //7
2759  "mov %4, %0 \n\t"
2760  "add %1, %0 \n\t"
2761  PREV //0
2762  "movq %%mm0, 80(%3) \n\t"
2763  "movq %%mm1, 88(%3) \n\t"
2764 
2765  PREV //1
2766  "paddw %%mm6, %%mm0 \n\t"
2767  "paddw %%mm7, %%mm1 \n\t"
2768  "movq %%mm0, 96(%3) \n\t"
2769  "movq %%mm1, 104(%3) \n\t"
2770 
2771  PREV //2
2772  "paddw %%mm6, %%mm0 \n\t"
2773  "paddw %%mm7, %%mm1 \n\t"
2774  "movq %%mm0, 112(%3) \n\t"
2775  "movq %%mm1, 120(%3) \n\t"
2776 
2777  PREV //3
2778  "paddw %%mm6, %%mm0 \n\t"
2779  "paddw %%mm7, %%mm1 \n\t"
2780  "movq %%mm0, 128(%3) \n\t"
2781  "movq %%mm1, 136(%3) \n\t"
2782 
2783  PREV //4
2784  "paddw %%mm6, %%mm0 \n\t"
2785  "paddw %%mm7, %%mm1 \n\t"
2786  "movq %%mm0, 144(%3) \n\t"
2787  "movq %%mm1, 152(%3) \n\t"
2788 
2789  "mov %4, %0 \n\t" //FIXME
2790 
2791  : "+&r"(src)
2792  : "r" ((x86_reg)step), "m" (c->pQPb), "r"(sums), "g"(src)
2793  );
2794 
2795  src+= step; // src points to begin of the 8x8 Block
2796 
2797  __asm__ volatile(
2798  "movq %4, %%mm6 \n\t"
2799  "pcmpeqb %%mm5, %%mm5 \n\t"
2800  "pxor %%mm6, %%mm5 \n\t"
2801  "pxor %%mm7, %%mm7 \n\t"
2802 
2803  "1: \n\t"
2804  "movq (%1), %%mm0 \n\t"
2805  "movq 8(%1), %%mm1 \n\t"
2806  "paddw 32(%1), %%mm0 \n\t"
2807  "paddw 40(%1), %%mm1 \n\t"
2808  "movq (%0, %3), %%mm2 \n\t"
2809  "movq %%mm2, %%mm3 \n\t"
2810  "movq %%mm2, %%mm4 \n\t"
2811  "punpcklbw %%mm7, %%mm2 \n\t"
2812  "punpckhbw %%mm7, %%mm3 \n\t"
2813  "paddw %%mm2, %%mm0 \n\t"
2814  "paddw %%mm3, %%mm1 \n\t"
2815  "paddw %%mm2, %%mm0 \n\t"
2816  "paddw %%mm3, %%mm1 \n\t"
2817  "psrlw $4, %%mm0 \n\t"
2818  "psrlw $4, %%mm1 \n\t"
2819  "packuswb %%mm1, %%mm0 \n\t"
2820  "pand %%mm6, %%mm0 \n\t"
2821  "pand %%mm5, %%mm4 \n\t"
2822  "por %%mm4, %%mm0 \n\t"
2823  "movq %%mm0, (%0, %3) \n\t"
2824  "add $16, %1 \n\t"
2825  "add %2, %0 \n\t"
2826  " js 1b \n\t"
2827 
2828  : "+r"(offset), "+r"(temp_sums)
2829  : "r" ((x86_reg)step), "r"(src - offset), "m"(both_masks)
2830  );
2831  }else
2832  src+= step; // src points to begin of the 8x8 Block
2833 
2834  if(eq_mask != -1LL){
2835  uint8_t *temp_src= src;
2836  DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars
2837  __asm__ volatile(
2838  "pxor %%mm7, %%mm7 \n\t"
2839 // 0 1 2 3 4 5 6 7 8 9
2840 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1
2841 
2842  "movq (%0), %%mm0 \n\t"
2843  "movq %%mm0, %%mm1 \n\t"
2844  "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
2845  "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
2846 
2847  "movq (%0, %1), %%mm2 \n\t"
2848  "lea (%0, %1, 2), %%"REG_a" \n\t"
2849  "movq %%mm2, %%mm3 \n\t"
2850  "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
2851  "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
2852 
2853  "movq (%%"REG_a"), %%mm4 \n\t"
2854  "movq %%mm4, %%mm5 \n\t"
2855  "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
2856  "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
2857 
2858  "paddw %%mm0, %%mm0 \n\t" // 2L0
2859  "paddw %%mm1, %%mm1 \n\t" // 2H0
2860  "psubw %%mm4, %%mm2 \n\t" // L1 - L2
2861  "psubw %%mm5, %%mm3 \n\t" // H1 - H2
2862  "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
2863  "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
2864 
2865  "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
2866  "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
2867  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
2868  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
2869 
2870  "movq (%%"REG_a", %1), %%mm2 \n\t"
2871  "movq %%mm2, %%mm3 \n\t"
2872  "punpcklbw %%mm7, %%mm2 \n\t" // L3
2873  "punpckhbw %%mm7, %%mm3 \n\t" // H3
2874 
2875  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
2876  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
2877  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2878  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2879  "movq %%mm0, (%4) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2880  "movq %%mm1, 8(%4) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2881 
2882  "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
2883  "movq %%mm0, %%mm1 \n\t"
2884  "punpcklbw %%mm7, %%mm0 \n\t" // L4
2885  "punpckhbw %%mm7, %%mm1 \n\t" // H4
2886 
2887  "psubw %%mm0, %%mm2 \n\t" // L3 - L4
2888  "psubw %%mm1, %%mm3 \n\t" // H3 - H4
2889  "movq %%mm2, 16(%4) \n\t" // L3 - L4
2890  "movq %%mm3, 24(%4) \n\t" // H3 - H4
2891  "paddw %%mm4, %%mm4 \n\t" // 2L2
2892  "paddw %%mm5, %%mm5 \n\t" // 2H2
2893  "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
2894  "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
2895 
2896  "lea (%%"REG_a", %1), %0 \n\t"
2897  "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
2898  "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
2899  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
2900  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
2901 //50 opcodes so far
2902  "movq (%0, %1, 2), %%mm2 \n\t"
2903  "movq %%mm2, %%mm3 \n\t"
2904  "punpcklbw %%mm7, %%mm2 \n\t" // L5
2905  "punpckhbw %%mm7, %%mm3 \n\t" // H5
2906  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
2907  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
2908  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
2909  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
2910 
2911  "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
2912  "punpcklbw %%mm7, %%mm6 \n\t" // L6
2913  "psubw %%mm6, %%mm2 \n\t" // L5 - L6
2914  "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
2915  "punpckhbw %%mm7, %%mm6 \n\t" // H6
2916  "psubw %%mm6, %%mm3 \n\t" // H5 - H6
2917 
2918  "paddw %%mm0, %%mm0 \n\t" // 2L4
2919  "paddw %%mm1, %%mm1 \n\t" // 2H4
2920  "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
2921  "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
2922 
2923  "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
2924  "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
2925  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
2926  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
2927 
2928  "movq (%0, %1, 4), %%mm2 \n\t"
2929  "movq %%mm2, %%mm3 \n\t"
2930  "punpcklbw %%mm7, %%mm2 \n\t" // L7
2931  "punpckhbw %%mm7, %%mm3 \n\t" // H7
2932 
2933  "paddw %%mm2, %%mm2 \n\t" // 2L7
2934  "paddw %%mm3, %%mm3 \n\t" // 2H7
2935  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
2936  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
2937 
2938  "movq (%4), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2939  "movq 8(%4), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2940 
2941 #if TEMPLATE_PP_MMXEXT
2942  "movq %%mm7, %%mm6 \n\t" // 0
2943  "psubw %%mm0, %%mm6 \n\t"
2944  "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
2945  "movq %%mm7, %%mm6 \n\t" // 0
2946  "psubw %%mm1, %%mm6 \n\t"
2947  "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
2948  "movq %%mm7, %%mm6 \n\t" // 0
2949  "psubw %%mm2, %%mm6 \n\t"
2950  "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
2951  "movq %%mm7, %%mm6 \n\t" // 0
2952  "psubw %%mm3, %%mm6 \n\t"
2953  "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
2954 #else
2955  "movq %%mm7, %%mm6 \n\t" // 0
2956  "pcmpgtw %%mm0, %%mm6 \n\t"
2957  "pxor %%mm6, %%mm0 \n\t"
2958  "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
2959  "movq %%mm7, %%mm6 \n\t" // 0
2960  "pcmpgtw %%mm1, %%mm6 \n\t"
2961  "pxor %%mm6, %%mm1 \n\t"
2962  "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
2963  "movq %%mm7, %%mm6 \n\t" // 0
2964  "pcmpgtw %%mm2, %%mm6 \n\t"
2965  "pxor %%mm6, %%mm2 \n\t"
2966  "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
2967  "movq %%mm7, %%mm6 \n\t" // 0
2968  "pcmpgtw %%mm3, %%mm6 \n\t"
2969  "pxor %%mm6, %%mm3 \n\t"
2970  "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
2971 #endif
2972 
2973 #if TEMPLATE_PP_MMXEXT
2974  "pminsw %%mm2, %%mm0 \n\t"
2975  "pminsw %%mm3, %%mm1 \n\t"
2976 #else
2977  "movq %%mm0, %%mm6 \n\t"
2978  "psubusw %%mm2, %%mm6 \n\t"
2979  "psubw %%mm6, %%mm0 \n\t"
2980  "movq %%mm1, %%mm6 \n\t"
2981  "psubusw %%mm3, %%mm6 \n\t"
2982  "psubw %%mm6, %%mm1 \n\t"
2983 #endif
2984 
2985  "movd %2, %%mm2 \n\t" // QP
2986  "punpcklbw %%mm7, %%mm2 \n\t"
2987 
2988  "movq %%mm7, %%mm6 \n\t" // 0
2989  "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
2990  "pxor %%mm6, %%mm4 \n\t"
2991  "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
2992  "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
2993  "pxor %%mm7, %%mm5 \n\t"
2994  "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
2995 // 100 opcodes
2996  "psllw $3, %%mm2 \n\t" // 8QP
2997  "movq %%mm2, %%mm3 \n\t" // 8QP
2998  "pcmpgtw %%mm4, %%mm2 \n\t"
2999  "pcmpgtw %%mm5, %%mm3 \n\t"
3000  "pand %%mm2, %%mm4 \n\t"
3001  "pand %%mm3, %%mm5 \n\t"
3002 
3003 
3004  "psubusw %%mm0, %%mm4 \n\t" // hd
3005  "psubusw %%mm1, %%mm5 \n\t" // ld
3006 
3007 
3008  "movq "MANGLE(w05)", %%mm2 \n\t" // 5
3009  "pmullw %%mm2, %%mm4 \n\t"
3010  "pmullw %%mm2, %%mm5 \n\t"
3011  "movq "MANGLE(w20)", %%mm2 \n\t" // 32
3012  "paddw %%mm2, %%mm4 \n\t"
3013  "paddw %%mm2, %%mm5 \n\t"
3014  "psrlw $6, %%mm4 \n\t"
3015  "psrlw $6, %%mm5 \n\t"
3016 
3017  "movq 16(%4), %%mm0 \n\t" // L3 - L4
3018  "movq 24(%4), %%mm1 \n\t" // H3 - H4
3019 
3020  "pxor %%mm2, %%mm2 \n\t"
3021  "pxor %%mm3, %%mm3 \n\t"
3022 
3023  "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
3024  "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
3025  "pxor %%mm2, %%mm0 \n\t"
3026  "pxor %%mm3, %%mm1 \n\t"
3027  "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
3028  "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
3029  "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
3030  "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
3031 
3032  "pxor %%mm6, %%mm2 \n\t"
3033  "pxor %%mm7, %%mm3 \n\t"
3034  "pand %%mm2, %%mm4 \n\t"
3035  "pand %%mm3, %%mm5 \n\t"
3036 
3037 #if TEMPLATE_PP_MMXEXT
3038  "pminsw %%mm0, %%mm4 \n\t"
3039  "pminsw %%mm1, %%mm5 \n\t"
3040 #else
3041  "movq %%mm4, %%mm2 \n\t"
3042  "psubusw %%mm0, %%mm2 \n\t"
3043  "psubw %%mm2, %%mm4 \n\t"
3044  "movq %%mm5, %%mm2 \n\t"
3045  "psubusw %%mm1, %%mm2 \n\t"
3046  "psubw %%mm2, %%mm5 \n\t"
3047 #endif
3048  "pxor %%mm6, %%mm4 \n\t"
3049  "pxor %%mm7, %%mm5 \n\t"
3050  "psubw %%mm6, %%mm4 \n\t"
3051  "psubw %%mm7, %%mm5 \n\t"
3052  "packsswb %%mm5, %%mm4 \n\t"
3053  "movq %3, %%mm1 \n\t"
3054  "pandn %%mm4, %%mm1 \n\t"
3055  "movq (%0), %%mm0 \n\t"
3056  "paddb %%mm1, %%mm0 \n\t"
3057  "movq %%mm0, (%0) \n\t"
3058  "movq (%0, %1), %%mm0 \n\t"
3059  "psubb %%mm1, %%mm0 \n\t"
3060  "movq %%mm0, (%0, %1) \n\t"
3061 
3062  : "+r" (temp_src)
3063  : "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask), "r"(tmp)
3064  : "%"REG_a
3065  );
3066  }
3067 /*if(step==16){
3068  STOP_TIMER("step16")
3069 }else{
3070  STOP_TIMER("stepX")
3071 }
3072  } */
3073 }
3074 #endif //TEMPLATE_PP_MMX
3075 
/* Forward declaration of the RENAME()d postProcess(); the definition
 * follows further down in this file. */
3076  static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3077  const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
3078 
3079 /**
3080  * Copy a block from src to dst and fixes the blacklevel.
3081  * levelFix == 0 -> do not touch the brightness & contrast
3082  */
3083 #undef REAL_SCALED_CPY
3084 #undef SCALED_CPY
3085 
3086 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
3087  int levelFix, int64_t *packedOffsetAndScale)
3088 {
3089 #if !TEMPLATE_PP_MMX
3090  int i;
3091 #endif
3092  if(levelFix){
3093 #if TEMPLATE_PP_MMX
3094  __asm__ volatile(
3095  "movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset
3096  "movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale
3097  "lea (%2,%4), %%"REG_a" \n\t"
3098  "lea (%3,%5), %%"REG_d" \n\t"
3099  "pxor %%mm4, %%mm4 \n\t"
3100 #if TEMPLATE_PP_MMXEXT
3101 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
3102  "movq " #src1 ", %%mm0 \n\t"\
3103  "movq " #src1 ", %%mm5 \n\t"\
3104  "movq " #src2 ", %%mm1 \n\t"\
3105  "movq " #src2 ", %%mm6 \n\t"\
3106  "punpcklbw %%mm0, %%mm0 \n\t"\
3107  "punpckhbw %%mm5, %%mm5 \n\t"\
3108  "punpcklbw %%mm1, %%mm1 \n\t"\
3109  "punpckhbw %%mm6, %%mm6 \n\t"\
3110  "pmulhuw %%mm3, %%mm0 \n\t"\
3111  "pmulhuw %%mm3, %%mm5 \n\t"\
3112  "pmulhuw %%mm3, %%mm1 \n\t"\
3113  "pmulhuw %%mm3, %%mm6 \n\t"\
3114  "psubw %%mm2, %%mm0 \n\t"\
3115  "psubw %%mm2, %%mm5 \n\t"\
3116  "psubw %%mm2, %%mm1 \n\t"\
3117  "psubw %%mm2, %%mm6 \n\t"\
3118  "packuswb %%mm5, %%mm0 \n\t"\
3119  "packuswb %%mm6, %%mm1 \n\t"\
3120  "movq %%mm0, " #dst1 " \n\t"\
3121  "movq %%mm1, " #dst2 " \n\t"\
3122 
3123 #else //TEMPLATE_PP_MMXEXT
3124 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
3125  "movq " #src1 ", %%mm0 \n\t"\
3126  "movq " #src1 ", %%mm5 \n\t"\
3127  "punpcklbw %%mm4, %%mm0 \n\t"\
3128  "punpckhbw %%mm4, %%mm5 \n\t"\
3129  "psubw %%mm2, %%mm0 \n\t"\
3130  "psubw %%mm2, %%mm5 \n\t"\
3131  "movq " #src2 ", %%mm1 \n\t"\
3132  "psllw $6, %%mm0 \n\t"\
3133  "psllw $6, %%mm5 \n\t"\
3134  "pmulhw %%mm3, %%mm0 \n\t"\
3135  "movq " #src2 ", %%mm6 \n\t"\
3136  "pmulhw %%mm3, %%mm5 \n\t"\
3137  "punpcklbw %%mm4, %%mm1 \n\t"\
3138  "punpckhbw %%mm4, %%mm6 \n\t"\
3139  "psubw %%mm2, %%mm1 \n\t"\
3140  "psubw %%mm2, %%mm6 \n\t"\
3141  "psllw $6, %%mm1 \n\t"\
3142  "psllw $6, %%mm6 \n\t"\
3143  "pmulhw %%mm3, %%mm1 \n\t"\
3144  "pmulhw %%mm3, %%mm6 \n\t"\
3145  "packuswb %%mm5, %%mm0 \n\t"\
3146  "packuswb %%mm6, %%mm1 \n\t"\
3147  "movq %%mm0, " #dst1 " \n\t"\
3148  "movq %%mm1, " #dst2 " \n\t"\
3149 
3150 #endif //TEMPLATE_PP_MMXEXT
3151 #define SCALED_CPY(src1, src2, dst1, dst2)\
3152  REAL_SCALED_CPY(src1, src2, dst1, dst2)
3153 
3154 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
3155 SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2))
3156 SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4))
3157  "lea (%%"REG_a",%4,4), %%"REG_a" \n\t"
3158  "lea (%%"REG_d",%5,4), %%"REG_d" \n\t"
3159 SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2))
3160 
3161 
3162  : "=&a" (packedOffsetAndScale)
3163  : "0" (packedOffsetAndScale),
3164  "r"(src),
3165  "r"(dst),
3166  "r" ((x86_reg)srcStride),
3167  "r" ((x86_reg)dstStride)
3168  : "%"REG_d
3169  );
3170 #else //TEMPLATE_PP_MMX
3171  for(i=0; i<8; i++)
3172  memcpy( &(dst[dstStride*i]),
3173  &(src[srcStride*i]), BLOCK_SIZE);
3174 #endif //TEMPLATE_PP_MMX
3175  }else{
3176 #if TEMPLATE_PP_MMX
3177  __asm__ volatile(
3178  "lea (%0,%2), %%"REG_a" \n\t"
3179  "lea (%1,%3), %%"REG_d" \n\t"
3180 
3181 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
3182  "movq " #src1 ", %%mm0 \n\t"\
3183  "movq " #src2 ", %%mm1 \n\t"\
3184  "movq %%mm0, " #dst1 " \n\t"\
3185  "movq %%mm1, " #dst2 " \n\t"\
3186 
3187 #define SIMPLE_CPY(src1, src2, dst1, dst2)\
3188  REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
3189 
3190 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
3191 SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2))
3192 SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4))
3193  "lea (%%"REG_a",%2,4), %%"REG_a" \n\t"
3194  "lea (%%"REG_d",%3,4), %%"REG_d" \n\t"
3195 SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2))
3196 
3197  : : "r" (src),
3198  "r" (dst),
3199  "r" ((x86_reg)srcStride),
3200  "r" ((x86_reg)dstStride)
3201  : "%"REG_a, "%"REG_d
3202  );
3203 #else //TEMPLATE_PP_MMX
3204  for(i=0; i<8; i++)
3205  memcpy( &(dst[dstStride*i]),
3206  &(src[srcStride*i]), BLOCK_SIZE);
3207 #endif //TEMPLATE_PP_MMX
3208  }
3209 }
3210 
3211 /**
3212  * Duplicate the given 8 src pixels ? times upward
3213  */
3214 static inline void RENAME(duplicate)(uint8_t src[], int stride)
3215 {
3216 #if TEMPLATE_PP_MMX
3217  __asm__ volatile(
3218  "movq (%0), %%mm0 \n\t"
3219  "movq %%mm0, (%0, %1, 4) \n\t"
3220  "add %1, %0 \n\t"
3221  "movq %%mm0, (%0) \n\t"
3222  "movq %%mm0, (%0, %1) \n\t"
3223  "movq %%mm0, (%0, %1, 2) \n\t"
3224  "movq %%mm0, (%0, %1, 4) \n\t"
3225  : "+r" (src)
3226  : "r" ((x86_reg)-stride)
3227  );
3228 #else
3229  int i;
3230  uint8_t *p=src;
3231  for(i=0; i<5; i++){
3232  p-= stride;
3233  memcpy(p, src, 8);
3234  }
3235 #endif
3236 }
3237 
3238 /**
 * Filter array of bytes (Y or U or V values)
 *
 * One plane is processed per call, in BLOCK_SIZE-wide steps, one row of
 * blocks at a time. Every block is first copied from src to dst with
 * RENAME(blockCopy); the deinterlacing, deblocking, deringing and temporal
 * noise filters selected by the mode bits are then applied to dst.
 *
 * @param src       source plane
 * @param srcStride distance in bytes between two source lines (may be
 *                  non-positive, see the tempSrc setup)
 * @param dst       destination plane
 * @param dstStride distance in bytes between two destination lines
 * @param width     number of bytes per line to process
 * @param height    number of lines to process
 * @param QPs       quantizer table, read at
 *                  (y>>qpVShift)*QPStride + (x>>qpHShift)
 * @param QPStride  line stride of QPs
 * @param isColor   nonzero selects ppMode.chromMode instead of
 *                  ppMode.lumMode and skips the luma histogram analysis
 * @param c2        postprocess context; copied into a local at entry and
 *                  copied back at the end of the function
 */
3241  static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3242  const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
3243 {
3244  DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
3245  int x,y;
      /* mode is either fixed at compile time (TEMPLATE_PP_TIME_MODE) or taken
       * from the context depending on the plane type */
3246 #ifdef TEMPLATE_PP_TIME_MODE
3247  const int mode= TEMPLATE_PP_TIME_MODE;
3248 #else
3249  const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
3250 #endif
3251  int black=0, white=255; // blackest black and whitest white in the picture
      /* scale factor applied to QP values; 256*256 leaves them unchanged
       * (see the (QP*QPCorrecture + 256*128)>>16 computation below) */
3252  int QPCorrecture= 256*256;
3253
3254  int copyAhead;
3255 #if TEMPLATE_PP_MMX
3256  int i;
3257 #endif
3258
      /* shifts that turn a pixel coordinate of this plane into a QP table index */
3259  const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
3260  const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
3261
3262  //FIXME remove
3263  uint64_t * const yHistogram= c.yHistogram;
      /* For a non-positive stride the temporary buffers are entered 23 lines
       * further in, presumably so that lines addressed with that stride stay
       * inside the buffer. tempDst additionally starts 32 bytes in. */
3264  uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
3265  uint8_t * const tempDst= (dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride) + 32;
3266  //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
3267
      /* Fill the 57-entry DC offset/threshold tables from ppMode.baseDcDiff.
       * The multiplication by 0x0101010101010101 replicates the value into all
       * 8 bytes of the 64-bit entry (assuming it fits in one byte). */
3268 #if TEMPLATE_PP_MMX
3269  for(i=0; i<57; i++){
3270  int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
3271  int threshold= offset*2 + 1;
3272  c.mmxDcOffset[i]= 0x7F - offset;
3273  c.mmxDcThreshold[i]= 0x7F - threshold;
3274  c.mmxDcOffset[i]*= 0x0101010101010101LL;
3275  c.mmxDcThreshold[i]*= 0x0101010101010101LL;
3276  }
3277 #endif
3278
      /* copyAhead: how many lines below the block being filtered the
       * src -> dst block copy is performed (see the blockCopy calls below).
       * The value depends on the enabled filters, presumably on how many
       * lines below the block they read. */
3279  if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
3280  else if( (mode & LINEAR_BLEND_DEINT_FILTER)
3281  || (mode & FFMPEG_DEINT_FILTER)
3282  || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
3283  else if( (mode & V_DEBLOCK)
3284  || (mode & LINEAR_IPOL_DEINT_FILTER)
3285  || (mode & MEDIAN_DEINT_FILTER)
3286  || (mode & V_A_DEBLOCK)) copyAhead=13;
3287  else if(mode & V_X1_FILTER) copyAhead=11;
3288 // else if(mode & V_RK1_FILTER) copyAhead=10;
3289  else if(mode & DERING) copyAhead=9;
3290  else copyAhead=8;
3291
3292  copyAhead-= 8;
3293
      /* Luma plane: derive black/white levels and the level-fix scale from
       * the histogram in yHistogram (samples are added to it in the main
       * loop below). */
3294  if(!isColor){
3295  uint64_t sum= 0;
3296  int i;
3297  uint64_t maxClipped;
3298  uint64_t clipped;
3299  double scale;
3300
3301  c.frameNum++;
3302  // first frame is fscked so we ignore it
3303  if(c.frameNum == 1) yHistogram[0]= width*(uint64_t)height/64*15/256;
3304
3305  for(i=0; i<256; i++){
3306  sum+= yHistogram[i];
3307  }
3308
3309  /* We always get a completely black picture first. */
3310  maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
3311
      /* black: scanning down from 255, the first level for which fewer than
       * maxClipped histogram samples lie at or below it (0 if none) */
3312  clipped= sum;
3313  for(black=255; black>0; black--){
3314  if(clipped < maxClipped) break;
3315  clipped-= yHistogram[black];
3316  }
3317
      /* white: scanning up from 0, the first level for which fewer than
       * maxClipped histogram samples lie at or above it (256 if none) */
3318  clipped= sum;
3319  for(white=0; white<256; white++){
3320  if(clipped < maxClipped) break;
3321  clipped-= yHistogram[white];
3322  }
3323
      /* factor mapping the measured [black, white] range onto the allowed
       * [minAllowedY, maxAllowedY] range */
3324  scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
3325
      /* The fixed-point format of scale and the meaning of the offset differ
       * between the two builds, matching the two REAL_SCALED_CPY variants in
       * RENAME(blockCopy): the MMXEXT one scales first (8 fractional bits)
       * and then subtracts the offset, the other one subtracts the offset
       * first and then scales (10 fractional bits). */
3326 #if TEMPLATE_PP_MMXEXT
3327  c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
3328  c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
3329 #else
3330  c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
3331  c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
3332 #endif
3333
      /* replicate the 16-bit value into all four 16-bit lanes */
3334  c.packedYOffset|= c.packedYOffset<<32;
3335  c.packedYOffset|= c.packedYOffset<<16;
3336
3337  c.packedYScale|= c.packedYScale<<32;
3338  c.packedYScale|= c.packedYScale<<16;
3339
      /* with LEVEL_FIX the QP values are scaled by the same factor
       * (16 fractional bits) */
3340  if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5);
3341  else QPCorrecture= 256*256;
3342  }else{
      /* chroma: fixed scale word 0x0100 in every lane, zero offset, QP unchanged */
3343  c.packedYScale= 0x0100010001000100LL;
3344  c.packedYOffset= 0;
3345  QPCorrecture= 256*256;
3346  }
3347
3348  /* copy & deinterlace first row of blocks */
      /* This pass runs with y = -BLOCK_SIZE and writes into tempDst: source
       * lines 0..7 (srcBlock + srcStride*8) land 8 lines below dstBlock,
       * i.e. from tempDst line 9 on. */
3349  y=-BLOCK_SIZE;
3350  {
3351  const uint8_t *srcBlock= &(src[y*srcStride]);
3352  uint8_t *dstBlock= tempDst + dstStride;
3353
3354  // From this point on it is guaranteed that we can read and write 16 lines downward
3355  // finish 1 block before the next otherwise we might have a problem
3356  // with the L1 Cache of the P4 ... or only a few blocks at a time or something
3357  for(x=0; x<width; x+=BLOCK_SIZE){
3358
3359 #if TEMPLATE_PP_MMXEXT
3360 /*
3361  prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3362  prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3363  prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3364  prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3365 */
3366
      /* Prefetch 32 bytes into line ((x>>2)&6) + copyAhead and into the line
       * after it, for both the source block (prefetchnta) and the
       * destination block (prefetcht0). */
3367  __asm__(
3368  "mov %4, %%"REG_a" \n\t"
3369  "shr $2, %%"REG_a" \n\t"
3370  "and $6, %%"REG_a" \n\t"
3371  "add %5, %%"REG_a" \n\t"
3372  "mov %%"REG_a", %%"REG_d" \n\t"
3373  "imul %1, %%"REG_a" \n\t"
3374  "imul %3, %%"REG_d" \n\t"
3375  "prefetchnta 32(%%"REG_a", %0) \n\t"
3376  "prefetcht0 32(%%"REG_d", %2) \n\t"
3377  "add %1, %%"REG_a" \n\t"
3378  "add %3, %%"REG_d" \n\t"
3379  "prefetchnta 32(%%"REG_a", %0) \n\t"
3380  "prefetcht0 32(%%"REG_d", %2) \n\t"
3381  :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
3382  "g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
3383  : "%"REG_a, "%"REG_d
3384  );
3385
3386 #elif TEMPLATE_PP_3DNOW
3387 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
3388 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3389  prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3390  prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3391  prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3392 */
3393 #endif
3394
3395  RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
3396  srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3397
      /* replicate the first copied line into the lines above it */
3398  RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
3399
      /* at most one deinterlacer runs; the first matching mode bit wins */
3400  if(mode & LINEAR_IPOL_DEINT_FILTER)
3401  RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3402  else if(mode & LINEAR_BLEND_DEINT_FILTER)
3403  RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3404  else if(mode & MEDIAN_DEINT_FILTER)
3405  RENAME(deInterlaceMedian)(dstBlock, dstStride);
3406  else if(mode & CUBIC_IPOL_DEINT_FILTER)
3407  RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3408  else if(mode & FFMPEG_DEINT_FILTER)
3409  RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3410  else if(mode & LOWPASS5_DEINT_FILTER)
3411  RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3412 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3413  RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3414 */
3415  dstBlock+=8;
3416  srcBlock+=8;
3417  }
      /* copy the first copyAhead processed lines (tempDst line 9 onward) to
       * dst; a single linecpy when the lines are contiguous, otherwise line
       * by line */
3418  if(width==FFABS(dstStride))
3419  linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
3420  else{
3421  int i;
3422  for(i=0; i<copyAhead; i++){
3423  memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
3424  }
3425  }
3426  }
3427
      /* main loop: one row of blocks per iteration */
3428  for(y=0; y<height; y+=BLOCK_SIZE){
3429  //1% speedup if these are here instead of the inner loop
3430  const uint8_t *srcBlock= &(src[y*srcStride]);
3431  uint8_t *dstBlock= &(dst[y*dstStride]);
3432 #if TEMPLATE_PP_MMX
      /* two adjacent 8-byte-wide scratch areas used with a stride of 16 by
       * transpose1/transpose2 and the horizontal filters below; they are
       * swapped after every block */
3433  uint8_t *tempBlock1= c.tempBlocks;
3434  uint8_t *tempBlock2= c.tempBlocks + 8;
3435 #endif
3436  const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
3437  int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
3438  int QP=0;
3439  /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3440  if not than use a temporary buffer */
3441  if(y+15 >= height){
3442  int i;
3443  /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3444  blockcopy to dst later */
3445  linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3446  FFMAX(height-y-copyAhead, 0), srcStride);
3447
3448  /* duplicate last line of src to fill the void up to line (copyAhead+7) */
3449  for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
3450  memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
3451
3452  /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3453  linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
3454
3455  /* duplicate last line of dst to fill the void up to line (copyAhead) */
3456  for(i=height-y+1; i<=copyAhead; i++)
3457  memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
3458
      /* from here on this row works on the temporary buffers; the result is
       * copied back to dst after the x loop */
3459  dstBlock= tempDst + dstStride;
3460  srcBlock= tempSrc;
3461  }
3462
3463  // From this point on it is guaranteed that we can read and write 16 lines downward
3464  // finish 1 block before the next otherwise we might have a problem
3465  // with the L1 Cache of the P4 ... or only a few blocks at a time or something
3466  for(x=0; x<width; x+=BLOCK_SIZE){
3467  const int stride= dstStride;
3468 #if TEMPLATE_PP_MMX
3469  uint8_t *tmpXchg;
3470 #endif
      /* Fetch the quantizer for this block. For luma it is additionally
       * multiplied by QPCorrecture (16 fractional bits, 256*128 added for
       * rounding) and one source sample per block (line 12, column 4) is
       * added to the histogram. */
3471  if(isColor){
3472  QP= QPptr[x>>qpHShift];
3473  c.nonBQP= nonBQPptr[x>>qpHShift];
3474  }else{
3475  QP= QPptr[x>>4];
3476  QP= (QP* QPCorrecture + 256*128)>>16;
3477  c.nonBQP= nonBQPptr[x>>4];
3478  c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
3479  yHistogram[ srcBlock[srcStride*12 + 4] ]++;
3480  }
3481  c.QP= QP;
3482 #if TEMPLATE_PP_MMX
      /* store QP replicated into all 8 bytes of c.pQPb */
3483  __asm__ volatile(
3484  "movd %1, %%mm7 \n\t"
3485  "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3486  "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3487  "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
3488  "movq %%mm7, %0 \n\t"
3489  : "=m" (c.pQPb)
3490  : "r" (QP)
3491  );
3492 #endif
3493
3494
3495 #if TEMPLATE_PP_MMXEXT
3496 /*
3497  prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3498  prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3499  prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3500  prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3501 */
3502
      /* same prefetch sequence as in the first-row pass above */
3503  __asm__(
3504  "mov %4, %%"REG_a" \n\t"
3505  "shr $2, %%"REG_a" \n\t"
3506  "and $6, %%"REG_a" \n\t"
3507  "add %5, %%"REG_a" \n\t"
3508  "mov %%"REG_a", %%"REG_d" \n\t"
3509  "imul %1, %%"REG_a" \n\t"
3510  "imul %3, %%"REG_d" \n\t"
3511  "prefetchnta 32(%%"REG_a", %0) \n\t"
3512  "prefetcht0 32(%%"REG_d", %2) \n\t"
3513  "add %1, %%"REG_a" \n\t"
3514  "add %3, %%"REG_d" \n\t"
3515  "prefetchnta 32(%%"REG_a", %0) \n\t"
3516  "prefetcht0 32(%%"REG_d", %2) \n\t"
3517  :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
3518  "g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
3519  : "%"REG_a, "%"REG_d
3520  );
3521
3522 #elif TEMPLATE_PP_3DNOW
3523 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
3524 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3525  prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3526  prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3527  prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3528 */
3529 #endif
3530
      /* copy the 8 lines lying copyAhead lines below this block from src to dst */
3531  RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
3532  srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3533
      /* at most one deinterlacer runs; the first matching mode bit wins */
3534  if(mode & LINEAR_IPOL_DEINT_FILTER)
3535  RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3536  else if(mode & LINEAR_BLEND_DEINT_FILTER)
3537  RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3538  else if(mode & MEDIAN_DEINT_FILTER)
3539  RENAME(deInterlaceMedian)(dstBlock, dstStride);
3540  else if(mode & CUBIC_IPOL_DEINT_FILTER)
3541  RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3542  else if(mode & FFMPEG_DEINT_FILTER)
3543  RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3544  else if(mode & LOWPASS5_DEINT_FILTER)
3545  RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3546 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3547  RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3548 */
3549
3550  /* only deblock if we have 2 blocks */
3551  if(y + 8 < height){
3552  if(mode & V_X1_FILTER)
3553  RENAME(vertX1Filter)(dstBlock, stride, &c);
3554  else if(mode & V_DEBLOCK){
      /* vertClassify's result selects the filter: 1 -> low pass,
       * 2 -> default filter, anything else -> no filtering */
3555  const int t= RENAME(vertClassify)(dstBlock, stride, &c);
3556
3557  if(t==1)
3558  RENAME(doVertLowPass)(dstBlock, stride, &c);
3559  else if(t==2)
3560  RENAME(doVertDefFilter)(dstBlock, stride, &c);
3561  }else if(mode & V_A_DEBLOCK){
3562  RENAME(do_a_deblock)(dstBlock, stride, 1, &c);
3563  }
3564  }
3565
      /* MMX builds do the horizontal filtering by running the vertical
       * filters (stride 16) on a transposed copy of the block held in
       * tempBlock1/tempBlock2 and transposing the result back afterwards */
3566 #if TEMPLATE_PP_MMX
3567  RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
3568 #endif
3569  /* check if we have a previous block to deblock it with dstBlock */
3570  if(x - 8 >= 0){
3571 #if TEMPLATE_PP_MMX
3572  if(mode & H_X1_FILTER)
3573  RENAME(vertX1Filter)(tempBlock1, 16, &c);
3574  else if(mode & H_DEBLOCK){
3575 //START_TIMER
3576  const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
3577 //STOP_TIMER("dc & minmax")
3578  if(t==1)
3579  RENAME(doVertLowPass)(tempBlock1, 16, &c);
3580  else if(t==2)
3581  RENAME(doVertDefFilter)(tempBlock1, 16, &c);
3582  }else if(mode & H_A_DEBLOCK){
3583  RENAME(do_a_deblock)(tempBlock1, 16, 1, &c);
3584  }
3585
3586  RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3587
3588 #else
      /* non-MMX builds filter across the block edge directly in dst */
3589  if(mode & H_X1_FILTER)
3590  horizX1Filter(dstBlock-4, stride, QP);
3591  else if(mode & H_DEBLOCK){
3592 #if TEMPLATE_PP_ALTIVEC
3593  DECLARE_ALIGNED(16, unsigned char, tempBlock)[272];
3594  int t;
3595  transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
3596
3597  t = vertClassify_altivec(tempBlock-48, 16, &c);
3598  if(t==1) {
3599  doVertLowPass_altivec(tempBlock-48, 16, &c);
3600  transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3601  }
3602  else if(t==2) {
3603  doVertDefFilter_altivec(tempBlock-48, 16, &c);
3604  transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3605  }
3606 #else
3607  const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
3608
3609  if(t==1)
3610  RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
3611  else if(t==2)
3612  RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
3613 #endif
3614  }else if(mode & H_A_DEBLOCK){
3615  RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c);
3616  }
3617 #endif //TEMPLATE_PP_MMX
      /* dering and temporal noise reduction are applied to the previous
       * block (dstBlock-8), whose right edge has just been deblocked */
3618  if(mode & DERING){
3619  //FIXME filter first line
3620  if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
3621  }
3622
3623  if(mode & TEMP_NOISE_FILTER)
3624  {
3625  RENAME(tempNoiseReducer)(dstBlock-8, stride,
3626  c.tempBlurred[isColor] + y*dstStride + x,
3627  c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
3628  c.ppMode.maxTmpNoise);
3629  }
3630  }
3631
3632  dstBlock+=8;
3633  srcBlock+=8;
3634
3635 #if TEMPLATE_PP_MMX
3636  tmpXchg= tempBlock1;
3637  tempBlock1= tempBlock2;
3638  tempBlock2 = tmpXchg;
3639 #endif
3640  }
3641
      /* The x loop only derings / noise-filters the block one step behind,
       * so the last block of the row is handled here; dstBlock and x have
       * already been advanced past it. */
3642  if(mode & DERING){
3643  if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
3644  }
3645
3646  if((mode & TEMP_NOISE_FILTER)){
3647  RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3648  c.tempBlurred[isColor] + y*dstStride + x,
3649  c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
3650  c.ppMode.maxTmpNoise);
3651  }
3652
3653  /* did we use a tmp buffer for the last lines*/
3654  if(y+15 >= height){
      /* copy the remaining height-y lines from tempDst (starting at its
       * line 1) back into dst */
3655  uint8_t *dstBlock= &(dst[y*dstStride]);
3656  if(width==FFABS(dstStride))
3657  linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
3658  else{
3659  int i;
3660  for(i=0; i<height-y; i++){
3661  memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
3662  }
3663  }
3664  }
3665 /*
3666  for(x=0; x<width; x+=32){
3667  volatile int i;
3668  i+= dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
3669  + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
3670  + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
3671  + dstBlock[x +13*dstStride]
3672  + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
3673  }*/
3674  }
      /* leave MMX state so that x87 floating point code can run again */
3675 #if TEMPLATE_PP_3DNOW
3676  __asm__ volatile("femms");
3677 #elif TEMPLATE_PP_MMX
3678  __asm__ volatile("emms");
3679 #endif
3680
      /* Debug aid: adds 128 to dst samples to draw the luma histogram
       * (one histogram bin per line) and dotted lines at rows white and
       * black. */
3681 #ifdef DEBUG_BRIGHTNESS
3682  if(!isColor){
3683  int max=1;
3684  int i;
3685  for(i=0; i<256; i++)
3686  if(yHistogram[i] > max) max=yHistogram[i];
3687
3688  for(i=1; i<256; i++){
3689  int x;
3690  int start=yHistogram[i-1]/(max/256+1);
3691  int end=yHistogram[i]/(max/256+1);
3692  int inc= end > start ? 1 : -1;
3693  for(x=start; x!=end+inc; x+=inc)
3694  dst[ i*dstStride + x]+=128;
3695  }
3696
3697  for(i=0; i<100; i+=2){
3698  dst[ (white)*dstStride + i]+=128;
3699  dst[ (black)*dstStride + i]+=128;
3700  }
3701  }
3702 #endif
3703
3704  *c2= c; //copy local context back
3705
3706 }
3707 
/* End of the template: undefine RENAME and every TEMPLATE_PP_* selector so
 * that this file can be included again with a different TEMPLATE_PP_*
 * defined. */
3708 #undef RENAME
3709 #undef TEMPLATE_PP_C
3710 #undef TEMPLATE_PP_ALTIVEC
3711 #undef TEMPLATE_PP_MMX
3712 #undef TEMPLATE_PP_MMXEXT
3713 #undef TEMPLATE_PP_3DNOW
3714 #undef TEMPLATE_PP_SSE2
Definition: start.py:1
static void RENAME() deInterlaceL5(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
Deinterlace the given block by filtering every line with a (-1 2 6 2 -1) filter.
#define MANGLE(a)
#define QP(qP, depth)
Definition: h264_ps.c:62
float v
const char * s
Definition: avisynth_c.h:668
static int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
FIXME Range Coding of cr are ref
Definition: snow.txt:367
#define c2
Definition: idct_sh4.c:27
if max(w)>1 w=0.9 *w/max(w)
Sinusoidal phase f
#define DECLARE_ALIGNED(n, t, v)
Definition: mem.h:59
static void RENAME() duplicate(uint8_t src[], int stride)
Duplicate the given 8 src pixels ? times upward.
#define TEMPLATE_PP_SSE2
#define BLOCK_SIZE
Definition: adx.h:53
static void RENAME() doVertLowPass(uint8_t *src, int stride, PPContext *c)
Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) using the...
int stride
Definition: mace.c:144
static void RENAME() deInterlaceFF(uint8_t src[], int stride, uint8_t *tmp)
Deinterlace the given block by filtering every second line with a (-1 4 2 4 -1) filter.
static void transpose_16x8_char_toPackedAlign_altivec(unsigned char *dst, unsigned char *src, int stride)
#define LINEAR_BLEND_DEINT_FILTER
#define H_X1_FILTER
set threshold d
uint8_t
mode
Definition: f_perms.c:27
#define b
Definition: input.c:42
end end
static void transpose_8x16_char_fromPackedAlign_altivec(unsigned char *dst, unsigned char *src, int stride)
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But first
static void RENAME() deInterlaceBlendLinear(uint8_t src[], int stride, uint8_t *tmp)
Deinterlace the given block by filtering all lines with a (1 2 1) filter.
#define LOWPASS5_DEINT_FILTER
static void horizX1Filter(uint8_t *src, int stride, int QP)
Experimental Filter 1 (Horizontal) will not damage linear gradients Flat blocks should look like they...
Definition: postprocess.c:381
Discrete Time axis x
#define DERING
static void RENAME() deInterlaceMedian(uint8_t src[], int stride)
Deinterlace the given block by applying a median filter to every second line.
#define H_A_DEBLOCK
static void linecpy(void *dest, const void *src, int lines, int stride)
#define MEDIAN_DEINT_FILTER
#define CUBIC_IPOL_DEINT_FILTER
static void RENAME() dering(uint8_t src[], int stride, PPContext *c)
#define t1
Definition: regdef.h:29
void av_log(void *avcl, int level, const char *fmt,...)
Definition: log.c:246
#define V_A_DEBLOCK
#define t3
Definition: regdef.h:31
static const uint8_t offset[127][2]
Definition: vf_spp.c:70
#define FFMAX(a, b)
Definition: common.h:56
static void RENAME() deInterlaceInterpolateLinear(uint8_t src[], int stride)
Deinterlace the given block by linearly interpolating every second line.
#define V_DEBLOCK
#define FFMIN(a, b)
Definition: common.h:58
#define LINEAR_IPOL_DEINT_FILTER
#define FFSIGN(a)
Definition: common.h:54
t
Definition: genspecsines3.m:6
#define H_DEBLOCK
static void RENAME() blockCopy(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride, int levelFix, int64_t *packedOffsetAndScale)
Copy a block from src to dst and fixes the blacklevel.
#define XMM_CLOBBERS(...)
#define FFMPEG_DEINT_FILTER
#define FFABS(a)
Definition: common.h:53
static void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c)
#define diff(a, as, b, bs)
Definition: vf_phase.c:80
static void RENAME() deInterlaceInterpolateCubic(uint8_t src[], int stride)
Deinterlace the given block by cubically interpolating every second line.
static int width
Definition: tests/utils.c:158
AVS_Value src
Definition: avisynth_c.h:523
static void RENAME() postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c)
Filter array of bytes (Y or U or V values)
static void RENAME() vertX1Filter(uint8_t *src, int stride, PPContext *co)
Experimental Filter 1: will not damage linear gradients. Flat blocks should look like they were passed ...
#define RENAME(a)
Definition: mpegaudiodec.c:109
BYTE int const BYTE int int int height
Definition: avisynth_c.h:713
synthesis window for stochastic i
#define V_X1_FILTER
Definition: huffyuv.h:56
static void RENAME() doVertDefFilter(uint8_t src[], int stride, PPContext *c)
#define QP_STORE_T
Definition: postprocess.h:54
static void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
static double c[64]
Same thing on a dB scale
function y
Definition: D.m:1
#define PAVGB(a, b)
static void RENAME() tempNoiseReducer(uint8_t *src, int stride, uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise)
int x86_reg
#define TEMP_NOISE_FILTER
#define avg(d, s)
Definition: dsputil_align.c:52
else dst[i][x+y *dst_stride[i]]
Definition: vf_mcdeint.c:160
#define AV_LOG_INFO
Definition: log.h:156
#define av_always_inline
Definition: attributes.h:41
postprocess context.
#define LEVEL_FIX
Brightness & Contrast.
float min
#define t2
Definition: regdef.h:30
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try. But a word about which is also called distortion. Distortion can be quantified by almost any quality measurement one chooses; the sum of squared differences is used, but more complex methods that consider psychovisual effects can be used as well. It makes no difference in this discussion. First step
#define CLIP(a)