vf_fspp.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
3  * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
4  *
5  * This file is part of MPlayer.
6  *
7  * MPlayer is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * MPlayer is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License along
18  * with MPlayer; if not, write to the Free Software Foundation, Inc.,
19  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20  */
21 
22 /*
23  * This implementation is based on an algorithm described in
24  * "Aria Nosratinia Embedded Post-Processing for
25  * Enhancement of Compressed Images (1999)"
26  * (http://citeseer.nj.nec.com/nosratinia99embedded.html)
27  * Further, with splitting (i)dct into hor/ver passes, one of them can be
28  * performed once per block, not pixel. This allows for much better speed.
29  */
30 
31 /*
32  Heavily optimized version of SPP filter by Nikolaj
33  */
34 
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <inttypes.h>
39 #include <math.h>
40 
41 #include "config.h"
42 
43 #include "mp_msg.h"
44 #include "cpudetect.h"
45 #include "img_format.h"
46 #include "mp_image.h"
47 #include "vf.h"
48 #include "av_helpers.h"
49 #include "libvo/fastmemcpy.h"
50 
51 #include "libavutil/internal.h"
52 #include "libavutil/intreadwrite.h"
53 #include "libavutil/mem.h"
54 #include "libavutil/x86/asm.h"
55 #include "libavcodec/avcodec.h"
56 #include "libavcodec/dsputil.h"
57 
58 #undef free
59 #undef malloc
60 
61 //===========================================================================//
62 #define BLOCKSZ 12
63 
/*
 * Base threshold table, 64 entries read by vf_open() as 8 rows of 8.
 * The values (up to 296) can't be too high: that causes too big a quant
 * dependence, or maybe overflow (to be checked), which results in some
 * flashing.
 */
static const short custom_threshold[64] = {
     71, 296, 295, 237,  71,  40,  38,  19,
    245, 193, 185, 121, 102,  73,  53,  27,
    158, 129, 141, 107,  97,  73,  50,  26,
    102, 116, 109,  98,  82,  66,  45,  23,
     71,  94,  95,  81,  70,  56,  38,  20,
     56,  77,  74,  66,  56,  44,  30,  15,
     38,  53,  50,  45,  38,  30,  21,  11,
     20,  27,  26,  23,  20,  15,  11,   5
};
77 
/*
 * 8x8 dither matrix with entries in the range 0..63, aligned to 32 bytes.
 * The store_slice functions index it by output row and add the entries
 * (shifted right by log2_scale) before the final down-shift.
 */
static const uint8_t __attribute__((aligned(32))) dither[8][8] = {
    {  0, 48, 12, 60,  3, 51, 15, 63 },
    { 32, 16, 44, 28, 35, 19, 47, 31 },
    {  8, 56,  4, 52, 11, 59,  7, 55 },
    { 40, 24, 36, 20, 43, 27, 39, 23 },
    {  2, 50, 14, 62,  1, 49, 13, 61 },
    { 34, 18, 46, 30, 33, 17, 45, 29 },
    { 10, 58,  6, 54,  9, 57,  5, 53 },
    { 42, 26, 38, 22, 41, 25, 37, 21 }
};
88 
// Per-instance state of the fspp filter.
// NOTE(review): the listing's embedded numbering skips 93-94 and 98 inside this struct.
// Other code in this file uses p->log2_count, p->temp_stride and p->src, so those members
// are presumably declared on the skipped lines - confirm against the real file.
89 struct vf_priv_s { //align 16 !
90  uint64_t threshold_mtx_noq[8*2]; // threshold words before multiplication by the quantizer (filled in vf_open)
91  uint64_t threshold_mtx[8*2];//used in both C & MMX (& later SSE2) versions
92 
95  int qp; // non-zero: fixed quantizer parsed from the filter arguments; 0: use the per-block qp table
96  int mpeg2; // copied from mpi->qscale_type in put_image and passed to norm_qscale
97  int prev_q; // quantizer for which threshold_mtx was last computed
99  int16_t *temp; // int16 work buffer allocated in config and freed in uninit
100  int bframes; // fourth filter argument; when non-zero put_image uses mpi->qscale directly
101  char *non_b_qp; // copy of the qscale table saved from frames whose pict_type != 3
102 };
103 
104 
105 #if !HAVE_MMX
106 
107 //This func reads from 1 slice, 1 and clears 0 & 1
108 static void store_slice_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
109 {int y, x;
110 #define STORE(pos) \
111  temp= (src[x + pos] + (d[pos]>>log2_scale))>>(6-log2_scale); \
112  src[x + pos]=src[x + pos - 8*src_stride]=0; \
113  if(temp & 0x100) temp= ~(temp>>31); \
114  dst[x + pos]= temp;
115 
116  for(y=0; y<height; y++){
117  const uint8_t *d= dither[y];
118  for(x=0; x<width; x+=8){
119  int temp;
120  STORE(0);
121  STORE(1);
122  STORE(2);
123  STORE(3);
124  STORE(4);
125  STORE(5);
126  STORE(6);
127  STORE(7);
128  }
129  src+=src_stride;
130  dst+=dst_stride;
131  }
132 }
133 
134 //This func reads from 2 slices, 0 & 2 and clears 2-nd
135 static void store_slice2_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
136 {int y, x;
137 #define STORE2(pos) \
138  temp= (src[x + pos] + src[x + pos + 16*src_stride] + (d[pos]>>log2_scale))>>(6-log2_scale); \
139  src[x + pos + 16*src_stride]=0; \
140  if(temp & 0x100) temp= ~(temp>>31); \
141  dst[x + pos]= temp;
142 
143  for(y=0; y<height; y++){
144  const uint8_t *d= dither[y];
145  for(x=0; x<width; x+=8){
146  int temp;
147  STORE2(0);
148  STORE2(1);
149  STORE2(2);
150  STORE2(3);
151  STORE2(4);
152  STORE2(5);
153  STORE2(6);
154  STORE2(7);
155  }
156  src+=src_stride;
157  dst+=dst_stride;
158  }
159 }
160 
161 static void mul_thrmat_c(struct vf_priv_s *p,int q)
162 {
163  int a;
164  for(a=0;a<64;a++)
165  ((short*)p->threshold_mtx)[a]=q * ((short*)p->threshold_mtx_noq)[a];//ints faster in C
166 }
167 
// Forward declarations of the C DCT helpers (this branch is compiled when HAVE_MMX is 0).
// column_fidct_c is defined later in this file; row_idct_c and row_fdct_c are not
// defined in the portion of the file visible here.
168 static void column_fidct_c(int16_t* thr_adr, int16_t *data, int16_t *output, int cnt);
169 static void row_idct_c(int16_t* workspace,
170  int16_t* output_adr, int output_stride, int cnt);
171 static void row_fdct_c(int16_t *data, const uint8_t *pixels, int line_size, int cnt);
172 
173 //this is rather ugly, but there is no need for function pointers
// The *_s names are what the rest of the file calls; here they resolve to the C variants.
174 #define store_slice_s store_slice_c
175 #define store_slice2_s store_slice2_c
176 #define mul_thrmat_s mul_thrmat_c
177 #define column_fidct_s column_fidct_c
178 #define row_idct_s row_idct_c
179 #define row_fdct_s row_fdct_c
180 
181 #else /* HAVE_MMX */
182 
// MMX version of store_slice: per output row it adds the dither row (shifted right by
// log2_scale) to 8 int16 samples at a time, shifts right by (6 - log2_scale), packs to
// unsigned bytes with saturation and stores them to dst. The samples read and the ones
// 8 rows above them in src are overwritten with zeros.
// width is rounded up to a multiple of 8 before use.
183 //This func reads from 1 slice, 1 and clears 0 & 1
184 static void store_slice_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
185 {
186  const uint8_t *od=&dither[0][0];
187  const uint8_t *end=&dither[height][0];
188  width = (width+7)&~7;
189  dst_stride-=width;
190  //src_stride=(src_stride-width)*2;
// NOTE(review): the asm stores into operand %1 (src_stride), which is declared as an input,
// and declares neither a "memory" clobber nor the MMX registers it uses - confirm this is
// safe with the compilers in use.
191  __asm__ volatile(
// Setup: REG_d = log2_scale, REG_S = src, REG_D = dst, REG_a = src_stride.
192  "mov %5, %%"REG_d" \n\t"
193  "mov %6, %%"REG_S" \n\t"
194  "mov %7, %%"REG_D" \n\t"
195  "mov %1, %%"REG_a" \n\t"
// mm5 = log2_scale (dither shift); ~log2_scale + 7 = 6 - log2_scale goes to mm2 (final shift).
196  "movd %%"REG_d", %%mm5 \n\t"
197  "xor $-1, %%"REG_d" \n\t"
198  "mov %%"REG_a", %%"REG_c" \n\t"
199  "add $7, %%"REG_d" \n\t"
200  "neg %%"REG_a" \n\t"
// REG_c = (src_stride - width) * 2: byte step from the end of one src row to the next; saved in %1.
201  "sub %0, %%"REG_c" \n\t"
202  "add %%"REG_c", %%"REG_c" \n\t"
203  "movd %%"REG_d", %%mm2 \n\t"
204  "mov %%"REG_c", %1 \n\t"
205  "mov %2, %%"REG_d" \n\t"
// REG_a = -src_stride * 16 bytes, i.e. 8 rows of int16 above the current position.
206  "shl $4, %%"REG_a" \n\t"
207 
// Outer loop (label 2): one output row; load 8 dither bytes and widen them to words.
208  "2: \n\t"
209  "movq (%%"REG_d"), %%mm3 \n\t"
210  "movq %%mm3, %%mm4 \n\t"
211  "pxor %%mm7, %%mm7 \n\t"
212  "punpcklbw %%mm7, %%mm3 \n\t"
213  "punpckhbw %%mm7, %%mm4 \n\t"
214  "mov %0, %%"REG_c" \n\t"
215  "psraw %%mm5, %%mm3 \n\t"
216  "psraw %%mm5, %%mm4 \n\t"
// Inner loop (label 1): 8 samples per iteration.
217  "1: \n\t"
218  "movq %%mm7, (%%"REG_S",%%"REG_a",) \n\t"
219  "movq (%%"REG_S"), %%mm0 \n\t"
220  "movq 8(%%"REG_S"), %%mm1 \n\t"
221 
222  "movq %%mm7, 8(%%"REG_S",%%"REG_a",) \n\t"
223  "paddw %%mm3, %%mm0 \n\t"
224  "paddw %%mm4, %%mm1 \n\t"
225 
226  "movq %%mm7, (%%"REG_S") \n\t"
227  "psraw %%mm2, %%mm0 \n\t"
228  "psraw %%mm2, %%mm1 \n\t"
229 
230  "movq %%mm7, 8(%%"REG_S") \n\t"
231  "packuswb %%mm1, %%mm0 \n\t"
232  "add $16, %%"REG_S" \n\t"
233 
234  "movq %%mm0, (%%"REG_D") \n\t"
235  "add $8, %%"REG_D" \n\t"
236  "sub $8, %%"REG_c" \n\t"
237  "jg 1b \n\t"
// Advance src, the dither row and dst to the next row; stop when the dither pointer reaches `end`.
238  "add %1, %%"REG_S" \n\t"
239  "add $8, %%"REG_d" \n\t"
240  "add %3, %%"REG_D" \n\t"
241  "cmp %4, %%"REG_d" \n\t"
242  "jl 2b \n\t"
243 
244  :
245  : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
246  "m" (log2_scale), "m" (src), "m" (dst) //input
247  : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
248  );
249 }
250 
// MMX version of store_slice2: per output row it sums 8 int16 samples at src with the
// samples 16 rows below them, adds the dither row (shifted right by log2_scale), shifts
// right by (6 - log2_scale), packs to unsigned bytes with saturation and stores to dst.
// Only the samples 16 rows below are overwritten with zeros.
// width is rounded up to a multiple of 8 before use.
251 //This func reads from 2 slices, 0 & 2 and clears 2-nd
252 static void store_slice2_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
253 {
254  const uint8_t *od=&dither[0][0];
255  const uint8_t *end=&dither[height][0];
256  width = (width+7)&~7;
257  dst_stride-=width;
258  //src_stride=(src_stride-width)*2;
// NOTE(review): the asm stores into operand %1 (src_stride), which is declared as an input,
// and declares neither a "memory" clobber nor the MMX registers it uses - confirm this is
// safe with the compilers in use.
259  __asm__ volatile(
// Setup: REG_d = log2_scale, REG_S = src, REG_D = dst, REG_a = src_stride.
260  "mov %5, %%"REG_d" \n\t"
261  "mov %6, %%"REG_S" \n\t"
262  "mov %7, %%"REG_D" \n\t"
263  "mov %1, %%"REG_a" \n\t"
// mm5 = log2_scale (dither shift); ~log2_scale + 7 = 6 - log2_scale goes to mm2 (final shift).
264  "movd %%"REG_d", %%mm5 \n\t"
265  "xor $-1, %%"REG_d" \n\t"
266  "mov %%"REG_a", %%"REG_c" \n\t"
267  "add $7, %%"REG_d" \n\t"
// REG_c = (src_stride - width) * 2: byte step from the end of one src row to the next; saved in %1.
268  "sub %0, %%"REG_c" \n\t"
269  "add %%"REG_c", %%"REG_c" \n\t"
270  "movd %%"REG_d", %%mm2 \n\t"
271  "mov %%"REG_c", %1 \n\t"
272  "mov %2, %%"REG_d" \n\t"
// REG_a = src_stride * 32 bytes, i.e. 16 rows of int16 below the current position.
273  "shl $5, %%"REG_a" \n\t"
274 
// Outer loop (label 2): one output row; load 8 dither bytes and widen them to words.
275  "2: \n\t"
276  "movq (%%"REG_d"), %%mm3 \n\t"
277  "movq %%mm3, %%mm4 \n\t"
278  "pxor %%mm7, %%mm7 \n\t"
279  "punpcklbw %%mm7, %%mm3 \n\t"
280  "punpckhbw %%mm7, %%mm4 \n\t"
281  "mov %0, %%"REG_c" \n\t"
282  "psraw %%mm5, %%mm3 \n\t"
283  "psraw %%mm5, %%mm4 \n\t"
// Inner loop (label 1): 8 samples per iteration.
284  "1: \n\t"
285  "movq (%%"REG_S"), %%mm0 \n\t"
286  "movq 8(%%"REG_S"), %%mm1 \n\t"
287  "paddw %%mm3, %%mm0 \n\t"
288 
289  "paddw (%%"REG_S",%%"REG_a",), %%mm0 \n\t"
290  "paddw %%mm4, %%mm1 \n\t"
291  "movq 8(%%"REG_S",%%"REG_a",), %%mm6 \n\t"
292 
293  "movq %%mm7, (%%"REG_S",%%"REG_a",) \n\t"
294  "psraw %%mm2, %%mm0 \n\t"
295  "paddw %%mm6, %%mm1 \n\t"
296 
297  "movq %%mm7, 8(%%"REG_S",%%"REG_a",) \n\t"
298  "psraw %%mm2, %%mm1 \n\t"
299  "packuswb %%mm1, %%mm0 \n\t"
300 
301  "movq %%mm0, (%%"REG_D") \n\t"
302  "add $16, %%"REG_S" \n\t"
303  "add $8, %%"REG_D" \n\t"
304  "sub $8, %%"REG_c" \n\t"
305  "jg 1b \n\t"
// Advance src, the dither row and dst to the next row; stop when the dither pointer reaches `end`.
306  "add %1, %%"REG_S" \n\t"
307  "add $8, %%"REG_d" \n\t"
308  "add %3, %%"REG_D" \n\t"
309  "cmp %4, %%"REG_d" \n\t"
310  "jl 2b \n\t"
311 
312  :
313  : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
314  "m" (log2_scale), "m" (src), "m" (dst) //input
315  : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_D, "%"REG_S
316  );
317 }
318 
// MMX version of mul_thrmat: multiplies the 64 16-bit words of p->threshold_mtx_noq by q
// (low 16 bits of each product, pmullw) and stores the 16 resulting quadwords 128 bytes
// past the source address. That destination is p->threshold_mtx only because the struct
// declares threshold_mtx directly after the 16-quadword threshold_mtx_noq array.
// NOTE(review): no "memory" clobber and no MMX register clobbers are declared - confirm.
319 static void mul_thrmat_mmx(struct vf_priv_s *p, int q)
320 {
321  uint64_t *adr=&p->threshold_mtx_noq[0];
322  __asm__ volatile(
// mm7 = q replicated into all four 16-bit lanes; REG_D = source address + 8*8*2 bytes.
323  "movd %0, %%mm7 \n\t"
324  "add $8*8*2, %%"REG_D" \n\t"
325  "movq 0*8(%%"REG_S"), %%mm0 \n\t"
326  "punpcklwd %%mm7, %%mm7 \n\t"
327  "movq 1*8(%%"REG_S"), %%mm1 \n\t"
328  "punpckldq %%mm7, %%mm7 \n\t"
329  "pmullw %%mm7, %%mm0 \n\t"
330 
// The remainder interleaves load / multiply / store for quadwords 0..15.
331  "movq 2*8(%%"REG_S"), %%mm2 \n\t"
332  "pmullw %%mm7, %%mm1 \n\t"
333 
334  "movq 3*8(%%"REG_S"), %%mm3 \n\t"
335  "pmullw %%mm7, %%mm2 \n\t"
336 
337  "movq %%mm0, 0*8(%%"REG_D") \n\t"
338  "movq 4*8(%%"REG_S"), %%mm4 \n\t"
339  "pmullw %%mm7, %%mm3 \n\t"
340 
341  "movq %%mm1, 1*8(%%"REG_D") \n\t"
342  "movq 5*8(%%"REG_S"), %%mm5 \n\t"
343  "pmullw %%mm7, %%mm4 \n\t"
344 
345  "movq %%mm2, 2*8(%%"REG_D") \n\t"
346  "movq 6*8(%%"REG_S"), %%mm6 \n\t"
347  "pmullw %%mm7, %%mm5 \n\t"
348 
349  "movq %%mm3, 3*8(%%"REG_D") \n\t"
350  "movq 7*8+0*8(%%"REG_S"), %%mm0 \n\t"
351  "pmullw %%mm7, %%mm6 \n\t"
352 
353  "movq %%mm4, 4*8(%%"REG_D") \n\t"
354  "movq 7*8+1*8(%%"REG_S"), %%mm1 \n\t"
355  "pmullw %%mm7, %%mm0 \n\t"
356 
357  "movq %%mm5, 5*8(%%"REG_D") \n\t"
358  "movq 7*8+2*8(%%"REG_S"), %%mm2 \n\t"
359  "pmullw %%mm7, %%mm1 \n\t"
360 
361  "movq %%mm6, 6*8(%%"REG_D") \n\t"
362  "movq 7*8+3*8(%%"REG_S"), %%mm3 \n\t"
363  "pmullw %%mm7, %%mm2 \n\t"
364 
365  "movq %%mm0, 7*8+0*8(%%"REG_D") \n\t"
366  "movq 7*8+4*8(%%"REG_S"), %%mm4 \n\t"
367  "pmullw %%mm7, %%mm3 \n\t"
368 
369  "movq %%mm1, 7*8+1*8(%%"REG_D") \n\t"
370  "movq 7*8+5*8(%%"REG_S"), %%mm5 \n\t"
371  "pmullw %%mm7, %%mm4 \n\t"
372 
373  "movq %%mm2, 7*8+2*8(%%"REG_D") \n\t"
374  "movq 7*8+6*8(%%"REG_S"), %%mm6 \n\t"
375  "pmullw %%mm7, %%mm5 \n\t"
376 
377  "movq %%mm3, 7*8+3*8(%%"REG_D") \n\t"
378  "movq 14*8+0*8(%%"REG_S"), %%mm0 \n\t"
379  "pmullw %%mm7, %%mm6 \n\t"
380 
381  "movq %%mm4, 7*8+4*8(%%"REG_D") \n\t"
382  "movq 14*8+1*8(%%"REG_S"), %%mm1 \n\t"
383  "pmullw %%mm7, %%mm0 \n\t"
384 
385  "movq %%mm5, 7*8+5*8(%%"REG_D") \n\t"
386  "pmullw %%mm7, %%mm1 \n\t"
387 
388  "movq %%mm6, 7*8+6*8(%%"REG_D") \n\t"
389  "movq %%mm0, 14*8+0*8(%%"REG_D") \n\t"
390  "movq %%mm1, 14*8+1*8(%%"REG_D") \n\t"
391 
// Both the source (S) and destination (D) registers start from the same address `adr`.
392  : "+g" (q), "+S" (adr), "+D" (adr)
393  :
394  );
395 }
396 
// Forward declarations of the MMX DCT helpers (this branch is compiled when HAVE_MMX is non-zero).
// column_fidct_mmx is defined later in this file; the other two are not defined in the
// portion of the file visible here.
397 static void column_fidct_mmx(int16_t* thr_adr, int16_t *data, int16_t *output, int cnt);
398 static void row_idct_mmx(int16_t* workspace,
399  int16_t* output_adr, int output_stride, int cnt);
400 static void row_fdct_mmx(int16_t *data, const uint8_t *pixels, int line_size, int cnt);
401 
// The *_s names are what the rest of the file calls; here they resolve to the MMX variants.
402 #define store_slice_s store_slice_mmx
403 #define store_slice2_s store_slice2_mmx
404 #define mul_thrmat_s mul_thrmat_mmx
405 #define column_fidct_s column_fidct_mmx
406 #define row_idct_s row_idct_mmx
407 #define row_fdct_s row_fdct_mmx
408 #endif // HAVE_MMX
409 
// Filters one image plane.
// The plane is copied into p->src with an 8-pixel mirrored border on every side. Then, for
// every `step` rows, row_fdct_s / column_fidct_s / row_idct_s are run across the row and
// their output is accumulated in p->temp; completed groups of 8 rows are converted to
// bytes in dst by store_slice_s / store_slice2_s.
//   p                  filter state (thresholds, work buffers, log2_count, qp)
//   dst, dst_stride    output plane and its stride
//   src, src_stride    input plane and its stride
//   width, height      plane dimensions in pixels
//   qp_store, qp_stride  per-block quantizer table; only read when p->qp is 0
//   is_luma            selects the work-buffer stride and the qp coordinate shift
410 static void filter(struct vf_priv_s *p, uint8_t *dst, uint8_t *src,
411  int dst_stride, int src_stride,
412  int width, int height,
413  uint8_t *qp_store, int qp_stride, int is_luma)
414 {
415  int x, x0, y, es, qy, t;
416  const int stride= is_luma ? p->temp_stride : (width+16);//((width+16+15)&(~15))
417  const int step=6-p->log2_count;
// Pixel coordinates are shifted right by qps to index qp_store (4 for luma, 3 otherwise).
418  const int qps= 3 + is_luma;
// One aligned array split in two halves: `block` (forward DCT data) and `block3` (output of column_fidct_s).
419  int32_t __attribute__((aligned(32))) block_align[4*8*BLOCKSZ+ 4*8*BLOCKSZ];
420  int16_t *block= (int16_t *)block_align;
421  int16_t *block3=(int16_t *)(block_align+4*8*BLOCKSZ);
422 
// NOTE(review): the size argument is in bytes, so this clears only the first 4*8*BLOCKSZ
// bytes of block3, not the whole half - confirm that is all that needs to start at zero.
423  memset(block3, 0, 4*8*BLOCKSZ);
424 
425  //p->src=src-src_stride*8-8;//!
426  if (!src || !dst) return; // HACK avoid crash for Y8 colourspace
// Copy each row into p->src at offset (8,8) and mirror 8 pixels past the left and right edges.
427  for(y=0; y<height; y++){
428  int index= 8 + 8*stride + y*stride;
429  fast_memcpy(p->src + index, src + y*src_stride, width);//this line can be avoided by using DR & user fr.buffers
430  for(x=0; x<8; x++){
431  p->src[index - x - 1]= p->src[index + x ];
432  p->src[index + width + x ]= p->src[index + width - x - 1];
433  }
434  }
// Mirror 8 rows above the top edge and 8 rows below the bottom edge.
435  for(y=0; y<8; y++){
436  fast_memcpy(p->src + ( 7-y)*stride, p->src + ( y+8)*stride, stride);
437  fast_memcpy(p->src + (height+8+y)*stride, p->src + (height-y+7)*stride, stride);
438  }
439  //FIXME (try edge emu)
440 
// Zero rows 8..23 of the accumulation buffer (width entries starting at column 8).
441  for(y=8; y<24; y++)
442  memset(p->temp+ 8 +y*stride, 0,width*sizeof(int16_t));
443 
444  for(y=step; y<height+8; y+=step){ //step= 1,2
// qy: row offset into qp_store for this y, clamped to the picture.
445  qy=y-4;
446  if (qy>height-1) qy=height-1;
447  if (qy<0) qy=0;
448  qy=(qy>>qps)*qp_stride;
449  row_fdct_s(block, p->src + y*stride +2-(y&1), stride, 2);
// Process the row in chunks of 8*(BLOCKSZ-1) columns.
450  for(x0=0; x0<width+8-8*(BLOCKSZ-1); x0+=8*(BLOCKSZ-1)){
451  row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, 2*(BLOCKSZ-1));
// Fixed quantizer: the threshold matrix is already set up, so the whole chunk goes in one call.
452  if(p->qp)
453  column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+0*8, block3+0*8, 8*(BLOCKSZ-1)); //yes, this is a HOTSPOT
454  else
// Per-block quantizer: look up q for each 8 columns and rebuild the matrix when it changes.
455  for (x=0; x<8*(BLOCKSZ-1); x+=8) {
456  t=x+x0-2; //correct t=x+x0-2-(y&1), but its the same
457  if (t<0) t=0;//t always < width-2
458  t=qp_store[qy+(t>>qps)];
459  t=norm_qscale(t, p->mpeg2);
460  if (t!=p->prev_q) p->prev_q=t, mul_thrmat_s(p, t);
461  column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+x*8, block3+x*8, 8); //yes, this is a HOTSPOT
462  }
463  row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, 2*(BLOCKSZ-1));
// Carry the tail of both work arrays over to the start for the next chunk.
464  memmove(block, block+(BLOCKSZ-1)*64, 8*8*sizeof(int16_t)); //cycling
465  memmove(block3, block3+(BLOCKSZ-1)*64, 6*8*sizeof(int16_t));
466  }
467  //
// Remaining columns of the row (es of them, including the right border).
468  es=width+8-x0; // 8, ...
469  if (es>8)
470  row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, (es-4)>>2);
471  column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block, block3, es&(~1));
472  row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, es>>2);
// Whenever y1 reaches a non-zero multiple of 8, write out the 8 finished rows;
// odd and even multiples use the two different store functions.
473  {const int y1=y-8+step;//l5-7 l4-6
474  if (!(y1&7) && y1) {
475  if (y1&8) store_slice_s(dst + (y1-8)*dst_stride, p->temp+ 8 +8*stride,
476  dst_stride, stride, width, 8, 5-p->log2_count);
477  else store_slice2_s(dst + (y1-8)*dst_stride, p->temp+ 8 +0*stride,
478  dst_stride, stride, width, 8, 5-p->log2_count);
479  } }
480  }
481 
// Write out the last, partial group of rows (fewer than 8).
482  if (y&7) { // == height & 7
483  if (y&8) store_slice_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +8*stride,
484  dst_stride, stride, width, y&7, 5-p->log2_count);
485  else store_slice2_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +0*stride,
486  dst_stride, stride, width, y&7, 5-p->log2_count);
487  }
488 }
489 
490 static int config(struct vf_instance *vf,
491  int width, int height, int d_width, int d_height,
492  unsigned int flags, unsigned int outfmt)
493 {
494  int h= (height+16+15)&(~15);
495 
496  vf->priv->temp_stride= (width+16+15)&(~15);
497  vf->priv->temp= (int16_t*)av_mallocz(vf->priv->temp_stride*3*8*sizeof(int16_t));
498  //this can also be avoided, see above
499  vf->priv->src = (uint8_t*)av_malloc(vf->priv->temp_stride*h*sizeof(uint8_t));
500 
501  return ff_vf_next_config(vf,width,height,d_width,d_height,flags,outfmt);
502 }
503 
504 static void get_image(struct vf_instance *vf, mp_image_t *mpi)
505 {
506  if(mpi->flags&MP_IMGFLAG_PRESERVE) return; // don't change
507  // ok, we can do pp in-place (or pp disabled):
508  vf->dmpi=ff_vf_get_image(vf->next,mpi->imgfmt,
509  mpi->type, mpi->flags, mpi->width, mpi->height);
510  mpi->planes[0]=vf->dmpi->planes[0];
511  mpi->stride[0]=vf->dmpi->stride[0];
512  mpi->width=vf->dmpi->width;
513  if(mpi->flags&MP_IMGFLAG_PLANAR){
514  mpi->planes[1]=vf->dmpi->planes[1];
515  mpi->planes[2]=vf->dmpi->planes[2];
516  mpi->stride[1]=vf->dmpi->stride[1];
517  mpi->stride[2]=vf->dmpi->stride[2];
518  }
519  mpi->flags|=MP_IMGFLAG_DIRECT;
520 }
521 
// Processes one frame: picks the output image, remembers the quantizer table of frames
// whose pict_type is not 3, filters (or copies) the three planes and passes the result to
// the next filter. Returns the value of ff_vf_next_put_image().
522 static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts)
523 {
524  mp_image_t *dmpi;
525  if(!(mpi->flags&MP_IMGFLAG_DIRECT)){
526  // no DR, so get a new image! hope we'll get DR buffer:
// NOTE(review): the listing's embedded numbering jumps from 527 to 530 here, so argument
// lines of this call appear to be missing from this view - confirm against the real file.
527  dmpi=ff_vf_get_image(vf->next,mpi->imgfmt,
530  mpi->width,mpi->height);
531  ff_vf_clone_mpi_attributes(dmpi, mpi);
532  }else{
533  dmpi=vf->dmpi;
534  }
535 
536  vf->priv->mpeg2= mpi->qscale_type;
// Save a copy of the qscale table when no fixed qp is set and pict_type is not 3.
537  if(mpi->pict_type != 3 && mpi->qscale && !vf->priv->qp){
538  int w = mpi->qstride;
539  int h = (mpi->h + 15) >> 4;
540  if (!w) {
541  w = (mpi->w + 15) >> 4;
542  h = 1;
543  }
// NOTE(review): the malloc result is not checked before fast_memcpy writes to it, and the
// buffer is sized only on first use - confirm w*h cannot grow between frames.
544  if(!vf->priv->non_b_qp)
545  vf->priv->non_b_qp= malloc(w*h);
546  fast_memcpy(vf->priv->non_b_qp, mpi->qscale, w*h);
547  }
// Nothing is written when log2_count is 0 and the image is direct-rendered.
548  if(vf->priv->log2_count || !(mpi->flags&MP_IMGFLAG_DIRECT)){
549  char *qp_tab= vf->priv->non_b_qp;
550  if(vf->priv->bframes || !qp_tab)
551  qp_tab= mpi->qscale;
552 
// Filter when a qp source exists (table or fixed qp); otherwise copy the planes unchanged.
553  if(qp_tab || vf->priv->qp){
554  filter(vf->priv, dmpi->planes[0], mpi->planes[0], dmpi->stride[0], mpi->stride[0],
555  mpi->w, mpi->h, qp_tab, mpi->qstride, 1);
556  filter(vf->priv, dmpi->planes[1], mpi->planes[1], dmpi->stride[1], mpi->stride[1],
557  mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
558  filter(vf->priv, dmpi->planes[2], mpi->planes[2], dmpi->stride[2], mpi->stride[2],
559  mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
560  }else{
561  memcpy_pic(dmpi->planes[0], mpi->planes[0], mpi->w, mpi->h, dmpi->stride[0], mpi->stride[0]);
562  memcpy_pic(dmpi->planes[1], mpi->planes[1], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[1], mpi->stride[1]);
563  memcpy_pic(dmpi->planes[2], mpi->planes[2], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[2], mpi->stride[2]);
564  }
565  }
566 
// Leave MMX state (emms) and issue sfence after the filter functions, when the CPU has the feature.
567 #if HAVE_MMX
568  if(ff_gCpuCaps.hasMMX) __asm__ volatile ("emms\n\t");
569 #endif
570 #if HAVE_MMX2
571  if(ff_gCpuCaps.hasMMX2) __asm__ volatile ("sfence\n\t");
572 #endif
573  return ff_vf_next_put_image(vf,dmpi, pts);
574 }
575 
576 static void uninit(struct vf_instance *vf)
577 {
578  if(!vf->priv) return;
579 
580  av_free(vf->priv->temp);
581  vf->priv->temp= NULL;
582  av_free(vf->priv->src);
583  vf->priv->src= NULL;
584  //free(vf->priv->avctx);
585  //vf->priv->avctx= NULL;
586  free(vf->priv->non_b_qp);
587  vf->priv->non_b_qp= NULL;
588 
589  av_free(vf->priv);
590  vf->priv=NULL;
591 }
592 
593 //===========================================================================//
594 
595 static int query_format(struct vf_instance *vf, unsigned int fmt)
596 {
597  switch(fmt){
598  case IMGFMT_YVU9:
599  case IMGFMT_IF09:
600  case IMGFMT_YV12:
601  case IMGFMT_I420:
602  case IMGFMT_IYUV:
603  case IMGFMT_CLPL:
604  case IMGFMT_Y800:
605  case IMGFMT_Y8:
606  case IMGFMT_444P:
607  case IMGFMT_422P:
608  case IMGFMT_411P:
609  return ff_vf_next_query_format(vf,fmt);
610  }
611  return 0;
612 }
613 
// Handles filter control requests; anything not handled here is forwarded to the next filter.
614 static int control(struct vf_instance *vf, int request, void* data)
615 {
616  switch(request){
// NOTE(review): the listing's embedded numbering skips 617, so the case label that this
// `return 5;` belongs to appears to be missing from this view - confirm against the real file.
618  return 5;
619  case VFCTRL_SET_PP_LEVEL:
// Sets log2_count from the requested level, raising values below 4 to 4.
// NOTE(review): there is no upper bound here, while filter() computes step = 6 - log2_count
// and loops with y += step; a level of 6 or more would make step zero or negative. vf_open
// limits its own argument to 5 - consider the same limit here.
620  vf->priv->log2_count= *((unsigned int*)data);
621  if (vf->priv->log2_count < 4) vf->priv->log2_count=4;
622  return CONTROL_TRUE;
623  }
624  return ff_vf_next_control(vf,request,data);
625 }
626 
// Filter constructor: installs the callbacks, allocates the private state, parses the
// argument string "log2c:qp:bias:bframes" (all integers, each optional from the right)
// and builds the threshold matrix. Returns 1.
627 static int vf_open(vf_instance_t *vf, char *args)
628 {
629  int i=0, bias;
630  int custom_threshold_m[64];
631  int log2c=-1;
632 
633  vf->config=config;
634  vf->put_image=put_image;
635  vf->get_image=get_image;
// NOTE(review): the listing's embedded numbering skips 636 here, so one line of this
// function appears to be missing from this view - confirm against the real file.
637  vf->uninit=uninit;
638  vf->control= control;
// NOTE(review): the av_mallocz result is dereferenced below without a NULL check.
639  vf->priv=av_mallocz(sizeof(struct vf_priv_s));//assumes align 16 !
640 
641  ff_init_avcodec();
642 
643  //vf->priv->avctx= avcodec_alloc_context();
644  //dsputil_init(&vf->priv->dsp, vf->priv->avctx);
645 
646  vf->priv->log2_count= 4;
647  vf->priv->bframes = 0;
648 
649  if (args) sscanf(args, "%d:%d:%d:%d", &log2c, &vf->priv->qp, &i, &vf->priv->bframes);
650 
// log2_count ends up as 4 or 5: values of 6 and above become 5, values below 4 keep the default 4.
651  if( log2c >=4 && log2c <=5 )
652  vf->priv->log2_count = log2c;
653  else if( log2c >= 6 )
654  vf->priv->log2_count = 5;
655 
656  if(vf->priv->qp < 0)
657  vf->priv->qp = 0;
658 
// The third argument is limited to -15..32 and offsets the bias of 16.
659  if (i < -15) i = -15;
660  if (i > 32) i = 32;
661 
662  bias= (1<<4)+i; //regulable
663  vf->priv->prev_q=0;
664  //
// Scale the base table by bias/71 (rounded), then pack each row of 8 values into two
// quadwords of four 16-bit words, in column order 2,6,0,4 and 5,3,1,7 (lowest word first).
665  for(i=0;i<64;i++) //FIXME: tune custom_threshold[] and remove this !
666  custom_threshold_m[i]=(int)(custom_threshold[i]*(bias/71.)+ 0.5);
667  for(i=0;i<8;i++){
668  vf->priv->threshold_mtx_noq[2*i]=(uint64_t)custom_threshold_m[i*8+2]
669  |(((uint64_t)custom_threshold_m[i*8+6])<<16)
670  |(((uint64_t)custom_threshold_m[i*8+0])<<32)
671  |(((uint64_t)custom_threshold_m[i*8+4])<<48);
672  vf->priv->threshold_mtx_noq[2*i+1]=(uint64_t)custom_threshold_m[i*8+5]
673  |(((uint64_t)custom_threshold_m[i*8+3])<<16)
674  |(((uint64_t)custom_threshold_m[i*8+1])<<32)
675  |(((uint64_t)custom_threshold_m[i*8+7])<<48);
676  }
677 
// With a fixed qp the multiplied matrix is computed once, here.
678  if (vf->priv->qp) vf->priv->prev_q=vf->priv->qp, mul_thrmat_s(vf->priv, vf->priv->qp);
679 
680  return 1;
681 }
682 
684  "fast simple postprocess",
685  "fspp",
686  "Michael Niedermayer, Nikolaj Poroshin",
687  "",
688  vf_open,
689  NULL
690 };
691 
692 //====================================================================
693 //Specific spp's dct, idct and threshold functions
694 //I'd prefer to have them in the separate file.
695 
696 //#define MANGLE(a) #a
697 
698 //typedef int16_t int16_t; //! only int16_t
699 
/* Size of one DCT dimension, as a number and as a string for use inside asm text. */
#define DCTSIZE 8
#define DCTSIZE_S "8"

/* x as fixed point with s fractional bits, rounded, reduced to its low 16 bits. */
#define FIX(x,s) ((int) ((x) * (1<<(s)) + 0.5)&0xffff)
/*
 * The 16-bit value x replicated into all four words of a uint64_t.
 * x must be in 0..0xffff. Each copy is widened before it is shifted, so
 * values >= 0x8000 no longer left-shift a signed int past INT_MAX, and the
 * whole expansion is parenthesized so it can be used inside larger expressions.
 */
#define C64(x) (((uint64_t)(x)<<48) | ((uint64_t)(x)<<32) | ((uint64_t)(x)<<16) | (uint64_t)(x))
/* FIX(x,s) replicated into four 16-bit words. */
#define FIX64(x,s) C64(FIX(x,s))

/* High part of a 16.16 product: (x*k) >> 16. */
#define MULTIPLY16H(x,k) (((x)*(k))>>16)
/*
 * r = x when x lies outside [-t, t], otherwise r = 0.
 * Wrapped in do/while(0) so it behaves as one statement (safe in unbraced if/else).
 */
#define THRESHOLD(r,x,t) do { if(((unsigned)((x)+(t)))>(t)*2) (r)=(x); else (r)=0; } while (0)
/* x / 2^n with rounding to nearest. */
#define DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
710 
// Fixed-point multiplier constants for the DCT code, in two flavours selected by HAVE_MMX.
711 #if HAVE_MMX
712 
// MMX flavour: each constant is a FIX64 value, i.e. the 16-bit fixed-point number
// replicated into four words. The second FIX64 argument is the number of fractional bits.
713 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_382683433)=FIX64(0.382683433, 14);
714 DECLARE_ALIGNED(8, uint64_t, ff_MM_FIX_0_541196100)=FIX64(0.541196100, 14);
715 DECLARE_ALIGNED(8, uint64_t, ff_MM_FIX_0_707106781)=FIX64(0.707106781, 14);
716 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_306562965)=FIX64(1.306562965, 14);
717 
718 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562_A)=FIX64(1.414213562, 14);
719 
720 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_847759065)=FIX64(1.847759065, 13);
// Note the stored value is negative although the name carries no sign.
721 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_2_613125930)=FIX64(-2.613125930, 13); //-
722 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562)=FIX64(1.414213562, 13);
723 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_082392200)=FIX64(1.082392200, 13);
724 //for t3,t5,t7 == 0 shortcut
725 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_847759065)=FIX64(0.847759065, 14);
726 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_566454497)=FIX64(0.566454497, 14);
727 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_198912367)=FIX64(0.198912367, 14);
728 
// The values 4 and 2 replicated into four 16-bit words.
729 DECLARE_ASM_CONST(8, uint64_t, MM_DESCALE_RND)=C64(4);
730 DECLARE_ASM_CONST(8, uint64_t, MM_2)=C64(2);
731 
732 #else /* !HAVE_MMX */
733 
// NOTE(review): the listing's embedded numbering skips 734 here; column_fidct_c uses a type
// int_simd16_t that is not declared in the visible file, so its declaration may be on the
// skipped line - confirm against the real file.
// C flavour: the same multipliers as single 16-bit values.
735 static const int16_t FIX_0_382683433=FIX(0.382683433, 14);
736 static const int16_t FIX_0_541196100=FIX(0.541196100, 14);
737 static const int16_t FIX_0_707106781=FIX(0.707106781, 14);
738 static const int16_t FIX_1_306562965=FIX(1.306562965, 14);
739 static const int16_t FIX_1_414213562_A=FIX(1.414213562, 14);
740 static const int16_t FIX_1_847759065=FIX(1.847759065, 13);
741 static const int16_t FIX_2_613125930=FIX(-2.613125930, 13); //-
742 static const int16_t FIX_1_414213562=FIX(1.414213562, 13);
743 static const int16_t FIX_1_082392200=FIX(1.082392200, 13);
744 
745 #endif
746 
747 #if !HAVE_MMX
748 
// C version of the combined column pass. For every column it computes a forward DCT
// butterfly, replaces each coefficient d with 0 when -t <= d <= t (THRESHOLD, with t taken
// from thr_adr), computes the inverse butterfly and writes the result into `output`.
//   thr_adr  threshold matrix, read as threshold[k*8] for coefficient k, advanced per column
//   data     input; the eight values of a column are DCTSIZE entries apart
//   output   workspace; entries 0..5 of a column are accumulated, entries 6 and 7 are assigned
//   cnt      loop budget; each group of 8 columns consumes 2
// NOTE(review): int_simd16_t is not declared in the visible part of this file.
749 static void column_fidct_c(int16_t* thr_adr, int16_t *data, int16_t *output, int cnt)
750 {
751  int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
752  int_simd16_t tmp10, tmp11, tmp12, tmp13;
753  int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
754  int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
755 
756  int16_t* dataptr;
757  int16_t* wsptr;
758  int16_t *threshold;
759  int ctr;
760 
761  dataptr = data;
762  wsptr = output;
763 
764  for (; cnt > 0; cnt-=2) { //start positions
// The threshold pointer restarts at the beginning of the matrix for every group of 8 columns.
765  threshold=(int16_t*)thr_adr;//threshold_mtx
766  for (ctr = DCTSIZE; ctr > 0; ctr--) {
767  // Process columns from input, add to output.
768  tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
769  tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
770 
771  tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
772  tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
773 
774  tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
775  tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
776 
777  tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
778  tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
779 
780  // Even part of FDCT
781 
782  tmp10 = tmp0 + tmp3;
783  tmp13 = tmp0 - tmp3;
784  tmp11 = tmp1 + tmp2;
785  tmp12 = tmp1 - tmp2;
786 
787  d0 = tmp10 + tmp11;
788  d4 = tmp10 - tmp11;
789 
790  z1 = MULTIPLY16H((tmp12 + tmp13) <<2, FIX_0_707106781);
791  d2 = tmp13 + z1;
792  d6 = tmp13 - z1;
793 
794  // Even part of IDCT
795 
// Zero the even coefficients that fall within their thresholds.
796  THRESHOLD(tmp0, d0, threshold[0*8]);
797  THRESHOLD(tmp1, d2, threshold[2*8]);
798  THRESHOLD(tmp2, d4, threshold[4*8]);
799  THRESHOLD(tmp3, d6, threshold[6*8]);
// +2 acts as the rounding term for the >>2 below.
800  tmp0+=2;
801  tmp10 = (tmp0 + tmp2)>>2;
802  tmp11 = (tmp0 - tmp2)>>2;
803 
804  tmp13 = (tmp1 + tmp3)>>2; //+2 ! (psnr decides)
805  tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
806 
807  tmp0 = tmp10 + tmp13; //->temps
808  tmp3 = tmp10 - tmp13; //->temps
809  tmp1 = tmp11 + tmp12; //->temps
810  tmp2 = tmp11 - tmp12; //->temps
811 
812  // Odd part of FDCT
813 
814  tmp10 = tmp4 + tmp5;
815  tmp11 = tmp5 + tmp6;
816  tmp12 = tmp6 + tmp7;
817 
818  z5 = MULTIPLY16H((tmp10 - tmp12)<<2, FIX_0_382683433);
819  z2 = MULTIPLY16H(tmp10 <<2, FIX_0_541196100) + z5;
820  z4 = MULTIPLY16H(tmp12 <<2, FIX_1_306562965) + z5;
821  z3 = MULTIPLY16H(tmp11 <<2, FIX_0_707106781);
822 
823  z11 = tmp7 + z3;
824  z13 = tmp7 - z3;
825 
826  d5 = z13 + z2;
827  d3 = z13 - z2;
828  d1 = z11 + z4;
829  d7 = z11 - z4;
830 
831  // Odd part of IDCT
832 
// Zero the odd coefficients that fall within their thresholds.
833  THRESHOLD(tmp4, d1, threshold[1*8]);
834  THRESHOLD(tmp5, d3, threshold[3*8]);
835  THRESHOLD(tmp6, d5, threshold[5*8]);
836  THRESHOLD(tmp7, d7, threshold[7*8]);
837 
838  //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
839  z13 = tmp6 + tmp5;
840  z10 = (tmp6 - tmp5)<<1;
841  z11 = tmp4 + tmp7;
842  z12 = (tmp4 - tmp7)<<1;
843 
844  tmp7 = (z11 + z13)>>2; //+2 !
845  tmp11 = MULTIPLY16H((z11 - z13)<<1, FIX_1_414213562);
846  z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
847  tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
// FIX_2_613125930 holds a negative value, hence the addition here.
848  tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !!
849 
850  tmp6 = tmp12 - tmp7;
851  tmp5 = tmp11 - tmp6;
852  tmp4 = tmp10 + tmp5;
853 
// Entries 0..5 are added to what is already in the workspace; entries 6 and 7 are
// overwritten. NOTE(review): presumably 6 and 7 are the entries not yet touched by an
// earlier start position (start positions are 16 entries apart) - confirm.
854  wsptr[DCTSIZE*0]+= (tmp0 + tmp7);
855  wsptr[DCTSIZE*1]+= (tmp1 + tmp6);
856  wsptr[DCTSIZE*2]+= (tmp2 + tmp5);
857  wsptr[DCTSIZE*3]+= (tmp3 - tmp4);
858  wsptr[DCTSIZE*4]+= (tmp3 + tmp4);
859  wsptr[DCTSIZE*5]+= (tmp2 - tmp5);
860  wsptr[DCTSIZE*6]= (tmp1 - tmp6);
861  wsptr[DCTSIZE*7]= (tmp0 - tmp7);
862  //
863  dataptr++; //next column
864  wsptr++;
865  threshold++;
866  }
// 8 columns were consumed by the ++ above; skipping 8 more moves both pointers 16 entries per group.
867  dataptr+=8; //skip each second start pos
868  wsptr +=8;
869  }
870 }
871 
872 #else /* HAVE_MMX */
873 
874 static void column_fidct_mmx(int16_t* thr_adr, int16_t *data, int16_t *output, int cnt)
875 {
876  uint64_t __attribute__((aligned(8))) temps[4];
877  __asm__ volatile(
878  ASMALIGN(4)
879  "1: \n\t"
880  "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
881  //
882  "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
883  "movq %%mm1, %%mm0 \n\t"
884 
885  "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0
886  "movq %%mm7, %%mm3 \n\t"
887 
888  "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3
889  "movq %%mm1, %%mm5 \n\t"
890 
891  "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
892  "psubw %%mm7, %%mm1 \n\t" //t13
893 
894  "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
895  "movq %%mm6, %%mm4 \n\t"
896 
897  "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1
898  "paddw %%mm7, %%mm5 \n\t" //t10
899 
900  "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2
901  "movq %%mm6, %%mm7 \n\t"
902 
903  "paddw %%mm2, %%mm6 \n\t" //t11
904  "psubw %%mm2, %%mm7 \n\t" //t12
905 
906  "movq %%mm5, %%mm2 \n\t"
907  "paddw %%mm6, %%mm5 \n\t" //d0
908  // i0 t13 t12 i3 i1 d0 - d4
909  "psubw %%mm6, %%mm2 \n\t" //d4
910  "paddw %%mm1, %%mm7 \n\t"
911 
912  "movq 4*16(%%"REG_d"), %%mm6 \n\t"
913  "psllw $2, %%mm7 \n\t"
914 
915  "psubw 0*16(%%"REG_d"), %%mm5 \n\t"
916  "psubw %%mm6, %%mm2 \n\t"
917 
918  "paddusw 0*16(%%"REG_d"), %%mm5 \n\t"
919  "paddusw %%mm6, %%mm2 \n\t"
920 
921  "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm7 \n\t"
922  //
923  "paddw 0*16(%%"REG_d"), %%mm5 \n\t"
924  "paddw %%mm6, %%mm2 \n\t"
925 
926  "psubusw 0*16(%%"REG_d"), %%mm5 \n\t"
927  "psubusw %%mm6, %%mm2 \n\t"
928 
929 //This func is totally compute-bound, operates at huge speed. So, DC shortcut
930 // at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
931 //However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare.
932  "paddw "MANGLE(MM_2)", %%mm5 \n\t"
933  "movq %%mm2, %%mm6 \n\t"
934 
935  "paddw %%mm5, %%mm2 \n\t"
936  "psubw %%mm6, %%mm5 \n\t"
937 
938  "movq %%mm1, %%mm6 \n\t"
939  "paddw %%mm7, %%mm1 \n\t" //d2
940 
941  "psubw 2*16(%%"REG_d"), %%mm1 \n\t"
942  "psubw %%mm7, %%mm6 \n\t" //d6
943 
944  "movq 6*16(%%"REG_d"), %%mm7 \n\t"
945  "psraw $2, %%mm5 \n\t"
946 
947  "paddusw 2*16(%%"REG_d"), %%mm1 \n\t"
948  "psubw %%mm7, %%mm6 \n\t"
949  // t7 d2 /t11 t4 t6 - d6 /t10
950 
951  "paddw 2*16(%%"REG_d"), %%mm1 \n\t"
952  "paddusw %%mm7, %%mm6 \n\t"
953 
954  "psubusw 2*16(%%"REG_d"), %%mm1 \n\t"
955  "paddw %%mm7, %%mm6 \n\t"
956 
957  "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t"
958  "psubusw %%mm7, %%mm6 \n\t"
959 
960  //movq [edi+"DCTSIZE_S"*2*2], mm1
961  //movq [edi+"DCTSIZE_S"*6*2], mm6
962  "movq %%mm1, %%mm7 \n\t"
963  "psraw $2, %%mm2 \n\t"
964 
965  "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t"
966  "psubw %%mm6, %%mm1 \n\t"
967 
968  "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t"
969  "paddw %%mm7, %%mm6 \n\t" //'t13
970 
971  "psraw $2, %%mm6 \n\t" //paddw mm6, MM_2 !! ---
972  "movq %%mm2, %%mm7 \n\t"
973 
974  "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
975  "paddw %%mm6, %%mm2 \n\t" //'t0
976 
977  "movq %%mm2, 0*8+%3 \n\t" //!
978  "psubw %%mm6, %%mm7 \n\t" //'t3
979 
980  "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
981  "psubw %%mm6, %%mm1 \n\t" //'t12
982 
983  "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5
984  "movq %%mm5, %%mm6 \n\t"
985 
986  "movq %%mm7, 3*8+%3 \n\t"
987  "paddw %%mm2, %%mm3 \n\t" //t10
988 
989  "paddw %%mm4, %%mm2 \n\t" //t11
990  "paddw %%mm0, %%mm4 \n\t" //t12
991 
992  "movq %%mm3, %%mm7 \n\t"
993  "psubw %%mm4, %%mm3 \n\t"
994 
995  "psllw $2, %%mm3 \n\t"
996  "psllw $2, %%mm7 \n\t" //opt for P6
997 
998  "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
999  "psllw $2, %%mm4 \n\t"
1000 
1001  "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm7 \n\t"
1002  "psllw $2, %%mm2 \n\t"
1003 
1004  "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
1005  "paddw %%mm1, %%mm5 \n\t" //'t1
1006 
1007  "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm2 \n\t"
1008  "psubw %%mm1, %%mm6 \n\t" //'t2
1009  // t7 't12 't11 t4 t6 - 't13 't10 ---
1010 
1011  "paddw %%mm3, %%mm7 \n\t" //z2
1012 
1013  "movq %%mm5, 1*8+%3 \n\t"
1014  "paddw %%mm3, %%mm4 \n\t" //z4
1015 
1016  "movq 3*16(%%"REG_d"), %%mm3 \n\t"
1017  "movq %%mm0, %%mm1 \n\t"
1018 
1019  "movq %%mm6, 2*8+%3 \n\t"
1020  "psubw %%mm2, %%mm1 \n\t" //z13
1021 
1022 //===
1023  "paddw %%mm2, %%mm0 \n\t" //z11
1024  "movq %%mm1, %%mm5 \n\t"
1025 
1026  "movq 5*16(%%"REG_d"), %%mm2 \n\t"
1027  "psubw %%mm7, %%mm1 \n\t" //d3
1028 
1029  "paddw %%mm7, %%mm5 \n\t" //d5
1030  "psubw %%mm3, %%mm1 \n\t"
1031 
1032  "movq 1*16(%%"REG_d"), %%mm7 \n\t"
1033  "psubw %%mm2, %%mm5 \n\t"
1034 
1035  "movq %%mm0, %%mm6 \n\t"
1036  "paddw %%mm4, %%mm0 \n\t" //d1
1037 
1038  "paddusw %%mm3, %%mm1 \n\t"
1039  "psubw %%mm4, %%mm6 \n\t" //d7
1040 
1041  // d1 d3 - - - d5 d7 -
1042  "movq 7*16(%%"REG_d"), %%mm4 \n\t"
1043  "psubw %%mm7, %%mm0 \n\t"
1044 
1045  "psubw %%mm4, %%mm6 \n\t"
1046  "paddusw %%mm2, %%mm5 \n\t"
1047 
1048  "paddusw %%mm4, %%mm6 \n\t"
1049  "paddw %%mm3, %%mm1 \n\t"
1050 
1051  "paddw %%mm2, %%mm5 \n\t"
1052  "paddw %%mm4, %%mm6 \n\t"
1053 
1054  "psubusw %%mm3, %%mm1 \n\t"
1055  "psubusw %%mm2, %%mm5 \n\t"
1056 
1057  "psubusw %%mm4, %%mm6 \n\t"
1058  "movq %%mm1, %%mm4 \n\t"
1059 
1060  "por %%mm5, %%mm4 \n\t"
1061  "paddusw %%mm7, %%mm0 \n\t"
1062 
1063  "por %%mm6, %%mm4 \n\t"
1064  "paddw %%mm7, %%mm0 \n\t"
1065 
1066  "packssdw %%mm4, %%mm4 \n\t"
1067  "psubusw %%mm7, %%mm0 \n\t"
1068 
1069  "movd %%mm4, %%"REG_a" \n\t"
1070  "or %%"REG_a", %%"REG_a" \n\t"
1071  "jnz 2f \n\t"
1072  //movq [edi+"DCTSIZE_S"*3*2], mm1
1073  //movq [edi+"DCTSIZE_S"*5*2], mm5
1074  //movq [edi+"DCTSIZE_S"*1*2], mm0
1075  //movq [edi+"DCTSIZE_S"*7*2], mm6
1076  // t4 t5 - - - t6 t7 -
1077  //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
1078 //Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile
1079  "movq 0*8+%3, %%mm4 \n\t"
1080  "movq %%mm0, %%mm1 \n\t"
1081 
1082  "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
1083  "movq %%mm1, %%mm2 \n\t"
1084 
1085  "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
1086  "movq %%mm2, %%mm3 \n\t"
1087 
1088  "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
1089  "paddw %%mm4, %%mm5 \n\t"
1090 
1091  "movq 1*8+%3, %%mm6 \n\t"
1092  //paddw mm3, MM_2
1093  "psraw $2, %%mm3 \n\t" //tmp7
1094 
1095  "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
1096  "psubw %%mm3, %%mm4 \n\t"
1097 
1098  "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
1099  "paddw %%mm3, %%mm5 \n\t"
1100 
1101  "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
1102  "paddw %%mm6, %%mm7 \n\t"
1103 
1104  "movq 2*8+%3, %%mm3 \n\t"
1105  "psubw %%mm0, %%mm6 \n\t"
1106 
1107  "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
1108  "paddw %%mm0, %%mm7 \n\t"
1109 
1110  "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
1111  "paddw %%mm3, %%mm4 \n\t"
1112 
1113  "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
1114  "psubw %%mm1, %%mm3 \n\t"
1115 
1116  "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
1117  "paddw %%mm1, %%mm4 \n\t"
1118 
1119  "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
1120  "paddw %%mm3, %%mm5 \n\t"
1121 
1122  "movq 3*8+%3, %%mm0 \n\t"
1123  "add $8, %%"REG_S" \n\t"
1124 
1125  "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
1126  "paddw %%mm0, %%mm6 \n\t"
1127 
1128  "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
1129  "psubw %%mm2, %%mm0 \n\t"
1130 
1131  "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
1132  "paddw %%mm2, %%mm6 \n\t"
1133 
1134  "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
1135  "paddw %%mm0, %%mm7 \n\t"
1136 
1137  "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
1138 
1139  "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
1140  "add $8, %%"REG_D" \n\t"
1141  "jmp 4f \n\t"
1142 
1143  "2: \n\t"
1144  //--- non DC2
1145  //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1)
1146  //psraw mm5, 2
1147  //psraw mm0, 2
1148  //psraw mm6, 2
1149  "movq %%mm5, %%mm3 \n\t"
1150  "psubw %%mm1, %%mm5 \n\t"
1151 
1152  "psllw $1, %%mm5 \n\t" //'z10
1153  "paddw %%mm1, %%mm3 \n\t" //'z13
1154 
1155  "movq %%mm0, %%mm2 \n\t"
1156  "psubw %%mm6, %%mm0 \n\t"
1157 
1158  "movq %%mm5, %%mm1 \n\t"
1159  "psllw $1, %%mm0 \n\t" //'z12
1160 
1161  "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
1162  "paddw %%mm0, %%mm5 \n\t"
1163 
1164  "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
1165  "paddw %%mm6, %%mm2 \n\t" //'z11
1166 
1167  "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
1168  "movq %%mm2, %%mm7 \n\t"
1169 
1170  //---
1171  "movq 0*8+%3, %%mm4 \n\t"
1172  "psubw %%mm3, %%mm2 \n\t"
1173 
1174  "psllw $1, %%mm2 \n\t"
1175  "paddw %%mm3, %%mm7 \n\t" //'t7
1176 
1177  "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
1178  "movq %%mm4, %%mm6 \n\t"
1179  //paddw mm7, MM_2
1180  "psraw $2, %%mm7 \n\t"
1181 
1182  "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
1183  "psubw %%mm7, %%mm6 \n\t"
1184 
1185  "movq 1*8+%3, %%mm3 \n\t"
1186  "paddw %%mm7, %%mm4 \n\t"
1187 
1188  "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
1189  "paddw %%mm5, %%mm1 \n\t" //'t12
1190 
1191  "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
1192  "psubw %%mm7, %%mm1 \n\t" //'t6
1193 
1194  "movq 2*8+%3, %%mm7 \n\t"
1195  "psubw %%mm5, %%mm0 \n\t" //'t10
1196 
1197  "movq 3*8+%3, %%mm6 \n\t"
1198  "movq %%mm3, %%mm5 \n\t"
1199 
1200  "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
1201  "psubw %%mm1, %%mm5 \n\t"
1202 
1203  "psubw %%mm1, %%mm2 \n\t" //'t5
1204  "paddw %%mm1, %%mm3 \n\t"
1205 
1206  "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
1207  "movq %%mm7, %%mm4 \n\t"
1208 
1209  "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
1210  "psubw %%mm2, %%mm4 \n\t"
1211 
1212  "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
1213  "paddw %%mm2, %%mm7 \n\t"
1214 
1215  "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
1216  "paddw %%mm2, %%mm0 \n\t" //'t4
1217 
1218  // 't4 't6 't5 - - - - 't7
1219  "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
1220  "movq %%mm6, %%mm1 \n\t"
1221 
1222  "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
1223  "psubw %%mm0, %%mm1 \n\t"
1224 
1225  "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
1226  "paddw %%mm0, %%mm6 \n\t"
1227 
1228  "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
1229  "add $8, %%"REG_S" \n\t"
1230 
1231  "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
1232 
1233  "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
1234  "add $8, %%"REG_D" \n\t"
1235 
1236  "4: \n\t"
1237 //=part 2 (the same)===========================================================
1238  "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
1239  //
1240  "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
1241  "movq %%mm1, %%mm0 \n\t"
1242 
1243  "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0
1244  "movq %%mm7, %%mm3 \n\t"
1245 
1246  "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3
1247  "movq %%mm1, %%mm5 \n\t"
1248 
1249  "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
1250  "psubw %%mm7, %%mm1 \n\t" //t13
1251 
1252  "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
1253  "movq %%mm6, %%mm4 \n\t"
1254 
1255  "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1
1256  "paddw %%mm7, %%mm5 \n\t" //t10
1257 
1258  "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2
1259  "movq %%mm6, %%mm7 \n\t"
1260 
1261  "paddw %%mm2, %%mm6 \n\t" //t11
1262  "psubw %%mm2, %%mm7 \n\t" //t12
1263 
1264  "movq %%mm5, %%mm2 \n\t"
1265  "paddw %%mm6, %%mm5 \n\t" //d0
1266  // i0 t13 t12 i3 i1 d0 - d4
1267  "psubw %%mm6, %%mm2 \n\t" //d4
1268  "paddw %%mm1, %%mm7 \n\t"
1269 
1270  "movq 1*8+4*16(%%"REG_d"), %%mm6 \n\t"
1271  "psllw $2, %%mm7 \n\t"
1272 
1273  "psubw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
1274  "psubw %%mm6, %%mm2 \n\t"
1275 
1276  "paddusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
1277  "paddusw %%mm6, %%mm2 \n\t"
1278 
1279  "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm7 \n\t"
1280  //
1281  "paddw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
1282  "paddw %%mm6, %%mm2 \n\t"
1283 
1284  "psubusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
1285  "psubusw %%mm6, %%mm2 \n\t"
1286 
1287 //This func is totally compute-bound, operates at huge speed. So, DC shortcut
1288 // at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
1289 //However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare.
1290  "paddw "MANGLE(MM_2)", %%mm5 \n\t"
1291  "movq %%mm2, %%mm6 \n\t"
1292 
1293  "paddw %%mm5, %%mm2 \n\t"
1294  "psubw %%mm6, %%mm5 \n\t"
1295 
1296  "movq %%mm1, %%mm6 \n\t"
1297  "paddw %%mm7, %%mm1 \n\t" //d2
1298 
1299  "psubw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
1300  "psubw %%mm7, %%mm6 \n\t" //d6
1301 
1302  "movq 1*8+6*16(%%"REG_d"), %%mm7 \n\t"
1303  "psraw $2, %%mm5 \n\t"
1304 
1305  "paddusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
1306  "psubw %%mm7, %%mm6 \n\t"
1307  // t7 d2 /t11 t4 t6 - d6 /t10
1308 
1309  "paddw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
1310  "paddusw %%mm7, %%mm6 \n\t"
1311 
1312  "psubusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
1313  "paddw %%mm7, %%mm6 \n\t"
1314 
1315  "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t"
1316  "psubusw %%mm7, %%mm6 \n\t"
1317 
1318  //movq [edi+"DCTSIZE_S"*2*2], mm1
1319  //movq [edi+"DCTSIZE_S"*6*2], mm6
1320  "movq %%mm1, %%mm7 \n\t"
1321  "psraw $2, %%mm2 \n\t"
1322 
1323  "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t"
1324  "psubw %%mm6, %%mm1 \n\t"
1325 
1326  "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t"
1327  "paddw %%mm7, %%mm6 \n\t" //'t13
1328 
1329  "psraw $2, %%mm6 \n\t" //paddw mm6, MM_2 !! ---
1330  "movq %%mm2, %%mm7 \n\t"
1331 
1332  "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
1333  "paddw %%mm6, %%mm2 \n\t" //'t0
1334 
1335  "movq %%mm2, 0*8+%3 \n\t" //!
1336  "psubw %%mm6, %%mm7 \n\t" //'t3
1337 
1338  "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
1339  "psubw %%mm6, %%mm1 \n\t" //'t12
1340 
1341  "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5
1342  "movq %%mm5, %%mm6 \n\t"
1343 
1344  "movq %%mm7, 3*8+%3 \n\t"
1345  "paddw %%mm2, %%mm3 \n\t" //t10
1346 
1347  "paddw %%mm4, %%mm2 \n\t" //t11
1348  "paddw %%mm0, %%mm4 \n\t" //t12
1349 
1350  "movq %%mm3, %%mm7 \n\t"
1351  "psubw %%mm4, %%mm3 \n\t"
1352 
1353  "psllw $2, %%mm3 \n\t"
1354  "psllw $2, %%mm7 \n\t" //opt for P6
1355 
1356  "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
1357  "psllw $2, %%mm4 \n\t"
1358 
1359  "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm7 \n\t"
1360  "psllw $2, %%mm2 \n\t"
1361 
1362  "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
1363  "paddw %%mm1, %%mm5 \n\t" //'t1
1364 
1365  "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm2 \n\t"
1366  "psubw %%mm1, %%mm6 \n\t" //'t2
1367  // t7 't12 't11 t4 t6 - 't13 't10 ---
1368 
1369  "paddw %%mm3, %%mm7 \n\t" //z2
1370 
1371  "movq %%mm5, 1*8+%3 \n\t"
1372  "paddw %%mm3, %%mm4 \n\t" //z4
1373 
1374  "movq 1*8+3*16(%%"REG_d"), %%mm3 \n\t"
1375  "movq %%mm0, %%mm1 \n\t"
1376 
1377  "movq %%mm6, 2*8+%3 \n\t"
1378  "psubw %%mm2, %%mm1 \n\t" //z13
1379 
1380 //===
1381  "paddw %%mm2, %%mm0 \n\t" //z11
1382  "movq %%mm1, %%mm5 \n\t"
1383 
1384  "movq 1*8+5*16(%%"REG_d"), %%mm2 \n\t"
1385  "psubw %%mm7, %%mm1 \n\t" //d3
1386 
1387  "paddw %%mm7, %%mm5 \n\t" //d5
1388  "psubw %%mm3, %%mm1 \n\t"
1389 
1390  "movq 1*8+1*16(%%"REG_d"), %%mm7 \n\t"
1391  "psubw %%mm2, %%mm5 \n\t"
1392 
1393  "movq %%mm0, %%mm6 \n\t"
1394  "paddw %%mm4, %%mm0 \n\t" //d1
1395 
1396  "paddusw %%mm3, %%mm1 \n\t"
1397  "psubw %%mm4, %%mm6 \n\t" //d7
1398 
1399  // d1 d3 - - - d5 d7 -
1400  "movq 1*8+7*16(%%"REG_d"), %%mm4 \n\t"
1401  "psubw %%mm7, %%mm0 \n\t"
1402 
1403  "psubw %%mm4, %%mm6 \n\t"
1404  "paddusw %%mm2, %%mm5 \n\t"
1405 
1406  "paddusw %%mm4, %%mm6 \n\t"
1407  "paddw %%mm3, %%mm1 \n\t"
1408 
1409  "paddw %%mm2, %%mm5 \n\t"
1410  "paddw %%mm4, %%mm6 \n\t"
1411 
1412  "psubusw %%mm3, %%mm1 \n\t"
1413  "psubusw %%mm2, %%mm5 \n\t"
1414 
1415  "psubusw %%mm4, %%mm6 \n\t"
1416  "movq %%mm1, %%mm4 \n\t"
1417 
1418  "por %%mm5, %%mm4 \n\t"
1419  "paddusw %%mm7, %%mm0 \n\t"
1420 
1421  "por %%mm6, %%mm4 \n\t"
1422  "paddw %%mm7, %%mm0 \n\t"
1423 
1424  "packssdw %%mm4, %%mm4 \n\t"
1425  "psubusw %%mm7, %%mm0 \n\t"
1426 
1427  "movd %%mm4, %%"REG_a" \n\t"
1428  "or %%"REG_a", %%"REG_a" \n\t"
1429  "jnz 3f \n\t"
1430  //movq [edi+"DCTSIZE_S"*3*2], mm1
1431  //movq [edi+"DCTSIZE_S"*5*2], mm5
1432  //movq [edi+"DCTSIZE_S"*1*2], mm0
1433  //movq [edi+"DCTSIZE_S"*7*2], mm6
1434  // t4 t5 - - - t6 t7 -
1435  //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
1436 //Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile
1437  "movq 0*8+%3, %%mm4 \n\t"
1438  "movq %%mm0, %%mm1 \n\t"
1439 
1440  "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
1441  "movq %%mm1, %%mm2 \n\t"
1442 
1443  "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
1444  "movq %%mm2, %%mm3 \n\t"
1445 
1446  "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
1447  "paddw %%mm4, %%mm5 \n\t"
1448 
1449  "movq 1*8+%3, %%mm6 \n\t"
1450  //paddw mm3, MM_2
1451  "psraw $2, %%mm3 \n\t" //tmp7
1452 
1453  "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
1454  "psubw %%mm3, %%mm4 \n\t"
1455 
1456  "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
1457  "paddw %%mm3, %%mm5 \n\t"
1458 
1459  "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
1460  "paddw %%mm6, %%mm7 \n\t"
1461 
1462  "movq 2*8+%3, %%mm3 \n\t"
1463  "psubw %%mm0, %%mm6 \n\t"
1464 
1465  "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
1466  "paddw %%mm0, %%mm7 \n\t"
1467 
1468  "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
1469  "paddw %%mm3, %%mm4 \n\t"
1470 
1471  "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
1472  "psubw %%mm1, %%mm3 \n\t"
1473 
1474  "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
1475  "paddw %%mm1, %%mm4 \n\t"
1476 
1477  "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
1478  "paddw %%mm3, %%mm5 \n\t"
1479 
1480  "movq 3*8+%3, %%mm0 \n\t"
1481  "add $24, %%"REG_S" \n\t"
1482 
1483  "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
1484  "paddw %%mm0, %%mm6 \n\t"
1485 
1486  "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
1487  "psubw %%mm2, %%mm0 \n\t"
1488 
1489  "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
1490  "paddw %%mm2, %%mm6 \n\t"
1491 
1492  "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
1493  "paddw %%mm0, %%mm7 \n\t"
1494 
1495  "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
1496 
1497  "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
1498  "add $24, %%"REG_D" \n\t"
1499  "sub $2, %%"REG_c" \n\t"
1500  "jnz 1b \n\t"
1501  "jmp 5f \n\t"
1502 
1503  "3: \n\t"
1504  //--- non DC2
1505  //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1)
1506  //psraw mm5, 2
1507  //psraw mm0, 2
1508  //psraw mm6, 2
1509  "movq %%mm5, %%mm3 \n\t"
1510  "psubw %%mm1, %%mm5 \n\t"
1511 
1512  "psllw $1, %%mm5 \n\t" //'z10
1513  "paddw %%mm1, %%mm3 \n\t" //'z13
1514 
1515  "movq %%mm0, %%mm2 \n\t"
1516  "psubw %%mm6, %%mm0 \n\t"
1517 
1518  "movq %%mm5, %%mm1 \n\t"
1519  "psllw $1, %%mm0 \n\t" //'z12
1520 
1521  "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
1522  "paddw %%mm0, %%mm5 \n\t"
1523 
1524  "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
1525  "paddw %%mm6, %%mm2 \n\t" //'z11
1526 
1527  "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
1528  "movq %%mm2, %%mm7 \n\t"
1529 
1530  //---
1531  "movq 0*8+%3, %%mm4 \n\t"
1532  "psubw %%mm3, %%mm2 \n\t"
1533 
1534  "psllw $1, %%mm2 \n\t"
1535  "paddw %%mm3, %%mm7 \n\t" //'t7
1536 
1537  "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
1538  "movq %%mm4, %%mm6 \n\t"
1539  //paddw mm7, MM_2
1540  "psraw $2, %%mm7 \n\t"
1541 
1542  "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
1543  "psubw %%mm7, %%mm6 \n\t"
1544 
1545  "movq 1*8+%3, %%mm3 \n\t"
1546  "paddw %%mm7, %%mm4 \n\t"
1547 
1548  "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
1549  "paddw %%mm5, %%mm1 \n\t" //'t12
1550 
1551  "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
1552  "psubw %%mm7, %%mm1 \n\t" //'t6
1553 
1554  "movq 2*8+%3, %%mm7 \n\t"
1555  "psubw %%mm5, %%mm0 \n\t" //'t10
1556 
1557  "movq 3*8+%3, %%mm6 \n\t"
1558  "movq %%mm3, %%mm5 \n\t"
1559 
1560  "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
1561  "psubw %%mm1, %%mm5 \n\t"
1562 
1563  "psubw %%mm1, %%mm2 \n\t" //'t5
1564  "paddw %%mm1, %%mm3 \n\t"
1565 
1566  "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
1567  "movq %%mm7, %%mm4 \n\t"
1568 
1569  "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
1570  "psubw %%mm2, %%mm4 \n\t"
1571 
1572  "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
1573  "paddw %%mm2, %%mm7 \n\t"
1574 
1575  "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
1576  "paddw %%mm2, %%mm0 \n\t" //'t4
1577 
1578  // 't4 't6 't5 - - - - 't7
1579  "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
1580  "movq %%mm6, %%mm1 \n\t"
1581 
1582  "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
1583  "psubw %%mm0, %%mm1 \n\t"
1584 
1585  "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
1586  "paddw %%mm0, %%mm6 \n\t"
1587 
1588  "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
1589  "add $24, %%"REG_S" \n\t"
1590 
1591  "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
1592 
1593  "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
1594  "add $24, %%"REG_D" \n\t"
1595  "sub $2, %%"REG_c" \n\t"
1596  "jnz 1b \n\t"
1597  "5: \n\t"
1598 
1599  : "+S"(data), "+D"(output), "+c"(cnt), "=o"(temps)
1600  : "d"(thr_adr)
1601  : "%"REG_a
1602  );
1603 }
1604 
1605 #endif // HAVE_MMX
1606 
1607 #if !HAVE_MMX
1608 
1609 static void row_idct_c(int16_t* workspace,
1610  int16_t* output_adr, int output_stride, int cnt)
1611 {
1612  int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1613  int_simd16_t tmp10, tmp11, tmp12, tmp13;
1614  int_simd16_t z5, z10, z11, z12, z13;
1615  int16_t* outptr;
1616  int16_t* wsptr;
1617 
1618  cnt*=4;
1619  wsptr = workspace;
1620  outptr = output_adr;
1621  for (; cnt > 0; cnt--) {
1622  // Even part
1623  //Simd version reads 4x4 block and transposes it
1624  tmp10 = ( wsptr[2] + wsptr[3]);
1625  tmp11 = ( wsptr[2] - wsptr[3]);
1626 
1627  tmp13 = ( wsptr[0] + wsptr[1]);
1628  tmp12 = (MULTIPLY16H( wsptr[0] - wsptr[1], FIX_1_414213562_A)<<2) - tmp13;//this shift order to avoid overflow
1629 
1630  tmp0 = tmp10 + tmp13; //->temps
1631  tmp3 = tmp10 - tmp13; //->temps
1632  tmp1 = tmp11 + tmp12;
1633  tmp2 = tmp11 - tmp12;
1634 
1635  // Odd part
1636  //Also transpose, with previous:
1637  // ---- ---- ||||
1638  // ---- ---- idct ||||
1639  // ---- ---- ---> ||||
1640  // ---- ---- ||||
1641  z13 = wsptr[4] + wsptr[5];
1642  z10 = wsptr[4] - wsptr[5];
1643  z11 = wsptr[6] + wsptr[7];
1644  z12 = wsptr[6] - wsptr[7];
1645 
1646  tmp7 = z11 + z13;
1647  tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
1648 
1649  z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
1650  tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
1651  tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_
1652 
1653  tmp6 = (tmp12<<3) - tmp7;
1654  tmp5 = (tmp11<<3) - tmp6;
1655  tmp4 = (tmp10<<3) + tmp5;
1656 
1657  // Final output stage: descale and write column
1658  outptr[0*output_stride]+= DESCALE(tmp0 + tmp7, 3);
1659  outptr[1*output_stride]+= DESCALE(tmp1 + tmp6, 3);
1660  outptr[2*output_stride]+= DESCALE(tmp2 + tmp5, 3);
1661  outptr[3*output_stride]+= DESCALE(tmp3 - tmp4, 3);
1662  outptr[4*output_stride]+= DESCALE(tmp3 + tmp4, 3);
1663  outptr[5*output_stride]+= DESCALE(tmp2 - tmp5, 3);
1664  outptr[6*output_stride]+= DESCALE(tmp1 - tmp6, 3); //no += ?
1665  outptr[7*output_stride]+= DESCALE(tmp0 - tmp7, 3); //no += ?
1666  outptr++;
1667 
1668  wsptr += DCTSIZE; // advance pointer to next row
1669  }
1670 }
1671 
1672 #else /* HAVE_MMX */
1673 
/*
 * MMX implementation of the row IDCT pass (compiled when HAVE_MMX is set;
 * row_idct_c is the C variant compiled otherwise).
 *
 * Operands of the asm statement:
 *   REG_S  workspace   - read, advanced by DCTSIZE_S*2*4 bytes per iteration
 *   REG_D  output_adr  - read-modify-write, advanced by 8 bytes per iteration
 *   REG_c  cnt         - loop counter, decremented once per iteration
 *   REG_a  output_stride*sizeof(short), i.e. the output stride in bytes
 *   REG_d  scratch, set to 3*REG_a by the initial lea (declared as clobber)
 *   %3     temps[], scratch memory: slot 0 holds t0, slot 1 holds t3
 *
 * One iteration loads two 4x4 tiles of 16-bit words (four movq at byte
 * offsets DCTSIZE_S*k*2, then four more at the same offsets + DCTSIZE_S),
 * transposes them with punpck*, runs the butterfly, adds MM_DESCALE_RND,
 * shifts right by 3 and accumulates the results into eight output rows
 * (paddw from memory followed by movq back to the same address).
 *
 * The counter is tested only after the body (dec/jnz), so the loop body
 * always runs at least once; a cnt of 0 is not handled here.
 *
 * NOTE(review): the clobber list names only REG_d. The MMX registers that
 * are overwritten and the memory written through REG_D are not declared,
 * and there is no emms in this block - presumably the caller deals with
 * that; confirm before changing surrounding code.
 */
1674  static void row_idct_mmx (int16_t* workspace,
1675  int16_t* output_adr, int output_stride, int cnt)
1676 {
1677  uint64_t __attribute__((aligned(8))) temps[4];
1678  __asm__ volatile(
 //REG_d = REG_a + 2*REG_a = three output rows, in bytes
1679  "lea (%%"REG_a",%%"REG_a",2), %%"REG_d" \n\t"
 //loop head: one pass per unit of cnt
1680  "1: \n\t"
1681  "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm0 \n\t"
1682  //
1683 
1684  "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm1 \n\t"
1685  "movq %%mm0, %%mm4 \n\t"
1686 
1687  "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
1688  "punpcklwd %%mm1, %%mm0 \n\t"
1689 
1690  "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm3 \n\t"
1691  "punpckhwd %%mm1, %%mm4 \n\t"
1692 
1693  //transpose 4x4
1694  "movq %%mm2, %%mm7 \n\t"
1695  "punpcklwd %%mm3, %%mm2 \n\t"
1696 
1697  "movq %%mm0, %%mm6 \n\t"
1698  "punpckldq %%mm2, %%mm0 \n\t" //0
1699 
1700  "punpckhdq %%mm2, %%mm6 \n\t" //1
1701  "movq %%mm0, %%mm5 \n\t"
1702 
1703  "punpckhwd %%mm3, %%mm7 \n\t"
1704  "psubw %%mm6, %%mm0 \n\t"
1705 
1706  "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm0 \n\t"
1707  "movq %%mm4, %%mm2 \n\t"
1708 
1709  "punpckldq %%mm7, %%mm4 \n\t" //2
1710  "paddw %%mm6, %%mm5 \n\t"
1711 
1712  "punpckhdq %%mm7, %%mm2 \n\t" //3
1713  "movq %%mm4, %%mm1 \n\t"
1714 
1715  "psllw $2, %%mm0 \n\t"
1716  "paddw %%mm2, %%mm4 \n\t" //t10
1717 
1718  "movq "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_S"), %%mm3 \n\t"
1719  "psubw %%mm2, %%mm1 \n\t" //t11
1720 
1721  "movq "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_S"), %%mm2 \n\t"
1722  "psubw %%mm5, %%mm0 \n\t"
1723 
1724  "movq %%mm4, %%mm6 \n\t"
1725  "paddw %%mm5, %%mm4 \n\t" //t0
1726 
1727  "psubw %%mm5, %%mm6 \n\t" //t3
1728  "movq %%mm1, %%mm7 \n\t"
1729 
1730  "movq "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_S"), %%mm5 \n\t"
1731  "paddw %%mm0, %%mm1 \n\t" //t1
1732 
1733  "movq %%mm4, 0*8+%3 \n\t" //t0
1734  "movq %%mm3, %%mm4 \n\t"
1735 
1736  "movq %%mm6, 1*8+%3 \n\t" //t3
1737  "punpcklwd %%mm2, %%mm3 \n\t"
1738 
1739  //transpose 4x4
1740  "movq "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_S"), %%mm6 \n\t"
1741  "punpckhwd %%mm2, %%mm4 \n\t"
1742 
1743  "movq %%mm5, %%mm2 \n\t"
1744  "punpcklwd %%mm6, %%mm5 \n\t"
1745 
1746  "psubw %%mm0, %%mm7 \n\t" //t2
1747  "punpckhwd %%mm6, %%mm2 \n\t"
1748 
1749  "movq %%mm3, %%mm0 \n\t"
1750  "punpckldq %%mm5, %%mm3 \n\t" //4
1751 
1752  "punpckhdq %%mm5, %%mm0 \n\t" //5
1753  "movq %%mm4, %%mm5 \n\t"
1754 
1755  //
1756  "movq %%mm3, %%mm6 \n\t"
1757  "punpckldq %%mm2, %%mm4 \n\t" //6
1758 
1759  "psubw %%mm0, %%mm3 \n\t" //z10
1760  "punpckhdq %%mm2, %%mm5 \n\t" //7
1761 
1762  "paddw %%mm0, %%mm6 \n\t" //z13
1763  "movq %%mm4, %%mm2 \n\t"
1764 
1765  "movq %%mm3, %%mm0 \n\t"
1766  "psubw %%mm5, %%mm4 \n\t" //z12
1767 
1768  "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm0 \n\t" //-
1769  "paddw %%mm4, %%mm3 \n\t"
1770 
1771  "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm3 \n\t" //z5
1772  "paddw %%mm5, %%mm2 \n\t" //z11 >
1773 
1774  "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm4 \n\t"
1775  "movq %%mm2, %%mm5 \n\t"
1776 
1777  "psubw %%mm6, %%mm2 \n\t"
1778  "paddw %%mm6, %%mm5 \n\t" //t7
1779 
1780  "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //t11
1781  "paddw %%mm3, %%mm0 \n\t" //t12
1782 
1783  "psllw $3, %%mm0 \n\t"
1784  "psubw %%mm3, %%mm4 \n\t" //t10
1785 
1786  "movq 0*8+%3, %%mm6 \n\t"
1787  "movq %%mm1, %%mm3 \n\t"
1788 
1789  "psllw $3, %%mm4 \n\t"
1790  "psubw %%mm5, %%mm0 \n\t" //t6
1791 
1792  "psllw $3, %%mm2 \n\t"
1793  "paddw %%mm0, %%mm1 \n\t" //d1
1794 
1795  "psubw %%mm0, %%mm2 \n\t" //t5
1796  "psubw %%mm0, %%mm3 \n\t" //d6
1797 
1798  "paddw %%mm2, %%mm4 \n\t" //t4
1799  "movq %%mm7, %%mm0 \n\t"
1800 
1801  "paddw %%mm2, %%mm7 \n\t" //d2
1802  "psubw %%mm2, %%mm0 \n\t" //d5
1803 
 //mm2 now holds the rounding constant added before every psraw $3 below
1804  "movq "MANGLE(MM_DESCALE_RND)", %%mm2 \n\t" //4
1805  "psubw %%mm5, %%mm6 \n\t" //d7
1806 
1807  "paddw 0*8+%3, %%mm5 \n\t" //d0
1808  "paddw %%mm2, %%mm1 \n\t"
1809 
1810  "paddw %%mm2, %%mm5 \n\t"
1811  "psraw $3, %%mm1 \n\t"
1812 
1813  "paddw %%mm2, %%mm7 \n\t"
1814  "psraw $3, %%mm5 \n\t"
1815 
1816  "paddw (%%"REG_D"), %%mm5 \n\t"
1817  "psraw $3, %%mm7 \n\t"
1818 
1819  "paddw (%%"REG_D",%%"REG_a",), %%mm1 \n\t"
1820  "paddw %%mm2, %%mm0 \n\t"
1821 
1822  "paddw (%%"REG_D",%%"REG_a",2), %%mm7 \n\t"
1823  "paddw %%mm2, %%mm3 \n\t"
1824 
1825  "movq %%mm5, (%%"REG_D") \n\t"
1826  "paddw %%mm2, %%mm6 \n\t"
1827 
1828  "movq %%mm1, (%%"REG_D",%%"REG_a",) \n\t"
1829  "psraw $3, %%mm0 \n\t"
1830 
1831  "movq %%mm7, (%%"REG_D",%%"REG_a",2) \n\t"
 //REG_D temporarily moves down three rows; it is restored by the sub below
1832  "add %%"REG_d", %%"REG_D" \n\t" //3*ls
1833 
1834  "movq 1*8+%3, %%mm5 \n\t" //t3
1835  "psraw $3, %%mm3 \n\t"
1836 
1837  "paddw (%%"REG_D",%%"REG_a",2), %%mm0 \n\t"
1838  "psubw %%mm4, %%mm5 \n\t" //d3
1839 
1840  "paddw (%%"REG_D",%%"REG_d",), %%mm3 \n\t"
1841  "psraw $3, %%mm6 \n\t"
1842 
1843  "paddw 1*8+%3, %%mm4 \n\t" //d4
1844  "paddw %%mm2, %%mm5 \n\t"
1845 
1846  "paddw (%%"REG_D",%%"REG_a",4), %%mm6 \n\t"
1847  "paddw %%mm2, %%mm4 \n\t"
1848 
1849  "movq %%mm0, (%%"REG_D",%%"REG_a",2) \n\t"
1850  "psraw $3, %%mm5 \n\t"
1851 
1852  "paddw (%%"REG_D"), %%mm5 \n\t"
1853  "psraw $3, %%mm4 \n\t"
1854 
1855  "paddw (%%"REG_D",%%"REG_a",), %%mm4 \n\t"
1856  "add $"DCTSIZE_S"*2*4, %%"REG_S" \n\t" //4 rows
1857 
1858  "movq %%mm3, (%%"REG_D",%%"REG_d",) \n\t"
1859  "movq %%mm6, (%%"REG_D",%%"REG_a",4) \n\t"
1860  "movq %%mm5, (%%"REG_D") \n\t"
1861  "movq %%mm4, (%%"REG_D",%%"REG_a",) \n\t"
1862 
 //undo the 3-row offset, then step 8 bytes (four 16-bit outputs) to the right
1863  "sub %%"REG_d", %%"REG_D" \n\t"
1864  "add $8, %%"REG_D" \n\t"
1865  "dec %%"REG_c" \n\t"
1866  "jnz 1b \n\t"
1867 
 //outputs: pointers and counter are updated in place; %3 is the temps scratch
1868  : "+S"(workspace), "+D"(output_adr), "+c"(cnt), "=o"(temps)
 //input: output stride converted from elements to bytes
1869  : "a"(output_stride*sizeof(short))
1870  : "%"REG_d
1871  );
1872 }
1873 
1874 #endif // HAVE_MMX
1875 
1876 #if !HAVE_MMX
1877 
1878 static void row_fdct_c(int16_t *data, const uint8_t *pixels, int line_size, int cnt)
1879 {
1880  int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1881  int_simd16_t tmp10, tmp11, tmp12, tmp13;
1882  int_simd16_t z1, z2, z3, z4, z5, z11, z13;
1883  int16_t *dataptr;
1884 
1885  cnt*=4;
1886  // Pass 1: process rows.
1887 
1888  dataptr = data;
1889  for (; cnt > 0; cnt--) {
1890  tmp0 = pixels[line_size*0] + pixels[line_size*7];
1891  tmp7 = pixels[line_size*0] - pixels[line_size*7];
1892  tmp1 = pixels[line_size*1] + pixels[line_size*6];
1893  tmp6 = pixels[line_size*1] - pixels[line_size*6];
1894  tmp2 = pixels[line_size*2] + pixels[line_size*5];
1895  tmp5 = pixels[line_size*2] - pixels[line_size*5];
1896  tmp3 = pixels[line_size*3] + pixels[line_size*4];
1897  tmp4 = pixels[line_size*3] - pixels[line_size*4];
1898 
1899  // Even part
1900 
1901  tmp10 = tmp0 + tmp3;
1902  tmp13 = tmp0 - tmp3;
1903  tmp11 = tmp1 + tmp2;
1904  tmp12 = tmp1 - tmp2;
1905  //Even columns are written first, this leads to different order of columns
1906  //in column_fidct(), but they are processed independently, so all ok.
1907  //Later in the row_idct() columns readed at the same order.
1908  dataptr[2] = tmp10 + tmp11;
1909  dataptr[3] = tmp10 - tmp11;
1910 
1911  z1 = MULTIPLY16H((tmp12 + tmp13)<<2, FIX_0_707106781);
1912  dataptr[0] = tmp13 + z1;
1913  dataptr[1] = tmp13 - z1;
1914 
1915  // Odd part
1916 
1917  tmp10 = (tmp4 + tmp5) <<2;
1918  tmp11 = (tmp5 + tmp6) <<2;
1919  tmp12 = (tmp6 + tmp7) <<2;
1920 
1921  z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
1922  z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
1923  z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
1924  z3 = MULTIPLY16H(tmp11, FIX_0_707106781);
1925 
1926  z11 = tmp7 + z3;
1927  z13 = tmp7 - z3;
1928 
1929  dataptr[4] = z13 + z2;
1930  dataptr[5] = z13 - z2;
1931  dataptr[6] = z11 + z4;
1932  dataptr[7] = z11 - z4;
1933 
1934  pixels++; // advance pointer to next column
1935  dataptr += DCTSIZE;
1936  }
1937 }
1938 
1939 #else /* HAVE_MMX */
1940 
/*
 * MMX implementation of the row FDCT pass (compiled when HAVE_MMX is set;
 * row_fdct_c is the C variant compiled otherwise).
 *
 * Operands of the asm statement:
 *   REG_S  pixels    - read, advanced by 4 bytes per iteration
 *   REG_D  data      - written, advanced by DCTSIZE_S*2*4 bytes per iteration
 *   REG_c  cnt       - loop counter, decremented once per iteration
 *   REG_a  line_size - distance between pixel rows
 *   REG_d  scratch, set to 3*REG_a by the initial lea (declared as clobber)
 *   %3     temps[], scratch memory: slot 0 holds t7, slot 1 holds t6
 *
 * One iteration loads 4 bytes from each of eight pixel rows with movd,
 * widens them to 16-bit words (punpcklbw against the zeroed mm7), runs the
 * butterfly and stores two transposed 4x4 word tiles: four movq at byte
 * offsets DCTSIZE_S*k*2 and four more at the same offsets + DCTSIZE_S.
 *
 * The counter is tested only after the body (dec/jnz), so the loop body
 * always runs at least once; a cnt of 0 is not handled here.
 *
 * NOTE(review): line_size is passed as a plain int in the "a" constraint
 * but REG_a is used as an address index; on targets where REG_a is wider
 * than int this relies on how the compiler fills the upper half - confirm.
 * NOTE(review): the clobber list names only REG_d. The MMX registers that
 * are overwritten and the memory written through REG_D are not declared,
 * and there is no emms in this block - presumably the caller deals with
 * that; confirm before changing surrounding code.
 */
1941  static void row_fdct_mmx(int16_t *data, const uint8_t *pixels, int line_size, int cnt)
1942 {
1943  uint64_t __attribute__((aligned(8))) temps[4];
1944  __asm__ volatile(
 //REG_d = REG_a + 2*REG_a = three pixel rows
1945  "lea (%%"REG_a",%%"REG_a",2), %%"REG_d" \n\t"
 //loop head: one pass per unit of cnt
1946  "6: \n\t"
1947  "movd (%%"REG_S"), %%mm0 \n\t"
 //mm7 = 0, used as the high half when widening bytes to words
1948  "pxor %%mm7, %%mm7 \n\t"
1949 
1950  "movd (%%"REG_S",%%"REG_a",), %%mm1 \n\t"
1951  "punpcklbw %%mm7, %%mm0 \n\t"
1952 
1953  "movd (%%"REG_S",%%"REG_a",2), %%mm2 \n\t"
1954  "punpcklbw %%mm7, %%mm1 \n\t"
1955 
1956  "punpcklbw %%mm7, %%mm2 \n\t"
 //REG_S temporarily moves down three rows; it is restored by the sub below
1957  "add %%"REG_d", %%"REG_S" \n\t"
1958 
1959  "movq %%mm0, %%mm5 \n\t"
1960  //
1961 
1962  "movd (%%"REG_S",%%"REG_a",4), %%mm3 \n\t" //7 ;prefetch!
1963  "movq %%mm1, %%mm6 \n\t"
1964 
1965  "movd (%%"REG_S",%%"REG_d",), %%mm4 \n\t" //6
1966  "punpcklbw %%mm7, %%mm3 \n\t"
1967 
1968  "psubw %%mm3, %%mm5 \n\t"
1969  "punpcklbw %%mm7, %%mm4 \n\t"
1970 
1971  "paddw %%mm3, %%mm0 \n\t"
1972  "psubw %%mm4, %%mm6 \n\t"
1973 
1974  "movd (%%"REG_S",%%"REG_a",2), %%mm3 \n\t" //5
1975  "paddw %%mm4, %%mm1 \n\t"
1976 
1977  "movq %%mm5, 0*8+%3 \n\t" //t7
1978  "punpcklbw %%mm7, %%mm3 \n\t"
1979 
1980  "movq %%mm6, 1*8+%3 \n\t" //t6
1981  "movq %%mm2, %%mm4 \n\t"
1982 
1983  "movd (%%"REG_S"), %%mm5 \n\t" //3
1984  "paddw %%mm3, %%mm2 \n\t"
1985 
1986  "movd (%%"REG_S",%%"REG_a",), %%mm6 \n\t" //4
1987  "punpcklbw %%mm7, %%mm5 \n\t"
1988 
1989  "psubw %%mm3, %%mm4 \n\t"
1990  "punpcklbw %%mm7, %%mm6 \n\t"
1991 
1992  "movq %%mm5, %%mm3 \n\t"
1993  "paddw %%mm6, %%mm5 \n\t" //t3
1994 
1995  "psubw %%mm6, %%mm3 \n\t" //t4 ; t0 t1 t2 t4 t5 t3 - -
1996  "movq %%mm0, %%mm6 \n\t"
1997 
1998  "movq %%mm1, %%mm7 \n\t"
1999  "psubw %%mm5, %%mm0 \n\t" //t13
2000 
2001  "psubw %%mm2, %%mm1 \n\t"
2002  "paddw %%mm2, %%mm7 \n\t" //t11
2003 
2004  "paddw %%mm0, %%mm1 \n\t"
2005  "movq %%mm7, %%mm2 \n\t"
2006 
2007  "psllw $2, %%mm1 \n\t"
2008  "paddw %%mm5, %%mm6 \n\t" //t10
2009 
2010  "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm1 \n\t"
2011  "paddw %%mm6, %%mm7 \n\t" //d2
2012 
2013  "psubw %%mm2, %%mm6 \n\t" //d3
2014  "movq %%mm0, %%mm5 \n\t"
2015 
2016  //transpose 4x4
2017  "movq %%mm7, %%mm2 \n\t"
2018  "punpcklwd %%mm6, %%mm7 \n\t"
2019 
2020  "paddw %%mm1, %%mm0 \n\t" //d0
2021  "punpckhwd %%mm6, %%mm2 \n\t"
2022 
2023  "psubw %%mm1, %%mm5 \n\t" //d1
2024  "movq %%mm0, %%mm6 \n\t"
2025 
2026  "movq 1*8+%3, %%mm1 \n\t"
2027  "punpcklwd %%mm5, %%mm0 \n\t"
2028 
2029  "punpckhwd %%mm5, %%mm6 \n\t"
2030  "movq %%mm0, %%mm5 \n\t"
2031 
2032  "punpckldq %%mm7, %%mm0 \n\t" //0
2033  "paddw %%mm4, %%mm3 \n\t"
2034 
2035  "punpckhdq %%mm7, %%mm5 \n\t" //1
2036  "movq %%mm6, %%mm7 \n\t"
2037 
2038  "movq %%mm0, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
2039  "punpckldq %%mm2, %%mm6 \n\t" //2
2040 
2041  "movq %%mm5, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
2042  "punpckhdq %%mm2, %%mm7 \n\t" //3
2043 
2044  "movq %%mm6, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
2045  "paddw %%mm1, %%mm4 \n\t"
2046 
2047  "movq %%mm7, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
2048  "psllw $2, %%mm3 \n\t" //t10
2049 
2050  "movq 0*8+%3, %%mm2 \n\t"
2051  "psllw $2, %%mm4 \n\t" //t11
2052 
2053  "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm4 \n\t" //z3
2054  "paddw %%mm2, %%mm1 \n\t"
2055 
2056  "psllw $2, %%mm1 \n\t" //t12
2057  "movq %%mm3, %%mm0 \n\t"
2058 
2059  "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm0 \n\t"
2060  "psubw %%mm1, %%mm3 \n\t"
2061 
2062  "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" //z5
2063  "movq %%mm2, %%mm5 \n\t"
2064 
2065  "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm1 \n\t"
2066  "psubw %%mm4, %%mm2 \n\t" //z13
2067 
2068  "paddw %%mm4, %%mm5 \n\t" //z11
2069  "movq %%mm2, %%mm6 \n\t"
2070 
2071  "paddw %%mm3, %%mm0 \n\t" //z2
2072  "movq %%mm5, %%mm7 \n\t"
2073 
2074  "paddw %%mm0, %%mm2 \n\t" //d4
2075  "psubw %%mm0, %%mm6 \n\t" //d5
2076 
2077  "movq %%mm2, %%mm4 \n\t"
2078  "paddw %%mm3, %%mm1 \n\t" //z4
2079 
2080  //transpose 4x4
2081  "punpcklwd %%mm6, %%mm2 \n\t"
2082  "paddw %%mm1, %%mm5 \n\t" //d6
2083 
2084  "punpckhwd %%mm6, %%mm4 \n\t"
2085  "psubw %%mm1, %%mm7 \n\t" //d7
2086 
2087  "movq %%mm5, %%mm6 \n\t"
2088  "punpcklwd %%mm7, %%mm5 \n\t"
2089 
2090  "punpckhwd %%mm7, %%mm6 \n\t"
2091  "movq %%mm2, %%mm7 \n\t"
2092 
2093  "punpckldq %%mm5, %%mm2 \n\t" //4
 //undo the 3-row offset applied to REG_S near the top of the loop
2094  "sub %%"REG_d", %%"REG_S" \n\t"
2095 
2096  "punpckhdq %%mm5, %%mm7 \n\t" //5
2097  "movq %%mm4, %%mm5 \n\t"
2098 
2099  "movq %%mm2, "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_D") \n\t"
2100  "punpckldq %%mm6, %%mm4 \n\t" //6
2101 
2102  "movq %%mm7, "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_D") \n\t"
2103  "punpckhdq %%mm6, %%mm5 \n\t" //7
2104 
2105  "movq %%mm4, "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_D") \n\t"
 //step four pixels to the right for the next iteration
2106  "add $4, %%"REG_S" \n\t"
2107 
2108  "movq %%mm5, "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_D") \n\t"
2109  "add $"DCTSIZE_S"*2*4, %%"REG_D" \n\t" //4 rows
2110  "dec %%"REG_c" \n\t"
2111  "jnz 6b \n\t"
2112 
 //outputs: pointers and counter are updated in place; %3 is the temps scratch
2113  : "+S"(pixels), "+D"(data), "+c"(cnt), "=o"(temps)
2114  : "a"(line_size)
2115  : "%"REG_d);
2116 }
2117 
2118 #endif // HAVE_MMX
static const int16_t FIX_0_707106781
Definition: vf_fspp.c:737
uint64_t threshold_mtx_noq[8 *2]
Definition: vf_fspp.c:90
void * av_mallocz(size_t size)
Allocate a block of size bytes with alignment suitable for all memory accesses (including vectors if available on the CPU) and zero all the bytes of the block.
Definition: mem.c:205
#define MANGLE(a)
unsigned int imgfmt
Definition: mp_image.h:130
static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
Definition: vf_fspp.c:749
static const int16_t FIX_1_306562965
Definition: vf_fspp.c:738
#define DESCALE(x, n)
Definition: vf_fspp.c:709
void(* get_image)(struct vf_instance *vf, mp_image_t *mpi)
Definition: vf.h:66
int qscale_type
Definition: mp_image.h:139
char * qscale
Definition: mp_image.h:135
const char * fmt
Definition: avisynth_c.h:669
#define VFCTRL_SET_PP_LEVEL
Definition: vf.h:99
memory handling functions
int stride
Definition: vf_ow.c:59
int(* control)(struct vf_instance *vf, int request, void *data)
Definition: vf.h:62
static int query_format(struct vf_instance *vf, unsigned int fmt)
Definition: vf_fspp.c:595
y1
Definition: lab5.m:33
int32_t int_simd16_t
Definition: vf_fspp.c:734
#define DECLARE_ALIGNED(n, t, v)
Definition: mem.h:59
mp_image_t * ff_vf_get_image(vf_instance_t *vf, unsigned int outfmt, int mp_imgtype, int mp_imgflag, int w, int h)
Definition: vf_mp.c:380
#define MP_IMGFLAG_PRESERVE
Definition: mp_image.h:46
#define IMGFMT_YVU9
Definition: img_format.h:117
static void store_slice2_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
Definition: vf_fspp.c:135
int temp_stride
Definition: vf_fspp.c:94
#define FIX(x, s)
Definition: vf_fspp.c:703
#define IMGFMT_YV12
Definition: img_format.h:119
static const short custom_threshold[64]
Definition: vf_fspp.c:64
static int config(struct vf_instance *vf, int width, int height, int d_width, int d_height, unsigned int flags, unsigned int outfmt)
Definition: vf_fspp.c:490
#define CONTROL_TRUE
Definition: mpc_info.h:37
void(* uninit)(struct vf_instance *vf)
Definition: vf.h:74
#define BLOCKSZ
Definition: vf_fspp.c:62
#define row_idct_s
Definition: vf_fspp.c:178
#define IMGFMT_Y8
Definition: img_format.h:124
output residual component w
int ff_vf_next_config(struct vf_instance *vf, int width, int height, int d_width, int d_height, unsigned int flags, unsigned int outfmt)
Definition: vf_mp.c:584
set threshold d
int width
Definition: mp_image.h:131
int(* put_image)(struct vf_instance *vf, mp_image_t *mpi, double pts)
Definition: vf.h:68
uint8_t
#define THRESHOLD(r, x, t)
Definition: vf_fspp.c:708
int mpeg2
Definition: vf_fspp.c:96
#define IMGFMT_CLPL
Definition: img_format.h:122
static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts)
Definition: vf_fspp.c:522
end end
Definition: vf.h:31
unsigned char * planes[MP_MAX_PLANES]
Definition: mp_image.h:133
int stride[MP_MAX_PLANES]
Definition: mp_image.h:134
int pict_type
Definition: mp_image.h:137
static void store_slice_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
Definition: vf_fspp.c:108
int width
Definition: vf_fil.c:32
#define IMGFMT_IF09
Definition: img_format.h:118
int qstride
Definition: mp_image.h:136
static void row_idct_c(int16_t *workspace, int16_t *output_adr, int output_stride, int cnt)
Definition: vf_fspp.c:1609
Discrete Time axis x
int hasMMX2
Definition: cpudetect.h:34
#define column_fidct_s
Definition: vf_fspp.c:177
#define mul_thrmat_s
Definition: vf_fspp.c:176
int16_t * temp
Definition: vf_fspp.c:99
void av_free(void *ptr)
Free a memory block which has been allocated with av_malloc(z)() or av_realloc(). ...
Definition: mem.c:183
#define FIX64(x, s)
Definition: vf_fspp.c:705
#define MP_IMGTYPE_TEMP
Definition: mp_image.h:104
int(* query_format)(struct vf_instance *vf, unsigned int fmt)
Definition: vf.h:64
int qp_stride
Definition: vf_qp.c:43
int qp
Definition: vf_fspp.c:95
Spectrum Plot time data
int ff_vf_next_control(struct vf_instance *vf, int request, void *data)
Definition: vf_mp.c:613
#define IMGFMT_IYUV
Definition: img_format.h:121
uint64_t threshold_mtx[8 *2]
Definition: vf_fspp.c:91
static void row_fdct_c(int16_t *data, const uint8_t *pixels, int line_size, int cnt)
Definition: vf_fspp.c:1878
init variable d2
external API header
uint8_t * src
Definition: vf_fspp.c:98
static void filter(struct vf_priv_s *p, uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int width, int height, uint8_t *qp_store, int qp_stride, int is_luma)
Definition: vf_fspp.c:410
static int control(struct vf_instance *vf, int request, void *data)
Definition: vf_fspp.c:614
common internal API header
#define MP_IMGFLAG_PREFER_ALIGNED_STRIDE
Definition: mp_image.h:61
#define ASMALIGN(ZEROBITS)
Definition: mp_image.h:37
#define DCTSIZE
Definition: vf_fspp.c:700
int prev_q
Definition: vf_fspp.c:97
static void mul_thrmat_c(struct vf_priv_s *p, int q)
Definition: vf_fspp.c:161
t
Definition: genspecsines3.m:6
int32_t
static const int16_t FIX_2_613125930
Definition: vf_fspp.c:741
static const int16_t FIX_0_382683433
Definition: vf_fspp.c:735
void ff_init_avcodec(void)
Definition: vf_mp.c:287
int chroma_y_shift
Definition: mp_image.h:145
int hasMMX
Definition: cpudetect.h:33
void ff_vf_clone_mpi_attributes(mp_image_t *dst, mp_image_t *src)
Definition: vf_mp.c:293
#define VFCTRL_QUERY_MAX_PP_LEVEL
Definition: vf.h:98
struct vf_instance * next
Definition: vf.h:84
#define STORE2(pos)
#define STORE(pos)
NULL
Definition: eval.c:55
static const int16_t FIX_0_541196100
Definition: vf_fspp.c:736
static const int16_t FIX_1_414213562
Definition: vf_fspp.c:742
#define memcpy_pic(d, s, b, h, ds, ss)
Definition: fastmemcpy.h:62
void * av_malloc(size_t size)
Allocate a block of size bytes with alignment suitable for all memory accesses (including vectors if available on the CPU).
Definition: mem.c:73
int index
Definition: gxfenc.c:89
synthesis window for stochastic i
#define IMGFMT_Y800
Definition: img_format.h:123
const vf_info_t ff_vf_info_fspp
Definition: vf_fspp.c:683
static const uint8_t __attribute__((aligned(32)))
Definition: vf_fspp.c:78
#define IMGFMT_422P
Definition: img_format.h:131
#define store_slice_s
Definition: vf_fspp.c:174
int bframes
Definition: vf_fspp.c:100
int w
Definition: mp_image.h:132
int(* config)(struct vf_instance *vf, int width, int height, int d_width, int d_height, unsigned int flags, unsigned int outfmt)
Definition: vf.h:59
#define store_slice2_s
Definition: vf_fspp.c:175
static void uninit(struct vf_instance *vf)
Definition: vf_fspp.c:576
static void get_image(struct vf_instance *vf, mp_image_t *mpi)
Definition: vf_fspp.c:504
static int flags
Definition: cpu.c:23
static const int16_t FIX_1_847759065
Definition: vf_fspp.c:740
#define C64(x)
Definition: vf_fspp.c:704
int height
Definition: mp_image.h:131
static const int16_t FIX_1_414213562_A
Definition: vf_fspp.c:739
unsigned char type
Definition: mp_image.h:127
static int vf_open(vf_instance_t *vf, char *args)
Definition: vf_fspp.c:627
#define IMGFMT_444P
Definition: img_format.h:130
static const int16_t FIX_1_082392200
Definition: vf_fspp.c:743
#define row_fdct_s
Definition: vf_fspp.c:179
#define MP_IMGFLAG_PLANAR
Definition: mp_image.h:76
void * fast_memcpy(void *to, const void *from, size_t len)
these buffered frames must be flushed immediately if a new input produces new output (Example: frame rate-doubling filter: filter_frame must (1) flush the second copy of the previous frame, if it is still there, (2) push the first copy of the incoming frame, (3) keep the second copy for later.) If the input frame is not enough to produce output
function y
Definition: D.m:1
int ff_vf_next_put_image(struct vf_instance *vf, mp_image_t *mpi, double pts)
Definition: vf_mp.c:539
DSP utils.
int log2_count
Definition: vf_fspp.c:93
char * non_b_qp
Definition: vf_fspp.c:101
#define DCTSIZE_S
Definition: vf_fspp.c:701
DECLARE_ASM_CONST(8, int, deringThreshold)
else dst[i][x+y *dst_stride[i]]
Definition: vf_mcdeint.c:160
int chroma_x_shift
Definition: mp_image.h:144
CpuCaps ff_gCpuCaps
Definition: vf_mp.c:193
#define MP_IMGFLAG_ACCEPT_STRIDE
Definition: mp_image.h:63
struct vf_priv_s * priv
Definition: vf.h:86
#define MP_IMGFLAG_DIRECT
Definition: mp_image.h:91
static int norm_qscale(int qscale, int type)
Definition: vf.h:154
#define IMGFMT_I420
Definition: img_format.h:120
int h
Definition: mp_image.h:132
int ff_vf_next_query_format(struct vf_instance *vf, unsigned int fmt)
Definition: vf_mp.c:371
Definition: vf.h:56
unsigned int flags
Definition: mp_image.h:126
int height
Definition: vf_fil.c:31
#define IMGFMT_411P
Definition: img_format.h:132
mp_image_t * dmpi
Definition: vf.h:85
MUSIC TECHNOLOGY GROUP UNIVERSITAT POMPEU FABRA Free Non Commercial Binary License Agreement UNIVERSITAT POMPEU OR INDICATING ACCEPTANCE BY SELECTING THE ACCEPT BUTTON ON ANY DOWNLOAD OR INSTALL YOU ACCEPT THE TERMS OF THE LICENSE SUMMARY TABLE Software MELODIA Melody Extraction vamp plug in Licensor Music Technology Group Universitat Pompeu Plaça de la Spain Permitted purposes Non commercial internal research and validation and educational purposes only All commercial uses in a production either internal or are prohibited by this license and require an additional commercial exploitation license TERMS AND CONDITIONS SOFTWARE Software means the software programs identified herein in binary any other machine readable any updates or error corrections provided by and any user programming guides and other documentation provided to you by UPF under this Agreement LICENSE Subject to the terms and conditions of this UPF grants you a royalty free
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But a word about which is also called distortion Distortion can be quantified by almost any quality measurement one chooses the sum of squared differences is used but more complex methods that consider psychovisual effects can be used as well It makes no difference in this discussion First step
#define MULTIPLY16H(x, k)
Definition: vf_fspp.c:707