;******************************************************************************
;* VP8 MMXEXT optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

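; VP8 subpel MC filter coefficients (see the spec's subpel_filters table).
; Odd fractional positions (1/3/5/7) only need the 4 nonzero center taps,
; even positions (2/4/6) need all 6 taps. The _hw tables store the taps as
; word pairs for pmaddwd, the _hb tables as interleaved byte pairs for
; pmaddubsw, and the _v tables splat each tap across a register for pmullw.
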
fourtap_filter_hw_m: times 4 dw -6, 123
                     times 4 dw 12, -1
                     times 4 dw -9, 93
                     times 4 dw 50, -6
                     times 4 dw -6, 50
                     times 4 dw 93, -9
                     times 4 dw -1, 12
                     times 4 dw 123, -6

sixtap_filter_hw_m: times 4 dw 2, -11
                    times 4 dw 108, 36
                    times 4 dw -8, 1
                    times 4 dw 3, -16
                    times 4 dw 77, 77
                    times 4 dw -16, 3
                    times 4 dw 1, -8
                    times 4 dw 36, 108
                    times 4 dw -11, 2

fourtap_filter_hb_m: times 8 db -6, 123
                     times 8 db 12, -1
                     times 8 db -9, 93
                     times 8 db 50, -6
                     times 8 db -6, 50
                     times 8 db 93, -9
                     times 8 db -1, 12
                     times 8 db 123, -6

sixtap_filter_hb_m: times 8 db 2, 1
                    times 8 db -11, 108
                    times 8 db 36, -8
                    times 8 db 3, 3
                    times 8 db -16, 77
                    times 8 db 77, -16
                    times 8 db 1, 2
                    times 8 db -8, 36
                    times 8 db 108, -11

fourtap_filter_v_m: times 8 dw -6
                    times 8 dw 123
                    times 8 dw 12
                    times 8 dw -1
                    times 8 dw -9
                    times 8 dw 93
                    times 8 dw 50
                    times 8 dw -6
                    times 8 dw -6
                    times 8 dw 50
                    times 8 dw 93
                    times 8 dw -9
                    times 8 dw -1
                    times 8 dw 12
                    times 8 dw 123
                    times 8 dw -6

sixtap_filter_v_m: times 8 dw 2
                   times 8 dw -11
                   times 8 dw 108
                   times 8 dw 36
                   times 8 dw -8
                   times 8 dw 1
                   times 8 dw 3
                   times 8 dw -16
                   times 8 dw 77
                   times 8 dw 77
                   times 8 dw -16
                   times 8 dw 3
                   times 8 dw 1
                   times 8 dw -8
                   times 8 dw 36
                   times 8 dw 108
                   times 8 dw -11
                   times 8 dw 2

bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7

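; The bilinear tables hold the weight pairs (8-frac, frac) for fractional
; positions frac=1..7; the word table stores only frac, the matching 8-frac
; weight is fetched by negating the table index. Sums are scaled with
; psraw 2 followed by pavgw against zero, which together compute (x+4)>>3.
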
%ifdef PIC
%define fourtap_filter_hw  picregq
%define sixtap_filter_hw   picregq
%define fourtap_filter_hb  picregq
%define sixtap_filter_hb   picregq
%define fourtap_filter_v   picregq
%define sixtap_filter_v    picregq
%define bilinear_filter_vw picregq
%define bilinear_filter_vb picregq
%define npicregs 1
%else
%define fourtap_filter_hw  fourtap_filter_hw_m
%define sixtap_filter_hw   sixtap_filter_hw_m
%define fourtap_filter_hb  fourtap_filter_hb_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define fourtap_filter_v   fourtap_filter_v_m
%define sixtap_filter_v    sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%define npicregs 0
%endif

filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10

filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11

pw_27:    times 8 dw 27
pw_63:    times 8 dw 63
pw_256:   times 8 dw 256
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

pb_4:     times 16 db 4
pb_F8:    times 16 db 0xF8
pb_FE:    times 16 db 0xFE
pb_27_63: times 8 db 27, 63
pb_18_63: times 8 db 18, 63
pb_9_63:  times 8 db 9, 63

cextern pb_1
cextern pw_3
cextern pb_3
cextern pw_4
cextern pw_9
cextern pw_18
cextern pw_64
cextern pb_80

SECTION .text

;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
;                                              uint8_t *src, int srcstride,
;                                              int height, int mx, int my);
;-----------------------------------------------------------------------------
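; mx/my select the fractional position (1..7, in eighths of a pel); full-pel
; copies go through put_vp8_pixels* below. As a reference, the 6-tap
; horizontal filter computes per row (a C sketch; F[] are the six taps of
; the selected filter, which sum to 128):
;
;   for (x = 0; x < size; x++)
;       dst[x] = av_clip_uint8((F[0] * src[x-2] + F[1] * src[x-1] +
;                               F[2] * src[x+0] + F[3] * src[x+1] +
;                               F[4] * src[x+2] + F[5] * src[x+3] + 64) >> 7);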

%macro FILTER_SSSE3 1
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
    lea       mxd, [mxq*3]
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
%ifdef PIC
    lea       picregq, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+mxq*8-32]
    mova      m7, [sixtap_filter_hb+mxq*8-16]

.nextrow:
    movu      m0, [srcq-2]
    mova      m1, m0
    mova      m2, m0
%if mmsize == 8
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw m0, [srcq+3]
%else
    pshufb    m0, [filter_h6_shuf1]
%endif
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
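    ; pmulhrsw with 256 computes ((x*256 >> 14) + 1) >> 1 == (x + 64) >> 7,
    ; i.e. the VP8 rounding, without needing a separate add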
    pmulhrsw  m0, [pw_256]
    packuswb  m0, m0
    movh      [dstq], m0        ; store

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd           ; next row
    jg        .nextrow
    REP_RET

cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl       mxd, 4
    mova      m2, [pw_256]
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
%ifdef PIC
    lea       picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+mxq]

.nextrow:
    movu      m0, [srcq-1]
    mova      m1, m0
    pshufb    m0, m3
    pshufb    m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m1
    pmulhrsw  m0, m2
    packuswb  m0, m0
    movh      [dstq], m0        ; store

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd           ; next row
    jg        .nextrow
    REP_RET

cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl       myd, 4
%ifdef PIC
    lea       picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+myq-16]
    mova      m6, [fourtap_filter_hb+myq]
    mova      m7, [pw_256]

    ; read 3 lines
    sub       srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+ srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add       srcq, srcstrideq

.nextrow:
    movh      m3, [srcq+2*srcstrideq] ; read new row
    mova      m4, m0
    mova      m0, m1
    punpcklbw m4, m1
    mova      m1, m2
    punpcklbw m2, m3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    mova      m2, m3
    pmulhrsw  m4, m7
    packuswb  m4, m4
    movh      [dstq], m4

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd           ; next row
    jg        .nextrow
    REP_RET

cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    lea       myd, [myq*3]
%ifdef PIC
    lea       picregq, [sixtap_filter_hb_m]
%endif
    lea       myq, [sixtap_filter_hb+myq*8]

    ; read 5 lines
    sub       srcq, srcstrideq
    sub       srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea       srcq, [srcq+srcstrideq*2]
    add       srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]

.nextrow:
    movh      m5, [srcq+2*srcstrideq] ; read new row
    mova      m6, m0
    punpcklbw m6, m5
    mova      m0, m1
    punpcklbw m1, m2
    mova      m7, m3
    punpcklbw m7, m4
    pmaddubsw m6, [myq-48]
    pmaddubsw m1, [myq-32]
    pmaddubsw m7, [myq-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2
    mova      m2, m3
    pmulhrsw  m6, [pw_256]
    mova      m3, m4
    packuswb  m6, m6
    mova      m4, m5
    movh      [dstq], m6

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd           ; next row
    jg        .nextrow
    REP_RET
%endmacro

INIT_MMX ssse3
FILTER_SSSE3 4
INIT_XMM ssse3
FILTER_SSSE3 8

; 4x4 block, H-only 4-tap filter
INIT_MMX mmxext
cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    shl       mxd, 4
%ifdef PIC
    lea       picregq, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+mxq]
    movq      mm7, [pw_64]
    pxor      mm6, mm6

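    ; strategy: pmaddwd multiplies two adjacent pixels by one coefficient
    ; pair at a time; the pshufw immediates 0x94 (word order 0,1,1,2) and
    ; 0xE9 (1,2,2,3, used by the 6-tap version below) build the overlapping
    ; word pairs it consumes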
.nextrow:
    movq      mm1, [srcq-1]     ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1          ; byte ABCD..
    punpcklbw mm1, mm6          ; byte->word ABCD
    pshufw    mm0, mm2, 0x9     ; byte CDEF..
    punpcklbw mm0, mm6          ; byte->word CDEF
    pshufw    mm3, mm1, 0x94    ; word ABBC
    pshufw    mm1, mm0, 0x94    ; word CDDE
    pmaddwd   mm3, mm4          ; multiply 2px with F0/F1
    movq      mm0, mm1          ; backup for second set of pixels
    pmaddwd   mm1, mm5          ; multiply 2px with F2/F3
    paddd     mm3, mm1          ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6          ; byte->word EFGH
    pmaddwd   mm0, mm4          ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94    ; word EFFG
    pmaddwd   mm1, mm5          ; multiply 2px with F2/F3
    paddd     mm0, mm1          ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0          ; merge dword->word (4px)
    paddsw    mm3, mm7          ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6          ; clip and word->bytes
    movd      [dstq], mm3       ; store

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd           ; next row
    jg        .nextrow
    REP_RET

; 4x4 block, H-only 6-tap filter
INIT_MMX mmxext
cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    lea       mxd, [mxq*3]
%ifdef PIC
    lea       picregq, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+mxq*8-32]
    movq      mm6, [sixtap_filter_hw+mxq*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [srcq-2]     ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1          ; byte ABCD..
    punpcklbw mm1, mm3          ; byte->word ABCD
    pshufw    mm0, mm2, 0x9     ; byte CDEF..
    punpckhbw mm2, mm3          ; byte->word EFGH
    punpcklbw mm0, mm3          ; byte->word CDEF
    pshufw    mm1, mm1, 0x94    ; word ABBC
    pshufw    mm2, mm2, 0x94    ; word EFFG
    pmaddwd   mm1, mm4          ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94    ; word CDDE
    movq      mm0, mm3          ; backup for second set of pixels
    pmaddwd   mm3, mm5          ; multiply 2px with F2/F3
    paddd     mm1, mm3          ; add to 1st 2px cache
    movq      mm3, mm2          ; backup for second set of pixels
    pmaddwd   mm2, mm6          ; multiply 2px with F4/F5
    paddd     mm1, mm2          ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [srcq+3]     ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4          ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5          ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3          ; add to 2nd 2px cache
    pxor      mm3, mm3
    punpcklbw mm2, mm3          ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9    ; word GHHI
    pmaddwd   mm2, mm6          ; multiply 2px with F4/F5
    paddd     mm0, mm2          ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0          ; merge dword->word (4px)
    paddsw    mm1, mm7          ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3          ; clip and word->bytes
    movd      [dstq], mm1       ; store

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd           ; next row
    jg        .nextrow
    REP_RET

INIT_XMM sse2
cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
    shl       mxd, 5
%ifdef PIC
    lea       picregq, [fourtap_filter_v_m]
%endif
    lea       mxq, [fourtap_filter_v+mxq-32]
    pxor      m7, m7
    mova      m4, [pw_64]
    mova      m5, [mxq+ 0]
    mova      m6, [mxq+16]
%ifdef m8
    mova      m8, [mxq+32]
    mova      m9, [mxq+48]
%endif
.nextrow:
    movq      m0, [srcq-1]
    movq      m1, [srcq-0]
    movq      m2, [srcq+1]
    movq      m3, [srcq+2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, m5
    pmullw    m1, m6
%ifdef m8
    pmullw    m2, m8
    pmullw    m3, m9
%else
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
%endif
    paddsw    m0, m1
    paddsw    m2, m3
    paddsw    m0, m2
    paddsw    m0, m4
    psraw     m0, 7
    packuswb  m0, m7
    movh      [dstq], m0        ; store

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd           ; next row
    jg        .nextrow
    REP_RET

INIT_XMM sse2
cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
    lea       mxd, [mxq*3]
    shl       mxd, 4
%ifdef PIC
    lea       picregq, [sixtap_filter_v_m]
%endif
    lea       mxq, [sixtap_filter_v+mxq-96]
    pxor      m7, m7
    mova      m6, [pw_64]
%ifdef m8
    mova      m8, [mxq+ 0]
    mova      m9, [mxq+16]
    mova      m10, [mxq+32]
    mova      m11, [mxq+48]
    mova      m12, [mxq+64]
    mova      m13, [mxq+80]
%endif
.nextrow:
    movq      m0, [srcq-2]
    movq      m1, [srcq-1]
    movq      m2, [srcq-0]
    movq      m3, [srcq+1]
    movq      m4, [srcq+2]
    movq      m5, [srcq+3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    punpcklbw m5, m7
%ifdef m8
    pmullw    m0, m8
    pmullw    m1, m9
    pmullw    m2, m10
    pmullw    m3, m11
    pmullw    m4, m12
    pmullw    m5, m13
%else
    pmullw    m0, [mxq+ 0]
    pmullw    m1, [mxq+16]
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
    pmullw    m4, [mxq+64]
    pmullw    m5, [mxq+80]
%endif
    paddsw    m1, m4
    paddsw    m0, m5
    paddsw    m1, m2
    paddsw    m0, m3
    paddsw    m0, m1
    paddsw    m0, m6
    psraw     m0, 7
    packuswb  m0, m7
    movh      [dstq], m0        ; store

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd           ; next row
    jg        .nextrow
    REP_RET

%macro FILTER_V 1
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl       myd, 5
%ifdef PIC
    lea       picregq, [fourtap_filter_v_m]
%endif
    lea       myq, [fourtap_filter_v+myq-32]
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [myq+48]

    ; read 3 lines
    sub       srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+ srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add       srcq, srcstrideq
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [srcq+2*srcstrideq] ; read new row
    punpcklbw m4, m7
    mova      m3, m4
    pmullw    m0, [myq+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1
    pmullw    m1, [myq+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh      [dstq], m4

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd           ; next row
    jg        .nextrow
    REP_RET


; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl       myd, 4
    lea       myq, [myq*3]
%ifdef PIC
    lea       picregq, [sixtap_filter_v_m]
%endif
    lea       myq, [sixtap_filter_v+myq-96]
    pxor      m7, m7

    ; read 5 lines
    sub       srcq, srcstrideq
    sub       srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea       srcq, [srcq+srcstrideq*2]
    add       srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [myq+16]
    mova      m6, m4
    pmullw    m6, [myq+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [srcq+2*srcstrideq] ; read new row
    punpcklbw m5, m7
    pmullw    m0, [myq+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [myq+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [myq+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh      [dstq], m6

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd           ; next row
    jg        .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
FILTER_V 4
INIT_XMM sse2
FILTER_V 8

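; Bilinear MC uses the (8-frac, frac) weights set up in the tables above.
; A C sketch of the vertical case (the horizontal one is identical with
; src[x] and src[x+1] as inputs):
;
;   dst[x] = (src[x] * (8 - my) + src[x + srcstride] * my + 4) >> 3;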
%macro FILTER_BILINEAR 1
cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
    shl       myd, 4
%ifdef PIC
    lea       picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+myq-1*16]
    neg       myq
    mova      m4, [bilinear_filter_vw+myq+7*16]
.nextrow:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m3, [srcq+srcstrideq*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh      [dstq+dststrideq*0], m0
    movh      [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2
    movh      [dstq+dststrideq*0], m0
    movhps    [dstq+dststrideq*1], m0
%endif

    lea       dstq, [dstq+dststrideq*2]
    lea       srcq, [srcq+srcstrideq*2]
    sub       heightd, 2
    jg        .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl       mxd, 4
%ifdef PIC
    lea       picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+mxq-1*16]
    neg       mxq
    mova      m4, [bilinear_filter_vw+mxq+7*16]
.nextrow:
    movh      m0, [srcq+srcstrideq*0+0]
    movh      m1, [srcq+srcstrideq*0+1]
    movh      m2, [srcq+srcstrideq*1+0]
    movh      m3, [srcq+srcstrideq*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh      [dstq+dststrideq*0], m0
    movh      [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2
    movh      [dstq+dststrideq*0], m0
    movhps    [dstq+dststrideq*1], m0
%endif

    lea       dstq, [dstq+dststrideq*2]
    lea       srcq, [srcq+srcstrideq*2]
    sub       heightd, 2
    jg        .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
FILTER_BILINEAR 4
INIT_XMM sse2
FILTER_BILINEAR 8

%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
    shl       myd, 4
%ifdef PIC
    lea       picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+myq-16]
.nextrow:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m2, [srcq+srcstrideq*2]
    punpcklbw m0, m1
    punpcklbw m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m1, m1
    movh      [dstq+dststrideq*0], m0
    movh      [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1
    movh      [dstq+dststrideq*0], m0
    movhps    [dstq+dststrideq*1], m0
%endif

    lea       dstq, [dstq+dststrideq*2]
    lea       srcq, [srcq+srcstrideq*2]
    sub       heightd, 2
    jg        .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
    shl       mxd, 4
%ifdef PIC
    lea       picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+mxq-16]
.nextrow:
    movu      m0, [srcq+srcstrideq*0]
    movu      m1, [srcq+srcstrideq*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m1, m1
    movh      [dstq+dststrideq*0], m0
    movh      [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1
    movh      [dstq+dststrideq*0], m0
    movhps    [dstq+dststrideq*1], m0
%endif

    lea       dstq, [dstq+dststrideq*2]
    lea       srcq, [srcq+srcstrideq*2]
    sub       heightd, 2
    jg        .nextrow
    REP_RET
%endmacro

INIT_MMX ssse3
FILTER_BILINEAR_SSSE3 4
INIT_XMM ssse3
FILTER_BILINEAR_SSSE3 8

INIT_MMX mmx
cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq      mm0, [srcq+srcstrideq*0]
    movq      mm1, [srcq+srcstrideq*1]
    lea       srcq, [srcq+srcstrideq*2]
    movq      [dstq+dststrideq*0], mm0
    movq      [dstq+dststrideq*1], mm1
    lea       dstq, [dstq+dststrideq*2]
    sub       heightd, 2
    jg        .nextrow
    REP_RET

%if ARCH_X86_32
INIT_MMX mmx
cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq      mm0, [srcq+srcstrideq*0+0]
    movq      mm1, [srcq+srcstrideq*0+8]
    movq      mm2, [srcq+srcstrideq*1+0]
    movq      mm3, [srcq+srcstrideq*1+8]
    lea       srcq, [srcq+srcstrideq*2]
    movq      [dstq+dststrideq*0+0], mm0
    movq      [dstq+dststrideq*0+8], mm1
    movq      [dstq+dststrideq*1+0], mm2
    movq      [dstq+dststrideq*1+8], mm3
    lea       dstq, [dstq+dststrideq*2]
    sub       heightd, 2
    jg        .nextrow
    REP_RET
%endif

INIT_XMM sse
cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
.nextrow:
    movups    xmm0, [srcq+srcstrideq*0]
    movups    xmm1, [srcq+srcstrideq*1]
    lea       srcq, [srcq+srcstrideq*2]
    movaps    [dstq+dststrideq*0], xmm0
    movaps    [dstq+dststrideq*1], xmm1
    lea       dstq, [dstq+dststrideq*2]
    sub       heightd, 2
    jg        .nextrow
    REP_RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
;-----------------------------------------------------------------------------

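; ADD_DC applies a replicated DC value without unpacking pixels to words:
; %1 holds packuswb(dc) (i.e. max(dc, 0)) and %2 holds packuswb(-dc)
; (i.e. max(-dc, 0)), so the paddusb/psubusb pair either adds a positive DC
; or subtracts its magnitude, with saturation; one of the two is a no-op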
%macro ADD_DC 4
    %4        m2, [dst1q+%3]
    %4        m3, [dst1q+strideq+%3]
    %4        m4, [dst2q+%3]
    %4        m5, [dst2q+strideq+%3]
    paddusb   m2, %1
    paddusb   m3, %1
    paddusb   m4, %1
    paddusb   m5, %1
    psubusb   m2, %2
    psubusb   m3, %2
    psubusb   m4, %2
    psubusb   m5, %2
    %4        [dst1q+%3], m2
    %4        [dst1q+strideq+%3], m3
    %4        [dst2q+%3], m4
    %4        [dst2q+strideq+%3], m5
%endmacro

INIT_MMX mmx
cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq]

    ; calculate DC
    paddw     m0, [pw_4]
    pxor      m1, m1
    psraw     m0, 3
    movd      [blockq], m1
    psubw     m1, m0
    packuswb  m0, m0
    packuswb  m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    punpcklwd m0, m0
    punpcklwd m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea       dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m1, 0, movh
    RET

INIT_XMM sse4
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
    ; load data
    movd      m0, [blockq]
    pxor      m1, m1

    ; calculate DC
    paddw     m0, [pw_4]
    movd      [blockq], m1
    DEFINE_ARGS dst1, dst2, stride
    lea       dst2q, [dst1q+strideq*2]
    movd      m2, [dst1q]
    movd      m3, [dst1q+strideq]
    movd      m4, [dst2q]
    movd      m5, [dst2q+strideq]
    psraw     m0, 3
    pshuflw   m0, m0, 0
    punpcklqdq m0, m0
    punpckldq m2, m3
    punpckldq m4, m5
    punpcklbw m2, m1
    punpcklbw m4, m1
    paddw     m2, m0
    paddw     m4, m0
    packuswb  m2, m4
    movd      [dst1q], m2
    pextrd    [dst1q+strideq], m2, 1
    pextrd    [dst2q], m2, 2
    pextrd    [dst2q+strideq], m2, 3
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
;-----------------------------------------------------------------------------

%if ARCH_X86_32
INIT_MMX mmx
cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1            ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd      [blockq+32*0], m6
    movd      [blockq+32*1], m6
    movd      [blockq+32*2], m6
    movd      [blockq+32*3], m6
    psraw     m0, 3
    psubw     m6, m0
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0            ; AABBCCDD
    punpcklbw m6, m6            ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0            ; AAAABBBB
    punpckhbw m1, m1            ; CCCCDDDD
    punpcklbw m6, m6            ; AAAABBBB
    punpckhbw m7, m7            ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea       dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m6, 0, mova
    ADD_DC    m1, m7, 8, mova
    RET
%endif

INIT_XMM sse2
cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1            ; A B C D
    pxor      m1, m1

    ; calculate DC
    paddw     m0, [pw_4]
    movd      [blockq+32*0], m1
    movd      [blockq+32*1], m1
    movd      [blockq+32*2], m1
    movd      [blockq+32*3], m1
    psraw     m0, 3
    psubw     m1, m0
    packuswb  m0, m0
    packuswb  m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea       dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m1, 0, mova
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
;-----------------------------------------------------------------------------

INIT_MMX mmx
cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1            ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd      [blockq+32*0], m6
    movd      [blockq+32*1], m6
    movd      [blockq+32*2], m6
    movd      [blockq+32*3], m6
    psraw     m0, 3
    psubw     m6, m0
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0            ; AABBCCDD
    punpcklbw m6, m6            ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0            ; AAAABBBB
    punpckhbw m1, m1            ; CCCCDDDD
    punpcklbw m6, m6            ; AAAABBBB
    punpckhbw m7, m7            ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea       dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m6, 0, mova
    lea       dst1q, [dst1q+strideq*4]
    lea       dst2q, [dst2q+strideq*4]
    ADD_DC    m1, m7, 0, mova
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
;-----------------------------------------------------------------------------

; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
; this macro assumes that m6/m7 have words for 20091/17734 loaded
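; (20091 = sqrt(2)*cos(pi/8)*65536 - 65536, so mul_20091(x) is computed as
; x + (x*20091 >> 16); 35468 = sqrt(2)*sin(pi/8)*65536 does not fit in a
; signed word, so the table stores 35468/2 = 17734 and the input is doubled
; with paddw before the pmulhw)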
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6            ;20091(1)
    pmulhw    %4, m6            ;20091(2)
    paddw     %3, %1
    paddw     %4, %2
    paddw     %1, %1
    paddw     %2, %2
    pmulhw    %1, m7            ;35468(1)
    pmulhw    %2, m7            ;35468(2)
    psubw     %1, %4
    paddw     %2, %3
%endmacro

; calculate x0=%1+%3; x1=%1-%3
; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
; %5/%6 are temporary registers
; we assume m6/m7 have constant words 20091/17734 loaded in them
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA w, %3, %1, %5     ;t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5, m%6 ;t2, t3
    SUMSUB_BA w, %4, %3, %5     ;tmp0, tmp3
    SUMSUB_BA w, %2, %1, %5     ;tmp1, tmp2
    SWAP      %4, %1
    SWAP      %4, %3
%endmacro

%macro VP8_IDCT_ADD 0
cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
    ; load block data
    movq      m0, [blockq+ 0]
    movq      m1, [blockq+ 8]
    movq      m2, [blockq+16]
    movq      m3, [blockq+24]
    movq      m6, [pw_20091]
    movq      m7, [pw_17734]
%if cpuflag(sse)
    xorps     xmm0, xmm0
    movaps    [blockq+ 0], xmm0
    movaps    [blockq+16], xmm0
%else
    pxor      m4, m4
    movq      [blockq+ 0], m4
    movq      [blockq+ 8], m4
    movq      [blockq+16], m4
    movq      [blockq+24], m4
%endif

    ; actual IDCT
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw     m0, [pw_4]
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W 0, 1, 2, 3, 4

    ; store
    pxor      m4, m4
    DEFINE_ARGS dst1, dst2, stride
    lea       dst2q, [dst1q+2*strideq]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq

    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
VP8_IDCT_ADD
%endif
INIT_MMX sse
VP8_IDCT_ADD

;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_mmxext(int16_t block[4][4][16], int16_t dc[16])
;-----------------------------------------------------------------------------
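; This is the inverse Walsh-Hadamard transform of the DC coefficients:
; dc[16] is transformed, rounded with (x + 3) >> 3, and each of the 16
; results is scattered to the DC position of one 4x4 block (blocks are
; 16 words = 32 bytes apart, hence the 2*16*n offsets in SCATTER_WHT)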

%macro SCATTER_WHT 3
    movd      dc1d, m%1
    movd      dc2d, m%2
    mov       [blockq+2*16*(0+%3)], dc1w
    mov       [blockq+2*16*(1+%3)], dc2w
    shr       dc1d, 16
    shr       dc2d, 16
    psrlq     m%1, 32
    psrlq     m%2, 32
    mov       [blockq+2*16*(4+%3)], dc1w
    mov       [blockq+2*16*(5+%3)], dc2w
    movd      dc1d, m%1
    movd      dc2d, m%2
    mov       [blockq+2*16*(8+%3)], dc1w
    mov       [blockq+2*16*(9+%3)], dc2w
    shr       dc1d, 16
    shr       dc2d, 16
    mov       [blockq+2*16*(12+%3)], dc1w
    mov       [blockq+2*16*(13+%3)], dc2w
%endmacro

%macro HADAMARD4_1D 4
    SUMSUB_BADC w, %2, %1, %4, %3
    SUMSUB_BADC w, %4, %2, %3, %1
    SWAP      %1, %4, %3
%endmacro

%macro VP8_DC_WHT 0
cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
    movq      m0, [dc1q]
    movq      m1, [dc1q+8]
    movq      m2, [dc1q+16]
    movq      m3, [dc1q+24]
%if cpuflag(sse)
    xorps     xmm0, xmm0
    movaps    [dc1q+ 0], xmm0
    movaps    [dc1q+16], xmm0
%else
    pxor      m4, m4
    movq      [dc1q+ 0], m4
    movq      [dc1q+ 8], m4
    movq      [dc1q+16], m4
    movq      [dc1q+24], m4
%endif
    HADAMARD4_1D 0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw     m0, [pw_3]
    HADAMARD4_1D 0, 1, 2, 3
    psraw     m0, 3
    psraw     m1, 3
    psraw     m2, 3
    psraw     m3, 3
    SCATTER_WHT 0, 1, 0
    SCATTER_WHT 2, 3, 2
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
VP8_DC_WHT
%endif
INIT_MMX sse
VP8_DC_WHT

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
;-----------------------------------------------------------------------------
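; The simple filter touches p1/p0/q0/q1 around each edge and, where
; 2*|p0-q0| + |p1-q1|/2 <= flim, adjusts p0/q0. In C (a sketch; the
; intermediate clamps of the spec are folded into one here):
;
;   a  = av_clip_int8(p1 - q1 + 3 * (q0 - p0));
;   f1 = av_clip_int8(a + 4) >> 3;
;   f2 = av_clip_int8(a + 3) >> 3;
;   q0 = av_clip_uint8(q0 - f1);
;   p0 = av_clip_uint8(p0 + f2);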

; macro called with 7 mm register indexes as argument, and 4 regular registers
;
; first 4 mm registers will carry the transposed pixel data
; the other three are scratchspace (one would be sufficient, but this allows
; for more spreading/pipelining and thus faster execution on OOE CPUs)
;
; first two regular registers are buf+4*stride and buf+5*stride
; third is -stride, fourth is +stride
%macro READ_8x4_INTERLEAVED 11
    ; interleave 8 (A-H) rows of 4 pixels each
    movd      m%1, [%8+%10*4]   ; A0-3
    movd      m%5, [%9+%10*4]   ; B0-3
    movd      m%2, [%8+%10*2]   ; C0-3
    movd      m%6, [%8+%10]     ; D0-3
    movd      m%3, [%8]         ; E0-3
    movd      m%7, [%9]         ; F0-3
    movd      m%4, [%9+%11]     ; G0-3
    punpcklbw m%1, m%5          ; A/B interleaved
    movd      m%5, [%9+%11*2]   ; H0-3
    punpcklbw m%2, m%6          ; C/D interleaved
    punpcklbw m%3, m%7          ; E/F interleaved
    punpcklbw m%4, m%5          ; G/H interleaved
%endmacro

; macro called with 7 mm register indexes as argument, and 5 regular registers
; first 11 mean the same as READ_8x4_INTERLEAVED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
%macro READ_16x4_INTERLEAVED 12
    ; transpose 16 (A-P) rows of 4 pixels each
    lea       %12, [r0+8*r2]

    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
    movd      m%1, [%8+%10*4]   ; A0-3
    movd      m%3, [%12+%10*4]  ; I0-3
    movd      m%2, [%8+%10*2]   ; C0-3
    movd      m%4, [%12+%10*2]  ; K0-3
    movd      m%6, [%8+%10]     ; D0-3
    movd      m%5, [%12+%10]    ; L0-3
    movd      m%7, [%12]        ; M0-3
    add       %12, %11
    punpcklbw m%1, m%3          ; A/I
    movd      m%3, [%8]         ; E0-3
    punpcklbw m%2, m%4          ; C/K
    punpcklbw m%6, m%5          ; D/L
    punpcklbw m%3, m%7          ; E/M
    punpcklbw m%2, m%6          ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
    movd      m%5, [%9+%10*4]   ; B0-3
    movd      m%4, [%12+%10*4]  ; J0-3
    movd      m%7, [%9]         ; F0-3
    movd      m%6, [%12]        ; N0-3
    punpcklbw m%5, m%4          ; B/J
    punpcklbw m%7, m%6          ; F/N
    punpcklbw m%1, m%5          ; A/B/I/J interleaved
    punpcklbw m%3, m%7          ; E/F/M/N interleaved
    movd      m%4, [%9+%11]     ; G0-3
    movd      m%6, [%12+%11]    ; O0-3
    movd      m%5, [%9+%11*2]   ; H0-3
    movd      m%7, [%12+%11*2]  ; P0-3
    punpcklbw m%4, m%6          ; G/O
    punpcklbw m%5, m%7          ; H/P
    punpcklbw m%4, m%5          ; G/H/O/P interleaved
%endmacro

; write 4 mm registers of 2 dwords each
; first four arguments are mm register indexes containing source data
; last four are registers containing buf+4*stride, buf+5*stride,
; -stride and +stride
%macro WRITE_4x2D 8
    ; write out (2 dwords per register)
    movd      [%5+%7*4], m%1
    movd      [%5+%7*2], m%2
    movd      [%5], m%3
    movd      [%6+%8], m%4
    punpckhdq m%1, m%1
    punpckhdq m%2, m%2
    punpckhdq m%3, m%3
    punpckhdq m%4, m%4
    movd      [%6+%7*4], m%1
    movd      [%5+%7], m%2
    movd      [%6], m%3
    movd      [%6+%8*2], m%4
%endmacro

; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular register in the process
yading@10 1335 ; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
yading@10 1336 ; same memory region), or 8 if they cover two separate buffers (third one points to
yading@10 1337 ; a different memory region than the first two), allowing for more optimal code for
yading@10 1338 ; the 16-width case
yading@10 1339 %macro WRITE_4x4D 10
yading@10 1340 ; write out (4 dwords per register), start with dwords zero
yading@10 1341 movd [%5+%8*4], m%1
yading@10 1342 movd [%5], m%2
yading@10 1343 movd [%7+%8*4], m%3
yading@10 1344 movd [%7], m%4
yading@10 1345
yading@10 1346 ; store dwords 1
yading@10 1347 psrldq m%1, 4
yading@10 1348 psrldq m%2, 4
yading@10 1349 psrldq m%3, 4
yading@10 1350 psrldq m%4, 4
yading@10 1351 movd [%6+%8*4], m%1
yading@10 1352 movd [%6], m%2
yading@10 1353 %if %10 == 16
yading@10 1354 movd [%6+%9*4], m%3
yading@10 1355 %endif
yading@10 1356 movd [%7+%9], m%4
yading@10 1357
yading@10 1358 ; write dwords 2
yading@10 1359 psrldq m%1, 4
yading@10 1360 psrldq m%2, 4
yading@10 1361 %if %10 == 8
yading@10 1362 movd [%5+%8*2], m%1
yading@10 1363 movd %5d, m%3
yading@10 1364 %endif
yading@10 1365 psrldq m%3, 4
yading@10 1366 psrldq m%4, 4
yading@10 1367 %if %10 == 16
yading@10 1368 movd [%5+%8*2], m%1
yading@10 1369 %endif
yading@10 1370 movd [%6+%9], m%2
yading@10 1371 movd [%7+%8*2], m%3
yading@10 1372 movd [%7+%9*2], m%4
yading@10 1373 add %7, %9
yading@10 1374
yading@10 1375 ; store dwords 3
yading@10 1376 psrldq m%1, 4
yading@10 1377 psrldq m%2, 4
yading@10 1378 psrldq m%3, 4
yading@10 1379 psrldq m%4, 4
yading@10 1380 %if %10 == 8
yading@10 1381 mov [%7+%8*4], %5d
yading@10 1382 movd [%6+%8*2], m%1
yading@10 1383 %else
yading@10 1384 movd [%5+%8], m%1
yading@10 1385 %endif
yading@10 1386 movd [%6+%9*2], m%2
yading@10 1387 movd [%7+%8*2], m%3
yading@10 1388 movd [%7+%9*2], m%4
yading@10 1389 %endmacro
yading@10 1390
yading@10 1391 ; write 4 or 8 words in the mmx/xmm registers as 8 lines
yading@10 1392 ; 1 and 2 are the registers to write, this can be the same (for SSE2)
yading@10 1393 ; for pre-SSE4:
yading@10 1394 ; 3 is a general-purpose register that we will clobber
yading@10 1395 ; for SSE4:
yading@10 1396 ; 3 is a pointer to the destination's 5th line
yading@10 1397 ; 4 is a pointer to the destination's 4th line
yading@10 1398 ; 5/6 is -stride and +stride
yading@10 1399 %macro WRITE_2x4W 6
yading@10 1400 movd %3d, %1
yading@10 1401 punpckhdq %1, %1
yading@10 1402 mov [%4+%5*4], %3w
yading@10 1403 shr %3, 16
yading@10 1404 add %4, %6
yading@10 1405 mov [%4+%5*4], %3w
yading@10 1406
yading@10 1407 movd %3d, %1
yading@10 1408 add %4, %5
yading@10 1409 mov [%4+%5*2], %3w
yading@10 1410 shr %3, 16
yading@10 1411 mov [%4+%5 ], %3w
yading@10 1412
yading@10 1413 movd %3d, %2
yading@10 1414 punpckhdq %2, %2
yading@10 1415 mov [%4 ], %3w
yading@10 1416 shr %3, 16
yading@10 1417 mov [%4+%6 ], %3w
yading@10 1418
yading@10 1419 movd %3d, %2
yading@10 1420 add %4, %6
yading@10 1421 mov [%4+%6 ], %3w
yading@10 1422 shr %3, 16
yading@10 1423 mov [%4+%6*2], %3w
yading@10 1424 add %4, %5
yading@10 1425 %endmacro
yading@10 1426
yading@10 1427 %macro WRITE_8W 5
yading@10 1428 %if cpuflag(sse4)
yading@10 1429 pextrw [%3+%4*4], %1, 0
yading@10 1430 pextrw [%2+%4*4], %1, 1
yading@10 1431 pextrw [%3+%4*2], %1, 2
yading@10 1432 pextrw [%3+%4 ], %1, 3
yading@10 1433 pextrw [%3 ], %1, 4
yading@10 1434 pextrw [%2 ], %1, 5
yading@10 1435 pextrw [%2+%5 ], %1, 6
yading@10 1436 pextrw [%2+%5*2], %1, 7
yading@10 1437 %else
yading@10 1438 movd %2d, %1
yading@10 1439 psrldq %1, 4
yading@10 1440 mov [%3+%4*4], %2w
yading@10 1441 shr %2, 16
yading@10 1442 add %3, %5
yading@10 1443 mov [%3+%4*4], %2w
yading@10 1444
yading@10 1445 movd %2d, %1
yading@10 1446 psrldq %1, 4
yading@10 1447 add %3, %4
yading@10 1448 mov [%3+%4*2], %2w
yading@10 1449 shr %2, 16
yading@10 1450 mov [%3+%4 ], %2w
yading@10 1451
yading@10 1452 movd %2d, %1
yading@10 1453 psrldq %1, 4
yading@10 1454 mov [%3 ], %2w
yading@10 1455 shr %2, 16
yading@10 1456 mov [%3+%5 ], %2w
yading@10 1457
yading@10 1458 movd %2d, %1
yading@10 1459 add %3, %5
yading@10 1460 mov [%3+%5 ], %2w
yading@10 1461 shr %2, 16
yading@10 1462 mov [%3+%5*2], %2w
yading@10 1463 %endif
yading@10 1464 %endmacro
yading@10 1465
yading@10 1466 %macro SIMPLE_LOOPFILTER 2
yading@10 1467 cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
yading@10 1468 %if mmsize == 8 ; mmx/mmxext
yading@10 1469 mov cntrq, 2
yading@10 1470 %endif
yading@10 1471 %if cpuflag(ssse3)
yading@10 1472 pxor m0, m0
yading@10 1473 %endif
yading@10 1474 SPLATB_REG m7, flim, m0 ; splat "flim" into register
yading@10 1475
yading@10 1476 ; set up indexes to address 4 rows
yading@10 1477 %if mmsize == 8
yading@10 1478 DEFINE_ARGS dst1, mstride, stride, cntr, dst2
yading@10 1479 %else
yading@10 1480 DEFINE_ARGS dst1, mstride, stride, dst3, dst2
yading@10 1481 %endif
yading@10 1482 mov strideq, mstrideq
yading@10 1483 neg mstrideq
yading@10 1484 %ifidn %1, h
yading@10 1485 lea dst1q, [dst1q+4*strideq-2]
yading@10 1486 %endif
yading@10 1487
yading@10 1488 %if mmsize == 8 ; mmx / mmxext
yading@10 1489 .next8px:
yading@10 1490 %endif
yading@10 1491 %ifidn %1, v
yading@10 1492 ; read 4 half/full rows of pixels
yading@10 1493 mova m0, [dst1q+mstrideq*2] ; p1
yading@10 1494 mova m1, [dst1q+mstrideq] ; p0
yading@10 1495 mova m2, [dst1q] ; q0
yading@10 1496 mova m3, [dst1q+ strideq] ; q1
yading@10 1497 %else ; h
yading@10 1498 lea dst2q, [dst1q+ strideq]
yading@10 1499
yading@10 1500 %if mmsize == 8 ; mmx/mmxext
yading@10 1501 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
yading@10 1502 %else ; sse2
yading@10 1503 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
yading@10 1504 %endif
yading@10 1505 TRANSPOSE4x4W 0, 1, 2, 3, 4
yading@10 1506 %endif
yading@10 1507
yading@10 1508 ; simple_limit
yading@10 1509 mova m5, m2 ; m5=backup of q0
yading@10 1510 mova m6, m1 ; m6=backup of p0
yading@10 1511 psubusb m1, m2 ; p0-q0
yading@10 1512 psubusb m2, m6 ; q0-p0
yading@10 1513 por m1, m2 ; FFABS(p0-q0)
yading@10 1514 paddusb m1, m1 ; m1=FFABS(p0-q0)*2
yading@10 1515
yading@10 1516 mova m4, m3
yading@10 1517 mova m2, m0
yading@10 1518 psubusb m3, m0 ; q1-p1
yading@10 1519 psubusb m0, m4 ; p1-q1
yading@10 1520 por m3, m0 ; FFABS(p1-q1)
yading@10 1521 mova m0, [pb_80]
yading@10 1522 pxor m2, m0
yading@10 1523 pxor m4, m0
yading@10 1524 psubsb m2, m4 ; m2=p1-q1 (signed) backup for below
yading@10 1525 pand m3, [pb_FE]
yading@10 1526 psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed
yading@10 1527 paddusb m3, m1
yading@10 1528 psubusb m3, m7
yading@10 1529 pxor m1, m1
yading@10 1530 pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)
yading@10 1531
yading@10 1532 ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
yading@10 1533 mova m4, m5
yading@10 1534 pxor m5, m0
yading@10 1535 pxor m0, m6
yading@10 1536 psubsb m5, m0 ; q0-p0 (signed)
yading@10 1537 paddsb m2, m5
yading@10 1538 paddsb m2, m5
yading@10 1539 paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0)
yading@10 1540 pand m2, m3 ; apply filter mask (m3)
yading@10 1541
yading@10 1542 mova m3, [pb_F8]
yading@10 1543 mova m1, m2
yading@10 1544 paddsb m2, [pb_4] ; f1<<3=a+4
yading@10 1545 paddsb m1, [pb_3] ; f2<<3=a+3
yading@10 1546 pand m2, m3
yading@10 1547 pand m1, m3 ; cache f2<<3
yading@10 1548
yading@10 1549 pxor m0, m0
yading@10 1550 pxor m3, m3
yading@10 1551 pcmpgtb m0, m2 ; which values are <0?
yading@10 1552 psubb m3, m2 ; -f1<<3
yading@10 1553 psrlq m2, 3 ; +f1
yading@10 1554 psrlq m3, 3 ; -f1
yading@10 1555 pand m3, m0
yading@10 1556 pandn m0, m2
yading@10 1557 psubusb m4, m0
yading@10 1558 paddusb m4, m3 ; q0-f1
yading@10 1559
yading@10 1560 pxor m0, m0
yading@10 1561 pxor m3, m3
yading@10 1562 pcmpgtb m0, m1 ; which values are <0?
yading@10 1563 psubb m3, m1 ; -f2<<3
yading@10 1564 psrlq m1, 3 ; +f2
yading@10 1565 psrlq m3, 3 ; -f2
yading@10 1566 pand m3, m0
yading@10 1567 pandn m0, m1
yading@10 1568 paddusb m6, m0
yading@10 1569 psubusb m6, m3 ; p0+f2
yading@10 1570
yading@10 1571 ; store
yading@10 1572 %ifidn %1, v
yading@10 1573 mova [dst1q], m4
yading@10 1574 mova [dst1q+mstrideq], m6
yading@10 1575 %else ; h
yading@10 1576 inc dst1q
yading@10 1577 SBUTTERFLY bw, 6, 4, 0
yading@10 1578
yading@10 1579 %if mmsize == 16 ; sse2
yading@10 1580 %if cpuflag(sse4)
yading@10 1581 inc dst2q
yading@10 1582 %endif
yading@10 1583 WRITE_8W m6, dst2q, dst1q, mstrideq, strideq
yading@10 1584 lea dst2q, [dst3q+mstrideq+1]
yading@10 1585 %if cpuflag(sse4)
yading@10 1586 inc dst3q
yading@10 1587 %endif
yading@10 1588 WRITE_8W m4, dst3q, dst2q, mstrideq, strideq
yading@10 1589 %else ; mmx/mmxext
yading@10 1590 WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq
yading@10 1591 %endif
yading@10 1592 %endif
yading@10 1593
yading@10 1594 %if mmsize == 8 ; mmx/mmxext
yading@10 1595 ; next 8 pixels
yading@10 1596 %ifidn %1, v
yading@10 1597 add dst1q, 8 ; advance 8 cols = pixels
yading@10 1598 %else ; h
yading@10 1599 lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines
yading@10 1600 %endif
yading@10 1601 dec cntrq
yading@10 1602 jg .next8px
yading@10 1603 REP_RET
yading@10 1604 %else ; sse2
yading@10 1605 RET
yading@10 1606 %endif
yading@10 1607 %endmacro
yading@10 1608
yading@10 1609 %if ARCH_X86_32
yading@10 1610 INIT_MMX mmx
yading@10 1611 SIMPLE_LOOPFILTER v, 4
yading@10 1612 SIMPLE_LOOPFILTER h, 5
yading@10 1613 INIT_MMX mmxext
yading@10 1614 SIMPLE_LOOPFILTER v, 4
yading@10 1615 SIMPLE_LOOPFILTER h, 5
yading@10 1616 %endif
yading@10 1617
yading@10 1618 INIT_XMM sse2
yading@10 1619 SIMPLE_LOOPFILTER v, 3
yading@10 1620 SIMPLE_LOOPFILTER h, 5
yading@10 1621 INIT_XMM ssse3
yading@10 1622 SIMPLE_LOOPFILTER v, 3
yading@10 1623 SIMPLE_LOOPFILTER h, 5
yading@10 1624 INIT_XMM sse4
yading@10 1625 SIMPLE_LOOPFILTER h, 5
yading@10 1626
yading@10 1627 ;-----------------------------------------------------------------------------
yading@10 1628 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
yading@10 1629 ; int flimE, int flimI, int hev_thr);
yading@10 1630 ;-----------------------------------------------------------------------------
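; flimE bounds the edge differences (the |p0-q0|*2 + |p1-q1|/2 test), flimI
; bounds the interior differences (p3..p1, q1..q3), and hev_thr is the
; high-edge-variance threshold deciding whether p1/q1 are adjusted as well.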
yading@10 1631
yading@10 1632 %macro INNER_LOOPFILTER 2
yading@10 1633 %define stack_size 0
yading@10 1634 %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
yading@10 1635 %ifidn %1, v ; [3]=hev() result
yading@10 1636 %define stack_size mmsize * -4
yading@10 1637 %else ; h ; extra storage space for transposes
yading@10 1638 %define stack_size mmsize * -5
yading@10 1639 %endif
yading@10 1640 %endif
yading@10 1641
yading@10 1642 %if %2 == 8 ; chroma
yading@10 1643 cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr
yading@10 1644 %else ; luma
yading@10 1645 cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr
yading@10 1646 %endif
yading@10 1647
yading@10 1648 %if cpuflag(ssse3)
yading@10 1649 pxor m7, m7
yading@10 1650 %endif
yading@10 1651
yading@10 1652 %ifndef m8
yading@10 1653 ; splat function arguments
yading@10 1654 SPLATB_REG m0, flimEq, m7 ; E
yading@10 1655 SPLATB_REG m1, flimIq, m7 ; I
yading@10 1656 SPLATB_REG m2, hevthrq, m7 ; hev_thresh
yading@10 1657
yading@10 1658 %define m_flimE [rsp]
yading@10 1659 %define m_flimI [rsp+mmsize]
yading@10 1660 %define m_hevthr [rsp+mmsize*2]
yading@10 1661 %define m_maskres [rsp+mmsize*3]
yading@10 1662 %define m_p0backup [rsp+mmsize*3]
yading@10 1663 %define m_q0backup [rsp+mmsize*4]
yading@10 1664
yading@10 1665 mova m_flimE, m0
yading@10 1666 mova m_flimI, m1
yading@10 1667 mova m_hevthr, m2
yading@10 1668 %else
yading@10 1669 %define m_flimE m9
yading@10 1670 %define m_flimI m10
yading@10 1671 %define m_hevthr m11
yading@10 1672 %define m_maskres m12
yading@10 1673 %define m_p0backup m12
yading@10 1674 %define m_q0backup m8
yading@10 1675
yading@10 1676 ; splat function arguments
yading@10 1677 SPLATB_REG m_flimE, flimEq, m7 ; E
yading@10 1678 SPLATB_REG m_flimI, flimIq, m7 ; I
yading@10 1679 SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
yading@10 1680 %endif
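; with 16 XMM registers available (x86-64), the thresholds stay resident in
; m9-m11 and the backups in m8/m12; on x86-32 (and with mmx) they are
; splatted once and spilled to the stack slots defined above.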
yading@10 1681
yading@10 1682 %if %2 == 8 ; chroma
yading@10 1683 DEFINE_ARGS dst1, dst8, mstride, stride, dst2
yading@10 1684 %elif mmsize == 8
yading@10 1685 DEFINE_ARGS dst1, mstride, stride, dst2, cntr
yading@10 1686 mov cntrq, 2
yading@10 1687 %else
yading@10 1688 DEFINE_ARGS dst1, mstride, stride, dst2, dst8
yading@10 1689 %endif
yading@10 1690 mov strideq, mstrideq
yading@10 1691 neg mstrideq
yading@10 1692 %ifidn %1, h
yading@10 1693 lea dst1q, [dst1q+strideq*4-4]
yading@10 1694 %if %2 == 8 ; chroma
yading@10 1695 lea dst8q, [dst8q+strideq*4-4]
yading@10 1696 %endif
yading@10 1697 %endif
yading@10 1698
yading@10 1699 %if mmsize == 8
yading@10 1700 .next8px:
yading@10 1701 %endif
yading@10 1702 ; read
yading@10 1703 lea dst2q, [dst1q+strideq]
yading@10 1704 %ifidn %1, v
yading@10 1705 %if %2 == 8 && mmsize == 16
yading@10 1706 %define movrow movh
yading@10 1707 %else
yading@10 1708 %define movrow mova
yading@10 1709 %endif
yading@10 1710 movrow m0, [dst1q+mstrideq*4] ; p3
yading@10 1711 movrow m1, [dst2q+mstrideq*4] ; p2
yading@10 1712 movrow m2, [dst1q+mstrideq*2] ; p1
yading@10 1713 movrow m5, [dst2q] ; q1
yading@10 1714 movrow m6, [dst2q+ strideq ] ; q2
yading@10 1715 movrow m7, [dst2q+ strideq*2] ; q3
yading@10 1716 %if mmsize == 16 && %2 == 8
yading@10 1717 movhps m0, [dst8q+mstrideq*4]
yading@10 1718 movhps m2, [dst8q+mstrideq*2]
yading@10 1719 add dst8q, strideq
yading@10 1720 movhps m1, [dst8q+mstrideq*4]
yading@10 1721 movhps m5, [dst8q]
yading@10 1722 movhps m6, [dst8q+ strideq ]
yading@10 1723 movhps m7, [dst8q+ strideq*2]
yading@10 1724 add dst8q, mstrideq
yading@10 1725 %endif
yading@10 1726 %elif mmsize == 8 ; mmx/mmxext (h)
yading@10 1727 ; read 8 rows of 8px each
yading@10 1728 movu m0, [dst1q+mstrideq*4]
yading@10 1729 movu m1, [dst2q+mstrideq*4]
yading@10 1730 movu m2, [dst1q+mstrideq*2]
yading@10 1731 movu m3, [dst1q+mstrideq ]
yading@10 1732 movu m4, [dst1q]
yading@10 1733 movu m5, [dst2q]
yading@10 1734 movu m6, [dst2q+ strideq ]
yading@10 1735
yading@10 1736 ; 8x8 transpose
yading@10 1737 TRANSPOSE4x4B 0, 1, 2, 3, 7
yading@10 1738 mova m_q0backup, m1
yading@10 1739 movu m7, [dst2q+ strideq*2]
yading@10 1740 TRANSPOSE4x4B 4, 5, 6, 7, 1
yading@10 1741 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
yading@10 1742 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
yading@10 1743 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
yading@10 1744 mova m1, m_q0backup
yading@10 1745 mova m_q0backup, m2 ; store q0
yading@10 1746 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
yading@10 1747 mova m_p0backup, m5 ; store p0
yading@10 1748 SWAP 1, 4
yading@10 1749 SWAP 2, 4
yading@10 1750 SWAP 6, 3
yading@10 1751 SWAP 5, 3
yading@10 1752 %else ; sse2 (h)
yading@10 1753 %if %2 == 16
yading@10 1754 lea dst8q, [dst1q+ strideq*8]
yading@10 1755 %endif
yading@10 1756
yading@10 1757 ; read 16 rows of 8px each, interleave
yading@10 1758 movh m0, [dst1q+mstrideq*4]
yading@10 1759 movh m1, [dst8q+mstrideq*4]
yading@10 1760 movh m2, [dst1q+mstrideq*2]
yading@10 1761 movh m5, [dst8q+mstrideq*2]
yading@10 1762 movh m3, [dst1q+mstrideq ]
yading@10 1763 movh m6, [dst8q+mstrideq ]
yading@10 1764 movh m4, [dst1q]
yading@10 1765 movh m7, [dst8q]
yading@10 1766 punpcklbw m0, m1 ; A/I
yading@10 1767 punpcklbw m2, m5 ; C/K
yading@10 1768 punpcklbw m3, m6 ; D/L
yading@10 1769 punpcklbw m4, m7 ; E/M
yading@10 1770
yading@10 1771 add dst8q, strideq
yading@10 1772 movh m1, [dst2q+mstrideq*4]
yading@10 1773 movh m6, [dst8q+mstrideq*4]
yading@10 1774 movh m5, [dst2q]
yading@10 1775 movh m7, [dst8q]
yading@10 1776 punpcklbw m1, m6 ; B/J
yading@10 1777 punpcklbw m5, m7 ; F/N
yading@10 1778 movh m6, [dst2q+ strideq ]
yading@10 1779 movh m7, [dst8q+ strideq ]
yading@10 1780 punpcklbw m6, m7 ; G/O
yading@10 1781
yading@10 1782 ; 8x16 transpose
yading@10 1783 TRANSPOSE4x4B 0, 1, 2, 3, 7
yading@10 1784 %ifdef m8
yading@10 1785 SWAP 1, 8
yading@10 1786 %else
yading@10 1787 mova m_q0backup, m1
yading@10 1788 %endif
yading@10 1789 movh m7, [dst2q+ strideq*2]
yading@10 1790 movh m1, [dst8q+ strideq*2]
yading@10 1791 punpcklbw m7, m1 ; H/P
yading@10 1792 TRANSPOSE4x4B 4, 5, 6, 7, 1
yading@10 1793 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
yading@10 1794 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
yading@10 1795 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
yading@10 1796 %ifdef m8
yading@10 1797 SWAP 1, 8
yading@10 1798 SWAP 2, 8
yading@10 1799 %else
yading@10 1800 mova m1, m_q0backup
yading@10 1801 mova m_q0backup, m2 ; store q0
yading@10 1802 %endif
yading@10 1803 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
yading@10 1804 %ifdef m12
yading@10 1805 SWAP 5, 12
yading@10 1806 %else
yading@10 1807 mova m_p0backup, m5 ; store p0
yading@10 1808 %endif
yading@10 1809 SWAP 1, 4
yading@10 1810 SWAP 2, 4
yading@10 1811 SWAP 6, 3
yading@10 1812 SWAP 5, 3
yading@10 1813 %endif
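; register layout from here on: m0=p3, m1=p2, m2=p1, m5=q1, m6=q2, m7=q3;
; p0 and q0 are loaded later (v) or recovered from m_p0backup/m_q0backup
; (h) when the filter needs them.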
yading@10 1814
yading@10 1815 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
yading@10 1816 mova m4, m1
yading@10 1817 SWAP 4, 1
yading@10 1818 psubusb m4, m0 ; p2-p3
yading@10 1819 psubusb m0, m1 ; p3-p2
yading@10 1820 por m0, m4 ; abs(p3-p2)
yading@10 1821
yading@10 1822 mova m4, m2
yading@10 1823 SWAP 4, 2
yading@10 1824 psubusb m4, m1 ; p1-p2
yading@10 1825 psubusb m1, m2 ; p2-p1
yading@10 1826 por m1, m4 ; abs(p2-p1)
yading@10 1827
yading@10 1828 mova m4, m6
yading@10 1829 SWAP 4, 6
yading@10 1830 psubusb m4, m7 ; q2-q3
yading@10 1831 psubusb m7, m6 ; q3-q2
yading@10 1832 por m7, m4 ; abs(q3-q2)
yading@10 1833
yading@10 1834 mova m4, m5
yading@10 1835 SWAP 4, 5
yading@10 1836 psubusb m4, m6 ; q1-q2
yading@10 1837 psubusb m6, m5 ; q2-q1
yading@10 1838 por m6, m4 ; abs(q2-q1)
yading@10 1839
yading@10 1840 %if notcpuflag(mmxext)
yading@10 1841 mova m4, m_flimI
yading@10 1842 pxor m3, m3
yading@10 1843 psubusb m0, m4
yading@10 1844 psubusb m1, m4
yading@10 1845 psubusb m7, m4
yading@10 1846 psubusb m6, m4
yading@10 1847 pcmpeqb m0, m3 ; abs(p3-p2) <= I
yading@10 1848 pcmpeqb m1, m3 ; abs(p2-p1) <= I
yading@10 1849 pcmpeqb m7, m3 ; abs(q3-q2) <= I
yading@10 1850 pcmpeqb m6, m3 ; abs(q2-q1) <= I
yading@10 1851 pand m0, m1
yading@10 1852 pand m7, m6
yading@10 1853 pand m0, m7
yading@10 1854 %else ; mmxext/sse2
yading@10 1855 pmaxub m0, m1
yading@10 1856 pmaxub m6, m7
yading@10 1857 pmaxub m0, m6
yading@10 1858 %endif
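; roughly what the above computes (interior part of the RFC 6386 normal
; filter mask; C-style sketch, not part of the original source):
;
;   i_ok = abs(p3-p2) <= I && abs(p2-p1) <= I &&
;          abs(q3-q2) <= I && abs(q2-q1) <= I;
;
; mmxext and later keep a running pmaxub instead, deferring the single
; compare against I until p1-p0/q1-q0 have been folded in as well.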
yading@10 1859
yading@10 1860 ; normal_limit and high_edge_variance for p1-p0, q1-q0
yading@10 1861 SWAP 7, 3 ; now m7 is zero
yading@10 1862 %ifidn %1, v
yading@10 1863 movrow m3, [dst1q+mstrideq ] ; p0
yading@10 1864 %if mmsize == 16 && %2 == 8
yading@10 1865 movhps m3, [dst8q+mstrideq ]
yading@10 1866 %endif
yading@10 1867 %elifdef m12
yading@10 1868 SWAP 3, 12
yading@10 1869 %else
yading@10 1870 mova m3, m_p0backup
yading@10 1871 %endif
yading@10 1872
yading@10 1873 mova m1, m2
yading@10 1874 SWAP 1, 2
yading@10 1875 mova m6, m3
yading@10 1876 SWAP 3, 6
yading@10 1877 psubusb m1, m3 ; p1-p0
yading@10 1878 psubusb m6, m2 ; p0-p1
yading@10 1879 por m1, m6 ; abs(p1-p0)
yading@10 1880 %if notcpuflag(mmxext)
yading@10 1881 mova m6, m1
yading@10 1882 psubusb m1, m4
yading@10 1883 psubusb m6, m_hevthr
yading@10 1884 pcmpeqb m1, m7 ; abs(p1-p0) <= I
yading@10 1885 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
yading@10 1886 pand m0, m1
yading@10 1887 mova m_maskres, m6
yading@10 1888 %else ; mmxext/sse2
yading@10 1889 pmaxub m0, m1 ; max_I
yading@10 1890 SWAP 1, 4 ; max_hev_thresh
yading@10 1891 %endif
yading@10 1892
yading@10 1893 SWAP 6, 4 ; now m6 is I
yading@10 1894 %ifidn %1, v
yading@10 1895 movrow m4, [dst1q] ; q0
yading@10 1896 %if mmsize == 16 && %2 == 8
yading@10 1897 movhps m4, [dst8q]
yading@10 1898 %endif
yading@10 1899 %elifdef m8
yading@10 1900 SWAP 4, 8
yading@10 1901 %else
yading@10 1902 mova m4, m_q0backup
yading@10 1903 %endif
yading@10 1904 mova m1, m4
yading@10 1905 SWAP 1, 4
yading@10 1906 mova m7, m5
yading@10 1907 SWAP 7, 5
yading@10 1908 psubusb m1, m5 ; q0-q1
yading@10 1909 psubusb m7, m4 ; q1-q0
yading@10 1910 por m1, m7 ; abs(q1-q0)
yading@10 1911 %if notcpuflag(mmxext)
yading@10 1912 mova m7, m1
yading@10 1913 psubusb m1, m6
yading@10 1914 psubusb m7, m_hevthr
yading@10 1915 pxor m6, m6
yading@10 1916 pcmpeqb m1, m6 ; abs(q1-q0) <= I
yading@10 1917 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
yading@10 1918 mova m6, m_maskres
yading@10 1919 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
yading@10 1920 pand m6, m7
yading@10 1921 %else ; mmxext/sse2
yading@10 1922 pxor m7, m7
yading@10 1923 pmaxub m0, m1
yading@10 1924 pmaxub m6, m1
yading@10 1925 psubusb m0, m_flimI
yading@10 1926 psubusb m6, m_hevthr
yading@10 1927 pcmpeqb m0, m7 ; max(abs(..)) <= I
yading@10 1928 pcmpeqb m6, m7 ; !(max(abs..) > thresh)
yading@10 1929 %endif
yading@10 1930 %ifdef m12
yading@10 1931 SWAP 6, 12
yading@10 1932 %else
yading@10 1933 mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
yading@10 1934 %endif
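; m_maskres now holds the inverted high-edge-variance flag, roughly:
;
;   !hev = abs(p1-p0) <= hev_thr && abs(q1-q0) <= hev_thr;
;
; where hev selects the 4-tap variant (the is4tap term below) and
; suppresses the extra 2-tap p1/q1 adjustment.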
yading@10 1935
yading@10 1936 ; simple_limit
yading@10 1937 mova m1, m3
yading@10 1938 SWAP 1, 3
yading@10 1939 mova m6, m4 ; keep copies of p0/q0 around for later use
yading@10 1940 SWAP 6, 4
yading@10 1941 psubusb m1, m4 ; p0-q0
yading@10 1942 psubusb m6, m3 ; q0-p0
yading@10 1943 por m1, m6 ; abs(q0-p0)
yading@10 1944 paddusb m1, m1 ; m1=2*abs(q0-p0)
yading@10 1945
yading@10 1946 mova m7, m2
yading@10 1947 SWAP 7, 2
yading@10 1948 mova m6, m5
yading@10 1949 SWAP 6, 5
yading@10 1950 psubusb m7, m5 ; p1-q1
yading@10 1951 psubusb m6, m2 ; q1-p1
yading@10 1952 por m7, m6 ; abs(q1-p1)
yading@10 1953 pxor m6, m6
yading@10 1954 pand m7, [pb_FE]
yading@10 1955 psrlq m7, 1 ; abs(q1-p1)/2
yading@10 1956 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
yading@10 1957 psubusb m7, m_flimE
yading@10 1958 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
yading@10 1959 pand m0, m7 ; normal_limit result
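; m0 is now the complete per-pixel filter on/off decision, roughly:
;
;   filter = i_ok && abs(p1-p0) <= I && abs(q1-q0) <= I
;            && 2*abs(p0-q0) + abs(p1-q1)/2 <= E;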
yading@10 1960
yading@10 1961 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
yading@10 1962 %ifdef m8 ; x86-64 && sse2
yading@10 1963 mova m8, [pb_80]
yading@10 1964 %define m_pb_80 m8
yading@10 1965 %else ; x86-32 or mmx/mmxext
yading@10 1966 %define m_pb_80 [pb_80]
yading@10 1967 %endif
yading@10 1968 mova m1, m4
yading@10 1969 mova m7, m3
yading@10 1970 pxor m1, m_pb_80
yading@10 1971 pxor m7, m_pb_80
yading@10 1972 psubsb m1, m7 ; (signed) q0-p0
yading@10 1973 mova m6, m2
yading@10 1974 mova m7, m5
yading@10 1975 pxor m6, m_pb_80
yading@10 1976 pxor m7, m_pb_80
yading@10 1977 psubsb m6, m7 ; (signed) p1-q1
yading@10 1978 mova m7, m_maskres
yading@10 1979 pandn m7, m6
yading@10 1980 paddsb m7, m1
yading@10 1981 paddsb m7, m1
yading@10 1982 paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1)
yading@10 1983
yading@10 1984 pand m7, m0
yading@10 1985 mova m1, [pb_F8]
yading@10 1986 mova m6, m7
yading@10 1987 paddsb m7, [pb_3]
yading@10 1988 paddsb m6, [pb_4]
yading@10 1989 pand m7, m1
yading@10 1990 pand m6, m1
yading@10 1991
yading@10 1992 pxor m1, m1
yading@10 1993 pxor m0, m0
yading@10 1994 pcmpgtb m1, m7
yading@10 1995 psubb m0, m7
yading@10 1996 psrlq m7, 3 ; +f2
yading@10 1997 psrlq m0, 3 ; -f2
yading@10 1998 pand m0, m1
yading@10 1999 pandn m1, m7
yading@10 2000 psubusb m3, m0
yading@10 2001 paddusb m3, m1 ; p0+f2
yading@10 2002
yading@10 2003 pxor m1, m1
yading@10 2004 pxor m0, m0
yading@10 2005 pcmpgtb m0, m6
yading@10 2006 psubb m1, m6
yading@10 2007 psrlq m6, 3 ; +f1
yading@10 2008 psrlq m1, 3 ; -f1
yading@10 2009 pand m1, m0
yading@10 2010 pandn m0, m6
yading@10 2011 psubusb m4, m0
yading@10 2012 paddusb m4, m1 ; q0-f1
yading@10 2013
yading@10 2014 %ifdef m12
yading@10 2015 SWAP 6, 12
yading@10 2016 %else
yading@10 2017 mova m6, m_maskres
yading@10 2018 %endif
yading@10 2019 %if notcpuflag(mmxext)
yading@10 2020 mova m7, [pb_1]
yading@10 2021 %else ; mmxext/sse2
yading@10 2022 pxor m7, m7
yading@10 2023 %endif
yading@10 2024 pand m0, m6
yading@10 2025 pand m1, m6
yading@10 2026 %if notcpuflag(mmxext)
yading@10 2027 paddusb m0, m7
yading@10 2028 pand m1, [pb_FE]
yading@10 2029 pandn m7, m0
yading@10 2030 psrlq m1, 1
yading@10 2031 psrlq m7, 1
yading@10 2032 SWAP 0, 7
yading@10 2033 %else ; mmxext/sse2
yading@10 2034 psubusb m1, [pb_1]
yading@10 2035 pavgb m0, m7 ; a
yading@10 2036 pavgb m1, m7 ; -a
yading@10 2037 %endif
yading@10 2038 psubusb m5, m0
yading@10 2039 psubusb m2, m1
yading@10 2040 paddusb m5, m1 ; q1-a
yading@10 2041 paddusb m2, m0 ; p1+a
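; 2-tap half of the inner filter (RFC 6386), applied above only where the
; edge is not high-variance; illustrative sketch, clip_u8() again meaning
; a clip to [0,255]:
;
;   a  = (f1 + 1) >> 1;
;   q1 = clip_u8(q1 - a);
;   p1 = clip_u8(p1 + a);
;
; mmxext+ gets the rounded halving from pavgb against zero; plain mmx
; emulates it with the pb_1 add and pb_FE mask-and-shift just above.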
yading@10 2042
yading@10 2043 ; store
yading@10 2044 %ifidn %1, v
yading@10 2045 movrow [dst1q+mstrideq*2], m2
yading@10 2046 movrow [dst1q+mstrideq ], m3
yading@10 2047 movrow [dst1q], m4
yading@10 2048 movrow [dst1q+ strideq ], m5
yading@10 2049 %if mmsize == 16 && %2 == 8
yading@10 2050 movhps [dst8q+mstrideq*2], m2
yading@10 2051 movhps [dst8q+mstrideq ], m3
yading@10 2052 movhps [dst8q], m4
yading@10 2053 movhps [dst8q+ strideq ], m5
yading@10 2054 %endif
yading@10 2055 %else ; h
yading@10 2056 add dst1q, 2
yading@10 2057 add dst2q, 2
yading@10 2058
yading@10 2059 ; 4x8/16 transpose
yading@10 2060 TRANSPOSE4x4B 2, 3, 4, 5, 6
yading@10 2061
yading@10 2062 %if mmsize == 8 ; mmx/mmxext (h)
yading@10 2063 WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq
yading@10 2064 %else ; sse2 (h)
yading@10 2065 lea dst8q, [dst8q+mstrideq +2]
yading@10 2066 WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
yading@10 2067 %endif
yading@10 2068 %endif
yading@10 2069
yading@10 2070 %if mmsize == 8
yading@10 2071 %if %2 == 8 ; chroma
yading@10 2072 %ifidn %1, h
yading@10 2073 sub dst1q, 2
yading@10 2074 %endif
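; U plane (dst1) done; switch to the V plane (dst8) and filter once more.
; cmp sets ZF before the mov, which leaves the flags untouched, so jnz
; loops back exactly once.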
yading@10 2075 cmp dst1q, dst8q
yading@10 2076 mov dst1q, dst8q
yading@10 2077 jnz .next8px
yading@10 2078 %else
yading@10 2079 %ifidn %1, h
yading@10 2080 lea dst1q, [dst1q+ strideq*8-2]
yading@10 2081 %else ; v
yading@10 2082 add dst1q, 8
yading@10 2083 %endif
yading@10 2084 dec cntrq
yading@10 2085 jg .next8px
yading@10 2086 %endif
yading@10 2087 REP_RET
yading@10 2088 %else ; mmsize == 16
yading@10 2089 RET
yading@10 2090 %endif
yading@10 2091 %endmacro
yading@10 2092
yading@10 2093 %if ARCH_X86_32
yading@10 2094 INIT_MMX mmx
yading@10 2095 INNER_LOOPFILTER v, 16
yading@10 2096 INNER_LOOPFILTER h, 16
yading@10 2097 INNER_LOOPFILTER v, 8
yading@10 2098 INNER_LOOPFILTER h, 8
yading@10 2099
yading@10 2100 INIT_MMX mmxext
yading@10 2101 INNER_LOOPFILTER v, 16
yading@10 2102 INNER_LOOPFILTER h, 16
yading@10 2103 INNER_LOOPFILTER v, 8
yading@10 2104 INNER_LOOPFILTER h, 8
yading@10 2105 %endif
yading@10 2106
yading@10 2107 INIT_XMM sse2
yading@10 2108 INNER_LOOPFILTER v, 16
yading@10 2109 INNER_LOOPFILTER h, 16
yading@10 2110 INNER_LOOPFILTER v, 8
yading@10 2111 INNER_LOOPFILTER h, 8
yading@10 2112
yading@10 2113 INIT_XMM ssse3
yading@10 2114 INNER_LOOPFILTER v, 16
yading@10 2115 INNER_LOOPFILTER h, 16
yading@10 2116 INNER_LOOPFILTER v, 8
yading@10 2117 INNER_LOOPFILTER h, 8
yading@10 2118
yading@10 2119 ;-----------------------------------------------------------------------------
yading@10 2120 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
yading@10 2121 ; int flimE, int flimI, int hev_thr);
yading@10 2122 ;-----------------------------------------------------------------------------
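; same masks as the inner filter above, but macroblock edges apply the
; stronger filter_mbedge, which also adjusts p2/q2, hence the extra p2/q2
; backups and the 27/18/9 weighting further down.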
yading@10 2123
yading@10 2124 %macro MBEDGE_LOOPFILTER 2
yading@10 2125 %define stack_size 0
yading@10 2126 %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
yading@10 2127 %if mmsize == 16 ; [3]=hev() result
yading@10 2128 ; [4]=filter tmp result
yading@10 2129 ; [5]/[6] = p2/q2 backup
yading@10 2130 ; [7]=lim_res sign result
yading@10 2131 %define stack_size mmsize * -7
yading@10 2132 %else ; 8 ; extra storage space for transposes
yading@10 2133 %define stack_size mmsize * -8
yading@10 2134 %endif
yading@10 2135 %endif
yading@10 2136
yading@10 2137 %if %2 == 8 ; chroma
yading@10 2138 cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr
yading@10 2139 %else ; luma
yading@10 2140 cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr
yading@10 2141 %endif
yading@10 2142
yading@10 2143 %if cpuflag(ssse3)
yading@10 2144 pxor m7, m7
yading@10 2145 %endif
yading@10 2146
yading@10 2147 %ifndef m8
yading@10 2148 ; splat function arguments
yading@10 2149 SPLATB_REG m0, flimEq, m7 ; E
yading@10 2150 SPLATB_REG m1, flimIq, m7 ; I
yading@10 2151 SPLATB_REG m2, hevthrq, m7 ; hev_thresh
yading@10 2152
yading@10 2153 %define m_flimE [rsp]
yading@10 2154 %define m_flimI [rsp+mmsize]
yading@10 2155 %define m_hevthr [rsp+mmsize*2]
yading@10 2156 %define m_maskres [rsp+mmsize*3]
yading@10 2157 %define m_limres [rsp+mmsize*4]
yading@10 2158 %define m_p0backup [rsp+mmsize*3]
yading@10 2159 %define m_q0backup [rsp+mmsize*4]
yading@10 2160 %define m_p2backup [rsp+mmsize*5]
yading@10 2161 %define m_q2backup [rsp+mmsize*6]
yading@10 2162 %if mmsize == 16
yading@10 2163 %define m_limsign [rsp]
yading@10 2164 %else
yading@10 2165 %define m_limsign [rsp+mmsize*7]
yading@10 2166 %endif
yading@10 2167
yading@10 2168 mova m_flimE, m0
yading@10 2169 mova m_flimI, m1
yading@10 2170 mova m_hevthr, m2
yading@10 2171 %else ; sse2 on x86-64
yading@10 2172 %define m_flimE m9
yading@10 2173 %define m_flimI m10
yading@10 2174 %define m_hevthr m11
yading@10 2175 %define m_maskres m12
yading@10 2176 %define m_limres m8
yading@10 2177 %define m_p0backup m12
yading@10 2178 %define m_q0backup m8
yading@10 2179 %define m_p2backup m13
yading@10 2180 %define m_q2backup m14
yading@10 2181 %define m_limsign m9
yading@10 2182
yading@10 2183 ; splat function arguments
yading@10 2184 SPLATB_REG m_flimE, flimEq, m7 ; E
yading@10 2185 SPLATB_REG m_flimI, flimIq, m7 ; I
yading@10 2186 SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
yading@10 2187 %endif
yading@10 2188
yading@10 2189 %if %2 == 8 ; chroma
yading@10 2190 DEFINE_ARGS dst1, dst8, mstride, stride, dst2
yading@10 2191 %elif mmsize == 8
yading@10 2192 DEFINE_ARGS dst1, mstride, stride, dst2, cntr
yading@10 2193 mov cntrq, 2
yading@10 2194 %else
yading@10 2195 DEFINE_ARGS dst1, mstride, stride, dst2, dst8
yading@10 2196 %endif
yading@10 2197 mov strideq, mstrideq
yading@10 2198 neg mstrideq
yading@10 2199 %ifidn %1, h
yading@10 2200 lea dst1q, [dst1q+strideq*4-4]
yading@10 2201 %if %2 == 8 ; chroma
yading@10 2202 lea dst8q, [dst8q+strideq*4-4]
yading@10 2203 %endif
yading@10 2204 %endif
yading@10 2205
yading@10 2206 %if mmsize == 8
yading@10 2207 .next8px:
yading@10 2208 %endif
yading@10 2209 ; read
yading@10 2210 lea dst2q, [dst1q+ strideq ]
yading@10 2211 %ifidn %1, v
yading@10 2212 %if %2 == 8 && mmsize == 16
yading@10 2213 %define movrow movh
yading@10 2214 %else
yading@10 2215 %define movrow mova
yading@10 2216 %endif
yading@10 2217 movrow m0, [dst1q+mstrideq*4] ; p3
yading@10 2218 movrow m1, [dst2q+mstrideq*4] ; p2
yading@10 2219 movrow m2, [dst1q+mstrideq*2] ; p1
yading@10 2220 movrow m5, [dst2q] ; q1
yading@10 2221 movrow m6, [dst2q+ strideq ] ; q2
yading@10 2222 movrow m7, [dst2q+ strideq*2] ; q3
yading@10 2223 %if mmsize == 16 && %2 == 8
yading@10 2224 movhps m0, [dst8q+mstrideq*4]
yading@10 2225 movhps m2, [dst8q+mstrideq*2]
yading@10 2226 add dst8q, strideq
yading@10 2227 movhps m1, [dst8q+mstrideq*4]
yading@10 2228 movhps m5, [dst8q]
yading@10 2229 movhps m6, [dst8q+ strideq ]
yading@10 2230 movhps m7, [dst8q+ strideq*2]
yading@10 2231 add dst8q, mstrideq
yading@10 2232 %endif
yading@10 2233 %elif mmsize == 8 ; mmx/mmxext (h)
yading@10 2234 ; read 8 rows of 8px each
yading@10 2235 movu m0, [dst1q+mstrideq*4]
yading@10 2236 movu m1, [dst2q+mstrideq*4]
yading@10 2237 movu m2, [dst1q+mstrideq*2]
yading@10 2238 movu m3, [dst1q+mstrideq ]
yading@10 2239 movu m4, [dst1q]
yading@10 2240 movu m5, [dst2q]
yading@10 2241 movu m6, [dst2q+ strideq ]
yading@10 2242
yading@10 2243 ; 8x8 transpose
yading@10 2244 TRANSPOSE4x4B 0, 1, 2, 3, 7
yading@10 2245 mova m_q0backup, m1
yading@10 2246 movu m7, [dst2q+ strideq*2]
yading@10 2247 TRANSPOSE4x4B 4, 5, 6, 7, 1
yading@10 2248 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
yading@10 2249 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
yading@10 2250 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
yading@10 2251 mova m1, m_q0backup
yading@10 2252 mova m_q0backup, m2 ; store q0
yading@10 2253 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
yading@10 2254 mova m_p0backup, m5 ; store p0
yading@10 2255 SWAP 1, 4
yading@10 2256 SWAP 2, 4
yading@10 2257 SWAP 6, 3
yading@10 2258 SWAP 5, 3
yading@10 2259 %else ; sse2 (h)
yading@10 2260 %if %2 == 16
yading@10 2261 lea dst8q, [dst1q+ strideq*8 ]
yading@10 2262 %endif
yading@10 2263
yading@10 2264 ; read 16 rows of 8px each, interleave
yading@10 2265 movh m0, [dst1q+mstrideq*4]
yading@10 2266 movh m1, [dst8q+mstrideq*4]
yading@10 2267 movh m2, [dst1q+mstrideq*2]
yading@10 2268 movh m5, [dst8q+mstrideq*2]
yading@10 2269 movh m3, [dst1q+mstrideq ]
yading@10 2270 movh m6, [dst8q+mstrideq ]
yading@10 2271 movh m4, [dst1q]
yading@10 2272 movh m7, [dst8q]
yading@10 2273 punpcklbw m0, m1 ; A/I
yading@10 2274 punpcklbw m2, m5 ; C/K
yading@10 2275 punpcklbw m3, m6 ; D/L
yading@10 2276 punpcklbw m4, m7 ; E/M
yading@10 2277
yading@10 2278 add dst8q, strideq
yading@10 2279 movh m1, [dst2q+mstrideq*4]
yading@10 2280 movh m6, [dst8q+mstrideq*4]
yading@10 2281 movh m5, [dst2q]
yading@10 2282 movh m7, [dst8q]
yading@10 2283 punpcklbw m1, m6 ; B/J
yading@10 2284 punpcklbw m5, m7 ; F/N
yading@10 2285 movh m6, [dst2q+ strideq ]
yading@10 2286 movh m7, [dst8q+ strideq ]
yading@10 2287 punpcklbw m6, m7 ; G/O
yading@10 2288
yading@10 2289 ; 8x16 transpose
yading@10 2290 TRANSPOSE4x4B 0, 1, 2, 3, 7
yading@10 2291 %ifdef m8
yading@10 2292 SWAP 1, 8
yading@10 2293 %else
yading@10 2294 mova m_q0backup, m1
yading@10 2295 %endif
yading@10 2296 movh m7, [dst2q+ strideq*2]
yading@10 2297 movh m1, [dst8q+ strideq*2]
yading@10 2298 punpcklbw m7, m1 ; H/P
yading@10 2299 TRANSPOSE4x4B 4, 5, 6, 7, 1
yading@10 2300 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
yading@10 2301 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
yading@10 2302 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
yading@10 2303 %ifdef m8
yading@10 2304 SWAP 1, 8
yading@10 2305 SWAP 2, 8
yading@10 2306 %else
yading@10 2307 mova m1, m_q0backup
yading@10 2308 mova m_q0backup, m2 ; store q0
yading@10 2309 %endif
yading@10 2310 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
yading@10 2311 %ifdef m12
yading@10 2312 SWAP 5, 12
yading@10 2313 %else
yading@10 2314 mova m_p0backup, m5 ; store p0
yading@10 2315 %endif
yading@10 2316 SWAP 1, 4
yading@10 2317 SWAP 2, 4
yading@10 2318 SWAP 6, 3
yading@10 2319 SWAP 5, 3
yading@10 2320 %endif
yading@10 2321
yading@10 2322 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
yading@10 2323 mova m4, m1
yading@10 2324 SWAP 4, 1
yading@10 2325 psubusb m4, m0 ; p2-p3
yading@10 2326 psubusb m0, m1 ; p3-p2
yading@10 2327 por m0, m4 ; abs(p3-p2)
yading@10 2328
yading@10 2329 mova m4, m2
yading@10 2330 SWAP 4, 2
yading@10 2331 psubusb m4, m1 ; p1-p2
yading@10 2332 mova m_p2backup, m1
yading@10 2333 psubusb m1, m2 ; p2-p1
yading@10 2334 por m1, m4 ; abs(p2-p1)
yading@10 2335
yading@10 2336 mova m4, m6
yading@10 2337 SWAP 4, 6
yading@10 2338 psubusb m4, m7 ; q2-q3
yading@10 2339 psubusb m7, m6 ; q3-q2
yading@10 2340 por m7, m4 ; abs(q3-q2)
yading@10 2341
yading@10 2342 mova m4, m5
yading@10 2343 SWAP 4, 5
yading@10 2344 psubusb m4, m6 ; q1-q2
yading@10 2345 mova m_q2backup, m6
yading@10 2346 psubusb m6, m5 ; q2-q1
yading@10 2347 por m6, m4 ; abs(q2-q1)
yading@10 2348
yading@10 2349 %if notcpuflag(mmxext)
yading@10 2350 mova m4, m_flimI
yading@10 2351 pxor m3, m3
yading@10 2352 psubusb m0, m4
yading@10 2353 psubusb m1, m4
yading@10 2354 psubusb m7, m4
yading@10 2355 psubusb m6, m4
yading@10 2356 pcmpeqb m0, m3 ; abs(p3-p2) <= I
yading@10 2357 pcmpeqb m1, m3 ; abs(p2-p1) <= I
yading@10 2358 pcmpeqb m7, m3 ; abs(q3-q2) <= I
yading@10 2359 pcmpeqb m6, m3 ; abs(q2-q1) <= I
yading@10 2360 pand m0, m1
yading@10 2361 pand m7, m6
yading@10 2362 pand m0, m7
yading@10 2363 %else ; mmxext/sse2
yading@10 2364 pmaxub m0, m1
yading@10 2365 pmaxub m6, m7
yading@10 2366 pmaxub m0, m6
yading@10 2367 %endif
yading@10 2368
yading@10 2369 ; normal_limit and high_edge_variance for p1-p0, q1-q0
yading@10 2370 SWAP 7, 3 ; now m7 is zero
yading@10 2371 %ifidn %1, v
yading@10 2372 movrow m3, [dst1q+mstrideq ] ; p0
yading@10 2373 %if mmsize == 16 && %2 == 8
yading@10 2374 movhps m3, [dst8q+mstrideq ]
yading@10 2375 %endif
yading@10 2376 %elifdef m12
yading@10 2377 SWAP 3, 12
yading@10 2378 %else
yading@10 2379 mova m3, m_p0backup
yading@10 2380 %endif
yading@10 2381
yading@10 2382 mova m1, m2
yading@10 2383 SWAP 1, 2
yading@10 2384 mova m6, m3
yading@10 2385 SWAP 3, 6
yading@10 2386 psubusb m1, m3 ; p1-p0
yading@10 2387 psubusb m6, m2 ; p0-p1
yading@10 2388 por m1, m6 ; abs(p1-p0)
yading@10 2389 %if notcpuflag(mmxext)
yading@10 2390 mova m6, m1
yading@10 2391 psubusb m1, m4
yading@10 2392 psubusb m6, m_hevthr
yading@10 2393 pcmpeqb m1, m7 ; abs(p1-p0) <= I
yading@10 2394 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
yading@10 2395 pand m0, m1
yading@10 2396 mova m_maskres, m6
yading@10 2397 %else ; mmxext/sse2
yading@10 2398 pmaxub m0, m1 ; max_I
yading@10 2399 SWAP 1, 4 ; max_hev_thresh
yading@10 2400 %endif
yading@10 2401
yading@10 2402 SWAP 6, 4 ; now m6 is I
yading@10 2403 %ifidn %1, v
yading@10 2404 movrow m4, [dst1q] ; q0
yading@10 2405 %if mmsize == 16 && %2 == 8
yading@10 2406 movhps m4, [dst8q]
yading@10 2407 %endif
yading@10 2408 %elifdef m8
yading@10 2409 SWAP 4, 8
yading@10 2410 %else
yading@10 2411 mova m4, m_q0backup
yading@10 2412 %endif
yading@10 2413 mova m1, m4
yading@10 2414 SWAP 1, 4
yading@10 2415 mova m7, m5
yading@10 2416 SWAP 7, 5
yading@10 2417 psubusb m1, m5 ; q0-q1
yading@10 2418 psubusb m7, m4 ; q1-q0
yading@10 2419 por m1, m7 ; abs(q1-q0)
yading@10 2420 %if notcpuflag(mmxext)
yading@10 2421 mova m7, m1
yading@10 2422 psubusb m1, m6
yading@10 2423 psubusb m7, m_hevthr
yading@10 2424 pxor m6, m6
yading@10 2425 pcmpeqb m1, m6 ; abs(q1-q0) <= I
yading@10 2426 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
yading@10 2427 mova m6, m_maskres
yading@10 2428 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
yading@10 2429 pand m6, m7
yading@10 2430 %else ; mmxext/sse2
yading@10 2431 pxor m7, m7
yading@10 2432 pmaxub m0, m1
yading@10 2433 pmaxub m6, m1
yading@10 2434 psubusb m0, m_flimI
yading@10 2435 psubusb m6, m_hevthr
yading@10 2436 pcmpeqb m0, m7 ; max(abs(..)) <= I
yading@10 2437 pcmpeqb m6, m7 ; !(max(abs..) > thresh)
yading@10 2438 %endif
yading@10 2439 %ifdef m12
yading@10 2440 SWAP 6, 12
yading@10 2441 %else
yading@10 2442 mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
yading@10 2443 %endif
yading@10 2444
yading@10 2445 ; simple_limit
yading@10 2446 mova m1, m3
yading@10 2447 SWAP 1, 3
yading@10 2448 mova m6, m4 ; keep copies of p0/q0 around for later use
yading@10 2449 SWAP 6, 4
yading@10 2450 psubusb m1, m4 ; p0-q0
yading@10 2451 psubusb m6, m3 ; q0-p0
yading@10 2452 por m1, m6 ; abs(q0-p0)
yading@10 2453 paddusb m1, m1 ; m1=2*abs(q0-p0)
yading@10 2454
yading@10 2455 mova m7, m2
yading@10 2456 SWAP 7, 2
yading@10 2457 mova m6, m5
yading@10 2458 SWAP 6, 5
yading@10 2459 psubusb m7, m5 ; p1-q1
yading@10 2460 psubusb m6, m2 ; q1-p1
yading@10 2461 por m7, m6 ; abs(q1-p1)
yading@10 2462 pxor m6, m6
yading@10 2463 pand m7, [pb_FE]
yading@10 2464 psrlq m7, 1 ; abs(q1-p1)/2
yading@10 2465 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
yading@10 2466 psubusb m7, m_flimE
yading@10 2467 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
yading@10 2468 pand m0, m7 ; normal_limit result
yading@10 2469
yading@10 2470 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
yading@10 2471 %ifdef m8 ; x86-64 && sse2
yading@10 2472 mova m8, [pb_80]
yading@10 2473 %define m_pb_80 m8
yading@10 2474 %else ; x86-32 or mmx/mmxext
yading@10 2475 %define m_pb_80 [pb_80]
yading@10 2476 %endif
yading@10 2477 mova m1, m4
yading@10 2478 mova m7, m3
yading@10 2479 pxor m1, m_pb_80
yading@10 2480 pxor m7, m_pb_80
yading@10 2481 psubsb m1, m7 ; (signed) q0-p0
yading@10 2482 mova m6, m2
yading@10 2483 mova m7, m5
yading@10 2484 pxor m6, m_pb_80
yading@10 2485 pxor m7, m_pb_80
yading@10 2486 psubsb m6, m7 ; (signed) p1-q1
yading@10 2487 mova m7, m_maskres
yading@10 2488 paddsb m6, m1
yading@10 2489 paddsb m6, m1
yading@10 2490 paddsb m6, m1
yading@10 2491 pand m6, m0
yading@10 2492 %ifdef m8
yading@10 2493 mova m_limres, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
yading@10 2494 pand m_limres, m7
yading@10 2495 %else
yading@10 2496 mova m0, m6
yading@10 2497 pand m0, m7
yading@10 2498 mova m_limres, m0
yading@10 2499 %endif
yading@10 2500 pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common
yading@10 2501
yading@10 2502 mova m1, [pb_F8]
yading@10 2503 mova m6, m7
yading@10 2504 paddsb m7, [pb_3]
yading@10 2505 paddsb m6, [pb_4]
yading@10 2506 pand m7, m1
yading@10 2507 pand m6, m1
yading@10 2508
yading@10 2509 pxor m1, m1
yading@10 2510 pxor m0, m0
yading@10 2511 pcmpgtb m1, m7
yading@10 2512 psubb m0, m7
yading@10 2513 psrlq m7, 3 ; +f2
yading@10 2514 psrlq m0, 3 ; -f2
yading@10 2515 pand m0, m1
yading@10 2516 pandn m1, m7
yading@10 2517 psubusb m3, m0
yading@10 2518 paddusb m3, m1 ; p0+f2
yading@10 2519
yading@10 2520 pxor m1, m1
yading@10 2521 pxor m0, m0
yading@10 2522 pcmpgtb m0, m6
yading@10 2523 psubb m1, m6
yading@10 2524 psrlq m6, 3 ; +f1
yading@10 2525 psrlq m1, 3 ; -f1
yading@10 2526 pand m1, m0
yading@10 2527 pandn m0, m6
yading@10 2528 psubusb m4, m0
yading@10 2529 paddusb m4, m1 ; q0-f1
yading@10 2530
yading@10 2531 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
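; scalar sketch of filter_mbedge (RFC 6386), applied where the edge is not
; high-variance; w = clamp(clamp(p1-q1) + 3*(q0-p0)) is the masked value
; saved in lim_res above, clamp() saturating to signed 8 bits (notation
; invented here for illustration):
;
;   a0 = clamp((27 * w + 63) >> 7);   p0 += a0;   q0 -= a0;
;   a1 = clamp((18 * w + 63) >> 7);   p1 += a1;   q1 -= a1;
;   a2 = clamp(( 9 * w + 63) >> 7);   p2 += a2;   q2 -= a2;
;
; ssse3 does each stage as one pmaddubsw against pb_27_63/pb_18_63/pb_9_63
; (w interleaved with 1, fusing the multiply and the +63 rounding); older
; CPUs use pmullw with pw_27/pw_18/pw_9 plus a pw_63 add, and psraw 7 +
; packsswb provide the shift and clamp in both variants.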
yading@10 2532 %if cpuflag(ssse3)
yading@10 2533 mova m7, [pb_1]
yading@10 2534 %else
yading@10 2535 mova m7, [pw_63]
yading@10 2536 %endif
yading@10 2537 %ifdef m8
yading@10 2538 SWAP 1, 8
yading@10 2539 %else
yading@10 2540 mova m1, m_limres
yading@10 2541 %endif
yading@10 2542 pxor m0, m0
yading@10 2543 mova m6, m1
yading@10 2544 pcmpgtb m0, m1 ; which are negative
yading@10 2545 %if cpuflag(ssse3)
yading@10 2546 punpcklbw m6, m7 ; interleave with "1" for rounding
yading@10 2547 punpckhbw m1, m7
yading@10 2548 %else
yading@10 2549 punpcklbw m6, m0 ; signed byte->word
yading@10 2550 punpckhbw m1, m0
yading@10 2551 %endif
yading@10 2552 mova m_limsign, m0
yading@10 2553 %if cpuflag(ssse3)
yading@10 2554 mova m7, [pb_27_63]
yading@10 2555 %ifndef m8
yading@10 2556 mova m_limres, m1
yading@10 2557 %endif
yading@10 2558 %ifdef m10
yading@10 2559 SWAP 0, 10 ; don't lose lim_sign copy
yading@10 2560 %endif
yading@10 2561 mova m0, m7
yading@10 2562 pmaddubsw m7, m6
yading@10 2563 SWAP 6, 7
yading@10 2564 pmaddubsw m0, m1
yading@10 2565 SWAP 1, 0
yading@10 2566 %ifdef m10
yading@10 2567 SWAP 0, 10
yading@10 2568 %else
yading@10 2569 mova m0, m_limsign
yading@10 2570 %endif
yading@10 2571 %else
yading@10 2572 mova m_maskres, m6 ; backup for later in filter
yading@10 2573 mova m_limres, m1
yading@10 2574 pmullw m6, [pw_27]
yading@10 2575 pmullw m1, [pw_27]
yading@10 2576 paddw m6, m7
yading@10 2577 paddw m1, m7
yading@10 2578 %endif
yading@10 2579 psraw m6, 7
yading@10 2580 psraw m1, 7
yading@10 2581 packsswb m6, m1 ; a0
yading@10 2582 pxor m1, m1
yading@10 2583 psubb m1, m6
yading@10 2584 pand m1, m0 ; -a0
yading@10 2585 pandn m0, m6 ; +a0
yading@10 2586 %if cpuflag(ssse3)
yading@10 2587 mova m6, [pb_18_63] ; pipelining
yading@10 2588 %endif
yading@10 2589 psubusb m3, m1
yading@10 2590 paddusb m4, m1
yading@10 2591 paddusb m3, m0 ; p0+a0
yading@10 2592 psubusb m4, m0 ; q0-a0
yading@10 2593
yading@10 2594 %if cpuflag(ssse3)
yading@10 2595 SWAP 6, 7
yading@10 2596 %ifdef m10
yading@10 2597 SWAP 1, 10
yading@10 2598 %else
yading@10 2599 mova m1, m_limres
yading@10 2600 %endif
yading@10 2601 mova m0, m7
yading@10 2602 pmaddubsw m7, m6
yading@10 2603 SWAP 6, 7
yading@10 2604 pmaddubsw m0, m1
yading@10 2605 SWAP 1, 0
yading@10 2606 %ifdef m10
yading@10 2607 SWAP 0, 10
yading@10 2608 %endif
yading@10 2609 mova m0, m_limsign
yading@10 2610 %else
yading@10 2611 mova m6, m_maskres
yading@10 2612 mova m1, m_limres
yading@10 2613 pmullw m6, [pw_18]
yading@10 2614 pmullw m1, [pw_18]
yading@10 2615 paddw m6, m7
yading@10 2616 paddw m1, m7
yading@10 2617 %endif
yading@10 2618 mova m0, m_limsign
yading@10 2619 psraw m6, 7
yading@10 2620 psraw m1, 7
yading@10 2621 packsswb m6, m1 ; a1
yading@10 2622 pxor m1, m1
yading@10 2623 psubb m1, m6
yading@10 2624 pand m1, m0 ; -a1
yading@10 2625 pandn m0, m6 ; +a1
yading@10 2626 %if cpuflag(ssse3)
yading@10 2627 mova m6, [pb_9_63]
yading@10 2628 %endif
yading@10 2629 psubusb m2, m1
yading@10 2630 paddusb m5, m1
yading@10 2631 paddusb m2, m0 ; p1+a1
yading@10 2632 psubusb m5, m0 ; q1-a1
yading@10 2633
yading@10 2634 %if cpuflag(ssse3)
yading@10 2635 SWAP 6, 7
yading@10 2636 %ifdef m10
yading@10 2637 SWAP 1, 10
yading@10 2638 %else
yading@10 2639 mova m1, m_limres
yading@10 2640 %endif
yading@10 2641 mova m0, m7
yading@10 2642 pmaddubsw m7, m6
yading@10 2643 SWAP 6, 7
yading@10 2644 pmaddubsw m0, m1
yading@10 2645 SWAP 1, 0
yading@10 2646 %else
yading@10 2647 %ifdef m8
yading@10 2648 SWAP 6, 12
yading@10 2649 SWAP 1, 8
yading@10 2650 %else
yading@10 2651 mova m6, m_maskres
yading@10 2652 mova m1, m_limres
yading@10 2653 %endif
yading@10 2654 pmullw m6, [pw_9]
yading@10 2655 pmullw m1, [pw_9]
yading@10 2656 paddw m6, m7
yading@10 2657 paddw m1, m7
yading@10 2658 %endif
yading@10 2659 %ifdef m9
yading@10 2660 SWAP 7, 9
yading@10 2661 %else
yading@10 2662 mova m7, m_limsign
yading@10 2663 %endif
yading@10 2664 psraw m6, 7
yading@10 2665 psraw m1, 7
yading@10 2666 packsswb m6, m1 ; a2
yading@10 2667 pxor m0, m0
yading@10 2668 psubb m0, m6
yading@10 2669 pand m0, m7 ; -a2
yading@10 2670 pandn m7, m6 ; +a2
yading@10 2671 %ifdef m8
yading@10 2672 SWAP 1, 13
yading@10 2673 SWAP 6, 14
yading@10 2674 %else
yading@10 2675 mova m1, m_p2backup
yading@10 2676 mova m6, m_q2backup
yading@10 2677 %endif
yading@10 2678 psubusb m1, m0
yading@10 2679 paddusb m6, m0
yading@10 2680 paddusb m1, m7 ; p2+a2
yading@10 2681 psubusb m6, m7 ; q2-a2
yading@10 2682
yading@10 2683 ; store
yading@10 2684 %ifidn %1, v
yading@10 2685 movrow [dst2q+mstrideq*4], m1
yading@10 2686 movrow [dst1q+mstrideq*2], m2
yading@10 2687 movrow [dst1q+mstrideq ], m3
yading@10 2688 movrow [dst1q], m4
yading@10 2689 movrow [dst2q], m5
yading@10 2690 movrow [dst2q+ strideq ], m6
yading@10 2691 %if mmsize == 16 && %2 == 8
yading@10 2692 add dst8q, mstrideq
yading@10 2693 movhps [dst8q+mstrideq*2], m1
yading@10 2694 movhps [dst8q+mstrideq ], m2
yading@10 2695 movhps [dst8q], m3
yading@10 2696 add dst8q, strideq
yading@10 2697 movhps [dst8q], m4
yading@10 2698 movhps [dst8q+ strideq ], m5
yading@10 2699 movhps [dst8q+ strideq*2], m6
yading@10 2700 %endif
yading@10 2701 %else ; h
yading@10 2702 inc dst1q
yading@10 2703 inc dst2q
yading@10 2704
yading@10 2705 ; 4x8/16 transpose
yading@10 2706 TRANSPOSE4x4B 1, 2, 3, 4, 0
yading@10 2707 SBUTTERFLY bw, 5, 6, 0
yading@10 2708
yading@10 2709 %if mmsize == 8 ; mmx/mmxext (h)
yading@10 2710 WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq
yading@10 2711 add dst1q, 4
yading@10 2712 WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq
yading@10 2713 %else ; sse2 (h)
yading@10 2714 lea dst8q, [dst8q+mstrideq+1]
yading@10 2715 WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2
yading@10 2716 lea dst1q, [dst2q+mstrideq+4]
yading@10 2717 lea dst8q, [dst8q+mstrideq+4]
yading@10 2718 %if cpuflag(sse4)
yading@10 2719 add dst2q, 4
yading@10 2720 %endif
yading@10 2721 WRITE_8W m5, dst2q, dst1q, mstrideq, strideq
yading@10 2722 %if cpuflag(sse4)
yading@10 2723 lea dst2q, [dst8q+ strideq ]
yading@10 2724 %endif
yading@10 2725 WRITE_8W m6, dst2q, dst8q, mstrideq, strideq
yading@10 2726 %endif
yading@10 2727 %endif
yading@10 2728
yading@10 2729 %if mmsize == 8
yading@10 2730 %if %2 == 8 ; chroma
yading@10 2731 %ifidn %1, h
yading@10 2732 sub dst1q, 5
yading@10 2733 %endif
yading@10 2734 cmp dst1q, dst8q
yading@10 2735 mov dst1q, dst8q
yading@10 2736 jnz .next8px
yading@10 2737 %else
yading@10 2738 %ifidn %1, h
yading@10 2739 lea dst1q, [dst1q+ strideq*8-5]
yading@10 2740 %else ; v
yading@10 2741 add dst1q, 8
yading@10 2742 %endif
yading@10 2743 dec cntrq
yading@10 2744 jg .next8px
yading@10 2745 %endif
yading@10 2746 REP_RET
yading@10 2747 %else ; mmsize == 16
yading@10 2748 RET
yading@10 2749 %endif
yading@10 2750 %endmacro
yading@10 2751
yading@10 2752 %if ARCH_X86_32
yading@10 2753 INIT_MMX mmx
yading@10 2754 MBEDGE_LOOPFILTER v, 16
yading@10 2755 MBEDGE_LOOPFILTER h, 16
yading@10 2756 MBEDGE_LOOPFILTER v, 8
yading@10 2757 MBEDGE_LOOPFILTER h, 8
yading@10 2758
yading@10 2759 INIT_MMX mmxext
yading@10 2760 MBEDGE_LOOPFILTER v, 16
yading@10 2761 MBEDGE_LOOPFILTER h, 16
yading@10 2762 MBEDGE_LOOPFILTER v, 8
yading@10 2763 MBEDGE_LOOPFILTER h, 8
yading@10 2764 %endif
yading@10 2765
yading@10 2766 INIT_XMM sse2
yading@10 2767 MBEDGE_LOOPFILTER v, 16
yading@10 2768 MBEDGE_LOOPFILTER h, 16
yading@10 2769 MBEDGE_LOOPFILTER v, 8
yading@10 2770 MBEDGE_LOOPFILTER h, 8
yading@10 2771
yading@10 2772 INIT_XMM ssse3
yading@10 2773 MBEDGE_LOOPFILTER v, 16
yading@10 2774 MBEDGE_LOOPFILTER h, 16
yading@10 2775 MBEDGE_LOOPFILTER v, 8
yading@10 2776 MBEDGE_LOOPFILTER h, 8
yading@10 2777
yading@10 2778 INIT_XMM sse4
yading@10 2779 MBEDGE_LOOPFILTER h, 16
yading@10 2780 MBEDGE_LOOPFILTER h, 8