    const int w2= (width+1)>>1;
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
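    /* Rough road map: the SSE2 horizontal 9/7 inverse lifting below runs four
     * lifting passes over the low (w_l) and high (w_r) halves of the line held
     * in b, then interleaves the two halves back into b, with temp holding the
     * high-pass samples until the final merge. */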
        __asm__ volatile(
            "pcmpeqd   %%xmm7, %%xmm7         \n\t"
            "pcmpeqd   %%xmm3, %%xmm3         \n\t"
            "psllw         $1, %%xmm3         \n\t"
            "paddw     %%xmm7, %%xmm3         \n\t"
            "psllw        $13, %%xmm3         \n\t"
            ::);
        for(; i<w_l-15; i+=16){
            __asm__ volatile(
                "movdqu   (%1), %%xmm1        \n\t"
                "movdqu 16(%1), %%xmm5        \n\t"
                "movdqu  2(%1), %%xmm2        \n\t"
                "movdqu 18(%1), %%xmm6        \n\t"
                "paddw  %%xmm1, %%xmm2        \n\t"
                "paddw  %%xmm5, %%xmm6        \n\t"
                "paddw  %%xmm7, %%xmm2        \n\t"
                "paddw  %%xmm7, %%xmm6        \n\t"
                "pmulhw %%xmm3, %%xmm2        \n\t"
                "pmulhw %%xmm3, %%xmm6        \n\t"
                "paddw    (%0), %%xmm2        \n\t"
                "paddw  16(%0), %%xmm6        \n\t"
                "movdqa %%xmm2, (%0)          \n\t"
                "movdqa %%xmm6, 16(%0)        \n\t"
                :: "r"(&b[i]), "r"(&ref[i]));
        }
        for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
            dst[i] = dst[i] - (b[i] + b[i + 1]);
        }
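        /* The scalar loop above only advances i until &dst[i] reaches a
         * 32-byte boundary, so the vector loop below can use aligned movdqa
         * accesses on dst while b is read unaligned. */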
        for(; i<w_r-15; i+=16){
            __asm__ volatile(
                "movdqu   (%1), %%xmm1        \n\t"
                "movdqu 16(%1), %%xmm5        \n\t"
                "movdqu  2(%1), %%xmm2        \n\t"
                "movdqu 18(%1), %%xmm6        \n\t"
                "paddw  %%xmm1, %%xmm2        \n\t"
                "paddw  %%xmm5, %%xmm6        \n\t"
                "movdqa   (%0), %%xmm0        \n\t"
                "movdqa 16(%0), %%xmm4        \n\t"
                "psubw  %%xmm2, %%xmm0        \n\t"
                "psubw  %%xmm6, %%xmm4        \n\t"
                "movdqa %%xmm0, (%0)          \n\t"
                "movdqa %%xmm4, 16(%0)        \n\t"
                :: "r"(&dst[i]), "r"(&b[i]));
        }
        __asm__ volatile(
            "psllw         $15, %%xmm7        \n\t"
            "pcmpeqw    %%xmm6, %%xmm6        \n\t"
            "psrlw         $13, %%xmm6        \n\t"
            "paddw      %%xmm7, %%xmm6        \n\t"
            ::);
        for(; i<w_l-15; i+=16){
            __asm__ volatile(
                "movdqu   (%1), %%xmm0        \n\t"
                "movdqu 16(%1), %%xmm4        \n\t"
                "movdqu  2(%1), %%xmm1        \n\t"
                "movdqu 18(%1), %%xmm5        \n\t"
                "paddw  %%xmm6, %%xmm0        \n\t"
                "paddw  %%xmm6, %%xmm4        \n\t"
                "paddw  %%xmm7, %%xmm1        \n\t"
                "paddw  %%xmm7, %%xmm5        \n\t"
                "pavgw  %%xmm1, %%xmm0        \n\t"
                "pavgw  %%xmm5, %%xmm4        \n\t"
                "psubw  %%xmm7, %%xmm0        \n\t"
                "psubw  %%xmm7, %%xmm4        \n\t"
                "psraw      $1, %%xmm0        \n\t"
                "psraw      $1, %%xmm4        \n\t"
                "movdqa   (%0), %%xmm1        \n\t"
                "movdqa 16(%0), %%xmm5        \n\t"
                "paddw  %%xmm1, %%xmm0        \n\t"
                "paddw  %%xmm5, %%xmm4        \n\t"
                "psraw      $2, %%xmm0        \n\t"
                "psraw      $2, %%xmm4        \n\t"
                "paddw  %%xmm1, %%xmm0        \n\t"
                "paddw  %%xmm5, %%xmm4        \n\t"
                "movdqa %%xmm0, (%0)          \n\t"
                "movdqa %%xmm4, 16(%0)        \n\t"
                :: "r"(&b[i]), "r"(&ref[i]));
        }
        b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
        for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
            temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
        }
        for(; i<w_r-7; i+=8){
            __asm__ volatile(
                "movdqu  2(%1), %%xmm2        \n\t"
                "movdqu 18(%1), %%xmm6        \n\t"
                "paddw    (%1), %%xmm2        \n\t"
                "paddw  16(%1), %%xmm6        \n\t"
                "movdqu   (%0), %%xmm0        \n\t"
                "movdqu 16(%0), %%xmm4        \n\t"
                "paddw  %%xmm2, %%xmm0        \n\t"
                "paddw  %%xmm6, %%xmm4        \n\t"
                "psraw      $1, %%xmm2        \n\t"
                "psraw      $1, %%xmm6        \n\t"
                "paddw  %%xmm0, %%xmm2        \n\t"
                "paddw  %%xmm4, %%xmm6        \n\t"
                "movdqa %%xmm2, (%2)          \n\t"
                "movdqa %%xmm6, 16(%2)        \n\t"
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]));
        }
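        /* Interleave: merge the low half of b (even output samples) with the
         * high half now sitting in temp (odd output samples), walking the line
         * backwards so the merge can be done in place. */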
        for (; (i & 0x3E) != 0x3E; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=62; i>=0; i-=64){
            __asm__ volatile(
                "movdqa      (%1), %%xmm0       \n\t"
                "movdqa    16(%1), %%xmm2       \n\t"
                "movdqa    32(%1), %%xmm4       \n\t"
                "movdqa    48(%1), %%xmm6       \n\t"
                "movdqa      (%1), %%xmm1       \n\t"
                "movdqa    16(%1), %%xmm3       \n\t"
                "movdqa    32(%1), %%xmm5       \n\t"
                "movdqa    48(%1), %%xmm7       \n\t"
                "punpcklwd   (%2), %%xmm0       \n\t"
                "punpcklwd 16(%2), %%xmm2       \n\t"
                "punpcklwd 32(%2), %%xmm4       \n\t"
                "punpcklwd 48(%2), %%xmm6       \n\t"
                "movdqa    %%xmm0, (%0)         \n\t"
                "movdqa    %%xmm2, 32(%0)       \n\t"
                "movdqa    %%xmm4, 64(%0)       \n\t"
                "movdqa    %%xmm6, 96(%0)       \n\t"
                "punpckhwd   (%2), %%xmm1       \n\t"
                "punpckhwd 16(%2), %%xmm3       \n\t"
                "punpckhwd 32(%2), %%xmm5       \n\t"
                "punpckhwd 48(%2), %%xmm7       \n\t"
                "movdqa    %%xmm1, 16(%0)       \n\t"
                "movdqa    %%xmm3, 48(%0)       \n\t"
                "movdqa    %%xmm5, 80(%0)       \n\t"
                "movdqa    %%xmm7, 112(%0)      \n\t"
                :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1]));
        }
static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
    const int w2= (width+1)>>1;
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
        __asm__ volatile(
            "pcmpeqw    %%mm7, %%mm7         \n\t"
            "pcmpeqw    %%mm3, %%mm3         \n\t"
            "psllw         $1, %%mm3         \n\t"
            "paddw      %%mm7, %%mm3         \n\t"
            "psllw        $13, %%mm3         \n\t"
            ::);
        for(; i<w_l-7; i+=8){
            __asm__ volatile(
                "movq     (%1), %%mm2        \n\t"
                "movq    8(%1), %%mm6        \n\t"
                "paddw   2(%1), %%mm2        \n\t"
                "paddw  10(%1), %%mm6        \n\t"
                "paddw   %%mm7, %%mm2        \n\t"
                "paddw   %%mm7, %%mm6        \n\t"
                "pmulhw  %%mm3, %%mm2        \n\t"
                "pmulhw  %%mm3, %%mm6        \n\t"
                "paddw    (%0), %%mm2        \n\t"
                "paddw   8(%0), %%mm6        \n\t"
                "movq    %%mm2, (%0)         \n\t"
                "movq    %%mm6, 8(%0)        \n\t"
                :: "r"(&b[i]), "r"(&ref[i]));
        }
        for(; i<w_r-7; i+=8){
            __asm__ volatile(
                "movq     (%1), %%mm2        \n\t"
                "movq    8(%1), %%mm6        \n\t"
                "paddw   2(%1), %%mm2        \n\t"
                "paddw  10(%1), %%mm6        \n\t"
                "movq     (%0), %%mm0        \n\t"
                "movq    8(%0), %%mm4        \n\t"
                "psubw   %%mm2, %%mm0        \n\t"
                "psubw   %%mm6, %%mm4        \n\t"
                "movq    %%mm0, (%0)         \n\t"
                "movq    %%mm4, 8(%0)        \n\t"
                :: "r"(&dst[i]), "r"(&b[i]));
        }
        b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
        __asm__ volatile(
            "psllw         $15, %%mm7        \n\t"
            "pcmpeqw     %%mm6, %%mm6        \n\t"
            "psrlw         $13, %%mm6        \n\t"
            "paddw       %%mm7, %%mm6        \n\t"
            ::);
        for(; i<w_l-7; i+=8){
            __asm__ volatile(
                "movq     (%1), %%mm0        \n\t"
                "movq    8(%1), %%mm4        \n\t"
                "movq    2(%1), %%mm1        \n\t"
                "movq   10(%1), %%mm5        \n\t"
                "paddw   %%mm6, %%mm0        \n\t"
                "paddw   %%mm6, %%mm4        \n\t"
                "paddw   %%mm7, %%mm1        \n\t"
                "paddw   %%mm7, %%mm5        \n\t"
                "pavgw   %%mm1, %%mm0        \n\t"
                "pavgw   %%mm5, %%mm4        \n\t"
                "psubw   %%mm7, %%mm0        \n\t"
                "psubw   %%mm7, %%mm4        \n\t"
                "psraw      $1, %%mm0        \n\t"
                "psraw      $1, %%mm4        \n\t"
                "movq     (%0), %%mm1        \n\t"
                "movq    8(%0), %%mm5        \n\t"
                "paddw   %%mm1, %%mm0        \n\t"
                "paddw   %%mm5, %%mm4        \n\t"
                "psraw      $2, %%mm0        \n\t"
                "psraw      $2, %%mm4        \n\t"
                "paddw   %%mm1, %%mm0        \n\t"
                "paddw   %%mm5, %%mm4        \n\t"
                "movq    %%mm0, (%0)         \n\t"
                "movq    %%mm4, 8(%0)        \n\t"
                :: "r"(&b[i]), "r"(&ref[i]));
        }
        for(; i<w_r-7; i+=8){
            __asm__ volatile(
                "movq    2(%1), %%mm2        \n\t"
                "movq   10(%1), %%mm6        \n\t"
                "paddw    (%1), %%mm2        \n\t"
                "paddw   8(%1), %%mm6        \n\t"
                "movq     (%0), %%mm0        \n\t"
                "movq    8(%0), %%mm4        \n\t"
                "paddw   %%mm2, %%mm0        \n\t"
                "paddw   %%mm6, %%mm4        \n\t"
                "psraw      $1, %%mm2        \n\t"
                "psraw      $1, %%mm6        \n\t"
                "paddw   %%mm0, %%mm2        \n\t"
                "paddw   %%mm4, %%mm6        \n\t"
                "movq    %%mm2, (%2)         \n\t"
                "movq    %%mm6, 8(%2)        \n\t"
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]));
        }
        for (; (i & 0x1E) != 0x1E; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=30; i>=0; i-=32){
            __asm__ volatile(
                "movq        (%1), %%mm0       \n\t"
                "movq       8(%1), %%mm2       \n\t"
                "movq      16(%1), %%mm4       \n\t"
                "movq      24(%1), %%mm6       \n\t"
                "movq        (%1), %%mm1       \n\t"
                "movq       8(%1), %%mm3       \n\t"
                "movq      16(%1), %%mm5       \n\t"
                "movq      24(%1), %%mm7       \n\t"
                "punpcklwd   (%2), %%mm0       \n\t"
                "punpcklwd  8(%2), %%mm2       \n\t"
                "punpcklwd 16(%2), %%mm4       \n\t"
                "punpcklwd 24(%2), %%mm6       \n\t"
                "movq       %%mm0, (%0)        \n\t"
                "movq       %%mm2, 16(%0)      \n\t"
                "movq       %%mm4, 32(%0)      \n\t"
                "movq       %%mm6, 48(%0)      \n\t"
                "punpckhwd   (%2), %%mm1       \n\t"
                "punpckhwd  8(%2), %%mm3       \n\t"
                "punpckhwd 16(%2), %%mm5       \n\t"
                "punpckhwd 24(%2), %%mm7       \n\t"
                "movq       %%mm1, 8(%0)       \n\t"
                "movq       %%mm3, 24(%0)      \n\t"
                "movq       %%mm5, 40(%0)      \n\t"
                "movq       %%mm7, 56(%0)      \n\t"
                :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1]));
        }
#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
        ""op" ("r",%%"REG_d"), %%"t0"      \n\t"\
        ""op" 16("r",%%"REG_d"), %%"t1"    \n\t"\
        ""op" 32("r",%%"REG_d"), %%"t2"    \n\t"\
        ""op" 48("r",%%"REG_d"), %%"t3"    \n\t"

#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)

#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)

#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
        "psubw %%"s0", %%"t0" \n\t"\
        "psubw %%"s1", %%"t1" \n\t"\
        "psubw %%"s2", %%"t2" \n\t"\
        "psubw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
        "movdqa %%"s0", ("w",%%"REG_d")      \n\t"\
        "movdqa %%"s1", 16("w",%%"REG_d")    \n\t"\
        "movdqa %%"s2", 32("w",%%"REG_d")    \n\t"\
        "movdqa %%"s3", 48("w",%%"REG_d")    \n\t"

#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
        "psraw $"n", %%"t0" \n\t"\
        "psraw $"n", %%"t1" \n\t"\
        "psraw $"n", %%"t2" \n\t"\
        "psraw $"n", %%"t3" \n\t"

#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
        "paddw %%"s0", %%"t0" \n\t"\
        "paddw %%"s1", %%"t1" \n\t"\
        "paddw %%"s2", %%"t2" \n\t"\
        "paddw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
        "pmulhw %%"s0", %%"t0" \n\t"\
        "pmulhw %%"s1", %%"t1" \n\t"\
        "pmulhw %%"s2", %%"t2" \n\t"\
        "pmulhw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
        "movdqa %%"s0", %%"t0" \n\t"\
        "movdqa %%"s1", %%"t1" \n\t"\
        "movdqa %%"s2", %%"t2" \n\t"\
        "movdqa %%"s3", %%"t3" \n\t"

        snow_vertical_compose_sse2_load("%4","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_add("%6","xmm1","xmm3","xmm5","xmm7")
        "pcmpeqw    %%xmm0, %%xmm0                   \n\t"
        "pcmpeqw    %%xmm2, %%xmm2                   \n\t"
        "paddw      %%xmm2, %%xmm2                   \n\t"
        "paddw      %%xmm0, %%xmm2                   \n\t"
        "psllw         $13, %%xmm2                   \n\t"
        snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
        "pcmpeqw %%xmm7, %%xmm7                      \n\t"
        "pcmpeqw %%xmm5, %%xmm5                      \n\t"
        "psllw $15, %%xmm7                           \n\t"
        "psrlw $13, %%xmm5                           \n\t"
        "paddw %%xmm7, %%xmm5                        \n\t"
        snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
        "movdqa   (%2,%%"REG_d"), %%xmm1      \n\t"
        "movdqa 16(%2,%%"REG_d"), %%xmm3      \n\t"
        "paddw %%xmm7, %%xmm1                        \n\t"
        "paddw %%xmm7, %%xmm3                        \n\t"
        "pavgw %%xmm1, %%xmm0                        \n\t"
        "pavgw %%xmm3, %%xmm2                        \n\t"
        "movdqa 32(%2,%%"REG_d"), %%xmm1      \n\t"
        "movdqa 48(%2,%%"REG_d"), %%xmm3      \n\t"
        "paddw %%xmm7, %%xmm1                        \n\t"
        "paddw %%xmm7, %%xmm3                        \n\t"
        "pavgw %%xmm1, %%xmm4                        \n\t"
        "pavgw %%xmm3, %%xmm6                        \n\t"
        snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")

        snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
        "sub $64, %%"REG_d"                          \n\t"
        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
        ""op" ("r",%%"REG_d"), %%"t0"   \n\t"\
        ""op" 8("r",%%"REG_d"), %%"t1"  \n\t"\
        ""op" 16("r",%%"REG_d"), %%"t2" \n\t"\
        ""op" 24("r",%%"REG_d"), %%"t3" \n\t"

#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
        snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
        snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
        "movq %%"s0", ("w",%%"REG_d")   \n\t"\
        "movq %%"s1", 8("w",%%"REG_d")  \n\t"\
        "movq %%"s2", 16("w",%%"REG_d") \n\t"\
        "movq %%"s3", 24("w",%%"REG_d") \n\t"

#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
        "movq %%"s0", %%"t0" \n\t"\
        "movq %%"s1", %%"t1" \n\t"\
        "movq %%"s2", %%"t2" \n\t"\
        "movq %%"s3", %%"t3" \n\t"

        snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
        "pcmpeqw    %%mm0, %%mm0                     \n\t"
        "pcmpeqw    %%mm2, %%mm2                     \n\t"
        "paddw      %%mm2, %%mm2                     \n\t"
        "paddw      %%mm0, %%mm2                     \n\t"
        "psllw        $13, %%mm2                     \n\t"
        snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
        snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
        snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
        "pcmpeqw %%mm7, %%mm7                        \n\t"
        "pcmpeqw %%mm5, %%mm5                        \n\t"
        "psllw $15, %%mm7                            \n\t"
        "psrlw $13, %%mm5                            \n\t"
        "paddw %%mm7, %%mm5                          \n\t"
        snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
        "movq   (%2,%%"REG_d"), %%mm1         \n\t"
        "movq  8(%2,%%"REG_d"), %%mm3         \n\t"
        "paddw %%mm7, %%mm1                          \n\t"
        "paddw %%mm7, %%mm3                          \n\t"
        "pavgw %%mm1, %%mm0                          \n\t"
        "pavgw %%mm3, %%mm2                          \n\t"
        "movq 16(%2,%%"REG_d"), %%mm1         \n\t"
        "movq 24(%2,%%"REG_d"), %%mm3         \n\t"
        "paddw %%mm7, %%mm1                          \n\t"
        "paddw %%mm7, %%mm3                          \n\t"
        "pavgw %%mm1, %%mm4                          \n\t"
        "pavgw %%mm3, %%mm6                          \n\t"
        snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")

        snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
        "sub $32, %%"REG_d"                          \n\t"
        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
#define snow_inner_add_yblock_sse2_header \
    IDWTELEM * * dst_array = sb->line + src_y;\
             "mov  %7, %%"REG_c"             \n\t"\
             "mov  %4, %%"REG_S"             \n\t"\
             "pxor %%xmm7, %%xmm7            \n\t" \
             "pcmpeqd %%xmm3, %%xmm3         \n\t"\
             "psllw $15, %%xmm3              \n\t"\
             "psrlw $12, %%xmm3              \n\t" \
             "mov %1, %%"REG_D"              \n\t"\
             "mov (%%"REG_D"), %%"REG_D"     \n\t"\
             "add %3, %%"REG_D"              \n\t"

#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
             "movq (%%"REG_d"), %%"out_reg1" \n\t"\
             "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
             "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
             "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
             "punpcklbw %%xmm7, %%xmm0       \n\t"\
             "punpcklbw %%xmm7, %%xmm4       \n\t"\
             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
             "pmullw %%xmm4, %%"out_reg2"    \n\t"

#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
             "movq (%%"REG_d"), %%"out_reg1" \n\t"\
             "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
             "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
             "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
             "punpcklbw %%xmm7, %%xmm0       \n\t"\
             "punpcklbw %%xmm7, %%xmm4       \n\t"\
             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
             "pmullw %%xmm4, %%"out_reg2"    \n\t"

#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
             snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
             "paddusw %%xmm2, %%xmm1         \n\t"\
             "paddusw %%xmm6, %%xmm5         \n\t"

#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
             snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
             "paddusw %%xmm2, %%xmm1         \n\t"\
             "paddusw %%xmm6, %%xmm5         \n\t"

#define snow_inner_add_yblock_sse2_end_common1\
             "add $32, %%"REG_S"             \n\t"\
             "add %%"REG_c", %0              \n\t"\
             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
             "add %%"REG_c", (%%"REG_a")     \n\t"

#define snow_inner_add_yblock_sse2_end_common2\
             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
             :\
             "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
             "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");

#define snow_inner_add_yblock_sse2_end_8\
             "sal $1, %%"REG_c"              \n\t"\
             "add"OPSIZE" $"PTR_SIZE"*2, %1  \n\t"\
             snow_inner_add_yblock_sse2_end_common1\
             "sar $1, %%"REG_c"              \n\t"\
             snow_inner_add_yblock_sse2_end_common2

#define snow_inner_add_yblock_sse2_end_16\
             "add"OPSIZE" $"PTR_SIZE"*1, %1  \n\t"\
             snow_inner_add_yblock_sse2_end_common1\
             snow_inner_add_yblock_sse2_end_common2

static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                                                       int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_8("2", "8")
snow_inner_add_yblock_sse2_accum_8("1", "128")
snow_inner_add_yblock_sse2_accum_8("0", "136")
             "mov %0, %%"REG_d"              \n\t"
             "movdqa (%%"REG_D"), %%xmm0     \n\t"
             "movdqa %%xmm1, %%xmm2          \n\t"

             "punpckhwd %%xmm7, %%xmm1       \n\t"
             "punpcklwd %%xmm7, %%xmm2       \n\t"
             "paddd %%xmm2, %%xmm0           \n\t"
             "movdqa 16(%%"REG_D"), %%xmm2   \n\t"
             "paddd %%xmm1, %%xmm2           \n\t"
             "paddd %%xmm3, %%xmm0           \n\t"
             "paddd %%xmm3, %%xmm2           \n\t"

             "mov %1, %%"REG_D"              \n\t"
             "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
             "add %3, %%"REG_D"              \n\t"

             "movdqa (%%"REG_D"), %%xmm4     \n\t"
             "movdqa %%xmm5, %%xmm6          \n\t"
             "punpckhwd %%xmm7, %%xmm5       \n\t"
             "punpcklwd %%xmm7, %%xmm6       \n\t"
             "paddd %%xmm6, %%xmm4           \n\t"
             "movdqa 16(%%"REG_D"), %%xmm6   \n\t"
             "paddd %%xmm5, %%xmm6           \n\t"
             "paddd %%xmm3, %%xmm4           \n\t"
             "paddd %%xmm3, %%xmm6           \n\t"

             "psrad $8, %%xmm0               \n\t"
             "psrad $8, %%xmm2               \n\t"
             "packssdw %%xmm2, %%xmm0        \n\t"
             "packuswb %%xmm7, %%xmm0        \n\t"
             "movq %%xmm0, (%%"REG_d")       \n\t"

             "psrad $8, %%xmm4               \n\t"
             "psrad $8, %%xmm6               \n\t"
             "packssdw %%xmm6, %%xmm4        \n\t"
             "packuswb %%xmm7, %%xmm4        \n\t"
             "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
snow_inner_add_yblock_sse2_end_8
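/* Roughly, the epilogue above turns the weighted sums accumulated in
 * xmm1/xmm5 into output pixels: the words are widened to 32 bits, added to
 * the IDWT coefficients addressed through REG_D together with the bias kept
 * in xmm3, shifted right by 8, and packed with unsigned saturation into two
 * rows of 8 bytes at dst8 (REG_d and REG_d + REG_c). */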
static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                                                int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_16("2", "16")
snow_inner_add_yblock_sse2_accum_16("1", "512")
snow_inner_add_yblock_sse2_accum_16("0", "528")
             "mov %0, %%"REG_d"              \n\t"
             "psrlw $4, %%xmm1               \n\t"
             "psrlw $4, %%xmm5               \n\t"
             "paddw   (%%"REG_D"), %%xmm1    \n\t"
             "paddw 16(%%"REG_D"), %%xmm5    \n\t"
             "paddw %%xmm3, %%xmm1           \n\t"
             "paddw %%xmm3, %%xmm5           \n\t"
             "psraw $4, %%xmm1               \n\t"
             "psraw $4, %%xmm5               \n\t"
             "packuswb %%xmm5, %%xmm1        \n\t"

             "movdqu %%xmm1, (%%"REG_d")       \n\t"

snow_inner_add_yblock_sse2_end_16
#define snow_inner_add_yblock_mmx_header \
    IDWTELEM * * dst_array = sb->line + src_y;\
             "mov  %7, %%"REG_c"             \n\t"\
             "mov  %4, %%"REG_S"             \n\t"\
             "pxor %%mm7, %%mm7              \n\t" \
             "pcmpeqd %%mm3, %%mm3           \n\t"\
             "psllw $15, %%mm3               \n\t"\
             "psrlw $12, %%mm3               \n\t" \
             "mov %1, %%"REG_D"              \n\t"\
             "mov (%%"REG_D"), %%"REG_D"     \n\t"\
             "add %3, %%"REG_D"              \n\t"

#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
             "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
             "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
             "punpcklbw %%mm7, %%"out_reg1" \n\t"\
             "punpcklbw %%mm7, %%"out_reg2" \n\t"\
             "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\
             "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\
             "punpcklbw %%mm7, %%mm0       \n\t"\
             "punpcklbw %%mm7, %%mm4       \n\t"\
             "pmullw %%mm0, %%"out_reg1"    \n\t"\
             "pmullw %%mm4, %%"out_reg2"    \n\t"

#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
             snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
             "paddusw %%mm2, %%mm1         \n\t"\
             "paddusw %%mm6, %%mm5         \n\t"

#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
             "mov %0, %%"REG_d"              \n\t"\
             "psrlw $4, %%mm1                \n\t"\
             "psrlw $4, %%mm5                \n\t"\
             "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\
             "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\
             "paddw %%mm3, %%mm1             \n\t"\
             "paddw %%mm3, %%mm5             \n\t"\
             "psraw $4, %%mm1                \n\t"\
             "psraw $4, %%mm5                \n\t"\
             "packuswb %%mm5, %%mm1          \n\t"\
             "movq %%mm1, "write_offset"(%%"REG_d") \n\t"

#define snow_inner_add_yblock_mmx_end(s_step)\
             "add $"s_step", %%"REG_S"             \n\t"\
             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
             "add %%"REG_c", (%%"REG_a")     \n\t"\
             "add"OPSIZE " $"PTR_SIZE"*1, %1 \n\t"\
             "add %%"REG_c", %0              \n\t"\
             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
             :\
             "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
             "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");

static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                                              int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "8", "0")
snow_inner_add_yblock_mmx_accum("1", "128", "0")
snow_inner_add_yblock_mmx_accum("0", "136", "0")
snow_inner_add_yblock_mmx_mix("0", "0")
snow_inner_add_yblock_mmx_end("16")
}

static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                                               int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "16", "0")
snow_inner_add_yblock_mmx_accum("1", "512", "0")
snow_inner_add_yblock_mmx_accum("0", "528", "0")
snow_inner_add_yblock_mmx_mix("0", "0")

snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
snow_inner_add_yblock_mmx_accum("2", "24", "8")
snow_inner_add_yblock_mmx_accum("1", "520", "8")
snow_inner_add_yblock_mmx_accum("0", "536", "8")
snow_inner_add_yblock_mmx_mix("16", "8")
snow_inner_add_yblock_mmx_end("32")
}
static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    if (b_w == 16)
        inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else if (b_w == 8 && obmc_stride == 16) {
        if (!(b_h & 1))
            inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
        else
            inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    } else
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}
static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                         int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    if (b_w == 16)
        inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else if (b_w == 8 && obmc_stride == 16)
        inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}
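/* These dispatchers are what ff_dwt_init_x86() is expected to hook into the
 * Snow DWT context, based on av_get_cpu_flags(). A rough sketch of that
 * wiring, assuming the usual context fields and the _mmx/_sse2 function names
 * used in this file (the exact flag checks are an assumption, not shown in
 * this excerpt):
 *
 *     void ff_dwt_init_x86(SnowDWTContext *c)
 *     {
 *         int cpu_flags = av_get_cpu_flags();
 *         if (cpu_flags & AV_CPU_FLAG_MMX) {
 *             c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
 *             c->vertical_compose97i   = ff_snow_vertical_compose97i_mmx;
 *             c->inner_add_yblock      = ff_snow_inner_add_yblock_mmx;
 *         }
 *         if (cpu_flags & AV_CPU_FLAG_SSE2) {
 *             c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
 *             c->vertical_compose97i   = ff_snow_vertical_compose97i_sse2;
 *             c->inner_add_yblock      = ff_snow_inner_add_yblock_sse2;
 *         }
 *     }
 */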
 