Revision ce611a27 libavcodec/i386/snowdsp_mmx.c

View differences:

libavcodec/i386/snowdsp_mmx.c
111 111

  
112 112
        i = 0;
113 113
        asm volatile(
114
            "pcmpeqd    %%xmm7, %%xmm7        \n\t"
115
            "psrad         $29, %%xmm7        \n\t"
114
            "pslld          $1, %%xmm7        \n\t"
116 115
        ::);
117 116
        for(; i<w_l-7; i+=8){
118 117
            asm volatile(
......
157 156
                "movdqu 20(%1), %%xmm6        \n\t"
158 157
                "paddd    (%1), %%xmm2        \n\t"
159 158
                "paddd  16(%1), %%xmm6        \n\t"
160
                "movdqa %%xmm2, %%xmm0        \n\t"
161
                "movdqa %%xmm6, %%xmm4        \n\t"
162
                "pslld      $2, %%xmm2        \n\t"
163
                "pslld      $2, %%xmm6        \n\t"
164
                "psubd  %%xmm2, %%xmm0        \n\t"
165
                "psubd  %%xmm6, %%xmm4        \n\t"
166
                "psrad      $1, %%xmm0        \n\t"
167
                "psrad      $1, %%xmm4        \n\t"
168
                "movdqu   (%0), %%xmm2        \n\t"
169
                "movdqu 16(%0), %%xmm6        \n\t"
170
                "psubd  %%xmm0, %%xmm2        \n\t"
171
                "psubd  %%xmm4, %%xmm6        \n\t"
159
                "movdqu   (%0), %%xmm0        \n\t"
160
                "movdqu 16(%0), %%xmm4        \n\t"
161
                "paddd  %%xmm2, %%xmm0        \n\t"
162
                "paddd  %%xmm6, %%xmm4        \n\t"
163
                "psrad      $1, %%xmm2        \n\t"
164
                "psrad      $1, %%xmm6        \n\t"
165
                "paddd  %%xmm0, %%xmm2        \n\t"
166
                "paddd  %%xmm4, %%xmm6        \n\t"
172 167
                "movdqa %%xmm2, (%2)          \n\t"
173 168
                "movdqa %%xmm6, 16(%2)        \n\t"
174 169
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
175 170
                 : "memory"
176 171
               );
177 172
        }
178
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
173
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
179 174
    }
180 175

  
181 176
    {
......
291 286
        DWTELEM * const ref = b+w2 - 1;
292 287

  
293 288
        i = 1;
294
        b[0] = b[0] + (((2 * ref[1] + W_BO-1) + 4 * b[0]) >> W_BS);
289
        b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
295 290
        asm volatile(
296
            "pcmpeqd     %%mm7, %%mm7        \n\t"
297
            "psrld         $29, %%mm7        \n\t"
291
            "pslld          $1, %%mm7        \n\t"
298 292
           ::);
299 293
        for(; i<w_l-3; i+=4){
300 294
            asm volatile(
......
333 327
                "movq   12(%1), %%mm6        \n\t"
334 328
                "paddd    (%1), %%mm2        \n\t"
335 329
                "paddd   8(%1), %%mm6        \n\t"
336
                "pxor    %%mm0, %%mm0        \n\t" //note: the 2 xor could be avoided if we would flip the rounding direction
337
                "pxor    %%mm4, %%mm4        \n\t"
338
                "psubd   %%mm2, %%mm0        \n\t"
339
                "psubd   %%mm6, %%mm4        \n\t"
340
                "psrad      $1, %%mm0        \n\t"
341
                "psrad      $1, %%mm4        \n\t"
342
                "psubd   %%mm0, %%mm2        \n\t"
343
                "psubd   %%mm4, %%mm6        \n\t"
344 330
                "movq     (%0), %%mm0        \n\t"
345 331
                "movq    8(%0), %%mm4        \n\t"
332
                "paddd   %%mm2, %%mm0        \n\t"
333
                "paddd   %%mm6, %%mm4        \n\t"
334
                "psrad      $1, %%mm2        \n\t"
335
                "psrad      $1, %%mm6        \n\t"
346 336
                "paddd   %%mm0, %%mm2        \n\t"
347 337
                "paddd   %%mm4, %%mm6        \n\t"
348 338
                "movq    %%mm2, (%2)         \n\t"
......
351 341
                 : "memory"
352 342
               );
353 343
        }
354
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
344
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
355 345
    }
356 346

  
357 347
    {

Also available in: Unified diff