Revision ce611a27

View differences:

libavcodec/i386/snowdsp_mmx.c
111 111

  
112 112
        i = 0;
113 113
        asm volatile(
114
            "pcmpeqd    %%xmm7, %%xmm7        \n\t"
115
            "psrad         $29, %%xmm7        \n\t"
114
            "pslld          $1, %%xmm7        \n\t"
116 115
        ::);
117 116
        for(; i<w_l-7; i+=8){
118 117
            asm volatile(
......
157 156
                "movdqu 20(%1), %%xmm6        \n\t"
158 157
                "paddd    (%1), %%xmm2        \n\t"
159 158
                "paddd  16(%1), %%xmm6        \n\t"
160
                "movdqa %%xmm2, %%xmm0        \n\t"
161
                "movdqa %%xmm6, %%xmm4        \n\t"
162
                "pslld      $2, %%xmm2        \n\t"
163
                "pslld      $2, %%xmm6        \n\t"
164
                "psubd  %%xmm2, %%xmm0        \n\t"
165
                "psubd  %%xmm6, %%xmm4        \n\t"
166
                "psrad      $1, %%xmm0        \n\t"
167
                "psrad      $1, %%xmm4        \n\t"
168
                "movdqu   (%0), %%xmm2        \n\t"
169
                "movdqu 16(%0), %%xmm6        \n\t"
170
                "psubd  %%xmm0, %%xmm2        \n\t"
171
                "psubd  %%xmm4, %%xmm6        \n\t"
159
                "movdqu   (%0), %%xmm0        \n\t"
160
                "movdqu 16(%0), %%xmm4        \n\t"
161
                "paddd  %%xmm2, %%xmm0        \n\t"
162
                "paddd  %%xmm6, %%xmm4        \n\t"
163
                "psrad      $1, %%xmm2        \n\t"
164
                "psrad      $1, %%xmm6        \n\t"
165
                "paddd  %%xmm0, %%xmm2        \n\t"
166
                "paddd  %%xmm4, %%xmm6        \n\t"
172 167
                "movdqa %%xmm2, (%2)          \n\t"
173 168
                "movdqa %%xmm6, 16(%2)        \n\t"
174 169
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
175 170
                 : "memory"
176 171
               );
177 172
        }
178
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
173
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
179 174
    }
180 175

  
181 176
    {
......
291 286
        DWTELEM * const ref = b+w2 - 1;
292 287

  
293 288
        i = 1;
294
        b[0] = b[0] + (((2 * ref[1] + W_BO-1) + 4 * b[0]) >> W_BS);
289
        b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
295 290
        asm volatile(
296
            "pcmpeqd     %%mm7, %%mm7        \n\t"
297
            "psrld         $29, %%mm7        \n\t"
291
            "pslld          $1, %%mm7        \n\t"
298 292
           ::);
299 293
        for(; i<w_l-3; i+=4){
300 294
            asm volatile(
......
333 327
                "movq   12(%1), %%mm6        \n\t"
334 328
                "paddd    (%1), %%mm2        \n\t"
335 329
                "paddd   8(%1), %%mm6        \n\t"
336
                "pxor    %%mm0, %%mm0        \n\t" //note: the 2 xor could be avoided if we would flip the rounding direction
337
                "pxor    %%mm4, %%mm4        \n\t"
338
                "psubd   %%mm2, %%mm0        \n\t"
339
                "psubd   %%mm6, %%mm4        \n\t"
340
                "psrad      $1, %%mm0        \n\t"
341
                "psrad      $1, %%mm4        \n\t"
342
                "psubd   %%mm0, %%mm2        \n\t"
343
                "psubd   %%mm4, %%mm6        \n\t"
344 330
                "movq     (%0), %%mm0        \n\t"
345 331
                "movq    8(%0), %%mm4        \n\t"
332
                "paddd   %%mm2, %%mm0        \n\t"
333
                "paddd   %%mm6, %%mm4        \n\t"
334
                "psrad      $1, %%mm2        \n\t"
335
                "psrad      $1, %%mm6        \n\t"
346 336
                "paddd   %%mm0, %%mm2        \n\t"
347 337
                "paddd   %%mm4, %%mm6        \n\t"
348 338
                "movq    %%mm2, (%2)         \n\t"
......
351 341
                 : "memory"
352 342
               );
353 343
        }
354
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
344
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
355 345
    }
356 346

  
357 347
    {
libavcodec/snow.c
775 775
    int i;
776 776

  
777 777
    assert(shift == 4);
778
#define LIFTS(src, ref, inv) ((inv) ? (src) + (((ref) + 4*(src))>>shift): (16*4*(src) + 4*(ref) + 8 + (5<<27))/(5*16) - (1<<23))
778
#define LIFTS(src, ref, inv) ((inv) ? (src) + (((ref) + 4*(src))>>shift): -((-16*4*(src) + 4*(ref) + add + 5 + (5<<27))/(5*16) - (1<<23)))
779 779
    if(mirror_left){
780 780
        dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
781 781
        dst += dst_step;
......
1113 1113
    DWTELEM temp[width];
1114 1114
    const int w2= (width+1)>>1;
1115 1115

  
1116
    lift (temp+w2, b    +1, b      , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0);
1117
    liftS(temp   , b      , temp+w2, 1, 2, 1, width, -W_BM, W_BO, W_BS, 0, 0);
1116
    lift (temp+w2, b    +1, b      , 1, 2, 2, width,  W_AM, W_AO, W_AS, 1, 1);
1117
    liftS(temp   , b      , temp+w2, 1, 2, 1, width,  W_BM, W_BO, W_BS, 0, 0);
1118 1118
    lift5(b   +w2, temp+w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 0);
1119 1119
    lift (b      , temp   , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 0);
1120 1120
}
......
1150 1150
#ifdef liftS
1151 1151
        b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1152 1152
#else
1153
        b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + 8*5 + (5<<27)) / (5*16) - (1<<23);
1153
        b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + W_BO*5 + (5<<27)) / (5*16) - (1<<23);
1154 1154
#endif
1155 1155
    }
1156 1156
}
......
1344 1344

  
1345 1345
    lift (temp   , b      , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
1346 1346
    lift5(temp+w2, b   +w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
1347
    liftS(b      , temp   , temp+w2, 2, 1, 1, width,  W_BM, W_BO-1, W_BS, 0, 1);
1348
    lift (b+1    , temp+w2, b      , 2, 1, 2, width, -W_AM, W_AO, W_AS, 1, 1);
1347
    liftS(b      , temp   , temp+w2, 2, 1, 1, width,  W_BM, W_BO, W_BS, 0, 1);
1348
    lift (b+1    , temp+w2, b      , 2, 1, 2, width,  W_AM, W_AO, W_AS, 1, 0);
1349 1349
}
1350 1350

  
1351 1351
static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
libavcodec/snow.h
165 165

  
166 166
static av_always_inline void snow_horizontal_compose_liftS_lead_out(int i, DWTELEM * dst, DWTELEM * src, DWTELEM * ref, int width, int w){
167 167
        for(; i<w; i++){
168
            dst[i] = src[i] + ((ref[i] + ref[(i+1)]+W_BO-1 + 4 * src[i]) >> W_BS);
168
            dst[i] = src[i] + ((ref[i] + ref[(i+1)]+W_BO + 4 * src[i]) >> W_BS);
169 169
        }
170 170

  
171 171
        if(width&1){
172
            dst[w] = src[w] + ((2 * ref[w] + W_BO-1 + 4 * src[w]) >> W_BS);
172
            dst[w] = src[w] + ((2 * ref[w] + W_BO + 4 * src[w]) >> W_BS);
173 173
        }
174 174
}
175 175

  
tests/ffmpeg.regression.ref
141 141
2654678 ./tests/data/a-ffv1.avi
142 142
799d3db687f6cdd7a837ec156efc171f *./tests/data/out.yuv
143 143
stddev:  0.00 PSNR:99.99 bytes:7602176
144
9078723c943de5d79490f54b99e6ea9e *./tests/data/a-snow.avi
145
156656 ./tests/data/a-snow.avi
146
f2932084b52e2ede167c9ba21eae0656 *./tests/data/out.yuv
144
958d649d09b7361d5f00b5b3fcccbcd2 *./tests/data/a-snow.avi
145
156606 ./tests/data/a-snow.avi
146
b19cb7f9134f922326028c6bb44e96de *./tests/data/out.yuv
147 147
stddev: 23.14 PSNR:20.83 bytes:7602176
148 148
ba999e86070aa971376e7f317a022c37 *./tests/data/a-snow53.avi
149 149
3519486 ./tests/data/a-snow53.avi
tests/rotozoom.regression.ref
141 141
3525804 ./tests/data/a-ffv1.avi
142 142
dde5895817ad9d219f79a52d0bdfb001 *./tests/data/out.yuv
143 143
stddev:  0.00 PSNR:99.99 bytes:7602176
144
40a6e938ac2bd92ee12cd57925e86454 *./tests/data/a-snow.avi
145
68758 ./tests/data/a-snow.avi
146
1e356854142898c7c4aab4bfedadf235 *./tests/data/out.yuv
144
2cfa1bdb443d04a890208a83fd239461 *./tests/data/a-snow.avi
145
68872 ./tests/data/a-snow.avi
146
64a0495b7ab53509d3b791465262795c *./tests/data/out.yuv
147 147
stddev: 10.86 PSNR:27.40 bytes:7602176
148 148
3d0da6aeec9b80c6ee0ff4b747bdd0f0 *./tests/data/a-snow53.avi
149 149
2721980 ./tests/data/a-snow53.avi
tests/seek.regression.ref
2046 2046
ret: 0 st: 0 dts:0.040000 pts:0.040000 pos:9610 size:1075 flags:0
2047 2047
----------------
2048 2048
tests/data/a-snow.avi
2049
ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2986 flags:1
2049
ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2987 flags:1
2050 2050
ret: 0 st:-1 ts:-1.000000 flags:0
2051
ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2986 flags:1
2051
ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2987 flags:1
2052 2052
ret: 0 st:-1 ts:1.894167 flags:1
2053
ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46794 size:3663 flags:1
2053
ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46908 size:3663 flags:1
2054 2054
ret: 0 st: 0 ts:0.800000 flags:0
2055
ret: 0 st: 0 dts:0.960000 pts:0.960000 pos:31726 size:3478 flags:1
2055
ret: 0 st: 0 dts:0.960000 pts:0.960000 pos:31690 size:3478 flags:1
2056 2056
ret:-1 st: 0 ts:-0.320000 flags:1
2057 2057
ret:-1 st:-1 ts:2.576668 flags:0
2058 2058
ret: 0 st:-1 ts:1.470835 flags:1
2059
ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46794 size:3663 flags:1
2059
ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46908 size:3663 flags:1
2060 2060
ret: 0 st: 0 ts:0.360000 flags:0
2061
ret: 0 st: 0 dts:0.480000 pts:0.480000 pos:18006 size:3229 flags:1
2061
ret: 0 st: 0 dts:0.480000 pts:0.480000 pos:17990 size:3229 flags:1
2062 2062
ret:-1 st: 0 ts:-0.760000 flags:1
2063 2063
ret:-1 st:-1 ts:2.153336 flags:0
2064 2064
ret: 0 st:-1 ts:1.047503 flags:1
2065
ret: 0 st: 0 dts:0.960000 pts:0.960000 pos:31726 size:3478 flags:1
2065
ret: 0 st: 0 dts:0.960000 pts:0.960000 pos:31690 size:3478 flags:1
2066 2066
ret: 0 st: 0 ts:-0.040000 flags:0
2067
ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2986 flags:1
2067
ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2987 flags:1
2068 2068
ret: 0 st: 0 ts:2.840000 flags:1
2069
ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63240 size:3635 flags:1
2069
ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63350 size:3635 flags:1
2070 2070
ret: 0 st:-1 ts:1.730004 flags:0
2071
ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63240 size:3635 flags:1
2071
ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63350 size:3635 flags:1
2072 2072
ret: 0 st:-1 ts:0.624171 flags:1
2073
ret: 0 st: 0 dts:0.480000 pts:0.480000 pos:18006 size:3229 flags:1
2073
ret: 0 st: 0 dts:0.480000 pts:0.480000 pos:17990 size:3229 flags:1
2074 2074
ret: 0 st: 0 ts:-0.480000 flags:0
2075
ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2986 flags:1
2075
ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2987 flags:1
2076 2076
ret: 0 st: 0 ts:2.400000 flags:1
2077
ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63240 size:3635 flags:1
2077
ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63350 size:3635 flags:1
2078 2078
ret: 0 st:-1 ts:1.306672 flags:0
2079
ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46794 size:3663 flags:1
2079
ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46908 size:3663 flags:1
2080 2080
ret: 0 st:-1 ts:0.200839 flags:1
2081
ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2986 flags:1
2081
ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2987 flags:1
2082 2082
ret: 0 st: 0 ts:-0.920000 flags:0
2083
ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2986 flags:1
2083
ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2987 flags:1
2084 2084
ret: 0 st: 0 ts:2.000000 flags:1
2085
ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63240 size:3635 flags:1
2085
ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63350 size:3635 flags:1
2086 2086
ret: 0 st:-1 ts:0.883340 flags:0
2087
ret: 0 st: 0 dts:0.960000 pts:0.960000 pos:31726 size:3478 flags:1
2087
ret: 0 st: 0 dts:0.960000 pts:0.960000 pos:31690 size:3478 flags:1
2088 2088
ret:-1 st:-1 ts:-0.222493 flags:1
2089 2089
ret:-1 st: 0 ts:2.680000 flags:0
2090 2090
ret: 0 st: 0 ts:1.560000 flags:1
2091
ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46794 size:3663 flags:1
2091
ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46908 size:3663 flags:1
2092 2092
ret: 0 st:-1 ts:0.460008 flags:0
2093
ret: 0 st: 0 dts:0.480000 pts:0.480000 pos:18006 size:3229 flags:1
2093
ret: 0 st: 0 dts:0.480000 pts:0.480000 pos:17990 size:3229 flags:1
2094 2094
ret:-1 st:-1 ts:-0.645825 flags:1
2095 2095
----------------
2096 2096
tests/data/a-snow53.avi

Also available in: Unified diff