/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
24

    
25
/**
 * @file
 * DSP utils
 */

#include "dsputil.h"
/* draw the edges of width 'w' of an image of size width, height */
33
//FIXME check that this is ok for mpeg4 interlaced
34
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w, int sides)
35
{
36
    uint8_t *ptr, *last_line;
37
    int i;
38

    
39
    /* left and right */
40
    ptr = buf;
41
    for(i=0;i<height;i++) {
42
        memset(ptr - w, ptr[0], w);
43
        memset(ptr + width, ptr[width-1], w);
44
        ptr += wrap;
45
    }
46

    
47
    /* top and bottom + corners */
48
    buf -= w;
49
    last_line = buf + (height - 1) * wrap;
50
    if (sides & EDGE_TOP)
51
        for(i = 0; i < w; i++)
52
            memcpy(buf - (i + 1) * wrap, buf, width + w + w); // top
53
    if (sides & EDGE_BOTTOM)
54
        for (i = 0; i < w; i++)
55
            memcpy(last_line + (i + 1) * wrap, last_line, width + w + w); // bottom
56
}
57

    
58
/**
 * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    /* Clamp src_y so that at least one source row overlaps the block: if
       the block lies entirely below the picture, move the read pointer
       (and src_y) to the last row; entirely above, to the first row that
       still touches the block. */
    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    /* Same clamping horizontally. */
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    /* [start_x,end_x) x [start_y,end_y) is the part of the block that is
       covered by real source samples; everything else gets replicated. */
    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);
    /* the clamping above guarantees a non-empty overlap */
    assert(start_y < end_y && block_h);
    assert(start_x < end_x && block_w);

    w    = end_x - start_x; /* from here on, w is the width of the copied stripe */
    src += start_y*linesize + start_x;
    buf += start_x;

    /* top: replicate the first available source row upwards */
    for(y=0; y<start_y; y++){
        memcpy(buf, src, w);
        buf += linesize;
    }

    // copy existing part
    for(; y<end_y; y++){
        memcpy(buf, src, w);
        src += linesize;
        buf += linesize;
    }

    /* bottom: replicate the last copied source row downwards */
    src -= linesize;
    for(; y<block_h; y++){
        memcpy(buf, src, w);
        buf += linesize;
    }

    /* Second pass over the whole block: replicate the leftmost/rightmost
       copied column sideways into the uncovered left/right margins. */
    buf -= block_h * linesize + start_x;
    while (block_h--){
       //left
        for(x=0; x<start_x; x++){
            buf[x] = buf[start_x];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x] = buf[end_x - 1];
        }
        buf += linesize;
    }
}
135

    
136
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    /* Add an 8x8 block of coefficients to the destination pixels.
       uint8_t addition wraps modulo 256, exactly as the unrolled
       original did. */
    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 8;
    }
}
152

    
153
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    /* Add a 4x4 block of coefficients to the destination pixels.
       uint8_t addition wraps modulo 256, exactly as the unrolled
       original did. */
    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 4;
    }
}
165

    
166
/*
 * Disabled 64-bit variant of PIXOP2 that processes 8 pixels per uint64_t.
 * NOTE(review): the CALL_2X_PIXELS lines below reference OPNAME ## _pixels_c
 * while the corresponding function is defined as OPNAME ## _pixels (no _c
 * suffix), so this branch would not compile as-is.  No comments are placed
 * inside the macro body because every line ends in a continuation backslash.
 */
#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
309

    
310
/*
 * 32-bit PIXOP2: for a given OPNAME (e.g. put/avg) and OP (store/average
 * statement), expands whole-pel, half-pel (x2/y2) and quarter-pel (xy2)
 * copy/averaging kernels for block widths 2/4/8 and, via CALL_2X_PIXELS,
 * 16.  The *_l2/_l4 helpers blend two/four sources; the *_xy2 kernels use
 * a SWAR trick, splitting each byte into its low 2 bits (0x03 mask) and
 * high 6 bits (0xFC mask) so four neighbours can be summed without carries
 * crossing byte lanes.  "no_rnd" variants bias rounding downwards (0x01
 * instead of 0x02 per byte, no_rnd_avg32 instead of rnd_avg32).  No
 * comments are placed inside the macro body because every line ends in a
 * continuation backslash.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

/* avg stores the byte-wise average of the new value with the existing
   destination, using the rnd_avg32() helper. */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
678

    
679
/* put simply stores the computed value */
#define op_put(a, b) a = b

/* Instantiate the averaging and storing families of all pel kernels. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* For whole-pel copies there is no rounding, so the "no_rnd" put variants
   are simple aliases of the normal ones. */
#define put_no_rnd_pixels8_c  put_pixels8_c
#define put_no_rnd_pixels16_c put_pixels16_c
688

    
689
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    /* Convenience wrapper: both sources and the destination share one stride. */
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
692

    
693
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    /* Convenience wrapper: both sources and the destination share one stride. */
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
696

    
697
#define H264_CHROMA_MC(OPNAME, OP)\
698
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
699
    const int A=(8-x)*(8-y);\
700
    const int B=(  x)*(8-y);\
701
    const int C=(8-x)*(  y);\
702
    const int D=(  x)*(  y);\
703
    int i;\
704
    \
705
    assert(x<8 && y<8 && x>=0 && y>=0);\
706
\
707
    if(D){\
708
        for(i=0; i<h; i++){\
709
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
710
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
711
            dst+= stride;\
712
            src+= stride;\
713
        }\
714
    }else{\
715
        const int E= B+C;\
716
        const int step= C ? stride : 1;\
717
        for(i=0; i<h; i++){\
718
            OP(dst[0], (A*src[0] + E*src[step+0]));\
719
            OP(dst[1], (A*src[1] + E*src[step+1]));\
720
            dst+= stride;\
721
            src+= stride;\
722
        }\
723
    }\
724
}\
725
\
726
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
727
    const int A=(8-x)*(8-y);\
728
    const int B=(  x)*(8-y);\
729
    const int C=(8-x)*(  y);\
730
    const int D=(  x)*(  y);\
731
    int i;\
732
    \
733
    assert(x<8 && y<8 && x>=0 && y>=0);\
734
\
735
    if(D){\
736
        for(i=0; i<h; i++){\
737
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
738
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
739
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
740
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
741
            dst+= stride;\
742
            src+= stride;\
743
        }\
744
    }else{\
745
        const int E= B+C;\
746
        const int step= C ? stride : 1;\
747
        for(i=0; i<h; i++){\
748
            OP(dst[0], (A*src[0] + E*src[step+0]));\
749
            OP(dst[1], (A*src[1] + E*src[step+1]));\
750
            OP(dst[2], (A*src[2] + E*src[step+2]));\
751
            OP(dst[3], (A*src[3] + E*src[step+3]));\
752
            dst+= stride;\
753
            src+= stride;\
754
        }\
755
    }\
756
}\
757
\
758
/* 8-pixel-wide variant of the H.264 chroma bilinear filter; identical     */
/* structure to the mc4 version above, unrolled for 8 output pixels.       */
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        /* general case: full 2x2 bilinear blend, weights sum to 64 */\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        /* x==0 or y==0: 2-tap filter; step selects the filter direction */\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
797

    
798
/* Chroma rounding ops: the bilinear weights sum to 64, so "+32 >> 6" is   */
/* round-to-nearest.  op_put stores the result; op_avg additionally        */
/* round-averages it with the value already in dst.                        */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

/* Instantiate put_/avg_ h264_chroma_mc{2,4,8}_c from the macro above. */
H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
805

    
806
#if 1
/* H264_LOWPASS generates the H.264 luma half-pel interpolators for one
 * store mode (OP = single-pass store/average, OP2 = two-pass variant):
 * the 6-tap FIR (1,-5,20,20,-5,1) applied horizontally (_h), vertically
 * (_v), or both via a 16-bit intermediate buffer (_hv), for block widths
 * 2, 4, 8 and 16.  "cm" is the ff_cropTbl-based clip table consumed by
 * the OP/OP2 macros (see op_put/op2_put below).  NOTE: only block
 * comments are used inside the macro -- a // comment would swallow the
 * rest of the spliced macro body. */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        /* reads 2 rows above and 3 below the block for the 6-tap kernel */\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    /* pass 1: horizontal filter into tmp, incl. 5 extra rows of context */\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    /* pass 2: vertical filter over tmp; OP2 scales down the double pass */\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
/* 16-wide versions: four 8x8 quadrants processed with the 8-pel filters */\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\
1070

    
1071
/* H264_MC expands the 16 quarter-pel luma MC entry points (mcXY, where X/Y
 * are the horizontal/vertical quarter-pel offsets) for one block SIZE.
 * Half-pel positions run a lowpass filter directly; quarter-pel positions
 * average two neighbouring full/half-pel planes with pixels##SIZE##_l2.
 * "full" buffers hold a copy of the source with 2 rows of context above
 * and 3 below (SIZE+5 rows) as required by the 6-tap vertical filter. */
#define H264_MC(OPNAME, SIZE) \
static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    /* (0,0): plain copy/average, no filtering needed */\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    /* (1,0): average of full-pel and horizontal half-pel */\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    /* (2,0): horizontal half-pel */\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    /* (3,0): like mc10 but averaged with the next full-pel column */\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    /* (0,1): average of full-pel and vertical half-pel */\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    /* (0,2): vertical half-pel */\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    /* (0,3): like mc01 but averaged with the next full-pel row */\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    /* diagonal quarter-pels (x1/x3,y1/y3): average of a horizontal and a   */\
    /* vertical half-pel plane; the +1/+stride offsets pick the corner.     */\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    /* (2,2): centre position, single two-pass hv filter */\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    /* (2,1)/(2,3): average of horizontal half-pel and centre hv plane */\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    /* (1,2)/(3,2): average of vertical half-pel and centre hv plane */\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
1207

    
1208
/* Luma rounding/clipping ops used by H264_LOWPASS:
 *  - single-pass taps (1,-5,20,20,-5,1) sum to 32  -> "+16 >> 5",
 *    clipped to pixel range via the ff_cropTbl lookup "cm";
 *  - two-pass (hv) result carries a 32*32 gain      -> "+512 >> 10".
 * The avg variants additionally round-average with the value in dst. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the filters, then the qpel MC entry points for each size.
 * Note: only put_ is generated for size 2 (no avg_ 2x2 variant). */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
1229

    
1230
/* Full-pel (mc00) positions need no filtering (H264_MC expands them to a
 * plain pixel copy/average), so alias them to the fixed-size helpers. */
#define put_h264_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_h264_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c
1234

    
1235
/* Copy an 8x8 pixel block: fixed-size exported wrapper around put_pixels8_c. */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
1238
/* Average an 8x8 pixel block into dst: fixed-size wrapper around avg_pixels8_c. */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
1241
/* Copy a 16x16 pixel block: fixed-size exported wrapper around put_pixels16_c. */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
1244
/* Average a 16x16 pixel block into dst: fixed-size wrapper around avg_pixels16_c. */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
1247

    
1248
/**
 * Zero one 8x8 block of transform coefficients (64 DCTELEMs).
 */
static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, 64 * sizeof(*block));
}
1252

    
1253
/**
1254
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
1255
 */
1256
static void clear_blocks_c(DCTELEM *blocks)
1257
{
1258
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
1259
}