ffmpeg / libavcodec / dsputil.c @ 31304587

/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file dsputil.c
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "mpegvideo.h"
#include "simple_idct.h"
#include "faandct.h"
#include "h263.h"
#include "snow.h"

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* flacenc.c */
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

/* pngdec.c */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

static const unsigned long pb_7f = 0x7f7f7f7f7f7f7f7fUL;
static const unsigned long pb_80 = 0x8080808080808080UL;

const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
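
/* Reading the table: position i of the zigzag scan stores the raster-order
 * index ff_zigzag_direct[i] within the 8x8 block.  Illustrative sketch of
 * de-zigzagging a coefficient run (not a function from this file):
 *
 *     for (i = 0; i <= last; i++)
 *         block[ff_zigzag_direct[i]] = level[i];
 */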

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permuted inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };

const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
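
/* Illustrative use of the table above (a sketch, under the stated constraints
 * 0<=a<=65536 and 2<=b<=255, with a 64-bit intermediate assumed):
 *
 *     unsigned q = ((uint64_t)a * ff_inverse[b]) >> 32;   // q == a/b
 *
 * This turns division by a loop-invariant b into one multiply and a shift.
 */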

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

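/* Sum all 256 pixel values of a 16x16 block read with the given line stride. */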
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

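/* Sum of squared pixel values over a 16x16 block.  sq points 256 entries into
 * ff_squareTbl so that the same table can also square signed differences. */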
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

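/* Byte-swap an array of w 32-bit words; the main loop is unrolled by eight,
 * with a scalar loop picking up the remainder. */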
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}

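/* Sum of squared errors between two blocks, 4 pixels wide (8- and 16-wide
 * variants follow).  The unused void *v matches the common me_cmp function
 * pointer signature; sq is offset so negative differences index correctly. */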
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}


#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
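/* Wavelet distortion metric: transform the pixel difference into the DWT
 * domain (9/7 or 5/3, selected by type) and sum the absolute subband
 * coefficients with per-subband weights, approximating perceptual error. */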
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif

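/* Copy an 8x8 block of unsigned pixels into the DCTELEM layout used as DCT
 * input; diff_pixels_c below does the same for the difference of two blocks. */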
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}


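/* Store an 8x8 IDCT output block, clamping each value to 0..255 through the
 * ff_cropTbl lookup (cm points at the table's zero offset).  The 4- and
 * 2-wide variants and the add_pixels_clamped* accumulate versions follow. */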
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

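/* Sum of absolute values of all 64 transform coefficients; a cheap measure of
 * block energy used by the encoder. */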
static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

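/* PIXOP2 expands into the whole family of put/avg pixel copy and half-pel
 * interpolation primitives.  The packed-average identities used throughout,
 * applied per byte without unpacking:
 *
 *     rounded:   (a+b+1)>>1 == (a|b) - (((a^b)&0xFEFEFEFE)>>1)
 *     truncated: (a+b)>>1   == (a&b) + (((a^b)&0xFEFEFEFE)>>1)
 *
 * Masking with 0xFE keeps the shifted bit from crossing byte lanes. */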
#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

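/* MPEG-4 GMC, single motion point: plain bilinear interpolation with 1/16-pel
 * weights.  A+B+C+D == 256, hence the >>8 after adding the rounder. */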
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}

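/* General (affine) global motion compensation: dxx/dxy/dyx/dyy are the
 * fixed-point derivatives of the source coordinate along x and y.  Pixels
 * whose source falls outside the picture are clamped to the border before
 * bilinear interpolation. */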
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

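/* The *_tpel_* functions below implement SVQ3 third-pel interpolation.  The
 * constants 683 and 2731 are round(2^11/3) and round(2^15/12): multiplying a
 * weighted sum whose weights total 3 (resp. 12) and shifting right by 11
 * (resp. 15) performs the rounded division in pure integer arithmetic. */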
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

#define H264_CHROMA_MC(OPNAME, OP)\
1440
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1441
    const int A=(8-x)*(8-y);\
1442
    const int B=(  x)*(8-y);\
1443
    const int C=(8-x)*(  y);\
1444
    const int D=(  x)*(  y);\
1445
    int i;\
1446
    \
1447
    assert(x<8 && y<8 && x>=0 && y>=0);\
1448
\
1449
    if(D){\
1450
        for(i=0; i<h; i++){\
1451
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1452
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1453
            dst+= stride;\
1454
            src+= stride;\
1455
        }\
1456
    }else{\
1457
        const int E= B+C;\
1458
        const int step= C ? stride : 1;\
1459
        for(i=0; i<h; i++){\
1460
            OP(dst[0], (A*src[0] + E*src[step+0]));\
1461
            OP(dst[1], (A*src[1] + E*src[step+1]));\
1462
            dst+= stride;\
1463
            src+= stride;\
1464
        }\
1465
    }\
1466
}\
1467
\
1468
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1469
    const int A=(8-x)*(8-y);\
1470
    const int B=(  x)*(8-y);\
1471
    const int C=(8-x)*(  y);\
1472
    const int D=(  x)*(  y);\
1473
    int i;\
1474
    \
1475
    assert(x<8 && y<8 && x>=0 && y>=0);\
1476
\
1477
    if(D){\
1478
        for(i=0; i<h; i++){\
1479
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1480
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1481
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1482
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1483
            dst+= stride;\
1484
            src+= stride;\
1485
        }\
1486
    }else{\
1487
        const int E= B+C;\
1488
        const int step= C ? stride : 1;\
1489
        for(i=0; i<h; i++){\
1490
            OP(dst[0], (A*src[0] + E*src[step+0]));\
1491
            OP(dst[1], (A*src[1] + E*src[step+1]));\
1492
            OP(dst[2], (A*src[2] + E*src[step+2]));\
1493
            OP(dst[3], (A*src[3] + E*src[step+3]));\
1494
            dst+= stride;\
1495
            src+= stride;\
1496
        }\
1497
    }\
1498
}\
1499
\
1500
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1501
    const int A=(8-x)*(8-y);\
1502
    const int B=(  x)*(8-y);\
1503
    const int C=(8-x)*(  y);\
1504
    const int D=(  x)*(  y);\
1505
    int i;\
1506
    \
1507
    assert(x<8 && y<8 && x>=0 && y>=0);\
1508
\
1509
    if(D){\
1510
        for(i=0; i<h; i++){\
1511
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1512
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1513
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1514
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1515
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1516
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1517
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1518
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1519
            dst+= stride;\
1520
            src+= stride;\
1521
        }\
1522
    }else{\
1523
        const int E= B+C;\
1524
        const int step= C ? stride : 1;\
1525
        for(i=0; i<h; i++){\
1526
            OP(dst[0], (A*src[0] + E*src[step+0]));\
1527
            OP(dst[1], (A*src[1] + E*src[step+1]));\
1528
            OP(dst[2], (A*src[2] + E*src[step+2]));\
1529
            OP(dst[3], (A*src[3] + E*src[step+3]));\
1530
            OP(dst[4], (A*src[4] + E*src[step+4]));\
1531
            OP(dst[5], (A*src[5] + E*src[step+5]));\
1532
            OP(dst[6], (A*src[6] + E*src[step+6]));\
1533
            OP(dst[7], (A*src[7] + E*src[step+7]));\
1534
            dst+= stride;\
1535
            src+= stride;\
1536
        }\
1537
    }\
1538
}
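
/* In H264_CHROMA_MC the bilinear weights always sum to
 * A+B+C+D = (8-x)*(8-y) + x*(8-y) + (8-x)*y + x*y = 64, which is why the
 * OP macros below normalize with ((b) + 32) >> 6 (divide by 64, round to
 * nearest).  When D == 0 the motion vector is purely horizontal or purely
 * vertical and the 2D filter degenerates into a two-tap filter with
 * weights A and E = B+C, stepping by 1 or by stride respectively.
 */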
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put

static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}
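
/* The no_rnd variant biases the bilinear sum with 32 - 4 = 28 instead of
 * 32, i.e. it rounds slightly downward; this matches the "no rounding"
 * chroma mode used by the VC-1/WMV3 code, which only ever needs the
 * 8-pixel-wide case.
 */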

#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
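
/* QPEL_MC expands to the complete set of MPEG-4 quarter-pel functions.
 * The low-pass filter taps are (-1, 3, -6, 20, 20, -6, 3, -1), which sum
 * to 32, hence the ((b) + 16) >> 5 normalization in op_put below; the
 * _no_rnd_ variants add only 15 before the shift, giving the downward
 * bias MPEG-4 uses when its rounding control flag is set.  Near the block
 * borders the outermost taps are mirrored back onto samples inside the
 * block (e.g. src[8] appears twice in the dst[5] row) rather than reading
 * beyond the 8 (or 16) available samples.
 */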
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd

#if 1
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

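/* H264_LOWPASS implements the H.264 six-tap half-pel filter
 * (1, -5, 20, 20, -5, 1).  The taps sum to 32, so the 1D horizontal and
 * vertical passes are normalized with ((b) + 16) >> 5 (OP).  For the
 * center position the horizontal pass is stored unclipped in a 16-bit tmp
 * buffer and then filtered vertically, giving a combined gain of
 * 32*32 = 1024, hence the ((b) + 512) >> 10 normalization in OP2.
 */
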
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

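/* H264_MC names each generated function after its quarter-pel position:
 * in _mcXY_c, X is the horizontal and Y the vertical offset in quarter
 * samples.  mc00 is a plain copy, mc20/mc02 are pure half-pel filters,
 * mc22 is the 2D (center) filter, and the remaining positions average two
 * neighbouring full/half-pel planes with pixels ## SIZE ## _l2.
 */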
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif

#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}
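
/* H264_WEIGHT generates the explicit weighted-prediction kernels.  The
 * unidirectional form computes
 * clip(((pixel*weight + 2^(log2_denom-1)) >> log2_denom) + offset) by
 * pre-shifting offset left by log2_denom and folding the rounding term
 * into it.  The bidirectional form shifts by log2_denom+1; forcing the
 * bias odd with ((offset + 1) | 1) merges the +2^log2_denom rounding of
 * that shift with the shared offset, reproducing the (o0 + o1 + 1) >> 1
 * offset rounding of the standard (the caller is expected to pass the sum
 * of the two per-reference offsets).
 */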

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT

static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}
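
/* wmv2_mspel8_h_lowpass is the WMV2 four-tap half-pel filter
 * (-1, 9, 9, -1): the taps sum to 16 and the +8 bias rounds to nearest
 * before the >> 4.  The result can overshoot the 8-bit range, e.g. for
 * samples {0, 255, 255, 0} the center output is
 * (9*(255+255) - (0+0) + 8) >> 4 = 287, which is why every output goes
 * through the cm[] crop table.
 */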

#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */

#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2607

    
2608
void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2609

    
2610
/* H264 specific */
2611
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2612

    
2613
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2614
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2615
    int i;
2616

    
2617
    for(i=0; i<w; i++){
2618
        const int src_1= src[ -srcStride];
2619
        const int src0 = src[0          ];
2620
        const int src1 = src[  srcStride];
2621
        const int src2 = src[2*srcStride];
2622
        const int src3 = src[3*srcStride];
2623
        const int src4 = src[4*srcStride];
2624
        const int src5 = src[5*srcStride];
2625
        const int src6 = src[6*srcStride];
2626
        const int src7 = src[7*srcStride];
2627
        const int src8 = src[8*srcStride];
2628
        const int src9 = src[9*srcStride];
2629
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2630
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2631
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2632
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2633
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2634
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2635
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2636
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2637
        src++;
2638
        dst++;
2639
    }
2640
}
2641

    
2642
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}

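/* H.263 deblocking (Annex J): d measures the step across the block edge;
 * d1 is a strength-limited correction applied to the two samples nearest
 * the edge (clamped to 0..255 with the branchless ~(x>>31) trick), and d2
 * is a smaller correction applied to the outer sample pair. */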
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}

static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}

static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}

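/* H.264 in-loop deblocking, normal (bS < 4) filter. tc0 holds one clipping
 * threshold per 4-sample edge segment (negative = skip); alpha and beta
 * gate the filter on the local gradient so real image edges are preserved.
 * xstride steps across the edge and ystride along it, so the same code
 * serves both vertical and horizontal edges. */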
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}

static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}

static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}

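/* Sum-of-absolute-differences motion-estimation metrics. The _x2, _y2 and
 * _xy2 variants compare pix1 against the horizontally, vertically and
 * diagonally averaged (half-pel) version of pix2. */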
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

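/* Noise-preserving SSE: the squared error (score1) is biased by the
 * difference in local 2x2 texture between the two blocks (score2), scaled
 * by avctx->nsse_weight (8 when no context is available). */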
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

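/* Rate-distortion helpers for the encoder's coefficient refinement:
 * try_8x8basis() estimates the weighted squared error after adding a
 * scaled basis function to the residual; add_8x8basis() applies it. */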
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}

static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}

/**
 * Permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}

static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#ifdef CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}

/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}

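/* Byte-wise add/subtract over whole machine words (SWAR): the pb_7f/pb_80
 * masks stop per-byte carries from spilling into neighbouring bytes, so a
 * word's worth of bytes is processed per iteration without unpacking. */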
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}

static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#ifndef HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

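/* HuffYUV median prediction: each sample is predicted as the median of
 * left, top and (left + top - topleft), and the residual is stored. */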
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}

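/* SATD via an 8x8 Hadamard transform: BUTTERFLY* implement the add/sub
 * stages; hadamard8_diff applies the transform to the difference of two
 * blocks and sums the absolute coefficients, hadamard8_intra does the same
 * on a single block with the DC term (mean) subtracted. */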
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}

static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}

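/* DCT-domain comparison metrics: dct_sad sums the absolute coefficients of
 * the forward-transformed difference block, dct_max takes the largest one. */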
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}

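/* GPL-guarded variant: dct264_sad uses the H.264 High-profile style 8x8
 * integer transform (DCT8_1D) instead of the codec's own DCT. */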
#ifdef CONFIG_GPL
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif

static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}

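/* quant_psnr: squared error between the difference block and its
 * reconstruction after a forward DCT, quantization, dequantization and
 * IDCT round trip, i.e. the spatial distortion the current quantizer
 * introduces. */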
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}

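/* rd8x8: true rate-distortion cost - counts the VLC bits needed to code
 * the quantized block (rate) and adds the SSE of the reconstruction
 * (distortion), weighted by a lambda derived from the quantizer. */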
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}

static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}

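/* Vertical-activity metrics: vsad/vsse accumulate absolute (or squared)
 * differences between vertically adjacent lines, either within a single
 * block (_intra) or of the error signal between two blocks. */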
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x+=4){
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}

static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

#define SQ(a) ((a)*(a))
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x+=4){
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}

static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}

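/* Expand the 8x8 metrics to 16x16 by summing the four 8x8 quadrants. */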
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#ifdef CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)

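/* Scalar float vector primitives, used mainly by audio codecs for
 * windowing and overlap-add; architecture-specific init code may replace
 * them with SIMD versions. */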
static void vector_fmul_c(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}

static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}

void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int i;
    for(i=0; i<len; i++)
        dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
}

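/* Float -> int16 via the raw IEEE-754 bit pattern: the input is expected
 * to be pre-biased so that in-range samples land on bit patterns
 * 0x43c00000..0x43c0ffff, whose low 16 bits are the sample value; the
 * (tmp & 0xf0000) test catches values outside that window on either side
 * and saturates them. */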
void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++) {
        int_fast32_t tmp = ((const int32_t*)src)[i];
        if(tmp & 0xf0000){
            tmp = (0x43c0ffff - tmp)>>31;
            // is this faster on some gcc/cpu combinations?
//          if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//          else                 tmp = 0;
        }
        dst[i] = tmp - 0x8000;
    }
}

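/* WMV2 fixed-point 8x8 IDCT (rows, then columns). The W* constants are the
 * DCT basis cosines scaled by 2048; the row pass keeps 3 fractional bits
 * of precision that the column pass consumes in its final 14-bit rounding
 * shift. */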
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}
/* XXX: these functions should be removed as soon as all IDCTs are
   converted */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}

static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}

static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}

static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }

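/* ff_cropTbl clamps out-of-range intermediates to 0..255 (with MAX_NEG_CROP
 * headroom on each side) and ff_squareTbl caches (x-256)*(x-256) for the
 * squared-error metrics; both are filled once at startup below. */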
/* init static data */
void dsputil_static_init(void)
{
    int i;

    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}

int ff_check_alignment(void){
    static int did_fail=0;
    DECLARE_ALIGNED_16(int, aligned);

    if((long)&aligned & 15){
        if(!did_fail){
#if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}

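/* Fill the DSPContext with the C reference implementations.
 * Architecture-specific init functions may later override individual
 * pointers with optimized versions. */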
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

#ifdef CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    if (ENABLE_H264_DECODER) {
        c->h264_idct_add= ff_h264_idct_add_c;
        c->h264_idct8_add= ff_h264_idct8_add_c;
        c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
        c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
    }

    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* pix_abs[0] is for 16x16 blocks, pix_abs[1] for 8x8 blocks */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

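/* Half-pel put/avg tables: for each block size, slot 0 is the integer
 * position and slots 1-3 the x, y and xy half-pel positions. */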
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

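/* Third-pel motion compensation (used by SVQ3); the table is indexed by
 * mx + 4*my, which is why slots 3 and 7 are left unused. */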
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4112
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4113
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4114
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4115
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4116
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4117
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4118
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4119
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4120

    
4121
    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

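    /* quarter-pel MC: 16 entries per table indexed by dx + 4*dy, the mcXY
       suffix naming the horizontal (X) and vertical (Y) quarter-pel phase */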
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
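    /* H.264 chroma MC: bilinear interpolation with 1/8-pel precision for
       block widths 8, 4 and 2 */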
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;

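    /* H.264 weighted prediction, one entry per block size from 16x16 down
       to 2x2; weight scales a single reference, biweight blends two */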
    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;

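    /* codec-specific DSP init, compiled in only when the corresponding
       decoder/encoder was enabled at configure time */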
#ifdef CONFIG_CAVS_DECODER
    ff_cavsdsp_init(c,avctx);
#endif
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_vc1dsp_init(c,avctx);
#endif
#if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_intrax8dsp_init(c,avctx);
#endif
#if defined(CONFIG_H264_ENCODER)
    ff_h264dspenc_init(c,avctx);
#endif

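    /* WMV2 mspel MC: quarter-pel positions horizontally but only full-
       and half-pel vertically, hence the 8 entries */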
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

#define SET_CMP_FUNC(name) \