ffmpeg / libavcodec / dsputil.c @ 917f55cc

/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file dsputil.c
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "mpegvideo.h"
#include "simple_idct.h"
#include "faandct.h"
#include "h263.h"
#include "snow.h"

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* flacenc.c */
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

/* pngdec.c */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f, depending on the CPU's native word size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
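/* ~0UL/255 is the byte 0x01 replicated across an unsigned long (0x01010101
 * with a 32-bit long, 0x0101010101010101 with a 64-bit long), so multiplying
 * it by 0x7f or 0x80 fills every byte lane with that value. */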
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
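/* ff_zigzag_direct[n] is the raster-order index (row*8 + column) of the n-th
 * coefficient in zigzag scan order, starting from the DC coefficient. */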
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permuted inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };

const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
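/* Worked example: 100/3 == (100*ff_inverse[3])>>32 == (100*1431655766)>>32 == 33 */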
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

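/* Sum of the 256 pixels of a 16x16 block. */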
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

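/* Sum of the squares of the 256 pixels of a 16x16 block; the active variant
 * loads one machine word at a time and indexes ff_squareTbl per byte. */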
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}

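/* sse{4,8,16}_c: sum of squared differences between two blocks of width
 * 4/8/16 and height h; sq is biased by 256 so that negative byte differences
 * index the square table correctly. */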
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}


#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
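/* Wavelet distortion metric: loads the pix1-pix2 difference into a 32x32 int
 * buffer, runs a spatial DWT (ff_spatial_dwt from snow.c; the wavelet is
 * selected by 'type'), then returns the sum of the absolute subband
 * coefficients, each weighted by the per-subband scale[][][][] table below. */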
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}


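/* The *_pixels_clamped* functions below store (or add) a DCTELEM block to
 * pixels, clamping every value to 0..255 via the ff_cropTbl lookup; cm points
 * at the zero offset inside that table, so negative indices are valid. */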
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
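/* This variant works on 32 bits (two uint32_t loads per 8-pixel row) and
 * relies on the SWAR averaging identities used by rnd_avg32()/no_rnd_avg32():
 *   (a+b+1)>>1 == (a|b) - (((a^b)&0xFEFEFEFE)>>1)
 *   (a+b)>>1   == (a&b) + (((a^b)&0xFEFEFEFE)>>1)
 * where the 0xFE masks keep carries from crossing byte lanes. */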
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

    
1124
#define op_avg(a, b) a = rnd_avg32(a, b)
1125
#endif
1126
#define op_put(a, b) a = b
1127

    
1128
PIXOP2(avg, op_avg)
1129
PIXOP2(put, op_put)
1130
#undef op_avg
1131
#undef op_put
1132

    
1133
#define avg2(a,b) ((a+b+1)>>1)
1134
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1135

    
1136
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1137
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1138
}
1139

    
1140
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1141
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
1142
}
1143

    
1144
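/* gmc1: global motion compensation with a single vector; bilinear
 * interpolation of an 8xh block at 1/16-pel offsets x16,y16 (the four
 * weights A..D sum to 256, hence the >>8 after adding the rounder). */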
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}

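/* Full affine global motion compensation: the source position is stepped by
 * the (dxx,dxy,dyx,dyy) matrix in fixed point, each sample is interpolated
 * bilinearly, and positions outside the picture are clipped to the edges. */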
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

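/* Third-pel ("tpel") interpolation, e.g. for SVQ3: 683 ~= 2^11/3 and
 * 2731 ~= 2^15/12, so (683*(2*a + b + 1))>>11 approximates (2*a + b + 1)/3
 * and the 2731 variants approximate a /12 weighted average of the four
 * neighbours. */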
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1235
    int i,j;
1236
    for (i=0; i < height; i++) {
1237
      for (j=0; j < width; j++) {
1238
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1239
      }
1240
      src += stride;
1241
      dst += stride;
1242
    }
1243
}
1244

    
1245
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1246
    int i,j;
1247
    for (i=0; i < height; i++) {
1248
      for (j=0; j < width; j++) {
1249
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1250
      }
1251
      src += stride;
1252
      dst += stride;
1253
    }
1254
}
1255

    
1256
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1257
    int i,j;
1258
    for (i=0; i < height; i++) {
1259
      for (j=0; j < width; j++) {
1260
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1261
      }
1262
      src += stride;
1263
      dst += stride;
1264
    }
1265
}
1266

    
1267
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1268
    int i,j;
1269
    for (i=0; i < height; i++) {
1270
      for (j=0; j < width; j++) {
1271
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1272
      }
1273
      src += stride;
1274
      dst += stride;
1275
    }
1276
}
1277

    
1278
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1279
    int i,j;
1280
    for (i=0; i < height; i++) {
1281
      for (j=0; j < width; j++) {
1282
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1283
      }
1284
      src += stride;
1285
      dst += stride;
1286
    }
1287
}
1288

    
1289
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1290
    int i,j;
1291
    for (i=0; i < height; i++) {
1292
      for (j=0; j < width; j++) {
1293
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1294
      }
1295
      src += stride;
1296
      dst += stride;
1297
    }
1298
}
1299

    
1300
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1301
    int i,j;
1302
    for (i=0; i < height; i++) {
1303
      for (j=0; j < width; j++) {
1304
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1305
      }
1306
      src += stride;
1307
      dst += stride;
1308
    }
1309
}
1310

    
1311
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1312
    int i,j;
1313
    for (i=0; i < height; i++) {
1314
      for (j=0; j < width; j++) {
1315
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1316
      }
1317
      src += stride;
1318
      dst += stride;
1319
    }
1320
}
1321

    
1322
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1323
    switch(width){
1324
    case 2: avg_pixels2_c (dst, src, stride, height); break;
1325
    case 4: avg_pixels4_c (dst, src, stride, height); break;
1326
    case 8: avg_pixels8_c (dst, src, stride, height); break;
1327
    case 16:avg_pixels16_c(dst, src, stride, height); break;
1328
    }
1329
}
1330

    
1331
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1332
    int i,j;
1333
    for (i=0; i < height; i++) {
1334
      for (j=0; j < width; j++) {
1335
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
1336
      }
1337
      src += stride;
1338
      dst += stride;
1339
    }
1340
}
1341

    
1342
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1343
    int i,j;
1344
    for (i=0; i < height; i++) {
1345
      for (j=0; j < width; j++) {
1346
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
1347
      }
1348
      src += stride;
1349
      dst += stride;
1350
    }
1351
}
1352

    
1353
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

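/* Bilinear chroma interpolation with eighth-pel offsets x,y: the weights
 * A=(8-x)*(8-y), B=x*(8-y), C=(8-x)*y, D=x*y always sum to 64, so OP
 * normalizes with (... + 32) >> 6. When D is zero the 2-D filter
 * degenerates to a two-tap blend along one axis (or a plain copy),
 * which the else branch exploits via E=B+C and a row/column step. */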
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put

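/* No-rounding variant of the chroma filter above: the rounding constant
 * is 32-4 = 28 instead of 32, biasing the result slightly downward. */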
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}

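/* MPEG-4 quarter-pel: an 8-tap halfpel lowpass with coefficients
 * (-1, 3, -6, 20, 20, -6, 3, -1), summing to 32. The taps are mirrored
 * at the block edges (note src[8] reappearing near the right border)
 * rather than reading outside the 8 or 16 input samples; quarter-pel
 * positions are then built by averaging planes with pixels8/16_l2/_l4. */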
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}

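/* Normalization for the 32-weight filter sums above: op_put/op_avg round
 * with +16 before >>5, while the _no_rnd variants add only 15 and so
 * round downward, matching MPEG-4's no-rounding mode. */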
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd

#if 1
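/* H.264 luma interpolation: a 6-tap (1, -5, 20, 20, -5, 1) filter whose
 * weights sum to 32. Single-pass results use (b + 16) >> 5 (OP); the
 * _hv_ functions filter twice without intermediate rounding, keeping
 * 16-bit intermediates in tmp, so the second pass normalizes with
 * (b + 512) >> 10 (OP2). */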
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

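/* H264_MC instantiates all 16 quarter-pel positions for one block size;
 * _mcXY_c corresponds to a motion vector fraction of (X, Y) quarter
 * pixels. Full- and half-pel positions need a single lowpass pass (or a
 * plain copy); the remaining positions average two intermediate planes
 * with pixels*_l2. copy_block reads from src - stride*2 because the
 * vertical 6-tap filter needs extra rows of context above and below. */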
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif

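/* H.264 weighted prediction. weight_h264_pixels scales one reference:
 * the offset is pre-shifted by log2_denom and a rounding term added, then
 * each sample becomes (p*weight + offset) >> log2_denom. The biweight
 * variant blends two references, src*weights + dst*weightd, shifted by
 * log2_denom+1 since both contributions are summed; ((offset + 1) | 1)
 * forces an odd rounding term before the shift. */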
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT

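/* WMV2 "mspel" half-pel interpolation: a 4-tap (-1, 9, 9, -1) filter
 * normalized with (sum + 8) >> 4, applied horizontally here and
 * vertically in wmv2_mspel8_v_lowpass further down. */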
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}

#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2587
    put_pixels8_c(dst, src, stride, 8);
2588
}
2589
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2590
    avg_pixels8_c(dst, src, stride, 8);
2591
}
2592
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2593
    put_pixels16_c(dst, src, stride, 16);
2594
}
2595
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2596
    avg_pixels16_c(dst, src, stride, 16);
2597
}
2598
#endif /* CONFIG_CAVS_DECODER */
2599

    
2600
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2601
/* VC-1 specific */
2602
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2603

    
2604
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2605
    put_pixels8_c(dst, src, stride, 8);
2606
}
2607
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2608

    
2609
void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2610

    
2611
/* H264 specific */
2612
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2613

    
2614
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2615
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2616
    int i;
2617

    
2618
    for(i=0; i<w; i++){
2619
        const int src_1= src[ -srcStride];
2620
        const int src0 = src[0          ];
2621
        const int src1 = src[  srcStride];
2622
        const int src2 = src[2*srcStride];
2623
        const int src3 = src[3*srcStride];
2624
        const int src4 = src[4*srcStride];
2625
        const int src5 = src[5*srcStride];
2626
        const int src6 = src[6*srcStride];
2627
        const int src7 = src[7*srcStride];
2628
        const int src8 = src[8*srcStride];
2629
        const int src9 = src[9*srcStride];
2630
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2631
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2632
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2633
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2634
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2635
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2636
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2637
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2638
        src++;
2639
        dst++;
2640
    }
2641
}
2642

    
2643
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2644
    put_pixels8_c(dst, src, stride, 8);
2645
}
2646

    
2647
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2648
    uint8_t half[64];
2649
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2650
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2651
}
2652

    
2653
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2654
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2655
}
2656

    
2657
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2658
    uint8_t half[64];
2659
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2660
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2661
}
2662

    
2663
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2664
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2665
}
2666

    
2667
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2668
    uint8_t halfH[88];
2669
    uint8_t halfV[64];
2670
    uint8_t halfHV[64];
2671
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2672
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2673
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2674
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2675
}
2676
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2677
    uint8_t halfH[88];
2678
    uint8_t halfV[64];
2679
    uint8_t halfHV[64];
2680
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2681
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2682
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2683
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2684
}
2685
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2686
    uint8_t halfH[88];
2687
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2688
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2689
}
2690

    
2691
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2692
    if(ENABLE_ANY_H263) {
2693
    int x;
2694
    const int strength= ff_h263_loop_filter_strength[qscale];
2695

    
2696
    for(x=0; x<8; x++){
2697
        int d1, d2, ad1;
2698
        int p0= src[x-2*stride];
2699
        int p1= src[x-1*stride];
2700
        int p2= src[x+0*stride];
2701
        int p3= src[x+1*stride];
2702
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2703

    
2704
        if     (d<-2*strength) d1= 0;
2705
        else if(d<-  strength) d1=-2*strength - d;
2706
        else if(d<   strength) d1= d;
2707
        else if(d< 2*strength) d1= 2*strength - d;
2708
        else                   d1= 0;
2709

    
2710
        p1 += d1;
2711
        p2 -= d1;
2712
        if(p1&256) p1= ~(p1>>31);
2713
        if(p2&256) p2= ~(p2>>31);
2714

    
2715
        src[x-1*stride] = p1;
2716
        src[x+0*stride] = p2;
2717

    
2718
        ad1= FFABS(d1)>>1;
2719

    
2720
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2721

    
2722
        src[x-2*stride] = p0 - d2;
2723
        src[x+  stride] = p3 + d2;
2724
    }
2725
    }
2726
}
2727

    
2728
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2729
    if(ENABLE_ANY_H263) {
2730
    int y;
2731
    const int strength= ff_h263_loop_filter_strength[qscale];
2732

    
2733
    for(y=0; y<8; y++){
2734
        int d1, d2, ad1;
2735
        int p0= src[y*stride-2];
2736
        int p1= src[y*stride-1];
2737
        int p2= src[y*stride+0];
2738
        int p3= src[y*stride+1];
2739
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2740

    
2741
        if     (d<-2*strength) d1= 0;
2742
        else if(d<-  strength) d1=-2*strength - d;
2743
        else if(d<   strength) d1= d;
2744
        else if(d< 2*strength) d1= 2*strength - d;
2745
        else                   d1= 0;
2746

    
2747
        p1 += d1;
2748
        p2 -= d1;
2749
        if(p1&256) p1= ~(p1>>31);
2750
        if(p2&256) p2= ~(p2>>31);
2751

    
2752
        src[y*stride-1] = p1;
2753
        src[y*stride+0] = p2;
2754

    
2755
        ad1= FFABS(d1)>>1;
2756

    
2757
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2758

    
2759
        src[y*stride-2] = p0 - d2;
2760
        src[y*stride+1] = p3 + d2;
2761
    }
2762
    }
2763
}
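
/* Illustrative sketch, not from the original source: the d1 selection in the
 * two H.263 loop filters above is a "tent" nonlinearity: small differences d
 * pass through, larger ones taper back towards 0, and anything beyond
 * 2*strength is left alone. h263_d1() restates those branches; strength=4 is
 * a made-up value. */
#if 0
#include <stdio.h>
static int h263_d1(int d, int strength){
    if     (d < -2*strength) return 0;
    else if(d <   -strength) return -2*strength - d;
    else if(d <    strength) return d;
    else if(d <  2*strength) return 2*strength - d;
    else                     return 0;
}
int main(void){
    int d;
    for(d = -10; d <= 10; d += 2)
        printf("d=%3d -> d1=%3d\n", d, h263_d1(d, 4)); /* peaks at |d|==strength */
    return 0;
}
#endif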

static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}

static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}

static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}

static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}

static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
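
/* Illustrative sketch, not from the original source: NSSE above is plain SSE
 * plus a penalty on how much the local texture (the 2x2 second difference)
 * changes between the two blocks, scaled by nsse_weight (8 when no context is
 * available). The 2x2 samples below are made up. */
#if 0
#include <stdio.h>
#include <stdlib.h>
int main(void){
    int s1[4] = { 10, 20, 30, 40 };              /* 2x2 block, stride 2 */
    int s2[4] = { 12, 22, 32, 42 };              /* same texture, +2 brightness */
    int sse = 0, grad, x;
    for(x = 0; x < 4; x++)
        sse += (s1[x]-s2[x])*(s1[x]-s2[x]);
    /* one gradient term, mirroring the inner loop of nsse8_c */
    grad = abs(s1[0] - s1[2] - s1[1] + s1[3])
         - abs(s2[0] - s2[2] - s2[1] + s2[3]);
    printf("%d\n", sse + abs(grad)*8);           /* 16 + 0: texture unchanged */
    return 0;
}
#endif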

static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}

static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}

/**
 * Permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}
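
/* Illustrative sketch, not from the original source: what ff_block_permute()
 * does for a toy 2-coefficient case. The swap-pairs permutation and identity
 * scantable are made up; real callers pass the IDCT permutation from
 * DSPContext and a scantable such as ff_zigzag_direct. */
#if 0
#include <stdio.h>
#include <stdint.h>
int main(void){
    int16_t block[64] = {0}, temp[64];
    uint8_t perm[64], scan[64];
    int i, last = 1;
    for(i = 0; i < 64; i++){ perm[i] = i ^ 1; scan[i] = i; }
    block[0] = 7; block[1] = 3;
    /* the same two passes as ff_block_permute() above */
    for(i = 0; i <= last; i++){ temp[scan[i]] = block[scan[i]]; block[scan[i]] = 0; }
    for(i = 0; i <= last; i++){ block[perm[scan[i]]] = temp[scan[i]]; }
    printf("%d %d\n", block[0], block[1]);       /* prints "3 7" */
    return 0;
}
#endif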

static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#ifdef CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
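
/* Illustrative sketch, not from the original source: hypothetical call sites
 * showing how an mpegvideo-style encoder would wire its motion-estimation
 * comparators from AVCodecContext options; index 0 of each resulting table
 * compares 16x16 blocks, index 1 8x8 blocks. */
#if 0
ff_set_cmp(&s->dsp, s->dsp.me_cmp,     s->avctx->me_cmp);
ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
ff_set_cmp(&s->dsp, s->dsp.mb_cmp,     s->avctx->mb_cmp);
#endif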

/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}

static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
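
/* Illustrative sketch, not from the original source: the word-at-a-time byte
 * add above keeps carries from crossing byte lanes by adding the low 7 bits
 * of each byte normally and recombining the top bit with XOR. The packed
 * demo values are made up. */
#if 0
#include <stdio.h>
int main(void){
    unsigned long pb7f = ~0UL/255 * 0x7f, pb80 = ~0UL/255 * 0x80;
    unsigned long a = 0x00fa80ffUL, b = 0x000190feUL;  /* four packed bytes */
    unsigned long sum = ((a&pb7f) + (b&pb7f)) ^ ((a^b)&pb80);
    printf("%08lx\n", sum & 0xffffffffUL);  /* 00fb10fd: each byte is (a+b)&0xff */
    return 0;
}
#endif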

static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}

static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#ifndef HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
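
/* Illustrative sketch, not from the original source: the HuffYUV median
 * predictor used above picks the middle of (left, top, left+top-topleft).
 * mid3() gives the same result as mid_pred(); the sample values are made up. */
#if 0
#include <stdio.h>
static int mid3(int a, int b, int c){
    if(a > b){ int t = a; a = b; b = t; }
    return b < c ? b : (a > c ? a : c);
}
int main(void){
    int left = 100, top = 104, topleft = 98;
    printf("%d\n", mid3(left, top, left + top - topleft)); /* median of 100,104,106 -> 104 */
    return 0;
}
#endif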

#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
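
/* Illustrative sketch, not from the original source: the butterflies above
 * build an 8x8 Hadamard transform of the difference block, and summing the
 * absolute transform values gives the SATD. A single 2-point slice with
 * made-up inputs: */
#if 0
#include <stdio.h>
#include <stdlib.h>
int main(void){
    int i1 = 5, i2 = 3;                   /* two difference samples   */
    int o1 = i1 + i2, o2 = i1 - i2;       /* BUTTERFLY2: sum and diff */
    printf("%d\n", abs(o1) + abs(o2));    /* 8 + 2 = 10               */
    return 0;
}
#endif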

static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}

static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}

#ifdef CONFIG_GPL
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif

static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}

static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}

static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}

static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}

static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x+=4){
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}

static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

#define SQ(a) ((a)*(a))
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x+=4){
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}

static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#ifdef CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)

static void vector_fmul_c(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}

static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}

void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int i;
    for(i=0; i<len; i++)
        dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
}

void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++) {
        int_fast32_t tmp = ((const int32_t*)src)[i];
        if(tmp & 0xf0000){
            tmp = (0x43c0ffff - tmp)>>31;
            // is this faster on some gcc/cpu combinations?
//          if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//          else                 tmp = 0;
        }
        dst[i] = tmp - 0x8000;
    }
}

#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}
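
/* Illustrative sketch, not from the original source: the W1..W7 constants
 * used by the row/column transforms above are round(2048*sqrt(2)*cos(k*pi/16));
 * this standalone check rederives them (assumes M_PI from math.h). */
#if 0
#include <stdio.h>
#include <math.h>
int main(void){
    int k;
    for(k = 1; k < 8; k++)
        printf("W%d = %d\n", k, (int)floor(2048.0*sqrt(2.0)*cos(k*M_PI/16.0) + 0.5));
    /* prints 2841 2676 2408 2048 1609 1108 565, matching the defines */
    return 0;
}
#endif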
/* XXX: these functions should be removed as soon as all IDCTs have been
   converted */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}

static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}

static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}

static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }

/* init static data */
void dsputil_static_init(void)
{
    int i;

    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}
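
/* Illustrative sketch, not from the original source: ff_cropTbl, filled in
 * above, turns clamping into a table lookup; cm[x] equals av_clip_uint8(x)
 * for any x in [-MAX_NEG_CROP, 255+MAX_NEG_CROP]. Assumes it is compiled
 * inside this file so ff_cropTbl and MAX_NEG_CROP are visible. */
#if 0
#include <stdio.h>
int main(void){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    dsputil_static_init();
    printf("%d %d %d\n", cm[-5], cm[100], cm[300]); /* prints "0 100 255" */
    return 0;
}
#endif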
3970

    
3971
int ff_check_alignment(void){
3972
    static int did_fail=0;
3973
    DECLARE_ALIGNED_16(int, aligned);
3974

    
3975
    if((long)&aligned & 15){
3976
        if(!did_fail){
3977
#if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
3978
            av_log(NULL, AV_LOG_ERROR,
3979
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
3980
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
3981
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
3982
                "Do not report crashes to FFmpeg developers.\n");
3983
#endif
3984
            did_fail=1;
3985
        }
3986
        return -1;
3987
    }
3988
    return 0;
3989
}
3990

    
3991
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3992
{
3993
    int i;
3994

    
3995
    ff_check_alignment();
3996

    
3997
#ifdef CONFIG_ENCODERS
3998
    if(avctx->dct_algo==FF_DCT_FASTINT) {
3999
        c->fdct = fdct_ifast;
4000
        c->fdct248 = fdct_ifast248;
4001
    }
4002
    else if(avctx->dct_algo==FF_DCT_FAAN) {
4003
        c->fdct = ff_faandct;
4004
        c->fdct248 = ff_faandct248;
4005
    }
4006
    else {
4007
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4008
        c->fdct248 = ff_fdct248_islow;
4009
    }
4010
#endif //CONFIG_ENCODERS
4011

    
4012
    if(avctx->lowres==1){
4013
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
4014
            c->idct_put= ff_jref_idct4_put;
4015
            c->idct_add= ff_jref_idct4_add;
4016
        }else{
4017
            c->idct_put= ff_h264_lowres_idct_put_c;
4018
            c->idct_add= ff_h264_lowres_idct_add_c;
4019
        }
4020
        c->idct    = j_rev_dct4;
4021
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4022
    }else if(avctx->lowres==2){
4023
        c->idct_put= ff_jref_idct2_put;
4024
        c->idct_add= ff_jref_idct2_add;
4025
        c->idct    = j_rev_dct2;
4026
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4027
    }else if(avctx->lowres==3){
4028
        c->idct_put= ff_jref_idct1_put;
4029
        c->idct_add= ff_jref_idct1_add;
4030
        c->idct    = j_rev_dct1;
4031
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4032
    }else{
4033
        if(avctx->idct_algo==FF_IDCT_INT){
4034
            c->idct_put= ff_jref_idct_put;
4035
            c->idct_add= ff_jref_idct_add;
4036
            c->idct    = j_rev_dct;
4037
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4038
        }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
4039
                avctx->idct_algo==FF_IDCT_VP3){
4040
            c->idct_put= ff_vp3_idct_put_c;
4041
            c->idct_add= ff_vp3_idct_add_c;
4042
            c->idct    = ff_vp3_idct_c;
4043
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4044
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
4045
            c->idct_put= ff_wmv2_idct_put_c;
4046
            c->idct_add= ff_wmv2_idct_add_c;
4047
            c->idct    = ff_wmv2_idct_c;
4048
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4049
        }else{ //accurate/default
4050
            c->idct_put= ff_simple_idct_put;
4051
            c->idct_add= ff_simple_idct_add;
4052
            c->idct    = ff_simple_idct;
4053
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4054
        }
4055
    }
4056

    
4057
    if (ENABLE_H264_DECODER) {
4058
        c->h264_idct_add= ff_h264_idct_add_c;
4059
        c->h264_idct8_add= ff_h264_idct8_add_c;
4060
        c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4061
        c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4062
    }
4063

    
4064
    c->get_pixels = get_pixels_c;
4065
    c->diff_pixels = diff_pixels_c;
4066
    c->put_pixels_clamped = put_pixels_clamped_c;
4067
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4068
    c->add_pixels_clamped = add_pixels_clamped_c;
4069
    c->add_pixels8 = add_pixels8_c;
4070
    c->add_pixels4 = add_pixels4_c;
4071
    c->sum_abs_dctelem = sum_abs_dctelem_c;
4072
    c->gmc1 = gmc1_c;
4073
    c->gmc = ff_gmc_c;
4074
    c->clear_blocks = clear_blocks_c;
4075
    c->pix_sum = pix_sum_c;
4076
    c->pix_norm1 = pix_norm1_c;
4077

    
4078
    /* TODO [0] 16  [1] 8 */
4079
    c->pix_abs[0][0] = pix_abs16_c;
4080
    c->pix_abs[0][1] = pix_abs16_x2_c;
4081
    c->pix_abs[0][2] = pix_abs16_y2_c;
4082
    c->pix_abs[0][3] = pix_abs16_xy2_c;
4083
    c->pix_abs[1][0] = pix_abs8_c;
4084
    c->pix_abs[1][1] = pix_abs8_x2_c;
4085
    c->pix_abs[1][2] = pix_abs8_y2_c;
4086
    c->pix_abs[1][3] = pix_abs8_xy2_c;
4087

    
4088
#define dspfunc(PFX, IDX, NUM) \
4089
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4090
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4091
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4092
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4093

    
4094
    dspfunc(put, 0, 16);
4095
    dspfunc(put_no_rnd, 0, 16);
4096
    dspfunc(put, 1, 8);
4097
    dspfunc(put_no_rnd, 1, 8);
4098
    dspfunc(put, 2, 4);
4099
    dspfunc(put, 3, 2);
4100

    
4101
    dspfunc(avg, 0, 16);
4102
    dspfunc(avg_no_rnd, 0, 16);
4103
    dspfunc(avg, 1, 8);
4104
    dspfunc(avg_no_rnd, 1, 8);
4105
    dspfunc(avg, 2, 4);
4106
    dspfunc(avg, 3, 2);
4107
#undef dspfunc
4108

    
4109
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4110
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4111

    
4112
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

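    /* Quarter-pel tables: mcXY encodes the horizontal (X) and vertical
     * (Y) offset in quarter pels and the table index is X + 4*Y. For
     * example, dspfunc(put_qpel, 0, 16) assigns put_qpel16_mc00_c
     * through put_qpel16_mc33_c to c->put_qpel_pixels_tab[0][0..15]. */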
    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

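    /* H.264 luma quarter-pel MC (6-tap half-pel filter plus averaging
     * for the quarter-pel positions), for block widths 16/8/4/2 (put)
     * and 16/8/4 (avg). */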
    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
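    /* H.264 chroma MC is a simple bilinear interpolation with 1/8-pel
     * accuracy; the index selects the block width (8, 4 or 2). */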
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;

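    /* H.264 weighted prediction: weight_* scales a single prediction,
     * biweight_* blends two; the ten entries cover the partition sizes
     * from 16x16 down to 2x2. */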
    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;

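    /* Codec-specific DSP extensions, hooked in only when the matching
     * decoder/encoder was compiled in. */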
#ifdef CONFIG_CAVS_DECODER
    ff_cavsdsp_init(c,avctx);
#endif
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_vc1dsp_init(c,avctx);
#endif
#if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_intrax8dsp_init(c,avctx);
#endif
#if defined(CONFIG_H264_ENCODER)
    ff_h264dspenc_init(c,avctx);
#endif

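    /* "mspel" motion compensation used by the WMV2 decoder; only the
     * eight sub-pel positions WMV2 actually needs are filled in. */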
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
