Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 9b5dc867

History | View | Annotate | Download (148 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
12
 *
13
 * FFmpeg is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with FFmpeg; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
 *
22
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
23
 */
24

    
25
/**
26
 * @file dsputil.c
27
 * DSP utils
28
 */
29

    
30
#include "avcodec.h"
31
#include "dsputil.h"
32
#include "mpegvideo.h"
33
#include "simple_idct.h"
34
#include "faandct.h"
35
#include "snow.h"
36

    
37
/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* Clamping table: indexed as (ff_cropTbl + MAX_NEG_CROP)[v] to clip v into 0..255.
 * Zeroed here; presumably filled by the dsputil init code — confirm before relying on it. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Squares table: used as (ff_squareTbl + 256)[d] with d in -255..255 (see sse*_c below).
 * Zeroed here; presumably filled at init — confirm. */
uint32_t ff_squareTbl[512] = {0, };
45

    
46
/* Standard JPEG/MPEG zigzag scan order for an 8x8 coefficient block. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
56

    
57
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
69

    
70
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* Zeroed here; presumably filled at init from ff_zigzag_direct — confirm. */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
72

    
73
/* Alternate (horizontal-biased) scan order for an 8x8 block. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
83

    
84
/* Alternate (vertical-biased) scan order for an 8x8 block. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
94

    
95
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* i.e. ff_inverse[b] ~= 2^32 / b (rounded up), enabling division by
 * multiplication; entries 0 and 1 are placeholders. */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
130

    
131
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
142

    
143
/**
 * Sum all 256 pixel values of a 16x16 block.
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return sum of the 256 pixels
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int row, col;
    int total = 0;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
164

    
165
/**
 * Sum of squares of all 256 pixels of a 16x16 block.
 * Reads pixels a machine word at a time and looks each byte up in the
 * runtime-initialized ff_squareTbl.
 * NOTE(review): the word loads cast uint8_t* to uint32_t*/uint64_t*, which
 * assumes unaligned loads are acceptable and bends strict aliasing —
 * long-standing practice in this codebase, but confirm per target.
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return sum over all pixels of pix[x]^2
 */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            /* reference byte-at-a-time version, kept disabled */
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            /* 64-bit hosts: one 8-byte load, then extract each byte */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* 32-bit hosts: two 4-byte loads */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;   /* advance to the next row */
    }
    return s;
}
212

    
213
/**
 * Byte-swap each of the w 32-bit words of src into dst.
 * The main loop handles eight words per iteration; a tail loop
 * finishes any remainder.
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int n = 0;

    while (n + 8 <= w) {
        dst[n    ] = bswap_32(src[n    ]);
        dst[n + 1] = bswap_32(src[n + 1]);
        dst[n + 2] = bswap_32(src[n + 2]);
        dst[n + 3] = bswap_32(src[n + 3]);
        dst[n + 4] = bswap_32(src[n + 4]);
        dst[n + 5] = bswap_32(src[n + 5]);
        dst[n + 6] = bswap_32(src[n + 6]);
        dst[n + 7] = bswap_32(src[n + 7]);
        n += 8;
    }
    while (n < w) {
        dst[n] = bswap_32(src[n]);
        n++;
    }
}
230

    
231
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
232
{
233
    int s, i;
234
    uint32_t *sq = ff_squareTbl + 256;
235

    
236
    s = 0;
237
    for (i = 0; i < h; i++) {
238
        s += sq[pix1[0] - pix2[0]];
239
        s += sq[pix1[1] - pix2[1]];
240
        s += sq[pix1[2] - pix2[2]];
241
        s += sq[pix1[3] - pix2[3]];
242
        pix1 += line_size;
243
        pix2 += line_size;
244
    }
245
    return s;
246
}
247

    
248
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
249
{
250
    int s, i;
251
    uint32_t *sq = ff_squareTbl + 256;
252

    
253
    s = 0;
254
    for (i = 0; i < h; i++) {
255
        s += sq[pix1[0] - pix2[0]];
256
        s += sq[pix1[1] - pix2[1]];
257
        s += sq[pix1[2] - pix2[2]];
258
        s += sq[pix1[3] - pix2[3]];
259
        s += sq[pix1[4] - pix2[4]];
260
        s += sq[pix1[5] - pix2[5]];
261
        s += sq[pix1[6] - pix2[6]];
262
        s += sq[pix1[7] - pix2[7]];
263
        pix1 += line_size;
264
        pix2 += line_size;
265
    }
266
    return s;
267
}
268

    
269
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
270
{
271
    int s, i;
272
    uint32_t *sq = ff_squareTbl + 256;
273

    
274
    s = 0;
275
    for (i = 0; i < h; i++) {
276
        s += sq[pix1[ 0] - pix2[ 0]];
277
        s += sq[pix1[ 1] - pix2[ 1]];
278
        s += sq[pix1[ 2] - pix2[ 2]];
279
        s += sq[pix1[ 3] - pix2[ 3]];
280
        s += sq[pix1[ 4] - pix2[ 4]];
281
        s += sq[pix1[ 5] - pix2[ 5]];
282
        s += sq[pix1[ 6] - pix2[ 6]];
283
        s += sq[pix1[ 7] - pix2[ 7]];
284
        s += sq[pix1[ 8] - pix2[ 8]];
285
        s += sq[pix1[ 9] - pix2[ 9]];
286
        s += sq[pix1[10] - pix2[10]];
287
        s += sq[pix1[11] - pix2[11]];
288
        s += sq[pix1[12] - pix2[12]];
289
        s += sq[pix1[13] - pix2[13]];
290
        s += sq[pix1[14] - pix2[14]];
291
        s += sq[pix1[15] - pix2[15]];
292

    
293
        pix1 += line_size;
294
        pix2 += line_size;
295
    }
296
    return s;
297
}
298

    
299

    
300
#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
/**
 * Wavelet-domain distortion metric used as a comparison function by the
 * snow encoder: computes the pix1-pix2 difference, runs a spatial DWT on
 * it, and returns a weighted sum of absolute subband coefficients.
 * @param w    block width; must equal h (asserted). w==8 uses 3
 *             decomposition levels, otherwise 4.
 * @param type wavelet selector passed to ff_spatial_dwt; also selects the
 *             weight table (index 0 vs 1 — labelled 9/7 and 5/3 below)
 * @return accumulated weighted magnitude, scaled down by 2^9
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    /* per-(wavelet, size, level, orientation) weights */
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    /* difference image, scaled up by 16 to keep precision through the DWT */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    /* walk each subband (level, orientation) and accumulate |coef|*weight */
    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;          /* horizontal subband offset */
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;     /* vertical subband offset */

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}
369

    
370
/* 5/3 wavelet score for an 8-wide block; thin wrapper around w_c(). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}
373

    
374
/* 9/7 wavelet score for an 8-wide block; thin wrapper around w_c(). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}
377

    
378
/* 5/3 wavelet score for a 16-wide block; thin wrapper around w_c(). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
381

    
382
/* 9/7 wavelet score for a 16-wide block; thin wrapper around w_c(). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
385

    
386
/* 5/3 wavelet score for a 32-wide block; exported (non-static) wrapper. */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}
389

    
390
/* 9/7 wavelet score for a 32-wide block; exported (non-static) wrapper. */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
393
#endif
394

    
395
/**
 * Copy an 8x8 block of pixels into a 64-entry DCT coefficient buffer.
 * @param block     destination, written row-major in groups of 8
 * @param pixels    top-left source pixel
 * @param line_size byte stride between source rows
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block += 8;
    }
}
413

    
414
/**
 * Store the 8x8 pixel difference s1 - s2 into a DCT coefficient buffer.
 * @param block  destination, written row-major in groups of 8
 * @param s1     first source block
 * @param s2     second source block
 * @param stride byte stride between rows of both sources
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
433

    
434

    
435
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
436
                                 int line_size)
437
{
438
    int i;
439
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
440

    
441
    /* read the pixels */
442
    for(i=0;i<8;i++) {
443
        pixels[0] = cm[block[0]];
444
        pixels[1] = cm[block[1]];
445
        pixels[2] = cm[block[2]];
446
        pixels[3] = cm[block[3]];
447
        pixels[4] = cm[block[4]];
448
        pixels[5] = cm[block[5]];
449
        pixels[6] = cm[block[6]];
450
        pixels[7] = cm[block[7]];
451

    
452
        pixels += line_size;
453
        block += 8;
454
    }
455
}
456

    
457
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
458
                                 int line_size)
459
{
460
    int i;
461
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
462

    
463
    /* read the pixels */
464
    for(i=0;i<4;i++) {
465
        pixels[0] = cm[block[0]];
466
        pixels[1] = cm[block[1]];
467
        pixels[2] = cm[block[2]];
468
        pixels[3] = cm[block[3]];
469

    
470
        pixels += line_size;
471
        block += 8;
472
    }
473
}
474

    
475
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
476
                                 int line_size)
477
{
478
    int i;
479
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
480

    
481
    /* read the pixels */
482
    for(i=0;i<2;i++) {
483
        pixels[0] = cm[block[0]];
484
        pixels[1] = cm[block[1]];
485

    
486
        pixels += line_size;
487
        block += 8;
488
    }
489
}
490

    
491
/**
 * Store an 8x8 block of signed coefficients as pixels: each value is
 * biased by +128 and clamped to the 0..255 range.
 */
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            int val = block[col] + 128;
            if (val < 0)
                val = 0;
            else if (val > 255)
                val = 255;
            pixels[col] = (uint8_t)val;
        }
        block  += 8;
        pixels += line_size;
    }
}
511

    
512
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
513
                          int line_size)
514
{
515
    int i;
516
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
517

    
518
    /* read the pixels */
519
    for(i=0;i<8;i++) {
520
        pixels[0] = cm[pixels[0] + block[0]];
521
        pixels[1] = cm[pixels[1] + block[1]];
522
        pixels[2] = cm[pixels[2] + block[2]];
523
        pixels[3] = cm[pixels[3] + block[3]];
524
        pixels[4] = cm[pixels[4] + block[4]];
525
        pixels[5] = cm[pixels[5] + block[5]];
526
        pixels[6] = cm[pixels[6] + block[6]];
527
        pixels[7] = cm[pixels[7] + block[7]];
528
        pixels += line_size;
529
        block += 8;
530
    }
531
}
532

    
533
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
534
                          int line_size)
535
{
536
    int i;
537
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
538

    
539
    /* read the pixels */
540
    for(i=0;i<4;i++) {
541
        pixels[0] = cm[pixels[0] + block[0]];
542
        pixels[1] = cm[pixels[1] + block[1]];
543
        pixels[2] = cm[pixels[2] + block[2]];
544
        pixels[3] = cm[pixels[3] + block[3]];
545
        pixels += line_size;
546
        block += 8;
547
    }
548
}
549

    
550
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
551
                          int line_size)
552
{
553
    int i;
554
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
555

    
556
    /* read the pixels */
557
    for(i=0;i<2;i++) {
558
        pixels[0] = cm[pixels[0] + block[0]];
559
        pixels[1] = cm[pixels[1] + block[1]];
560
        pixels += line_size;
561
        block += 8;
562
    }
563
}
564

    
565
/**
 * Add an 8x8 coefficient block onto pixels without clamping
 * (arithmetic wraps in the uint8_t store).
 */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block += 8;
    }
}
581

    
582
/**
 * Add a 4x4 coefficient block onto pixels without clamping.
 * Unlike the 4x4 clamped variants, block here is packed with
 * 4-entry rows (block advances by 4 per row).
 */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block += 4;
    }
}
594

    
595
/**
 * Sum of absolute values of all 64 coefficients in a DCT block.
 */
static int sum_abs_dctelem_c(DCTELEM *block)
{
    int idx;
    int total = 0;

    for (idx = 0; idx < 64; idx++) {
        int coef = block[idx];
        total += coef < 0 ? -coef : coef;
    }
    return total;
}
602

    
603
/* NOTE(review): dead 64-bit-word PIXOP2 variant, disabled with "#if 0";
 * the active 32-bit-word implementation follows in the #else branch. */
#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
746

    
747
#define PIXOP2(OPNAME, OP) \
748
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
749
    int i;\
750
    for(i=0; i<h; i++){\
751
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
752
        pixels+=line_size;\
753
        block +=line_size;\
754
    }\
755
}\
756
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
757
    int i;\
758
    for(i=0; i<h; i++){\
759
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
760
        pixels+=line_size;\
761
        block +=line_size;\
762
    }\
763
}\
764
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
765
    int i;\
766
    for(i=0; i<h; i++){\
767
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
768
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
769
        pixels+=line_size;\
770
        block +=line_size;\
771
    }\
772
}\
773
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
774
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
775
}\
776
\
777
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
778
                                                int src_stride1, int src_stride2, int h){\
779
    int i;\
780
    for(i=0; i<h; i++){\
781
        uint32_t a,b;\
782
        a= LD32(&src1[i*src_stride1  ]);\
783
        b= LD32(&src2[i*src_stride2  ]);\
784
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
785
        a= LD32(&src1[i*src_stride1+4]);\
786
        b= LD32(&src2[i*src_stride2+4]);\
787
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
788
    }\
789
}\
790
\
791
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
792
                                                int src_stride1, int src_stride2, int h){\
793
    int i;\
794
    for(i=0; i<h; i++){\
795
        uint32_t a,b;\
796
        a= LD32(&src1[i*src_stride1  ]);\
797
        b= LD32(&src2[i*src_stride2  ]);\
798
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
799
        a= LD32(&src1[i*src_stride1+4]);\
800
        b= LD32(&src2[i*src_stride2+4]);\
801
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
802
    }\
803
}\
804
\
805
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
806
                                                int src_stride1, int src_stride2, int h){\
807
    int i;\
808
    for(i=0; i<h; i++){\
809
        uint32_t a,b;\
810
        a= LD32(&src1[i*src_stride1  ]);\
811
        b= LD32(&src2[i*src_stride2  ]);\
812
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
813
    }\
814
}\
815
\
816
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
817
                                                int src_stride1, int src_stride2, int h){\
818
    int i;\
819
    for(i=0; i<h; i++){\
820
        uint32_t a,b;\
821
        a= LD16(&src1[i*src_stride1  ]);\
822
        b= LD16(&src2[i*src_stride2  ]);\
823
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
824
    }\
825
}\
826
\
827
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
828
                                                int src_stride1, int src_stride2, int h){\
829
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
830
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
831
}\
832
\
833
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
834
                                                int src_stride1, int src_stride2, int h){\
835
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
836
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
837
}\
838
\
839
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
840
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
841
}\
842
\
843
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
844
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
845
}\
846
\
847
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
848
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
849
}\
850
\
851
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
852
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
853
}\
854
\
855
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
856
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
857
    int i;\
858
    for(i=0; i<h; i++){\
859
        uint32_t a, b, c, d, l0, l1, h0, h1;\
860
        a= LD32(&src1[i*src_stride1]);\
861
        b= LD32(&src2[i*src_stride2]);\
862
        c= LD32(&src3[i*src_stride3]);\
863
        d= LD32(&src4[i*src_stride4]);\
864
        l0=  (a&0x03030303UL)\
865
           + (b&0x03030303UL)\
866
           + 0x02020202UL;\
867
        h0= ((a&0xFCFCFCFCUL)>>2)\
868
          + ((b&0xFCFCFCFCUL)>>2);\
869
        l1=  (c&0x03030303UL)\
870
           + (d&0x03030303UL);\
871
        h1= ((c&0xFCFCFCFCUL)>>2)\
872
          + ((d&0xFCFCFCFCUL)>>2);\
873
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
874
        a= LD32(&src1[i*src_stride1+4]);\
875
        b= LD32(&src2[i*src_stride2+4]);\
876
        c= LD32(&src3[i*src_stride3+4]);\
877
        d= LD32(&src4[i*src_stride4+4]);\
878
        l0=  (a&0x03030303UL)\
879
           + (b&0x03030303UL)\
880
           + 0x02020202UL;\
881
        h0= ((a&0xFCFCFCFCUL)>>2)\
882
          + ((b&0xFCFCFCFCUL)>>2);\
883
        l1=  (c&0x03030303UL)\
884
           + (d&0x03030303UL);\
885
        h1= ((c&0xFCFCFCFCUL)>>2)\
886
          + ((d&0xFCFCFCFCUL)>>2);\
887
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
888
    }\
889
}\
890
\
891
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
892
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
893
}\
894
\
895
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
896
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
897
}\
898
\
899
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
900
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
901
}\
902
\
903
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
904
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
905
}\
906
\
907
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
908
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
909
    int i;\
910
    for(i=0; i<h; i++){\
911
        uint32_t a, b, c, d, l0, l1, h0, h1;\
912
        a= LD32(&src1[i*src_stride1]);\
913
        b= LD32(&src2[i*src_stride2]);\
914
        c= LD32(&src3[i*src_stride3]);\
915
        d= LD32(&src4[i*src_stride4]);\
916
        l0=  (a&0x03030303UL)\
917
           + (b&0x03030303UL)\
918
           + 0x01010101UL;\
919
        h0= ((a&0xFCFCFCFCUL)>>2)\
920
          + ((b&0xFCFCFCFCUL)>>2);\
921
        l1=  (c&0x03030303UL)\
922
           + (d&0x03030303UL);\
923
        h1= ((c&0xFCFCFCFCUL)>>2)\
924
          + ((d&0xFCFCFCFCUL)>>2);\
925
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
926
        a= LD32(&src1[i*src_stride1+4]);\
927
        b= LD32(&src2[i*src_stride2+4]);\
928
        c= LD32(&src3[i*src_stride3+4]);\
929
        d= LD32(&src4[i*src_stride4+4]);\
930
        l0=  (a&0x03030303UL)\
931
           + (b&0x03030303UL)\
932
           + 0x01010101UL;\
933
        h0= ((a&0xFCFCFCFCUL)>>2)\
934
          + ((b&0xFCFCFCFCUL)>>2);\
935
        l1=  (c&0x03030303UL)\
936
           + (d&0x03030303UL);\
937
        h1= ((c&0xFCFCFCFCUL)>>2)\
938
          + ((d&0xFCFCFCFCUL)>>2);\
939
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
940
    }\
941
}\
942
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
943
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
944
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
945
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
946
}\
947
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
948
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
949
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
950
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
951
}\
952
\
953
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
954
{\
955
        int i, a0, b0, a1, b1;\
956
        a0= pixels[0];\
957
        b0= pixels[1] + 2;\
958
        a0 += b0;\
959
        b0 += pixels[2];\
960
\
961
        pixels+=line_size;\
962
        for(i=0; i<h; i+=2){\
963
            a1= pixels[0];\
964
            b1= pixels[1];\
965
            a1 += b1;\
966
            b1 += pixels[2];\
967
\
968
            block[0]= (a1+a0)>>2; /* FIXME non put */\
969
            block[1]= (b1+b0)>>2;\
970
\
971
            pixels+=line_size;\
972
            block +=line_size;\
973
\
974
            a0= pixels[0];\
975
            b0= pixels[1] + 2;\
976
            a0 += b0;\
977
            b0 += pixels[2];\
978
\
979
            block[0]= (a1+a0)>>2;\
980
            block[1]= (b1+b0)>>2;\
981
            pixels+=line_size;\
982
            block +=line_size;\
983
        }\
984
}\
985
\
986
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
987
{\
988
        int i;\
989
        const uint32_t a= LD32(pixels  );\
990
        const uint32_t b= LD32(pixels+1);\
991
        uint32_t l0=  (a&0x03030303UL)\
992
                    + (b&0x03030303UL)\
993
                    + 0x02020202UL;\
994
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
995
                   + ((b&0xFCFCFCFCUL)>>2);\
996
        uint32_t l1,h1;\
997
\
998
        pixels+=line_size;\
999
        for(i=0; i<h; i+=2){\
1000
            uint32_t a= LD32(pixels  );\
1001
            uint32_t b= LD32(pixels+1);\
1002
            l1=  (a&0x03030303UL)\
1003
               + (b&0x03030303UL);\
1004
            h1= ((a&0xFCFCFCFCUL)>>2)\
1005
              + ((b&0xFCFCFCFCUL)>>2);\
1006
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1007
            pixels+=line_size;\
1008
            block +=line_size;\
1009
            a= LD32(pixels  );\
1010
            b= LD32(pixels+1);\
1011
            l0=  (a&0x03030303UL)\
1012
               + (b&0x03030303UL)\
1013
               + 0x02020202UL;\
1014
            h0= ((a&0xFCFCFCFCUL)>>2)\
1015
              + ((b&0xFCFCFCFCUL)>>2);\
1016
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1017
            pixels+=line_size;\
1018
            block +=line_size;\
1019
        }\
1020
}\
1021
\
1022
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1023
{\
1024
    int j;\
1025
    for(j=0; j<2; j++){\
1026
        int i;\
1027
        const uint32_t a= LD32(pixels  );\
1028
        const uint32_t b= LD32(pixels+1);\
1029
        uint32_t l0=  (a&0x03030303UL)\
1030
                    + (b&0x03030303UL)\
1031
                    + 0x02020202UL;\
1032
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1033
                   + ((b&0xFCFCFCFCUL)>>2);\
1034
        uint32_t l1,h1;\
1035
\
1036
        pixels+=line_size;\
1037
        for(i=0; i<h; i+=2){\
1038
            uint32_t a= LD32(pixels  );\
1039
            uint32_t b= LD32(pixels+1);\
1040
            l1=  (a&0x03030303UL)\
1041
               + (b&0x03030303UL);\
1042
            h1= ((a&0xFCFCFCFCUL)>>2)\
1043
              + ((b&0xFCFCFCFCUL)>>2);\
1044
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1045
            pixels+=line_size;\
1046
            block +=line_size;\
1047
            a= LD32(pixels  );\
1048
            b= LD32(pixels+1);\
1049
            l0=  (a&0x03030303UL)\
1050
               + (b&0x03030303UL)\
1051
               + 0x02020202UL;\
1052
            h0= ((a&0xFCFCFCFCUL)>>2)\
1053
              + ((b&0xFCFCFCFCUL)>>2);\
1054
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1055
            pixels+=line_size;\
1056
            block +=line_size;\
1057
        }\
1058
        pixels+=4-line_size*(h+1);\
1059
        block +=4-line_size*h;\
1060
    }\
1061
}\
1062
\
1063
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1064
{\
1065
    int j;\
1066
    for(j=0; j<2; j++){\
1067
        int i;\
1068
        const uint32_t a= LD32(pixels  );\
1069
        const uint32_t b= LD32(pixels+1);\
1070
        uint32_t l0=  (a&0x03030303UL)\
1071
                    + (b&0x03030303UL)\
1072
                    + 0x01010101UL;\
1073
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1074
                   + ((b&0xFCFCFCFCUL)>>2);\
1075
        uint32_t l1,h1;\
1076
\
1077
        pixels+=line_size;\
1078
        for(i=0; i<h; i+=2){\
1079
            uint32_t a= LD32(pixels  );\
1080
            uint32_t b= LD32(pixels+1);\
1081
            l1=  (a&0x03030303UL)\
1082
               + (b&0x03030303UL);\
1083
            h1= ((a&0xFCFCFCFCUL)>>2)\
1084
              + ((b&0xFCFCFCFCUL)>>2);\
1085
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1086
            pixels+=line_size;\
1087
            block +=line_size;\
1088
            a= LD32(pixels  );\
1089
            b= LD32(pixels+1);\
1090
            l0=  (a&0x03030303UL)\
1091
               + (b&0x03030303UL)\
1092
               + 0x01010101UL;\
1093
            h0= ((a&0xFCFCFCFCUL)>>2)\
1094
              + ((b&0xFCFCFCFCUL)>>2);\
1095
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1096
            pixels+=line_size;\
1097
            block +=line_size;\
1098
        }\
1099
        pixels+=4-line_size*(h+1);\
1100
        block +=4-line_size*h;\
1101
    }\
1102
}\
1103
\
1104
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1105
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1106
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1107
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1108
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1109
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1110
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1111
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1112

    
1113
/* Arithmetic cores plugged into PIXOP2: op_avg stores the rounded average
 * of the existing destination word and the new value; op_put overwrites. */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif /* closes the 64-bit/32-bit PIXOP2 variant conditional opened above */
#define op_put(a, b) a = b

/* Instantiate the full pixel copy/average function family twice. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* Scalar rounded averages of 2 and 4 byte values (used by pel helpers below). */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1124

    
1125
/* 16-wide two-source no-rounding average with one common stride.
 * Thin adapter matching the (dst, a, b, stride, h) function-pointer shape. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1128

    
1129
/* 8-wide two-source no-rounding average with one common stride.
 * Thin adapter matching the (dst, a, b, stride, h) function-pointer shape. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1132

    
1133
/* One-point global motion compensation (gmc1): 1/16-pel bilinear filtering
 * of an 8-pixel-wide, h-row block.  x16/y16 are the fractional offsets in
 * sixteenths; the four weights are the usual bilinear products and sum to
 * 256, hence the >>8 after adding the caller-supplied rounder. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int wTL = (16 - x16) * (16 - y16);  /* top-left weight     */
    const int wTR = (     x16) * (16 - y16);  /* top-right weight    */
    const int wBL = (16 - x16) * (     y16);  /* bottom-left weight  */
    const int wBR = (     x16) * (     y16);  /* bottom-right weight */
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (wTL * src[col]            + wTR * src[col + 1] +
                        wBL * src[stride + col]   + wBR * src[stride + col + 1] +
                        rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1155

    
1156
/**
 * Affine global motion compensation, C reference implementation.
 * Renders an 8-pixel-wide, h-row strip into dst.
 *
 * ox,oy:        16.16 fixed-point source coordinates of the first pixel
 * dxx,dyx:      per-horizontal-step increments applied to vx,vy
 * dxy,dyy:      per-row increments applied to ox,oy
 * shift:        sub-pel precision; s = 1<<shift interpolation phases
 * r:            rounding term added before the final >>(2*shift)
 * width,height: valid source extent; out-of-range samples are edge-clamped,
 *               degrading to 1-D interpolation or a nearest clamped sample.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* Switch to inclusive maxima so the unsigned compare below rejects the
       last column/row (whose right/bottom neighbour would be out of range). */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* NOTE(review): fractions come from the low `shift` bits of the
               integer part of the 16.16 value (vx>>16), i.e. callers encode
               position*s in that integer part — confirm against callers. */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* unsigned compare rejects both negative and too-large indices */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* y clamped: horizontal-only interpolation, scaled by s
                       to keep the same >>(2*shift) normalization */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* x clamped: vertical-only interpolation */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* both clamped: nearest edge sample, no filtering */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            /* advance the affine position along the row */
            vx+= dxx;
            vy+= dyx;
        }
        /* advance the row start position */
        ox += dxy;
        oy += dyy;
    }
}
1213

    
1214
/* Thirdpel MC, integer position: a plain block copy, dispatched on the
 * block width (2/4/8/16); unsupported widths are silently ignored. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1222

    
1223
/* Thirdpel MC, horizontal 1/3 phase: out = round((2*cur + right)/3).
 * 683/2048 is the fixed-point reciprocal of 3 (exact for 8-bit inputs). */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (683*(2*src[col] + src[col+1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1233

    
1234
/* Thirdpel MC, horizontal 2/3 phase: out = round((cur + 2*right)/3),
 * via the 683/2048 fixed-point reciprocal of 3. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1244

    
1245
/* Thirdpel MC, vertical 1/3 phase: out = round((2*cur + below)/3),
 * via the 683/2048 fixed-point reciprocal of 3. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1255

    
1256
/* Thirdpel MC, (1/3,1/3) phase: 2-D weighted mean with weights 4/3/3/2
 * (sum 12); 2731/32768 is the fixed-point reciprocal of 12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1266

    
1267
/* Thirdpel MC, (1/3,2/3) phase: 2-D weighted mean with weights 3/2/4/3
 * (sum 12); 2731/32768 is the fixed-point reciprocal of 12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1277

    
1278
/* Thirdpel MC, vertical 2/3 phase: out = round((cur + 2*below)/3),
 * via the 683/2048 fixed-point reciprocal of 3. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1288

    
1289
/* Thirdpel MC, (2/3,1/3) phase: 2-D weighted mean with weights 3/4/2/3
 * (sum 12); 2731/32768 is the fixed-point reciprocal of 12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1299

    
1300
/* Thirdpel MC, (2/3,2/3) phase: 2-D weighted mean with weights 2/3/3/4
 * (sum 12); 2731/32768 is the fixed-point reciprocal of 12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1310

    
1311
/* Thirdpel MC, integer position, averaging variant: rounded average of the
 * source block into dst, dispatched on block width (2/4/8/16). */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1319

    
1320
/* Thirdpel MC, horizontal 1/3 phase, averaging variant: the thirdpel
 * prediction round((2*cur+right)/3) is rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(2*src[col] + src[col+1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1330

    
1331
/* Thirdpel MC, horizontal 2/3 phase, averaging variant: the thirdpel
 * prediction round((cur+2*right)/3) is rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(src[col] + 2*src[col+1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1341

    
1342
/* Thirdpel MC, vertical 1/3 phase, averaging variant: the thirdpel
 * prediction round((2*cur+below)/3) is rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(2*src[col] + src[col+stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1352

    
1353
/* Thirdpel MC, (1/3,1/3) phase, averaging variant: 4/3/3/2-weighted 2-D
 * prediction (sum 12, 2731/32768 ~= 1/12) rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1363

    
1364
/* Thirdpel MC, (1/3,2/3) phase, averaging variant: 3/2/4/3-weighted 2-D
 * prediction (sum 12, 2731/32768 ~= 1/12) rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1374

    
1375
/* Thirdpel MC, vertical 2/3 phase, averaging variant: the thirdpel
 * prediction round((cur+2*below)/3) is rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(src[col] + 2*src[col+stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1385

    
1386
/* Thirdpel MC, (2/3,1/3) phase, averaging variant: 3/4/2/3-weighted 2-D
 * prediction (sum 12, 2731/32768 ~= 1/12) rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1396

    
1397
/* Thirdpel MC, (2/3,2/3) phase, averaging variant: 2/3/3/4-weighted 2-D
 * prediction (sum 12, 2731/32768 ~= 1/12) rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1407
#if 0
1408
#define TPEL_WIDTH(width)\
1409
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1410
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1411
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1412
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1413
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1414
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1415
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1416
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1417
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1418
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1419
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1420
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1421
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1422
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1423
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1424
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1425
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1426
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1427
#endif
1428

    
1429
/* Generates OPNAME##h264_chroma_mc{2,4,8}_c: 1/8-pel bilinear chroma MC.
 * x,y in [0,8) are the fractional offsets; the weights A..D sum to 64.
 * OP receives the raw 6-bit weighted sum and performs the normalization
 * (and, for the avg variants, the blend with the existing destination). */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1491

    
1492
/* Chroma store/average cores: both normalize the 6-bit weighted sum with
 * (+32)>>6; op_avg additionally rounds it into the existing destination. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

/* Instantiate the put_/avg_ h264 chroma MC function families. */
H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1499

    
1500
/* 8-wide 1/8-pel bilinear chroma MC with a "no rounding" bias: the weighted
 * sum (weights A..D, summing to 64) is normalized with +32-4 instead of +32
 * before the >>6, i.e. biased slightly downward. */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A = (8-x)*(8-y);
    const int B = (  x)*(8-y);
    const int C = (8-x)*(  y);
    const int D = (  x)*(  y);
    int i, j;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1] + 32 - 4) >> 6;
        dst += stride;
        src += stride;
    }
}
1523

    
1524
#define QPEL_MC(r, OPNAME, RND, OP) \
1525
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1526
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1527
    int i;\
1528
    for(i=0; i<h; i++)\
1529
    {\
1530
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1531
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1532
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1533
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1534
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1535
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1536
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1537
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1538
        dst+=dstStride;\
1539
        src+=srcStride;\
1540
    }\
1541
}\
1542
\
1543
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1544
    const int w=8;\
1545
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1546
    int i;\
1547
    for(i=0; i<w; i++)\
1548
    {\
1549
        const int src0= src[0*srcStride];\
1550
        const int src1= src[1*srcStride];\
1551
        const int src2= src[2*srcStride];\
1552
        const int src3= src[3*srcStride];\
1553
        const int src4= src[4*srcStride];\
1554
        const int src5= src[5*srcStride];\
1555
        const int src6= src[6*srcStride];\
1556
        const int src7= src[7*srcStride];\
1557
        const int src8= src[8*srcStride];\
1558
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1559
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1560
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1561
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1562
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1563
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1564
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1565
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1566
        dst++;\
1567
        src++;\
1568
    }\
1569
}\
1570
\
1571
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1572
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1573
    int i;\
1574
    \
1575
    for(i=0; i<h; i++)\
1576
    {\
1577
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1578
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1579
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1580
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1581
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1582
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1583
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1584
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1585
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1586
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1587
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1588
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1589
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1590
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1591
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1592
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1593
        dst+=dstStride;\
1594
        src+=srcStride;\
1595
    }\
1596
}\
1597
\
1598
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1599
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1600
    int i;\
1601
    const int w=16;\
1602
    for(i=0; i<w; i++)\
1603
    {\
1604
        const int src0= src[0*srcStride];\
1605
        const int src1= src[1*srcStride];\
1606
        const int src2= src[2*srcStride];\
1607
        const int src3= src[3*srcStride];\
1608
        const int src4= src[4*srcStride];\
1609
        const int src5= src[5*srcStride];\
1610
        const int src6= src[6*srcStride];\
1611
        const int src7= src[7*srcStride];\
1612
        const int src8= src[8*srcStride];\
1613
        const int src9= src[9*srcStride];\
1614
        const int src10= src[10*srcStride];\
1615
        const int src11= src[11*srcStride];\
1616
        const int src12= src[12*srcStride];\
1617
        const int src13= src[13*srcStride];\
1618
        const int src14= src[14*srcStride];\
1619
        const int src15= src[15*srcStride];\
1620
        const int src16= src[16*srcStride];\
1621
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1622
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1623
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1624
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1625
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1626
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1627
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1628
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1629
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1630
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1631
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1632
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1633
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1634
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1635
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1636
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1637
        dst++;\
1638
        src++;\
1639
    }\
1640
}\
1641
\
1642
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1643
    OPNAME ## pixels8_c(dst, src, stride, 8);\
1644
}\
1645
\
1646
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1647
    uint8_t half[64];\
1648
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1649
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1650
}\
1651
\
1652
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1653
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1654
}\
1655
\
1656
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1657
    uint8_t half[64];\
1658
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1659
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1660
}\
1661
\
1662
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1663
    uint8_t full[16*9];\
1664
    uint8_t half[64];\
1665
    copy_block9(full, src, 16, stride, 9);\
1666
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1667
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1668
}\
1669
\
1670
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1671
    uint8_t full[16*9];\
1672
    copy_block9(full, src, 16, stride, 9);\
1673
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1674
}\
1675
\
1676
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1677
    uint8_t full[16*9];\
1678
    uint8_t half[64];\
1679
    copy_block9(full, src, 16, stride, 9);\
1680
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1681
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1682
}\
1683
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1684
    uint8_t full[16*9];\
1685
    uint8_t halfH[72];\
1686
    uint8_t halfV[64];\
1687
    uint8_t halfHV[64];\
1688
    copy_block9(full, src, 16, stride, 9);\
1689
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1690
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1691
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1692
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1693
}\
1694
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1695
    uint8_t full[16*9];\
1696
    uint8_t halfH[72];\
1697
    uint8_t halfHV[64];\
1698
    copy_block9(full, src, 16, stride, 9);\
1699
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1700
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1701
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1702
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1703
}\
1704
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1705
    uint8_t full[16*9];\
1706
    uint8_t halfH[72];\
1707
    uint8_t halfV[64];\
1708
    uint8_t halfHV[64];\
1709
    copy_block9(full, src, 16, stride, 9);\
1710
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1711
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1712
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1713
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1714
}\
1715
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1716
    uint8_t full[16*9];\
1717
    uint8_t halfH[72];\
1718
    uint8_t halfHV[64];\
1719
    copy_block9(full, src, 16, stride, 9);\
1720
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1721
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1722
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1723
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1724
}\
1725
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1726
    uint8_t full[16*9];\
1727
    uint8_t halfH[72];\
1728
    uint8_t halfV[64];\
1729
    uint8_t halfHV[64];\
1730
    copy_block9(full, src, 16, stride, 9);\
1731
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1732
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1733
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1734
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1735
}\
1736
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1737
    uint8_t full[16*9];\
1738
    uint8_t halfH[72];\
1739
    uint8_t halfHV[64];\
1740
    copy_block9(full, src, 16, stride, 9);\
1741
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1742
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1743
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1744
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1745
}\
1746
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1747
    uint8_t full[16*9];\
1748
    uint8_t halfH[72];\
1749
    uint8_t halfV[64];\
1750
    uint8_t halfHV[64];\
1751
    copy_block9(full, src, 16, stride, 9);\
1752
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1753
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1754
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1755
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1756
}\
1757
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1758
    uint8_t full[16*9];\
1759
    uint8_t halfH[72];\
1760
    uint8_t halfHV[64];\
1761
    copy_block9(full, src, 16, stride, 9);\
1762
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1763
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1764
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1765
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1766
}\
1767
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1768
    uint8_t halfH[72];\
1769
    uint8_t halfHV[64];\
1770
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1771
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1772
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1773
}\
1774
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1775
    uint8_t halfH[72];\
1776
    uint8_t halfHV[64];\
1777
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1778
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1779
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1780
}\
1781
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1782
    uint8_t full[16*9];\
1783
    uint8_t halfH[72];\
1784
    uint8_t halfV[64];\
1785
    uint8_t halfHV[64];\
1786
    copy_block9(full, src, 16, stride, 9);\
1787
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1788
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1789
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1790
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1791
}\
1792
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1793
    uint8_t full[16*9];\
1794
    uint8_t halfH[72];\
1795
    copy_block9(full, src, 16, stride, 9);\
1796
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1797
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1798
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1799
}\
1800
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1801
    uint8_t full[16*9];\
1802
    uint8_t halfH[72];\
1803
    uint8_t halfV[64];\
1804
    uint8_t halfHV[64];\
1805
    copy_block9(full, src, 16, stride, 9);\
1806
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1807
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1808
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1809
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1810
}\
1811
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1812
    uint8_t full[16*9];\
1813
    uint8_t halfH[72];\
1814
    copy_block9(full, src, 16, stride, 9);\
1815
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1816
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1817
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1818
}\
1819
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1820
    uint8_t halfH[72];\
1821
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1822
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1823
}\
1824
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1825
    OPNAME ## pixels16_c(dst, src, stride, 16);\
1826
}\
1827
\
1828
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1829
    uint8_t half[256];\
1830
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1831
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1832
}\
1833
\
1834
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1835
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1836
}\
1837
\
1838
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1839
    uint8_t half[256];\
1840
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1841
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1842
}\
1843
\
1844
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1845
    uint8_t full[24*17];\
1846
    uint8_t half[256];\
1847
    copy_block17(full, src, 24, stride, 17);\
1848
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1849
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1850
}\
1851
\
1852
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1853
    uint8_t full[24*17];\
1854
    copy_block17(full, src, 24, stride, 17);\
1855
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1856
}\
1857
\
1858
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1859
    uint8_t full[24*17];\
1860
    uint8_t half[256];\
1861
    copy_block17(full, src, 24, stride, 17);\
1862
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1863
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1864
}\
1865
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1866
    uint8_t full[24*17];\
1867
    uint8_t halfH[272];\
1868
    uint8_t halfV[256];\
1869
    uint8_t halfHV[256];\
1870
    copy_block17(full, src, 24, stride, 17);\
1871
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1872
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1873
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1874
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1875
}\
1876
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1877
    uint8_t full[24*17];\
1878
    uint8_t halfH[272];\
1879
    uint8_t halfHV[256];\
1880
    copy_block17(full, src, 24, stride, 17);\
1881
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1882
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1883
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1884
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1885
}\
1886
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1887
    uint8_t full[24*17];\
1888
    uint8_t halfH[272];\
1889
    uint8_t halfV[256];\
1890
    uint8_t halfHV[256];\
1891
    copy_block17(full, src, 24, stride, 17);\
1892
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1893
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1894
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1895
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1896
}\
1897
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1898
    uint8_t full[24*17];\
1899
    uint8_t halfH[272];\
1900
    uint8_t halfHV[256];\
1901
    copy_block17(full, src, 24, stride, 17);\
1902
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1903
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1904
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1905
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1906
}\
1907
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1908
    uint8_t full[24*17];\
1909
    uint8_t halfH[272];\
1910
    uint8_t halfV[256];\
1911
    uint8_t halfHV[256];\
1912
    copy_block17(full, src, 24, stride, 17);\
1913
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1914
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1915
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1916
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1917
}\
1918
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1919
    uint8_t full[24*17];\
1920
    uint8_t halfH[272];\
1921
    uint8_t halfHV[256];\
1922
    copy_block17(full, src, 24, stride, 17);\
1923
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1924
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1925
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1926
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1927
}\
1928
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1929
    uint8_t full[24*17];\
1930
    uint8_t halfH[272];\
1931
    uint8_t halfV[256];\
1932
    uint8_t halfHV[256];\
1933
    copy_block17(full, src, 24, stride, 17);\
1934
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1935
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1936
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1937
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1938
}\
1939
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1940
    uint8_t full[24*17];\
1941
    uint8_t halfH[272];\
1942
    uint8_t halfHV[256];\
1943
    copy_block17(full, src, 24, stride, 17);\
1944
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1945
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1946
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1947
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1948
}\
1949
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1950
    uint8_t halfH[272];\
1951
    uint8_t halfHV[256];\
1952
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1953
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1954
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1955
}\
1956
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1957
    uint8_t halfH[272];\
1958
    uint8_t halfHV[256];\
1959
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1960
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1961
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1962
}\
1963
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1964
    uint8_t full[24*17];\
1965
    uint8_t halfH[272];\
1966
    uint8_t halfV[256];\
1967
    uint8_t halfHV[256];\
1968
    copy_block17(full, src, 24, stride, 17);\
1969
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1970
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1971
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1972
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1973
}\
1974
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1975
    uint8_t full[24*17];\
1976
    uint8_t halfH[272];\
1977
    copy_block17(full, src, 24, stride, 17);\
1978
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1979
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1980
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1981
}\
1982
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1983
    uint8_t full[24*17];\
1984
    uint8_t halfH[272];\
1985
    uint8_t halfV[256];\
1986
    uint8_t halfHV[256];\
1987
    copy_block17(full, src, 24, stride, 17);\
1988
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1989
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1990
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1991
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1992
}\
1993
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1994
    uint8_t full[24*17];\
1995
    uint8_t halfH[272];\
1996
    copy_block17(full, src, 24, stride, 17);\
1997
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1998
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1999
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2000
}\
2001
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2002
    uint8_t halfH[272];\
2003
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2004
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2005
}
2006

    
2007
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2008
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2009
#define op_put(a, b) a = cm[((b) + 16)>>5]
2010
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2011

    
2012
QPEL_MC(0, put_       , _       , op_put)
2013
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2014
QPEL_MC(0, avg_       , _       , op_avg)
2015
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2016
#undef op_avg
2017
#undef op_avg_no_rnd
2018
#undef op_put
2019
#undef op_put_no_rnd
2020

    
2021
#if 1
2022
#define H264_LOWPASS(OPNAME, OP, OP2) \
2023
static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2024
    const int h=2;\
2025
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2026
    int i;\
2027
    for(i=0; i<h; i++)\
2028
    {\
2029
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2030
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2031
        dst+=dstStride;\
2032
        src+=srcStride;\
2033
    }\
2034
}\
2035
\
2036
static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2037
    const int w=2;\
2038
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2039
    int i;\
2040
    for(i=0; i<w; i++)\
2041
    {\
2042
        const int srcB= src[-2*srcStride];\
2043
        const int srcA= src[-1*srcStride];\
2044
        const int src0= src[0 *srcStride];\
2045
        const int src1= src[1 *srcStride];\
2046
        const int src2= src[2 *srcStride];\
2047
        const int src3= src[3 *srcStride];\
2048
        const int src4= src[4 *srcStride];\
2049
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2050
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2051
        dst++;\
2052
        src++;\
2053
    }\
2054
}\
2055
\
2056
static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2057
    const int h=2;\
2058
    const int w=2;\
2059
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2060
    int i;\
2061
    src -= 2*srcStride;\
2062
    for(i=0; i<h+5; i++)\
2063
    {\
2064
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2065
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2066
        tmp+=tmpStride;\
2067
        src+=srcStride;\
2068
    }\
2069
    tmp -= tmpStride*(h+5-2);\
2070
    for(i=0; i<w; i++)\
2071
    {\
2072
        const int tmpB= tmp[-2*tmpStride];\
2073
        const int tmpA= tmp[-1*tmpStride];\
2074
        const int tmp0= tmp[0 *tmpStride];\
2075
        const int tmp1= tmp[1 *tmpStride];\
2076
        const int tmp2= tmp[2 *tmpStride];\
2077
        const int tmp3= tmp[3 *tmpStride];\
2078
        const int tmp4= tmp[4 *tmpStride];\
2079
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2080
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2081
        dst++;\
2082
        tmp++;\
2083
    }\
2084
}\
2085
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2086
    const int h=4;\
2087
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2088
    int i;\
2089
    for(i=0; i<h; i++)\
2090
    {\
2091
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2092
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2093
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2094
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2095
        dst+=dstStride;\
2096
        src+=srcStride;\
2097
    }\
2098
}\
2099
\
2100
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2101
    const int w=4;\
2102
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2103
    int i;\
2104
    for(i=0; i<w; i++)\
2105
    {\
2106
        const int srcB= src[-2*srcStride];\
2107
        const int srcA= src[-1*srcStride];\
2108
        const int src0= src[0 *srcStride];\
2109
        const int src1= src[1 *srcStride];\
2110
        const int src2= src[2 *srcStride];\
2111
        const int src3= src[3 *srcStride];\
2112
        const int src4= src[4 *srcStride];\
2113
        const int src5= src[5 *srcStride];\
2114
        const int src6= src[6 *srcStride];\
2115
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2116
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2117
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2118
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2119
        dst++;\
2120
        src++;\
2121
    }\
2122
}\
2123
\
2124
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2125
    const int h=4;\
2126
    const int w=4;\
2127
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2128
    int i;\
2129
    src -= 2*srcStride;\
2130
    for(i=0; i<h+5; i++)\
2131
    {\
2132
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2133
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2134
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2135
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2136
        tmp+=tmpStride;\
2137
        src+=srcStride;\
2138
    }\
2139
    tmp -= tmpStride*(h+5-2);\
2140
    for(i=0; i<w; i++)\
2141
    {\
2142
        const int tmpB= tmp[-2*tmpStride];\
2143
        const int tmpA= tmp[-1*tmpStride];\
2144
        const int tmp0= tmp[0 *tmpStride];\
2145
        const int tmp1= tmp[1 *tmpStride];\
2146
        const int tmp2= tmp[2 *tmpStride];\
2147
        const int tmp3= tmp[3 *tmpStride];\
2148
        const int tmp4= tmp[4 *tmpStride];\
2149
        const int tmp5= tmp[5 *tmpStride];\
2150
        const int tmp6= tmp[6 *tmpStride];\
2151
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2152
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2153
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2154
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2155
        dst++;\
2156
        tmp++;\
2157
    }\
2158
}\
2159
\
2160
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2161
    const int h=8;\
2162
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2163
    int i;\
2164
    for(i=0; i<h; i++)\
2165
    {\
2166
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2167
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2168
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2169
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2170
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2171
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2172
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2173
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2174
        dst+=dstStride;\
2175
        src+=srcStride;\
2176
    }\
2177
}\
2178
\
2179
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2180
    const int w=8;\
2181
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2182
    int i;\
2183
    for(i=0; i<w; i++)\
2184
    {\
2185
        const int srcB= src[-2*srcStride];\
2186
        const int srcA= src[-1*srcStride];\
2187
        const int src0= src[0 *srcStride];\
2188
        const int src1= src[1 *srcStride];\
2189
        const int src2= src[2 *srcStride];\
2190
        const int src3= src[3 *srcStride];\
2191
        const int src4= src[4 *srcStride];\
2192
        const int src5= src[5 *srcStride];\
2193
        const int src6= src[6 *srcStride];\
2194
        const int src7= src[7 *srcStride];\
2195
        const int src8= src[8 *srcStride];\
2196
        const int src9= src[9 *srcStride];\
2197
        const int src10=src[10*srcStride];\
2198
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2199
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2200
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2201
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2202
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2203
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2204
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2205
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2206
        dst++;\
2207
        src++;\
2208
    }\
2209
}\
2210
\
2211
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2212
    const int h=8;\
2213
    const int w=8;\
2214
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2215
    int i;\
2216
    src -= 2*srcStride;\
2217
    for(i=0; i<h+5; i++)\
2218
    {\
2219
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2220
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2221
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2222
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2223
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2224
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2225
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2226
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2227
        tmp+=tmpStride;\
2228
        src+=srcStride;\
2229
    }\
2230
    tmp -= tmpStride*(h+5-2);\
2231
    for(i=0; i<w; i++)\
2232
    {\
2233
        const int tmpB= tmp[-2*tmpStride];\
2234
        const int tmpA= tmp[-1*tmpStride];\
2235
        const int tmp0= tmp[0 *tmpStride];\
2236
        const int tmp1= tmp[1 *tmpStride];\
2237
        const int tmp2= tmp[2 *tmpStride];\
2238
        const int tmp3= tmp[3 *tmpStride];\
2239
        const int tmp4= tmp[4 *tmpStride];\
2240
        const int tmp5= tmp[5 *tmpStride];\
2241
        const int tmp6= tmp[6 *tmpStride];\
2242
        const int tmp7= tmp[7 *tmpStride];\
2243
        const int tmp8= tmp[8 *tmpStride];\
2244
        const int tmp9= tmp[9 *tmpStride];\
2245
        const int tmp10=tmp[10*tmpStride];\
2246
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2247
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2248
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2249
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2250
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2251
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2252
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2253
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2254
        dst++;\
2255
        tmp++;\
2256
    }\
2257
}\
2258
\
2259
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2260
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2261
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2262
    src += 8*srcStride;\
2263
    dst += 8*dstStride;\
2264
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2265
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2266
}\
2267
\
2268
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2269
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2270
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2271
    src += 8*srcStride;\
2272
    dst += 8*dstStride;\
2273
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2274
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2275
}\
2276
\
2277
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2278
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2279
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2280
    src += 8*srcStride;\
2281
    dst += 8*dstStride;\
2282
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2283
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2284
}\
2285

    
2286
/**
 * Expands to the full set of 16 H.264 quarter-pel motion compensation
 * functions ("mcXY" for sub-pel position (X,Y), X,Y in 0..3) for one block
 * size.  OPNAME selects the store semantics (put_ overwrites, avg_ averages
 * with dst); SIZE is the square block width/height.  Each position is built
 * from the 6-tap h/v/hv lowpass planes plus pairwise averaging via the
 * *_l2 helpers; copy_block## SIZE stages SIZE+5 source rows so the vertical
 * filter has its 2-above/3-below context.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

/* Rounding store macros plugged into H264_LOWPASS: op_* finalize the 1-D
 * 6-tap filters (values scaled by 32, hence the +16 round and >>5),
 * op2_* finalize the 2-D hv filter (scaled by 1024, +512 and >>10).
 * cm is the clipping table in scope at each expansion site. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the C reference quarter-pel MC functions for both store
 * variants and the block sizes used by the decoder. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif

/* Per-pixel ops for the H.264 weighted-prediction functions below:
 * op_scale1 scales one plane in place (explicit weighted pred),
 * op_scale2 blends src into dst with two weights (bi-directional pred). */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/**
 * Expands to the weight/biweight sample prediction functions for one WxH
 * partition size.  The unrolled row with `if(W==n) continue;` guards lets a
 * single macro serve all widths (2/4/8/16); the compiler folds the constant
 * W comparisons away.  offset is pre-rounded per the H.264 weighted
 * prediction formulas.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

/* Instantiate every partition size H.264 weighted prediction can use. */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT

/**
 * WMV2 horizontal half-pel filter: 4-tap (-1 9 9 -1)/16 over 8 pixels per
 * row, for h rows.  Results are clipped through the crop table.
 */
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int row, x;

    for(row=0; row<h; row++){
        for(x=0; x<8; x++)
            dst[x]= cm[(9*(src[x] + src[x+1]) - (src[x-1] + src[x+2]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}

#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* The (0,0) full-pel qpel positions degenerate to a plain copy/average;
 * exported here so the CAVS decoder can reuse the generic pixel helpers. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */

#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/* VC-1 (0,0) mspel position is a plain 8x8 copy. */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */

#if defined(CONFIG_H264_ENCODER)
/* H264 specific */
void ff_h264dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_H264_ENCODER */

/**
 * WMV2 vertical half-pel filter: 4-tap (-1 9 9 -1)/16 down each of w
 * columns, producing 8 output rows per column.
 */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int col, k;

    for(col=0; col<w; col++){
        int line[11];
        /* Gather rows -1..9 of this column first, then store — preserves the
           original's load-all-then-write order. */
        for(k=0; k<11; k++)
            line[k] = src[(k-1)*srcStride];
        for(k=0; k<8; k++)
            dst[k*dstStride] = cm[(9*(line[k+1] + line[k+2]) - (line[k] + line[k+3]) + 8)>>4];
        src++;
        dst++;
    }
}

/* mspel position (0,0): plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

/* mspel position (1,0): average of the integer pixels and the horizontal
 * half-pel plane. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hbuf[64];
    wmv2_mspel8_h_lowpass(hbuf, src, 8, stride, 8);
    put_pixels8_l2(dst, src, hbuf, stride, stride, 8, 8);
}

/* mspel position (2,0): horizontal half-pel plane written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

/* mspel position (3,0): average of the right-hand integer pixels (src+1)
 * and the horizontal half-pel plane. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hbuf[64];
    wmv2_mspel8_h_lowpass(hbuf, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, hbuf, stride, stride, 8, 8);
}

/* mspel position (0,2): vertical half-pel plane written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

/* mspel position (1,2): average of the vertical half-pel plane and the
 * h-then-v filtered plane. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hplane[88];
    uint8_t vplane[64];
    uint8_t hvplane[64];
    /* horizontal filter over 11 rows (8 + context), starting one row above */
    wmv2_mspel8_h_lowpass(hplane, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vplane, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvplane, hplane+8, 8, 8, 8);
    put_pixels8_l2(dst, vplane, hvplane, stride, 8, 8, 8);
}

/* mspel position (3,2): like mc12 but the vertical-only plane is taken one
 * pixel to the right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hplane[88];
    uint8_t vplane[64];
    uint8_t hvplane[64];
    wmv2_mspel8_h_lowpass(hplane, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vplane, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvplane, hplane+8, 8, 8, 8);
    put_pixels8_l2(dst, vplane, hvplane, stride, 8, 8, 8);
}

/* mspel position (2,2): horizontal filter first (with one context row
 * above), then the vertical filter straight into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hplane[88];
    wmv2_mspel8_h_lowpass(hplane, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hplane+8, stride, 8, 8);
}

/**
 * H.263 deblocking across a horizontal block boundary: filters the 8
 * columns of the edge at 'src', modifying two pixels on each side
 * (p0 p1 | p2 p3) in place.
 * @param qscale quantizer, indexes the per-qscale strength table.
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* dead-zone ramp: correction peaks at |d|==strength and falls back
           to zero for |d|>=2*strength (real edges are left alone) */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* clip to 0..255: the value is within -256..511 here, so bit 8
           flags out-of-range and ~(x>>31) yields 255 or 0 by sign */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        /* secondary, weaker correction of the outer pixels */
        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
}

/**
 * H.263 deblocking across a vertical block boundary: same ramp filter as
 * h263_v_loop_filter_c, but the four taps sit side by side in each of the
 * 8 rows (p0 p1 | p2 p3 at offsets -2..+1 from 'src').
 * @param qscale quantizer, indexes the per-qscale strength table.
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* dead-zone ramp, see h263_v_loop_filter_c */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branchless-style clip to 0..255 for values in -256..511 */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
}

/**
 * H.261 in-loop filter: separable (1,2,1)/4 smoothing of one 8x8 block,
 * in place.  Border rows/columns are passed through unfiltered.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int col, row, src_idx, tmp_idx;
    int temp[64];

    /* vertical pass into temp, kept at 4x scale; top and bottom rows have
       no outer neighbour and are just scaled */
    for(col=0; col<8; col++){
        temp[col      ] = 4*src[col           ];
        temp[col + 7*8] = 4*src[col + 7*stride];
    }
    for(row=1; row<7; row++){
        for(col=0; col<8; col++){
            src_idx = row * stride + col;
            tmp_idx = row * 8 + col;
            temp[tmp_idx] = src[src_idx - stride] + 2*src[src_idx] + src[src_idx + stride];
        }
    }

    /* horizontal pass back into src, with rounding; edge columns only
       undo the 4x scale */
    for(row=0; row<8; row++){
        src[  row*stride] = (temp[  row*8] + 2)>>2;
        src[7+row*stride] = (temp[7+row*8] + 2)>>2;
        for(col=1; col<7; col++){
            src_idx = row * stride + col;
            tmp_idx = row * 8 + col;
            src[src_idx] = (temp[tmp_idx-1] + 2*temp[tmp_idx] + temp[tmp_idx+1] + 8)>>4;
        }
    }
}

/**
 * H.264 normal-strength (bS<4) luma deblocking of one 16-sample edge,
 * in place.  xstride steps across the edge, ystride steps along it.
 * tc0 holds one clipping threshold per group of 4 edge samples; a negative
 * entry disables filtering for that group.  alpha/beta are the edge/activity
 * thresholds from the spec.
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            /* filter only where the step looks like a coding artifact,
               not a genuine image edge */
            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* smooth inner side(s) too when flat; each extra filtered
                   pixel widens the clip range for the main delta by one */
                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}

/* Deblock across a horizontal edge: taps step by 'stride' (vertically),
 * iteration runs along the row. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}

/* Deblock across a vertical edge: taps step by 1 (horizontally),
 * iteration runs down the rows. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}

/**
 * H.264 normal-strength chroma deblocking of one 8-sample edge, in place.
 * Same thresholds and delta formula as the luma filter, but chroma only
 * ever adjusts p0/q0 and each tc0 entry covers 2 samples.
 * A non-positive tc0 entry disables filtering for its pair.
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            /* same artifact-vs-edge test as luma */
            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}

/* Chroma deblock across a horizontal edge (taps step by 'stride'). */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}

/* Chroma deblock across a vertical edge (taps step by 1). */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}

/**
 * H.264 strong (intra, bS==4) chroma deblocking of one 8-sample edge:
 * p0/q0 are replaced by fixed (2,1,1)/4 averages wherever the edge passes
 * the alpha/beta artifact test.  No tc clipping in the intra case.
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int row;
    for( row = 0; row < 8; row++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        const int is_artifact = FFABS( p0 - q0 ) < alpha &&
                                FFABS( p1 - p0 ) < beta &&
                                FFABS( q1 - q0 ) < beta;
        if( is_artifact ) {
            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}

/* Intra chroma deblock across a horizontal edge (taps step by 'stride'). */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}

/* Intra chroma deblock across a vertical edge (taps step by 1). */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}

/**
 * Sum of absolute differences (SAD) over a 16-wide block, h rows.
 * The first parameter is an unused context pointer kept for the common
 * motion-estimation comparator signature.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}

/**
 * SAD of a 16-wide block against the horizontal half-pel interpolation of
 * pix2 (rounded average of each pixel and its right neighbour).
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}

/**
 * SAD of a 16-wide block against the vertical half-pel interpolation of
 * pix2 (rounded average of each pixel and the one directly below it).
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}

/**
 * SAD of a 16-wide block against the diagonal half-pel interpolation of
 * pix2 (rounded average of the 2x2 neighbourhood).
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}

/**
 * Plain SAD between an 8xh block and an unshifted reference block.
 * @param v unused context pointer (me_cmp_func signature)
 * @return sum of absolute pixel differences
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD between an 8xh block and the reference interpolated at a
 * half-pel offset in x (2-tap horizontal average via avg2).
 * Note: intentionally reads 9 pixels per reference row (pix2[8]).
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD between an 8xh block and the reference interpolated at a
 * half-pel offset in y (2-tap vertical average via avg2).
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* row below the reference row */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * SAD between an 8xh block and the reference interpolated at a
 * half-pel offset in both x and y (4-tap average via avg4).
 * Note: intentionally reads 9 pixels per reference row (pix2[8]).
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* row below the reference row */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3054
    MpegEncContext *c = v;
3055
    int score1=0;
3056
    int score2=0;
3057
    int x,y;
3058

    
3059
    for(y=0; y<h; y++){
3060
        for(x=0; x<16; x++){
3061
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3062
        }
3063
        if(y+1<h){
3064
            for(x=0; x<15; x++){
3065
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3066
                             - s1[x+1] + s1[x+1+stride])
3067
                        -FFABS(  s2[x  ] - s2[x  +stride]
3068
                             - s2[x+1] + s2[x+1+stride]);
3069
            }
3070
        }
3071
        s1+= stride;
3072
        s2+= stride;
3073
    }
3074

    
3075
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3076
    else  return score1 + FFABS(score2)*8;
3077
}
3078

    
3079
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3080
    MpegEncContext *c = v;
3081
    int score1=0;
3082
    int score2=0;
3083
    int x,y;
3084

    
3085
    for(y=0; y<h; y++){
3086
        for(x=0; x<8; x++){
3087
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3088
        }
3089
        if(y+1<h){
3090
            for(x=0; x<7; x++){
3091
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3092
                             - s1[x+1] + s1[x+1+stride])
3093
                        -FFABS(  s2[x  ] - s2[x  +stride]
3094
                             - s2[x+1] + s2[x+1+stride]);
3095
            }
3096
        }
3097
        s1+= stride;
3098
        s2+= stride;
3099
    }
3100

    
3101
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3102
    else  return score1 + FFABS(score2)*8;
3103
}
3104

    
3105
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3106
    int i;
3107
    unsigned int sum=0;
3108

    
3109
    for(i=0; i<8*8; i++){
3110
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3111
        int w= weight[i];
3112
        b>>= RECON_SHIFT;
3113
        assert(-512<b && b<512);
3114

    
3115
        sum += (w*b)*(w*b)>>4;
3116
    }
3117
    return sum>>2;
3118
}
3119

    
3120
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3121
    int i;
3122

    
3123
    for(i=0; i<8*8; i++){
3124
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3125
    }
3126
}
3127

    
3128
/**
3129
 * permutes an 8x8 block.
3130
 * @param block the block which will be permuted according to the given permutation vector
3131
 * @param permutation the permutation vector
3132
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3133
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3134
 *                  (inverse) permutated to scantable order!
3135
 */
3136
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3137
{
3138
    int i;
3139
    DCTELEM temp[64];
3140

    
3141
    if(last<=0) return;
3142
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3143

    
3144
    for(i=0; i<=last; i++){
3145
        const int j= scantable[i];
3146
        temp[j]= block[j];
3147
        block[j]=0;
3148
    }
3149

    
3150
    for(i=0; i<=last; i++){
3151
        const int j= scantable[i];
3152
        const int perm_j= permutation[j];
3153
        block[perm_j]= temp[j];
3154
    }
3155
}
3156

    
3157
/* Comparison function for FF_CMP_ZERO: every candidate scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
/**
 * Fill the 5-entry comparison-function table cmp[] from the DSPContext
 * according to the FF_CMP_* id in the low byte of type.
 * Logs an error (and leaves the entry NULL) for unknown ids.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#ifdef CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
/**
3222
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3223
 */
3224
static void clear_blocks_c(DCTELEM *blocks)
3225
{
3226
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
3227
}
3228

    
3229
/* dst[i] += src[i] for 0 <= i < w; unrolled by 8 with a scalar tail. */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for(; i<w; i++) /* remaining w%8 bytes */
        dst[i+0] += src[i+0];
}
/* dst[i] = src1[i] - src2[i] for 0 <= i < w; unrolled by 8 with a scalar tail. */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] = src1[i+0]-src2[i+0];
        dst[i+1] = src1[i+1]-src2[i+1];
        dst[i+2] = src1[i+2]-src2[i+2];
        dst[i+3] = src1[i+3]-src2[i+3];
        dst[i+4] = src1[i+4]-src2[i+4];
        dst[i+5] = src1[i+5]-src2[i+5];
        dst[i+6] = src1[i+6]-src2[i+6];
        dst[i+7] = src1[i+7]-src2[i+7];
    }
    for(; i<w; i++) /* remaining w%8 bytes */
        dst[i+0] = src1[i+0]-src2[i+0];
}
/**
 * HuffYUV median-prediction residual: each src2[i] is predicted with
 * mid_pred() from the left sample (l), the above sample (src1[i]) and
 * the above-left sample (lt); dst receives the prediction error.
 * *left and *left_top carry the running state in and out.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i]; /* becomes above-left for the next column */
        l= src2[i];  /* becomes left for the next column */
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
/* 2-point butterfly writing sum/difference into separate outputs */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* in-place 2-point butterfly: x,y become x+y, x-y */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x+y| + |x-y| without storing the butterfly outputs */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/**
 * SATD: 8x8 Hadamard transform of the difference src-dst,
 * summing absolute transformed coefficients.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal 8-point Hadamard pass on each difference row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass; the last butterfly stage is folded into the abs-sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
/**
 * Intra SATD: 8x8 Hadamard transform of the source block itself,
 * summing absolute coefficients; the DC term is subtracted so the
 * score ignores the block mean.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal 8-point Hadamard pass on each source row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass; the last butterfly stage is folded into the abs-sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3395
    MpegEncContext * const s= (MpegEncContext *)c;
3396
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3397
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3398

    
3399
    assert(h==8);
3400

    
3401
    s->dsp.diff_pixels(temp, src1, src2, stride);
3402
    s->dsp.fdct(temp);
3403
    return s->dsp.sum_abs_dctelem(temp);
3404
}
3405

    
3406
#ifdef CONFIG_GPL
/* One 8-point H.264-style integer DCT pass over the SRC()/DST() accessors. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * SAD in the H.264 8x8 integer-DCT domain: rows are transformed in
 * place, then absolute values are accumulated during the column pass.
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    int16_t dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct, src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3460
    MpegEncContext * const s= (MpegEncContext *)c;
3461
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3462
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3463
    int sum=0, i;
3464

    
3465
    assert(h==8);
3466

    
3467
    s->dsp.diff_pixels(temp, src1, src2, stride);
3468
    s->dsp.fdct(temp);
3469

    
3470
    for(i=0; i<64; i++)
3471
        sum= FFMAX(sum, FFABS(temp[i]));
3472

    
3473
    return sum;
3474
}
3475

    
3476
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3477
    MpegEncContext * const s= (MpegEncContext *)c;
3478
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3479
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3480
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3481
    int sum=0, i;
3482

    
3483
    assert(h==8);
3484
    s->mb_intra=0;
3485

    
3486
    s->dsp.diff_pixels(temp, src1, src2, stride);
3487

    
3488
    memcpy(bak, temp, 64*sizeof(DCTELEM));
3489

    
3490
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3491
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
3492
    simple_idct(temp); //FIXME
3493

    
3494
    for(i=0; i<64; i++)
3495
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3496

    
3497
    return sum;
3498
}
3499

    
3500
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3501
    MpegEncContext * const s= (MpegEncContext *)c;
3502
    const uint8_t *scantable= s->intra_scantable.permutated;
3503
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3504
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3505
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3506
    uint8_t * const bak= (uint8_t*)aligned_bak;
3507
    int i, last, run, bits, level, distoration, start_i;
3508
    const int esc_length= s->ac_esc_length;
3509
    uint8_t * length;
3510
    uint8_t * last_length;
3511

    
3512
    assert(h==8);
3513

    
3514
    for(i=0; i<8; i++){
3515
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3516
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3517
    }
3518

    
3519
    s->dsp.diff_pixels(temp, src1, src2, stride);
3520

    
3521
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3522

    
3523
    bits=0;
3524

    
3525
    if (s->mb_intra) {
3526
        start_i = 1;
3527
        length     = s->intra_ac_vlc_length;
3528
        last_length= s->intra_ac_vlc_last_length;
3529
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3530
    } else {
3531
        start_i = 0;
3532
        length     = s->inter_ac_vlc_length;
3533
        last_length= s->inter_ac_vlc_last_length;
3534
    }
3535

    
3536
    if(last>=start_i){
3537
        run=0;
3538
        for(i=start_i; i<last; i++){
3539
            int j= scantable[i];
3540
            level= temp[j];
3541

    
3542
            if(level){
3543
                level+=64;
3544
                if((level&(~127)) == 0){
3545
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3546
                }else
3547
                    bits+= esc_length;
3548
                run=0;
3549
            }else
3550
                run++;
3551
        }
3552
        i= scantable[last];
3553

    
3554
        level= temp[i] + 64;
3555

    
3556
        assert(level - 64);
3557

    
3558
        if((level&(~127)) == 0){
3559
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3560
        }else
3561
            bits+= esc_length;
3562

    
3563
    }
3564

    
3565
    if(last>=0){
3566
        if(s->mb_intra)
3567
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
3568
        else
3569
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
3570
    }
3571

    
3572
    s->dsp.idct_add(bak, stride, temp);
3573

    
3574
    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3575

    
3576
    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3577
}
3578

    
3579
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3580
    MpegEncContext * const s= (MpegEncContext *)c;
3581
    const uint8_t *scantable= s->intra_scantable.permutated;
3582
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3583
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3584
    int i, last, run, bits, level, start_i;
3585
    const int esc_length= s->ac_esc_length;
3586
    uint8_t * length;
3587
    uint8_t * last_length;
3588

    
3589
    assert(h==8);
3590

    
3591
    s->dsp.diff_pixels(temp, src1, src2, stride);
3592

    
3593
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3594

    
3595
    bits=0;
3596

    
3597
    if (s->mb_intra) {
3598
        start_i = 1;
3599
        length     = s->intra_ac_vlc_length;
3600
        last_length= s->intra_ac_vlc_last_length;
3601
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3602
    } else {
3603
        start_i = 0;
3604
        length     = s->inter_ac_vlc_length;
3605
        last_length= s->inter_ac_vlc_last_length;
3606
    }
3607

    
3608
    if(last>=start_i){
3609
        run=0;
3610
        for(i=start_i; i<last; i++){
3611
            int j= scantable[i];
3612
            level= temp[j];
3613

    
3614
            if(level){
3615
                level+=64;
3616
                if((level&(~127)) == 0){
3617
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3618
                }else
3619
                    bits+= esc_length;
3620
                run=0;
3621
            }else
3622
                run++;
3623
        }
3624
        i= scantable[last];
3625

    
3626
        level= temp[i] + 64;
3627

    
3628
        assert(level - 64);
3629

    
3630
        if((level&(~127)) == 0){
3631
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3632
        }else
3633
            bits+= esc_length;
3634
    }
3635

    
3636
    return bits;
3637
}
3638

    
3639
/**
 * Vertical SAD of a single 16-wide block with itself: sum of absolute
 * differences between vertically adjacent pixels (intra variant).
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){ /* each iteration compares row y-1 with row y */
        for(x=0; x<16; x+=4){
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}
/**
 * Vertical SAD between two 16-wide blocks: sum of absolute differences
 * of the vertical gradients of the error signal.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){ /* each iteration compares row y-1 with row y */
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
#define SQ(a) ((a)*(a))

/**
 * Vertical SSE of a single 16-wide block with itself: sum of squared
 * differences between vertically adjacent pixels (intra variant).
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){ /* each iteration compares row y-1 with row y */
        for(x=0; x<16; x+=4){
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}
/**
 * Vertical SSE between two 16-wide blocks: sum of squared differences
 * of the vertical gradients of the error signal.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){ /* each iteration compares row y-1 with row y */
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
/* Sum of squared differences between an int8 and an int16 buffer. */
static int ssd_int8_vs_int16_c(int8_t *pix1, int16_t *pix2, int size){
    int score=0;
    int i;
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3709
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3710
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3711
#ifdef CONFIG_GPL
3712
WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3713
#endif
3714
WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3715
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3716
WARPER8_16_SQ(rd8x8_c, rd16_c)
3717
WARPER8_16_SQ(bit8x8_c, bit16_c)
3718

    
3719
/* Element-wise in-place float multiply: dst[i] *= src[i]. */
static void vector_fmul_c(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}
/* dst[i] = src0[i] * src1[len-1-i]: multiply by src1 traversed backwards. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1; /* start at the last element of src1 */
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}
/**
 * dst[i*step] = src0[i]*src1[i] + src2[i] + src3 — fused multiply-add
 * with an integer bias and a strided destination.
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int i;
    for(i=0; i<len; i++)
        dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
}
/**
 * Convert prescaled floats to signed 16-bit PCM with saturation.
 * Input samples are assumed biased so that in-range values have IEEE-754
 * bit patterns in [0x43c00000, 0x43c0ffff]; the low 16 bits of the
 * pattern minus 0x8000 are then the PCM sample.  Values outside that
 * window clamp to -32768 / 32767.
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++) {
        int32_t tmp;
        /* type-pun through memcpy instead of *(int32_t*)src:
           avoids strict-aliasing undefined behaviour, compiles to the
           same load on all mainstream compilers */
        memcpy(&tmp, &src[i], sizeof(tmp));
        if(tmp & 0xf0000){
            /* out of range: becomes 0 (underflow) or -1 (overflow),
               which maps to -32768 / 32767 after the bias below */
            tmp = (0x43c0ffff - tmp)>>31;
            // is this faster on some gcc/cpu combinations?
//          if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//          else                 tmp = 0;
        }
        dst[i] = tmp - 0x8000;
    }
}
/* XXX: those functions should be suppressed ASAP when all IDCTs are
3753
 converted */
3754
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3755
{
3756
    j_rev_dct (block);
3757
    put_pixels_clamped_c(block, dest, line_size);
3758
}
3759
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3760
{
3761
    j_rev_dct (block);
3762
    add_pixels_clamped_c(block, dest, line_size);
3763
}
3764

    
3765
/* 4x4 lowres variants of the reference IDCT put/add wrappers */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
/* 2x2 lowres variants of the reference IDCT put/add wrappers */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
/* 1x1 lowres IDCT: only the DC term survives, scaled and clipped */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* clipping lookup table */

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* clipping lookup table */

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
/* No-op placeholder function.  (void) gives it a proper prototype;
   the old empty parentheses declared unspecified parameters (pre-C23). */
static void just_return(void) { return; }
/* init static data */
3803
void dsputil_static_init(void)
3804
{
3805
    int i;
3806

    
3807
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
3808
    for(i=0;i<MAX_NEG_CROP;i++) {
3809
        ff_cropTbl[i] = 0;
3810
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
3811
    }
3812

    
3813
    for(i=0;i<512;i++) {
3814
        ff_squareTbl[i] = (i - 256) * (i - 256);
3815
    }
3816

    
3817
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3818
}
3819

    
3820
int ff_check_alignment(void){
3821
    static int did_fail=0;
3822
    DECLARE_ALIGNED_16(int, aligned);
3823

    
3824
    if((int)&aligned & 15){
3825
        if(!did_fail){
3826
#if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
3827
            av_log(NULL, AV_LOG_ERROR,
3828
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
3829
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
3830
                "but in the compiler. Do not report crashes to FFmpeg developers.\n");
3831
#endif
3832
            did_fail=1;
3833
        }
3834
        return -1;
3835
    }
3836
    return 0;
3837
}
3838

    
3839
/**
 * Fill a DSPContext with implementations of all DSP routines, honoring
 * the dct_algo/idct_algo and lowres settings in avctx. C reference
 * versions are installed first; per-architecture init functions called
 * near the end may override individual entries with optimized versions.
 * Finally the IDCT coefficient permutation table is derived from the
 * selected idct_permutation_type.
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

    /* forward DCT selection (encoders only) */
#ifdef CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT selection; lowres decoding uses reduced-size
       (4x4 / 2x2 / 1x1) IDCTs with no coefficient permutation */
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= simple_idct_put;
            c->idct_add= simple_idct_add;
            c->idct    = simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    /* H.264 residual IDCT/add variants */
    c->h264_idct_add= ff_h264_idct_add_c;
    c->h264_idct8_add= ff_h264_idct8_add_c;
    c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
    c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;

    /* basic pixel block helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* SAD at the four half-pel phases (0, x2, y2, xy2) */
    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* half-pel put/avg tables: [IDX] selects block width, entries 0..3
       are the four half-pel phases */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    /* third-pel (tpel) motion compensation; indices 3, 7 and 11-15
       intentionally stay unset */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* quarter-pel tables: all 16 sub-pel positions mcXY (X=horiz, Y=vert) */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    /* H.264 chroma motion compensation (8/4/2 pixel widths) */
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;

    /* H.264 weighted/bi-weighted prediction, one entry per block size */
    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;

    /* codec-specific DSP extensions */
#ifdef CONFIG_CAVS_DECODER
    ff_cavsdsp_init(c,avctx);
#endif
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_vc1dsp_init(c,avctx);
#endif
#if defined(CONFIG_H264_ENCODER)
    ff_h264dsp_init(c,avctx);
#endif

    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

    /* comparison functions for motion estimation / rate distortion:
       entry [0] is the 16-wide variant, [1] the 8x8 variant */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#ifdef CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#ifdef CONFIG_SNOW_ENCODER
    c->w53[0]= w53_16_c;
    c->w53[1]= w53_8_c;
    c->w97[0]= w97_16_c;
    c->w97[1]= w97_8_c;
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    /* HuffYUV byte-wise helpers */
    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;

    /* in-loop deblocking filters */
    c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
    c->h264_loop_filter_strength= NULL;  /* no C fallback; set by arch code only */

    c->h263_h_loop_filter= h263_h_loop_filter_c;
    c->h263_v_loop_filter= h263_v_loop_filter_c;

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#ifdef CONFIG_SNOW_DECODER
    c->vertical_compose97i = ff_snow_vertical_compose97i;
    c->horizontal_compose97i = ff_snow_horizontal_compose97i;
    c->inner_add_yblock = ff_snow_inner_add_yblock;
#endif

#ifdef CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
    /* float vector primitives (audio) */
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
    c->float_to_int16 = ff_float_to_int16_c;

    /* image shrinkers: [n] shrinks by 2^n in each dimension */
    c->shrink[0]= ff_img_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->prefetch= just_return;  /* no-op default; arch init may override */

    /* zeroed so the fallback loop below can detect entries an arch
       init did not provide */
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    /* per-architecture overrides of the C implementations above */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_SPARC
   dsputil_init_vis(c,avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif
#ifdef ARCH_SH4
    dsputil_init_sh4(c,avctx);
#endif
#ifdef ARCH_BFIN
    dsputil_init_bfin(c,avctx);
#endif

    /* fill any 2-tap qpel entries left NULL with the h264 qpel functions */
    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }

    /* build the coefficient permutation matching the chosen IDCT */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
4199