Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 59006372

History | View | Annotate | Download (148 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
12
 *
13
 * FFmpeg is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with FFmpeg; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
 *
22
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
23
 */
24

    
25
/**
26
 * @file dsputil.c
27
 * DSP utils
28
 */
29

    
30
#include "avcodec.h"
31
#include "dsputil.h"
32
#include "mpegvideo.h"
33
#include "simple_idct.h"
34
#include "faandct.h"
35
#include "snow.h"
36

    
37
/* snow.c */
38
/* Hand-written prototype; w_c() in this file calls it on its 32-int-stride
 * difference buffer as ff_spatial_dwt(tmp, w, h, 32, type, dec_count). */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
39

    
40
/* vorbis.c */
41
/* NOTE(review): prototype only; neither the definition nor any caller is
 * visible in this part of the file. */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
42

    
43
/* Lookup table accessed as (ff_cropTbl + MAX_NEG_CROP)[x] by the *_clamped
 * routines in this file. Zero-initialized here; presumably filled in during
 * initialization elsewhere -- TODO confirm. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
44
/* Lookup table accessed as (ff_squareTbl + 256)[d] by pix_norm1_c() and the
 * sse*_c() routines, where d is a sample or a difference of two samples.
 * Zero-initialized here; presumably filled with squares during
 * initialization elsewhere -- TODO confirm. */
uint32_t ff_squareTbl[512] = {0, };
45

    
46
/* Zigzag scan order: entry i is the raster index (row * 8 + column) of the
 * i-th coefficient visited in an 8x8 block. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,   /*  0.. 7 */
    17, 24, 32, 25, 18, 11,  4,  5,   /*  8..15 */
    12, 19, 26, 33, 40, 48, 41, 34,   /* 16..23 */
    27, 20, 13,  6,  7, 14, 21, 28,   /* 24..31 */
    35, 42, 49, 56, 57, 50, 43, 36,   /* 32..39 */
    29, 22, 15, 23, 30, 37, 44, 51,   /* 40..47 */
    58, 59, 52, 45, 38, 31, 39, 46,   /* 48..55 */
    53, 60, 61, 54, 47, 55, 62, 63    /* 56..63 */
};
56

    
57
/* Zigzag scan used with the 2-4-8 IDCT. Note that, unlike the
 * specification, the two fields are interleaved in this table. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,   /*  0.. 7 */
    17, 25, 32, 40, 48, 56, 33, 41,   /*  8..15 */
    18, 26,  3, 11,  4, 12, 19, 27,   /* 16..23 */
    34, 42, 49, 57, 50, 58, 35, 43,   /* 24..31 */
    20, 28,  5, 13,  6, 14, 21, 29,   /* 32..39 */
    36, 44, 51, 59, 52, 60, 37, 45,   /* 40..47 */
    22, 30,  7, 15, 23, 31, 38, 46,   /* 48..55 */
    53, 61, 54, 62, 39, 47, 55, 63,   /* 56..63 */
};
69

    
70
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
71
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
72

    
73
/* Alternate (horizontal) coefficient scan: entry i is the raster index of
 * the i-th coefficient visited in an 8x8 block. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,   /*  0.. 7 */
    10, 11,  4,  5,  6,  7, 15, 14,   /*  8..15 */
    13, 12, 19, 18, 24, 25, 32, 33,   /* 16..23 */
    26, 27, 20, 21, 22, 23, 28, 29,   /* 24..31 */
    30, 31, 34, 35, 40, 41, 48, 49,   /* 32..39 */
    42, 43, 36, 37, 38, 39, 44, 45,   /* 40..47 */
    46, 47, 50, 51, 56, 57, 58, 59,   /* 48..55 */
    52, 53, 54, 55, 60, 61, 62, 63,   /* 56..63 */
};
83

    
84
/* Alternate (vertical) coefficient scan: entry i is the raster index of
 * the i-th coefficient visited in an 8x8 block. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,   /*  0.. 7 */
    17, 25, 32, 40, 48, 56, 57, 49,   /*  8..15 */
    41, 33, 26, 18,  3, 11,  4, 12,   /* 16..23 */
    19, 27, 34, 42, 50, 58, 35, 43,   /* 24..31 */
    51, 59, 20, 28,  5, 13,  6, 14,   /* 32..39 */
    21, 29, 36, 44, 52, 60, 37, 45,   /* 40..47 */
    53, 61, 22, 30,  7, 15, 23, 31,   /* 48..55 */
    38, 46, 54, 62, 39, 47, 55, 63,   /* 56..63 */
};
94

    
95
/* Fixed-point reciprocals: with a 64-bit product,
 * (a * ff_inverse[b]) >> 32 == a / b for all 0 <= a <= 65536 and
 * 2 <= b <= 255. Entries 0 and 1 are outside that contract. */
const uint32_t ff_inverse[256] = {
    /*   0 */          0U, 4294967295U, 2147483648U, 1431655766, 1073741824,  858993460,  715827883,  613566757,
    /*   8 */   536870912,   477218589,   429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
    /*  16 */   268435456,   252645136,   238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
    /*  24 */   178956971,   171798692,   165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
    /*  32 */   134217728,   130150525,   126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
    /*  40 */   107374183,   104755300,   102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
    /*  48 */    89478486,    87652394,    85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
    /*  56 */    76695845,    75350304,    74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
    /*  64 */    67108864,    66076420,    65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
    /*  72 */    59652324,    58835169,    58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
    /*  80 */    53687092,    53024288,    52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
    /*  88 */    48806447,    48258060,    47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
    /*  96 */    44739243,    44278014,    43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
    /* 104 */    41297763,    40904451,    40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
    /* 112 */    38347923,    38008561,    37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
    /* 120 */    35791395,    35495598,    35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
    /* 128 */    33554432,    33294321,    33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
    /* 136 */    31580642,    31350127,    31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
    /* 144 */    29826162,    29620465,    29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
    /* 152 */    28256364,    28071682,    27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
    /* 160 */    26843546,    26676816,    26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
    /* 168 */    25565282,    25414008,    25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
    /* 176 */    24403224,    24265352,    24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
    /* 184 */    23342214,    23216040,    23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
    /* 192 */    22369622,    22253717,    22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
    /* 200 */    21474837,    21367997,    21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
    /* 208 */    20648882,    20550083,    20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
    /* 216 */    19884108,    19792477,    19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
    /* 224 */    19173962,    19088744,    19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
    /* 232 */    18512791,    18433337,    18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
    /* 240 */    17895698,    17821442,    17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
    /* 248 */    17318417,    17248865,    17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
130

    
131
/* Coefficient input order expected by simple_idct_mmx: a permutation of the
 * 64 raster positions of an 8x8 block. */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,   /*  0.. 7 */
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,   /*  8..15 */
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,   /* 16..23 */
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,   /* 24..31 */
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,   /* 32..39 */
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,   /* 40..47 */
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,   /* 48..55 */
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,   /* 56..63 */
};
142

    
143
/**
 * Sum of all samples in a 16x16 block.
 * @param pix       first sample of the top row
 * @param line_size offset, in bytes, from one row to the next
 * @return the sum of the 256 samples
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
164

    
165
static int pix_norm1_c(uint8_t * pix, int line_size)
166
{
167
    int s, i, j;
168
    uint32_t *sq = ff_squareTbl + 256;
169

    
170
    s = 0;
171
    for (i = 0; i < 16; i++) {
172
        for (j = 0; j < 16; j += 8) {
173
#if 0
174
            s += sq[pix[0]];
175
            s += sq[pix[1]];
176
            s += sq[pix[2]];
177
            s += sq[pix[3]];
178
            s += sq[pix[4]];
179
            s += sq[pix[5]];
180
            s += sq[pix[6]];
181
            s += sq[pix[7]];
182
#else
183
#if LONG_MAX > 2147483647
184
            register uint64_t x=*(uint64_t*)pix;
185
            s += sq[x&0xff];
186
            s += sq[(x>>8)&0xff];
187
            s += sq[(x>>16)&0xff];
188
            s += sq[(x>>24)&0xff];
189
            s += sq[(x>>32)&0xff];
190
            s += sq[(x>>40)&0xff];
191
            s += sq[(x>>48)&0xff];
192
            s += sq[(x>>56)&0xff];
193
#else
194
            register uint32_t x=*(uint32_t*)pix;
195
            s += sq[x&0xff];
196
            s += sq[(x>>8)&0xff];
197
            s += sq[(x>>16)&0xff];
198
            s += sq[(x>>24)&0xff];
199
            x=*(uint32_t*)(pix+4);
200
            s += sq[x&0xff];
201
            s += sq[(x>>8)&0xff];
202
            s += sq[(x>>16)&0xff];
203
            s += sq[(x>>24)&0xff];
204
#endif
205
#endif
206
            pix += 8;
207
        }
208
        pix += line_size - 16;
209
    }
210
    return s;
211
}
212

    
213
/**
 * Apply bswap_32() to each of w 32-bit words.
 * @param dst destination words
 * @param src source words
 * @param w   number of words to process; nothing is done when w <= 0
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w)
{
    int idx;

    for (idx = 0; idx < w; idx++)
        dst[idx] = bswap_32(src[idx]);
}
230

    
231
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
232
{
233
    int s, i;
234
    uint32_t *sq = ff_squareTbl + 256;
235

    
236
    s = 0;
237
    for (i = 0; i < h; i++) {
238
        s += sq[pix1[0] - pix2[0]];
239
        s += sq[pix1[1] - pix2[1]];
240
        s += sq[pix1[2] - pix2[2]];
241
        s += sq[pix1[3] - pix2[3]];
242
        pix1 += line_size;
243
        pix2 += line_size;
244
    }
245
    return s;
246
}
247

    
248
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
249
{
250
    int s, i;
251
    uint32_t *sq = ff_squareTbl + 256;
252

    
253
    s = 0;
254
    for (i = 0; i < h; i++) {
255
        s += sq[pix1[0] - pix2[0]];
256
        s += sq[pix1[1] - pix2[1]];
257
        s += sq[pix1[2] - pix2[2]];
258
        s += sq[pix1[3] - pix2[3]];
259
        s += sq[pix1[4] - pix2[4]];
260
        s += sq[pix1[5] - pix2[5]];
261
        s += sq[pix1[6] - pix2[6]];
262
        s += sq[pix1[7] - pix2[7]];
263
        pix1 += line_size;
264
        pix2 += line_size;
265
    }
266
    return s;
267
}
268

    
269
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
270
{
271
    int s, i;
272
    uint32_t *sq = ff_squareTbl + 256;
273

    
274
    s = 0;
275
    for (i = 0; i < h; i++) {
276
        s += sq[pix1[ 0] - pix2[ 0]];
277
        s += sq[pix1[ 1] - pix2[ 1]];
278
        s += sq[pix1[ 2] - pix2[ 2]];
279
        s += sq[pix1[ 3] - pix2[ 3]];
280
        s += sq[pix1[ 4] - pix2[ 4]];
281
        s += sq[pix1[ 5] - pix2[ 5]];
282
        s += sq[pix1[ 6] - pix2[ 6]];
283
        s += sq[pix1[ 7] - pix2[ 7]];
284
        s += sq[pix1[ 8] - pix2[ 8]];
285
        s += sq[pix1[ 9] - pix2[ 9]];
286
        s += sq[pix1[10] - pix2[10]];
287
        s += sq[pix1[11] - pix2[11]];
288
        s += sq[pix1[12] - pix2[12]];
289
        s += sq[pix1[13] - pix2[13]];
290
        s += sq[pix1[14] - pix2[14]];
291
        s += sq[pix1[15] - pix2[15]];
292

    
293
        pix1 += line_size;
294
        pix2 += line_size;
295
    }
296
    return s;
297
}
298

    
299

    
300
#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
301
/**
 * Wavelet-domain comparison of two blocks.
 *
 * The sample differences pix1 - pix2, scaled by 16, are written to a scratch
 * buffer with a row stride of 32 ints and transformed in place by
 * ff_spatial_dwt(). The absolute value of every coefficient, weighted by a
 * per-subband factor from scale[][][][], is then accumulated.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size row stride in bytes, shared by both blocks
 * @param w         block width; 8 selects 3 decomposition levels, anything
 *                  else 4. Columns are processed four at a time.
 * @param h         block height; asserted to equal w. The scratch buffer
 *                  holds at most 32 rows.
 * @param type      first index into scale[] (0: 9/7 weights, 1: 5/3 weights);
 *                  also passed through to ff_spatial_dwt()
 * @return the weighted sum shifted right by 9
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    /* The difference may be negative, so scale by multiplying: left-shifting
     * a negative int is undefined behaviour. */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])*16;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])*16;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])*16;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])*16;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        /* orientation 0 is only visited at level 0 */
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    /* named coef so it does not shadow the parameter v */
                    int coef= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(coef);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}
369

    
370
/* w_c() on an 8-wide block with type 1 (the 5/3 weights). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 8, type = 1;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
373

    
374
/* w_c() on an 8-wide block with type 0 (the 9/7 weights). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 8, type = 0;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
377

    
378
/* w_c() on a 16-wide block with type 1 (the 5/3 weights). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 16, type = 1;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
381

    
382
/* w_c() on a 16-wide block with type 0 (the 9/7 weights). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 16, type = 0;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
385

    
386
/* w_c() on a 32-wide block with type 1 (the 5/3 weights). Not static. */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 32, type = 1;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
389

    
390
/* w_c() on a 32-wide block with type 0 (the 9/7 weights). Not static. */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 32, type = 0;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
393
#endif
394

    
395
/**
 * Copy an 8x8 block of 8-bit samples into a coefficient block.
 * @param block     destination, 64 contiguous elements (8 per row)
 * @param pixels    first sample of the top source row
 * @param line_size source row stride in bytes
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block  += 8;
    }
}
413

    
414
/**
 * Store the sample-wise difference s1 - s2 of two 8x8 blocks.
 * @param block  destination, 64 contiguous elements (8 per row)
 * @param s1     first sample of the top row of the minuend block
 * @param s2     first sample of the top row of the subtrahend block
 * @param stride row stride in bytes, shared by s1 and s2
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
433

    
434

    
435
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
436
                                 int line_size)
437
{
438
    int i;
439
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
440

    
441
    /* read the pixels */
442
    for(i=0;i<8;i++) {
443
        pixels[0] = cm[block[0]];
444
        pixels[1] = cm[block[1]];
445
        pixels[2] = cm[block[2]];
446
        pixels[3] = cm[block[3]];
447
        pixels[4] = cm[block[4]];
448
        pixels[5] = cm[block[5]];
449
        pixels[6] = cm[block[6]];
450
        pixels[7] = cm[block[7]];
451

    
452
        pixels += line_size;
453
        block += 8;
454
    }
455
}
456

    
457
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
458
                                 int line_size)
459
{
460
    int i;
461
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
462

    
463
    /* read the pixels */
464
    for(i=0;i<4;i++) {
465
        pixels[0] = cm[block[0]];
466
        pixels[1] = cm[block[1]];
467
        pixels[2] = cm[block[2]];
468
        pixels[3] = cm[block[3]];
469

    
470
        pixels += line_size;
471
        block += 8;
472
    }
473
}
474

    
475
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
476
                                 int line_size)
477
{
478
    int i;
479
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
480

    
481
    /* read the pixels */
482
    for(i=0;i<2;i++) {
483
        pixels[0] = cm[block[0]];
484
        pixels[1] = cm[block[1]];
485

    
486
        pixels += line_size;
487
        block += 8;
488
    }
489
}
490

    
491
/**
 * Write an 8x8 block of signed values to 8-bit samples with a +128 bias.
 * Values below -128 become 0, values above 127 become 255, everything
 * else is stored as value + 128.
 * @param block     source, 64 contiguous elements
 * @param pixels    first sample of the top destination row
 * @param line_size destination row stride in bytes
 */
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            const int val = block[row * 8 + col];
            uint8_t out;

            if (val > 127)
                out = 255;
            else if (val < -128)
                out = 0;
            else
                out = (uint8_t)(val + 128);
            pixels[col] = out;
        }
        pixels += line_size;
    }
}
511

    
512
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
513
                          int line_size)
514
{
515
    int i;
516
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
517

    
518
    /* read the pixels */
519
    for(i=0;i<8;i++) {
520
        pixels[0] = cm[pixels[0] + block[0]];
521
        pixels[1] = cm[pixels[1] + block[1]];
522
        pixels[2] = cm[pixels[2] + block[2]];
523
        pixels[3] = cm[pixels[3] + block[3]];
524
        pixels[4] = cm[pixels[4] + block[4]];
525
        pixels[5] = cm[pixels[5] + block[5]];
526
        pixels[6] = cm[pixels[6] + block[6]];
527
        pixels[7] = cm[pixels[7] + block[7]];
528
        pixels += line_size;
529
        block += 8;
530
    }
531
}
532

    
533
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
534
                          int line_size)
535
{
536
    int i;
537
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
538

    
539
    /* read the pixels */
540
    for(i=0;i<4;i++) {
541
        pixels[0] = cm[pixels[0] + block[0]];
542
        pixels[1] = cm[pixels[1] + block[1]];
543
        pixels[2] = cm[pixels[2] + block[2]];
544
        pixels[3] = cm[pixels[3] + block[3]];
545
        pixels += line_size;
546
        block += 8;
547
    }
548
}
549

    
550
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
551
                          int line_size)
552
{
553
    int i;
554
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
555

    
556
    /* read the pixels */
557
    for(i=0;i<2;i++) {
558
        pixels[0] = cm[pixels[0] + block[0]];
559
        pixels[1] = cm[pixels[1] + block[1]];
560
        pixels += line_size;
561
        block += 8;
562
    }
563
}
564

    
565
/**
 * Add an 8x8 block to 8-bit samples in place. No clamping is applied: the
 * result of each addition is simply stored back into the uint8_t sample.
 * @param pixels    first sample of the top row, updated in place
 * @param block     values to add, 8 elements per row
 * @param line_size row stride of pixels in bytes
 */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 8;
    }
}
581

    
582
/**
 * Add a 4x4 block to 8-bit samples in place. No clamping is applied: the
 * result of each addition is simply stored back into the uint8_t sample.
 * @param pixels    first sample of the top row, updated in place
 * @param block     values to add, 4 elements per row (rows are contiguous)
 * @param line_size row stride of pixels in bytes
 */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 4;
    }
}
594

    
595
#if 0
596

597
/*
 * Disabled variant of PIXOP2 that processes 8 samples per row as one 64-bit
 * word. For a given OPNAME/OP pair it generates the plain copy and the
 * x2 / y2 / xy2 two- and four-sample averages (rounding and no-rounding
 * forms), followed by 16-wide versions built with CALL_2X_PIXELS.
 *
 * The plain copy function is named OPNAME_pixels_c so that it matches the
 * name referenced by the CALL_2X_PIXELS(OPNAME_pixels16_c, ...) line at the
 * end of the macro.
 *
 * In the xy2 functions the low two bits (l0/l1) and the high six bits
 * (h0/h1) of every byte are summed separately so that the four-way average
 * cannot carry from one byte into the next.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
735

736
/* Byte-wise average of two packed 64-bit words, rounding up, stored back
 * into a: (a|b) - (((a^b) & 0xFE..FE) >> 1). Masking with 0xFE before the
 * shift keeps a bit from one byte from sliding into its neighbour. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
737
#else // 64 bit variant
738

    
739
#define PIXOP2(OPNAME, OP) \
740
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
741
    int i;\
742
    for(i=0; i<h; i++){\
743
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
744
        pixels+=line_size;\
745
        block +=line_size;\
746
    }\
747
}\
748
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
749
    int i;\
750
    for(i=0; i<h; i++){\
751
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
752
        pixels+=line_size;\
753
        block +=line_size;\
754
    }\
755
}\
756
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
757
    int i;\
758
    for(i=0; i<h; i++){\
759
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
760
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
761
        pixels+=line_size;\
762
        block +=line_size;\
763
    }\
764
}\
765
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
766
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
767
}\
768
\
769
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
770
                                                int src_stride1, int src_stride2, int h){\
771
    int i;\
772
    for(i=0; i<h; i++){\
773
        uint32_t a,b;\
774
        a= LD32(&src1[i*src_stride1  ]);\
775
        b= LD32(&src2[i*src_stride2  ]);\
776
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
777
        a= LD32(&src1[i*src_stride1+4]);\
778
        b= LD32(&src2[i*src_stride2+4]);\
779
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
780
    }\
781
}\
782
\
783
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
784
                                                int src_stride1, int src_stride2, int h){\
785
    int i;\
786
    for(i=0; i<h; i++){\
787
        uint32_t a,b;\
788
        a= LD32(&src1[i*src_stride1  ]);\
789
        b= LD32(&src2[i*src_stride2  ]);\
790
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
791
        a= LD32(&src1[i*src_stride1+4]);\
792
        b= LD32(&src2[i*src_stride2+4]);\
793
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
794
    }\
795
}\
796
\
797
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
798
                                                int src_stride1, int src_stride2, int h){\
799
    int i;\
800
    for(i=0; i<h; i++){\
801
        uint32_t a,b;\
802
        a= LD32(&src1[i*src_stride1  ]);\
803
        b= LD32(&src2[i*src_stride2  ]);\
804
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
805
    }\
806
}\
807
\
808
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
809
                                                int src_stride1, int src_stride2, int h){\
810
    int i;\
811
    for(i=0; i<h; i++){\
812
        uint32_t a,b;\
813
        a= LD16(&src1[i*src_stride1  ]);\
814
        b= LD16(&src2[i*src_stride2  ]);\
815
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
816
    }\
817
}\
818
\
819
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
820
                                                int src_stride1, int src_stride2, int h){\
821
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
822
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
823
}\
824
\
825
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
826
                                                int src_stride1, int src_stride2, int h){\
827
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
828
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
829
}\
830
\
831
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
832
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
833
}\
834
\
835
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
836
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
837
}\
838
\
839
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
840
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
841
}\
842
\
843
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
844
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
845
}\
846
\
847
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
848
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
849
    int i;\
850
    for(i=0; i<h; i++){\
851
        uint32_t a, b, c, d, l0, l1, h0, h1;\
852
        a= LD32(&src1[i*src_stride1]);\
853
        b= LD32(&src2[i*src_stride2]);\
854
        c= LD32(&src3[i*src_stride3]);\
855
        d= LD32(&src4[i*src_stride4]);\
856
        l0=  (a&0x03030303UL)\
857
           + (b&0x03030303UL)\
858
           + 0x02020202UL;\
859
        h0= ((a&0xFCFCFCFCUL)>>2)\
860
          + ((b&0xFCFCFCFCUL)>>2);\
861
        l1=  (c&0x03030303UL)\
862
           + (d&0x03030303UL);\
863
        h1= ((c&0xFCFCFCFCUL)>>2)\
864
          + ((d&0xFCFCFCFCUL)>>2);\
865
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
866
        a= LD32(&src1[i*src_stride1+4]);\
867
        b= LD32(&src2[i*src_stride2+4]);\
868
        c= LD32(&src3[i*src_stride3+4]);\
869
        d= LD32(&src4[i*src_stride4+4]);\
870
        l0=  (a&0x03030303UL)\
871
           + (b&0x03030303UL)\
872
           + 0x02020202UL;\
873
        h0= ((a&0xFCFCFCFCUL)>>2)\
874
          + ((b&0xFCFCFCFCUL)>>2);\
875
        l1=  (c&0x03030303UL)\
876
           + (d&0x03030303UL);\
877
        h1= ((c&0xFCFCFCFCUL)>>2)\
878
          + ((d&0xFCFCFCFCUL)>>2);\
879
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
880
    }\
881
}\
882
\
883
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
884
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
885
}\
886
\
887
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
888
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
889
}\
890
\
891
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
892
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
893
}\
894
\
895
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
896
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
897
}\
898
\
899
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
900
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
901
    int i;\
902
    for(i=0; i<h; i++){\
903
        uint32_t a, b, c, d, l0, l1, h0, h1;\
904
        a= LD32(&src1[i*src_stride1]);\
905
        b= LD32(&src2[i*src_stride2]);\
906
        c= LD32(&src3[i*src_stride3]);\
907
        d= LD32(&src4[i*src_stride4]);\
908
        l0=  (a&0x03030303UL)\
909
           + (b&0x03030303UL)\
910
           + 0x01010101UL;\
911
        h0= ((a&0xFCFCFCFCUL)>>2)\
912
          + ((b&0xFCFCFCFCUL)>>2);\
913
        l1=  (c&0x03030303UL)\
914
           + (d&0x03030303UL);\
915
        h1= ((c&0xFCFCFCFCUL)>>2)\
916
          + ((d&0xFCFCFCFCUL)>>2);\
917
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
918
        a= LD32(&src1[i*src_stride1+4]);\
919
        b= LD32(&src2[i*src_stride2+4]);\
920
        c= LD32(&src3[i*src_stride3+4]);\
921
        d= LD32(&src4[i*src_stride4+4]);\
922
        l0=  (a&0x03030303UL)\
923
           + (b&0x03030303UL)\
924
           + 0x01010101UL;\
925
        h0= ((a&0xFCFCFCFCUL)>>2)\
926
          + ((b&0xFCFCFCFCUL)>>2);\
927
        l1=  (c&0x03030303UL)\
928
           + (d&0x03030303UL);\
929
        h1= ((c&0xFCFCFCFCUL)>>2)\
930
          + ((d&0xFCFCFCFCUL)>>2);\
931
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
932
    }\
933
}\
934
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
935
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
936
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
937
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
938
}\
939
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
940
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
941
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
942
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
943
}\
944
\
945
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
946
{\
947
        int i, a0, b0, a1, b1;\
948
        a0= pixels[0];\
949
        b0= pixels[1] + 2;\
950
        a0 += b0;\
951
        b0 += pixels[2];\
952
\
953
        pixels+=line_size;\
954
        for(i=0; i<h; i+=2){\
955
            a1= pixels[0];\
956
            b1= pixels[1];\
957
            a1 += b1;\
958
            b1 += pixels[2];\
959
\
960
            block[0]= (a1+a0)>>2; /* FIXME non put */\
961
            block[1]= (b1+b0)>>2;\
962
\
963
            pixels+=line_size;\
964
            block +=line_size;\
965
\
966
            a0= pixels[0];\
967
            b0= pixels[1] + 2;\
968
            a0 += b0;\
969
            b0 += pixels[2];\
970
\
971
            block[0]= (a1+a0)>>2;\
972
            block[1]= (b1+b0)>>2;\
973
            pixels+=line_size;\
974
            block +=line_size;\
975
        }\
976
}\
977
\
978
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
979
{\
980
        int i;\
981
        const uint32_t a= LD32(pixels  );\
982
        const uint32_t b= LD32(pixels+1);\
983
        uint32_t l0=  (a&0x03030303UL)\
984
                    + (b&0x03030303UL)\
985
                    + 0x02020202UL;\
986
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
987
                   + ((b&0xFCFCFCFCUL)>>2);\
988
        uint32_t l1,h1;\
989
\
990
        pixels+=line_size;\
991
        for(i=0; i<h; i+=2){\
992
            uint32_t a= LD32(pixels  );\
993
            uint32_t b= LD32(pixels+1);\
994
            l1=  (a&0x03030303UL)\
995
               + (b&0x03030303UL);\
996
            h1= ((a&0xFCFCFCFCUL)>>2)\
997
              + ((b&0xFCFCFCFCUL)>>2);\
998
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
999
            pixels+=line_size;\
1000
            block +=line_size;\
1001
            a= LD32(pixels  );\
1002
            b= LD32(pixels+1);\
1003
            l0=  (a&0x03030303UL)\
1004
               + (b&0x03030303UL)\
1005
               + 0x02020202UL;\
1006
            h0= ((a&0xFCFCFCFCUL)>>2)\
1007
              + ((b&0xFCFCFCFCUL)>>2);\
1008
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1009
            pixels+=line_size;\
1010
            block +=line_size;\
1011
        }\
1012
}\
1013
\
1014
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1015
{\
1016
    int j;\
1017
    for(j=0; j<2; j++){\
1018
        int i;\
1019
        const uint32_t a= LD32(pixels  );\
1020
        const uint32_t b= LD32(pixels+1);\
1021
        uint32_t l0=  (a&0x03030303UL)\
1022
                    + (b&0x03030303UL)\
1023
                    + 0x02020202UL;\
1024
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1025
                   + ((b&0xFCFCFCFCUL)>>2);\
1026
        uint32_t l1,h1;\
1027
\
1028
        pixels+=line_size;\
1029
        for(i=0; i<h; i+=2){\
1030
            uint32_t a= LD32(pixels  );\
1031
            uint32_t b= LD32(pixels+1);\
1032
            l1=  (a&0x03030303UL)\
1033
               + (b&0x03030303UL);\
1034
            h1= ((a&0xFCFCFCFCUL)>>2)\
1035
              + ((b&0xFCFCFCFCUL)>>2);\
1036
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1037
            pixels+=line_size;\
1038
            block +=line_size;\
1039
            a= LD32(pixels  );\
1040
            b= LD32(pixels+1);\
1041
            l0=  (a&0x03030303UL)\
1042
               + (b&0x03030303UL)\
1043
               + 0x02020202UL;\
1044
            h0= ((a&0xFCFCFCFCUL)>>2)\
1045
              + ((b&0xFCFCFCFCUL)>>2);\
1046
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1047
            pixels+=line_size;\
1048
            block +=line_size;\
1049
        }\
1050
        pixels+=4-line_size*(h+1);\
1051
        block +=4-line_size*h;\
1052
    }\
1053
}\
1054
\
1055
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1056
{\
1057
    int j;\
1058
    for(j=0; j<2; j++){\
1059
        int i;\
1060
        const uint32_t a= LD32(pixels  );\
1061
        const uint32_t b= LD32(pixels+1);\
1062
        uint32_t l0=  (a&0x03030303UL)\
1063
                    + (b&0x03030303UL)\
1064
                    + 0x01010101UL;\
1065
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1066
                   + ((b&0xFCFCFCFCUL)>>2);\
1067
        uint32_t l1,h1;\
1068
\
1069
        pixels+=line_size;\
1070
        for(i=0; i<h; i+=2){\
1071
            uint32_t a= LD32(pixels  );\
1072
            uint32_t b= LD32(pixels+1);\
1073
            l1=  (a&0x03030303UL)\
1074
               + (b&0x03030303UL);\
1075
            h1= ((a&0xFCFCFCFCUL)>>2)\
1076
              + ((b&0xFCFCFCFCUL)>>2);\
1077
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1078
            pixels+=line_size;\
1079
            block +=line_size;\
1080
            a= LD32(pixels  );\
1081
            b= LD32(pixels+1);\
1082
            l0=  (a&0x03030303UL)\
1083
               + (b&0x03030303UL)\
1084
               + 0x01010101UL;\
1085
            h0= ((a&0xFCFCFCFCUL)>>2)\
1086
              + ((b&0xFCFCFCFCUL)>>2);\
1087
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1088
            pixels+=line_size;\
1089
            block +=line_size;\
1090
        }\
1091
        pixels+=4-line_size*(h+1);\
1092
        block +=4-line_size*h;\
1093
    }\
1094
}\
1095
\
1096
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1097
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1098
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1099
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1100
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1101
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1102
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1103
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1104

    
1105
#define op_avg(a, b) a = rnd_avg32(a, b)
1106
#endif
1107
#define op_put(a, b) a = b
1108

    
1109
PIXOP2(avg, op_avg)
1110
PIXOP2(put, op_put)
1111
#undef op_avg
1112
#undef op_put
1113

    
1114
/* rounded average of two / four values; arguments are parenthesized so that
 * expressions using low-precedence operators (|, &, ?:) expand correctly */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1116

    
1117
/**
 * Single-stride wrapper around put_no_rnd_pixels16_l2().
 * The same stride is used for the destination and for both sources.
 * @param dst    destination block
 * @param a      first source block
 * @param b      second source block
 * @param stride line size shared by dst, a and b
 * @param h      forwarded unchanged as the row count
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1120

    
1121
/**
 * Single-stride wrapper around put_no_rnd_pixels8_l2().
 * The same stride is used for the destination and for both sources.
 * @param dst    destination block
 * @param a      first source block
 * @param b      second source block
 * @param stride line size shared by dst, a and b
 * @param h      forwarded unchanged as the row count
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1124

    
1125
/**
 * Bilinear interpolation of an 8-pixel-wide block at a fractional offset
 * given in 1/16 pixel units.
 * The four weights always sum to 256, hence the final shift by 8.
 * Each output row reads 9 pixels from the current source row and 9 from the
 * row below it, so src must provide h+1 rows of at least 9 pixels.
 * @param dst     destination, 8 pixels are written per row
 * @param src     source, top-left pixel of the interpolation window
 * @param stride  line size of both dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal fraction (weight of the right neighbour, out of 16)
 * @param y16     vertical fraction (weight of the lower neighbour, out of 16)
 * @param rounder value added before the shift by 8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i, j;

    for(i=0; i<h; i++)
    {
        for(j=0; j<8; j++)
            dst[j]= (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
1147

    
1148
/**
 * Motion compensation of an 8-pixel-wide block where every output pixel has
 * its own source position, advanced linearly per pixel and per row.
 *
 * For output pixel (x,y) the position is vx = ox + y*dxy + x*dxx and
 * vy = oy + y*dyy + x*dyx. After dropping the low 16 bits, the lowest
 * `shift` bits are the fraction and the remaining bits the integer sample
 * position. Positions inside the picture are bilinearly interpolated;
 * positions outside are clamped to the nearest edge sample, interpolating
 * only along the axis that is still inside.
 *
 * @param dst    destination, 8 pixels are written per row
 * @param src    source picture, addressed as src[x + y*stride]
 * @param stride line size of both dst and src
 * @param h      number of rows to produce
 * @param ox,oy  source position of the first pixel
 * @param dxx,dyx position increment per output pixel within a row
 * @param dxy,dyy position increment per output row
 * @param shift  number of fractional bits of the position
 * @param r      value added before the final shift by 2*shift
 * @param width  width of the source picture in pixels
 * @param height height of the source picture in pixels
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* from here on width/height are the last valid column/row index */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* the unsigned compare rejects negative positions as well */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* row clamped: interpolate horizontally only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* column clamped: interpolate vertically only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* both clamped: copy the nearest edge sample */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1205

    
1206
/**
 * Full-pel case of the tpel put family: dispatches to the plain
 * put_pixelsN_c() copy matching the block width.
 * Widths other than 2, 4, 8 and 16 are ignored and nothing is written.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    default: break; /* unsupported width: no-op */
    }
}
1214

    
1215
/**
 * Horizontal interpolation with weights 2:1 (current pixel : right
 * neighbour). Multiplying by 683 and shifting by 11 approximates /3.
 * Reads width+1 pixels per source row.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1225

    
1226
/**
 * Horizontal interpolation with weights 1:2 (current pixel : right
 * neighbour). Multiplying by 683 and shifting by 11 approximates /3.
 * Reads width+1 pixels per source row.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1236

    
1237
/**
 * Vertical interpolation with weights 2:1 (current pixel : pixel below).
 * Multiplying by 683 and shifting by 11 approximates /3.
 * Reads height+1 source rows.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1247

    
1248
/**
 * 2D interpolation with weights 4,3,3,2 (current, right, below,
 * below-right). Multiplying by 2731 and shifting by 15 approximates /12.
 * Reads width+1 pixels from height+1 source rows.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1258

    
1259
/**
 * 2D interpolation with weights 3,2,4,3 (current, right, below,
 * below-right). Multiplying by 2731 and shifting by 15 approximates /12.
 * Reads width+1 pixels from height+1 source rows.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1269

    
1270
/**
 * Vertical interpolation with weights 1:2 (current pixel : pixel below).
 * Multiplying by 683 and shifting by 11 approximates /3.
 * Reads height+1 source rows.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1280

    
1281
/**
 * 2D interpolation with weights 3,4,2,3 (current, right, below,
 * below-right). Multiplying by 2731 and shifting by 15 approximates /12.
 * Reads width+1 pixels from height+1 source rows.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1291

    
1292
/**
 * 2D interpolation with weights 2,3,3,4 (current, right, below,
 * below-right). Multiplying by 2731 and shifting by 15 approximates /12.
 * Reads width+1 pixels from height+1 source rows.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1302

    
1303
/**
 * Full-pel case of the tpel avg family: dispatches to the avg_pixelsN_c()
 * routine matching the block width.
 * Widths other than 2, 4, 8 and 16 are ignored and dst is left untouched.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    default: break; /* unsupported width: no-op */
    }
}
1311

    
1312
/**
 * Averaging variant of the 2:1 horizontal interpolation: the interpolated
 * value (weights 2:1 current : right, *683>>11 approximating /3) is
 * averaged with the existing dst pixel, rounding up.
 * Reads width+1 pixels per source row.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1322

    
1323
/**
 * Averaging variant of the 1:2 horizontal interpolation: the interpolated
 * value (weights 1:2 current : right, *683>>11 approximating /3) is
 * averaged with the existing dst pixel, rounding up.
 * Reads width+1 pixels per source row.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1333

    
1334
/**
 * Averaging variant of the 2:1 vertical interpolation: the interpolated
 * value (weights 2:1 current : below, *683>>11 approximating /3) is
 * averaged with the existing dst pixel, rounding up.
 * Reads height+1 source rows.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1344

    
1345
/**
 * Averaging variant of the 4,3,3,2 2D interpolation (current, right, below,
 * below-right; *2731>>15 approximating /12): the interpolated value is
 * averaged with the existing dst pixel, rounding up.
 * Reads width+1 pixels from height+1 source rows.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1355

    
1356
/**
 * Averaging variant of the 3,2,4,3 2D interpolation (current, right, below,
 * below-right; *2731>>15 approximating /12): the interpolated value is
 * averaged with the existing dst pixel, rounding up.
 * Reads width+1 pixels from height+1 source rows.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1366

    
1367
/**
 * Averaging variant of the 1:2 vertical interpolation: the interpolated
 * value (weights 1:2 current : below, *683>>11 approximating /3) is
 * averaged with the existing dst pixel, rounding up.
 * Reads height+1 source rows.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1377

    
1378
/**
 * Averaging variant of the 3,4,2,3 2D interpolation (current, right, below,
 * below-right; *2731>>15 approximating /12): the interpolated value is
 * averaged with the existing dst pixel, rounding up.
 * Reads width+1 pixels from height+1 source rows.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1388

    
1389
/**
 * Averaging variant of the 2,3,3,4 2D interpolation (current, right, below,
 * below-right; *2731>>15 approximating /12): the interpolated value is
 * averaged with the existing dst pixel, rounding up.
 * Reads width+1 pixels from height+1 source rows.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1399
#if 0
/* Disabled. TPEL_WIDTH(width) would generate fixed-width wrappers
 * put_tpel_pixels<width>_mcXY_c() that forward to the generic
 * put_tpel_pixels_mcXY_c() routines with the width filled in. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1420

    
1421
/**
 * Generates OPNAME##h264_chroma_mc{2,4,8}_c(): bilinear interpolation of a
 * 2-, 4- or 8-pixel-wide block at a fractional offset (x,y) given in 1/8
 * pixel units, 0 <= x,y < 8 (asserted).
 * The four weights always sum to 64. The unscaled weighted sum is handed to
 * OP(dst_pixel, sum), which is responsible for rounding, scaling and
 * storing. Each row reads N+1 pixels from the current source row and from
 * the row below it.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        for(j=0; j<2; j++)\
            OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        for(j=0; j<4; j++)\
            OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        for(j=0; j<8; j++)\
            OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
        dst+= stride;\
        src+= stride;\
    }\
}

/* b is the weighted sum scaled by 64: round, scale back, then either
 * average with the existing pixel (rounding up) or store directly */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1491

    
1492
/**
 * Bilinear interpolation of an 8-pixel-wide block at a fractional offset
 * (x,y) given in 1/8 pixel units, with a reduced rounding term.
 * The four weights sum to 64; 28 (32 - 4) instead of 32 is added before the
 * shift by 6, so results are biased slightly downwards compared with
 * round-to-nearest.
 * Each row reads 9 pixels from the current source row and 9 from the row
 * below it.
 * @param dst    destination, 8 pixels are written per row
 * @param src    source, top-left pixel of the interpolation window
 * @param stride line size of both dst and src
 * @param h      number of rows to produce
 * @param x      horizontal fraction, 0..7 (asserted)
 * @param y      vertical fraction, 0..7 (asserted)
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i, j;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        for(j=0; j<8; j++)
            dst[j] = (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}
1515

    
1516
#define QPEL_MC(r, OPNAME, RND, OP) \
1517
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1518
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1519
    int i;\
1520
    for(i=0; i<h; i++)\
1521
    {\
1522
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1523
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1524
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1525
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1526
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1527
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1528
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1529
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1530
        dst+=dstStride;\
1531
        src+=srcStride;\
1532
    }\
1533
}\
1534
\
1535
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1536
    const int w=8;\
1537
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1538
    int i;\
1539
    for(i=0; i<w; i++)\
1540
    {\
1541
        const int src0= src[0*srcStride];\
1542
        const int src1= src[1*srcStride];\
1543
        const int src2= src[2*srcStride];\
1544
        const int src3= src[3*srcStride];\
1545
        const int src4= src[4*srcStride];\
1546
        const int src5= src[5*srcStride];\
1547
        const int src6= src[6*srcStride];\
1548
        const int src7= src[7*srcStride];\
1549
        const int src8= src[8*srcStride];\
1550
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1551
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1552
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1553
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1554
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1555
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1556
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1557
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1558
        dst++;\
1559
        src++;\
1560
    }\
1561
}\
1562
\
1563
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){/* Horizontal lowpass, 16 outputs per row for h rows: each row reads src[0..16] and stores dst[0..15] through OP. */\
1564
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;/* not referenced by name in this body; the op_* macros passed as OP index cm */\
1565
    int i;\
1566
    \
1567
    for(i=0; i<h; i++)\
1568
    {\
1569
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));/* weights 20, -6, 3, -1 per side (sum 32); taps left of src[0] are reflected onto src[0], src[1], src[2] */\
1570
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1571
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1572
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));/* dst[3]..dst[12] need no reflection */\
1573
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1574
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1575
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1576
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1577
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1578
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1579
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1580
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1581
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1582
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));/* from here on, taps right of src[16] are reflected onto src[16], src[15], src[14] */\
1583
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1584
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1585
        dst+=dstStride;/* next row of both buffers */\
1586
        src+=srcStride;\
1587
    }\
1588
}\
1589
\
1590
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* Vertical lowpass, 16 columns wide: each column reads 17 rows (src0..src16) and stores 16 filtered rows through OP. */\
1591
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;/* not referenced by name in this body; the op_* macros passed as OP index cm */\
1592
    int i;\
1593
    const int w=16;/* number of columns processed */\
1594
    for(i=0; i<w; i++)\
1595
    {\
1596
        const int src0= src[0*srcStride];/* load the whole column once; consecutive rows are srcStride bytes apart */\
1597
        const int src1= src[1*srcStride];\
1598
        const int src2= src[2*srcStride];\
1599
        const int src3= src[3*srcStride];\
1600
        const int src4= src[4*srcStride];\
1601
        const int src5= src[5*srcStride];\
1602
        const int src6= src[6*srcStride];\
1603
        const int src7= src[7*srcStride];\
1604
        const int src8= src[8*srcStride];\
1605
        const int src9= src[9*srcStride];\
1606
        const int src10= src[10*srcStride];\
1607
        const int src11= src[11*srcStride];\
1608
        const int src12= src[12*srcStride];\
1609
        const int src13= src[13*srcStride];\
1610
        const int src14= src[14*srcStride];\
1611
        const int src15= src[15*srcStride];\
1612
        const int src16= src[16*srcStride];\
1613
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));/* weights 20, -6, 3, -1 per side (sum 32); taps above src0 are reflected onto src0, src1, src2 */\
1614
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1615
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1616
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));/* rows 3..12 need no reflection */\
1617
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1618
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1619
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1620
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1621
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1622
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1623
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1624
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1625
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1626
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));/* from here on, taps below src16 are reflected onto src16, src15, src14 */\
1627
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1628
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1629
        dst++;/* move to the next column */\
1630
        src++;\
1631
    }\
1632
}\
1633
\
1634
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){/* No filtering here: forwards dst, src and stride to OPNAME pixels8_c with a height of 8. */\
1635
    OPNAME ## pixels8_c(dst, src, stride, 8);\
1636
}\
1637
\
1638
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){/* Horizontally filters src into half[], then hands src and half to OPNAME pixels8_l2 to produce dst. */\
1639
    uint8_t half[64];/* 8 rows at row stride 8 */\
1640
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);/* the temporary always uses the put variant, whatever OPNAME is */\
1641
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);/* pixels8_l2 is defined outside this view; presumably it blends its two sources - confirm */\
1642
}\
1643
\
1644
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){/* Horizontal lowpass of 8 rows written straight into dst through this instantiation's OP. */\
1645
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1646
}\
1647
\
1648
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){/* Same as the mc10 variant except that the unfiltered input handed to pixels8_l2 starts at src+1. */\
1649
    uint8_t half[64];/* 8 rows at row stride 8 */\
1650
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);/* the temporary always uses the put variant, whatever OPNAME is */\
1651
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);/* src+1: one byte further along each row; pixels8_l2 is defined outside this view */\
1652
}\
1653
\
1654
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){/* Stages src in full[], vertically filters it into half[], then hands full and half to OPNAME pixels8_l2 to produce dst. */\
1655
    uint8_t full[16*9];/* 9 rows at row stride 16; the vertical filter reads rows 0..8 */\
1656
    uint8_t half[64];/* 8 rows at row stride 8 */\
1657
    copy_block9(full, src, 16, stride, 9);/* copy_block9 is defined outside this view; presumably copies a 9-wide, 9-row block - confirm */\
1658
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1659
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1660
}\
1661
\
1662
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){/* Stages src in full[] and runs the vertical lowpass from it straight into dst through this instantiation's OP. */\
1663
    uint8_t full[16*9];/* 9 rows at row stride 16; the vertical filter reads rows 0..8 */\
1664
    copy_block9(full, src, 16, stride, 9);/* copy_block9 is defined outside this view */\
1665
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1666
}\
1667
\
1668
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){/* Same as the mc01 variant except that the unfiltered input handed to pixels8_l2 starts one row lower (full+16). */\
1669
    uint8_t full[16*9];/* 9 rows at row stride 16 */\
1670
    uint8_t half[64];/* 8 rows at row stride 8 */\
1671
    copy_block9(full, src, 16, stride, 9);/* copy_block9 is defined outside this view */\
1672
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1673
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);/* full+16 skips exactly one row of full[] */\
1674
}\
1675
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static (ff_ prefixed) variant: builds H, V and HV filtered planes from full[] and hands the unfiltered block plus all three to OPNAME pixels8_l4. */\
1676
    uint8_t full[16*9];/* 9 rows at row stride 16 */\
1677
    uint8_t halfH[72];/* 9 rows at row stride 8, so the vertical pass below can read rows 0..8 */\
1678
    uint8_t halfV[64];\
1679
    uint8_t halfHV[64];\
1680
    copy_block9(full, src, 16, stride, 9);/* copy_block9 is defined outside this view */\
1681
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1682
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1683
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);/* vertical pass over the horizontally filtered plane */\
1684
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);/* pixels8_l4 is defined outside this view; presumably blends four sources - confirm */\
1685
}\
1686
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){/* Builds halfH, overwrites it with pixels8_l2(halfH, full), vertically filters that into halfHV, then OPNAME pixels8_l2 of halfH and halfHV gives dst. */\
1687
    uint8_t full[16*9];/* 9 rows at row stride 16 */\
1688
    uint8_t halfH[72];/* 9 rows at row stride 8, so the vertical pass below can read rows 0..8 */\
1689
    uint8_t halfHV[64];\
1690
    copy_block9(full, src, 16, stride, 9);/* copy_block9 is defined outside this view */\
1691
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1692
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);/* destination and first source are the same buffer; all 9 rows are processed */\
1693
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1694
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1695
}\
1696
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static variant: like mc11_old, but the vertical plane and the unfiltered input both start at full+1. */\
1697
    uint8_t full[16*9];/* 9 rows at row stride 16 */\
1698
    uint8_t halfH[72];/* 9 rows at row stride 8 */\
1699
    uint8_t halfV[64];\
1700
    uint8_t halfHV[64];\
1701
    copy_block9(full, src, 16, stride, 9);/* copy_block9 is defined outside this view */\
1702
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1703
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);/* full+1: one byte further along each row */\
1704
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1705
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);/* pixels8_l4 is defined outside this view */\
1706
}\
1707
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){/* Like the mc11 variant, but halfH is combined with full+1 instead of full before the vertical pass. */\
1708
    uint8_t full[16*9];/* 9 rows at row stride 16 */\
1709
    uint8_t halfH[72];/* 9 rows at row stride 8 */\
1710
    uint8_t halfHV[64];\
1711
    copy_block9(full, src, 16, stride, 9);/* copy_block9 is defined outside this view */\
1712
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1713
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);/* in-place on halfH; full+1 is one byte further along each row */\
1714
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1715
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1716
}\
1717
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static variant: like mc11_old, but pixels8_l4 receives the unfiltered and H planes starting one row lower. */\
1718
    uint8_t full[16*9];/* 9 rows at row stride 16 */\
1719
    uint8_t halfH[72];/* 9 rows at row stride 8 */\
1720
    uint8_t halfV[64];\
1721
    uint8_t halfHV[64];\
1722
    copy_block9(full, src, 16, stride, 9);/* copy_block9 is defined outside this view */\
1723
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1724
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1725
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1726
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);/* full+16 and halfH+8 each skip one row of their buffer */\
1727
}\
1728
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){/* Like the mc11 variant, but the final pixels8_l2 reads halfH starting one row lower (halfH+8). */\
1729
    uint8_t full[16*9];/* 9 rows at row stride 16 */\
1730
    uint8_t halfH[72];/* 9 rows at row stride 8; the ninth row is needed by both the vertical pass and halfH+8 */\
1731
    uint8_t halfHV[64];\
1732
    copy_block9(full, src, 16, stride, 9);/* copy_block9 is defined outside this view */\
1733
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1734
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);/* in-place on halfH, all 9 rows */\
1735
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1736
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1737
}\
1738
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static variant: like mc11_old, but the vertical plane starts at full+1 and pixels8_l4 receives full+17 and halfH+8. */\
1739
    uint8_t full[16*9];/* 9 rows at row stride 16 */\
1740
    uint8_t halfH[72];/* 9 rows at row stride 8 */\
1741
    uint8_t halfV[64];\
1742
    uint8_t halfHV[64];\
1743
    copy_block9(full, src, 16, stride, 9);/* copy_block9 is defined outside this view */\
1744
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1745
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);/* full+1: one byte further along each row */\
1746
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1747
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);/* full+17 = one row (16) down plus one byte along; halfH+8 = one row down */\
1748
}\
1749
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){/* Like the mc11 variant, but halfH is combined with full+1 and the final pixels8_l2 reads halfH+8. */\
1750
    uint8_t full[16*9];/* 9 rows at row stride 16 */\
1751
    uint8_t halfH[72];/* 9 rows at row stride 8 */\
1752
    uint8_t halfHV[64];\
1753
    copy_block9(full, src, 16, stride, 9);/* copy_block9 is defined outside this view */\
1754
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1755
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);/* in-place on halfH; full+1 is one byte further along each row */\
1756
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1757
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);/* halfH+8 skips one row of halfH */\
1758
}\
1759
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){/* Horizontally filters 9 rows of src into halfH, vertically filters halfH into halfHV, then OPNAME pixels8_l2 of the two gives dst. */\
1760
    uint8_t halfH[72];/* 9 rows at row stride 8, so the vertical pass can read rows 0..8 */\
1761
    uint8_t halfHV[64];\
1762
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);/* reads src directly; no staging copy in this variant */\
1763
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1764
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1765
}\
1766
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){/* Same as the mc21 variant except that the final pixels8_l2 reads halfH starting one row lower (halfH+8). */\
1767
    uint8_t halfH[72];/* 9 rows at row stride 8 */\
1768
    uint8_t halfHV[64];\
1769
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);/* reads src directly; no staging copy in this variant */\
1770
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1771
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1772
}\
1773
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static variant: builds V and HV filtered planes and hands them to OPNAME pixels8_l2 to produce dst. */\
1774
    uint8_t full[16*9];/* 9 rows at row stride 16 */\
1775
    uint8_t halfH[72];/* 9 rows at row stride 8; only used as input to the HV pass */\
1776
    uint8_t halfV[64];\
1777
    uint8_t halfHV[64];\
1778
    copy_block9(full, src, 16, stride, 9);/* copy_block9 is defined outside this view */\
1779
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1780
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1781
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1782
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1783
}\
1784
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){/* Builds halfH, overwrites it with pixels8_l2(halfH, full), then vertically filters it straight into dst through this instantiation's OP. */\
1785
    uint8_t full[16*9];/* 9 rows at row stride 16 */\
1786
    uint8_t halfH[72];/* 9 rows at row stride 8, so the vertical pass can read rows 0..8 */\
1787
    copy_block9(full, src, 16, stride, 9);/* copy_block9 is defined outside this view */\
1788
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1789
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);/* in-place on halfH, all 9 rows */\
1790
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1791
}\
1792
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static variant: like mc12_old, but the vertical plane is computed from full+1. */\
1793
    uint8_t full[16*9];/* 9 rows at row stride 16 */\
1794
    uint8_t halfH[72];/* 9 rows at row stride 8; only used as input to the HV pass */\
1795
    uint8_t halfV[64];\
1796
    uint8_t halfHV[64];\
1797
    copy_block9(full, src, 16, stride, 9);/* copy_block9 is defined outside this view */\
1798
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1799
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);/* full+1: one byte further along each row */\
1800
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1801
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1802
}\
1803
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){/* Like the mc12 variant, but halfH is combined with full+1 instead of full before the vertical pass. */\
1804
    uint8_t full[16*9];/* 9 rows at row stride 16 */\
1805
    uint8_t halfH[72];/* 9 rows at row stride 8 */\
1806
    copy_block9(full, src, 16, stride, 9);/* copy_block9 is defined outside this view */\
1807
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1808
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);/* in-place on halfH; full+1 is one byte further along each row */\
1809
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1810
}\
1811
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){/* Horizontal lowpass of 9 rows into halfH, then vertical lowpass of halfH straight into dst through this instantiation's OP. */\
1812
    uint8_t halfH[72];/* 9 rows at row stride 8, so the vertical pass can read rows 0..8 */\
1813
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1814
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1815
}\
1816
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){/* No filtering here: forwards dst, src and stride to OPNAME pixels16_c with a height of 16. */\
1817
    OPNAME ## pixels16_c(dst, src, stride, 16);\
1818
}\
1819
\
1820
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){/* Horizontally filters src into half[], then hands src and half to OPNAME pixels16_l2 to produce dst. */\
1821
    uint8_t half[256];/* 16 rows at row stride 16 */\
1822
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);/* the temporary always uses the put variant, whatever OPNAME is */\
1823
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);/* pixels16_l2 is defined outside this view; presumably it blends its two sources - confirm */\
1824
}\
1825
\
1826
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){/* Horizontal lowpass of 16 rows written straight into dst through this instantiation's OP. */\
1827
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1828
}\
1829
\
1830
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){/* Same as the mc10 variant except that the unfiltered input handed to pixels16_l2 starts at src+1. */\
1831
    uint8_t half[256];/* 16 rows at row stride 16 */\
1832
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);/* the temporary always uses the put variant, whatever OPNAME is */\
1833
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);/* src+1: one byte further along each row; pixels16_l2 is defined outside this view */\
1834
}\
1835
\
1836
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){/* Stages src in full[], vertically filters it into half[], then hands full and half to OPNAME pixels16_l2 to produce dst. */\
1837
    uint8_t full[24*17];/* 17 rows at row stride 24; the vertical filter reads rows 0..16 */\
1838
    uint8_t half[256];/* 16 rows at row stride 16 */\
1839
    copy_block17(full, src, 24, stride, 17);/* copy_block17 is defined outside this view; presumably copies a 17-wide, 17-row block - confirm */\
1840
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1841
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1842
}\
1843
\
1844
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){/* Stages src in full[] and runs the vertical lowpass from it straight into dst through this instantiation's OP. */\
1845
    uint8_t full[24*17];/* 17 rows at row stride 24; the vertical filter reads rows 0..16 */\
1846
    copy_block17(full, src, 24, stride, 17);/* copy_block17 is defined outside this view */\
1847
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1848
}\
1849
\
1850
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){/* Same as the mc01 variant except that the unfiltered input handed to pixels16_l2 starts one row lower (full+24). */\
1851
    uint8_t full[24*17];/* 17 rows at row stride 24 */\
1852
    uint8_t half[256];/* 16 rows at row stride 16 */\
1853
    copy_block17(full, src, 24, stride, 17);/* copy_block17 is defined outside this view */\
1854
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1855
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);/* full+24 skips exactly one row of full[] */\
1856
}\
1857
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static (ff_ prefixed) variant: builds H, V and HV filtered planes from full[] and hands the unfiltered block plus all three to OPNAME pixels16_l4. */\
1858
    uint8_t full[24*17];/* 17 rows at row stride 24 */\
1859
    uint8_t halfH[272];/* 17 rows at row stride 16, so the vertical pass below can read rows 0..16 */\
1860
    uint8_t halfV[256];\
1861
    uint8_t halfHV[256];\
1862
    copy_block17(full, src, 24, stride, 17);/* copy_block17 is defined outside this view */\
1863
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1864
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1865
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);/* vertical pass over the horizontally filtered plane */\
1866
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);/* pixels16_l4 is defined outside this view; presumably blends four sources - confirm */\
1867
}\
1868
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){/* Builds halfH, overwrites it with pixels16_l2(halfH, full), vertically filters that into halfHV, then OPNAME pixels16_l2 of halfH and halfHV gives dst. */\
1869
    uint8_t full[24*17];/* 17 rows at row stride 24 */\
1870
    uint8_t halfH[272];/* 17 rows at row stride 16, so the vertical pass below can read rows 0..16 */\
1871
    uint8_t halfHV[256];\
1872
    copy_block17(full, src, 24, stride, 17);/* copy_block17 is defined outside this view */\
1873
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1874
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);/* destination and first source are the same buffer; all 17 rows are processed */\
1875
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1876
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1877
}\
1878
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static variant: like mc11_old, but the vertical plane and the unfiltered input both start at full+1. */\
1879
    uint8_t full[24*17];/* 17 rows at row stride 24 */\
1880
    uint8_t halfH[272];/* 17 rows at row stride 16 */\
1881
    uint8_t halfV[256];\
1882
    uint8_t halfHV[256];\
1883
    copy_block17(full, src, 24, stride, 17);/* copy_block17 is defined outside this view */\
1884
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1885
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);/* full+1: one byte further along each row */\
1886
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1887
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);/* pixels16_l4 is defined outside this view */\
1888
}\
1889
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){/* Like the mc11 variant, but halfH is combined with full+1 instead of full before the vertical pass. */\
1890
    uint8_t full[24*17];/* 17 rows at row stride 24 */\
1891
    uint8_t halfH[272];/* 17 rows at row stride 16 */\
1892
    uint8_t halfHV[256];\
1893
    copy_block17(full, src, 24, stride, 17);/* copy_block17 is defined outside this view */\
1894
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1895
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);/* in-place on halfH; full+1 is one byte further along each row */\
1896
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1897
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1898
}\
1899
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static variant: like mc11_old, but pixels16_l4 receives the unfiltered and H planes starting one row lower. */\
1900
    uint8_t full[24*17];/* 17 rows at row stride 24 */\
1901
    uint8_t halfH[272];/* 17 rows at row stride 16 */\
1902
    uint8_t halfV[256];\
1903
    uint8_t halfHV[256];\
1904
    copy_block17(full, src, 24, stride, 17);/* copy_block17 is defined outside this view */\
1905
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1906
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1907
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1908
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);/* full+24 and halfH+16 each skip one row of their buffer */\
1909
}\
1910
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){/* Like the mc11 variant, but the final pixels16_l2 reads halfH starting one row lower (halfH+16). */\
1911
    uint8_t full[24*17];/* 17 rows at row stride 24 */\
1912
    uint8_t halfH[272];/* 17 rows at row stride 16; the last row is needed by both the vertical pass and halfH+16 */\
1913
    uint8_t halfHV[256];\
1914
    copy_block17(full, src, 24, stride, 17);/* copy_block17 is defined outside this view */\
1915
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1916
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);/* in-place on halfH, all 17 rows */\
1917
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1918
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1919
}\
1920
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static variant: like mc11_old, but the vertical plane starts at full+1 and pixels16_l4 receives full+25 and halfH+16. */\
1921
    uint8_t full[24*17];/* 17 rows at row stride 24 */\
1922
    uint8_t halfH[272];/* 17 rows at row stride 16 */\
1923
    uint8_t halfV[256];\
1924
    uint8_t halfHV[256];\
1925
    copy_block17(full, src, 24, stride, 17);/* copy_block17 is defined outside this view */\
1926
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1927
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);/* full+1: one byte further along each row */\
1928
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1929
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);/* full+25 = one row (24) down plus one byte along; halfH+16 = one row down */\
1930
}\
1931
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){/* Like the mc11 variant, but halfH is combined with full+1 and the final pixels16_l2 reads halfH+16. */\
1932
    uint8_t full[24*17];/* 17 rows at row stride 24 */\
1933
    uint8_t halfH[272];/* 17 rows at row stride 16 */\
1934
    uint8_t halfHV[256];\
1935
    copy_block17(full, src, 24, stride, 17);/* copy_block17 is defined outside this view */\
1936
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1937
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);/* in-place on halfH; full+1 is one byte further along each row */\
1938
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1939
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);/* halfH+16 skips one row of halfH */\
1940
}\
1941
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){/* Horizontally filters 17 rows of src into halfH, vertically filters halfH into halfHV, then OPNAME pixels16_l2 of the two gives dst. */\
1942
    uint8_t halfH[272];/* 17 rows at row stride 16, so the vertical pass can read rows 0..16 */\
1943
    uint8_t halfHV[256];\
1944
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);/* reads src directly; no staging copy in this variant */\
1945
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1946
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1947
}\
1948
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){/* Same as the mc21 variant except that the final pixels16_l2 reads halfH starting one row lower (halfH+16). */\
1949
    uint8_t halfH[272];/* 17 rows at row stride 16 */\
1950
    uint8_t halfHV[256];\
1951
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);/* reads src directly; no staging copy in this variant */\
1952
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1953
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1954
}\
1955
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static variant: builds V and HV filtered planes and hands them to OPNAME pixels16_l2 to produce dst. */\
1956
    uint8_t full[24*17];/* 17 rows at row stride 24 */\
1957
    uint8_t halfH[272];/* 17 rows at row stride 16; only used as input to the HV pass */\
1958
    uint8_t halfV[256];\
1959
    uint8_t halfHV[256];\
1960
    copy_block17(full, src, 24, stride, 17);/* copy_block17 is defined outside this view */\
1961
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1962
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1963
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1964
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1965
}\
1966
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){/* Builds halfH, overwrites it with pixels16_l2(halfH, full), then vertically filters it straight into dst through this instantiation's OP. */\
1967
    uint8_t full[24*17];/* 17 rows at row stride 24 */\
1968
    uint8_t halfH[272];/* 17 rows at row stride 16, so the vertical pass can read rows 0..16 */\
1969
    copy_block17(full, src, 24, stride, 17);/* copy_block17 is defined outside this view */\
1970
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1971
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);/* in-place on halfH, all 17 rows */\
1972
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1973
}\
1974
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static variant: like mc12_old, but the vertical plane is computed from full+1. */\
1975
    uint8_t full[24*17];/* 17 rows at row stride 24 */\
1976
    uint8_t halfH[272];/* 17 rows at row stride 16; only used as input to the HV pass */\
1977
    uint8_t halfV[256];\
1978
    uint8_t halfHV[256];\
1979
    copy_block17(full, src, 24, stride, 17);/* copy_block17 is defined outside this view */\
1980
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1981
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);/* full+1: one byte further along each row */\
1982
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1983
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1984
}\
1985
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){/* Like the mc12 variant, but halfH is combined with full+1 instead of full before the vertical pass. */\
1986
    uint8_t full[24*17];/* 17 rows at row stride 24 */\
1987
    uint8_t halfH[272];/* 17 rows at row stride 16 */\
1988
    copy_block17(full, src, 24, stride, 17);/* copy_block17 is defined outside this view */\
1989
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1990
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);/* in-place on halfH; full+1 is one byte further along each row */\
1991
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1992
}\
1993
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){/* Horizontal lowpass of 17 rows into halfH, then vertical lowpass of halfH straight into dst through this instantiation's OP. */\
1994
    uint8_t halfH[272];/* 17 rows at row stride 16, so the vertical pass can read rows 0..16 */\
1995
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1996
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1997
}/* no trailing backslash: this brace ends the multi-line macro body */
1998

    
1999
/*
 * Store operations handed to QPEL_MC as its last argument.  `a` is the
 * destination lvalue and `b` the raw filter sum.  Each macro adds a bias,
 * shifts right by 5 (the filter weights sum to 32) and looks the result up
 * through `cm`, a local that every expanding function must have in scope.
 *   op_put         bias 16, plain store
 *   op_put_no_rnd  bias 15, plain store
 *   op_avg         bias 16, then (a + value + 1) >> 1 with the old *a
 *   op_avg_no_rnd  bias 15, then (a + value) >> 1 with the old *a
 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2000
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2001
#define op_put(a, b) a = cm[((b) + 16)>>5]
2002
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2003

    
2004
/*
 * Instantiate the quarter-pel function families.  The second argument is
 * pasted in as OPNAME and the third as RND; the macro body builds its
 * temporaries with put ## RND ## ..., so the avg_ family calls helpers
 * generated by the put_ instantiation above it.
 * NOTE(review): the meaning of the first (0/1) argument is not visible in
 * this part of the file - check the QPEL_MC definition.
 */
QPEL_MC(0, put_       , _       , op_put)
2005
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2006
QPEL_MC(0, avg_       , _       , op_avg)
2007
/* Disabled instantiation; with it off, op_avg_no_rnd is not passed to any QPEL_MC call in this run. */
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2008
/* Drop the helper macros so the names can be redefined later in the file. */
#undef op_avg
2009
#undef op_avg_no_rnd
2010
#undef op_put
2011
#undef op_put_no_rnd
2012

    
2013
#if 1
2014
#define H264_LOWPASS(OPNAME, OP, OP2) \
2015
static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* Horizontal 6-tap lowpass (weights 1, -5, 20, 20, -5, 1) producing a 2x2 block; each row reads src[-2..4]. */\
2016
    const int h=2;/* rows processed */\
2017
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;/* not referenced by name here; presumably used by the OP macro supplied at instantiation (outside this view) */\
2018
    int i;\
2019
    for(i=0; i<h; i++)\
2020
    {\
2021
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));/* negative indices: the caller must provide readable bytes left of src */\
2022
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2023
        dst+=dstStride;\
2024
        src+=srcStride;\
2025
    }\
2026
}\
2027
\
2028
static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* Vertical 6-tap lowpass (weights 1, -5, 20, 20, -5, 1) producing a 2x2 block; each column reads rows -2..4. */\
2029
    const int w=2;/* columns processed */\
2030
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;/* not referenced by name here; presumably used by the OP macro supplied at instantiation (outside this view) */\
2031
    int i;\
2032
    for(i=0; i<w; i++)\
2033
    {\
2034
        const int srcB= src[-2*srcStride];/* srcB and srcA are the two rows above src */\
2035
        const int srcA= src[-1*srcStride];\
2036
        const int src0= src[0 *srcStride];\
2037
        const int src1= src[1 *srcStride];\
2038
        const int src2= src[2 *srcStride];\
2039
        const int src3= src[3 *srcStride];\
2040
        const int src4= src[4 *srcStride];\
2041
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2042
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2043
        dst++;/* next column */\
2044
        src++;\
2045
    }\
2046
}\
2047
\
2048
static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){/* Two-pass 2x2 filter: unscaled horizontal 6-tap sums go into tmp for h+5 rows, then a vertical 6-tap pass over tmp is stored through OP2. */\
2049
    const int h=2;\
2050
    const int w=2;\
2051
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;/* not referenced by name here; presumably used by the OP2 macro supplied at instantiation (outside this view) */\
2052
    int i;\
2053
    src -= 2*srcStride;/* start two rows above the block: the vertical pass needs rows -2..h+2 */\
2054
    for(i=0; i<h+5; i++)\
2055
    {\
2056
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);/* raw sums, no rounding or shift at this stage */\
2057
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2058
        tmp+=tmpStride;\
2059
        src+=srcStride;\
2060
    }\
2061
    tmp -= tmpStride*(h+5-2);/* rewind to the third stored row, i.e. the one matching output row 0 */\
2062
    for(i=0; i<w; i++)\
2063
    {\
2064
        const int tmpB= tmp[-2*tmpStride];\
2065
        const int tmpA= tmp[-1*tmpStride];\
2066
        const int tmp0= tmp[0 *tmpStride];\
2067
        const int tmp1= tmp[1 *tmpStride];\
2068
        const int tmp2= tmp[2 *tmpStride];\
2069
        const int tmp3= tmp[3 *tmpStride];\
2070
        const int tmp4= tmp[4 *tmpStride];\
2071
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2072
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2073
        dst++;/* next column */\
2074
        tmp++;\
2075
    }\
2076
}\
2077
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* Horizontal 6-tap lowpass (weights 1, -5, 20, 20, -5, 1) producing a 4x4 block; each row reads src[-2..6]. */\
2078
    const int h=4;/* rows processed */\
2079
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;/* not referenced by name here; presumably used by the OP macro supplied at instantiation (outside this view) */\
2080
    int i;\
2081
    for(i=0; i<h; i++)\
2082
    {\
2083
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));/* negative indices: the caller must provide readable bytes left of src */\
2084
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2085
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2086
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2087
        dst+=dstStride;\
2088
        src+=srcStride;\
2089
    }\
2090
}\
2091
\
2092
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* Vertical 6-tap lowpass (weights 1, -5, 20, 20, -5, 1) producing a 4x4 block; each column reads rows -2..6. */\
2093
    const int w=4;/* columns processed */\
2094
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;/* not referenced by name here; presumably used by the OP macro supplied at instantiation (outside this view) */\
2095
    int i;\
2096
    for(i=0; i<w; i++)\
2097
    {\
2098
        const int srcB= src[-2*srcStride];/* srcB and srcA are the two rows above src */\
2099
        const int srcA= src[-1*srcStride];\
2100
        const int src0= src[0 *srcStride];\
2101
        const int src1= src[1 *srcStride];\
2102
        const int src2= src[2 *srcStride];\
2103
        const int src3= src[3 *srcStride];\
2104
        const int src4= src[4 *srcStride];\
2105
        const int src5= src[5 *srcStride];\
2106
        const int src6= src[6 *srcStride];\
2107
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2108
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2109
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2110
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2111
        dst++;/* next column */\
2112
        src++;\
2113
    }\
2114
}\
2115
\
2116
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){/* Two-pass 4x4 filter: unscaled horizontal 6-tap sums go into tmp for h+5 rows, then a vertical 6-tap pass over tmp is stored through OP2. */\
2117
    const int h=4;\
2118
    const int w=4;\
2119
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;/* not referenced by name here; presumably used by the OP2 macro supplied at instantiation (outside this view) */\
2120
    int i;\
2121
    src -= 2*srcStride;/* start two rows above the block: the vertical pass needs rows -2..h+2 */\
2122
    for(i=0; i<h+5; i++)\
2123
    {\
2124
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);/* raw sums, no rounding or shift at this stage */\
2125
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2126
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2127
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2128
        tmp+=tmpStride;\
2129
        src+=srcStride;\
2130
    }\
2131
    tmp -= tmpStride*(h+5-2);/* rewind to the third stored row, i.e. the one matching output row 0 */\
2132
    for(i=0; i<w; i++)\
2133
    {\
2134
        const int tmpB= tmp[-2*tmpStride];\
2135
        const int tmpA= tmp[-1*tmpStride];\
2136
        const int tmp0= tmp[0 *tmpStride];\
2137
        const int tmp1= tmp[1 *tmpStride];\
2138
        const int tmp2= tmp[2 *tmpStride];\
2139
        const int tmp3= tmp[3 *tmpStride];\
2140
        const int tmp4= tmp[4 *tmpStride];\
2141
        const int tmp5= tmp[5 *tmpStride];\
2142
        const int tmp6= tmp[6 *tmpStride];\
2143
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2144
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2145
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2146
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2147
        dst++;/* next column */\
2148
        tmp++;\
2149
    }\
2150
}\
2151
\
2152
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* Horizontal 6-tap lowpass (weights 1, -5, 20, 20, -5, 1) producing an 8x8 block; each row reads src[-2..10]. */\
2153
    const int h=8;/* rows processed */\
2154
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;/* not referenced by name here; presumably used by the OP macro supplied at instantiation (outside this view) */\
2155
    int i;\
2156
    for(i=0; i<h; i++)\
2157
    {\
2158
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));/* negative indices: the caller must provide readable bytes left of src */\
2159
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2160
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2161
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2162
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2163
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2164
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2165
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2166
        dst+=dstStride;\
2167
        src+=srcStride;\
2168
    }\
2169
}\
2170
\
2171
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* Vertical 6-tap lowpass (weights 1, -5, 20, 20, -5, 1) producing an 8x8 block; each column reads rows -2..10. */\
2172
    const int w=8;/* columns processed */\
2173
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;/* not referenced by name here; presumably used by the OP macro supplied at instantiation (outside this view) */\
2174
    int i;\
2175
    for(i=0; i<w; i++)\
2176
    {\
2177
        const int srcB= src[-2*srcStride];/* srcB and srcA are the two rows above src */\
2178
        const int srcA= src[-1*srcStride];\
2179
        const int src0= src[0 *srcStride];\
2180
        const int src1= src[1 *srcStride];\
2181
        const int src2= src[2 *srcStride];\
2182
        const int src3= src[3 *srcStride];\
2183
        const int src4= src[4 *srcStride];\
2184
        const int src5= src[5 *srcStride];\
2185
        const int src6= src[6 *srcStride];\
2186
        const int src7= src[7 *srcStride];\
2187
        const int src8= src[8 *srcStride];\
2188
        const int src9= src[9 *srcStride];\
2189
        const int src10=src[10*srcStride];\
2190
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2191
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2192
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2193
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2194
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2195
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2196
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2197
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2198
        dst++;/* next column */\
2199
        src++;\
2200
    }\
2201
}\
2202
\
2203
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2204
    const int h=8;\
2205
    const int w=8;\
2206
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2207
    int i;\
2208
    src -= 2*srcStride;\
2209
    for(i=0; i<h+5; i++)\
2210
    {\
2211
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2212
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2213
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2214
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2215
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2216
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2217
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2218
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2219
        tmp+=tmpStride;\
2220
        src+=srcStride;\
2221
    }\
2222
    tmp -= tmpStride*(h+5-2);\
2223
    for(i=0; i<w; i++)\
2224
    {\
2225
        const int tmpB= tmp[-2*tmpStride];\
2226
        const int tmpA= tmp[-1*tmpStride];\
2227
        const int tmp0= tmp[0 *tmpStride];\
2228
        const int tmp1= tmp[1 *tmpStride];\
2229
        const int tmp2= tmp[2 *tmpStride];\
2230
        const int tmp3= tmp[3 *tmpStride];\
2231
        const int tmp4= tmp[4 *tmpStride];\
2232
        const int tmp5= tmp[5 *tmpStride];\
2233
        const int tmp6= tmp[6 *tmpStride];\
2234
        const int tmp7= tmp[7 *tmpStride];\
2235
        const int tmp8= tmp[8 *tmpStride];\
2236
        const int tmp9= tmp[9 *tmpStride];\
2237
        const int tmp10=tmp[10*tmpStride];\
2238
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2239
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2240
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2241
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2242
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2243
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2244
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2245
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2246
        dst++;\
2247
        tmp++;\
2248
    }\
2249
}\
2250
\
2251
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2252
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2253
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2254
    src += 8*srcStride;\
2255
    dst += 8*dstStride;\
2256
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2257
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2258
}\
2259
\
2260
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2261
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2262
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2263
    src += 8*srcStride;\
2264
    dst += 8*dstStride;\
2265
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2266
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2267
}\
2268
\
2269
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2270
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2271
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2272
    src += 8*srcStride;\
2273
    dst += 8*dstStride;\
2274
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2275
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2276
}\
2277

    
2278
/*
 * H264_MC(OPNAME, SIZE) emits the sixteen static entry points
 * OPNAME##h264_qpel##SIZE##_mcXY_c(dst, src, stride) for one block size.
 *
 * Intermediate planes are always produced with the put_ low-pass filters;
 * OPNAME only selects the final store into dst (the plain _c block
 * operation, the two-source _l2 blend, or a low-pass written directly
 * into dst).  Vertical filtering works on a private copy of SIZE+5 source
 * rows that starts two rows above src.
 */
#define H264_MC(OPNAME, SIZE) \
/* mc00: unfiltered block operation over SIZE rows. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* mc10: blend of src with the horizontal low-pass of src. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hfilt, stride, stride, SIZE, SIZE);\
}\
\
/* mc20: horizontal low-pass written straight into dst. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
/* mc30: blend of src+1 with the horizontal low-pass of src. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hfilt, stride, stride, SIZE, SIZE);\
}\
\
/* mc01: blend of the copied source rows with their vertical low-pass. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[(SIZE+5)*SIZE];\
    uint8_t *const rows_mid = rows + 2*SIZE;   /* row that corresponds to src */\
    uint8_t vfilt[SIZE*SIZE];\
\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, rows_mid, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc02: vertical low-pass written straight into dst. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[(SIZE+5)*SIZE];\
    uint8_t *const rows_mid = rows + 2*SIZE;\
\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, rows_mid, stride, SIZE);\
}\
\
/* mc03: like mc01, but blends with the copied rows one row further down. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[(SIZE+5)*SIZE];\
    uint8_t *const rows_mid = rows + 2*SIZE;\
    uint8_t vfilt[SIZE*SIZE];\
\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, rows_mid+SIZE, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc11: blend of h-lowpass(src) with v-lowpass of the column at src. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[(SIZE+5)*SIZE];\
    uint8_t *const rows_mid = rows + 2*SIZE;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc31: as mc11, with the vertical filter fed from the column at src+1. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[(SIZE+5)*SIZE];\
    uint8_t *const rows_mid = rows + 2*SIZE;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    copy_block ## SIZE (rows, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc13: as mc11, with the horizontal filter run one row below src. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[(SIZE+5)*SIZE];\
    uint8_t *const rows_mid = rows + 2*SIZE;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc33: horizontal filter one row below src, vertical filter at src+1. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[(SIZE+5)*SIZE];\
    uint8_t *const rows_mid = rows + 2*SIZE;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    copy_block ## SIZE (rows, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc22: combined h+v low-pass written straight into dst. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t scratch[(SIZE+5)*SIZE];   /* 16-bit first-pass output */\
\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, scratch, src, stride, SIZE, stride);\
}\
\
/* mc21: blend of h-lowpass(src) with the combined h+v low-pass. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t scratch[(SIZE+5)*SIZE];\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc23: as mc21, with the horizontal filter run one row below src. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t scratch[(SIZE+5)*SIZE];\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc12: blend of v-lowpass (column at src) with the combined h+v low-pass. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[(SIZE+5)*SIZE];\
    uint8_t *const rows_mid = rows + 2*SIZE;\
    int16_t scratch[(SIZE+5)*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc32: as mc12, with the vertical filter fed from the column at src+1. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[(SIZE+5)*SIZE];\
    uint8_t *const rows_mid = rows + 2*SIZE;\
    int16_t scratch[(SIZE+5)*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
\
    copy_block ## SIZE (rows, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}
2414

    
2415
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2416
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2417
#define op_put(a, b)  a = cm[((b) + 16)>>5]
2418
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2419
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2420

    
2421
H264_LOWPASS(put_       , op_put, op2_put)
2422
H264_LOWPASS(avg_       , op_avg, op2_avg)
2423
H264_MC(put_, 2)
2424
H264_MC(put_, 4)
2425
H264_MC(put_, 8)
2426
H264_MC(put_, 16)
2427
H264_MC(avg_, 4)
2428
H264_MC(avg_, 8)
2429
H264_MC(avg_, 16)
2430

    
2431
#undef op_avg
2432
#undef op_put
2433
#undef op2_avg
2434
#undef op2_put
2435
#endif
2436

    
2437
/*
 * Per-sample kernels used by H264_WEIGHT.  Both rely on locals of the
 * generated functions (block/weight, dst/src/weights/weightd, offset,
 * log2_denom) and clip the shifted result to 0..255 via av_clip_uint8().
 */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))

/*
 * H264_WEIGHT(W,H) generates two static functions for a W x H block:
 *
 *   weight_h264_pixels<W>x<H>_c(block, stride, log2_denom, weight, offset)
 *     rewrites block in place as
 *     clip((block*weight + (offset << log2_denom) + round) >> log2_denom),
 *     where round = 1 << (log2_denom-1) when log2_denom is non-zero.
 *
 *   biweight_h264_pixels<W>x<H>_c(dst, src, stride, log2_denom,
 *                                 weightd, weights, offset)
 *     rewrites dst as
 *     clip((src*weights + dst*weightd + (((offset+1)|1) << log2_denom))
 *          >> (log2_denom+1)).
 *
 * The pre-scaling of offset is performed on an unsigned value: offset may
 * be negative, and left-shifting a negative int is undefined behaviour.
 * The unsigned shift yields the same bit pattern on two's-complement
 * targets, so results are unchanged where the old code happened to work.
 *
 * W is a compile-time constant, so the "if(W==n) continue" tests simply cut
 * the unrolled row short for narrower blocks.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset = (int)((unsigned)offset << log2_denom); \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = (int)((unsigned)((offset + 1) | 1) << log2_denom); \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2506

    
2507
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2508
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2509
    int i;
2510

    
2511
    for(i=0; i<h; i++){
2512
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2513
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2514
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2515
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2516
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2517
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2518
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2519
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2520
        dst+=dstStride;
2521
        src+=srcStride;
2522
    }
2523
}
2524

    
2525
#ifdef CONFIG_CAVS_DECODER
2526
/* AVS specific */
2527
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2528

    
2529
/* CAVS mc00, 8 wide, put: forwards to put_pixels8_c() for 8 rows. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    put_pixels8_c(dst, src, stride, rows);
}
2532
/* CAVS mc00, 8 wide, avg: forwards to avg_pixels8_c() for 8 rows. */
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    avg_pixels8_c(dst, src, stride, rows);
}
2535
/* CAVS mc00, 16 wide, put: forwards to put_pixels16_c() for 16 rows. */
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;

    put_pixels16_c(dst, src, stride, rows);
}
2538
/* CAVS mc00, 16 wide, avg: forwards to avg_pixels16_c() for 16 rows. */
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;

    avg_pixels16_c(dst, src, stride, rows);
}
2541
#endif /* CONFIG_CAVS_DECODER */
2542

    
2543
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2544
/* VC-1 specific */
2545
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2546

    
2547
/*
 * VC-1 mc00: forwards to put_pixels8_c() for 8 rows.
 * The rnd argument is accepted but not used here.
 */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd)
{
    const int rows = 8;

    put_pixels8_c(dst, src, stride, rows);
}
2550
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2551

    
2552
#if defined(CONFIG_H264_ENCODER)
2553
/* H264 specific */
2554
void ff_h264dsp_init(DSPContext* c, AVCodecContext *avctx);
2555
#endif /* CONFIG_H264_ENCODER */
2556

    
2557
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2558
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2559
    int i;
2560

    
2561
    for(i=0; i<w; i++){
2562
        const int src_1= src[ -srcStride];
2563
        const int src0 = src[0          ];
2564
        const int src1 = src[  srcStride];
2565
        const int src2 = src[2*srcStride];
2566
        const int src3 = src[3*srcStride];
2567
        const int src4 = src[4*srcStride];
2568
        const int src5 = src[5*srcStride];
2569
        const int src6 = src[6*srcStride];
2570
        const int src7 = src[7*srcStride];
2571
        const int src8 = src[8*srcStride];
2572
        const int src9 = src[9*srcStride];
2573
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2574
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2575
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2576
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2577
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2578
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2579
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2580
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2581
        src++;
2582
        dst++;
2583
    }
2584
}
2585

    
2586
/* mspel mc00: forwards to put_pixels8_c() for 8 rows. */
static void put_mspel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    put_pixels8_c(dst, src, stride, rows);
}
2589

    
2590
/* mspel mc10: two-source blend of src with its horizontal low-pass (8x8). */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hfilt[8 * 8];

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src, hfilt, stride, stride, 8, 8);
}
2595

    
2596
/* mspel mc20: horizontal low-pass of 8 rows written directly into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
2599

    
2600
/* mspel mc30: two-source blend of src+1 with the horizontal low-pass of src. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hfilt[8 * 8];

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src + 1, hfilt, stride, stride, 8, 8);
}
2605

    
2606
/* mspel mc02: vertical low-pass of 8 columns written directly into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int cols = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, cols);
}
2609

    
2610
/*
 * mspel mc12: two-source blend of
 *   - the vertical low-pass of src, and
 *   - the vertical low-pass of the horizontally filtered rows.
 * The horizontal pass covers 11 rows starting one row above src, because
 * the vertical filter reads one row above and two rows below its window.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[11 * 8];   /* 11 horizontally filtered rows, 8 bytes each */
    uint8_t vfilt[8 * 8];
    uint8_t hvfilt[8 * 8];

    wmv2_mspel8_h_lowpass(hrows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vfilt, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvfilt, hrows + 8, 8, 8, 8);
    put_pixels8_l2(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
2619
/*
 * mspel mc32: two-source blend of
 *   - the vertical low-pass of src+1, and
 *   - the vertical low-pass of the horizontally filtered rows of src.
 * The horizontal pass covers 11 rows starting one row above src.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[11 * 8];   /* 11 horizontally filtered rows, 8 bytes each */
    uint8_t vfilt[8 * 8];
    uint8_t hvfilt[8 * 8];

    wmv2_mspel8_h_lowpass(hrows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vfilt, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvfilt, hrows + 8, 8, 8, 8);
    put_pixels8_l2(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
2628
/*
 * mspel mc22: horizontal low-pass into an 11-row scratch buffer (starting
 * one row above src), then vertical low-pass of that buffer into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[11 * 8];

    wmv2_mspel8_h_lowpass(hrows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hrows + 8, stride, 8, 8);
}
2633

    
2634
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2635
    int x;
2636
    const int strength= ff_h263_loop_filter_strength[qscale];
2637

    
2638
    for(x=0; x<8; x++){
2639
        int d1, d2, ad1;
2640
        int p0= src[x-2*stride];
2641
        int p1= src[x-1*stride];
2642
        int p2= src[x+0*stride];
2643
        int p3= src[x+1*stride];
2644
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2645

    
2646
        if     (d<-2*strength) d1= 0;
2647
        else if(d<-  strength) d1=-2*strength - d;
2648
        else if(d<   strength) d1= d;
2649
        else if(d< 2*strength) d1= 2*strength - d;
2650
        else                   d1= 0;
2651

    
2652
        p1 += d1;
2653
        p2 -= d1;
2654
        if(p1&256) p1= ~(p1>>31);
2655
        if(p2&256) p2= ~(p2>>31);
2656

    
2657
        src[x-1*stride] = p1;
2658
        src[x+0*stride] = p2;
2659

    
2660
        ad1= FFABS(d1)>>1;
2661

    
2662
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2663

    
2664
        src[x-2*stride] = p0 - d2;
2665
        src[x+  stride] = p3 + d2;
2666
    }
2667
}
2668

    
2669
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2670
    int y;
2671
    const int strength= ff_h263_loop_filter_strength[qscale];
2672

    
2673
    for(y=0; y<8; y++){
2674
        int d1, d2, ad1;
2675
        int p0= src[y*stride-2];
2676
        int p1= src[y*stride-1];
2677
        int p2= src[y*stride+0];
2678
        int p3= src[y*stride+1];
2679
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2680

    
2681
        if     (d<-2*strength) d1= 0;
2682
        else if(d<-  strength) d1=-2*strength - d;
2683
        else if(d<   strength) d1= d;
2684
        else if(d< 2*strength) d1= 2*strength - d;
2685
        else                   d1= 0;
2686

    
2687
        p1 += d1;
2688
        p2 -= d1;
2689
        if(p1&256) p1= ~(p1>>31);
2690
        if(p2&256) p2= ~(p2>>31);
2691

    
2692
        src[y*stride-1] = p1;
2693
        src[y*stride+0] = p2;
2694

    
2695
        ad1= FFABS(d1)>>1;
2696

    
2697
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2698

    
2699
        src[y*stride-2] = p0 - d2;
2700
        src[y*stride+1] = p3 + d2;
2701
    }
2702
}
2703

    
2704
/**
 * In-place separable (1, 2, 1) smoothing of an 8x8 block.
 *
 * Pass 1 (vertical) fills a 64-entry scratch plane: rows 1..6 get
 * above + 2*centre + below, rows 0 and 7 get 4*centre, so every entry has
 * the same gain of 4.  Pass 2 (horizontal) writes back to src: columns
 * 1..6 get (left + 2*centre + right + 8) >> 4, columns 0 and 7 get
 * (centre + 2) >> 2.
 *
 * @param src    top-left sample of the block
 * @param stride byte step between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride)
{
    int vert[64];
    int row, col;

    /* Vertical pass: src is only read here. */
    for (row = 0; row < 8; row++) {
        const uint8_t *line = src + row * stride;
        int *out = vert + row * 8;
        const int is_edge = (row == 0 || row == 7);

        for (col = 0; col < 8; col++) {
            if (is_edge)
                out[col] = 4 * line[col];
            else
                out[col] = line[col - stride] + 2 * line[col] + line[col + stride];
        }
    }

    /* Horizontal pass: reads only the scratch plane, writes src. */
    for (row = 0; row < 8; row++) {
        uint8_t *line = src + row * stride;
        const int *in = vert + row * 8;

        line[0] = (in[0] + 2) >> 2;
        line[7] = (in[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            line[col] = (in[col - 1] + 2 * in[col] + in[col + 1] + 8) >> 4;
    }
}
2730

    
2731
/**
 * H.264 luma edge filter (the tc0-driven variant) over 16 positions along
 * an edge, handled as 4 groups of 4 positions.
 *
 * For each position the samples p2,p1,p0 | q0,q1,q2 are taken at
 * pix[-3..2 * xstride]; consecutive positions are ystride bytes apart.
 * A group whose tc0 entry is negative is skipped.  A position is filtered
 * only when |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta.
 *
 * @param pix     first q0 sample on the edge
 * @param xstride byte step across the edge
 * @param ystride byte step along the edge
 * @param alpha   threshold on |p0-q0|
 * @param beta    threshold on the one-sided differences
 * @param tc0     four clipping values, one per group of 4 positions
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int grp, pos;

    for (grp = 0; grp < 4; grp++) {
        if (tc0[grp] < 0) {
            /* Group disabled: step over its four positions. */
            pix += 4 * ystride;
            continue;
        }
        for (pos = 0; pos < 4; pos++, pix += ystride) {
            const int p2 = pix[-3 * xstride];
            const int p1 = pix[-2 * xstride];
            const int p0 = pix[-1 * xstride];
            const int q0 = pix[0];
            const int q1 = pix[1 * xstride];
            const int q2 = pix[2 * xstride];
            int tc, delta, mid;

            if (FFABS(p0 - q0) >= alpha ||
                FFABS(p1 - p0) >= beta  ||
                FFABS(q1 - q0) >= beta)
                continue;

            tc  = tc0[grp];
            mid = (p0 + q0 + 1) >> 1;

            /* Each side that is smooth enough has its second sample
             * adjusted as well, and widens the p0/q0 clip range by one. */
            if (FFABS(p2 - p0) < beta) {
                pix[-2 * xstride] = p1 + av_clip(((p2 + mid) >> 1) - p1, -tc0[grp], tc0[grp]);
                tc++;
            }
            if (FFABS(q2 - q0) < beta) {
                pix[xstride] = q1 + av_clip(((q2 + mid) >> 1) - q1, -tc0[grp], tc0[grp]);
                tc++;
            }

            delta = av_clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);
            pix[-xstride] = av_clip_uint8(p0 + delta);    /* p0' */
            pix[0]        = av_clip_uint8(q0 - delta);    /* q0' */
        }
    }
}
2771
/*
 * Luma edge filter with p/q samples taken stride bytes apart and
 * successive edge positions 1 byte apart.
 */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_luma_c(pix, across, along, alpha, beta, tc0);
}
2775
/*
 * Luma edge filter with p/q samples taken 1 byte apart and successive
 * edge positions stride bytes apart.
 */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_luma_c(pix, across, along, alpha, beta, tc0);
}
2779

    
2780
/**
 * H.264 chroma edge filter (the tc0-driven variant) over 8 positions along
 * an edge, handled as 4 groups of 2 positions.
 *
 * For each position the samples p1,p0 | q0,q1 are taken at
 * pix[-2..1 * xstride]; consecutive positions are ystride bytes apart.
 * A group whose tc0 entry is <= 0 is skipped.  Only p0 and q0 are modified,
 * and only when |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta.
 *
 * @param pix     first q0 sample on the edge
 * @param xstride byte step across the edge
 * @param ystride byte step along the edge
 * @param alpha   threshold on |p0-q0|
 * @param beta    threshold on the one-sided differences
 * @param tc0     four clipping values, one per group of 2 positions
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int grp, pos;

    for (grp = 0; grp < 4; grp++) {
        const int tc = tc0[grp];

        if (tc <= 0) {
            /* Group disabled: step over its two positions. */
            pix += 2 * ystride;
            continue;
        }
        for (pos = 0; pos < 2; pos++, pix += ystride) {
            const int p1 = pix[-2 * xstride];
            const int p0 = pix[-1 * xstride];
            const int q0 = pix[0];
            const int q1 = pix[1 * xstride];
            int delta;

            if (FFABS(p0 - q0) >= alpha ||
                FFABS(p1 - p0) >= beta  ||
                FFABS(q1 - q0) >= beta)
                continue;

            delta = av_clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);

            pix[-xstride] = av_clip_uint8(p0 + delta);    /* p0' */
            pix[0]        = av_clip_uint8(q0 - delta);    /* q0' */
        }
    }
}
2808
/*
 * Chroma edge filter with p/q samples taken stride bytes apart and
 * successive edge positions 1 byte apart.
 */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_chroma_c(pix, across, along, alpha, beta, tc0);
}
2812
/*
 * Chroma edge filter with p/q samples taken 1 byte apart and successive
 * edge positions stride bytes apart.
 */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_chroma_c(pix, across, along, alpha, beta, tc0);
}
2816

    
2817
/**
 * H.264 chroma edge filter without tc0 clipping, over 8 positions along an
 * edge.
 *
 * For each position the samples p1,p0 | q0,q1 are taken at
 * pix[-2..1 * xstride]; consecutive positions are ystride bytes apart.
 * When |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta, p0 and q0 are
 * replaced by fixed weighted averages of the four samples.
 *
 * @param pix     first q0 sample on the edge
 * @param xstride byte step across the edge
 * @param ystride byte step along the edge
 * @param alpha   threshold on |p0-q0|
 * @param beta    threshold on the one-sided differences
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int pos;

    for (pos = 0; pos < 8; pos++, pix += ystride) {
        const int p1 = pix[-2 * xstride];
        const int p0 = pix[-1 * xstride];
        const int q0 = pix[0];
        const int q1 = pix[1 * xstride];

        if (FFABS(p0 - q0) >= alpha ||
            FFABS(p1 - p0) >= beta  ||
            FFABS(q1 - q0) >= beta)
            continue;

        pix[-xstride] = (2 * p1 + p0 + q1 + 2) >> 2;   /* p0' */
        pix[0]        = (2 * q1 + q0 + p1 + 2) >> 2;   /* q0' */
    }
}
2836
/*
 * Unclipped chroma edge filter with p/q samples taken stride bytes apart
 * and successive edge positions 1 byte apart.
 */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_chroma_intra_c(pix, across, along, alpha, beta);
}
2840
/*
 * Unclipped chroma edge filter with p/q samples taken 1 byte apart and
 * successive edge positions stride bytes apart.
 */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_chroma_intra_c(pix, across, along, alpha, beta);
}
2844

    
2845
/**
 * Sum of absolute differences between two 16-wide blocks.
 *
 * @param v         not used by this function
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size byte step between rows, shared by both blocks
 * @param h         number of rows
 * @return sum over all rows of |pix1[x] - pix2[x]| for x = 0..15
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2872

    
2873
/**
 * Sum of absolute differences between a 16-wide block and the avg2() of
 * each reference sample with its right-hand neighbour.
 *
 * @param v         not used by this function
 * @param pix1      block to compare
 * @param pix2      reference; 17 bytes per row are read
 * @param line_size byte step between rows, shared by both blocks
 * @param h         number of rows
 * @return sum of |pix1[x] - avg2(pix2[x], pix2[x+1])| for x = 0..15
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2900

    
2901
/**
 * Sum of absolute differences between a 16-wide block and the avg2() of
 * each reference sample with the sample one row below it.
 *
 * @param v         not used by this function
 * @param pix1      block to compare
 * @param pix2      reference; h+1 rows are read
 * @param line_size byte step between rows, shared by both blocks
 * @param h         number of rows
 * @return sum of |pix1[x] - avg2(pix2[x], pix2[x + line_size])|, x = 0..15
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2930

    
2931
/**
 * Sum of absolute differences between a 16-wide block and the avg4() of
 * each 2x2 neighbourhood of the reference (sample, right, below,
 * below-right).
 *
 * @param v         not used by this function
 * @param pix1      block to compare
 * @param pix2      reference; 17 bytes per row and h+1 rows are read
 * @param line_size byte step between rows, shared by both blocks
 * @param h         number of rows
 * @return sum of |pix1[x] - avg4(...)| for x = 0..15 over all rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2960

    
2961
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size byte distance between the starts of consecutive rows
 * @param h         number of rows to process
 * @return accumulated absolute difference (0 when h <= 0)
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur = pix1;
    const uint8_t *ref = pix2;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(cur[col] - ref[col]);
        cur += line_size;
        ref += line_size;
    }
    return total;
}
2980

    
2981
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * combined horizontally.
 *
 * Every pixel of pix1 is compared with avg2() of the reference pixel at the
 * same position and its right neighbour, so 9 columns of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block being scored
 * @param pix2      reference block
 * @param line_size byte distance between the starts of consecutive rows
 * @param h         number of rows to process
 * @return accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur = pix1;
    const uint8_t *ref = pix2;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(cur[col] - avg2(ref[col], ref[col + 1]));
        cur += line_size;
        ref += line_size;
    }
    return total;
}
3000

    
3001
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * combined vertically.
 *
 * Every pixel of pix1 is compared with avg2() of the reference pixel at the
 * same position and the pixel directly below it, so h+1 rows of pix2 are
 * read.
 *
 * @param v         unused
 * @param pix1      block being scored
 * @param pix2      reference block
 * @param line_size byte distance between the starts of consecutive rows
 * @param h         number of rows to process
 * @return accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur   = pix1;
    const uint8_t *upper = pix2;
    const uint8_t *lower = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(cur[col] - avg2(upper[col], lower[col]));
        cur   += line_size;
        upper += line_size;
        lower += line_size;
    }
    return total;
}
3022

    
3023
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * built from 2x2 neighbourhoods.
 *
 * Every pixel of pix1 is compared with avg4() of the reference pixel at the
 * same position, its right neighbour and the two pixels directly below
 * them, so 9 columns and h+1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block being scored
 * @param pix2      reference block
 * @param line_size byte distance between the starts of consecutive rows
 * @param h         number of rows to process
 * @return accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur   = pix1;
    const uint8_t *upper = pix2;
    const uint8_t *lower = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(cur[col] - avg4(upper[col], upper[col + 1], lower[col], lower[col + 1]));
        cur   += line_size;
        upper += line_size;
        lower += line_size;
    }
    return total;
}
3044

    
3045
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3046
    MpegEncContext *c = v;
3047
    int score1=0;
3048
    int score2=0;
3049
    int x,y;
3050

    
3051
    for(y=0; y<h; y++){
3052
        for(x=0; x<16; x++){
3053
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3054
        }
3055
        if(y+1<h){
3056
            for(x=0; x<15; x++){
3057
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3058
                             - s1[x+1] + s1[x+1+stride])
3059
                        -FFABS(  s2[x  ] - s2[x  +stride]
3060
                             - s2[x+1] + s2[x+1+stride]);
3061
            }
3062
        }
3063
        s1+= stride;
3064
        s2+= stride;
3065
    }
3066

    
3067
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3068
    else  return score1 + FFABS(score2)*8;
3069
}
3070

    
3071
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3072
    MpegEncContext *c = v;
3073
    int score1=0;
3074
    int score2=0;
3075
    int x,y;
3076

    
3077
    for(y=0; y<h; y++){
3078
        for(x=0; x<8; x++){
3079
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3080
        }
3081
        if(y+1<h){
3082
            for(x=0; x<7; x++){
3083
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3084
                             - s1[x+1] + s1[x+1+stride])
3085
                        -FFABS(  s2[x  ] - s2[x  +stride]
3086
                             - s2[x+1] + s2[x+1+stride]);
3087
            }
3088
        }
3089
        s1+= stride;
3090
        s2+= stride;
3091
    }
3092

    
3093
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3094
    else  return score1 + FFABS(score2)*8;
3095
}
3096

    
3097
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3098
    int i;
3099
    unsigned int sum=0;
3100

    
3101
    for(i=0; i<8*8; i++){
3102
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3103
        int w= weight[i];
3104
        b>>= RECON_SHIFT;
3105
        assert(-512<b && b<512);
3106

    
3107
        sum += (w*b)*(w*b)>>4;
3108
    }
3109
    return sum>>2;
3110
}
3111

    
3112
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3113
    int i;
3114

    
3115
    for(i=0; i<8*8; i++){
3116
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3117
    }
3118
}
3119

    
3120
/**
3121
 * permutes an 8x8 block.
3122
 * @param block the block which will be permuted according to the given permutation vector
3123
 * @param permutation the permutation vector
3124
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3125
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3126
 *                  (inverse) permutated to scantable order!
3127
 */
3128
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3129
{
3130
    int i;
3131
    DCTELEM temp[64];
3132

    
3133
    if(last<=0) return;
3134
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3135

    
3136
    for(i=0; i<=last; i++){
3137
        const int j= scantable[i];
3138
        temp[j]= block[j];
3139
        block[j]=0;
3140
    }
3141

    
3142
    for(i=0; i<=last; i++){
3143
        const int j= scantable[i];
3144
        const int perm_j= permutation[j];
3145
        block[perm_j]= temp[j];
3146
    }
3147
}
3148

    
3149
/**
 * Comparison function that ignores its arguments and always scores 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;
    return 0;
}
3152

    
3153
/**
 * Fills a table of five comparison functions according to a FF_CMP_* type.
 *
 * Only the low 8 bits of type select the function family. The five entries
 * of cmp are first cleared; on an unknown type an error is logged once and
 * the entries stay cleared.
 *
 * @param c    DSP context providing the per-family function tables
 * @param cmp  destination table, at least 5 entries
 * @param type comparison type (FF_CMP_*), upper bits are ignored here
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    me_cmp_func *table = NULL;
    int i;

    /* size the clear from the element type rather than assuming that a
       function pointer has the size of void* */
    memset(cmp, 0, sizeof(*cmp)*5);

    /* the selection does not depend on the entry index, so decide once */
    switch(type&0xFF){
    case FF_CMP_SAD:
        table= c->sad;
        break;
    case FF_CMP_SATD:
        table= c->hadamard8_diff;
        break;
    case FF_CMP_SSE:
        table= c->sse;
        break;
    case FF_CMP_DCT:
        table= c->dct_sad;
        break;
    case FF_CMP_DCT264:
        table= c->dct264_sad;
        break;
    case FF_CMP_DCTMAX:
        table= c->dct_max;
        break;
    case FF_CMP_PSNR:
        table= c->quant_psnr;
        break;
    case FF_CMP_BIT:
        table= c->bit;
        break;
    case FF_CMP_RD:
        table= c->rd;
        break;
    case FF_CMP_VSAD:
        table= c->vsad;
        break;
    case FF_CMP_VSSE:
        table= c->vsse;
        break;
    case FF_CMP_ZERO:
        /* no table: every entry is the constant-zero comparison */
        for(i=0; i<5; i++)
            cmp[i]= zero_cmp;
        return;
    case FF_CMP_NSSE:
        table= c->nsse;
        break;
#ifdef CONFIG_SNOW_ENCODER
    case FF_CMP_W53:
        table= c->w53;
        break;
    case FF_CMP_W97:
        table= c->w97;
        break;
#endif
    default:
        av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        return;
    }

    for(i=0; i<5; i++)
        cmp[i]= table[i];
}
3212

    
3213
/**
3214
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3215
 */
3216
static void clear_blocks_c(DCTELEM *blocks)
3217
{
3218
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
3219
}
3220

    
3221
/**
 * Adds src to dst byte by byte: dst[i] += src[i] for 0 <= i < w.
 * The sums wrap modulo 256 because they are stored as uint8_t.
 *
 * @param dst destination, updated in place
 * @param src bytes to add
 * @param w   number of bytes; nothing is done when w <= 0
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int pos = 0;

    while (pos < w) {
        dst[pos] += src[pos];
        pos++;
    }
}
3236

    
3237
/**
 * Byte-wise difference: dst[i] = src1[i] - src2[i] for 0 <= i < w.
 * The differences wrap modulo 256 because they are stored as uint8_t.
 *
 * @param dst  destination
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes; nothing is done when w <= 0
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int pos = 0;

    while (pos < w) {
        dst[pos] = src1[pos] - src2[pos];
        pos++;
    }
}
3252

    
3253
/**
 * Writes the residual of src2 against a median predictor into dst.
 *
 * For every position the prediction is mid_pred() of the left value, the
 * value above (src1[i]) and (left + above - above_left) & 0xFF. The running
 * left / above-left values are taken from *left and *left_top on entry and
 * written back on exit.
 *
 * @param dst      residual output, w bytes
 * @param src1     row supplying the "above" values
 * @param src2     row being predicted
 * @param w        number of bytes
 * @param left     in/out: value to the left of the next position
 * @param left_top in/out: value above-left of the next position
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left     = *left;
    uint8_t cur_left_top = *left_top;
    int pos;

    for (pos = 0; pos < w; pos++) {
        const uint8_t above = src1[pos];
        const int pred = mid_pred(cur_left, above, (cur_left + above - cur_left_top)&0xFF);

        cur_left_top = above;
        cur_left     = src2[pos];
        dst[pos]     = cur_left - pred;
    }

    *left     = cur_left;
    *left_top = cur_left_top;
}
3270

    
3271
/* o1 = i1 + i2 and o2 = i1 - i2.
   Expands to two separate statements, so it must not be used as the body of
   an unbraced if/for/while. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* In-place butterfly: x becomes x+y and y becomes x-y (old values).
   The expansion declares locals named a and b, so the arguments must not
   be expressions that mention identifiers with those names. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x+y| + |x-y|, i.e. a final butterfly stage folded into the summation. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3285

    
3286
/**
 * Sum of absolute 8x8 Hadamard-transformed differences between two blocks.
 *
 * The pixel difference src - dst is pushed through three butterfly stages
 * along every row and then along every column. The last column stage is
 * folded into the summation through BUTTERFLYA.
 *
 * @param s      unused
 * @param dst    first block
 * @param src    second block
 * @param stride byte distance between the starts of consecutive rows
 * @param h      must be 8
 * @return sum of the absolute transform coefficients
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    /* horizontal pass, one row per iteration */
    for(i=0; i<8; i++){
        int *const row = temp + 8*i;
        const uint8_t *const ps = src + stride*i;
        const uint8_t *const pd = dst + stride*i;

        BUTTERFLY2(row[0], row[1], ps[0]-pd[0], ps[1]-pd[1]);
        BUTTERFLY2(row[2], row[3], ps[2]-pd[2], ps[3]-pd[3]);
        BUTTERFLY2(row[4], row[5], ps[4]-pd[4], ps[5]-pd[5]);
        BUTTERFLY2(row[6], row[7], ps[6]-pd[6], ps[7]-pd[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass, one column per iteration */
    for(i=0; i<8; i++){
        int *const col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }
    return sum;
}
3337

    
3338
/**
 * Sum of the absolute 8x8 Hadamard transform coefficients of a single block,
 * with the mean term removed.
 *
 * The pixels of src go through three butterfly stages along every row and
 * then along every column. The last column stage is folded into the
 * summation through BUTTERFLYA. Finally |temp[0] + temp[32]| is subtracted
 * again so that the block mean does not contribute.
 *
 * @param s      unused
 * @param src    block to analyse
 * @param dummy  unused
 * @param stride byte distance between the starts of consecutive rows
 * @param h      must be 8
 * @return sum of the absolute transform coefficients minus the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    /* horizontal pass, one row per iteration */
    for(i=0; i<8; i++){
        int *const row = temp + 8*i;
        const uint8_t *const ps = src + stride*i;

        BUTTERFLY2(row[0], row[1], ps[0], ps[1]);
        BUTTERFLY2(row[2], row[3], ps[2], ps[3]);
        BUTTERFLY2(row[4], row[5], ps[4], ps[5]);
        BUTTERFLY2(row[6], row[7], ps[6], ps[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass, one column per iteration */
    for(i=0; i<8; i++){
        int *const col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3385

    
3386
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3387
    MpegEncContext * const s= (MpegEncContext *)c;
3388
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3389
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3390
    int sum=0, i;
3391

    
3392
    assert(h==8);
3393

    
3394
    s->dsp.diff_pixels(temp, src1, src2, stride);
3395
    s->dsp.fdct(temp);
3396

    
3397
    for(i=0; i<64; i++)
3398
        sum+= FFABS(temp[i]);
3399

    
3400
    return sum;
3401
}
3402

    
3403
#ifdef CONFIG_GPL
/* One 8-point integer transform pass. The caller defines SRC(x) to read
   input sample x and DST(x,v) to consume output value v for index x. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of the absolute coefficients obtained by applying the DCT8_1D
 * transform along rows and then columns to the difference of two 8x8 blocks.
 *
 * @param c      MpegEncContext whose dsp.diff_pixels is used
 * @param src1   first block
 * @param src2   second block
 * @param stride byte distance between the starts of consecutive rows
 * @param h      unused
 * @return sum of |coefficient| over all 64 coefficients
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    int16_t dct[8][8];
    int i;
    int sum=0;

    /* pass a pointer to the first element instead of the int16_t (*)[8]
       the bare array name decays to; the other callers in this file hand
       diff_pixels a flat DCTELEM pointer */
    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* row pass: transform every row in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* column pass: the outputs are not stored, only their magnitudes summed */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
3455

    
3456
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3457
    MpegEncContext * const s= (MpegEncContext *)c;
3458
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3459
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3460
    int sum=0, i;
3461

    
3462
    assert(h==8);
3463

    
3464
    s->dsp.diff_pixels(temp, src1, src2, stride);
3465
    s->dsp.fdct(temp);
3466

    
3467
    for(i=0; i<64; i++)
3468
        sum= FFMAX(sum, FFABS(temp[i]));
3469

    
3470
    return sum;
3471
}
3472

    
3473
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3474
    MpegEncContext * const s= (MpegEncContext *)c;
3475
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3476
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3477
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3478
    int sum=0, i;
3479

    
3480
    assert(h==8);
3481
    s->mb_intra=0;
3482

    
3483
    s->dsp.diff_pixels(temp, src1, src2, stride);
3484

    
3485
    memcpy(bak, temp, 64*sizeof(DCTELEM));
3486

    
3487
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3488
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
3489
    simple_idct(temp); //FIXME
3490

    
3491
    for(i=0; i<64; i++)
3492
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3493

    
3494
    return sum;
3495
}
3496

    
3497
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3498
    MpegEncContext * const s= (MpegEncContext *)c;
3499
    const uint8_t *scantable= s->intra_scantable.permutated;
3500
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3501
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3502
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3503
    uint8_t * const bak= (uint8_t*)aligned_bak;
3504
    int i, last, run, bits, level, distoration, start_i;
3505
    const int esc_length= s->ac_esc_length;
3506
    uint8_t * length;
3507
    uint8_t * last_length;
3508

    
3509
    assert(h==8);
3510

    
3511
    for(i=0; i<8; i++){
3512
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3513
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3514
    }
3515

    
3516
    s->dsp.diff_pixels(temp, src1, src2, stride);
3517

    
3518
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3519

    
3520
    bits=0;
3521

    
3522
    if (s->mb_intra) {
3523
        start_i = 1;
3524
        length     = s->intra_ac_vlc_length;
3525
        last_length= s->intra_ac_vlc_last_length;
3526
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3527
    } else {
3528
        start_i = 0;
3529
        length     = s->inter_ac_vlc_length;
3530
        last_length= s->inter_ac_vlc_last_length;
3531
    }
3532

    
3533
    if(last>=start_i){
3534
        run=0;
3535
        for(i=start_i; i<last; i++){
3536
            int j= scantable[i];
3537
            level= temp[j];
3538

    
3539
            if(level){
3540
                level+=64;
3541
                if((level&(~127)) == 0){
3542
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3543
                }else
3544
                    bits+= esc_length;
3545
                run=0;
3546
            }else
3547
                run++;
3548
        }
3549
        i= scantable[last];
3550

    
3551
        level= temp[i] + 64;
3552

    
3553
        assert(level - 64);
3554

    
3555
        if((level&(~127)) == 0){
3556
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3557
        }else
3558
            bits+= esc_length;
3559

    
3560
    }
3561

    
3562
    if(last>=0){
3563
        if(s->mb_intra)
3564
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
3565
        else
3566
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
3567
    }
3568

    
3569
    s->dsp.idct_add(bak, stride, temp);
3570

    
3571
    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3572

    
3573
    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3574
}
3575

    
3576
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3577
    MpegEncContext * const s= (MpegEncContext *)c;
3578
    const uint8_t *scantable= s->intra_scantable.permutated;
3579
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3580
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3581
    int i, last, run, bits, level, start_i;
3582
    const int esc_length= s->ac_esc_length;
3583
    uint8_t * length;
3584
    uint8_t * last_length;
3585

    
3586
    assert(h==8);
3587

    
3588
    s->dsp.diff_pixels(temp, src1, src2, stride);
3589

    
3590
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3591

    
3592
    bits=0;
3593

    
3594
    if (s->mb_intra) {
3595
        start_i = 1;
3596
        length     = s->intra_ac_vlc_length;
3597
        last_length= s->intra_ac_vlc_last_length;
3598
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3599
    } else {
3600
        start_i = 0;
3601
        length     = s->inter_ac_vlc_length;
3602
        last_length= s->inter_ac_vlc_last_length;
3603
    }
3604

    
3605
    if(last>=start_i){
3606
        run=0;
3607
        for(i=start_i; i<last; i++){
3608
            int j= scantable[i];
3609
            level= temp[j];
3610

    
3611
            if(level){
3612
                level+=64;
3613
                if((level&(~127)) == 0){
3614
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3615
                }else
3616
                    bits+= esc_length;
3617
                run=0;
3618
            }else
3619
                run++;
3620
        }
3621
        i= scantable[last];
3622

    
3623
        level= temp[i] + 64;
3624

    
3625
        assert(level - 64);
3626

    
3627
        if((level&(~127)) == 0){
3628
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3629
        }else
3630
            bits+= esc_length;
3631
    }
3632

    
3633
    return bits;
3634
}
3635

    
3636
/**
 * Sum of absolute differences between vertically adjacent pixels of a
 * 16-pixel-wide block.
 *
 * @param c      unused
 * @param s      block to analyse
 * @param dummy  unused
 * @param stride byte distance between the starts of consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 * @return accumulated absolute vertical difference
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    const uint8_t *cur = s;
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        const uint8_t *below = cur + stride;

        for (col = 0; col < 16; col++)
            total += FFABS(cur[col] - below[col]);
        cur = below;
    }

    return total;
}
3650

    
3651
/**
 * Sum of absolute vertical differences of the residual s1 - s2 over a
 * 16-pixel-wide block.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride byte distance between the starts of consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 * @return accumulated absolute vertical difference of the residual
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += FFABS(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3665

    
3666
/* Square of a; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))

/**
 * Sum of squared differences between vertically adjacent pixels of a
 * 16-pixel-wide block.
 *
 * @param c      unused
 * @param s      block to analyse
 * @param dummy  unused
 * @param stride byte distance between the starts of consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 * @return accumulated squared vertical difference
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    const uint8_t *cur = s;
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        const uint8_t *below = cur + stride;

        for (col = 0; col < 16; col++)
            total += SQ(cur[col] - below[col]);
        cur = below;
    }

    return total;
}
3681

    
3682
/**
 * Sum of squared vertical differences of the residual s1 - s2 over a
 * 16-pixel-wide block.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride byte distance between the starts of consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 * @return accumulated squared vertical difference of the residual
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += SQ(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3696

    
3697
/**
 * Sum of squared differences between an int8_t and an int16_t array.
 *
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements; 0 is returned when size <= 0
 * @return sum of (pix1[i] - pix2[i])^2
 */
static int ssd_int8_vs_int16_c(int8_t *pix1, int16_t *pix2, int size){
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];
        total += d * d;
    }
    return total;
}
3704

    
3705
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3706
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3707
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3708
#ifdef CONFIG_GPL
3709
WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3710
#endif
3711
WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3712
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3713
WARPER8_16_SQ(rd8x8_c, rd16_c)
3714
WARPER8_16_SQ(bit8x8_c, bit16_c)
3715

    
3716
/**
 * In-place element-wise product: dst[i] *= src[i] for 0 <= i < len.
 *
 * @param dst first factor and destination
 * @param src second factor
 * @param len number of elements
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    int k = 0;

    while (k < len) {
        dst[k] *= src[k];
        k++;
    }
}
3721

    
3722
/**
 * Element-wise product with the second factor read back to front:
 * dst[i] = src0[i] * src1[len-1-i] for 0 <= i < len.
 *
 * @param dst  destination
 * @param src0 first factor, read forwards
 * @param src1 second factor, read backwards
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;

    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[len - 1 - k];
}
3728

    
3729
/**
 * Strided multiply-accumulate:
 * dst[i*step] = src0[i] * src1[i] + src2[i] + src3 for 0 <= i < len.
 *
 * @param dst  destination, written every step elements
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 per-element addend
 * @param src3 constant addend
 * @param len  number of elements to compute
 * @param step distance in elements between consecutive outputs in dst
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int k = 0;

    while (k < len) {
        dst[k*step] = src0[k] * src1[k] + src2[k] + src3;
        k++;
    }
}
3734

    
3735
/**
 * Converts floats to int16_t by reinterpreting their bit patterns.
 *
 * The low 16 bits of each float's representation, minus 0x8000, become the
 * output sample as long as bits 16..19 are clear. Otherwise the value is
 * treated as out of range and saturated: patterns above 0x43c0ffff map to
 * the top of the int16_t range, other positive patterns to the bottom.
 * NOTE(review): this appears to expect input already biased into
 * [384.0, 386.0) (385.0 maps to 0) - confirm against the callers.
 *
 * @param dst destination samples
 * @param src source floats
 * @param len number of samples
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++) {
        int32_t bits;
        int_fast32_t tmp;

        /* fetch the representation with memcpy: reading a float object
           through an int32_t pointer breaks the aliasing rules and the old
           cast also dropped the const qualifier */
        memcpy(&bits, &src[i], sizeof(bits));
        tmp = bits;

        if(tmp & 0xf0000){
            /* -1 (saturate high) when above the range, 0 (saturate low) below */
            tmp = (0x43c0ffff - tmp)>>31;
            // is this faster on some gcc/cpu combinations?
//          if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//          else                 tmp = 0;
        }
        dst[i] = tmp - 0x8000;
    }
}
3748

    
3749
/* XXX: those functions should be suppressed ASAP when all IDCTs are
3750
 converted */
3751
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3752
{
3753
    j_rev_dct (block);
3754
    put_pixels_clamped_c(block, dest, line_size);
3755
}
3756
/**
 * Runs j_rev_dct() on block, then hands the result to
 * add_pixels_clamped_c() to be combined with dest.
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
3761

    
3762
/**
 * Runs j_rev_dct4() on block, then hands the result to
 * put_pixels_clamped4_c() to be written to dest.
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4(block);
    put_pixels_clamped4_c(block, dest, line_size);
}
3767
/**
 * Runs j_rev_dct4() on block, then hands the result to
 * add_pixels_clamped4_c() to be combined with dest.
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4(block);
    add_pixels_clamped4_c(block, dest, line_size);
}
3772

    
3773
/**
 * Runs j_rev_dct2() on block, then hands the result to
 * put_pixels_clamped2_c() to be written to dest.
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2(block);
    put_pixels_clamped2_c(block, dest, line_size);
}
3778
/**
 * Runs j_rev_dct2() on block, then hands the result to
 * add_pixels_clamped2_c() to be combined with dest.
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2(block);
    add_pixels_clamped2_c(block, dest, line_size);
}
3783

    
3784
/**
 * Single-coefficient variant: writes one pixel computed from block[0].
 *
 * The value (block[0] + 4) >> 3 is looked up in ff_cropTbl, offset by
 * MAX_NEG_CROP, and stored in dest[0]. line_size is unused.
 */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    const int value = (block[0] + 4)>>3;

    dest[0] = ff_cropTbl[MAX_NEG_CROP + value];
}
3790
/**
 * Single-coefficient variant: adds a value computed from block[0] to one
 * pixel.
 *
 * dest[0] + ((block[0] + 4) >> 3) is looked up in ff_cropTbl, offset by
 * MAX_NEG_CROP, and stored back in dest[0]. line_size is unused.
 */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    const int value = dest[0] + ((block[0] + 4)>>3);

    dest[0] = ff_cropTbl[MAX_NEG_CROP + value];
}
3796

    
3797
/* Does nothing. The unprototyped () parameter list is kept on purpose;
   NOTE(review): presumably so the function can fill callback slots that
   take arguments - confirm against its users before changing it to (void). */
static void just_return() { }
3798

    
3799
/* init static data */
3800
void dsputil_static_init(void)
3801
{
3802
    int i;
3803

    
3804
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
3805
    for(i=0;i<MAX_NEG_CROP;i++) {
3806
        ff_cropTbl[i] = 0;
3807
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
3808
    }
3809

    
3810
    for(i=0;i<512;i++) {
3811
        ff_squareTbl[i] = (i - 256) * (i - 256);
3812
    }
3813

    
3814
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3815
}
3816

    
3817
int ff_check_alignment(void){
3818
    static int did_fail=0;
3819
    DECLARE_ALIGNED_16(int, aligned);
3820

    
3821
    if((int)&aligned & 15){
3822
        if(!did_fail){
3823
#if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
3824
            av_log(NULL, AV_LOG_ERROR,
3825
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
3826
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
3827
                "but in the compiler. Do not report crashes to FFmpeg developers.\n");
3828
#endif
3829
            did_fail=1;
3830
        }
3831
        return -1;
3832
    }
3833
    return 0;
3834
}
3835

    
3836
/**
 * Fill a DSPContext with function pointers.
 *
 * The portable C implementations are installed first. The context is then
 * handed to every architecture-specific initialiser that is compiled in,
 * empty 2-tap qpel slots fall back to the H.264 qpel entries, and finally
 * the IDCT coefficient permutation table is generated from
 * c->idct_permutation_type.
 *
 * @param c     context to populate
 * @param avctx codec context; dct_algo (encoders only), idct_algo and
 *              lowres are read here, and it is forwarded to the sub-inits
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

#ifdef CONFIG_ENCODERS
    /* forward DCT selection */
    if (avctx->dct_algo == FF_DCT_FASTINT) {
        c->fdct    = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    } else if (avctx->dct_algo == FF_DCT_FAAN) {
        c->fdct    = ff_faandct;
        c->fdct248 = ff_faandct248;
    } else {
        /* slow/accurate/default */
        c->fdct    = ff_jpeg_fdct_islow;
        c->fdct248 = ff_fdct248_islow;
    }
#endif /* CONFIG_ENCODERS */

    /* inverse DCT selection, keyed on the lowres setting first */
    switch (avctx->lowres) {
    case 1:
        if (avctx->idct_algo != FF_IDCT_INT && avctx->idct_algo != FF_IDCT_AUTO) {
            c->idct_put = ff_h264_lowres_idct_put_c;
            c->idct_add = ff_h264_lowres_idct_add_c;
        } else {
            c->idct_put = ff_jref_idct4_put;
            c->idct_add = ff_jref_idct4_add;
        }
        c->idct                  = j_rev_dct4;
        c->idct_permutation_type = FF_NO_IDCT_PERM;
        break;
    case 2:
        c->idct_put              = ff_jref_idct2_put;
        c->idct_add              = ff_jref_idct2_add;
        c->idct                  = j_rev_dct2;
        c->idct_permutation_type = FF_NO_IDCT_PERM;
        break;
    case 3:
        c->idct_put              = ff_jref_idct1_put;
        c->idct_add              = ff_jref_idct1_add;
        c->idct                  = j_rev_dct1;
        c->idct_permutation_type = FF_NO_IDCT_PERM;
        break;
    default:
        if (avctx->idct_algo == FF_IDCT_INT) {
            c->idct_put              = ff_jref_idct_put;
            c->idct_add              = ff_jref_idct_add;
            c->idct                  = j_rev_dct;
            c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
        } else if (avctx->idct_algo == FF_IDCT_VP3) {
            c->idct_put              = ff_vp3_idct_put_c;
            c->idct_add              = ff_vp3_idct_add_c;
            c->idct                  = ff_vp3_idct_c;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        } else {
            /* accurate/default */
            c->idct_put              = simple_idct_put;
            c->idct_add              = simple_idct_add;
            c->idct                  = simple_idct;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        }
        break;
    }

    /* H.264 transforms */
    c->h264_idct_add     = ff_h264_idct_add_c;
    c->h264_idct8_add    = ff_h264_idct8_add_c;
    c->h264_idct_dc_add  = ff_h264_idct_dc_add_c;
    c->h264_idct8_dc_add = ff_h264_idct8_dc_add_c;

    /* basic pixel helpers */
    c->get_pixels                = get_pixels_c;
    c->diff_pixels               = diff_pixels_c;
    c->put_pixels_clamped        = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped        = add_pixels_clamped_c;
    c->add_pixels8               = add_pixels8_c;
    c->add_pixels4               = add_pixels4_c;
    c->gmc1                      = gmc1_c;
    c->gmc                       = ff_gmc_c;
    c->clear_blocks              = clear_blocks_c;
    c->pix_sum                   = pix_sum_c;
    c->pix_norm1                 = pix_norm1_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* Fills one row of a <prefix>_pixels_tab with the plain, x2, y2 and xy2
     * variants of the given block size. */
#define SET_HPEL_TAB(PREFIX, SLOT, SIZE) \
    c->PREFIX ## _pixels_tab[SLOT][0] = PREFIX ## _pixels ## SIZE ## _c;     \
    c->PREFIX ## _pixels_tab[SLOT][1] = PREFIX ## _pixels ## SIZE ## _x2_c;  \
    c->PREFIX ## _pixels_tab[SLOT][2] = PREFIX ## _pixels ## SIZE ## _y2_c;  \
    c->PREFIX ## _pixels_tab[SLOT][3] = PREFIX ## _pixels ## SIZE ## _xy2_c

    SET_HPEL_TAB(put, 0, 16);
    SET_HPEL_TAB(put_no_rnd, 0, 16);
    SET_HPEL_TAB(put, 1, 8);
    SET_HPEL_TAB(put_no_rnd, 1, 8);
    SET_HPEL_TAB(put, 2, 4);
    SET_HPEL_TAB(put, 3, 2);

    SET_HPEL_TAB(avg, 0, 16);
    SET_HPEL_TAB(avg_no_rnd, 0, 16);
    SET_HPEL_TAB(avg, 1, 8);
    SET_HPEL_TAB(avg_no_rnd, 1, 8);
    SET_HPEL_TAB(avg, 2, 4);
    SET_HPEL_TAB(avg, 3, 2);
#undef SET_HPEL_TAB

    c->put_no_rnd_pixels_l2[0] = put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1] = put_no_rnd_pixels8_l2_c;

    /* tpel tables: only indices 0-2, 4-6 and 8-10 are assigned */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* Fills the 16 entries of one qpel table row; entry index is
     * x + 4*y for the _mc<x><y>_c function. */
#define SET_QPEL_TAB(PREFIX, SLOT, SIZE) \
    c->PREFIX ## _pixels_tab[SLOT][ 0] = PREFIX ## SIZE ## _mc00_c; \
    c->PREFIX ## _pixels_tab[SLOT][ 1] = PREFIX ## SIZE ## _mc10_c; \
    c->PREFIX ## _pixels_tab[SLOT][ 2] = PREFIX ## SIZE ## _mc20_c; \
    c->PREFIX ## _pixels_tab[SLOT][ 3] = PREFIX ## SIZE ## _mc30_c; \
    c->PREFIX ## _pixels_tab[SLOT][ 4] = PREFIX ## SIZE ## _mc01_c; \
    c->PREFIX ## _pixels_tab[SLOT][ 5] = PREFIX ## SIZE ## _mc11_c; \
    c->PREFIX ## _pixels_tab[SLOT][ 6] = PREFIX ## SIZE ## _mc21_c; \
    c->PREFIX ## _pixels_tab[SLOT][ 7] = PREFIX ## SIZE ## _mc31_c; \
    c->PREFIX ## _pixels_tab[SLOT][ 8] = PREFIX ## SIZE ## _mc02_c; \
    c->PREFIX ## _pixels_tab[SLOT][ 9] = PREFIX ## SIZE ## _mc12_c; \
    c->PREFIX ## _pixels_tab[SLOT][10] = PREFIX ## SIZE ## _mc22_c; \
    c->PREFIX ## _pixels_tab[SLOT][11] = PREFIX ## SIZE ## _mc32_c; \
    c->PREFIX ## _pixels_tab[SLOT][12] = PREFIX ## SIZE ## _mc03_c; \
    c->PREFIX ## _pixels_tab[SLOT][13] = PREFIX ## SIZE ## _mc13_c; \
    c->PREFIX ## _pixels_tab[SLOT][14] = PREFIX ## SIZE ## _mc23_c; \
    c->PREFIX ## _pixels_tab[SLOT][15] = PREFIX ## SIZE ## _mc33_c

    SET_QPEL_TAB(put_qpel, 0, 16);
    SET_QPEL_TAB(put_no_rnd_qpel, 0, 16);

    SET_QPEL_TAB(avg_qpel, 0, 16);
    /* avg_no_rnd_qpel, slot 0 (16): intentionally not installed */

    SET_QPEL_TAB(put_qpel, 1, 8);
    SET_QPEL_TAB(put_no_rnd_qpel, 1, 8);

    SET_QPEL_TAB(avg_qpel, 1, 8);
    /* avg_no_rnd_qpel, slot 1 (8): intentionally not installed */

    SET_QPEL_TAB(put_h264_qpel, 0, 16);
    SET_QPEL_TAB(put_h264_qpel, 1, 8);
    SET_QPEL_TAB(put_h264_qpel, 2, 4);
    SET_QPEL_TAB(put_h264_qpel, 3, 2);
    SET_QPEL_TAB(avg_h264_qpel, 0, 16);
    SET_QPEL_TAB(avg_h264_qpel, 1, 8);
    SET_QPEL_TAB(avg_h264_qpel, 2, 4);

#undef SET_QPEL_TAB

    /* H.264 chroma motion compensation */
    c->put_h264_chroma_pixels_tab[0]        = put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]        = put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]        = put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]        = avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]        = avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]        = avg_h264_chroma_mc2_c;
    c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_c;

    /* H.264 weighted prediction, one entry per block shape */
    c->weight_h264_pixels_tab[0]   = weight_h264_pixels16x16_c;
    c->weight_h264_pixels_tab[1]   = weight_h264_pixels16x8_c;
    c->weight_h264_pixels_tab[2]   = weight_h264_pixels8x16_c;
    c->weight_h264_pixels_tab[3]   = weight_h264_pixels8x8_c;
    c->weight_h264_pixels_tab[4]   = weight_h264_pixels8x4_c;
    c->weight_h264_pixels_tab[5]   = weight_h264_pixels4x8_c;
    c->weight_h264_pixels_tab[6]   = weight_h264_pixels4x4_c;
    c->weight_h264_pixels_tab[7]   = weight_h264_pixels4x2_c;
    c->weight_h264_pixels_tab[8]   = weight_h264_pixels2x4_c;
    c->weight_h264_pixels_tab[9]   = weight_h264_pixels2x2_c;
    c->biweight_h264_pixels_tab[0] = biweight_h264_pixels16x16_c;
    c->biweight_h264_pixels_tab[1] = biweight_h264_pixels16x8_c;
    c->biweight_h264_pixels_tab[2] = biweight_h264_pixels8x16_c;
    c->biweight_h264_pixels_tab[3] = biweight_h264_pixels8x8_c;
    c->biweight_h264_pixels_tab[4] = biweight_h264_pixels8x4_c;
    c->biweight_h264_pixels_tab[5] = biweight_h264_pixels4x8_c;
    c->biweight_h264_pixels_tab[6] = biweight_h264_pixels4x4_c;
    c->biweight_h264_pixels_tab[7] = biweight_h264_pixels4x2_c;
    c->biweight_h264_pixels_tab[8] = biweight_h264_pixels2x4_c;
    c->biweight_h264_pixels_tab[9] = biweight_h264_pixels2x2_c;

    /* codec-specific sub-initialisers */
#ifdef CONFIG_CAVS_DECODER
    ff_cavsdsp_init(c, avctx);
#endif
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_vc1dsp_init(c, avctx);
#endif
#if defined(CONFIG_H264_ENCODER)
    ff_h264dsp_init(c, avctx);
#endif

    c->put_mspel_pixels_tab[0] = put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1] = put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2] = put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3] = put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4] = put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5] = put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6] = put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7] = put_mspel8_mc32_c;

    /* Installs the 16 and 8x8 variants of a comparison function; the macro
     * already ends in ';', so uses below carry no trailing semicolon. */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4] = hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#ifdef CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0] = pix_abs16_c;
    c->sad[1] = pix_abs8_c;
    c->sse[0] = sse16_c;
    c->sse[1] = sse8_c;
    c->sse[2] = sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0] = vsad16_c;
    c->vsad[4] = vsad_intra16_c;
    c->vsse[0] = vsse16_c;
    c->vsse[4] = vsse_intra16_c;
    c->nsse[0] = nsse16_c;
    c->nsse[1] = nsse8_c;
#ifdef CONFIG_SNOW_ENCODER
    c->w53[0] = w53_16_c;
    c->w53[1] = w53_8_c;
    c->w97[0] = w97_16_c;
    c->w97[1] = w97_8_c;
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    /* byte-buffer helpers */
    c->add_bytes                  = add_bytes_c;
    c->diff_bytes                 = diff_bytes_c;
    c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_c;
    c->bswap_buf                  = bswap_buf;

    /* loop filters */
    c->h264_v_loop_filter_luma         = h264_v_loop_filter_luma_c;
    c->h264_h_loop_filter_luma         = h264_h_loop_filter_luma_c;
    c->h264_v_loop_filter_chroma       = h264_v_loop_filter_chroma_c;
    c->h264_h_loop_filter_chroma       = h264_h_loop_filter_chroma_c;
    c->h264_v_loop_filter_chroma_intra = h264_v_loop_filter_chroma_intra_c;
    c->h264_h_loop_filter_chroma_intra = h264_h_loop_filter_chroma_intra_c;
    c->h264_loop_filter_strength       = NULL;

    c->h263_h_loop_filter = h263_h_loop_filter_c;
    c->h263_v_loop_filter = h263_v_loop_filter_c;

    c->h261_loop_filter = h261_loop_filter_c;

    c->try_8x8basis = try_8x8basis_c;
    c->add_8x8basis = add_8x8basis_c;

#ifdef CONFIG_SNOW_DECODER
    c->vertical_compose97i   = ff_snow_vertical_compose97i;
    c->horizontal_compose97i = ff_snow_horizontal_compose97i;
    c->inner_add_yblock      = ff_snow_inner_add_yblock;
#endif

#ifdef CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
    c->vector_fmul         = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
    c->float_to_int16      = ff_float_to_int16_c;

    c->shrink[0] = ff_img_copy_plane;
    c->shrink[1] = ff_shrink22;
    c->shrink[2] = ff_shrink44;
    c->shrink[3] = ff_shrink88;

    c->prefetch = just_return;

    /* Start with empty 2-tap tables so that, after the arch-specific
     * initialisers below have run, unset slots can be detected and filled. */
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    /* architecture-specific initialisers, in this fixed order */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_SPARC
    dsputil_init_vis(c, avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif
#ifdef ARCH_SH4
    dsputil_init_sh4(c, avctx);
#endif
#ifdef ARCH_BFIN
    dsputil_init_bfin(c, avctx);
#endif

    /* Any 2-tap slot still empty falls back to the matching H.264 qpel
     * entry; both tables are walked as 64 consecutive entries via row 0. */
    for (i = 0; i < 64; i++) {
        if (c->put_2tap_qpel_pixels_tab[0][i] == NULL)
            c->put_2tap_qpel_pixels_tab[0][i] = c->put_h264_qpel_pixels_tab[0][i];
        if (c->avg_2tap_qpel_pixels_tab[0][i] == NULL)
            c->avg_2tap_qpel_pixels_tab[0][i] = c->avg_h264_qpel_pixels_tab[0][i];
    }

    /* Build the coefficient permutation for the selected IDCT. */
    switch (c->idct_permutation_type) {
    case FF_NO_IDCT_PERM:
        /* identity */
        for (i = 0; i < 64; i++) {
            c->idct_permutation[i] = i;
        }
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        /* keep the row bits, rotate the three column bits */
        for (i = 0; i < 64; i++) {
            c->idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        }
        break;
    case FF_SIMPLE_IDCT_PERM:
        /* table-driven */
        for (i = 0; i < 64; i++) {
            c->idct_permutation[i] = simple_mmx_permutation[i];
        }
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        /* swap the low and high 3-bit fields of the index */
        for (i = 0; i < 64; i++) {
            c->idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
        }
        break;
    case FF_PARTTRANS_IDCT_PERM:
        /* bits 2 and 5 stay put, the two low bits of each field swap */
        for (i = 0; i < 64; i++) {
            c->idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
        }
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
        break;
    }
}
4194