Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 36cd3069

History | View | Annotate | Download (149 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
12
 *
13
 * FFmpeg is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with FFmpeg; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
 *
22
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
23
 */
24

    
25
/**
26
 * @file dsputil.c
27
 * DSP utils
28
 */
29

    
30
#include "avcodec.h"
31
#include "dsputil.h"
32
#include "mpegvideo.h"
33
#include "simple_idct.h"
34
#include "faandct.h"
35
#include "snow.h"
36

    
37
/* snow.c */
38
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
39

    
40
/* vorbis.c */
41
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
42

    
43
/* Global lookup tables, zero-filled here; presumably populated at init
 * time elsewhere in this file (init code not visible in this chunk --
 * NOTE(review): confirm against the dsputil init function). */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t squareTbl[512] = {0, };
45

    
46
/* Classic 8x8 zig-zag scan: entry n is the raster-order index
 * (row*8 + col) of the n-th coefficient visited. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
56

    
57
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields.
   Same convention as ff_zigzag_direct: entry n is the raster index of
   the n-th coefficient scanned. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
69

    
70
/* not permutated inverse zigzag_direct + 1 for MMX quantizer;
   zero-filled here and presumably built at init time (initialization
   not visible in this chunk -- NOTE(review): confirm). */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
72

    
73
/* Alternate scan order favoring horizontal runs; entry n is the raster
 * index (row*8 + col) of the n-th coefficient scanned. Presumably used
 * for alternate-scan coding modes -- usage not visible in this chunk. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
83

    
84
/* Alternate scan order favoring vertical runs; entry n is the raster
 * index (row*8 + col) of the n-th coefficient scanned. Presumably used
 * for interlaced/alternate-scan coding modes -- usage not visible in
 * this chunk. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
94

    
95
/* Reciprocal table for division-free integer division: a/b is computed
 * as (a * ff_inverse[b]) >> 32, valid per the invariant below. */
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
130

    
131
/* Input permutation for the simple_idct_mmx.
 * Entries are 6-bit coefficient indices (0x00..0x3F) written in hex. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
142

    
143
/**
 * Sum all pixel values of a 16x16 block.
 *
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += pix[col];
        pix += line_size;  /* advance one row */
    }
    return sum;
}
164

    
165
static int pix_norm1_c(uint8_t * pix, int line_size)
166
{
167
    int s, i, j;
168
    uint32_t *sq = squareTbl + 256;
169

    
170
    s = 0;
171
    for (i = 0; i < 16; i++) {
172
        for (j = 0; j < 16; j += 8) {
173
#if 0
174
            s += sq[pix[0]];
175
            s += sq[pix[1]];
176
            s += sq[pix[2]];
177
            s += sq[pix[3]];
178
            s += sq[pix[4]];
179
            s += sq[pix[5]];
180
            s += sq[pix[6]];
181
            s += sq[pix[7]];
182
#else
183
#if LONG_MAX > 2147483647
184
            register uint64_t x=*(uint64_t*)pix;
185
            s += sq[x&0xff];
186
            s += sq[(x>>8)&0xff];
187
            s += sq[(x>>16)&0xff];
188
            s += sq[(x>>24)&0xff];
189
            s += sq[(x>>32)&0xff];
190
            s += sq[(x>>40)&0xff];
191
            s += sq[(x>>48)&0xff];
192
            s += sq[(x>>56)&0xff];
193
#else
194
            register uint32_t x=*(uint32_t*)pix;
195
            s += sq[x&0xff];
196
            s += sq[(x>>8)&0xff];
197
            s += sq[(x>>16)&0xff];
198
            s += sq[(x>>24)&0xff];
199
            x=*(uint32_t*)(pix+4);
200
            s += sq[x&0xff];
201
            s += sq[(x>>8)&0xff];
202
            s += sq[(x>>16)&0xff];
203
            s += sq[(x>>24)&0xff];
204
#endif
205
#endif
206
            pix += 8;
207
        }
208
        pix += line_size - 16;
209
    }
210
    return s;
211
}
212

    
213
/**
 * Byte-swap w 32-bit words from src into dst.
 * Each element is read once then written, so dst may alias src.
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for (i = 0; i < w; i++)
        dst[i] = bswap_32(src[i]);
}
230

    
231
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
232
{
233
    int s, i;
234
    uint32_t *sq = squareTbl + 256;
235

    
236
    s = 0;
237
    for (i = 0; i < h; i++) {
238
        s += sq[pix1[0] - pix2[0]];
239
        s += sq[pix1[1] - pix2[1]];
240
        s += sq[pix1[2] - pix2[2]];
241
        s += sq[pix1[3] - pix2[3]];
242
        pix1 += line_size;
243
        pix2 += line_size;
244
    }
245
    return s;
246
}
247

    
248
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
249
{
250
    int s, i;
251
    uint32_t *sq = squareTbl + 256;
252

    
253
    s = 0;
254
    for (i = 0; i < h; i++) {
255
        s += sq[pix1[0] - pix2[0]];
256
        s += sq[pix1[1] - pix2[1]];
257
        s += sq[pix1[2] - pix2[2]];
258
        s += sq[pix1[3] - pix2[3]];
259
        s += sq[pix1[4] - pix2[4]];
260
        s += sq[pix1[5] - pix2[5]];
261
        s += sq[pix1[6] - pix2[6]];
262
        s += sq[pix1[7] - pix2[7]];
263
        pix1 += line_size;
264
        pix2 += line_size;
265
    }
266
    return s;
267
}
268

    
269
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
270
{
271
    int s, i;
272
    uint32_t *sq = squareTbl + 256;
273

    
274
    s = 0;
275
    for (i = 0; i < h; i++) {
276
        s += sq[pix1[ 0] - pix2[ 0]];
277
        s += sq[pix1[ 1] - pix2[ 1]];
278
        s += sq[pix1[ 2] - pix2[ 2]];
279
        s += sq[pix1[ 3] - pix2[ 3]];
280
        s += sq[pix1[ 4] - pix2[ 4]];
281
        s += sq[pix1[ 5] - pix2[ 5]];
282
        s += sq[pix1[ 6] - pix2[ 6]];
283
        s += sq[pix1[ 7] - pix2[ 7]];
284
        s += sq[pix1[ 8] - pix2[ 8]];
285
        s += sq[pix1[ 9] - pix2[ 9]];
286
        s += sq[pix1[10] - pix2[10]];
287
        s += sq[pix1[11] - pix2[11]];
288
        s += sq[pix1[12] - pix2[12]];
289
        s += sq[pix1[13] - pix2[13]];
290
        s += sq[pix1[14] - pix2[14]];
291
        s += sq[pix1[15] - pix2[15]];
292

    
293
        pix1 += line_size;
294
        pix2 += line_size;
295
    }
296
    return s;
297
}
298

    
299

    
300
#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
301
/**
 * Wavelet-domain distortion metric for the snow encoder: takes the
 * difference pix1 - pix2, runs ff_spatial_dwt() on it, then returns a
 * weighted sum of absolute transform coefficients.
 *
 * @param v         unused context pointer (common cmp-function signature)
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size byte stride of both inputs
 * @param w         block width: 8, 16 or 32 (must equal h, asserted below)
 * @param h         block height
 * @param type      wavelet selector passed to ff_spatial_dwt and into the
 *                  scale table: 0 = 9/7, 1 = 5/3
 * @return weighted sum of absolute wavelet coefficients, >>9
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;  /* decomposition levels: 3 for 8x8, 4 for 16/32 */
    int tmp[32*32];                     /* difference block; stride is always 32 */
    int level, ori;
    /* per-subband weights, indexed as scale[type][dec_count-3][level][ori] */
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    /* store the scaled (<<4) difference into the 32-wide temp buffer */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    /* in-place forward spatial DWT (implemented in snow.c) */
    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    /* accumulate |coefficient| * weight over every subband; orientation 0
       only exists at the coarsest level */
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);   /* subband edge length */
            int sx= (ori&1) ? size : 0;       /* horizontal offset into tmp */
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;  /* vertical offset (in tmp indices) */

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}
369

    
370
/* Fixed-width adapters for w_c(). The sixth argument is the block
 * width, the last selects the wavelet: 0 = 9/7 (w97_*), 1 = 5/3 (w53_*). */

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
393
#endif
394

    
395
/**
 * Copy an 8x8 pixel block into a DCT coefficient block.
 *
 * @param block     destination, 64 contiguous DCTELEMs
 * @param pixels    source, top-left of the 8x8 block
 * @param line_size byte stride between source rows
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block  += 8;
    }
}
413

    
414
/**
 * Store the element-wise difference s1 - s2 of two 8x8 pixel blocks
 * into a DCT coefficient block.
 *
 * @param block  destination, 64 contiguous DCTELEMs
 * @param s1     first source block
 * @param s2     second source block
 * @param stride byte stride of both sources
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
433

    
434

    
435
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
436
                                 int line_size)
437
{
438
    int i;
439
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
440

    
441
    /* read the pixels */
442
    for(i=0;i<8;i++) {
443
        pixels[0] = cm[block[0]];
444
        pixels[1] = cm[block[1]];
445
        pixels[2] = cm[block[2]];
446
        pixels[3] = cm[block[3]];
447
        pixels[4] = cm[block[4]];
448
        pixels[5] = cm[block[5]];
449
        pixels[6] = cm[block[6]];
450
        pixels[7] = cm[block[7]];
451

    
452
        pixels += line_size;
453
        block += 8;
454
    }
455
}
456

    
457
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
458
                                 int line_size)
459
{
460
    int i;
461
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
462

    
463
    /* read the pixels */
464
    for(i=0;i<4;i++) {
465
        pixels[0] = cm[block[0]];
466
        pixels[1] = cm[block[1]];
467
        pixels[2] = cm[block[2]];
468
        pixels[3] = cm[block[3]];
469

    
470
        pixels += line_size;
471
        block += 8;
472
    }
473
}
474

    
475
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
476
                                 int line_size)
477
{
478
    int i;
479
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
480

    
481
    /* read the pixels */
482
    for(i=0;i<2;i++) {
483
        pixels[0] = cm[block[0]];
484
        pixels[1] = cm[block[1]];
485

    
486
        pixels += line_size;
487
        block += 8;
488
    }
489
}
490

    
491
/**
 * Write an 8x8 block of signed coefficients to pixels, mapping each
 * value v to clamp(v + 128, 0, 255): values below -128 become 0, values
 * above 127 become 255.
 *
 * @param block     source, 64 contiguous DCTELEMs
 * @param pixels    destination, top-left of the 8x8 block
 * @param line_size byte stride between destination rows
 */
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            const int v = block[col];
            pixels[col] = v < -128 ? 0 : v > 127 ? 255 : (uint8_t)(v + 128);
        }
        block  += 8;
        pixels += line_size;
    }
}
511

    
512
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
513
                          int line_size)
514
{
515
    int i;
516
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
517

    
518
    /* read the pixels */
519
    for(i=0;i<8;i++) {
520
        pixels[0] = cm[pixels[0] + block[0]];
521
        pixels[1] = cm[pixels[1] + block[1]];
522
        pixels[2] = cm[pixels[2] + block[2]];
523
        pixels[3] = cm[pixels[3] + block[3]];
524
        pixels[4] = cm[pixels[4] + block[4]];
525
        pixels[5] = cm[pixels[5] + block[5]];
526
        pixels[6] = cm[pixels[6] + block[6]];
527
        pixels[7] = cm[pixels[7] + block[7]];
528
        pixels += line_size;
529
        block += 8;
530
    }
531
}
532

    
533
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
534
                          int line_size)
535
{
536
    int i;
537
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
538

    
539
    /* read the pixels */
540
    for(i=0;i<4;i++) {
541
        pixels[0] = cm[pixels[0] + block[0]];
542
        pixels[1] = cm[pixels[1] + block[1]];
543
        pixels[2] = cm[pixels[2] + block[2]];
544
        pixels[3] = cm[pixels[3] + block[3]];
545
        pixels += line_size;
546
        block += 8;
547
    }
548
}
549

    
550
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
551
                          int line_size)
552
{
553
    int i;
554
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
555

    
556
    /* read the pixels */
557
    for(i=0;i<2;i++) {
558
        pixels[0] = cm[pixels[0] + block[0]];
559
        pixels[1] = cm[pixels[1] + block[1]];
560
        pixels += line_size;
561
        block += 8;
562
    }
563
}
564

    
565
/**
 * Add an 8x8 coefficient block to the pixels without clamping
 * (uint8_t arithmetic wraps modulo 256).
 *
 * @param pixels    destination (read-modify-write), top-left of the block
 * @param block     source, 64 contiguous DCTELEMs
 * @param line_size byte stride between destination rows
 */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 8;
    }
}
581

    
582
/**
 * Add a 4x4 coefficient block to the pixels without clamping
 * (uint8_t arithmetic wraps modulo 256). Unlike the *_clamped4 variant,
 * block advances by 4 per row: coefficients are packed 4-wide here.
 *
 * @param pixels    destination (read-modify-write), top-left of the block
 * @param block     source, 16 contiguous DCTELEMs
 * @param line_size byte stride between destination rows
 */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 4;
    }
}
594

    
595
#if 0
596

597
#define PIXOP2(OPNAME, OP) \
598
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
599
{\
600
    int i;\
601
    for(i=0; i<h; i++){\
602
        OP(*((uint64_t*)block), LD64(pixels));\
603
        pixels+=line_size;\
604
        block +=line_size;\
605
    }\
606
}\
607
\
608
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
609
{\
610
    int i;\
611
    for(i=0; i<h; i++){\
612
        const uint64_t a= LD64(pixels  );\
613
        const uint64_t b= LD64(pixels+1);\
614
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
615
        pixels+=line_size;\
616
        block +=line_size;\
617
    }\
618
}\
619
\
620
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
621
{\
622
    int i;\
623
    for(i=0; i<h; i++){\
624
        const uint64_t a= LD64(pixels  );\
625
        const uint64_t b= LD64(pixels+1);\
626
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
627
        pixels+=line_size;\
628
        block +=line_size;\
629
    }\
630
}\
631
\
632
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
633
{\
634
    int i;\
635
    for(i=0; i<h; i++){\
636
        const uint64_t a= LD64(pixels          );\
637
        const uint64_t b= LD64(pixels+line_size);\
638
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
639
        pixels+=line_size;\
640
        block +=line_size;\
641
    }\
642
}\
643
\
644
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
645
{\
646
    int i;\
647
    for(i=0; i<h; i++){\
648
        const uint64_t a= LD64(pixels          );\
649
        const uint64_t b= LD64(pixels+line_size);\
650
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
651
        pixels+=line_size;\
652
        block +=line_size;\
653
    }\
654
}\
655
\
656
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
657
{\
658
        int i;\
659
        const uint64_t a= LD64(pixels  );\
660
        const uint64_t b= LD64(pixels+1);\
661
        uint64_t l0=  (a&0x0303030303030303ULL)\
662
                    + (b&0x0303030303030303ULL)\
663
                    + 0x0202020202020202ULL;\
664
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
665
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
666
        uint64_t l1,h1;\
667
\
668
        pixels+=line_size;\
669
        for(i=0; i<h; i+=2){\
670
            uint64_t a= LD64(pixels  );\
671
            uint64_t b= LD64(pixels+1);\
672
            l1=  (a&0x0303030303030303ULL)\
673
               + (b&0x0303030303030303ULL);\
674
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
675
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
676
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
677
            pixels+=line_size;\
678
            block +=line_size;\
679
            a= LD64(pixels  );\
680
            b= LD64(pixels+1);\
681
            l0=  (a&0x0303030303030303ULL)\
682
               + (b&0x0303030303030303ULL)\
683
               + 0x0202020202020202ULL;\
684
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
685
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
686
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
687
            pixels+=line_size;\
688
            block +=line_size;\
689
        }\
690
}\
691
\
692
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
693
{\
694
        int i;\
695
        const uint64_t a= LD64(pixels  );\
696
        const uint64_t b= LD64(pixels+1);\
697
        uint64_t l0=  (a&0x0303030303030303ULL)\
698
                    + (b&0x0303030303030303ULL)\
699
                    + 0x0101010101010101ULL;\
700
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
701
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
702
        uint64_t l1,h1;\
703
\
704
        pixels+=line_size;\
705
        for(i=0; i<h; i+=2){\
706
            uint64_t a= LD64(pixels  );\
707
            uint64_t b= LD64(pixels+1);\
708
            l1=  (a&0x0303030303030303ULL)\
709
               + (b&0x0303030303030303ULL);\
710
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
711
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
712
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
713
            pixels+=line_size;\
714
            block +=line_size;\
715
            a= LD64(pixels  );\
716
            b= LD64(pixels+1);\
717
            l0=  (a&0x0303030303030303ULL)\
718
               + (b&0x0303030303030303ULL)\
719
               + 0x0101010101010101ULL;\
720
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
721
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
722
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
723
            pixels+=line_size;\
724
            block +=line_size;\
725
        }\
726
}\
727
\
728
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
729
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
730
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
731
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
732
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
733
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
734
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
735

736
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
737
#else // 64 bit variant
738

    
739
#define PIXOP2(OPNAME, OP) \
740
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
741
    int i;\
742
    for(i=0; i<h; i++){\
743
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
744
        pixels+=line_size;\
745
        block +=line_size;\
746
    }\
747
}\
748
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
749
    int i;\
750
    for(i=0; i<h; i++){\
751
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
752
        pixels+=line_size;\
753
        block +=line_size;\
754
    }\
755
}\
756
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
757
    int i;\
758
    for(i=0; i<h; i++){\
759
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
760
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
761
        pixels+=line_size;\
762
        block +=line_size;\
763
    }\
764
}\
765
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
766
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
767
}\
768
\
769
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
770
                                                int src_stride1, int src_stride2, int h){\
771
    int i;\
772
    for(i=0; i<h; i++){\
773
        uint32_t a,b;\
774
        a= LD32(&src1[i*src_stride1  ]);\
775
        b= LD32(&src2[i*src_stride2  ]);\
776
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
777
        a= LD32(&src1[i*src_stride1+4]);\
778
        b= LD32(&src2[i*src_stride2+4]);\
779
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
780
    }\
781
}\
782
\
783
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
784
                                                int src_stride1, int src_stride2, int h){\
785
    int i;\
786
    for(i=0; i<h; i++){\
787
        uint32_t a,b;\
788
        a= LD32(&src1[i*src_stride1  ]);\
789
        b= LD32(&src2[i*src_stride2  ]);\
790
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
791
        a= LD32(&src1[i*src_stride1+4]);\
792
        b= LD32(&src2[i*src_stride2+4]);\
793
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
794
    }\
795
}\
796
\
797
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
798
                                                int src_stride1, int src_stride2, int h){\
799
    int i;\
800
    for(i=0; i<h; i++){\
801
        uint32_t a,b;\
802
        a= LD32(&src1[i*src_stride1  ]);\
803
        b= LD32(&src2[i*src_stride2  ]);\
804
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
805
    }\
806
}\
807
\
808
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
809
                                                int src_stride1, int src_stride2, int h){\
810
    int i;\
811
    for(i=0; i<h; i++){\
812
        uint32_t a,b;\
813
        a= LD16(&src1[i*src_stride1  ]);\
814
        b= LD16(&src2[i*src_stride2  ]);\
815
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
816
    }\
817
}\
818
\
819
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
820
                                                int src_stride1, int src_stride2, int h){\
821
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
822
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
823
}\
824
\
825
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
826
                                                int src_stride1, int src_stride2, int h){\
827
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
828
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
829
}\
830
\
831
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
832
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
833
}\
834
\
835
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
836
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
837
}\
838
\
839
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
840
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
841
}\
842
\
843
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
844
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
845
}\
846
\
847
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
848
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
849
    int i;\
850
    for(i=0; i<h; i++){\
851
        uint32_t a, b, c, d, l0, l1, h0, h1;\
852
        a= LD32(&src1[i*src_stride1]);\
853
        b= LD32(&src2[i*src_stride2]);\
854
        c= LD32(&src3[i*src_stride3]);\
855
        d= LD32(&src4[i*src_stride4]);\
856
        l0=  (a&0x03030303UL)\
857
           + (b&0x03030303UL)\
858
           + 0x02020202UL;\
859
        h0= ((a&0xFCFCFCFCUL)>>2)\
860
          + ((b&0xFCFCFCFCUL)>>2);\
861
        l1=  (c&0x03030303UL)\
862
           + (d&0x03030303UL);\
863
        h1= ((c&0xFCFCFCFCUL)>>2)\
864
          + ((d&0xFCFCFCFCUL)>>2);\
865
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
866
        a= LD32(&src1[i*src_stride1+4]);\
867
        b= LD32(&src2[i*src_stride2+4]);\
868
        c= LD32(&src3[i*src_stride3+4]);\
869
        d= LD32(&src4[i*src_stride4+4]);\
870
        l0=  (a&0x03030303UL)\
871
           + (b&0x03030303UL)\
872
           + 0x02020202UL;\
873
        h0= ((a&0xFCFCFCFCUL)>>2)\
874
          + ((b&0xFCFCFCFCUL)>>2);\
875
        l1=  (c&0x03030303UL)\
876
           + (d&0x03030303UL);\
877
        h1= ((c&0xFCFCFCFCUL)>>2)\
878
          + ((d&0xFCFCFCFCUL)>>2);\
879
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
880
    }\
881
}\
882
\
883
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
884
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
885
}\
886
\
887
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
888
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
889
}\
890
\
891
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
892
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
893
}\
894
\
895
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
896
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
897
}\
898
\
899
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
900
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
901
    int i;\
902
    for(i=0; i<h; i++){\
903
        uint32_t a, b, c, d, l0, l1, h0, h1;\
904
        a= LD32(&src1[i*src_stride1]);\
905
        b= LD32(&src2[i*src_stride2]);\
906
        c= LD32(&src3[i*src_stride3]);\
907
        d= LD32(&src4[i*src_stride4]);\
908
        l0=  (a&0x03030303UL)\
909
           + (b&0x03030303UL)\
910
           + 0x01010101UL;\
911
        h0= ((a&0xFCFCFCFCUL)>>2)\
912
          + ((b&0xFCFCFCFCUL)>>2);\
913
        l1=  (c&0x03030303UL)\
914
           + (d&0x03030303UL);\
915
        h1= ((c&0xFCFCFCFCUL)>>2)\
916
          + ((d&0xFCFCFCFCUL)>>2);\
917
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
918
        a= LD32(&src1[i*src_stride1+4]);\
919
        b= LD32(&src2[i*src_stride2+4]);\
920
        c= LD32(&src3[i*src_stride3+4]);\
921
        d= LD32(&src4[i*src_stride4+4]);\
922
        l0=  (a&0x03030303UL)\
923
           + (b&0x03030303UL)\
924
           + 0x01010101UL;\
925
        h0= ((a&0xFCFCFCFCUL)>>2)\
926
          + ((b&0xFCFCFCFCUL)>>2);\
927
        l1=  (c&0x03030303UL)\
928
           + (d&0x03030303UL);\
929
        h1= ((c&0xFCFCFCFCUL)>>2)\
930
          + ((d&0xFCFCFCFCUL)>>2);\
931
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
932
    }\
933
}\
934
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
935
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
936
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
937
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
938
}\
939
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
940
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
941
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
942
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
943
}\
944
\
945
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
946
{\
947
        int i, a0, b0, a1, b1;\
948
        a0= pixels[0];\
949
        b0= pixels[1] + 2;\
950
        a0 += b0;\
951
        b0 += pixels[2];\
952
\
953
        pixels+=line_size;\
954
        for(i=0; i<h; i+=2){\
955
            a1= pixels[0];\
956
            b1= pixels[1];\
957
            a1 += b1;\
958
            b1 += pixels[2];\
959
\
960
            block[0]= (a1+a0)>>2; /* FIXME non put */\
961
            block[1]= (b1+b0)>>2;\
962
\
963
            pixels+=line_size;\
964
            block +=line_size;\
965
\
966
            a0= pixels[0];\
967
            b0= pixels[1] + 2;\
968
            a0 += b0;\
969
            b0 += pixels[2];\
970
\
971
            block[0]= (a1+a0)>>2;\
972
            block[1]= (b1+b0)>>2;\
973
            pixels+=line_size;\
974
            block +=line_size;\
975
        }\
976
}\
977
\
978
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
979
{\
980
        int i;\
981
        const uint32_t a= LD32(pixels  );\
982
        const uint32_t b= LD32(pixels+1);\
983
        uint32_t l0=  (a&0x03030303UL)\
984
                    + (b&0x03030303UL)\
985
                    + 0x02020202UL;\
986
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
987
                   + ((b&0xFCFCFCFCUL)>>2);\
988
        uint32_t l1,h1;\
989
\
990
        pixels+=line_size;\
991
        for(i=0; i<h; i+=2){\
992
            uint32_t a= LD32(pixels  );\
993
            uint32_t b= LD32(pixels+1);\
994
            l1=  (a&0x03030303UL)\
995
               + (b&0x03030303UL);\
996
            h1= ((a&0xFCFCFCFCUL)>>2)\
997
              + ((b&0xFCFCFCFCUL)>>2);\
998
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
999
            pixels+=line_size;\
1000
            block +=line_size;\
1001
            a= LD32(pixels  );\
1002
            b= LD32(pixels+1);\
1003
            l0=  (a&0x03030303UL)\
1004
               + (b&0x03030303UL)\
1005
               + 0x02020202UL;\
1006
            h0= ((a&0xFCFCFCFCUL)>>2)\
1007
              + ((b&0xFCFCFCFCUL)>>2);\
1008
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1009
            pixels+=line_size;\
1010
            block +=line_size;\
1011
        }\
1012
}\
1013
\
1014
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1015
{\
1016
    int j;\
1017
    for(j=0; j<2; j++){\
1018
        int i;\
1019
        const uint32_t a= LD32(pixels  );\
1020
        const uint32_t b= LD32(pixels+1);\
1021
        uint32_t l0=  (a&0x03030303UL)\
1022
                    + (b&0x03030303UL)\
1023
                    + 0x02020202UL;\
1024
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1025
                   + ((b&0xFCFCFCFCUL)>>2);\
1026
        uint32_t l1,h1;\
1027
\
1028
        pixels+=line_size;\
1029
        for(i=0; i<h; i+=2){\
1030
            uint32_t a= LD32(pixels  );\
1031
            uint32_t b= LD32(pixels+1);\
1032
            l1=  (a&0x03030303UL)\
1033
               + (b&0x03030303UL);\
1034
            h1= ((a&0xFCFCFCFCUL)>>2)\
1035
              + ((b&0xFCFCFCFCUL)>>2);\
1036
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1037
            pixels+=line_size;\
1038
            block +=line_size;\
1039
            a= LD32(pixels  );\
1040
            b= LD32(pixels+1);\
1041
            l0=  (a&0x03030303UL)\
1042
               + (b&0x03030303UL)\
1043
               + 0x02020202UL;\
1044
            h0= ((a&0xFCFCFCFCUL)>>2)\
1045
              + ((b&0xFCFCFCFCUL)>>2);\
1046
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1047
            pixels+=line_size;\
1048
            block +=line_size;\
1049
        }\
1050
        pixels+=4-line_size*(h+1);\
1051
        block +=4-line_size*h;\
1052
    }\
1053
}\
1054
\
1055
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1056
{\
1057
    int j;\
1058
    for(j=0; j<2; j++){\
1059
        int i;\
1060
        const uint32_t a= LD32(pixels  );\
1061
        const uint32_t b= LD32(pixels+1);\
1062
        uint32_t l0=  (a&0x03030303UL)\
1063
                    + (b&0x03030303UL)\
1064
                    + 0x01010101UL;\
1065
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1066
                   + ((b&0xFCFCFCFCUL)>>2);\
1067
        uint32_t l1,h1;\
1068
\
1069
        pixels+=line_size;\
1070
        for(i=0; i<h; i+=2){\
1071
            uint32_t a= LD32(pixels  );\
1072
            uint32_t b= LD32(pixels+1);\
1073
            l1=  (a&0x03030303UL)\
1074
               + (b&0x03030303UL);\
1075
            h1= ((a&0xFCFCFCFCUL)>>2)\
1076
              + ((b&0xFCFCFCFCUL)>>2);\
1077
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1078
            pixels+=line_size;\
1079
            block +=line_size;\
1080
            a= LD32(pixels  );\
1081
            b= LD32(pixels+1);\
1082
            l0=  (a&0x03030303UL)\
1083
               + (b&0x03030303UL)\
1084
               + 0x01010101UL;\
1085
            h0= ((a&0xFCFCFCFCUL)>>2)\
1086
              + ((b&0xFCFCFCFCUL)>>2);\
1087
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1088
            pixels+=line_size;\
1089
            block +=line_size;\
1090
        }\
1091
        pixels+=4-line_size*(h+1);\
1092
        block +=4-line_size*h;\
1093
    }\
1094
}\
1095
\
1096
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1097
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1098
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1099
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1100
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1101
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1102
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1103
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1104

    
1105
#define op_avg(a, b) a = rnd_avg32(a, b)
1106
#endif
1107
#define op_put(a, b) a = b
1108

    
1109
PIXOP2(avg, op_avg)
1110
PIXOP2(put, op_put)
1111
#undef op_avg
1112
#undef op_put
1113

    
1114
#define avg2(a,b) ((a+b+1)>>1)
1115
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1116

    
1117
/* Adapter: 4-argument entry point for the 6-argument no-rounding 16-wide
 * two-source average helper, with all three strides equal. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1120

    
1121
/* Adapter: 4-argument entry point for the 6-argument no-rounding 8-wide
 * two-source average helper, with all three strides equal. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1124

    
1125
/*
 * One-warp-point global motion compensation for an 8-pixel-wide block.
 * (x16, y16) is the fractional position in 1/16 pel; each output pixel is the
 * bilinear blend of the 2x2 source neighbourhood, with 8.8 fixed-point
 * weights A..D that sum to 256, plus "rounder" before the >>8.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int row, col;

    for (row = 0; row < h; row++, dst += stride, src += stride) {
        for (col = 0; col < 8; col++)
            dst[col] = (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1] + rounder) >> 8;
    }
}
1147

    
1148
/**
 * Global motion compensation, C reference implementation.
 *
 * Fills an 8-pixel-wide, h-row destination block by sampling src along an
 * affine trajectory: the sampling position advances by (dxx, dyx) per output
 * column and (dxy, dyy) per output row, starting from (ox, oy).
 *
 * Positions are fixed point: vx>>16 yields the position in subpel
 * (1/(1<<shift) pel) units, so ox/oy and the four deltas are in
 * 1/(1<<(16+shift)) pel units.  r is the bilinear rounding constant added
 * before the final >>(shift*2).  width/height are the source dimensions in
 * pels; samples outside the source are clamped to the nearest edge pel.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;  /* subpel resolution (positions per pel) */

    /* pre-decrement so "(unsigned)coord < width" tests coord in [0, width-1]
       and clip() can use width/height directly as the upper bound */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* split the 16.16 position into integer pel and subpel fraction */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp row, interpolate horizontally only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp column, interpolate vertically only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* outside both: replicate the nearest corner pel */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            /* advance one output column along the affine trajectory */
            vx+= dxx;
            vy+= dyx;
        }
        /* advance one output row */
        ox += dxy;
        oy += dyy;
    }
}
1205

    
1206
/* Thirdpel full-pel case: plain copy, dispatched to the fixed-width routine. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) put_pixels2_c (dst, src, stride, height);
    else if (width ==  4) put_pixels4_c (dst, src, stride, height);
    else if (width ==  8) put_pixels8_c (dst, src, stride, height);
    else if (width == 16) put_pixels16_c(dst, src, stride, height);
}
1214

    
1215
/* Thirdpel MC, offset (1/3, 0): blend current and right pel 2:1 (683 ~ 2^11/3). */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride)
        for (x = 0; x < width; x++)
            dst[x] = (683*(2*src[x] + src[x+1] + 1)) >> 11;
}
1225

    
1226
/* Thirdpel MC, offset (2/3, 0): blend current and right pel 1:2 (683 ~ 2^11/3). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride)
        for (x = 0; x < width; x++)
            dst[x] = (683*(src[x] + 2*src[x+1] + 1)) >> 11;
}
1236

    
1237
/* Thirdpel MC, offset (0, 1/3): blend current and lower pel 2:1 (683 ~ 2^11/3). */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride)
        for (x = 0; x < width; x++)
            dst[x] = (683*(2*src[x] + src[x+stride] + 1)) >> 11;
}
1247

    
1248
/* Thirdpel MC, offset (1/3, 1/3): integer-weight blend of the 2x2
 * neighbourhood, weights 4:3:3:2 out of 12 (2731 ~ 2^15/12). */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride)
        for (x = 0; x < width; x++)
            dst[x] = (2731*(4*src[x] + 3*src[x+1] + 3*src[x+stride] + 2*src[x+stride+1] + 6)) >> 15;
}
1258

    
1259
/* Thirdpel MC, offset (1/3, 2/3): integer-weight blend of the 2x2
 * neighbourhood, weights 3:2:4:3 out of 12 (2731 ~ 2^15/12). */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride)
        for (x = 0; x < width; x++)
            dst[x] = (2731*(3*src[x] + 2*src[x+1] + 4*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15;
}
1269

    
1270
/* Thirdpel MC, offset (0, 2/3): blend current and lower pel 1:2 (683 ~ 2^11/3). */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride)
        for (x = 0; x < width; x++)
            dst[x] = (683*(src[x] + 2*src[x+stride] + 1)) >> 11;
}
1280

    
1281
/* Thirdpel MC, offset (2/3, 1/3): integer-weight blend of the 2x2
 * neighbourhood, weights 3:4:2:3 out of 12 (2731 ~ 2^15/12). */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride)
        for (x = 0; x < width; x++)
            dst[x] = (2731*(3*src[x] + 4*src[x+1] + 2*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15;
}
1291

    
1292
/* Thirdpel MC, offset (2/3, 2/3): integer-weight blend of the 2x2
 * neighbourhood, weights 2:3:3:4 out of 12 (2731 ~ 2^15/12). */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride)
        for (x = 0; x < width; x++)
            dst[x] = (2731*(2*src[x] + 3*src[x+1] + 3*src[x+stride] + 4*src[x+stride+1] + 6)) >> 15;
}
1302

    
1303
/* Thirdpel full-pel averaging case: dispatch to the fixed-width avg routine. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) avg_pixels2_c (dst, src, stride, height);
    else if (width ==  4) avg_pixels4_c (dst, src, stride, height);
    else if (width ==  8) avg_pixels8_c (dst, src, stride, height);
    else if (width == 16) avg_pixels16_c(dst, src, stride, height);
}
1311

    
1312
/* Thirdpel MC, offset (1/3, 0), averaging variant: interpolate as in
 * put_tpel_pixels_mc10_c, then rounded-average with the existing dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride)
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683*(2*src[x] + src[x+1] + 1)) >> 11) + 1) >> 1;
}
1322

    
1323
/* Thirdpel MC, offset (2/3, 0), averaging variant: interpolate as in
 * put_tpel_pixels_mc20_c, then rounded-average with the existing dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride)
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683*(src[x] + 2*src[x+1] + 1)) >> 11) + 1) >> 1;
}
1333

    
1334
/* Thirdpel MC, offset (0, 1/3), averaging variant: interpolate as in
 * put_tpel_pixels_mc01_c, then rounded-average with the existing dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride)
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683*(2*src[x] + src[x+stride] + 1)) >> 11) + 1) >> 1;
}
1344

    
1345
/* Thirdpel MC, offset (1/3, 1/3), averaging variant: interpolate as in
 * put_tpel_pixels_mc11_c, then rounded-average with the existing dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride)
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731*(4*src[x] + 3*src[x+1] + 3*src[x+stride] + 2*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
}
1355

    
1356
/* Thirdpel MC, offset (1/3, 2/3), averaging variant: interpolate as in
 * put_tpel_pixels_mc12_c, then rounded-average with the existing dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride)
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731*(3*src[x] + 2*src[x+1] + 4*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
}
1366

    
1367
/* Thirdpel MC, offset (0, 2/3), averaging variant: interpolate as in
 * put_tpel_pixels_mc02_c, then rounded-average with the existing dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride)
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683*(src[x] + 2*src[x+stride] + 1)) >> 11) + 1) >> 1;
}
1377

    
1378
/* Thirdpel MC, offset (2/3, 1/3), averaging variant: interpolate as in
 * put_tpel_pixels_mc21_c, then rounded-average with the existing dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride)
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731*(3*src[x] + 4*src[x+1] + 2*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
}
1388

    
1389
/* Thirdpel MC, offset (2/3, 2/3), averaging variant: interpolate as in
 * put_tpel_pixels_mc22_c, then rounded-average with the existing dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride)
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731*(2*src[x] + 3*src[x+1] + 3*src[x+stride] + 4*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
}
1399
#if 0
1400
#define TPEL_WIDTH(width)\
1401
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1402
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1403
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1404
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1405
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1406
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1407
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1408
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1409
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1410
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1411
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1412
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1413
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1414
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1415
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1416
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1417
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1418
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1419
#endif
1420

    
1421
/**
 * Generates H.264 chroma motion-compensation functions for 2-, 4- and
 * 8-pixel-wide blocks.  (x, y) is the 1/8-pel fractional offset (asserted
 * in [0,8)); A..D are the bilinear weights of the 2x2 source neighbourhood
 * (they sum to 64).  OP receives the raw weighted sum; the caller-supplied
 * OP macro performs the rounding, the >>6 normalization and the store
 * (and, for the avg variant, the average with the existing dst).
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1483

    
1484
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1485
#define op_put(a, b) a = (((b) + 32)>>6)
1486

    
1487
H264_CHROMA_MC(put_       , op_put)
1488
H264_CHROMA_MC(avg_       , op_avg)
1489
#undef op_avg
1490
#undef op_put
1491

    
1492
/* H.264 chroma MC, 8 pixels wide, "no rounding" variant: same bilinear
 * weights as the regular mc8, but with bias 32-4 instead of 32 before >>6. */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A = (8-x)*(8-y);
    const int B = (  x)*(8-y);
    const int C = (8-x)*(  y);
    const int D = (  x)*(  y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++, dst += stride, src += stride) {
        for (col = 0; col < 8; col++)
            dst[col] = (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1] + 32 - 4) >> 6;
    }
}
1515

    
1516
/* Copy a 2-pixel-wide block row by row with one unaligned 16-bit move. */
static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++, dst += dstStride, src += srcStride)
        ST16(dst, LD16(src));
}
1526

    
1527
/* Copy a 4-pixel-wide block row by row with one unaligned 32-bit move. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++, dst += dstStride, src += srcStride)
        ST32(dst, LD32(src));
}
1537

    
1538
/* Copy an 8-pixel-wide block row by row with two unaligned 32-bit moves. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        ST32(dst  , LD32(src  ));
        ST32(dst+4, LD32(src+4));
    }
}
1549

    
1550
/* Copy a 16-pixel-wide block row by row with four unaligned 32-bit moves. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
    }
}
1563

    
1564
/* Copy a 17-byte-wide block (16 + 1 extra edge column) row by row:
 * four unaligned 32-bit moves plus one tail byte. */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16];
    }
}
1578

    
1579
/* Copy a 9-byte-wide block (8 + 1 extra edge column) row by row:
 * two unaligned 32-bit moves plus one tail byte. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        ST32(dst  , LD32(src  ));
        ST32(dst+4, LD32(src+4));
        dst[8]= src[8];
    }
}
1591

    
1592

    
1593
/**
 * QPEL_MC() instantiates the full set of MPEG-4 quarter-pel motion
 * compensation functions (8x8 and 16x16 blocks, all 16 sub-pel positions)
 * for one store/rounding variant:
 *   r      - rounding selector (unused by the C version; kept for symmetry
 *            with the asm implementations)
 *   OPNAME - prefix of the generated function names (e.g. put_, avg_)
 *   RND    - infix selecting the rounding variant of the helper calls
 *   OP     - macro performing the final rounded/clipped store (uses cm[])
 *
 * The half-pel filter is the MPEG-4 8-tap (20,-6,3,-1) filter.  Near the
 * block edges the source is mirrored, which is why the outer taps reference
 * repeated indices (e.g. src[2]+src[8] in the dst[5] term) — this matches
 * the MPEG-4 standard's edge handling and is NOT a typo.
 *
 * The mc##XY naming encodes the sub-pel position: X = horizontal quarter
 * (0..3), Y = vertical quarter (0..3).  The *_old_c variants keep the
 * historical 4-way-average formulation for reference/testing.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2077
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2078
#define op_put(a, b) a = cm[((b) + 16)>>5]
2079
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2080

    
2081
QPEL_MC(0, put_       , _       , op_put)
2082
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2083
QPEL_MC(0, avg_       , _       , op_avg)
2084
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2085
#undef op_avg
2086
#undef op_avg_no_rnd
2087
#undef op_put
2088
#undef op_put_no_rnd
#if 1
2091
#define H264_LOWPASS(OPNAME, OP, OP2) \
2092
static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2093
    const int h=2;\
2094
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2095
    int i;\
2096
    for(i=0; i<h; i++)\
2097
    {\
2098
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2099
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2100
        dst+=dstStride;\
2101
        src+=srcStride;\
2102
    }\
2103
}\
2104
\
2105
static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2106
    const int w=2;\
2107
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2108
    int i;\
2109
    for(i=0; i<w; i++)\
2110
    {\
2111
        const int srcB= src[-2*srcStride];\
2112
        const int srcA= src[-1*srcStride];\
2113
        const int src0= src[0 *srcStride];\
2114
        const int src1= src[1 *srcStride];\
2115
        const int src2= src[2 *srcStride];\
2116
        const int src3= src[3 *srcStride];\
2117
        const int src4= src[4 *srcStride];\
2118
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2119
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2120
        dst++;\
2121
        src++;\
2122
    }\
2123
}\
2124
\
2125
static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2126
    const int h=2;\
2127
    const int w=2;\
2128
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2129
    int i;\
2130
    src -= 2*srcStride;\
2131
    for(i=0; i<h+5; i++)\
2132
    {\
2133
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2134
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2135
        tmp+=tmpStride;\
2136
        src+=srcStride;\
2137
    }\
2138
    tmp -= tmpStride*(h+5-2);\
2139
    for(i=0; i<w; i++)\
2140
    {\
2141
        const int tmpB= tmp[-2*tmpStride];\
2142
        const int tmpA= tmp[-1*tmpStride];\
2143
        const int tmp0= tmp[0 *tmpStride];\
2144
        const int tmp1= tmp[1 *tmpStride];\
2145
        const int tmp2= tmp[2 *tmpStride];\
2146
        const int tmp3= tmp[3 *tmpStride];\
2147
        const int tmp4= tmp[4 *tmpStride];\
2148
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2149
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2150
        dst++;\
2151
        tmp++;\
2152
    }\
2153
}\
2154
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2155
    const int h=4;\
2156
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2157
    int i;\
2158
    for(i=0; i<h; i++)\
2159
    {\
2160
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2161
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2162
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2163
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2164
        dst+=dstStride;\
2165
        src+=srcStride;\
2166
    }\
2167
}\
2168
\
2169
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2170
    const int w=4;\
2171
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2172
    int i;\
2173
    for(i=0; i<w; i++)\
2174
    {\
2175
        const int srcB= src[-2*srcStride];\
2176
        const int srcA= src[-1*srcStride];\
2177
        const int src0= src[0 *srcStride];\
2178
        const int src1= src[1 *srcStride];\
2179
        const int src2= src[2 *srcStride];\
2180
        const int src3= src[3 *srcStride];\
2181
        const int src4= src[4 *srcStride];\
2182
        const int src5= src[5 *srcStride];\
2183
        const int src6= src[6 *srcStride];\
2184
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2185
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2186
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2187
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2188
        dst++;\
2189
        src++;\
2190
    }\
2191
}\
2192
\
2193
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2194
    const int h=4;\
2195
    const int w=4;\
2196
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2197
    int i;\
2198
    src -= 2*srcStride;\
2199
    for(i=0; i<h+5; i++)\
2200
    {\
2201
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2202
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2203
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2204
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2205
        tmp+=tmpStride;\
2206
        src+=srcStride;\
2207
    }\
2208
    tmp -= tmpStride*(h+5-2);\
2209
    for(i=0; i<w; i++)\
2210
    {\
2211
        const int tmpB= tmp[-2*tmpStride];\
2212
        const int tmpA= tmp[-1*tmpStride];\
2213
        const int tmp0= tmp[0 *tmpStride];\
2214
        const int tmp1= tmp[1 *tmpStride];\
2215
        const int tmp2= tmp[2 *tmpStride];\
2216
        const int tmp3= tmp[3 *tmpStride];\
2217
        const int tmp4= tmp[4 *tmpStride];\
2218
        const int tmp5= tmp[5 *tmpStride];\
2219
        const int tmp6= tmp[6 *tmpStride];\
2220
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2221
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2222
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2223
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2224
        dst++;\
2225
        tmp++;\
2226
    }\
2227
}\
2228
\
2229
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2230
    const int h=8;\
2231
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2232
    int i;\
2233
    for(i=0; i<h; i++)\
2234
    {\
2235
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2236
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2237
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2238
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2239
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2240
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2241
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2242
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2243
        dst+=dstStride;\
2244
        src+=srcStride;\
2245
    }\
2246
}\
2247
\
2248
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2249
    const int w=8;\
2250
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2251
    int i;\
2252
    for(i=0; i<w; i++)\
2253
    {\
2254
        const int srcB= src[-2*srcStride];\
2255
        const int srcA= src[-1*srcStride];\
2256
        const int src0= src[0 *srcStride];\
2257
        const int src1= src[1 *srcStride];\
2258
        const int src2= src[2 *srcStride];\
2259
        const int src3= src[3 *srcStride];\
2260
        const int src4= src[4 *srcStride];\
2261
        const int src5= src[5 *srcStride];\
2262
        const int src6= src[6 *srcStride];\
2263
        const int src7= src[7 *srcStride];\
2264
        const int src8= src[8 *srcStride];\
2265
        const int src9= src[9 *srcStride];\
2266
        const int src10=src[10*srcStride];\
2267
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2268
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2269
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2270
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2271
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2272
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2273
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2274
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2275
        dst++;\
2276
        src++;\
2277
    }\
2278
}\
2279
\
2280
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2281
    const int h=8;\
2282
    const int w=8;\
2283
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2284
    int i;\
2285
    src -= 2*srcStride;\
2286
    for(i=0; i<h+5; i++)\
2287
    {\
2288
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2289
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2290
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2291
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2292
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2293
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2294
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2295
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2296
        tmp+=tmpStride;\
2297
        src+=srcStride;\
2298
    }\
2299
    tmp -= tmpStride*(h+5-2);\
2300
    for(i=0; i<w; i++)\
2301
    {\
2302
        const int tmpB= tmp[-2*tmpStride];\
2303
        const int tmpA= tmp[-1*tmpStride];\
2304
        const int tmp0= tmp[0 *tmpStride];\
2305
        const int tmp1= tmp[1 *tmpStride];\
2306
        const int tmp2= tmp[2 *tmpStride];\
2307
        const int tmp3= tmp[3 *tmpStride];\
2308
        const int tmp4= tmp[4 *tmpStride];\
2309
        const int tmp5= tmp[5 *tmpStride];\
2310
        const int tmp6= tmp[6 *tmpStride];\
2311
        const int tmp7= tmp[7 *tmpStride];\
2312
        const int tmp8= tmp[8 *tmpStride];\
2313
        const int tmp9= tmp[9 *tmpStride];\
2314
        const int tmp10=tmp[10*tmpStride];\
2315
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2316
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2317
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2318
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2319
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2320
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2321
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2322
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2323
        dst++;\
2324
        tmp++;\
2325
    }\
2326
}\
2327
\
2328
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2329
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2330
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2331
    src += 8*srcStride;\
2332
    dst += 8*dstStride;\
2333
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2334
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2335
}\
2336
\
2337
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2338
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2339
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2340
    src += 8*srcStride;\
2341
    dst += 8*dstStride;\
2342
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2343
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2344
}\
2345
\
2346
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2347
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2348
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2349
    src += 8*srcStride;\
2350
    dst += 8*dstStride;\
2351
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2352
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2353
}\
2354

    
2355
/* H264_MC(OPNAME, SIZE):
 * expands to the 15 quarter-pel motion-compensation functions
 * OPNAME##h264_qpel##SIZE##_mcXY_c for one block size (X/Y are the
 * horizontal/vertical quarter-pel offsets).  Each case combines the
 * _h/_v/_hv sixtap lowpass helpers with half-pel averaging (_l2);
 * "full" holds a (SIZE+5)-row copy of the source so the vertical
 * filter can read 2 rows above and 3 below, full_mid pointing at the
 * first real row. */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

/* Output macros plugged into H264_LOWPASS:
 * op_put/op_avg round+clip a one-pass sixtap sum (scaled by 32),
 * op2_put/op2_avg round+clip a two-pass sum (scaled by 1024);
 * cm is the 0..255 clipping table in scope at the expansion site. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
/* Instantiate the H.264 sixtap lowpass helpers and all quarter-pel MC
 * functions for both the "put" (store) and "avg" (average) variants. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
/* H.264 weighted prediction.
 * op_scale1: unidirectional, pixel = (pixel*weight + rounding) >> log2_denom.
 * op_scale2: bidirectional,  dst = (src*weights + dst*weightd + rounding) >> (log2_denom+1).
 * H264_WEIGHT(W,H) expands to the weight/biweight functions for one WxH
 * partition size; the unrolled rows bail out early via `continue` for
 * narrow widths. */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

/* Instantiate weighted-prediction kernels for every H.264 partition size. */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2585
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2586
    int i;
2587

    
2588
    for(i=0; i<h; i++){
2589
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2590
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2591
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2592
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2593
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2594
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2595
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2596
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2597
        dst+=dstStride;
2598
        src+=srcStride;
2599
    }
2600
}
2601

    
2602
#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* Full-pel (mc00) CAVS qpel cases: non-static wrappers around the
 * generic pixel copy/average helpers so cavsdsp.c can reference them. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/* Full-pel VC-1 mspel case: plain 8x8 copy (rnd is unused here). */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2630
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2631
    int i;
2632

    
2633
    for(i=0; i<w; i++){
2634
        const int src_1= src[ -srcStride];
2635
        const int src0 = src[0          ];
2636
        const int src1 = src[  srcStride];
2637
        const int src2 = src[2*srcStride];
2638
        const int src3 = src[3*srcStride];
2639
        const int src4 = src[4*srcStride];
2640
        const int src5 = src[5*srcStride];
2641
        const int src6 = src[6*srcStride];
2642
        const int src7 = src[7*srcStride];
2643
        const int src8 = src[8*srcStride];
2644
        const int src9 = src[9*srcStride];
2645
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2646
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2647
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2648
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2649
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2650
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2651
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2652
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2653
        src++;
2654
        dst++;
2655
    }
2656
}
2657

    
2658
/* WMV2 (mspel) 8x8 motion-compensation cases.  mcXY names the subpel
 * position: X horizontal, Y vertical.  The _l2 helpers average two
 * sources; halfH is 11 rows tall (88 bytes) because the vertical pass
 * over it needs rows -1..9. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
/**
 * H.263 deblocking, vertical filtering of a horizontal block edge:
 * for each of the 8 columns, the 4 pixels straddling the edge
 * (p0,p1 above; p2,p3 below) are smoothed by a strength-dependent
 * amount looked up from qscale.
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear response: small |d| passes through, medium
           |d| is folded back, large |d| (a real edge) is left alone */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* fast clamp: a negative value becomes 0, an overflow becomes
           ~0 == -1, which the uint8_t store below truncates to 255 */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        /* secondary, weaker correction of the outer pixels p0/p3 */
        d2= clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
}
/**
 * H.263 deblocking, horizontal filtering of a vertical block edge:
 * same algorithm as h263_v_loop_filter_c but applied along each of the
 * 8 rows (p0,p1 left of the edge; p2,p3 right of it).
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear response: small |d| passes through, medium
           |d| is folded back, large |d| (a real edge) is left alone */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* fast clamp: negative -> 0, overflow -> -1 which the uint8_t
           store truncates to 255 */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        /* secondary, weaker correction of the outer pixels p0/p3 */
        d2= clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
}
/* H.261 in-loop filter: separable [1 2 1]/4 smoothing of an 8x8 block.
 * Pass 1 filters vertically into a scaled intermediate (edge rows are
 * just scaled by 4), pass 2 filters horizontally and rounds back to
 * pixel range.  A uniform block is left bit-exact unchanged. */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int filtered[64];
    int row, col;

    /* vertical pass, results scaled by 4 */
    for(col=0; col<8; col++){
        filtered[col]      = 4*src[col];
        filtered[col + 56] = 4*src[col + 7*stride];
    }
    for(row=1; row<7; row++){
        for(col=0; col<8; col++){
            const int s = row*stride + col;
            filtered[row*8 + col] = src[s - stride] + 2*src[s] + src[s + stride];
        }
    }

    /* horizontal pass with rounding; edge columns only undo the scale */
    for(row=0; row<8; row++){
        const int *t = filtered + row*8;
        uint8_t *d = src + row*stride;
        d[0] = (t[0] + 2) >> 2;
        d[7] = (t[7] + 2) >> 2;
        for(col=1; col<7; col++)
            d[col] = (t[col-1] + 2*t[col] + t[col+1] + 8) >> 4;
    }
}
/**
 * H.264 normal-strength (bS < 4) luma deblocking along one 16-sample
 * edge, processed as 4 groups of 4.  xstride steps across the edge,
 * ystride along it.  tc0[i] is the clipping threshold for group i; a
 * negative value skips that group entirely.
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            /* only filter where the step looks like a coding artifact,
               not a genuine image edge */
            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* optionally adjust p1/q1 too; each smooth side widens
                   the final p0/q0 clipping range by one */
                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* xstride = stride: the samples across the edge are vertically adjacent. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/* xstride = 1: the samples across the edge are horizontally adjacent. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
/**
 * H.264 normal-strength chroma deblocking along one 8-sample edge,
 * processed as 4 groups of 2.  Chroma only modifies p0/q0; p1/q1 are
 * read for the activity checks.  tc0[i] <= 0 skips group i.
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* xstride = stride: the samples across the edge are vertically adjacent. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/* xstride = 1: the samples across the edge are horizontally adjacent. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
/**
 * H.264 strong (bS == 4) chroma deblocking along one 8-sample edge:
 * no tc clipping; where the edge passes the alpha/beta activity checks,
 * p0/q0 are replaced by 3-tap weighted averages of their neighbours.
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
/* xstride = stride: the samples across the edge are vertically adjacent. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/* xstride = 1: the samples across the edge are horizontally adjacent. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
/* Sum of absolute differences of a 16-wide, h-tall block (full-pel).
 * v is an unused context pointer kept for the me_cmp function
 * signature. */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/* SAD of a 16-wide block against pix2 interpolated half a pel
 * horizontally (two-tap average via avg2; reads pix2[0..16] per row). */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/* SAD of a 16-wide block against pix2 interpolated half a pel
 * vertically (two-tap average of each sample with the one below). */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
        below += line_size;
    }
    return sum;
}
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* SAD of a 16-wide block against a reference interpolated at the
     * diagonal half-pel position: each reference sample is the avg4 of a
     * 2x2 neighbourhood. */
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], pix3[x], pix3[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
3032

    
3033
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* Plain SAD of an 8-wide block: sum of |pix1 - pix2| over 8 columns
     * and h rows. */
    int sum = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3052

    
3053
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* SAD of an 8-wide block against a reference interpolated at
     * horizontal half-pel position (avg2 of each pixel and its right
     * neighbour). */
    int sum = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3072

    
3073
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* SAD of an 8-wide block against a reference interpolated at vertical
     * half-pel position (avg2 of each pixel and the one a row below). */
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
3094

    
3095
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* SAD of an 8-wide block against a reference interpolated at the
     * diagonal half-pel position (avg4 of a 2x2 neighbourhood). */
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], pix3[x], pix3[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
3116

    
3117
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3118
    MpegEncContext *c = v;
3119
    int score1=0;
3120
    int score2=0;
3121
    int x,y;
3122

    
3123
    for(y=0; y<h; y++){
3124
        for(x=0; x<16; x++){
3125
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3126
        }
3127
        if(y+1<h){
3128
            for(x=0; x<15; x++){
3129
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3130
                             - s1[x+1] + s1[x+1+stride])
3131
                        -FFABS(  s2[x  ] - s2[x  +stride]
3132
                             - s2[x+1] + s2[x+1+stride]);
3133
            }
3134
        }
3135
        s1+= stride;
3136
        s2+= stride;
3137
    }
3138

    
3139
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3140
    else  return score1 + FFABS(score2)*8;
3141
}
3142

    
3143
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3144
    MpegEncContext *c = v;
3145
    int score1=0;
3146
    int score2=0;
3147
    int x,y;
3148

    
3149
    for(y=0; y<h; y++){
3150
        for(x=0; x<8; x++){
3151
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3152
        }
3153
        if(y+1<h){
3154
            for(x=0; x<7; x++){
3155
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3156
                             - s1[x+1] + s1[x+1+stride])
3157
                        -FFABS(  s2[x  ] - s2[x  +stride]
3158
                             - s2[x+1] + s2[x+1+stride]);
3159
            }
3160
        }
3161
        s1+= stride;
3162
        s2+= stride;
3163
    }
3164

    
3165
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3166
    else  return score1 + FFABS(score2)*8;
3167
}
3168

    
3169
/**
 * Estimates the weighted squared error of the residual 'rem' after adding
 * 'basis' scaled by 'scale' (rounded from BASIS_SHIFT down to RECON_SHIFT
 * precision). Used by the encoder's basis-search; returns sum>>2 of
 * (w*b)^2 >> 4 terms.
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        // residual plus the scaled basis, rounded to RECON_SHIFT precision
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
3183

    
3184
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3185
    int i;
3186

    
3187
    for(i=0; i<8*8; i++){
3188
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3189
    }
3190
}
3191

    
3192
/**
3193
 * permutes an 8x8 block.
3194
 * @param block the block which will be permuted according to the given permutation vector
3195
 * @param permutation the permutation vector
3196
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3197
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3198
 *                  (inverse) permutated to scantable order!
3199
 */
3200
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3201
{
3202
    int i;
3203
    DCTELEM temp[64];
3204

    
3205
    if(last<=0) return;
3206
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3207

    
3208
    for(i=0; i<=last; i++){
3209
        const int j= scantable[i];
3210
        temp[j]= block[j];
3211
        block[j]=0;
3212
    }
3213

    
3214
    for(i=0; i<=last; i++){
3215
        const int j= scantable[i];
3216
        const int perm_j= permutation[j];
3217
        block[perm_j]= temp[j];
3218
    }
3219
}
3220

    
3221
/* Comparison function that always reports a perfect match; selected by
 * FF_CMP_ZERO in ff_set_cmp(). */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
3224

    
3225
/**
 * Fills cmp[0..4] with the comparison functions selected by 'type':
 * the low byte of 'type' is an FF_CMP_* id, mapped to the corresponding
 * function table in the DSPContext.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#ifdef CONFIG_SNOW_ENCODER
        /* wavelet metrics are only available when the snow encoder is built */
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
3284

    
3285
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    /* zero six consecutive blocks of 64 coefficients in one call */
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
3292

    
3293
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    /* dst[i] += src[i] for 0 <= i < w; byte arithmetic wraps modulo 256. */
    int i;

    for (i = 0; i < w; i++)
        dst[i] += src[i];
}
3308

    
3309
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    /* dst[i] = src1[i] - src2[i] for 0 <= i < w; wraps modulo 256. */
    int i;

    for (i = 0; i < w; i++)
        dst[i] = src1[i] - src2[i];
}
3324

    
3325
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    /* Median prediction (HuffYUV style): for each byte of src2, predict
     * with mid_pred(left, top, left+top-topleft) and emit the residual.
     * NOTE(review): src1 appears to be the row above and src2 the current
     * row — confirm against callers.  *left / *left_top carry the running
     * state in and out. */
    int i;
    uint8_t prev = *left;      /* "left" neighbour */
    uint8_t diag = *left_top;  /* "top-left" neighbour */

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(prev, src1[i], (prev + src1[i] - diag) & 0xFF);
        diag = src1[i];
        prev = src2[i];
        dst[i] = prev - pred;
    }

    *left = prev;
    *left_top = diag;
}
3342

    
3343
/* Butterfly writing sum and difference of two inputs to two outputs. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x becomes x+y, y becomes x-y. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Final butterfly folded into the absolute sum: |x+y| + |x-y|. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3357

    
3358
/**
 * 8x8 SATD of the difference (src - dst): applies an 8-point Hadamard
 * transform to each row of the difference, then to each column, and sums
 * the absolute values of the transformed coefficients.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: butterfly each row of the difference block */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: butterfly each column, folding the last stage into
     * the absolute sum via BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3409

    
3410
/**
 * Intra variant of hadamard8_diff8x8_c: SATD of the source block itself
 * (no reference), with the DC term subtracted so the score is independent
 * of the block mean.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: butterfly each source row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass, last stage folded into the absolute sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3457

    
3458
/**
 * FF_CMP_DCT metric: forward-DCT the (src1 - src2) difference block and
 * return the sum of the absolute coefficients.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum+= FFABS(temp[i]);

    return sum;
}
3474

    
3475
#ifdef CONFIG_GPL
3476
#define DCT8_1D {\
3477
    const int s07 = SRC(0) + SRC(7);\
3478
    const int s16 = SRC(1) + SRC(6);\
3479
    const int s25 = SRC(2) + SRC(5);\
3480
    const int s34 = SRC(3) + SRC(4);\
3481
    const int a0 = s07 + s34;\
3482
    const int a1 = s16 + s25;\
3483
    const int a2 = s07 - s34;\
3484
    const int a3 = s16 - s25;\
3485
    const int d07 = SRC(0) - SRC(7);\
3486
    const int d16 = SRC(1) - SRC(6);\
3487
    const int d25 = SRC(2) - SRC(5);\
3488
    const int d34 = SRC(3) - SRC(4);\
3489
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
3490
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
3491
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
3492
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
3493
    DST(0,  a0 + a1     ) ;\
3494
    DST(1,  a4 + (a7>>2)) ;\
3495
    DST(2,  a2 + (a3>>1)) ;\
3496
    DST(3,  a5 + (a6>>2)) ;\
3497
    DST(4,  a0 - a1     ) ;\
3498
    DST(5,  a6 - (a5>>2)) ;\
3499
    DST(6, (a2>>1) - a3 ) ;\
3500
    DST(7, (a4>>2) - a7 ) ;\
3501
}
3502

    
3503
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3504
    MpegEncContext * const s= (MpegEncContext *)c;
3505
    int16_t dct[8][8];
3506
    int i;
3507
    int sum=0;
3508

    
3509
    s->dsp.diff_pixels(dct, src1, src2, stride);
3510

    
3511
#define SRC(x) dct[i][x]
3512
#define DST(x,v) dct[i][x]= v
3513
    for( i = 0; i < 8; i++ )
3514
        DCT8_1D
3515
#undef SRC
3516
#undef DST
3517

    
3518
#define SRC(x) dct[x][i]
3519
#define DST(x,v) sum += FFABS(v)
3520
    for( i = 0; i < 8; i++ )
3521
        DCT8_1D
3522
#undef SRC
3523
#undef DST
3524
    return sum;
3525
}
3526
#endif
3527

    
3528
/**
 * FF_CMP_DCTMAX metric: forward-DCT the (src1 - src2) difference block and
 * return the largest absolute coefficient.
 */
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}
3544

    
3545
/**
 * FF_CMP_PSNR metric: runs the difference block through a full
 * quantize -> dequantize -> IDCT round trip and returns the squared error
 * between the reconstructed difference and the original one, i.e. the
 * distortion the quantizer would introduce at the current qscale.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* keep an untouched copy of the difference for the comparison below */
    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
3568

    
3569
/**
 * FF_CMP_RD metric: estimates the rate-distortion cost of coding the
 * (src1 - src2) difference block at the current qscale.  The block is
 * quantized, the VLC bit cost of the coefficients is counted, the block is
 * dequantized and added back onto a copy of src2, and the SSE of that
 * reconstruction against src1 is combined with a qscale-dependent lambda.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distoration, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* back up the 8x8 reference area (two 32-bit copies per row) so the
     * reconstruction can be added onto it */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC length tables matching the macroblock type */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* count the bits of each (run, level) pair in scan order; levels
     * outside [-64, 63] cost the escape length */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* reconstruct: dequantize and add the IDCT onto the backed-up area */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
3647

    
3648
/**
 * FF_CMP_BIT metric: estimates only the number of bits needed to code the
 * quantized (src1 - src2) difference block (same VLC accounting as
 * rd8x8_c, without the distortion term).
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC length tables matching the macroblock type */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* count the bits of each (run, level) pair in scan order; levels
     * outside [-64, 63] cost the escape length */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
3707

    
3708
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    /* Sum of absolute vertical gradients within one 16-wide block
     * (no reference); measures vertical activity. */
    int sum = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += FFABS(s[x] - s[x + stride]);
        s += stride;
    }

    return sum;
}
3722

    
3723
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    /* SAD of the vertical gradients of two 16-wide blocks: compares how
     * the two blocks change from row to row rather than their values. */
    int sum = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
        s1 += stride;
        s2 += stride;
    }

    return sum;
}
3737

    
3738
#define SQ(a) ((a)*(a))
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    /* Sum of squared vertical gradients within one 16-wide block
     * (no reference); squared-error counterpart of vsad_intra16_c. */
    int sum = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += SQ(s[x] - s[x + stride]);
        s += stride;
    }

    return sum;
}
3753

    
3754
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    /* SSE of the vertical gradients of two 16-wide blocks; squared-error
     * counterpart of vsad16_c. */
    int sum = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
        s1 += stride;
        s2 += stride;
    }

    return sum;
}
3768

    
3769
/* Build the 16x16 comparison functions from the 8x8 kernels above
 * (WARPER8_16_SQ is a macro defined elsewhere in this file/its header). */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#ifdef CONFIG_GPL
WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
3779

    
3780
static void vector_fmul_c(float *dst, const float *src, int len){
    /* Element-wise in-place multiply: dst[i] *= src[i]. */
    int n;

    for (n = 0; n < len; n++)
        dst[n] = dst[n] * src[n];
}
3785

    
3786
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    /* dst[i] = src0[i] * src1[len-1-i]: multiply src0 by src1 read
     * back-to-front. */
    int n;

    for (n = 0; n < len; n++)
        dst[n] = src0[n] * src1[len - 1 - n];
}
3792

    
3793
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    /* dst[i*step] = src0[i]*src1[i] + src2[i] + src3, writing the result
     * with an output stride of 'step'.  src3 is an int constant added to
     * every element (per the original signature). */
    int n;

    for (n = 0; n < len; n++)
        dst[n * step] = src0[n] * src1[n] + src2[n] + src3;
}
3798

    
3799
/**
 * Convert an array of pre-biased floats to int16 samples by inspecting the
 * IEEE-754 bit pattern of each value: in-range values carry the sample in
 * the low mantissa bits (the 0x8000 bias is removed on store), and values
 * whose bits 16-19 are nonzero are clipped -- the sign of
 * (0x43c0ffff - bits) selects 0 or -1, which after the -0x8000 bias wraps
 * to -32768 or 32767.
 * NOTE(review): the original read the float through an (int32_t*) cast,
 * which violates strict aliasing (C99 6.5); reading through a union keeps
 * the identical bit pattern without the UB.
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++) {
        union { float f; int32_t i; } pun;
        int_fast32_t tmp;
        pun.f = src[i];
        tmp = pun.i;
        if(tmp & 0xf0000){
            tmp = (0x43c0ffff - tmp)>>31;
            // is this faster on some gcc/cpu combinations?
//          if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//          else                 tmp = 0;
        }
        dst[i] = tmp - 0x8000;
    }
}
3812

    
3813
/* XXX: those functions should be suppressed ASAP when all IDCTs are
3814
 converted */
3815
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3816
{
3817
    j_rev_dct (block);
3818
    put_pixels_clamped_c(block, dest, line_size);
3819
}
3820
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3821
{
3822
    j_rev_dct (block);
3823
    add_pixels_clamped_c(block, dest, line_size);
3824
}
3825

    
3826
/* Reduced inverse DCT (j_rev_dct4) used for lowres==1 decoding
 * (see dsputil_init below), result stored into the destination. */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/* Same as above but adds the result to the destination. */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
3836

    
3837
/* Reduced inverse DCT (j_rev_dct2) used for lowres==2 decoding
 * (see dsputil_init below), result stored into the destination. */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/* Same as above but adds the result to the destination. */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
3847

    
3848
/* 1x1 "IDCT" for lowres==3 decoding: only the DC coefficient survives;
 * descale by 8 with rounding and clamp via cropTbl. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
/* Same as above but adds the descaled DC onto the destination pixel. */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
3860

    
3861
/* Do-nothing stub.  NOTE(review): the empty K&R parameter list is
 * presumably deliberate so the function can stand in for assorted
 * function-pointer types -- confirm at the assignment sites before
 * changing it to (void). */
static void just_return() { return; }
3862

    
3863
/* init static data */
void dsputil_static_init(void)
{
    int i;

    /* cropTbl, indexed with an offset of MAX_NEG_CROP, clamps an int in
     * [-MAX_NEG_CROP, 255 + MAX_NEG_CROP] to the 0..255 range */
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        cropTbl[i] = 0;
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    /* squareTbl[i + 256] = i*i for i in [-256, 255] */
    for(i=0;i<512;i++) {
        squareTbl[i] = (i - 256) * (i - 256);
    }

    /* inverse zigzag scan table, stored with a +1 bias
     * (NOTE(review): presumably so 0 can serve as a sentinel -- confirm
     * at the users of inv_zigzag_direct16) */
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}
3880

    
3881

    
3882
/**
 * Initialize a DSPContext with the portable C reference implementation
 * of every DSP routine, then invoke the architecture-specific
 * initializers (MMX, ARMv4L, VIS, AltiVec, ...) which override
 * individual entries with optimized versions.
 *
 * @param c     context to fill; every function pointer used by the
 *              codecs is assigned here
 * @param avctx codec context; dct_algo, idct_algo and lowres select
 *              among the available (I)DCT implementations
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

#ifdef CONFIG_ENCODERS
    /* forward DCT: choose the 8x8 and interlaced "248" transforms
       according to the requested dct_algo */
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT: lowres decoding uses reduced-size transforms
       (4x4, 2x2 or DC-only); otherwise the choice follows idct_algo */
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= simple_idct_put;
            c->idct_add= simple_idct_add;
            c->idct    = simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    /* H.264 transform add functions (4x4, 8x8, DC-only variants) */
    c->h264_idct_add= ff_h264_idct_add_c;
    c->h264_idct8_add= ff_h264_idct8_add_c;
    c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
    c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;

    /* basic pixel block helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* SAD functions: second index selects plain / x-half / y-half /
       xy-half pel variants */
    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* half-pel motion compensation tables: IDX selects the block width
       (0:16 1:8 2:4 3:2), the second index the half-pel position */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    /* third-pel motion compensation (SVQ3); indices 3, 7 and >10 are
       unused positions in the mcXY numbering and stay unset */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* quarter-pel motion compensation: all 16 sub-pel positions */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    /* H.264 chroma MC: index selects block width 8/4/2 */
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;

    /* H.264 weighted prediction, one entry per block geometry */
    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;

    /* codec-specific sub-initializers */
#ifdef CONFIG_CAVS_DECODER
    ff_cavsdsp_init(c,avctx);
#endif
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_vc1dsp_init(c,avctx);
#endif

    /* WMV2/MSMPEG4 "mspel" motion compensation */
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

    /* comparison functions for motion estimation / rate distortion:
       [0] is the 16x16 variant, [1] the 8x8 variant */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#ifdef CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#ifdef CONFIG_SNOW_ENCODER
    c->w53[0]= w53_16_c;
    c->w53[1]= w53_8_c;
    c->w97[0]= w97_16_c;
    c->w97[1]= w97_8_c;
#endif

    /* HuffYUV helpers */
    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;

    /* H.264 deblocking; loop_filter_strength has no C fallback and is
       only provided by optimized implementations */
    c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
    c->h264_loop_filter_strength= NULL;

    c->h263_h_loop_filter= h263_h_loop_filter_c;
    c->h263_v_loop_filter= h263_v_loop_filter_c;

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#ifdef CONFIG_SNOW_ENCODER
    c->vertical_compose97i = ff_snow_vertical_compose97i;
    c->horizontal_compose97i = ff_snow_horizontal_compose97i;
    c->inner_add_yblock = ff_snow_inner_add_yblock;
#endif

#ifdef CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
    /* float vector primitives (audio decoders) */
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
    c->float_to_int16 = ff_float_to_int16_c;

    /* image down-scalers: shrink[n] halves each dimension n times */
    c->shrink[0]= ff_img_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->prefetch= just_return;

    /* cleared here so the per-arch initializers below may fill only the
       entries they optimize; the rest fall back to h264_qpel further down */
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    /* architecture-specific overrides -- must come after all C defaults */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_SPARC
   dsputil_init_vis(c,avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif
#ifdef ARCH_SH4
    dsputil_init_sh4(c,avctx);
#endif
#ifdef ARCH_BFIN
    dsputil_init_bfin(c,avctx);
#endif

    /* fill any 2tap qpel entries the arch initializers left empty */
    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }

    /* build the coefficient permutation matching the selected IDCT so
       scan tables can be adapted to the transform's input layout */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}